IMP  2.0.1
The Integrative Modeling Platform
TuneRex.py
1 #!/usr/bin/env python
2 
3 __doc__="""
4 This module provides a few methods to improve the efficiency of a replica-exchange simulation
5 by tuning its parameters.
6 Author: Yannick Spill
7 """
8 
9 import sys
10 #from mpmath import erf, erfinv
11 #from math import sqrt
12 #from random import randint
13 from numpy import *
14 from numpy.random import randint
15 import rpy2.robjects as robjects
16 
17 kB = 1.3806503 * 6.0221415 / 4184.0 # Boltzmann constant in kcal/mol/K
18 EPSILON = 1e-8 #here for float comparison. Floats are equal if their difference
19  #is smaller than EPSILON
20 debug=False
21 def prdb(arg):
22  if debug:
23  print arg
24 
25 ##### R compatibility functions
26 
27 r = robjects.r
28 robjects.globalenv["kB"] = kB
29 _rinverf = r('invErf <- function(x) {qnorm((1 + x) /2) / sqrt(2)}')
30 _rerf = r('erf <- function(x) {2 * pnorm(x * sqrt(2)) - 1}')
31 _rinvF = r('qf')
32 _rinterp = None
33 
34 def erfinv(x):
35  return _rinverf(x)[0]
36 
37 def erf(x):
38  return _rerf(x)[0]
39 
40 def Finv(x,d1,d2):
41  return _rinvF(x,d1,d2)[0]
42 
43 def spline(xy, mean,method=None):
44  """spline interpolation of (x,y) coordinates. If interpolation goes
45  negative, replace by mean value.
46  """
47  x,y = zip(*xy)
48  robjects.globalenv["x"] = robjects.FloatVector(x)
49  robjects.globalenv["y"] = robjects.FloatVector(y)
50  global _rinterp
51  #_rinterp = r.splinefun(x,y)
52  if method == None:
53  r('cvsplinenonbounded <- splinefun(x,y)')
54  else:
55  r('cvsplinenonbounded <- splinefun(x,y,method="%s")' % method)
56  _rinterp = r('cvspline <- function(x) { tmp = cvsplinenonbounded(x); if (tmp>0) {tmp} else {%f}}' % mean)
57  def interpolated(x):
58  global _rinterp
59  return _rinterp(x)[0]
60  return interpolated
61 
62 def linear_interpolation(xy, mean):
63  """linear interpolation of (x,y) coordinates. No extrapolation possible.
64  """
65  x,y = zip(*xy)
66  robjects.globalenv["x"] = robjects.FloatVector(x)
67  robjects.globalenv["y"] = robjects.FloatVector(y)
68  global _rinterp
69  #_rinterp = r.splinefun(x,y)
70  _rinterp = r('cvspline <- approxfun(x,y)')
71  def interpolated(x):
72  global _rinterp
73  return _rinterp(x)[0]
74  return interpolated
75 
76 
77 ##### R testing functions
78 
79 def anova(*args):
80  """perform anova using R and return statistic, p-value, between and within variance"""
81  ngroups = len(args) #number of groups
82  #nreps = len(args[0]) #number of repetitions
83  #group = r.gl(ngroups,nreps)
84  reps = r.rep(0,len(args[0]))
85  weight = robjects.FloatVector(args[0])
86  for i in xrange(1,len(args)):
87  reps += r.rep(i,len(args[i]))
88  weight += robjects.FloatVector(args[i])
89  group = r.factor(reps)
90  robjects.globalenv["weight"] = weight
91  robjects.globalenv["group"] = group
92  lm = r.lm("weight ~ group")
93  aov = r.anova(lm)
94  prdb(aov)
95  # F statistic, p-value, between and within variance
96  anova_result = {'fstat':aov[3][0],
97  'pval':aov[4][0],
98  'between':aov[2][0],
99  'within':aov[2][1],
100  'nsteps':[len(i) for i in args],
101  'nreps':ngroups,
102  'test': 'anova'} #nreps: number of replicas
103  return anova_result
104 
105 def kruskal(*args):
106  """perform kruskal-wallis rank test"""
107  ngroups = len(args)
108  #nreps = len(args[0])
109  #group = r.gl(ngroups,nreps)
110  reps = r.rep(0,len(args[0]))
111  weight = robjects.FloatVector(args[0])
112  for i in xrange(1,len(args)):
113  reps += r.rep(i,len(args[i]))
114  weight += robjects.FloatVector(args[i])
115  group = r.factor(reps)
116  aov = r('kruskal.test')(group,weight)
117  prdb(aov)
118  kruskal_result = {'fstat':aov[0][0],
119  'pval':aov[2][0],
120  'nsteps':[len(i) for i in args],
121  'nreps':ngroups,
122  'test':'kruskal'}
123  return kruskal_result # F statistic and p-value
124 
125 def ttest(obs, target):
126  """perform a one-sample two-sided t-test on obs against target mean"""
127  test = r('t.test')(robjects.IntVector(obs),mu=target)
128  return test[0][0],test[2][0] #stat and p-value
129 
130 def binom(obs, target):
131  """perform an exact binomial test on the mean of obs against target"""
132  success = sum(obs)
133  trials = len(obs)
134  test = r('binom.test')(success,trials,p=target)
135  return test[0][0],test[2][0] #stat and p-value
136 
137 def bartlett(*args):
138  """perform bartlett's test on the equality of variances of the observations"""
139  ngroups = len(args)
140  nreps = len(args[0])
141  group = r.gl(ngroups,nreps)
142  weight = robjects.IntVector(args[0])
143  for i in args[1:]:
144  weight += robjects.IntVector(i)
145  robjects.globalenv["weight"] = weight
146  robjects.globalenv["group"] = group
147  var = r('bartlett.test')(weight,group)
148  return var[0][0], var[2][0] # statistic and p-value
149 
150 def fligner(*args):
151  """perform Fligner-Killeen non-parametric test of the variance equality"""
152  ngroups = len(args)
153  nreps = len(args[0])
154  group = r.gl(ngroups,nreps)
155  weight = robjects.IntVector(args[0])
156  for i in args[1:]:
157  weight += robjects.IntVector(i)
158  robjects.globalenv["weight"] = weight
159  robjects.globalenv["group"] = group
160  var = r('fligner.test')(weight,group)
161  return var[0][0], var[2][0] # statistic and p-value
162 
163 def power_test(ar, power=0.8, alpha=0.05):
164  """perform an anova power test and return
165  - the power of the test with this input data
166  - the number of points that would be needed to achieve a default power of 0.8
167  ar: the ouput of anova()
168  """
169  result = r('power.anova.test')(groups=ar['nreps'], n=min(ar['nsteps']),
170  between=ar['between'], within=ar['within'], sig=alpha)
171  prdb('the power of this anova was: %.3f' % result[5][0])
172  result = r('power.anova.test')(groups=ar['nreps'],
173  between=ar['between'], within=ar['within'], sig=alpha, pow=power)
174  prdb('To have a power of %.3f, there should be at least %d exchange attemps.' % (power, result[1][0]) )
175  return
176 
177 def minimum_n(ar, alpha=0.05):
178  """This routine tries to return an estimate of the additional number of exchange trials that
179  could lead to a positive result of the anova (e.g. the average ARs are not the same). It is
180  still very crude. It also assumes that a one-way anova was made.
181  ar: the output of anova()
182  alpha: type I error
183  """
184  nreps = ar['nreps']
185  nsteps = ar['nsteps']
186  try:
187  nsteps = min(nsteps)
188  except:
189  pass
190  fstat = ar['fstat']
191  return nsteps*(sqrt(Finv(1-alpha,nreps-1,nreps*(nsteps-1))/fstat)-1)
192 
193 
194 ##### Heat capacity class
195 class CvEstimator:
196  """When created, estimates the heat capacity from the energies or from the
197  indicator functions using the specified method. Two methods are then
198  available to guess the heat capacity at a given parameter value: get()
199  returns a single point, and mean() returns the linear average between two
200  points.
201  """
202 
203  def __init__(self, params, energies=None, indicators=None,
204  method="constant", temps=None, write_cv = False):
205 
206  kB = 1.3806503 * 6.0221415 / 4184.0 # Boltzmann constant in kcal/mol/K
207  self.__initialized = False
208  self.__cv = []
209  self.method = method
210 
211  if method == "interpolate":
212  self.estimate_cv_interpolate(params, indicators)
213  self.get = self.get_interp
214  self.mean = self.mean_interp
215  elif method == "constant":
216  self.estimate_cv_constant(params, indicators)
217  self.get = lambda p: self.__cv
218  self.mean = lambda p1,p2: self.__cv
219  self.__cvfun = self.get
220  r('cvspline <- function(x) {%f}' % self.__cv)
221  elif method == "mbar":
222  self.estimate_cv_mbar(params, energies, temps)
223  self.get = self.get_mbar
224  self.mean = self.mean_mbar
225  else:
226  raise NotImplementedError, method
227 
228  self.__initialized = True
229 
230  #write the heat capacity to a file
231  if write_cv:
232  fl=open('cv','w')
233  fl.write("".join(["%f %f\n" % (x,self.get(x)) for x in linspace(params[0]/2,2*params[-1])]))
234  fl.close()
235 
236  def estimate_cv_interpolate(self, params, indicators):
237  """interpolate using previous values, by reversing the approximate overlap
238  function
239  """
240  if self.__initialized :
241  return
242  if len(indicators) != len(params) - 1:
243  raise ValueError, \
244  "the length of indicators and params does not match!"
245  if params != tuple(sorted(params)):
246  raise NotImplementedError, \
247  "unable to work on parameters that do not change monotonically"
248 
249  prdb("storing params and means")
250  self.__params = params
251  self.__pmeans = [(params[i] + params[i+1])/2. \
252  for i in xrange(len(params)-1)]
253  prdb("computing __cv")
254  for i,ind in enumerate(indicators):
255  mean = sum(ind)/float(len(ind))
256  Y2 = 2*kB*erfinv(1-mean)**2
257  p1=params[i]
258  p2=params[i+1]
259  self.__cv.append((self.__pmeans[i], (p1**2+p2**2)*float(Y2)/(p2-p1)**2))
260  prdb(self.__params)
261  prdb(self.__cv)
262  self.__cvmean = sum([ i[1] for i in self.__cv])/float(len(self.__cv))
263  if self.__cvmean < 0:
264  raise ValueError, "Cv mean is negative!"
265  self.__cvfun = spline(self.__cv, self.__cvmean)
266  return
267 
268  def estimate_cv_constant(self, params, indicators):
269  """try to guess which constant cv fits best"""
270  if self.__initialized :
271  return
272  self.estimate_cv_interpolate(params, indicators)
273  self.__cv = self.__cvmean
274  return
275 
276  def needs_init(self):
277  if not self.__initialized:
278  raise RuntimeError, "Class was not initialized correctly!"
279 
280  def estimate_cv_mbar(params, energies, temps):
281  "use MBAR to get the heat capacity"
282  raise NotImplementedError, method
283  if self.__initialized :
284  return
285 
286  def _isinbounds(self,p,params):
287  """returns True if p is within parms, else false. the params list must be sorted ascendingly."""
288  if p < params[0] - EPSILON or p > params[-1] + EPSILON:
289  #prdb("Warning: value %f is outside of bounds, extrapolating." % p)
290  return False
291  else:
292  return True
293 
294  def _interpolate(self,xval,xlist):
295  """return interpolation of Cv at point xval, and return the average instead
296  if this value is negative.
297  """
298  self._isinbounds(xval,xlist)
299  val = self.__cvfun(xval)
300  if val > 0:
301  return val
302  else:
303  return self.__cvmean
304 
305  def get_interp(self,param):
306  """returns the point estimate of the first derivative of the energy with
307  respect to the replica exchange parameter (usually T or q).
308  This version assumes that the means of cv are given.
309  """
310  self.needs_init()
311  return self._interpolate(param, self.__pmeans)
312 
313  def get_mbar(self,param):
314  """returns the point estimate of the first derivative of the energy with
315  respect to the replica exchange parameter (usually T or q).
316  This version assumes that the values of cv are given.
317  """
318  self.needs_init()
319  return self._interpolate(param, self.__params)
320 
321  def mean_interp(self,pa,pb):
322  """estimate the mean of Cv between two points. Here the means were
323  stored previously
324  """
325  self.needs_init()
326  return self._interpolate((pa+pb)/2., self.__pmeans)
327 
328  def mean_mbar(self,pa,pb):
329  self.needs_init()
330  return (self.get_mbar(pb) + self.get_mbar(pa)) / 2.
331 
332 ##### Parameter updating methods
333 def update_good_dumb(newp, oldp, *args, **kwargs):
334  """Here the old parameters are oldp[0] and oldp[1], and the starting point
335  is newp[0]. We should modify newp[1] so that the AR in the following cycle
336  is equal to the targetAR.
337  In the "dumb" method, the Cv and targetAR keywords are ignored.
338  Here the newp[1] parameter is modified because prior changes have set
339  newp[0] to a different value than oldp[0]. Thus, we should move newp[1] by
340  minimizing the effect on the AR since it is supposedly equal to targetAR. In
341  this simple method, the parameter is just translated.
342  """
343  prdb("newp[0] has moved (%.3f -> %.3f), adjusting the position of newp[1]" %
344  (oldp[0],newp[0]))
345  return oldp[1] - (oldp[0] - newp[0])
346 
347 def update_bad_dumb(newp, oldp, ind, targetAR = 0.4, scale=0.1, **kwargs):
348  """Here the old parameters are oldp[0] and oldp[1], and the starting point
349  is newp[0]. We should modify newp[1] so that the AR in the following cycle
350  is equal to the targetAR.
351  In the "dumb" method, the Cv keyword is ignored. Here the newp[1]
352  parameter is modified to follow the possible translation of newp[0] by
353  calling update_good_dumb, and then newp[1] is added or substracted scale% of
354  (oldp[1] - oldp[0]) to adjust to targetAR.
355  """
356 
357  if newp[0] != oldp[0]:
358  newp[1] = update_good_dumb(newp,oldp)
359  if targetAR > sum(ind)/float(len(ind)):
360  prdb("""target AR is higher than expected, decreasing newp[1]""")
361  newp[1] -= scale * (oldp[1]-oldp[0])
362  else:
363  prdb("""target AR is lower than expected, increasing newp[1]""")
364  newp[1] += scale * (oldp[1]-oldp[0])
365  return newp[1]
366 
367 def update_any_cv_step(newp, oldp, ind, targetAR = 0.4, Cv = None, **kwargs):
368  """here we use the average AR formula of two gaussians to get newp[1] as a
369  function of newp[1], knowing the targetAR and estimating the Cv. If targetAR
370  is negative, consider that mean(ind) equals the target AR and skip any
371  calculation in the case that oldp[0] equals newp[0].
372  step: suppose the heat capacity is stepwise constant, i.e. use the heat
373  capacity at position newp[0] as an estimate of the mean of the heat capacity
374  between newp[0] and newp[1]. This does not require any self-consistent loop.
375  """
376 
377  global kB
378 
379  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
380  return oldp[1]
381  if targetAR < 0:
382  targetAR = sum(ind)/float(len(ind))
383  cv = Cv.get(newp[0])
384  Y = sqrt(2*kB)*float(erfinv(1-targetAR))
385  if Y**2 >= cv:
386  raise ValueError, """targetAR too small for this approximate method, use
387  the full self-consistent method instead."""
388  return newp[0]*(cv+Y*sqrt(2*cv - Y**2))/(cv-Y**2)
389 
390 def update_any_cv_sc(newp, oldp, ind, targetAR=0.4, Cv=None,
391  tol=1e-6, maxiter=10000):
392  """self-consistent solver version"""
393 
394  global kB
395 
396  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
397  return oldp[1]
398  if targetAR < 0:
399  targetAR = sum(ind)/float(len(ind))
400  cv = Cv.get(newp[0])
401  Y = sqrt(2*kB)*float(erfinv(1-targetAR))
402  if Y**2 >= cv:
403  raise ValueError, """targetAR too small for this approximate method, use
404  the full self-consistent method instead."""
405  targetp = newp[0]*(cv+Y*sqrt(2*cv - Y**2))/(cv-Y**2)
406  for i in xrange(maxiter):
407  cv = Cv.mean(newp[0],targetp)
408  (oldtargetp, targetp) = (targetp, newp[0]*(cv+Y*sqrt(2*cv-Y**2))/(cv-Y**2))
409  if abs(targetp - oldtargetp) <= tol:
410  break
411  if isnan(targetp):
412  if Y**2 >= cv:
413  raise ValueError, """targetAR too small for this approximate method, use
414  the full self-consistent method instead."""
415  else:
416  raise ValueError, """something unexpected happened"""
417  if i == maxiter - 1:
418  prdb("""Warning: unable to converge the self-consistent after %d
419  iterations and a tolerance of %f. Change the method or decrease the
420  tolerance!""" % (maxiter, tol))
421  prdb("converged after %d iterations and a tolerance of %f for x=%f" %
422  (i,tol,oldp[1]))
423  return targetp
424 
425 def update_any_cv_scfull(newp, oldp, ind, targetAR=0.4, Cv=None,
426  tol=1e-6, maxiter=10000):
427  """self-consistent solver version, on the exact average AR equation"""
428 
429  #create helper functions and overlap function
430  #_ru21 = r('u21 <- function(t1,t2) { (t2-t1)*(cvspline(t2)-cvspline(t1))/2. }')
431  _ru21 = r('u21 <- function(t1,t2) { integrate(Vectorize(cvspline),t1,t2)$value }')
432  _rb21 = r('b21 <- function(t1,t2) { 1./(kB*t2) - 1./(kB*t1) }')
433  _rsigma2 = r('sigma2 <- function(t) {cvspline(t)*kB*t**2}')
434  _rovboltz = r('ovboltz <- function(t1,t2) {\
435  1/2*( 1-erf(\
436  u21(t1,t2)/sqrt(2*(sigma2(t1)+sigma2(t2))))\
437  + exp(b21(t1,t2)*(u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2))/2))\
438  * (1+erf((u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2)))/(sqrt(2*(sigma2(t1)+sigma2(t2))))))\
439  )}')
440  _rrootfn = r('rootfn <- function(t2) {ovboltz(%f,t2)-%f}' % (newp[0],targetAR))
441 
442  #find upper bound for estimation, raise an error if cv is negative
443  if oldp[1] > oldp[0]:
444  tmp = newp[0]*1.05
445  else:
446  tmp = newp[0]*0.95
447  nloops = 0
448  while _rrootfn(tmp)[0] >= 0:
449  nloops += 1
450  tmp += (oldp[1] - oldp[0])
451  if Cv.get(tmp) < 0:
452  raise RuntimeError, "heat capacity goes negative"
453  if nloops > maxiter:
454  raise RuntimeError,'could not find zero of function!'
455 
456  #find root
457  _runiroot = r('uniroot(rootfn,c(%f,%f),f.lower = %f, f.upper = %f, tol = %f, maxiter = %d)' % (newp[0],tmp,
458  1-targetAR, -targetAR, tol, maxiter))
459  prdb("self-consistent solver converged after %s iterations and an estimated precision of %s " % (_runiroot[2][0],_runiroot[3][0]))
460  prdb(["root:",_runiroot[0][0],"overlap:", _rovboltz(newp[0],_runiroot[0][0])[0]])
461  return _runiroot[0][0]
462 
463 def update_any_cv_nr(newp, oldp, ind, targetAR = 0.4, Cv = None, **kwargs):
464  """newton-raphson solver version"""
465 
466  #use nlm
467  raise NotImplementedError
468 
469 ##### Testing methods
470 def are_equal_to_targetAR(indicators,targetAR=0.4,alpha=0.05, method = "binom"):
471  """here, all indicators have same average, we want to know if it is equal to
472  targetAR
473  """
474 
475  #calculate sample mean deviation of each indicator function from targetAR
476  deviations=[(abs(sum(ind)/float(len(ind))-targetAR),ind) for pos,ind in enumerate(indicators)]
477  deviations.sort()
478  deviant = deviations[-1]
479 
480  #perform t-test
481  if method == "ttest":
482  #from statlib.stats import ttest_1samp as ttest
483  ttest = ttest
484  elif method == "binom":
485  ttest = binom
486  else:
487  raise NotImplementedError
488 
489  try:
490  test,pval = ttest(deviant[1],targetAR)
491  except:
492  if abs(targetAR - sum(deviant[1])/len(deviant[1])) > EPSILON:
493  pval = 0
494  else:
495  pval = 1
496  if pval < alpha:
497  return False
498  else:
499  return True
500 
501 def are_stationnary(indicators, alpha=0.05, method = "anova"):
502  """test on the stationarity of the observations (block analysis). Done so by
503  launching an anova on the difference between the two halves of each observations.
504  """
505 
506  if method == "kruskal":
507  test = kruskal
508  else:
509  test = anova
510 
511  tmp = array(indicators)
512  blocklen = len(indicators[0])/2
513  block = tmp[:,:blocklen] - tmp[:,blocklen:2*blocklen]
514  if test(*block)['pval'] < alpha:
515  return False
516  else:
517  return True
518 
519 def are_equal(indicators, targetAR=0.4, alpha = 0.05,
520  method = "anova", varMethod = "skip", power=0.8):
521  """Perform a one-way ANOVA or kruskal-wallis test on the indicators set,
522  and return True if one cannot exclude with risk alpha that the indicators
523  AR are different (i.e. True = all means are equal). Also performs a test
524  on the variance (they must be equal).
525  """
526 
527  if min(targetAR,1-targetAR)*len(indicators[0]) <= 5 and \
528  (varMethod == "bartlett" or method == "anova"):
529  prdb("Warning: normal approximation to the binomial does not hold!")
530 
531  #test the variances
532  if varMethod == "skip":
533  pass
534  else:
535  if varMethod == "bartlett":
536  pval = bartlett(*indicators)[1]
537  elif varMethod == "fligner":
538  pval = killeen(*indicators)[1]
539  else:
540  raise NotImplementedError, "variance testing method unknown: %s" % varMethod
541  if pval < alpha:
542  prdb("Warning: performing mean test with unequal variances.")
543 
544 
545  if method == "kruskal":
546  test = kruskal
547  else:
548  test = anova
549 
550  tr = test(*indicators)
551  tr['alpha']=alpha
552  # p-value < alpha => H0 rejected => result == False
553  tr['result']= tr['pval'] >= alpha
554 
555  #if tr['test'] == 'anova':
556  #try:
557  # power_test(tr, power=power, alpha=alpha)
558  #except:
559  # prdb("power test failed")
560  # pass
561 
562  return tr
563 
564 def find_good_ARs(indicators, targetAR=0.4, alpha=0.05, method="binom"):
565  """perform one-sample t-tests on each of the data sets, and return
566  a tuple of bool of size N-1, False if AR i is not equal to targetAR
567  at risk alpha.
568  """
569 
570  #calculate sample means of each indicator function
571  means=[(sum(ind)/float(len(ind)),pos,ind) for pos,ind in enumerate(indicators)]
572  means.sort()
573 
574  #perform t-test
575  if method == "ttest":
576  #from statlib.stats import ttest_1samp as ttest
577  ttest = ttest
578  elif method == "binom":
579  ttest = binom
580  else:
581  raise NotImplementedError
582 
583  isGoodTuple = []
584  #start from the lowest means and stop when they are ok
585  prdb("starting left")
586  for (i,(mean,pos,ind)) in enumerate(means):
587  prdb("performing t-test on couple %d having average AR %f, position %d" % (pos,mean,i))
588  try:
589  test,pval = ttest(ind,targetAR)
590  except:
591  if abs(targetAR - mean) > EPSILON:
592  pval = 0
593  else:
594  pval = 1
595  if pval < alpha:
596  #means are different
597  isGoodTuple.append((pos,False))
598  else:
599  goodstart = i
600  break
601  #then start from the highest means
602  prdb("starting right")
603  for (i,(mean,pos,ind)) in enumerate(reversed(means)):
604  prdb("performing t-test on couple %d having average AR %f, position %d" \
605  % (pos,mean,len(means)-1-i))
606  if ttest(ind,targetAR)[1] < alpha:
607  #means are different
608  isGoodTuple.append((pos,False))
609  else:
610  goodstop = len(means)-1-i
611  break
612 
613  #limiting cases: all different
614  if len(isGoodTuple) > len(indicators):
615  return tuple([False]*len(indicators))
616  #all equal
617  elif len(isGoodTuple) == 0:
618  return tuple([True]*len(indicators))
619  #intermediate
620  else:
621  isGoodTuple.extend([(means[i][1],True) for i in
622  xrange(goodstart,goodstop+1)])
623  isGoodTuple.sort()
624  return tuple([tup[1] for tup in isGoodTuple])
625 
626 ##### Trebst, Katzgraber, Nadler and Hansmann optimum flux stuff
627 
628 def mean_first_passage_times(replicanums_ori, subs=1, start=0, use_avgAR=False):
629  """compute mean first passage times as suggested in
630  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
631 
632  use_avgAR : if a list of average ARs is given computes everything from
633  average AR; if it is False, compute by direct counting of events.
634 
635  returns:
636  If use_avgAR == False:
637  tau0, tauN, chose_N, times0, timesN
638  else:
639  tau0, tauN, None, None, None
640  tau0[i]: average time to go from replica 0 to replica i
641  tauN[i]: average time to go from replica N to replica i
642  times0 and timesN are the corresponding lists of single events.
643 
644  """
645 
646  from numpy import array,zeros
647  from pprint import pprint,pformat
648  replicanums = array(replicanums_ori)[:,start::subs]
649  N = len(replicanums)
650  tauN = [0]*N
651  tau0 = [0]*N
652 
653  if use_avgAR:
654  tau0[0] = 0.0
655  tauN[-1] = 0.0
656  for state in xrange(1,N):
657  tau0[state] = tau0[state-1] + state/(float(use_avgAR[state-1]))
658  for state in reversed(xrange(1,N)):
659  tauN[state-1] = tauN[state] \
660  + (N-(state-1))/(float(use_avgAR[state-1]))
661 
662  return tau0, tauN, None, None, None
663 
664  else:
665  #prdb('not using average AR')
666  from itertools import izip
667  #the algorithm looks for replicas that start at the lowest temp, and
668  #records the farthest state it went to before returning to zero. Once
669  #back it increments the counter of all concerned replicas. Similar
670  #procedure if starting from N.
671  store0 = zeros((N,N), dtype=bool)
672  last0=[0 for i in xrange(N)]
673  already0 = [False for i in xrange(N)]
674  times0=[[] for i in xrange(N)]
675  storeN = zeros((N,N), dtype=bool)
676  lastN=[0 for i in xrange(N)]
677  alreadyN = [False for i in xrange(N)]
678  timesN=[[] for i in xrange(N)]
679 
680  #prdb('looping over replicanums')
681  for time, frame in enumerate(izip(*replicanums)):
682  #case of the replica in state 0
683  if not already0[frame[0]]:
684  last0[frame[0]] = time
685  store0[frame[0],:] = True
686  already0[frame[0]] = True
687  #case of the replica in state N
688  if not alreadyN[frame[-1]]:
689  lastN[frame[-1]] = time
690  storeN[frame[-1],:] = True
691  alreadyN[frame[-1]] = True
692  #set already flags to False when in state 1 or N-1
693  already0[frame[1]] = False
694  alreadyN[frame[-2]] = False
695  #loop over all states
696  for state,rep in enumerate(frame):
697  if store0[rep,state]:
698  #reached a state for the first time since 0
699  store0[rep,state] = False
700  #store time since this replica left state 0
701  times0[state].append(time - last0[rep])
702  if storeN[rep,state]:
703  #reached a state for the first time since N
704  storeN[rep,state] = False
705  #store time since this replica left state N
706  timesN[state].append(time - lastN[rep])
707  #prdb([replicanums.shape, len(storeN), len(last0)])
708  times = [[] for i in xrange(N)]
709  chose_N = [len(timesN[state]) > len(times0[state]) for state in \
710  xrange(N)]
711  for state in xrange(N):
712  tauN[state] = sum(timesN[state])/float(len(timesN[state]))
713  tau0[state] = sum(times0[state])/float(len(times0[state]))
714  #prdb(len(chose_N))
715 
716  return tau0, tauN, chose_N, times0, timesN
717 
718 def compute_effective_fraction(tau0, tauN, chose_N):
719  """input: tau0, tauN, chose_N
720  output: effective fraction f(T) (P_up(n)) as introduced in
721  Trebst S, Troyer M, Hansmann UHE, J Chem Phys *124* 174903 (2006).
722  formalized in
723  Nadler W, Hansmann UHE, Phys Rev E *75* 026109 (2007)
724  and whose calculation is enhanced in
725  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
726  the ideal value of f(n) should be 1 - n/N
727  """
728 
729  #nstar is the index of the last state where tau0 should be used.
730  N = len(tau0)
731  if chose_N is None:
732  nstar =N/2
733  else:
734  nstar = N - sum([int(a) for a in chose_N]) - 1
735 
736  prdb("n* = %d" % nstar)
737  #compute helper functions h
738  h0=[0]*N
739  h0[1] = tau0[1]
740  for state in xrange(2,nstar+1):
741  h0[state] = h0[state-1] + (tau0[state]-tau0[state-1])/float(state)
742 
743  hN=[0]*N
744  hN[-2] = tauN[-2]
745  for state in reversed(xrange(nstar,N-1)):
746  hN[state] = hN[state+1] + \
747  (tauN[state]-tauN[state+1])/float(N-state)
748 
749  #compute flow probabilities
750  pup=[0]*N
751  pup[0]=1
752  for n in xrange(1,nstar+1):
753  pup[n] = 1 - h0[n]/(h0[nstar] + hN[nstar])
754  for n in xrange(nstar+1, N):
755  pup[n] = hN[n]/(h0[nstar]+hN[nstar])
756 
757  return pup
758 
759 def spline_diffusivity(pup,params):
760  """spline interpolation of diffusivity: D = 1/(df/dT * heta)
761  """
762  from numpy import linspace
763  robjects.globalenv["hetay"] = \
764  robjects.FloatVector(linspace(0,1,num=len(params)).tolist())
765  robjects.globalenv["hetax"] = robjects.FloatVector(params)
766  robjects.globalenv["pupx"] = robjects.FloatVector(params)
767  robjects.globalenv["pupy"] = robjects.FloatVector(pup)
768  heta=r('heta <- splinefun(hetax,hetay,method="monoH.FC")')
769  eff=r('eff <- splinefun(pupx,pupy,method="monoH.FC")')
770  diff = r('diff <- function(x) {-1/(heta(x,deriv=1)*eff(x,deriv=1))}')
771  return lambda x: diff(x)[0]
772 
773 
774 ##### Misc
775 def compute_indicators(replicanums, subs = 1, start=0):
776  """input: replicanums : a list of N lists of size M, where N is the number of
777  states and M is the length of the simulation. Each element is an integer,
778  and corresponds to the label of a replica.
779  output: an indicator function of exchanges (size (N-1)x(M-1)), 1 if exchange and
780  0 if not.
781  """
782  def exchange(n, m):
783  if replicanums[n][m] == replicanums[n+1][m+1] \
784  and replicanums[n][m+1] == replicanums[n+1][m]:
785  return 1
786  else:
787  return 0
788 
789  indicators = []
790  for n in xrange(len(replicanums)-1):
791  indicators.append([exchange(n, m) \
792  for m in xrange(len(replicanums[n])-1)][start::subs])
793  return indicators
794 
795 #### Main routines
796 
797 def update_params_nonergodic(pup, params, write_g=False, num=False):
798 
799  from numpy import linspace
800  #g = spline(zip(pup,params),0,method='monoH.FC')
801  g = linear_interpolation(zip(pup,params),0)
802  if write_g:
803  d = spline_diffusivity(pup,params)
804  fl=open('g','w')
805  fl.write("".join(["%f %f\n" % (x,g(x)) for x in linspace(0,1,num=100)]))
806  fl.close()
807  fl=open('diffusivity','w')
808  fl.write(''.join(["%f %f\n" % (x,d(x)) for x in
809  linspace(params[0],params[-1],num=100)]))
810  fl.close()
811  fl=open('pup','w')
812  fl.write("".join(["%f %f\n" % (i,j) for (i,j) in zip(params,pup)]))
813  fl.close()
814 
815 
816  if num is False:
817  newparams = [g(i) for i in reversed(linspace(0,1,num=len(params)))]
818  else:
819  newparams = [g(i) for i in reversed(linspace(0,1,num=num))]
820  #for numerical issues
821  newparams[0] = params[0]
822  newparams[-1] = params[-1]
823 
824  return newparams
825 
826 def update_params(indicators, params, isGood, targetAR = 0.4, immobilePoint = 1,
827  Cv = None, badMethod = "dumb", goodMethod = "dumb", dumb_scale = 0.1):
828  """update the parameters according to the isGood tuple and using the
829  specified methods"""
830 
831  newparams = list(params) # make a copy
832 
833  if immobilePoint != 1:
834  raise NotImplementedError
835 
836  if Cv is None and (badMethod != "dumb" or goodMethod != "dumb"):
837  raise RuntimeError, """Cv needs to be estimated if using other methods
838  than 'dumb' for updating!"""
839 
840  if goodMethod == "dumb":
841  update_good = update_good_dumb
842  elif goodMethod == "step":
843  update_good = update_any_cv_step
844  elif goodMethod == "sc":
845  update_good = update_any_cv_sc
846  elif goodMethod == "scfull":
847  update_good = update_any_cv_scfull
848  elif goodMethod == "nr":
849  update_good = update_any_cv_nr
850  else:
851  raise NotImplementedError, goodMethod
852  if badMethod == "dumb":
853  update_bad = update_bad_dumb
854  elif badMethod == "step":
855  update_bad = update_any_cv_step
856  elif badMethod == "sc":
857  update_bad = update_any_cv_sc
858  elif badMethod == "scfull":
859  update_bad = update_any_cv_scfull
860  elif badMethod == "nr":
861  update_bad = update_any_cv_nr
862  else:
863  raise NotImplementedError, badMethod
864 
865  #scan each position starting from the immobilePoint
866  for pos in xrange(len(params)-1):
867  if isGood[pos]:
868  newparams[pos+1] = update_good(newparams[pos:pos+2],params[pos:pos+2],
869  indicators[pos], targetAR = targetAR, Cv = Cv)
870  else:
871  newparams[pos+1] = update_bad(newparams[pos:pos+2],params[pos:pos+2],
872  indicators[pos], targetAR = targetAR, Cv = Cv, scale=dumb_scale)
873 
874  return tuple(newparams)
875 
876 def tune_params_flux(replicanums, params, subs=1, start=0, alpha = 0.05,
877  testMethod = 'anova', meanMethod = 'binom', use_avgAR=False,
878  power = 0.8, num=False):
879  #num is here if you want to add some more temperatures. indicate total
880  #number of replicas
881 
882  #TODO: do case where one estimates all based on target AR.
883  if use_avgAR is not False:
884  raise NotImplementedError
885 
886  prdb("computing mean first passage times")
887  tau0, tauN, chose_N, times0, timesN = mean_first_passage_times(replicanums,
888  subs=subs, start=start, use_avgAR=use_avgAR)
889 
890  prdb("average round trip time: %.2f (%d+%d events)" % \
891  (tau0[-1]+tauN[0],len(times0[-1]),len(timesN[0])))
892  prdb("checking if the parameterset needs to be improved")
893  N = len(replicanums)
894  if chose_N is None:
895  nstar =N/2
896  else:
897  nstar = N - sum([int(a) for a in chose_N]) - 1
898 
899  reduced = []
900  #no need to check for times?[0] or times?[N]
901  for n in xrange(1,N-1):
902  if n > nstar:
903  reduced.append([i*2.0/((N-n)*(N-n+1)) for i in timesN[n]])
904  else:
905  reduced.append([i*2.0/(n*(n+1)) for i in times0[n]])
906 
907  anova_result = are_equal(reduced, alpha = alpha, method = testMethod,
908  power=power)
909  if (anova_result['result']): #TODO test if equal to targetAR
910  prdb("flux is constant, nothing to do!")
911  min_n = minimum_n(anova_result, alpha)
912  prdb('Try to rerun this test with at least %d more samples.' % \
913  ceil(min_n))
914  return (False, min_n)
915 
916  #the flux is not constant so the parameters need improvement.
917  #calculate the estimate of the effective fraction
918  prdb("parameterset not optimal, computing effective fraction")
919  pup = compute_effective_fraction(tau0, tauN, chose_N)
920 
921  #improve parameterset
922  prdb("returning new parameterset")
923  params = update_params_nonergodic(pup,params,num=num)
924 
925  return (True, params)
926 
927 def tune_params_ar(indicators, params, targetAR = 0.4, alpha = 0.05, immobilePoint
928  = 1, CvMethod = "skip", badMethod = "dumb", goodMethod = "dumb",
929  varMethod = "skip", testMethod = "anova", meanMethod = "binom",
930  energies = None, temps = None, power=0.8, dumb_scale=0.1):
931  """Tune the replica-exchange parameters and return a new set.
932 
933  Arguments:
934  indicators -- an (N-1)x(M-1) table, where entry at position (j,i)
935  is True if replica j and j+1 performed an exchange between
936  times i and i+1.
937  params -- the current set of N parameters used in the simulation.
938 
939  Keyword arguments:
940  targetAR -- the target AR which is wanted for the simulation
941  (default: 0.4)
942  alpha -- the type 1 error on the one-way ANOVA and subsequent
943  t-tests (default: 5%)
944  immobilePoint -- which replica should keep it's parameter fixed
945  (default: 1st replica, e.g. 1)
946  CvMethod -- the heat capacity estimation method (default:
947  "skip", other options: "mbar", "spline", "constant")
948  badMethod -- how to correct the (j+1)th parameter if the
949  acceptance ratio between replicas j and j+1 is off the
950  target value (default: "dumb", options: "step", "sc", "scfull", "nr")
951  goodMethod -- how to update the value of the (j+1)th parameter
952  in the case of a correctly exchanging couple, but if the jth
953  parameter has been modified (default: "dumb",options: "step",
954  "sc" self-consistent, "scfull" self-consistent using the exact equation,
955  "nr" newton-raphson solver for the exact equation)
956  dumb_scale -- (0.0-1.0) in the "dumb" method, scale wrong temperature
957  intervals by this amount. (default: 0.1)
958  testMethod -- how to test for the difference of the means,
959  either "anova" for a one-way anova, or "kruskal" for a
960  Kruskal-Wallis one-way non-parametric anova.
961  meanMethod -- "ttest" for a two-sided one-sample t-test,
962  "binom" for an exact binomial test of the probability.
963  varMethod -- how to test for the equality of variances.
964  "fligner" for the Fligner-Killeen non-parametric test,
965  "bartlett" for Bartlett's test, "skip" to pass.
966  energies -- if CvMethod is set to "mbar", the energies of each
967  state as a function of time are used to estimate the heat capacity.
968  temps -- the temperatures of the simulations, if estimating with "mbar".
969 
970  Return Value:
971  returns a tuple: (bool, params). bool is True if params have
972  changed, and params is the new set.
973 
974  """
975 
976  #perform ANOVA
977  prdb("performing ANOVA")
978  anova_result = are_equal(indicators, targetAR, alpha, method = testMethod,
979  varMethod = varMethod, power=power)
980  if (anova_result['result'] and
981  are_equal_to_targetAR(indicators,targetAR,alpha, method = meanMethod)):
982  prdb("all means are equal to target AR, nothing to do!")
983  min_n = minimum_n(anova_result, alpha)
984  prdb('Try to rerun this test with at least %d more samples.' % ceil(min_n))
985  return (False, min_n)
986  prdb("some means are different, performing t-tests")
987 
988  #perform two-by-two t-tests
989  isGood = find_good_ARs(indicators, targetAR, alpha, method = meanMethod)
990  if not (False in isGood):
991  prdb("""Bad luck: ANOVA says means are not identical, but t-tests
992  can't find the bad rates...""")
993  return (False, params)
994  prdb(isGood)
995 
996  #check if data is stationnary by doing a block analysis
997  prdb("performing stationarity test")
998  if not are_stationnary(indicators, alpha):
999  prdb("Warning: Some simulations are not stationary!")
1000 
1001  #interpolate the heat capacity from the data
1002  #TODO: use all previous data, not just from this run.
1003  prdb("launching Cv estimation or skipping it")
1004  if CvMethod == "skip":
1005  Cv = None
1006  elif CvMethod == "interpolate" or CvMethod == "constant":
1007  Cv = CvEstimator(params, indicators=indicators, method=CvMethod)
1008  elif CvMethod == "mbar":
1009  Cv = CvEstimator(params, energies=energies, temps=temps, method = CvMethod)
1010  else:
1011  raise NotImplementedError, CvMethod
1012 
1013  #update parameters
1014  prdb('updating params')
1015  #update the current parameter set to match the target AR
1016  params = update_params(indicators, params, isGood, targetAR = targetAR,
1017  immobilePoint = immobilePoint, Cv = Cv,
1018  badMethod = badMethod, goodMethod = goodMethod, dumb_scale=dumb_scale)
1019 
1020  prdb('Done')
1021  return (True, params)
1022 
1023 if __name__ == '__main__':
1024  from numpy import *
1025  replicanums=[]
1026  for i in xrange(1,8):
1027  replicanums.append(tuple(fromfile('data/replica-indices/%d.rep' % i,
1028  dtype=int, sep='\n')))
1029 
1030  prdb("replicanums: %dx%d" % (len(replicanums), len(replicanums[0])))
1031  params = tuple(fromfile('data/temperatures', sep=' '))
1032 
1033  prdb(params)
1034  indicators = compute_indicators(replicanums, subs=1,start=0)
1035  prdb("indicators: %dx%d" % (len(indicators), len(indicators[0])))
1036  prdb("Exchange rate:")
1037  prdb(array([sum(ind)/float(len(ind)) for ind in indicators]))
1038  array([sum(ind)/float(len(ind)) for ind in
1039  indicators]).tofile('xchgs',sep='\n')
1040  changed, newparams = tune_params(indicators, params, targetAR = 0.25,
1041  badMethod="dumb",goodMethod="dumb",CvMethod="skip",testMethod="anova",alpha=0.05)
1042  if not changed:
1043  print "Parameter set seems optimal."
1044  else:
1045  if not True in [abs(newparams[i+1]-newparams[i]) < 1e-3 for i in xrange(len(newparams)-1)]:
1046  array(newparams).tofile('data/temperatures', sep=' ')
1047  else:
1048  print "PROBLEM IN NEW PARAMETERSET -> not saved"
1049  print "params :",params
1050  print "new params:",newparams