IMP logo
IMP Reference Guide  develop.7400db2aee,2024/11/23
The Integrative Modeling Platform
TuneRex.py
1 #!/usr/bin/env python
2 
3 
4 __doc__ = """
5 This module provides a few methods to improve the efficiency of a
6 replica-exchange simulation by tuning its parameters.
7 Author: Yannick Spill
8 """
9 
10 import sys
11 import rpy2.robjects as robjects
12 
13 kB = 1.3806503 * 6.0221415 / 4184.0 # Boltzmann constant in kcal/mol/K
14 # here for float comparison. Floats are equal if their difference
15 # is smaller than EPSILON
16 EPSILON = 1e-8
17 debug = False
18 
19 
20 def prdb(arg):
21  if debug:
22  print(arg)
23 
24 
25 # R compatibility functions
26 r = robjects.r
27 robjects.globalenv["kB"] = kB
28 _rinverf = r('invErf <- function(x) {qnorm((1 + x) /2) / sqrt(2)}')
29 _rerf = r('erf <- function(x) {2 * pnorm(x * sqrt(2)) - 1}')
30 _rinvF = r('qf')
31 _rinterp = None
32 
33 
34 def erfinv(x):
35  return _rinverf(x)[0]
36 
37 
38 def erf(x):
39  return _rerf(x)[0]
40 
41 
42 def Finv(x, d1, d2):
43  return _rinvF(x, d1, d2)[0]
44 
45 
46 def spline(xy, mean, method=None):
47  """spline interpolation of (x,y) coordinates. If interpolation goes
48  negative, replace by mean value.
49  """
50  x, y = list(zip(*xy))
51  robjects.globalenv["x"] = robjects.FloatVector(x)
52  robjects.globalenv["y"] = robjects.FloatVector(y)
53  global _rinterp
54  # _rinterp = r.splinefun(x,y)
55  if method is None:
56  r('cvsplinenonbounded <- splinefun(x,y)')
57  else:
58  r('cvsplinenonbounded <- splinefun(x,y,method="%s")' % method)
59  _rinterp = r(
60  'cvspline <- function(x) { tmp = cvsplinenonbounded(x); '
61  'if (tmp>0) {tmp} else {%f}}' %
62  mean)
63 
64  def interpolated(x):
65  global _rinterp
66  return _rinterp(x)[0]
67  return interpolated
68 
69 
70 def linear_interpolation(xy, mean):
71  """linear interpolation of (x,y) coordinates. No extrapolation possible.
72  """
73  x, y = list(zip(*xy))
74  robjects.globalenv["x"] = robjects.FloatVector(x)
75  robjects.globalenv["y"] = robjects.FloatVector(y)
76  global _rinterp
77  # _rinterp = r.splinefun(x,y)
78  _rinterp = r('cvspline <- approxfun(x,y)')
79 
80  def interpolated(x):
81  global _rinterp
82  return _rinterp(x)[0]
83  return interpolated
84 
85 
86 # R testing functions
87 
88 def anova(*args):
89  """perform anova using R and return statistic, p-value, between and
90  within variance"""
91  ngroups = len(args) # number of groups
92  # nreps = len(args[0]) #number of repetitions
93  # group = r.gl(ngroups,nreps)
94  reps = r.rep(0, len(args[0]))
95  weight = robjects.FloatVector(args[0])
96  for i in range(1, len(args)):
97  reps += r.rep(i, len(args[i]))
98  weight += robjects.FloatVector(args[i])
99  group = r.factor(reps)
100  robjects.globalenv["weight"] = weight
101  robjects.globalenv["group"] = group
102  lm = r.lm("weight ~ group")
103  aov = r.anova(lm)
104  prdb(aov)
105  # F statistic, p-value, between and within variance
106  anova_result = {'fstat': aov[3][0],
107  'pval': aov[4][0],
108  'between': aov[2][0],
109  'within': aov[2][1],
110  'nsteps': [len(i) for i in args],
111  'nreps': ngroups,
112  'test': 'anova'} # nreps: number of replicas
113  return anova_result
114 
115 
116 def kruskal(*args):
117  """perform kruskal-wallis rank test"""
118  ngroups = len(args)
119  # nreps = len(args[0])
120  # group = r.gl(ngroups,nreps)
121  reps = r.rep(0, len(args[0]))
122  weight = robjects.FloatVector(args[0])
123  for i in range(1, len(args)):
124  reps += r.rep(i, len(args[i]))
125  weight += robjects.FloatVector(args[i])
126  group = r.factor(reps)
127  aov = r('kruskal.test')(group, weight)
128  prdb(aov)
129  kruskal_result = {'fstat': aov[0][0],
130  'pval': aov[2][0],
131  'nsteps': [len(i) for i in args],
132  'nreps': ngroups,
133  'test': 'kruskal'}
134  return kruskal_result # F statistic and p-value
135 
136 
137 def ttest(obs, target):
138  """perform a one-sample two-sided t-test on obs against target mean"""
139  test = r('t.test')(robjects.IntVector(obs), mu=target)
140  return test[0][0], test[2][0] # stat and p-value
141 
142 
143 def binom(obs, target):
144  """perform an exact binomial test on the mean of obs against target"""
145  success = sum(obs)
146  trials = len(obs)
147  test = r('binom.test')(success, trials, p=target)
148  return test[0][0], test[2][0] # stat and p-value
149 
150 
151 def bartlett(*args):
152  """perform bartlett's test on the equality of variances of the
153  observations"""
154  ngroups = len(args)
155  nreps = len(args[0])
156  group = r.gl(ngroups, nreps)
157  weight = robjects.IntVector(args[0])
158  for i in args[1:]:
159  weight += robjects.IntVector(i)
160  robjects.globalenv["weight"] = weight
161  robjects.globalenv["group"] = group
162  var = r('bartlett.test')(weight, group)
163  return var[0][0], var[2][0] # statistic and p-value
164 
165 
166 def fligner(*args):
167  """perform Fligner-Killeen non-parametric test of the variance equality"""
168  ngroups = len(args)
169  nreps = len(args[0])
170  group = r.gl(ngroups, nreps)
171  weight = robjects.IntVector(args[0])
172  for i in args[1:]:
173  weight += robjects.IntVector(i)
174  robjects.globalenv["weight"] = weight
175  robjects.globalenv["group"] = group
176  var = r('fligner.test')(weight, group)
177  return var[0][0], var[2][0] # statistic and p-value
178 
179 
180 def power_test(ar, power=0.8, alpha=0.05):
181  """perform an anova power test and return
182  - the power of the test with this input data
183  - the number of points that would be needed to achieve a default
184  power of 0.8
185  ar: the output of anova()
186  """
187  result = r('power.anova.test')(groups=ar['nreps'], n=min(ar['nsteps']),
188  between=ar['between'], within=ar['within'],
189  sig=alpha)
190  prdb('the power of this anova was: %.3f' % result[5][0])
191  result = r('power.anova.test')(groups=ar['nreps'],
192  between=ar['between'], within=ar['within'],
193  sig=alpha, pow=power)
194  prdb('To have a power of %.3f, there should be at least %d exchange '
195  'attempts.' % (power, result[1][0]))
196  return
197 
198 
199 def minimum_n(ar, alpha=0.05):
200  """This routine tries to return an estimate of the additional number of
201  exchange trials that could lead to a positive result of the anova (e.g.
202  the average ARs are not the same). It is still very crude. It also assumes
203  that a one-way anova was made.
204  ar: the output of anova()
205  alpha: type I error
206  """
207  nreps = ar['nreps']
208  nsteps = ar['nsteps']
209  try:
210  nsteps = min(nsteps)
211  except: # noqa: E722
212  pass
213  fstat = ar['fstat']
214  return nsteps * (numpy.sqrt(Finv(1 - alpha, nreps - 1,
215  nreps * (nsteps - 1)) / fstat) - 1)
216 
217 
218 # Heat capacity class
220 
221  """When created, estimates the heat capacity from the energies or from the
222  indicator functions using the specified method. Two methods are then
223  available to guess the heat capacity at a given parameter value: get()
224  returns a single point, and mean() returns the linear average between two
225  points.
226  """
227 
228  def __init__(self, params, energies=None, indicators=None,
229  method="constant", temps=None, write_cv=False):
230 
231  self.__initialized = False
232  self.__cv = []
233  self.method = method
234 
235  if method == "interpolate":
236  self.estimate_cv_interpolate(params, indicators)
237  self.get = self.get_interp
238  self.mean = self.mean_interp
239  elif method == "constant":
240  self.estimate_cv_constant(params, indicators)
241  self.get = lambda p: self.__cv
242  self.mean = lambda p1, p2: self.__cv
243  self.__cvfun = self.get
244  r('cvspline <- function(x) {%f}' % self.__cv)
245  elif method == "mbar":
246  self.estimate_cv_mbar(params, energies, temps)
247  self.get = self.get_mbar
248  self.mean = self.mean_mbar
249  else:
250  raise NotImplementedError(method)
251 
252  self.__initialized = True
253 
254  # write the heat capacity to a file
255  if write_cv:
256  fl = open('cv', 'w')
257  fl.write("".join(["%f %f\n" % (x, self.get(x))
258  for x in numpy.linspace(params[0] / 2, 2 * params[-1])]))
259  fl.close()
260 
261  def estimate_cv_interpolate(self, params, indicators):
262  """interpolate using previous values, by reversing the approximate
263  overlap function
264  """
265  if self.__initialized:
266  return
267  if len(indicators) != len(params) - 1:
268  raise ValueError(
269  "the length of indicators and params does not match!")
270  if params != tuple(sorted(params)):
271  raise NotImplementedError(
272  "unable to work on parameters that do not change "
273  "monotonically")
274 
275  prdb("storing params and means")
276  self.__params = params
277  self.__pmeans = [(params[i] + params[i + 1]) / 2.
278  for i in range(len(params) - 1)]
279  prdb("computing __cv")
280  for i, ind in enumerate(indicators):
281  mean = sum(ind) / float(len(ind))
282  Y2 = 2 * kB * erfinv(1 - mean) ** 2
283  p1 = params[i]
284  p2 = params[i + 1]
285  self.__cv.append(
286  (self.__pmeans[i],
287  (p1 ** 2 + p2 ** 2) * float(Y2) / (p2 - p1) ** 2))
288  prdb(self.__params)
289  prdb(self.__cv)
290  self.__cvmean = sum([i[1] for i in self.__cv]) / float(len(self.__cv))
291  if self.__cvmean < 0:
292  raise ValueError("Cv mean is negative!")
293  self.__cvfun = spline(self.__cv, self.__cvmean)
294  return
295 
296  def estimate_cv_constant(self, params, indicators):
297  """try to guess which constant cv fits best"""
298  if self.__initialized:
299  return
300  self.estimate_cv_interpolate(params, indicators)
301  self.__cv = self.__cvmean
302  return
303 
304  def needs_init(self):
305  if not self.__initialized:
306  raise RuntimeError("Class was not initialized correctly!")
307 
308  def estimate_cv_mbar(self, params, energies, temps):
309  "use MBAR to get the heat capacity"
310  raise NotImplementedError("estimate_cv_mbar")
311 
312  def _isinbounds(self, p, params):
313  """returns True if p is within params, else false. the params list
314  must be sorted ascendingly."""
315  if p < params[0] - EPSILON or p > params[-1] + EPSILON:
316  # prdb("Warning: value %f is outside of bounds, "
317  # "extrapolating." % p)
318  return False
319  else:
320  return True
321 
322  def _interpolate(self, xval, xlist):
323  """return interpolation of Cv at point xval, and return the average
324  instead if this value is negative.
325  """
326  self._isinbounds(xval, xlist)
327  val = self.__cvfun(xval)
328  if val > 0:
329  return val
330  else:
331  return self.__cvmean
332 
333  def get_interp(self, param):
334  """returns the point estimate of the first derivative of the energy
335  with respect to the replica exchange parameter (usually T or q).
336  This version assumes that the means of cv are given.
337  """
338  self.needs_init()
339  return self._interpolate(param, self.__pmeans)
340 
341  def get_mbar(self, param):
342  """returns the point estimate of the first derivative of the energy
343  with respect to the replica exchange parameter (usually T or q).
344  This version assumes that the values of cv are given.
345  """
346  self.needs_init()
347  return self._interpolate(param, self.__params)
348 
349  def mean_interp(self, pa, pb):
350  """estimate the mean of Cv between two points. Here the means were
351  stored previously
352  """
353  self.needs_init()
354  return self._interpolate((pa + pb) / 2., self.__pmeans)
355 
356  def mean_mbar(self, pa, pb):
357  self.needs_init()
358  return (self.get_mbar(pb) + self.get_mbar(pa)) / 2.
359 
360 # Parameter updating methods
361 
362 
363 def update_good_dumb(newp, oldp, *args, **kwargs):
364  """Here the old parameters are oldp[0] and oldp[1], and the starting point
365  is newp[0]. We should modify newp[1] so that the AR in the following cycle
366  is equal to the targetAR.
367  In the "dumb" method, the Cv and targetAR keywords are ignored.
368  Here the newp[1] parameter is modified because prior changes have set
369  newp[0] to a different value than oldp[0]. Thus, we should move newp[1] by
370  minimizing the effect on the AR since it is supposedly equal to targetAR.
371  In this simple method, the parameter is just translated.
372  """
373  prdb(
374  "newp[0] has moved (%.3f -> %.3f), adjusting the position of newp[1]" %
375  (oldp[0], newp[0]))
376  return oldp[1] - (oldp[0] - newp[0])
377 
378 
379 def update_bad_dumb(newp, oldp, ind, targetAR=0.4, scale=0.1, **kwargs):
380  """Here the old parameters are oldp[0] and oldp[1], and the starting point
381  is newp[0]. We should modify newp[1] so that the AR in the following cycle
382  is equal to the targetAR.
383  In the "dumb" method, the Cv keyword is ignored. Here the newp[1]
384  parameter is modified to follow the possible translation of newp[0] by
385  calling update_good_dumb, and then newp[1] is added or subtracted scale% of
386  (oldp[1] - oldp[0]) to adjust to targetAR.
387  """
388 
389  if newp[0] != oldp[0]:
390  newp[1] = update_good_dumb(newp, oldp)
391  if targetAR > sum(ind) / float(len(ind)):
392  prdb("""target AR is higher than expected, decreasing newp[1]""")
393  newp[1] -= scale * (oldp[1] - oldp[0])
394  else:
395  prdb("""target AR is lower than expected, increasing newp[1]""")
396  newp[1] += scale * (oldp[1] - oldp[0])
397  return newp[1]
398 
399 
400 def update_any_cv_step(newp, oldp, ind, targetAR=0.4, Cv=None, **kwargs):
401  """here we use the average AR formula of two gaussians to get newp[1] as a
402  function of newp[1], knowing the targetAR and estimating the Cv. If
403  targetAR is negative, consider that mean(ind) equals the target AR and
404  skip any calculation in the case that oldp[0] equals newp[0].
405  step: suppose the heat capacity is stepwise constant, i.e. use the heat
406  capacity at position newp[0] as an estimate of the mean of the heat
407  capacity between newp[0] and newp[1]. This does not require any
408  self-consistent loop.
409  """
410 
411  global kB
412 
413  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
414  return oldp[1]
415  if targetAR < 0:
416  targetAR = sum(ind) / float(len(ind))
417  cv = Cv.get(newp[0])
418  Y = numpy.sqrt(2 * kB) * float(erfinv(1 - targetAR))
419  if Y ** 2 >= cv:
420  raise ValueError("""targetAR too small for this approximate method, use
421  the full self-consistent method instead.""")
422  return newp[0] * (cv + Y * numpy.sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2)
423 
424 
425 def update_any_cv_sc(newp, oldp, ind, targetAR=0.4, Cv=None,
426  tol=1e-6, maxiter=10000):
427  """self-consistent solver version"""
428 
429  global kB
430 
431  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
432  return oldp[1]
433  if targetAR < 0:
434  targetAR = sum(ind) / float(len(ind))
435  cv = Cv.get(newp[0])
436  Y = numpy.sqrt(2 * kB) * float(erfinv(1 - targetAR))
437  if Y ** 2 >= cv:
438  raise ValueError("""targetAR too small for this approximate method, use
439  the full self-consistent method instead.""")
440  targetp = newp[0] * (cv + Y * numpy.sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2)
441  for i in range(maxiter):
442  cv = Cv.mean(newp[0], targetp)
443  (oldtargetp, targetp) = (
444  targetp, newp[0] * (cv + Y * numpy.sqrt(2 * cv - Y ** 2))
445  / (cv - Y ** 2))
446  if abs(targetp - oldtargetp) <= tol:
447  break
448  if numpy.isnan(targetp):
449  if Y ** 2 >= cv:
450  raise ValueError(
451  "targetAR too small for this approximate method, use the "
452  "full self-consistent method instead.")
453  else:
454  raise ValueError("""something unexpected happened""")
455  if i == maxiter - 1:
456  prdb("""Warning: unable to converge the self-consistent after %d
457  iterations and a tolerance of %f. Change the method or decrease the
458  tolerance!""" % (maxiter, tol))
459  prdb("converged after %d iterations and a tolerance of %f for x=%f" %
460  (i, tol, oldp[1]))
461  return targetp
462 
463 
464 def update_any_cv_scfull(newp, oldp, ind, targetAR=0.4, Cv=None,
465  tol=1e-6, maxiter=10000):
466  """self-consistent solver version, on the exact average AR equation"""
467 
468  # create helper functions and overlap function
469  _ = r('u21 <- function(t1,t2) { integrate(Vectorize(cvspline),'
470  't1,t2)$value }')
471  _ = r('b21 <- function(t1,t2) { 1./(kB*t2) - 1./(kB*t1) }')
472  _ = r('sigma2 <- function(t) {cvspline(t)*kB*t**2}')
473  _rovboltz = r('ovboltz <- function(t1,t2) {\
474  1/2*( 1-erf(\
475  u21(t1,t2)/sqrt(2*(sigma2(t1)+sigma2(t2))))\
476  + exp(b21(t1,t2)*(u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2))/2))\
477  * (1+erf((u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2)))\
478  /(sqrt(2*(sigma2(t1)+sigma2(t2))))))\
479  )}')
480  _rrootfn = r(
481  'rootfn <- function(t2) {ovboltz(%f,t2)-%f}' %
482  (newp[0], targetAR))
483 
484  # find upper bound for estimation, raise an error if cv is negative
485  if oldp[1] > oldp[0]:
486  tmp = newp[0] * 1.05
487  else:
488  tmp = newp[0] * 0.95
489  nloops = 0
490  while _rrootfn(tmp)[0] >= 0:
491  nloops += 1
492  tmp += (oldp[1] - oldp[0])
493  if Cv.get(tmp) < 0:
494  raise RuntimeError("heat capacity goes negative")
495  if nloops > maxiter:
496  raise RuntimeError('could not find zero of function!')
497 
498  # find root
499  _runiroot = r('uniroot(rootfn,c(%f,%f),f.lower = %f, f.upper = %f, '
500  'tol = %f, maxiter = %d)'
501  % (newp[0], tmp, 1 - targetAR, -targetAR, tol, maxiter))
502  prdb("self-consistent solver converged after %s iterations and an "
503  "estimated precision of %s " % (_runiroot[2][0], _runiroot[3][0]))
504  prdb(
505  ["root:",
506  _runiroot[0][0],
507  "overlap:",
508  _rovboltz(newp[0],
509  _runiroot[0][0])[0]])
510  return _runiroot[0][0]
511 
512 
513 def update_any_cv_nr(newp, oldp, ind, targetAR=0.4, Cv=None, **kwargs):
514  """newton-raphson solver version"""
515 
516  # use nlm
517  raise NotImplementedError
518 
519 # Testing methods
520 
521 
522 def are_equal_to_targetAR(
523  indicators,
524  targetAR=0.4,
525  alpha=0.05,
526  method="binom"):
527  """here, all indicators have same average, we want to know if it is
528  equal to targetAR
529  """
530 
531  # calculate sample mean deviation of each indicator function from targetAR
532  deviations = sorted([(abs(sum(ind) / float(len(ind)) - targetAR), ind)
533  for pos, ind in enumerate(indicators)])
534  deviant = deviations[-1]
535 
536  # perform t-test
537  if method == "ttest":
538  # from statlib.stats import ttest_1samp as ttest
539  our_ttest = ttest
540  elif method == "binom":
541  our_ttest = binom
542  else:
543  raise NotImplementedError
544 
545  try:
546  test, pval = our_ttest(deviant[1], targetAR)
547  except: # noqa: E722
548  if abs(targetAR - sum(deviant[1]) / len(deviant[1])) > EPSILON:
549  pval = 0
550  else:
551  pval = 1
552  if pval < alpha:
553  return False
554  else:
555  return True
556 
557 
558 def are_stationnary(indicators, alpha=0.05, method="anova"):
559  """test on the stationarity of the observations (block analysis). Done
560  so by launching an anova on the difference between the two halves of
561  each observations.
562  """
563 
564  if method == "kruskal":
565  test = kruskal
566  else:
567  test = anova
568 
569  tmp = numpy.array(indicators)
570  blocklen = len(indicators[0]) / 2
571  block = tmp[:, :blocklen] - tmp[:, blocklen:2 * blocklen]
572  if test(*block)['pval'] < alpha:
573  return False
574  else:
575  return True
576 
577 
578 def are_equal(indicators, targetAR=0.4, alpha=0.05,
579  method="anova", varMethod="skip", power=0.8):
580  """Perform a one-way ANOVA or kruskal-wallis test on the indicators set,
581  and return True if one cannot exclude with risk alpha that the indicators
582  AR are different (i.e. True = all means are equal). Also performs a test
583  on the variance (they must be equal).
584  """
585 
586  if min(targetAR, 1 - targetAR) * len(indicators[0]) <= 5 and \
587  (varMethod == "bartlett" or method == "anova"):
588  prdb("Warning: normal approximation to the binomial does not hold!")
589 
590  # test the variances
591  if varMethod == "skip":
592  pass
593  else:
594  if varMethod == "bartlett":
595  pval = bartlett(*indicators)[1]
596  elif varMethod == "fligner":
597  pval = fligner(*indicators)[1]
598  else:
599  raise NotImplementedError(
600  "variance testing method unknown: %s" %
601  varMethod)
602  if pval < alpha:
603  prdb("Warning: performing mean test with unequal variances.")
604 
605  if method == "kruskal":
606  test = kruskal
607  else:
608  test = anova
609 
610  tr = test(*indicators)
611  tr['alpha'] = alpha
612  # p-value < alpha => H0 rejected => result == False
613  tr['result'] = tr['pval'] >= alpha
614 
615  return tr
616 
617 
618 def find_good_ARs(indicators, targetAR=0.4, alpha=0.05, method="binom"):
619  """perform one-sample t-tests on each of the data sets, and return
620  a tuple of bool of size N-1, False if AR i is not equal to targetAR
621  at risk alpha.
622  """
623 
624  # calculate sample means of each indicator function
625  means = sorted([(sum(ind) / float(len(ind)), pos, ind)
626  for pos, ind in enumerate(indicators)])
627 
628  # perform t-test
629  if method == "ttest":
630  # from statlib.stats import ttest_1samp as ttest
631  our_ttest = ttest
632  elif method == "binom":
633  our_ttest = binom
634  else:
635  raise NotImplementedError
636 
637  isGoodTuple = []
638  # start from the lowest means and stop when they are ok
639  prdb("starting left")
640  for (i, (mean, pos, ind)) in enumerate(means):
641  prdb("performing t-test on couple %d having average AR %f, "
642  "position %d" % (pos, mean, i))
643  try:
644  test, pval = our_ttest(ind, targetAR)
645  except: # noqa: E722
646  if abs(targetAR - mean) > EPSILON:
647  pval = 0
648  else:
649  pval = 1
650  if pval < alpha:
651  # means are different
652  isGoodTuple.append((pos, False))
653  else:
654  goodstart = i
655  break
656  # then start from the highest means
657  prdb("starting right")
658  for (i, (mean, pos, ind)) in enumerate(reversed(means)):
659  prdb("performing t-test on couple %d having average AR %f, position %d"
660  % (pos, mean, len(means) - 1 - i))
661  if our_ttest(ind, targetAR)[1] < alpha:
662  # means are different
663  isGoodTuple.append((pos, False))
664  else:
665  goodstop = len(means) - 1 - i
666  break
667 
668  # limiting cases: all different
669  if len(isGoodTuple) > len(indicators):
670  return tuple([False] * len(indicators))
671  # all equal
672  elif len(isGoodTuple) == 0:
673  return tuple([True] * len(indicators))
674  # intermediate
675  else:
676  isGoodTuple.extend([(means[i][1], True) for i in
677  range(goodstart, goodstop + 1)])
678  isGoodTuple.sort()
679  return tuple([tup[1] for tup in isGoodTuple])
680 
681 # Trebst, Katzgraber, Nadler and Hansmann optimum flux stuff
682 
683 
684 def mean_first_passage_times(
685  replicanums_ori,
686  subs=1,
687  start=0,
688  use_avgAR=False):
689  """compute mean first passage times as suggested in
690  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
691 
692  use_avgAR : if a list of average ARs is given computes everything from
693  average AR; if it is False, compute by direct counting of events.
694 
695  returns:
696  If use_avgAR == False:
697  tau0, tauN, chose_N, times0, timesN
698  else:
699  tau0, tauN, None, None, None
700  tau0[i]: average time to go from replica 0 to replica i
701  tauN[i]: average time to go from replica N to replica i
702  times0 and timesN are the corresponding lists of single events.
703 
704  """
705 
706  from numpy import array, zeros
707  replicanums = array(replicanums_ori)[:, start::subs]
708  N = len(replicanums)
709  tauN = [0] * N
710  tau0 = [0] * N
711 
712  if use_avgAR:
713  tau0[0] = 0.0
714  tauN[-1] = 0.0
715  for state in range(1, N):
716  tau0[state] = tau0[state - 1] + \
717  state / (float(use_avgAR[state - 1]))
718  for state in reversed(range(1, N)):
719  tauN[state - 1] = tauN[state] \
720  + (N - (state - 1)) / (float(use_avgAR[state - 1]))
721 
722  return tau0, tauN, None, None, None
723 
724  else:
725  # prdb('not using average AR')
726  if sys.version_info[0] >= 3:
727  izip = zip
728  else:
729  from itertools import izip
730  # the algorithm looks for replicas that start at the lowest temp, and
731  # records the farthest state it went to before returning to zero. Once
732  # back it increments the counter of all concerned replicas. Similar
733  # procedure if starting from N.
734  store0 = zeros((N, N), dtype=bool)
735  last0 = [0 for i in range(N)]
736  already0 = [False for i in range(N)]
737  times0 = [[] for i in range(N)]
738  storeN = zeros((N, N), dtype=bool)
739  lastN = [0 for i in range(N)]
740  alreadyN = [False for i in range(N)]
741  timesN = [[] for i in range(N)]
742 
743  # prdb('looping over replicanums')
744  for time, frame in enumerate(izip(*replicanums)):
745  # case of the replica in state 0
746  if not already0[frame[0]]:
747  last0[frame[0]] = time
748  store0[frame[0], :] = True
749  already0[frame[0]] = True
750  # case of the replica in state N
751  if not alreadyN[frame[-1]]:
752  lastN[frame[-1]] = time
753  storeN[frame[-1], :] = True
754  alreadyN[frame[-1]] = True
755  # set already flags to False when in state 1 or N-1
756  already0[frame[1]] = False
757  alreadyN[frame[-2]] = False
758  # loop over all states
759  for state, rep in enumerate(frame):
760  if store0[rep, state]:
761  # reached a state for the first time since 0
762  store0[rep, state] = False
763  # store time since this replica left state 0
764  times0[state].append(time - last0[rep])
765  if storeN[rep, state]:
766  # reached a state for the first time since N
767  storeN[rep, state] = False
768  # store time since this replica left state N
769  timesN[state].append(time - lastN[rep])
770  # prdb([replicanums.shape, len(storeN), len(last0)])
771  # times = [[] for i in range(N)]
772  chose_N = [len(timesN[state]) > len(times0[state]) for state in
773  range(N)]
774  for state in range(N):
775  tauN[state] = sum(timesN[state]) / float(len(timesN[state]))
776  tau0[state] = sum(times0[state]) / float(len(times0[state]))
777  # prdb(len(chose_N))
778 
779  return tau0, tauN, chose_N, times0, timesN
780 
781 
782 def compute_effective_fraction(tau0, tauN, chose_N):
783  """input: tau0, tauN, chose_N
784  output: effective fraction f(T) (P_up(n)) as introduced in
785  Trebst S, Troyer M, Hansmann UHE, J Chem Phys *124* 174903 (2006).
786  formalized in
787  Nadler W, Hansmann UHE, Phys Rev E *75* 026109 (2007)
788  and whose calculation is enhanced in
789  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
790  the ideal value of f(n) should be 1 - n/N
791  """
792 
793  # nstar is the index of the last state where tau0 should be used.
794  N = len(tau0)
795  if chose_N is None:
796  nstar = N / 2
797  else:
798  nstar = N - sum([int(a) for a in chose_N]) - 1
799 
800  prdb("n* = %d" % nstar)
801  # compute helper functions h
802  h0 = [0] * N
803  h0[1] = tau0[1]
804  for state in range(2, nstar + 1):
805  h0[state] = h0[state - 1] + \
806  (tau0[state] - tau0[state - 1]) / float(state)
807 
808  hN = [0] * N
809  hN[-2] = tauN[-2]
810  for state in reversed(range(nstar, N - 1)):
811  hN[state] = hN[state + 1] + \
812  (tauN[state] - tauN[state + 1]) / float(N - state)
813 
814  # compute flow probabilities
815  pup = [0] * N
816  pup[0] = 1
817  for n in range(1, nstar + 1):
818  pup[n] = 1 - h0[n] / (h0[nstar] + hN[nstar])
819  for n in range(nstar + 1, N):
820  pup[n] = hN[n] / (h0[nstar] + hN[nstar])
821 
822  return pup
823 
824 
825 def spline_diffusivity(pup, params):
826  """spline interpolation of diffusivity: D = 1/(df/dT * heta)
827  """
828  from numpy import linspace
829  robjects.globalenv["hetay"] = \
830  robjects.FloatVector(linspace(0, 1, num=len(params)).tolist())
831  robjects.globalenv["hetax"] = robjects.FloatVector(params)
832  robjects.globalenv["pupx"] = robjects.FloatVector(params)
833  robjects.globalenv["pupy"] = robjects.FloatVector(pup)
834  _ = r('heta <- splinefun(hetax,hetay,method="monoH.FC")')
835  _ = r('eff <- splinefun(pupx,pupy,method="monoH.FC")')
836  diff = r('diff <- function(x) {-1/(heta(x,deriv=1)*eff(x,deriv=1))}')
837  return lambda x: diff(x)[0]
838 
839 
840 # Misc
841 def compute_indicators(replicanums, subs=1, start=0):
842  """input: replicanums : a list of N lists of size M, where N is the number
843  of states and M is the length of the simulation. Each element is an
844  integer, and corresponds to the label of a replica.
845  output: an indicator function of exchanges (size (N-1)x(M-1)), 1 if
846  exchange and 0 if not.
847  """
848  def exchange(n, m):
849  if replicanums[n][m] == replicanums[n + 1][m + 1] \
850  and replicanums[n][m + 1] == replicanums[n + 1][m]:
851  return 1
852  else:
853  return 0
854 
855  indicators = []
856  for n in range(len(replicanums) - 1):
857  indicators.append(
858  [exchange(n, m)
859  for m in range(len(replicanums[n]) - 1)][start::subs])
860  return indicators
861 
862 # Main routines
863 
864 
865 def update_params_nonergodic(pup, params, write_g=False, num=False):
866 
867  from numpy import linspace
868  # g = spline(zip(pup,params),0,method='monoH.FC')
869  g = linear_interpolation(list(zip(pup, params)), 0)
870  if write_g:
871  d = spline_diffusivity(pup, params)
872  fl = open('g', 'w')
873  fl.write("".join(["%f %f\n" % (x, g(x))
874  for x in linspace(0, 1, num=100)]))
875  fl.close()
876  fl = open('diffusivity', 'w')
877  fl.write(''.join(["%f %f\n" % (x, d(x)) for x in
878  linspace(params[0], params[-1], num=100)]))
879  fl.close()
880  fl = open('pup', 'w')
881  fl.write("".join(["%f %f\n" % (i, j) for (i, j) in zip(params, pup)]))
882  fl.close()
883 
884  if num is False:
885  newparams = [g(i) for i in reversed(linspace(0, 1, num=len(params)))]
886  else:
887  newparams = [g(i) for i in reversed(linspace(0, 1, num=num))]
888  # for numerical issues
889  newparams[0] = params[0]
890  newparams[-1] = params[-1]
891 
892  return newparams
893 
894 
895 def update_params(
896  indicators, params, isGood, targetAR=0.4, immobilePoint=1,
897  Cv=None, badMethod="dumb", goodMethod="dumb", dumb_scale=0.1):
898  """update the parameters according to the isGood tuple and using the
899  specified methods"""
900 
901  newparams = list(params) # make a copy
902 
903  if immobilePoint != 1:
904  raise NotImplementedError
905 
906  if Cv is None and (badMethod != "dumb" or goodMethod != "dumb"):
907  raise RuntimeError("""Cv needs to be estimated if using other methods
908  than 'dumb' for updating!""")
909 
910  if goodMethod == "dumb":
911  update_good = update_good_dumb
912  elif goodMethod == "step":
913  update_good = update_any_cv_step
914  elif goodMethod == "sc":
915  update_good = update_any_cv_sc
916  elif goodMethod == "scfull":
917  update_good = update_any_cv_scfull
918  elif goodMethod == "nr":
919  update_good = update_any_cv_nr
920  else:
921  raise NotImplementedError(goodMethod)
922  if badMethod == "dumb":
923  update_bad = update_bad_dumb
924  elif badMethod == "step":
925  update_bad = update_any_cv_step
926  elif badMethod == "sc":
927  update_bad = update_any_cv_sc
928  elif badMethod == "scfull":
929  update_bad = update_any_cv_scfull
930  elif badMethod == "nr":
931  update_bad = update_any_cv_nr
932  else:
933  raise NotImplementedError(badMethod)
934 
935  # scan each position starting from the immobilePoint
936  for pos in range(len(params) - 1):
937  if isGood[pos]:
938  newparams[pos + 1] = update_good(
939  newparams[pos:pos + 2], params[pos:pos + 2],
940  indicators[pos], targetAR=targetAR, Cv=Cv)
941  else:
942  newparams[pos + 1] = update_bad(
943  newparams[pos:pos + 2], params[pos:pos + 2],
944  indicators[pos], targetAR=targetAR, Cv=Cv, scale=dumb_scale)
945 
946  return tuple(newparams)
947 
948 
949 def tune_params_flux(replicanums, params, subs=1, start=0, alpha=0.05,
950  testMethod='anova', meanMethod='binom', use_avgAR=False,
951  power=0.8, num=False):
952  # num is here if you want to add some more temperatures. indicate total
953  # number of replicas
954 
955  # TODO: do case where one estimates all based on target AR.
956  if use_avgAR is not False:
957  raise NotImplementedError
958 
959  prdb("computing mean first passage times")
960  tau0, tauN, chose_N, times0, timesN = mean_first_passage_times(
961  replicanums, subs=subs, start=start, use_avgAR=use_avgAR)
962 
963  prdb("average round trip time: %.2f (%d+%d events)" %
964  (tau0[-1] + tauN[0], len(times0[-1]), len(timesN[0])))
965  prdb("checking if the parameterset needs to be improved")
966  N = len(replicanums)
967  if chose_N is None:
968  nstar = N / 2
969  else:
970  nstar = N - sum([int(a) for a in chose_N]) - 1
971 
972  reduced = []
973  # no need to check for times?[0] or times?[N]
974  for n in range(1, N - 1):
975  if n > nstar:
976  reduced.append([i * 2.0 / ((N - n) * (N - n + 1))
977  for i in timesN[n]])
978  else:
979  reduced.append([i * 2.0 / (n * (n + 1)) for i in times0[n]])
980 
981  anova_result = are_equal(reduced, alpha=alpha, method=testMethod,
982  power=power)
983  if (anova_result['result']): # TODO test if equal to targetAR
984  prdb("flux is constant, nothing to do!")
985  min_n = minimum_n(anova_result, alpha)
986  prdb('Try to rerun this test with at least %d more samples.' %
987  numpy.ceil(min_n))
988  return (False, min_n)
989 
990  # the flux is not constant so the parameters need improvement.
991  # calculate the estimate of the effective fraction
992  prdb("parameterset not optimal, computing effective fraction")
993  pup = compute_effective_fraction(tau0, tauN, chose_N)
994 
995  # improve parameterset
996  prdb("returning new parameterset")
997  params = update_params_nonergodic(pup, params, num=num)
998 
999  return (True, params)
1000 
1001 
1002 def tune_params_ar(indicators, params, targetAR=0.4, alpha=0.05,
1003  immobilePoint=1, CvMethod="skip", badMethod="dumb",
1004  goodMethod="dumb", varMethod="skip", testMethod="anova",
1005  meanMethod="binom", energies=None, temps=None, power=0.8,
1006  dumb_scale=0.1):
1007  """Tune the replica-exchange parameters and return a new set.
1008 
1009  Arguments:
1010  indicators -- an (N-1)x(M-1) table, where entry at position (j,i)
1011  is True if replica j and j+1 performed an exchange between
1012  times i and i+1.
1013  params -- the current set of N parameters used in the simulation.
1014 
1015  Keyword arguments:
1016  targetAR -- the target AR which is wanted for the simulation
1017  (default: 0.4)
1018  alpha -- the type 1 error on the one-way ANOVA and subsequent
1019  t-tests (default: 5%)
1020  immobilePoint -- which replica should keep it's parameter fixed
1021  (default: 1st replica, e.g. 1)
1022  CvMethod -- the heat capacity estimation method (default:
1023  "skip", other options: "mbar", "spline", "constant")
1024  badMethod -- how to correct the (j+1)th parameter if the
1025  acceptance ratio between replicas j and j+1 is off the
1026  target value (default: "dumb", options: "step", "sc",
1027  "scfull", "nr")
1028  goodMethod -- how to update the value of the (j+1)th parameter
1029  in the case of a correctly exchanging couple, but if the jth
1030  parameter has been modified (default: "dumb",options: "step",
1031  "sc" self-consistent, "scfull" self-consistent using the exact
1032  equation, "nr" newton-raphson solver for the exact equation)
1033  dumb_scale -- (0.0-1.0) in the "dumb" method, scale wrong temperature
1034  intervals by this amount. (default: 0.1)
1035  testMethod -- how to test for the difference of the means,
1036  either "anova" for a one-way anova, or "kruskal" for a
1037  Kruskal-Wallis one-way non-parametric anova.
1038  meanMethod -- "ttest" for a two-sided one-sample t-test,
1039  "binom" for an exact binomial test of the probability.
1040  varMethod -- how to test for the equality of variances.
1041  "fligner" for the Fligner-Killeen non-parametric test,
1042  "bartlett" for Bartlett's test, "skip" to pass.
1043  energies -- if CvMethod is set to "mbar", the energies of each
1044  state as a function of time are used to estimate the heat capacity.
1045  temps -- the temperatures of the simulations, if estimating
1046  with "mbar".
1047 
1048  Return Value:
1049  returns a tuple: (bool, params). bool is True if params have
1050  changed, and params is the new set.
1051 
1052  """
1053 
1054  # perform ANOVA
1055  prdb("performing ANOVA")
1056  anova_result = are_equal(indicators, targetAR, alpha, method=testMethod,
1057  varMethod=varMethod, power=power)
1058  if (anova_result['result']
1059  and are_equal_to_targetAR(indicators, targetAR, alpha,
1060  method=meanMethod)):
1061  prdb("all means are equal to target AR, nothing to do!")
1062  min_n = minimum_n(anova_result, alpha)
1063  prdb(
1064  'Try to rerun this test with at least %d more samples.' %
1065  numpy.ceil(min_n))
1066  return (False, min_n)
1067  prdb("some means are different, performing t-tests")
1068 
1069  # perform two-by-two t-tests
1070  isGood = find_good_ARs(indicators, targetAR, alpha, method=meanMethod)
1071  if not (False in isGood):
1072  prdb("""Bad luck: ANOVA says means are not identical, but t-tests
1073  can't find the bad rates...""")
1074  return (False, params)
1075  prdb(isGood)
1076 
1077  # check if data is stationnary by doing a block analysis
1078  prdb("performing stationarity test")
1079  if not are_stationnary(indicators, alpha):
1080  prdb("Warning: Some simulations are not stationary!")
1081 
1082  # interpolate the heat capacity from the data
1083  # TODO: use all previous data, not just from this run.
1084  prdb("launching Cv estimation or skipping it")
1085  if CvMethod == "skip":
1086  Cv = None
1087  elif CvMethod == "interpolate" or CvMethod == "constant":
1088  Cv = CvEstimator(params, indicators=indicators, method=CvMethod)
1089  elif CvMethod == "mbar":
1090  Cv = CvEstimator(
1091  params,
1092  energies=energies,
1093  temps=temps,
1094  method=CvMethod)
1095  else:
1096  raise NotImplementedError(CvMethod)
1097 
1098  # update parameters
1099  prdb('updating params')
1100  # update the current parameter set to match the target AR
1101  params = update_params(indicators, params, isGood, targetAR=targetAR,
1102  immobilePoint=immobilePoint, Cv=Cv,
1103  badMethod=badMethod, goodMethod=goodMethod,
1104  dumb_scale=dumb_scale)
1105 
1106  prdb('Done')
1107  return (True, params)
1108 
1109 
1110 if __name__ == '__main__':
1111  import numpy
1112  replicanums = []
1113  for i in range(1, 8):
1114  replicanums.append(
1115  tuple(numpy.fromfile('data/replica-indices/%d.rep' % i,
1116  dtype=int, sep='\n')))
1117 
1118  prdb("replicanums: %dx%d" % (len(replicanums), len(replicanums[0])))
1119  params = tuple(numpy.fromfile('data/temperatures', sep=' '))
1120 
1121  prdb(params)
1122  indicators = compute_indicators(replicanums, subs=1, start=0)
1123  prdb("indicators: %dx%d" % (len(indicators), len(indicators[0])))
1124  prdb("Exchange rate:")
1125  prdb(numpy.array([sum(ind) / float(len(ind)) for ind in indicators]))
1126  numpy.array([sum(ind) / float(len(ind))
1127  for ind in indicators]).tofile('xchgs', sep='\n')
1128  changed, newparams = tune_params_ar(
1129  indicators, params, targetAR=0.25,
1130  badMethod="dumb", goodMethod="dumb", CvMethod="skip",
1131  testMethod="anova", alpha=0.05)
1132  if not changed:
1133  print("Parameter set seems optimal.")
1134  else:
1135  if True not in [abs(newparams[i + 1] - newparams[i]) < 1e-3
1136  for i in range(len(newparams) - 1)]:
1137  numpy.array(newparams).tofile('data/temperatures', sep=' ')
1138  else:
1139  print("PROBLEM IN NEW PARAMETERSET -> not saved")
1140  print("params :", params)
1141  print("new params:", newparams)
def estimate_cv_constant
try to guess which constant cv fits best
Definition: TuneRex.py:296
def estimate_cv_interpolate
interpolate using previous values, by reversing the approximate overlap function
Definition: TuneRex.py:261
When created, estimates the heat capacity from the energies or from the indicator functions using the...
Definition: TuneRex.py:219
def mean_interp
estimate the mean of Cv between two points.
Definition: TuneRex.py:349
def estimate_cv_mbar
use MBAR to get the heat capacity
Definition: TuneRex.py:308
def get_mbar
returns the point estimate of the first derivative of the energy with respect to the replica exchange...
Definition: TuneRex.py:341
def get_interp
returns the point estimate of the first derivative of the energy with respect to the replica exchange...
Definition: TuneRex.py:333