IMP logo
IMP Reference Guide  develop.b3a5ae88fa,2024/05/04
The Integrative Modeling Platform
TuneRex.py
1 #!/usr/bin/env python
2 
3 from __future__ import print_function
4 
5 __doc__ = """
6 This module provides a few methods to improve the efficiency of a
7 replica-exchange simulation by tuning its parameters.
8 Author: Yannick Spill
9 """
10 
11 import sys
12 import rpy2.robjects as robjects
13 
14 kB = 1.3806503 * 6.0221415 / 4184.0 # Boltzmann constant in kcal/mol/K
15 # here for float comparison. Floats are equal if their difference
16 # is smaller than EPSILON
17 EPSILON = 1e-8
18 debug = False
19 
20 
21 def prdb(arg):
22  if debug:
23  print(arg)
24 
25 
26 # R compatibility functions
27 r = robjects.r
28 robjects.globalenv["kB"] = kB
29 _rinverf = r('invErf <- function(x) {qnorm((1 + x) /2) / sqrt(2)}')
30 _rerf = r('erf <- function(x) {2 * pnorm(x * sqrt(2)) - 1}')
31 _rinvF = r('qf')
32 _rinterp = None
33 
34 
35 def erfinv(x):
36  return _rinverf(x)[0]
37 
38 
39 def erf(x):
40  return _rerf(x)[0]
41 
42 
43 def Finv(x, d1, d2):
44  return _rinvF(x, d1, d2)[0]
45 
46 
47 def spline(xy, mean, method=None):
48  """spline interpolation of (x,y) coordinates. If interpolation goes
49  negative, replace by mean value.
50  """
51  x, y = list(zip(*xy))
52  robjects.globalenv["x"] = robjects.FloatVector(x)
53  robjects.globalenv["y"] = robjects.FloatVector(y)
54  global _rinterp
55  # _rinterp = r.splinefun(x,y)
56  if method is None:
57  r('cvsplinenonbounded <- splinefun(x,y)')
58  else:
59  r('cvsplinenonbounded <- splinefun(x,y,method="%s")' % method)
60  _rinterp = r(
61  'cvspline <- function(x) { tmp = cvsplinenonbounded(x); '
62  'if (tmp>0) {tmp} else {%f}}' %
63  mean)
64 
65  def interpolated(x):
66  global _rinterp
67  return _rinterp(x)[0]
68  return interpolated
69 
70 
71 def linear_interpolation(xy, mean):
72  """linear interpolation of (x,y) coordinates. No extrapolation possible.
73  """
74  x, y = list(zip(*xy))
75  robjects.globalenv["x"] = robjects.FloatVector(x)
76  robjects.globalenv["y"] = robjects.FloatVector(y)
77  global _rinterp
78  # _rinterp = r.splinefun(x,y)
79  _rinterp = r('cvspline <- approxfun(x,y)')
80 
81  def interpolated(x):
82  global _rinterp
83  return _rinterp(x)[0]
84  return interpolated
85 
86 
87 # R testing functions
88 
89 def anova(*args):
90  """perform anova using R and return statistic, p-value, between and
91  within variance"""
92  ngroups = len(args) # number of groups
93  # nreps = len(args[0]) #number of repetitions
94  # group = r.gl(ngroups,nreps)
95  reps = r.rep(0, len(args[0]))
96  weight = robjects.FloatVector(args[0])
97  for i in range(1, len(args)):
98  reps += r.rep(i, len(args[i]))
99  weight += robjects.FloatVector(args[i])
100  group = r.factor(reps)
101  robjects.globalenv["weight"] = weight
102  robjects.globalenv["group"] = group
103  lm = r.lm("weight ~ group")
104  aov = r.anova(lm)
105  prdb(aov)
106  # F statistic, p-value, between and within variance
107  anova_result = {'fstat': aov[3][0],
108  'pval': aov[4][0],
109  'between': aov[2][0],
110  'within': aov[2][1],
111  'nsteps': [len(i) for i in args],
112  'nreps': ngroups,
113  'test': 'anova'} # nreps: number of replicas
114  return anova_result
115 
116 
117 def kruskal(*args):
118  """perform kruskal-wallis rank test"""
119  ngroups = len(args)
120  # nreps = len(args[0])
121  # group = r.gl(ngroups,nreps)
122  reps = r.rep(0, len(args[0]))
123  weight = robjects.FloatVector(args[0])
124  for i in range(1, len(args)):
125  reps += r.rep(i, len(args[i]))
126  weight += robjects.FloatVector(args[i])
127  group = r.factor(reps)
128  aov = r('kruskal.test')(group, weight)
129  prdb(aov)
130  kruskal_result = {'fstat': aov[0][0],
131  'pval': aov[2][0],
132  'nsteps': [len(i) for i in args],
133  'nreps': ngroups,
134  'test': 'kruskal'}
135  return kruskal_result # F statistic and p-value
136 
137 
138 def ttest(obs, target):
139  """perform a one-sample two-sided t-test on obs against target mean"""
140  test = r('t.test')(robjects.IntVector(obs), mu=target)
141  return test[0][0], test[2][0] # stat and p-value
142 
143 
144 def binom(obs, target):
145  """perform an exact binomial test on the mean of obs against target"""
146  success = sum(obs)
147  trials = len(obs)
148  test = r('binom.test')(success, trials, p=target)
149  return test[0][0], test[2][0] # stat and p-value
150 
151 
152 def bartlett(*args):
153  """perform bartlett's test on the equality of variances of the
154  observations"""
155  ngroups = len(args)
156  nreps = len(args[0])
157  group = r.gl(ngroups, nreps)
158  weight = robjects.IntVector(args[0])
159  for i in args[1:]:
160  weight += robjects.IntVector(i)
161  robjects.globalenv["weight"] = weight
162  robjects.globalenv["group"] = group
163  var = r('bartlett.test')(weight, group)
164  return var[0][0], var[2][0] # statistic and p-value
165 
166 
167 def fligner(*args):
168  """perform Fligner-Killeen non-parametric test of the variance equality"""
169  ngroups = len(args)
170  nreps = len(args[0])
171  group = r.gl(ngroups, nreps)
172  weight = robjects.IntVector(args[0])
173  for i in args[1:]:
174  weight += robjects.IntVector(i)
175  robjects.globalenv["weight"] = weight
176  robjects.globalenv["group"] = group
177  var = r('fligner.test')(weight, group)
178  return var[0][0], var[2][0] # statistic and p-value
179 
180 
181 def power_test(ar, power=0.8, alpha=0.05):
182  """perform an anova power test and return
183  - the power of the test with this input data
184  - the number of points that would be needed to achieve a default
185  power of 0.8
186  ar: the output of anova()
187  """
188  result = r('power.anova.test')(groups=ar['nreps'], n=min(ar['nsteps']),
189  between=ar['between'], within=ar['within'],
190  sig=alpha)
191  prdb('the power of this anova was: %.3f' % result[5][0])
192  result = r('power.anova.test')(groups=ar['nreps'],
193  between=ar['between'], within=ar['within'],
194  sig=alpha, pow=power)
195  prdb('To have a power of %.3f, there should be at least %d exchange '
196  'attempts.' % (power, result[1][0]))
197  return
198 
199 
200 def minimum_n(ar, alpha=0.05):
201  """This routine tries to return an estimate of the additional number of
202  exchange trials that could lead to a positive result of the anova (e.g.
203  the average ARs are not the same). It is still very crude. It also assumes
204  that a one-way anova was made.
205  ar: the output of anova()
206  alpha: type I error
207  """
208  nreps = ar['nreps']
209  nsteps = ar['nsteps']
210  try:
211  nsteps = min(nsteps)
212  except: # noqa: E722
213  pass
214  fstat = ar['fstat']
215  return nsteps * (numpy.sqrt(Finv(1 - alpha, nreps - 1,
216  nreps * (nsteps - 1)) / fstat) - 1)
217 
218 
219 # Heat capacity class
221 
222  """When created, estimates the heat capacity from the energies or from the
223  indicator functions using the specified method. Two methods are then
224  available to guess the heat capacity at a given parameter value: get()
225  returns a single point, and mean() returns the linear average between two
226  points.
227  """
228 
229  def __init__(self, params, energies=None, indicators=None,
230  method="constant", temps=None, write_cv=False):
231 
232  self.__initialized = False
233  self.__cv = []
234  self.method = method
235 
236  if method == "interpolate":
237  self.estimate_cv_interpolate(params, indicators)
238  self.get = self.get_interp
239  self.mean = self.mean_interp
240  elif method == "constant":
241  self.estimate_cv_constant(params, indicators)
242  self.get = lambda p: self.__cv
243  self.mean = lambda p1, p2: self.__cv
244  self.__cvfun = self.get
245  r('cvspline <- function(x) {%f}' % self.__cv)
246  elif method == "mbar":
247  self.estimate_cv_mbar(params, energies, temps)
248  self.get = self.get_mbar
249  self.mean = self.mean_mbar
250  else:
251  raise NotImplementedError(method)
252 
253  self.__initialized = True
254 
255  # write the heat capacity to a file
256  if write_cv:
257  fl = open('cv', 'w')
258  fl.write("".join(["%f %f\n" % (x, self.get(x))
259  for x in numpy.linspace(params[0] / 2, 2 * params[-1])]))
260  fl.close()
261 
262  def estimate_cv_interpolate(self, params, indicators):
263  """interpolate using previous values, by reversing the approximate
264  overlap function
265  """
266  if self.__initialized:
267  return
268  if len(indicators) != len(params) - 1:
269  raise ValueError(
270  "the length of indicators and params does not match!")
271  if params != tuple(sorted(params)):
272  raise NotImplementedError(
273  "unable to work on parameters that do not change "
274  "monotonically")
275 
276  prdb("storing params and means")
277  self.__params = params
278  self.__pmeans = [(params[i] + params[i + 1]) / 2.
279  for i in range(len(params) - 1)]
280  prdb("computing __cv")
281  for i, ind in enumerate(indicators):
282  mean = sum(ind) / float(len(ind))
283  Y2 = 2 * kB * erfinv(1 - mean) ** 2
284  p1 = params[i]
285  p2 = params[i + 1]
286  self.__cv.append(
287  (self.__pmeans[i],
288  (p1 ** 2 + p2 ** 2) * float(Y2) / (p2 - p1) ** 2))
289  prdb(self.__params)
290  prdb(self.__cv)
291  self.__cvmean = sum([i[1] for i in self.__cv]) / float(len(self.__cv))
292  if self.__cvmean < 0:
293  raise ValueError("Cv mean is negative!")
294  self.__cvfun = spline(self.__cv, self.__cvmean)
295  return
296 
297  def estimate_cv_constant(self, params, indicators):
298  """try to guess which constant cv fits best"""
299  if self.__initialized:
300  return
301  self.estimate_cv_interpolate(params, indicators)
302  self.__cv = self.__cvmean
303  return
304 
305  def needs_init(self):
306  if not self.__initialized:
307  raise RuntimeError("Class was not initialized correctly!")
308 
309  def estimate_cv_mbar(self, params, energies, temps):
310  "use MBAR to get the heat capacity"
311  raise NotImplementedError("estimate_cv_mbar")
312 
313  def _isinbounds(self, p, params):
314  """returns True if p is within params, else false. the params list
315  must be sorted ascendingly."""
316  if p < params[0] - EPSILON or p > params[-1] + EPSILON:
317  # prdb("Warning: value %f is outside of bounds, "
318  # "extrapolating." % p)
319  return False
320  else:
321  return True
322 
323  def _interpolate(self, xval, xlist):
324  """return interpolation of Cv at point xval, and return the average
325  instead if this value is negative.
326  """
327  self._isinbounds(xval, xlist)
328  val = self.__cvfun(xval)
329  if val > 0:
330  return val
331  else:
332  return self.__cvmean
333 
334  def get_interp(self, param):
335  """returns the point estimate of the first derivative of the energy
336  with respect to the replica exchange parameter (usually T or q).
337  This version assumes that the means of cv are given.
338  """
339  self.needs_init()
340  return self._interpolate(param, self.__pmeans)
341 
342  def get_mbar(self, param):
343  """returns the point estimate of the first derivative of the energy
344  with respect to the replica exchange parameter (usually T or q).
345  This version assumes that the values of cv are given.
346  """
347  self.needs_init()
348  return self._interpolate(param, self.__params)
349 
350  def mean_interp(self, pa, pb):
351  """estimate the mean of Cv between two points. Here the means were
352  stored previously
353  """
354  self.needs_init()
355  return self._interpolate((pa + pb) / 2., self.__pmeans)
356 
357  def mean_mbar(self, pa, pb):
358  self.needs_init()
359  return (self.get_mbar(pb) + self.get_mbar(pa)) / 2.
360 
361 # Parameter updating methods
362 
363 
364 def update_good_dumb(newp, oldp, *args, **kwargs):
365  """Here the old parameters are oldp[0] and oldp[1], and the starting point
366  is newp[0]. We should modify newp[1] so that the AR in the following cycle
367  is equal to the targetAR.
368  In the "dumb" method, the Cv and targetAR keywords are ignored.
369  Here the newp[1] parameter is modified because prior changes have set
370  newp[0] to a different value than oldp[0]. Thus, we should move newp[1] by
371  minimizing the effect on the AR since it is supposedly equal to targetAR.
372  In this simple method, the parameter is just translated.
373  """
374  prdb(
375  "newp[0] has moved (%.3f -> %.3f), adjusting the position of newp[1]" %
376  (oldp[0], newp[0]))
377  return oldp[1] - (oldp[0] - newp[0])
378 
379 
380 def update_bad_dumb(newp, oldp, ind, targetAR=0.4, scale=0.1, **kwargs):
381  """Here the old parameters are oldp[0] and oldp[1], and the starting point
382  is newp[0]. We should modify newp[1] so that the AR in the following cycle
383  is equal to the targetAR.
384  In the "dumb" method, the Cv keyword is ignored. Here the newp[1]
385  parameter is modified to follow the possible translation of newp[0] by
386  calling update_good_dumb, and then newp[1] is added or subtracted scale% of
387  (oldp[1] - oldp[0]) to adjust to targetAR.
388  """
389 
390  if newp[0] != oldp[0]:
391  newp[1] = update_good_dumb(newp, oldp)
392  if targetAR > sum(ind) / float(len(ind)):
393  prdb("""target AR is higher than expected, decreasing newp[1]""")
394  newp[1] -= scale * (oldp[1] - oldp[0])
395  else:
396  prdb("""target AR is lower than expected, increasing newp[1]""")
397  newp[1] += scale * (oldp[1] - oldp[0])
398  return newp[1]
399 
400 
401 def update_any_cv_step(newp, oldp, ind, targetAR=0.4, Cv=None, **kwargs):
402  """here we use the average AR formula of two gaussians to get newp[1] as a
403  function of newp[1], knowing the targetAR and estimating the Cv. If
404  targetAR is negative, consider that mean(ind) equals the target AR and
405  skip any calculation in the case that oldp[0] equals newp[0].
406  step: suppose the heat capacity is stepwise constant, i.e. use the heat
407  capacity at position newp[0] as an estimate of the mean of the heat
408  capacity between newp[0] and newp[1]. This does not require any
409  self-consistent loop.
410  """
411 
412  global kB
413 
414  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
415  return oldp[1]
416  if targetAR < 0:
417  targetAR = sum(ind) / float(len(ind))
418  cv = Cv.get(newp[0])
419  Y = numpy.sqrt(2 * kB) * float(erfinv(1 - targetAR))
420  if Y ** 2 >= cv:
421  raise ValueError("""targetAR too small for this approximate method, use
422  the full self-consistent method instead.""")
423  return newp[0] * (cv + Y * numpy.sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2)
424 
425 
426 def update_any_cv_sc(newp, oldp, ind, targetAR=0.4, Cv=None,
427  tol=1e-6, maxiter=10000):
428  """self-consistent solver version"""
429 
430  global kB
431 
432  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
433  return oldp[1]
434  if targetAR < 0:
435  targetAR = sum(ind) / float(len(ind))
436  cv = Cv.get(newp[0])
437  Y = numpy.sqrt(2 * kB) * float(erfinv(1 - targetAR))
438  if Y ** 2 >= cv:
439  raise ValueError("""targetAR too small for this approximate method, use
440  the full self-consistent method instead.""")
441  targetp = newp[0] * (cv + Y * numpy.sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2)
442  for i in range(maxiter):
443  cv = Cv.mean(newp[0], targetp)
444  (oldtargetp, targetp) = (
445  targetp, newp[0] * (cv + Y * numpy.sqrt(2 * cv - Y ** 2))
446  / (cv - Y ** 2))
447  if abs(targetp - oldtargetp) <= tol:
448  break
449  if numpy.isnan(targetp):
450  if Y ** 2 >= cv:
451  raise ValueError(
452  "targetAR too small for this approximate method, use the "
453  "full self-consistent method instead.")
454  else:
455  raise ValueError("""something unexpected happened""")
456  if i == maxiter - 1:
457  prdb("""Warning: unable to converge the self-consistent after %d
458  iterations and a tolerance of %f. Change the method or decrease the
459  tolerance!""" % (maxiter, tol))
460  prdb("converged after %d iterations and a tolerance of %f for x=%f" %
461  (i, tol, oldp[1]))
462  return targetp
463 
464 
465 def update_any_cv_scfull(newp, oldp, ind, targetAR=0.4, Cv=None,
466  tol=1e-6, maxiter=10000):
467  """self-consistent solver version, on the exact average AR equation"""
468 
469  # create helper functions and overlap function
470  _ = r('u21 <- function(t1,t2) { integrate(Vectorize(cvspline),'
471  't1,t2)$value }')
472  _ = r('b21 <- function(t1,t2) { 1./(kB*t2) - 1./(kB*t1) }')
473  _ = r('sigma2 <- function(t) {cvspline(t)*kB*t**2}')
474  _rovboltz = r('ovboltz <- function(t1,t2) {\
475  1/2*( 1-erf(\
476  u21(t1,t2)/sqrt(2*(sigma2(t1)+sigma2(t2))))\
477  + exp(b21(t1,t2)*(u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2))/2))\
478  * (1+erf((u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2)))\
479  /(sqrt(2*(sigma2(t1)+sigma2(t2))))))\
480  )}')
481  _rrootfn = r(
482  'rootfn <- function(t2) {ovboltz(%f,t2)-%f}' %
483  (newp[0], targetAR))
484 
485  # find upper bound for estimation, raise an error if cv is negative
486  if oldp[1] > oldp[0]:
487  tmp = newp[0] * 1.05
488  else:
489  tmp = newp[0] * 0.95
490  nloops = 0
491  while _rrootfn(tmp)[0] >= 0:
492  nloops += 1
493  tmp += (oldp[1] - oldp[0])
494  if Cv.get(tmp) < 0:
495  raise RuntimeError("heat capacity goes negative")
496  if nloops > maxiter:
497  raise RuntimeError('could not find zero of function!')
498 
499  # find root
500  _runiroot = r('uniroot(rootfn,c(%f,%f),f.lower = %f, f.upper = %f, '
501  'tol = %f, maxiter = %d)'
502  % (newp[0], tmp, 1 - targetAR, -targetAR, tol, maxiter))
503  prdb("self-consistent solver converged after %s iterations and an "
504  "estimated precision of %s " % (_runiroot[2][0], _runiroot[3][0]))
505  prdb(
506  ["root:",
507  _runiroot[0][0],
508  "overlap:",
509  _rovboltz(newp[0],
510  _runiroot[0][0])[0]])
511  return _runiroot[0][0]
512 
513 
514 def update_any_cv_nr(newp, oldp, ind, targetAR=0.4, Cv=None, **kwargs):
515  """newton-raphson solver version"""
516 
517  # use nlm
518  raise NotImplementedError
519 
520 # Testing methods
521 
522 
523 def are_equal_to_targetAR(
524  indicators,
525  targetAR=0.4,
526  alpha=0.05,
527  method="binom"):
528  """here, all indicators have same average, we want to know if it is
529  equal to targetAR
530  """
531 
532  # calculate sample mean deviation of each indicator function from targetAR
533  deviations = sorted([(abs(sum(ind) / float(len(ind)) - targetAR), ind)
534  for pos, ind in enumerate(indicators)])
535  deviant = deviations[-1]
536 
537  # perform t-test
538  if method == "ttest":
539  # from statlib.stats import ttest_1samp as ttest
540  our_ttest = ttest
541  elif method == "binom":
542  our_ttest = binom
543  else:
544  raise NotImplementedError
545 
546  try:
547  test, pval = our_ttest(deviant[1], targetAR)
548  except: # noqa: E722
549  if abs(targetAR - sum(deviant[1]) / len(deviant[1])) > EPSILON:
550  pval = 0
551  else:
552  pval = 1
553  if pval < alpha:
554  return False
555  else:
556  return True
557 
558 
559 def are_stationnary(indicators, alpha=0.05, method="anova"):
560  """test on the stationarity of the observations (block analysis). Done
561  so by launching an anova on the difference between the two halves of
562  each observations.
563  """
564 
565  if method == "kruskal":
566  test = kruskal
567  else:
568  test = anova
569 
570  tmp = numpy.array(indicators)
571  blocklen = len(indicators[0]) / 2
572  block = tmp[:, :blocklen] - tmp[:, blocklen:2 * blocklen]
573  if test(*block)['pval'] < alpha:
574  return False
575  else:
576  return True
577 
578 
579 def are_equal(indicators, targetAR=0.4, alpha=0.05,
580  method="anova", varMethod="skip", power=0.8):
581  """Perform a one-way ANOVA or kruskal-wallis test on the indicators set,
582  and return True if one cannot exclude with risk alpha that the indicators
583  AR are different (i.e. True = all means are equal). Also performs a test
584  on the variance (they must be equal).
585  """
586 
587  if min(targetAR, 1 - targetAR) * len(indicators[0]) <= 5 and \
588  (varMethod == "bartlett" or method == "anova"):
589  prdb("Warning: normal approximation to the binomial does not hold!")
590 
591  # test the variances
592  if varMethod == "skip":
593  pass
594  else:
595  if varMethod == "bartlett":
596  pval = bartlett(*indicators)[1]
597  elif varMethod == "fligner":
598  pval = fligner(*indicators)[1]
599  else:
600  raise NotImplementedError(
601  "variance testing method unknown: %s" %
602  varMethod)
603  if pval < alpha:
604  prdb("Warning: performing mean test with unequal variances.")
605 
606  if method == "kruskal":
607  test = kruskal
608  else:
609  test = anova
610 
611  tr = test(*indicators)
612  tr['alpha'] = alpha
613  # p-value < alpha => H0 rejected => result == False
614  tr['result'] = tr['pval'] >= alpha
615 
616  return tr
617 
618 
619 def find_good_ARs(indicators, targetAR=0.4, alpha=0.05, method="binom"):
620  """perform one-sample t-tests on each of the data sets, and return
621  a tuple of bool of size N-1, False if AR i is not equal to targetAR
622  at risk alpha.
623  """
624 
625  # calculate sample means of each indicator function
626  means = sorted([(sum(ind) / float(len(ind)), pos, ind)
627  for pos, ind in enumerate(indicators)])
628 
629  # perform t-test
630  if method == "ttest":
631  # from statlib.stats import ttest_1samp as ttest
632  our_ttest = ttest
633  elif method == "binom":
634  our_ttest = binom
635  else:
636  raise NotImplementedError
637 
638  isGoodTuple = []
639  # start from the lowest means and stop when they are ok
640  prdb("starting left")
641  for (i, (mean, pos, ind)) in enumerate(means):
642  prdb("performing t-test on couple %d having average AR %f, "
643  "position %d" % (pos, mean, i))
644  try:
645  test, pval = our_ttest(ind, targetAR)
646  except: # noqa: E722
647  if abs(targetAR - mean) > EPSILON:
648  pval = 0
649  else:
650  pval = 1
651  if pval < alpha:
652  # means are different
653  isGoodTuple.append((pos, False))
654  else:
655  goodstart = i
656  break
657  # then start from the highest means
658  prdb("starting right")
659  for (i, (mean, pos, ind)) in enumerate(reversed(means)):
660  prdb("performing t-test on couple %d having average AR %f, position %d"
661  % (pos, mean, len(means) - 1 - i))
662  if our_ttest(ind, targetAR)[1] < alpha:
663  # means are different
664  isGoodTuple.append((pos, False))
665  else:
666  goodstop = len(means) - 1 - i
667  break
668 
669  # limiting cases: all different
670  if len(isGoodTuple) > len(indicators):
671  return tuple([False] * len(indicators))
672  # all equal
673  elif len(isGoodTuple) == 0:
674  return tuple([True] * len(indicators))
675  # intermediate
676  else:
677  isGoodTuple.extend([(means[i][1], True) for i in
678  range(goodstart, goodstop + 1)])
679  isGoodTuple.sort()
680  return tuple([tup[1] for tup in isGoodTuple])
681 
682 # Trebst, Katzgraber, Nadler and Hansmann optimum flux stuff
683 
684 
685 def mean_first_passage_times(
686  replicanums_ori,
687  subs=1,
688  start=0,
689  use_avgAR=False):
690  """compute mean first passage times as suggested in
691  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
692 
693  use_avgAR : if a list of average ARs is given computes everything from
694  average AR; if it is False, compute by direct counting of events.
695 
696  returns:
697  If use_avgAR == False:
698  tau0, tauN, chose_N, times0, timesN
699  else:
700  tau0, tauN, None, None, None
701  tau0[i]: average time to go from replica 0 to replica i
702  tauN[i]: average time to go from replica N to replica i
703  times0 and timesN are the corresponding lists of single events.
704 
705  """
706 
707  from numpy import array, zeros
708  replicanums = array(replicanums_ori)[:, start::subs]
709  N = len(replicanums)
710  tauN = [0] * N
711  tau0 = [0] * N
712 
713  if use_avgAR:
714  tau0[0] = 0.0
715  tauN[-1] = 0.0
716  for state in range(1, N):
717  tau0[state] = tau0[state - 1] + \
718  state / (float(use_avgAR[state - 1]))
719  for state in reversed(range(1, N)):
720  tauN[state - 1] = tauN[state] \
721  + (N - (state - 1)) / (float(use_avgAR[state - 1]))
722 
723  return tau0, tauN, None, None, None
724 
725  else:
726  # prdb('not using average AR')
727  if sys.version_info[0] >= 3:
728  izip = zip
729  else:
730  from itertools import izip
731  # the algorithm looks for replicas that start at the lowest temp, and
732  # records the farthest state it went to before returning to zero. Once
733  # back it increments the counter of all concerned replicas. Similar
734  # procedure if starting from N.
735  store0 = zeros((N, N), dtype=bool)
736  last0 = [0 for i in range(N)]
737  already0 = [False for i in range(N)]
738  times0 = [[] for i in range(N)]
739  storeN = zeros((N, N), dtype=bool)
740  lastN = [0 for i in range(N)]
741  alreadyN = [False for i in range(N)]
742  timesN = [[] for i in range(N)]
743 
744  # prdb('looping over replicanums')
745  for time, frame in enumerate(izip(*replicanums)):
746  # case of the replica in state 0
747  if not already0[frame[0]]:
748  last0[frame[0]] = time
749  store0[frame[0], :] = True
750  already0[frame[0]] = True
751  # case of the replica in state N
752  if not alreadyN[frame[-1]]:
753  lastN[frame[-1]] = time
754  storeN[frame[-1], :] = True
755  alreadyN[frame[-1]] = True
756  # set already flags to False when in state 1 or N-1
757  already0[frame[1]] = False
758  alreadyN[frame[-2]] = False
759  # loop over all states
760  for state, rep in enumerate(frame):
761  if store0[rep, state]:
762  # reached a state for the first time since 0
763  store0[rep, state] = False
764  # store time since this replica left state 0
765  times0[state].append(time - last0[rep])
766  if storeN[rep, state]:
767  # reached a state for the first time since N
768  storeN[rep, state] = False
769  # store time since this replica left state N
770  timesN[state].append(time - lastN[rep])
771  # prdb([replicanums.shape, len(storeN), len(last0)])
772  # times = [[] for i in range(N)]
773  chose_N = [len(timesN[state]) > len(times0[state]) for state in
774  range(N)]
775  for state in range(N):
776  tauN[state] = sum(timesN[state]) / float(len(timesN[state]))
777  tau0[state] = sum(times0[state]) / float(len(times0[state]))
778  # prdb(len(chose_N))
779 
780  return tau0, tauN, chose_N, times0, timesN
781 
782 
783 def compute_effective_fraction(tau0, tauN, chose_N):
784  """input: tau0, tauN, chose_N
785  output: effective fraction f(T) (P_up(n)) as introduced in
786  Trebst S, Troyer M, Hansmann UHE, J Chem Phys *124* 174903 (2006).
787  formalized in
788  Nadler W, Hansmann UHE, Phys Rev E *75* 026109 (2007)
789  and whose calculation is enhanced in
790  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
791  the ideal value of f(n) should be 1 - n/N
792  """
793 
794  # nstar is the index of the last state where tau0 should be used.
795  N = len(tau0)
796  if chose_N is None:
797  nstar = N / 2
798  else:
799  nstar = N - sum([int(a) for a in chose_N]) - 1
800 
801  prdb("n* = %d" % nstar)
802  # compute helper functions h
803  h0 = [0] * N
804  h0[1] = tau0[1]
805  for state in range(2, nstar + 1):
806  h0[state] = h0[state - 1] + \
807  (tau0[state] - tau0[state - 1]) / float(state)
808 
809  hN = [0] * N
810  hN[-2] = tauN[-2]
811  for state in reversed(range(nstar, N - 1)):
812  hN[state] = hN[state + 1] + \
813  (tauN[state] - tauN[state + 1]) / float(N - state)
814 
815  # compute flow probabilities
816  pup = [0] * N
817  pup[0] = 1
818  for n in range(1, nstar + 1):
819  pup[n] = 1 - h0[n] / (h0[nstar] + hN[nstar])
820  for n in range(nstar + 1, N):
821  pup[n] = hN[n] / (h0[nstar] + hN[nstar])
822 
823  return pup
824 
825 
826 def spline_diffusivity(pup, params):
827  """spline interpolation of diffusivity: D = 1/(df/dT * heta)
828  """
829  from numpy import linspace
830  robjects.globalenv["hetay"] = \
831  robjects.FloatVector(linspace(0, 1, num=len(params)).tolist())
832  robjects.globalenv["hetax"] = robjects.FloatVector(params)
833  robjects.globalenv["pupx"] = robjects.FloatVector(params)
834  robjects.globalenv["pupy"] = robjects.FloatVector(pup)
835  _ = r('heta <- splinefun(hetax,hetay,method="monoH.FC")')
836  _ = r('eff <- splinefun(pupx,pupy,method="monoH.FC")')
837  diff = r('diff <- function(x) {-1/(heta(x,deriv=1)*eff(x,deriv=1))}')
838  return lambda x: diff(x)[0]
839 
840 
841 # Misc
842 def compute_indicators(replicanums, subs=1, start=0):
843  """input: replicanums : a list of N lists of size M, where N is the number
844  of states and M is the length of the simulation. Each element is an
845  integer, and corresponds to the label of a replica.
846  output: an indicator function of exchanges (size (N-1)x(M-1)), 1 if
847  exchange and 0 if not.
848  """
849  def exchange(n, m):
850  if replicanums[n][m] == replicanums[n + 1][m + 1] \
851  and replicanums[n][m + 1] == replicanums[n + 1][m]:
852  return 1
853  else:
854  return 0
855 
856  indicators = []
857  for n in range(len(replicanums) - 1):
858  indicators.append(
859  [exchange(n, m)
860  for m in range(len(replicanums[n]) - 1)][start::subs])
861  return indicators
862 
863 # Main routines
864 
865 
866 def update_params_nonergodic(pup, params, write_g=False, num=False):
867 
868  from numpy import linspace
869  # g = spline(zip(pup,params),0,method='monoH.FC')
870  g = linear_interpolation(list(zip(pup, params)), 0)
871  if write_g:
872  d = spline_diffusivity(pup, params)
873  fl = open('g', 'w')
874  fl.write("".join(["%f %f\n" % (x, g(x))
875  for x in linspace(0, 1, num=100)]))
876  fl.close()
877  fl = open('diffusivity', 'w')
878  fl.write(''.join(["%f %f\n" % (x, d(x)) for x in
879  linspace(params[0], params[-1], num=100)]))
880  fl.close()
881  fl = open('pup', 'w')
882  fl.write("".join(["%f %f\n" % (i, j) for (i, j) in zip(params, pup)]))
883  fl.close()
884 
885  if num is False:
886  newparams = [g(i) for i in reversed(linspace(0, 1, num=len(params)))]
887  else:
888  newparams = [g(i) for i in reversed(linspace(0, 1, num=num))]
889  # for numerical issues
890  newparams[0] = params[0]
891  newparams[-1] = params[-1]
892 
893  return newparams
894 
895 
896 def update_params(
897  indicators, params, isGood, targetAR=0.4, immobilePoint=1,
898  Cv=None, badMethod="dumb", goodMethod="dumb", dumb_scale=0.1):
899  """update the parameters according to the isGood tuple and using the
900  specified methods"""
901 
902  newparams = list(params) # make a copy
903 
904  if immobilePoint != 1:
905  raise NotImplementedError
906 
907  if Cv is None and (badMethod != "dumb" or goodMethod != "dumb"):
908  raise RuntimeError("""Cv needs to be estimated if using other methods
909  than 'dumb' for updating!""")
910 
911  if goodMethod == "dumb":
912  update_good = update_good_dumb
913  elif goodMethod == "step":
914  update_good = update_any_cv_step
915  elif goodMethod == "sc":
916  update_good = update_any_cv_sc
917  elif goodMethod == "scfull":
918  update_good = update_any_cv_scfull
919  elif goodMethod == "nr":
920  update_good = update_any_cv_nr
921  else:
922  raise NotImplementedError(goodMethod)
923  if badMethod == "dumb":
924  update_bad = update_bad_dumb
925  elif badMethod == "step":
926  update_bad = update_any_cv_step
927  elif badMethod == "sc":
928  update_bad = update_any_cv_sc
929  elif badMethod == "scfull":
930  update_bad = update_any_cv_scfull
931  elif badMethod == "nr":
932  update_bad = update_any_cv_nr
933  else:
934  raise NotImplementedError(badMethod)
935 
936  # scan each position starting from the immobilePoint
937  for pos in range(len(params) - 1):
938  if isGood[pos]:
939  newparams[pos + 1] = update_good(
940  newparams[pos:pos + 2], params[pos:pos + 2],
941  indicators[pos], targetAR=targetAR, Cv=Cv)
942  else:
943  newparams[pos + 1] = update_bad(
944  newparams[pos:pos + 2], params[pos:pos + 2],
945  indicators[pos], targetAR=targetAR, Cv=Cv, scale=dumb_scale)
946 
947  return tuple(newparams)
948 
949 
950 def tune_params_flux(replicanums, params, subs=1, start=0, alpha=0.05,
951  testMethod='anova', meanMethod='binom', use_avgAR=False,
952  power=0.8, num=False):
953  # num is here if you want to add some more temperatures. indicate total
954  # number of replicas
955 
956  # TODO: do case where one estimates all based on target AR.
957  if use_avgAR is not False:
958  raise NotImplementedError
959 
960  prdb("computing mean first passage times")
961  tau0, tauN, chose_N, times0, timesN = mean_first_passage_times(
962  replicanums, subs=subs, start=start, use_avgAR=use_avgAR)
963 
964  prdb("average round trip time: %.2f (%d+%d events)" %
965  (tau0[-1] + tauN[0], len(times0[-1]), len(timesN[0])))
966  prdb("checking if the parameterset needs to be improved")
967  N = len(replicanums)
968  if chose_N is None:
969  nstar = N / 2
970  else:
971  nstar = N - sum([int(a) for a in chose_N]) - 1
972 
973  reduced = []
974  # no need to check for times?[0] or times?[N]
975  for n in range(1, N - 1):
976  if n > nstar:
977  reduced.append([i * 2.0 / ((N - n) * (N - n + 1))
978  for i in timesN[n]])
979  else:
980  reduced.append([i * 2.0 / (n * (n + 1)) for i in times0[n]])
981 
982  anova_result = are_equal(reduced, alpha=alpha, method=testMethod,
983  power=power)
984  if (anova_result['result']): # TODO test if equal to targetAR
985  prdb("flux is constant, nothing to do!")
986  min_n = minimum_n(anova_result, alpha)
987  prdb('Try to rerun this test with at least %d more samples.' %
988  numpy.ceil(min_n))
989  return (False, min_n)
990 
991  # the flux is not constant so the parameters need improvement.
992  # calculate the estimate of the effective fraction
993  prdb("parameterset not optimal, computing effective fraction")
994  pup = compute_effective_fraction(tau0, tauN, chose_N)
995 
996  # improve parameterset
997  prdb("returning new parameterset")
998  params = update_params_nonergodic(pup, params, num=num)
999 
1000  return (True, params)
1001 
1002 
1003 def tune_params_ar(indicators, params, targetAR=0.4, alpha=0.05,
1004  immobilePoint=1, CvMethod="skip", badMethod="dumb",
1005  goodMethod="dumb", varMethod="skip", testMethod="anova",
1006  meanMethod="binom", energies=None, temps=None, power=0.8,
1007  dumb_scale=0.1):
1008  """Tune the replica-exchange parameters and return a new set.
1009 
1010  Arguments:
1011  indicators -- an (N-1)x(M-1) table, where entry at position (j,i)
1012  is True if replica j and j+1 performed an exchange between
1013  times i and i+1.
1014  params -- the current set of N parameters used in the simulation.
1015 
1016  Keyword arguments:
1017  targetAR -- the target AR which is wanted for the simulation
1018  (default: 0.4)
1019  alpha -- the type 1 error on the one-way ANOVA and subsequent
1020  t-tests (default: 5%)
1021  immobilePoint -- which replica should keep it's parameter fixed
1022  (default: 1st replica, e.g. 1)
1023  CvMethod -- the heat capacity estimation method (default:
1024  "skip", other options: "mbar", "spline", "constant")
1025  badMethod -- how to correct the (j+1)th parameter if the
1026  acceptance ratio between replicas j and j+1 is off the
1027  target value (default: "dumb", options: "step", "sc",
1028  "scfull", "nr")
1029  goodMethod -- how to update the value of the (j+1)th parameter
1030  in the case of a correctly exchanging couple, but if the jth
1031  parameter has been modified (default: "dumb",options: "step",
1032  "sc" self-consistent, "scfull" self-consistent using the exact
1033  equation, "nr" newton-raphson solver for the exact equation)
1034  dumb_scale -- (0.0-1.0) in the "dumb" method, scale wrong temperature
1035  intervals by this amount. (default: 0.1)
1036  testMethod -- how to test for the difference of the means,
1037  either "anova" for a one-way anova, or "kruskal" for a
1038  Kruskal-Wallis one-way non-parametric anova.
1039  meanMethod -- "ttest" for a two-sided one-sample t-test,
1040  "binom" for an exact binomial test of the probability.
1041  varMethod -- how to test for the equality of variances.
1042  "fligner" for the Fligner-Killeen non-parametric test,
1043  "bartlett" for Bartlett's test, "skip" to pass.
1044  energies -- if CvMethod is set to "mbar", the energies of each
1045  state as a function of time are used to estimate the heat capacity.
1046  temps -- the temperatures of the simulations, if estimating
1047  with "mbar".
1048 
1049  Return Value:
1050  returns a tuple: (bool, params). bool is True if params have
1051  changed, and params is the new set.
1052 
1053  """
1054 
1055  # perform ANOVA
1056  prdb("performing ANOVA")
1057  anova_result = are_equal(indicators, targetAR, alpha, method=testMethod,
1058  varMethod=varMethod, power=power)
1059  if (anova_result['result']
1060  and are_equal_to_targetAR(indicators, targetAR, alpha,
1061  method=meanMethod)):
1062  prdb("all means are equal to target AR, nothing to do!")
1063  min_n = minimum_n(anova_result, alpha)
1064  prdb(
1065  'Try to rerun this test with at least %d more samples.' %
1066  numpy.ceil(min_n))
1067  return (False, min_n)
1068  prdb("some means are different, performing t-tests")
1069 
1070  # perform two-by-two t-tests
1071  isGood = find_good_ARs(indicators, targetAR, alpha, method=meanMethod)
1072  if not (False in isGood):
1073  prdb("""Bad luck: ANOVA says means are not identical, but t-tests
1074  can't find the bad rates...""")
1075  return (False, params)
1076  prdb(isGood)
1077 
1078  # check if data is stationnary by doing a block analysis
1079  prdb("performing stationarity test")
1080  if not are_stationnary(indicators, alpha):
1081  prdb("Warning: Some simulations are not stationary!")
1082 
1083  # interpolate the heat capacity from the data
1084  # TODO: use all previous data, not just from this run.
1085  prdb("launching Cv estimation or skipping it")
1086  if CvMethod == "skip":
1087  Cv = None
1088  elif CvMethod == "interpolate" or CvMethod == "constant":
1089  Cv = CvEstimator(params, indicators=indicators, method=CvMethod)
1090  elif CvMethod == "mbar":
1091  Cv = CvEstimator(
1092  params,
1093  energies=energies,
1094  temps=temps,
1095  method=CvMethod)
1096  else:
1097  raise NotImplementedError(CvMethod)
1098 
1099  # update parameters
1100  prdb('updating params')
1101  # update the current parameter set to match the target AR
1102  params = update_params(indicators, params, isGood, targetAR=targetAR,
1103  immobilePoint=immobilePoint, Cv=Cv,
1104  badMethod=badMethod, goodMethod=goodMethod,
1105  dumb_scale=dumb_scale)
1106 
1107  prdb('Done')
1108  return (True, params)
1109 
1110 
1111 if __name__ == '__main__':
1112  import numpy
1113  replicanums = []
1114  for i in range(1, 8):
1115  replicanums.append(
1116  tuple(numpy.fromfile('data/replica-indices/%d.rep' % i,
1117  dtype=int, sep='\n')))
1118 
1119  prdb("replicanums: %dx%d" % (len(replicanums), len(replicanums[0])))
1120  params = tuple(numpy.fromfile('data/temperatures', sep=' '))
1121 
1122  prdb(params)
1123  indicators = compute_indicators(replicanums, subs=1, start=0)
1124  prdb("indicators: %dx%d" % (len(indicators), len(indicators[0])))
1125  prdb("Exchange rate:")
1126  prdb(numpy.array([sum(ind) / float(len(ind)) for ind in indicators]))
1127  numpy.array([sum(ind) / float(len(ind))
1128  for ind in indicators]).tofile('xchgs', sep='\n')
1129  changed, newparams = tune_params_ar(
1130  indicators, params, targetAR=0.25,
1131  badMethod="dumb", goodMethod="dumb", CvMethod="skip",
1132  testMethod="anova", alpha=0.05)
1133  if not changed:
1134  print("Parameter set seems optimal.")
1135  else:
1136  if True not in [abs(newparams[i + 1] - newparams[i]) < 1e-3
1137  for i in range(len(newparams) - 1)]:
1138  numpy.array(newparams).tofile('data/temperatures', sep=' ')
1139  else:
1140  print("PROBLEM IN NEW PARAMETERSET -> not saved")
1141  print("params :", params)
1142  print("new params:", newparams)
def estimate_cv_constant
try to guess which constant cv fits best
Definition: TuneRex.py:297
def estimate_cv_interpolate
interpolate using previous values, by reversing the approximate overlap function
Definition: TuneRex.py:262
When created, estimates the heat capacity from the energies or from the indicator functions using the...
Definition: TuneRex.py:220
def mean_interp
estimate the mean of Cv between two points.
Definition: TuneRex.py:350
def estimate_cv_mbar
use MBAR to get the heat capacity
Definition: TuneRex.py:309
def get_mbar
returns the point estimate of the first derivative of the energy with respect to the replica exchange...
Definition: TuneRex.py:342
def get_interp
returns the point estimate of the first derivative of the energy with respect to the replica exchange...
Definition: TuneRex.py:334