IMP logo
IMP Reference Guide  2.5.0
The Integrative Modeling Platform
TuneRex.py
1 #!/usr/bin/env python
2 
3 from __future__ import print_function
4 
5 __doc__ = """
6 This module provides a few methods to improve the efficiency of a replica-exchange simulation
7 by tuning its parameters.
8 Author: Yannick Spill
9 """
10 
11 import sys
12 #from mpmath import erf, erfinv
13 #from math import sqrt
14 #from random import randint
15 from numpy import *
16 from numpy.random import randint
17 import rpy2.robjects as robjects
18 
19 kB = 1.3806503 * 6.0221415 / 4184.0 # Boltzmann constant in kcal/mol/K
20 # here for float comparison. Floats are equal if their difference
21 EPSILON = 1e-8
22  # is smaller than EPSILON
23 debug = False
24 
25 
26 def prdb(arg):
27  if debug:
28  print(arg)
29 
30 # R compatibility functions
31 
32 r = robjects.r
33 robjects.globalenv["kB"] = kB
34 _rinverf = r('invErf <- function(x) {qnorm((1 + x) /2) / sqrt(2)}')
35 _rerf = r('erf <- function(x) {2 * pnorm(x * sqrt(2)) - 1}')
36 _rinvF = r('qf')
37 _rinterp = None
38 
39 
40 def erfinv(x):
41  return _rinverf(x)[0]
42 
43 
44 def erf(x):
45  return _rerf(x)[0]
46 
47 
48 def Finv(x, d1, d2):
49  return _rinvF(x, d1, d2)[0]
50 
51 
52 def spline(xy, mean, method=None):
53  """spline interpolation of (x,y) coordinates. If interpolation goes
54  negative, replace by mean value.
55  """
56  x, y = list(zip(*xy))
57  robjects.globalenv["x"] = robjects.FloatVector(x)
58  robjects.globalenv["y"] = robjects.FloatVector(y)
59  global _rinterp
60  #_rinterp = r.splinefun(x,y)
61  if method is None:
62  r('cvsplinenonbounded <- splinefun(x,y)')
63  else:
64  r('cvsplinenonbounded <- splinefun(x,y,method="%s")' % method)
65  _rinterp = r(
66  'cvspline <- function(x) { tmp = cvsplinenonbounded(x); if (tmp>0) {tmp} else {%f}}' %
67  mean)
68 
69  def interpolated(x):
70  global _rinterp
71  return _rinterp(x)[0]
72  return interpolated
73 
74 
75 def linear_interpolation(xy, mean):
76  """linear interpolation of (x,y) coordinates. No extrapolation possible.
77  """
78  x, y = list(zip(*xy))
79  robjects.globalenv["x"] = robjects.FloatVector(x)
80  robjects.globalenv["y"] = robjects.FloatVector(y)
81  global _rinterp
82  #_rinterp = r.splinefun(x,y)
83  _rinterp = r('cvspline <- approxfun(x,y)')
84 
85  def interpolated(x):
86  global _rinterp
87  return _rinterp(x)[0]
88  return interpolated
89 
90 
91 # R testing functions
92 
93 def anova(*args):
94  """perform anova using R and return statistic, p-value, between and within variance"""
95  ngroups = len(args) # number of groups
96  # nreps = len(args[0]) #number of repetitions
97  #group = r.gl(ngroups,nreps)
98  reps = r.rep(0, len(args[0]))
99  weight = robjects.FloatVector(args[0])
100  for i in range(1, len(args)):
101  reps += r.rep(i, len(args[i]))
102  weight += robjects.FloatVector(args[i])
103  group = r.factor(reps)
104  robjects.globalenv["weight"] = weight
105  robjects.globalenv["group"] = group
106  lm = r.lm("weight ~ group")
107  aov = r.anova(lm)
108  prdb(aov)
109  # F statistic, p-value, between and within variance
110  anova_result = {'fstat': aov[3][0],
111  'pval': aov[4][0],
112  'between': aov[2][0],
113  'within': aov[2][1],
114  'nsteps': [len(i) for i in args],
115  'nreps': ngroups,
116  'test': 'anova'} # nreps: number of replicas
117  return anova_result
118 
119 
120 def kruskal(*args):
121  """perform kruskal-wallis rank test"""
122  ngroups = len(args)
123  #nreps = len(args[0])
124  #group = r.gl(ngroups,nreps)
125  reps = r.rep(0, len(args[0]))
126  weight = robjects.FloatVector(args[0])
127  for i in range(1, len(args)):
128  reps += r.rep(i, len(args[i]))
129  weight += robjects.FloatVector(args[i])
130  group = r.factor(reps)
131  aov = r('kruskal.test')(group, weight)
132  prdb(aov)
133  kruskal_result = {'fstat': aov[0][0],
134  'pval': aov[2][0],
135  'nsteps': [len(i) for i in args],
136  'nreps': ngroups,
137  'test': 'kruskal'}
138  return kruskal_result # F statistic and p-value
139 
140 
141 def ttest(obs, target):
142  """perform a one-sample two-sided t-test on obs against target mean"""
143  test = r('t.test')(robjects.IntVector(obs), mu=target)
144  return test[0][0], test[2][0] # stat and p-value
145 
146 
147 def binom(obs, target):
148  """perform an exact binomial test on the mean of obs against target"""
149  success = sum(obs)
150  trials = len(obs)
151  test = r('binom.test')(success, trials, p=target)
152  return test[0][0], test[2][0] # stat and p-value
153 
154 
155 def bartlett(*args):
156  """perform bartlett's test on the equality of variances of the observations"""
157  ngroups = len(args)
158  nreps = len(args[0])
159  group = r.gl(ngroups, nreps)
160  weight = robjects.IntVector(args[0])
161  for i in args[1:]:
162  weight += robjects.IntVector(i)
163  robjects.globalenv["weight"] = weight
164  robjects.globalenv["group"] = group
165  var = r('bartlett.test')(weight, group)
166  return var[0][0], var[2][0] # statistic and p-value
167 
168 
169 def fligner(*args):
170  """perform Fligner-Killeen non-parametric test of the variance equality"""
171  ngroups = len(args)
172  nreps = len(args[0])
173  group = r.gl(ngroups, nreps)
174  weight = robjects.IntVector(args[0])
175  for i in args[1:]:
176  weight += robjects.IntVector(i)
177  robjects.globalenv["weight"] = weight
178  robjects.globalenv["group"] = group
179  var = r('fligner.test')(weight, group)
180  return var[0][0], var[2][0] # statistic and p-value
181 
182 
183 def power_test(ar, power=0.8, alpha=0.05):
184  """perform an anova power test and return
185  - the power of the test with this input data
186  - the number of points that would be needed to achieve a default power of 0.8
187  ar: the ouput of anova()
188  """
189  result = r('power.anova.test')(groups=ar['nreps'], n=min(ar['nsteps']),
190  between=ar['between'], within=ar['within'], sig=alpha)
191  prdb('the power of this anova was: %.3f' % result[5][0])
192  result = r('power.anova.test')(groups=ar['nreps'],
193  between=ar['between'], within=ar['within'], sig=alpha, pow=power)
194  prdb(
195  'To have a power of %.3f, there should be at least %d exchange attemps.' %
196  (power, result[1][0]))
197  return
198 
199 
200 def minimum_n(ar, alpha=0.05):
201  """This routine tries to return an estimate of the additional number of exchange trials that
202  could lead to a positive result of the anova (e.g. the average ARs are not the same). It is
203  still very crude. It also assumes that a one-way anova was made.
204  ar: the output of anova()
205  alpha: type I error
206  """
207  nreps = ar['nreps']
208  nsteps = ar['nsteps']
209  try:
210  nsteps = min(nsteps)
211  except:
212  pass
213  fstat = ar['fstat']
214  return (
215  nsteps *
216  (sqrt(Finv(1 - alpha, nreps - 1, nreps * (nsteps - 1)) / fstat) - 1)
217  )
218 
219 
220 # Heat capacity class
222 
223  """When created, estimates the heat capacity from the energies or from the
224  indicator functions using the specified method. Two methods are then
225  available to guess the heat capacity at a given parameter value: get()
226  returns a single point, and mean() returns the linear average between two
227  points.
228  """
229 
230  def __init__(self, params, energies=None, indicators=None,
231  method="constant", temps=None, write_cv=False):
232 
233  kB = 1.3806503 * 6.0221415 / 4184.0 # Boltzmann constant in kcal/mol/K
234  self.__initialized = False
235  self.__cv = []
236  self.method = method
237 
238  if method == "interpolate":
239  self.estimate_cv_interpolate(params, indicators)
240  self.get = self.get_interp
241  self.mean = self.mean_interp
242  elif method == "constant":
243  self.estimate_cv_constant(params, indicators)
244  self.get = lambda p: self.__cv
245  self.mean = lambda p1, p2: self.__cv
246  self.__cvfun = self.get
247  r('cvspline <- function(x) {%f}' % self.__cv)
248  elif method == "mbar":
249  self.estimate_cv_mbar(params, energies, temps)
250  self.get = self.get_mbar
251  self.mean = self.mean_mbar
252  else:
253  raise NotImplementedError(method)
254 
255  self.__initialized = True
256 
257  # write the heat capacity to a file
258  if write_cv:
259  fl = open('cv', 'w')
260  fl.write("".join(["%f %f\n" % (x, self.get(x))
261  for x in linspace(params[0] / 2, 2 * params[-1])]))
262  fl.close()
263 
264  def estimate_cv_interpolate(self, params, indicators):
265  """interpolate using previous values, by reversing the approximate overlap
266  function
267  """
268  if self.__initialized:
269  return
270  if len(indicators) != len(params) - 1:
271  raise ValueError(
272  "the length of indicators and params does not match!")
273  if params != tuple(sorted(params)):
274  raise NotImplementedError(
275  "unable to work on parameters that do not change monotonically")
276 
277  prdb("storing params and means")
278  self.__params = params
279  self.__pmeans = [(params[i] + params[i + 1]) / 2.
280  for i in range(len(params) - 1)]
281  prdb("computing __cv")
282  for i, ind in enumerate(indicators):
283  mean = sum(ind) / float(len(ind))
284  Y2 = 2 * kB * erfinv(1 - mean) ** 2
285  p1 = params[i]
286  p2 = params[i + 1]
287  self.__cv.append(
288  (self.__pmeans[i], (p1 ** 2 + p2 ** 2) * float(Y2) / (p2 - p1) ** 2))
289  prdb(self.__params)
290  prdb(self.__cv)
291  self.__cvmean = sum([i[1] for i in self.__cv]) / float(len(self.__cv))
292  if self.__cvmean < 0:
293  raise ValueError("Cv mean is negative!")
294  self.__cvfun = spline(self.__cv, self.__cvmean)
295  return
296 
297  def estimate_cv_constant(self, params, indicators):
298  """try to guess which constant cv fits best"""
299  if self.__initialized:
300  return
301  self.estimate_cv_interpolate(params, indicators)
302  self.__cv = self.__cvmean
303  return
304 
305  def needs_init(self):
306  if not self.__initialized:
307  raise RuntimeError("Class was not initialized correctly!")
308 
309  def estimate_cv_mbar(params, energies, temps):
310  "use MBAR to get the heat capacity"
311  raise NotImplementedError(method)
312  if self.__initialized:
313  return
314 
315  def _isinbounds(self, p, params):
316  """returns True if p is within parms, else false. the params list must be sorted ascendingly."""
317  if p < params[0] - EPSILON or p > params[-1] + EPSILON:
318  #prdb("Warning: value %f is outside of bounds, extrapolating." % p)
319  return False
320  else:
321  return True
322 
323  def _interpolate(self, xval, xlist):
324  """return interpolation of Cv at point xval, and return the average instead
325  if this value is negative.
326  """
327  self._isinbounds(xval, xlist)
328  val = self.__cvfun(xval)
329  if val > 0:
330  return val
331  else:
332  return self.__cvmean
333 
334  def get_interp(self, param):
335  """returns the point estimate of the first derivative of the energy with
336  respect to the replica exchange parameter (usually T or q).
337  This version assumes that the means of cv are given.
338  """
339  self.needs_init()
340  return self._interpolate(param, self.__pmeans)
341 
342  def get_mbar(self, param):
343  """returns the point estimate of the first derivative of the energy with
344  respect to the replica exchange parameter (usually T or q).
345  This version assumes that the values of cv are given.
346  """
347  self.needs_init()
348  return self._interpolate(param, self.__params)
349 
350  def mean_interp(self, pa, pb):
351  """estimate the mean of Cv between two points. Here the means were
352  stored previously
353  """
354  self.needs_init()
355  return self._interpolate((pa + pb) / 2., self.__pmeans)
356 
357  def mean_mbar(self, pa, pb):
358  self.needs_init()
359  return (self.get_mbar(pb) + self.get_mbar(pa)) / 2.
360 
361 # Parameter updating methods
362 
363 
364 def update_good_dumb(newp, oldp, *args, **kwargs):
365  """Here the old parameters are oldp[0] and oldp[1], and the starting point
366  is newp[0]. We should modify newp[1] so that the AR in the following cycle
367  is equal to the targetAR.
368  In the "dumb" method, the Cv and targetAR keywords are ignored.
369  Here the newp[1] parameter is modified because prior changes have set
370  newp[0] to a different value than oldp[0]. Thus, we should move newp[1] by
371  minimizing the effect on the AR since it is supposedly equal to targetAR. In
372  this simple method, the parameter is just translated.
373  """
374  prdb(
375  "newp[0] has moved (%.3f -> %.3f), adjusting the position of newp[1]" %
376  (oldp[0], newp[0]))
377  return oldp[1] - (oldp[0] - newp[0])
378 
379 
380 def update_bad_dumb(newp, oldp, ind, targetAR=0.4, scale=0.1, **kwargs):
381  """Here the old parameters are oldp[0] and oldp[1], and the starting point
382  is newp[0]. We should modify newp[1] so that the AR in the following cycle
383  is equal to the targetAR.
384  In the "dumb" method, the Cv keyword is ignored. Here the newp[1]
385  parameter is modified to follow the possible translation of newp[0] by
386  calling update_good_dumb, and then newp[1] is added or substracted scale% of
387  (oldp[1] - oldp[0]) to adjust to targetAR.
388  """
389 
390  if newp[0] != oldp[0]:
391  newp[1] = update_good_dumb(newp, oldp)
392  if targetAR > sum(ind) / float(len(ind)):
393  prdb("""target AR is higher than expected, decreasing newp[1]""")
394  newp[1] -= scale * (oldp[1] - oldp[0])
395  else:
396  prdb("""target AR is lower than expected, increasing newp[1]""")
397  newp[1] += scale * (oldp[1] - oldp[0])
398  return newp[1]
399 
400 
401 def update_any_cv_step(newp, oldp, ind, targetAR=0.4, Cv=None, **kwargs):
402  """here we use the average AR formula of two gaussians to get newp[1] as a
403  function of newp[1], knowing the targetAR and estimating the Cv. If targetAR
404  is negative, consider that mean(ind) equals the target AR and skip any
405  calculation in the case that oldp[0] equals newp[0].
406  step: suppose the heat capacity is stepwise constant, i.e. use the heat
407  capacity at position newp[0] as an estimate of the mean of the heat capacity
408  between newp[0] and newp[1]. This does not require any self-consistent loop.
409  """
410 
411  global kB
412 
413  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
414  return oldp[1]
415  if targetAR < 0:
416  targetAR = sum(ind) / float(len(ind))
417  cv = Cv.get(newp[0])
418  Y = sqrt(2 * kB) * float(erfinv(1 - targetAR))
419  if Y ** 2 >= cv:
420  raise ValueError("""targetAR too small for this approximate method, use
421  the full self-consistent method instead.""")
422  return newp[0] * (cv + Y * sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2)
423 
424 
425 def update_any_cv_sc(newp, oldp, ind, targetAR=0.4, Cv=None,
426  tol=1e-6, maxiter=10000):
427  """self-consistent solver version"""
428 
429  global kB
430 
431  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
432  return oldp[1]
433  if targetAR < 0:
434  targetAR = sum(ind) / float(len(ind))
435  cv = Cv.get(newp[0])
436  Y = sqrt(2 * kB) * float(erfinv(1 - targetAR))
437  if Y ** 2 >= cv:
438  raise ValueError("""targetAR too small for this approximate method, use
439  the full self-consistent method instead.""")
440  targetp = newp[0] * (cv + Y * sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2)
441  for i in range(maxiter):
442  cv = Cv.mean(newp[0], targetp)
443  (oldtargetp, targetp) = (
444  targetp, newp[0] * (cv + Y * sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2))
445  if abs(targetp - oldtargetp) <= tol:
446  break
447  if isnan(targetp):
448  if Y ** 2 >= cv:
449  raise ValueError("""targetAR too small for this approximate method, use
450  the full self-consistent method instead.""")
451  else:
452  raise ValueError("""something unexpected happened""")
453  if i == maxiter - 1:
454  prdb("""Warning: unable to converge the self-consistent after %d
455  iterations and a tolerance of %f. Change the method or decrease the
456  tolerance!""" % (maxiter, tol))
457  prdb("converged after %d iterations and a tolerance of %f for x=%f" %
458  (i, tol, oldp[1]))
459  return targetp
460 
461 
462 def update_any_cv_scfull(newp, oldp, ind, targetAR=0.4, Cv=None,
463  tol=1e-6, maxiter=10000):
464  """self-consistent solver version, on the exact average AR equation"""
465 
466  # create helper functions and overlap function
467  #_ru21 = r('u21 <- function(t1,t2) { (t2-t1)*(cvspline(t2)-cvspline(t1))/2. }')
468  _ru21 = r(
469  'u21 <- function(t1,t2) { integrate(Vectorize(cvspline),t1,t2)$value }')
470  _rb21 = r('b21 <- function(t1,t2) { 1./(kB*t2) - 1./(kB*t1) }')
471  _rsigma2 = r('sigma2 <- function(t) {cvspline(t)*kB*t**2}')
472  _rovboltz = r('ovboltz <- function(t1,t2) {\
473  1/2*( 1-erf(\
474  u21(t1,t2)/sqrt(2*(sigma2(t1)+sigma2(t2))))\
475  + exp(b21(t1,t2)*(u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2))/2))\
476  * (1+erf((u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2)))/(sqrt(2*(sigma2(t1)+sigma2(t2))))))\
477  )}')
478  _rrootfn = r(
479  'rootfn <- function(t2) {ovboltz(%f,t2)-%f}' %
480  (newp[0], targetAR))
481 
482  # find upper bound for estimation, raise an error if cv is negative
483  if oldp[1] > oldp[0]:
484  tmp = newp[0] * 1.05
485  else:
486  tmp = newp[0] * 0.95
487  nloops = 0
488  while _rrootfn(tmp)[0] >= 0:
489  nloops += 1
490  tmp += (oldp[1] - oldp[0])
491  if Cv.get(tmp) < 0:
492  raise RuntimeError("heat capacity goes negative")
493  if nloops > maxiter:
494  raise RuntimeError('could not find zero of function!')
495 
496  # find root
497  _runiroot = r(
498  'uniroot(rootfn,c(%f,%f),f.lower = %f, f.upper = %f, tol = %f, maxiter = %d)' % (newp[0], tmp,
499  1 - targetAR, -targetAR, tol, maxiter))
500  prdb(
501  "self-consistent solver converged after %s iterations and an estimated precision of %s " %
502  (_runiroot[2][0], _runiroot[3][0]))
503  prdb(
504  ["root:",
505  _runiroot[0][0],
506  "overlap:",
507  _rovboltz(newp[0],
508  _runiroot[0][0])[0]])
509  return _runiroot[0][0]
510 
511 
512 def update_any_cv_nr(newp, oldp, ind, targetAR=0.4, Cv=None, **kwargs):
513  """newton-raphson solver version"""
514 
515  # use nlm
516  raise NotImplementedError
517 
518 # Testing methods
519 
520 
521 def are_equal_to_targetAR(
522  indicators,
523  targetAR=0.4,
524  alpha=0.05,
525  method="binom"):
526  """here, all indicators have same average, we want to know if it is equal to
527  targetAR
528  """
529 
530  # calculate sample mean deviation of each indicator function from targetAR
531  deviations = sorted([(abs(sum(ind) / float(len(ind)) - targetAR), ind)
532  for pos, ind in enumerate(indicators)])
533  deviant = deviations[-1]
534 
535  # perform t-test
536  if method == "ttest":
537  #from statlib.stats import ttest_1samp as ttest
538  ttest = ttest
539  elif method == "binom":
540  ttest = binom
541  else:
542  raise NotImplementedError
543 
544  try:
545  test, pval = ttest(deviant[1], targetAR)
546  except:
547  if abs(targetAR - sum(deviant[1]) / len(deviant[1])) > EPSILON:
548  pval = 0
549  else:
550  pval = 1
551  if pval < alpha:
552  return False
553  else:
554  return True
555 
556 
557 def are_stationnary(indicators, alpha=0.05, method="anova"):
558  """test on the stationarity of the observations (block analysis). Done so by
559  launching an anova on the difference between the two halves of each observations.
560  """
561 
562  if method == "kruskal":
563  test = kruskal
564  else:
565  test = anova
566 
567  tmp = array(indicators)
568  blocklen = len(indicators[0]) / 2
569  block = tmp[:, :blocklen] - tmp[:, blocklen:2 * blocklen]
570  if test(*block)['pval'] < alpha:
571  return False
572  else:
573  return True
574 
575 
576 def are_equal(indicators, targetAR=0.4, alpha=0.05,
577  method="anova", varMethod="skip", power=0.8):
578  """Perform a one-way ANOVA or kruskal-wallis test on the indicators set,
579  and return True if one cannot exclude with risk alpha that the indicators
580  AR are different (i.e. True = all means are equal). Also performs a test
581  on the variance (they must be equal).
582  """
583 
584  if min(targetAR, 1 - targetAR) * len(indicators[0]) <= 5 and \
585  (varMethod == "bartlett" or method == "anova"):
586  prdb("Warning: normal approximation to the binomial does not hold!")
587 
588  # test the variances
589  if varMethod == "skip":
590  pass
591  else:
592  if varMethod == "bartlett":
593  pval = bartlett(*indicators)[1]
594  elif varMethod == "fligner":
595  pval = killeen(*indicators)[1]
596  else:
597  raise NotImplementedError(
598  "variance testing method unknown: %s" %
599  varMethod)
600  if pval < alpha:
601  prdb("Warning: performing mean test with unequal variances.")
602 
603  if method == "kruskal":
604  test = kruskal
605  else:
606  test = anova
607 
608  tr = test(*indicators)
609  tr['alpha'] = alpha
610  # p-value < alpha => H0 rejected => result == False
611  tr['result'] = tr['pval'] >= alpha
612 
613  # if tr['test'] == 'anova':
614  # try:
615  # power_test(tr, power=power, alpha=alpha)
616  # except:
617  # prdb("power test failed")
618  # pass
619 
620  return tr
621 
622 
623 def find_good_ARs(indicators, targetAR=0.4, alpha=0.05, method="binom"):
624  """perform one-sample t-tests on each of the data sets, and return
625  a tuple of bool of size N-1, False if AR i is not equal to targetAR
626  at risk alpha.
627  """
628 
629  # calculate sample means of each indicator function
630  means = sorted([(sum(ind) / float(len(ind)), pos, ind)
631  for pos, ind in enumerate(indicators)])
632 
633  # perform t-test
634  if method == "ttest":
635  #from statlib.stats import ttest_1samp as ttest
636  ttest = ttest
637  elif method == "binom":
638  ttest = binom
639  else:
640  raise NotImplementedError
641 
642  isGoodTuple = []
643  # start from the lowest means and stop when they are ok
644  prdb("starting left")
645  for (i, (mean, pos, ind)) in enumerate(means):
646  prdb(
647  "performing t-test on couple %d having average AR %f, position %d" %
648  (pos, mean, i))
649  try:
650  test, pval = ttest(ind, targetAR)
651  except:
652  if abs(targetAR - mean) > EPSILON:
653  pval = 0
654  else:
655  pval = 1
656  if pval < alpha:
657  # means are different
658  isGoodTuple.append((pos, False))
659  else:
660  goodstart = i
661  break
662  # then start from the highest means
663  prdb("starting right")
664  for (i, (mean, pos, ind)) in enumerate(reversed(means)):
665  prdb("performing t-test on couple %d having average AR %f, position %d"
666  % (pos, mean, len(means) - 1 - i))
667  if ttest(ind, targetAR)[1] < alpha:
668  # means are different
669  isGoodTuple.append((pos, False))
670  else:
671  goodstop = len(means) - 1 - i
672  break
673 
674  # limiting cases: all different
675  if len(isGoodTuple) > len(indicators):
676  return tuple([False] * len(indicators))
677  # all equal
678  elif len(isGoodTuple) == 0:
679  return tuple([True] * len(indicators))
680  # intermediate
681  else:
682  isGoodTuple.extend([(means[i][1], True) for i in
683  range(goodstart, goodstop + 1)])
684  isGoodTuple.sort()
685  return tuple([tup[1] for tup in isGoodTuple])
686 
687 # Trebst, Katzgraber, Nadler and Hansmann optimum flux stuff
688 
689 
690 def mean_first_passage_times(
691  replicanums_ori,
692  subs=1,
693  start=0,
694  use_avgAR=False):
695  """compute mean first passage times as suggested in
696  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
697 
698  use_avgAR : if a list of average ARs is given computes everything from
699  average AR; if it is False, compute by direct counting of events.
700 
701  returns:
702  If use_avgAR == False:
703  tau0, tauN, chose_N, times0, timesN
704  else:
705  tau0, tauN, None, None, None
706  tau0[i]: average time to go from replica 0 to replica i
707  tauN[i]: average time to go from replica N to replica i
708  times0 and timesN are the corresponding lists of single events.
709 
710  """
711 
712  from numpy import array, zeros
713  from pprint import pprint, pformat
714  replicanums = array(replicanums_ori)[:, start::subs]
715  N = len(replicanums)
716  tauN = [0] * N
717  tau0 = [0] * N
718 
719  if use_avgAR:
720  tau0[0] = 0.0
721  tauN[-1] = 0.0
722  for state in range(1, N):
723  tau0[state] = tau0[state - 1] + \
724  state / (float(use_avgAR[state - 1]))
725  for state in reversed(range(1, N)):
726  tauN[state - 1] = tauN[state] \
727  + (N - (state - 1)) / (float(use_avgAR[state - 1]))
728 
729  return tau0, tauN, None, None, None
730 
731  else:
732  #prdb('not using average AR')
733  if sys.version_info[0] >= 3:
734  izip = zip
735  else:
736  from itertools import izip
737  # the algorithm looks for replicas that start at the lowest temp, and
738  # records the farthest state it went to before returning to zero. Once
739  # back it increments the counter of all concerned replicas. Similar
740  # procedure if starting from N.
741  store0 = zeros((N, N), dtype=bool)
742  last0 = [0 for i in range(N)]
743  already0 = [False for i in range(N)]
744  times0 = [[] for i in range(N)]
745  storeN = zeros((N, N), dtype=bool)
746  lastN = [0 for i in range(N)]
747  alreadyN = [False for i in range(N)]
748  timesN = [[] for i in range(N)]
749 
750  #prdb('looping over replicanums')
751  for time, frame in enumerate(izip(*replicanums)):
752  # case of the replica in state 0
753  if not already0[frame[0]]:
754  last0[frame[0]] = time
755  store0[frame[0], :] = True
756  already0[frame[0]] = True
757  # case of the replica in state N
758  if not alreadyN[frame[-1]]:
759  lastN[frame[-1]] = time
760  storeN[frame[-1], :] = True
761  alreadyN[frame[-1]] = True
762  # set already flags to False when in state 1 or N-1
763  already0[frame[1]] = False
764  alreadyN[frame[-2]] = False
765  # loop over all states
766  for state, rep in enumerate(frame):
767  if store0[rep, state]:
768  # reached a state for the first time since 0
769  store0[rep, state] = False
770  # store time since this replica left state 0
771  times0[state].append(time - last0[rep])
772  if storeN[rep, state]:
773  # reached a state for the first time since N
774  storeN[rep, state] = False
775  # store time since this replica left state N
776  timesN[state].append(time - lastN[rep])
777  #prdb([replicanums.shape, len(storeN), len(last0)])
778  times = [[] for i in range(N)]
779  chose_N = [len(timesN[state]) > len(times0[state]) for state in
780  range(N)]
781  for state in range(N):
782  tauN[state] = sum(timesN[state]) / float(len(timesN[state]))
783  tau0[state] = sum(times0[state]) / float(len(times0[state]))
784  # prdb(len(chose_N))
785 
786  return tau0, tauN, chose_N, times0, timesN
787 
788 
789 def compute_effective_fraction(tau0, tauN, chose_N):
790  """input: tau0, tauN, chose_N
791  output: effective fraction f(T) (P_up(n)) as introduced in
792  Trebst S, Troyer M, Hansmann UHE, J Chem Phys *124* 174903 (2006).
793  formalized in
794  Nadler W, Hansmann UHE, Phys Rev E *75* 026109 (2007)
795  and whose calculation is enhanced in
796  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
797  the ideal value of f(n) should be 1 - n/N
798  """
799 
800  # nstar is the index of the last state where tau0 should be used.
801  N = len(tau0)
802  if chose_N is None:
803  nstar = N / 2
804  else:
805  nstar = N - sum([int(a) for a in chose_N]) - 1
806 
807  prdb("n* = %d" % nstar)
808  # compute helper functions h
809  h0 = [0] * N
810  h0[1] = tau0[1]
811  for state in range(2, nstar + 1):
812  h0[state] = h0[state - 1] + \
813  (tau0[state] - tau0[state - 1]) / float(state)
814 
815  hN = [0] * N
816  hN[-2] = tauN[-2]
817  for state in reversed(range(nstar, N - 1)):
818  hN[state] = hN[state + 1] + \
819  (tauN[state] - tauN[state + 1]) / float(N - state)
820 
821  # compute flow probabilities
822  pup = [0] * N
823  pup[0] = 1
824  for n in range(1, nstar + 1):
825  pup[n] = 1 - h0[n] / (h0[nstar] + hN[nstar])
826  for n in range(nstar + 1, N):
827  pup[n] = hN[n] / (h0[nstar] + hN[nstar])
828 
829  return pup
830 
831 
832 def spline_diffusivity(pup, params):
833  """spline interpolation of diffusivity: D = 1/(df/dT * heta)
834  """
835  from numpy import linspace
836  robjects.globalenv["hetay"] = \
837  robjects.FloatVector(linspace(0, 1, num=len(params)).tolist())
838  robjects.globalenv["hetax"] = robjects.FloatVector(params)
839  robjects.globalenv["pupx"] = robjects.FloatVector(params)
840  robjects.globalenv["pupy"] = robjects.FloatVector(pup)
841  heta = r('heta <- splinefun(hetax,hetay,method="monoH.FC")')
842  eff = r('eff <- splinefun(pupx,pupy,method="monoH.FC")')
843  diff = r('diff <- function(x) {-1/(heta(x,deriv=1)*eff(x,deriv=1))}')
844  return lambda x: diff(x)[0]
845 
846 
847 # Misc
848 def compute_indicators(replicanums, subs=1, start=0):
849  """input: replicanums : a list of N lists of size M, where N is the number of
850  states and M is the length of the simulation. Each element is an integer,
851  and corresponds to the label of a replica.
852  output: an indicator function of exchanges (size (N-1)x(M-1)), 1 if exchange and
853  0 if not.
854  """
855  def exchange(n, m):
856  if replicanums[n][m] == replicanums[n + 1][m + 1] \
857  and replicanums[n][m + 1] == replicanums[n + 1][m]:
858  return 1
859  else:
860  return 0
861 
862  indicators = []
863  for n in range(len(replicanums) - 1):
864  indicators.append([exchange(n, m)
865  for m in range(len(replicanums[n]) - 1)][start::subs])
866  return indicators
867 
868 # Main routines
869 
870 
871 def update_params_nonergodic(pup, params, write_g=False, num=False):
872 
873  from numpy import linspace
874  #g = spline(zip(pup,params),0,method='monoH.FC')
875  g = linear_interpolation(list(zip(pup, params)), 0)
876  if write_g:
877  d = spline_diffusivity(pup, params)
878  fl = open('g', 'w')
879  fl.write("".join(["%f %f\n" % (x, g(x))
880  for x in linspace(0, 1, num=100)]))
881  fl.close()
882  fl = open('diffusivity', 'w')
883  fl.write(''.join(["%f %f\n" % (x, d(x)) for x in
884  linspace(params[0], params[-1], num=100)]))
885  fl.close()
886  fl = open('pup', 'w')
887  fl.write("".join(["%f %f\n" % (i, j) for (i, j) in zip(params, pup)]))
888  fl.close()
889 
890  if num is False:
891  newparams = [g(i) for i in reversed(linspace(0, 1, num=len(params)))]
892  else:
893  newparams = [g(i) for i in reversed(linspace(0, 1, num=num))]
894  # for numerical issues
895  newparams[0] = params[0]
896  newparams[-1] = params[-1]
897 
898  return newparams
899 
900 
901 def update_params(
902  indicators, params, isGood, targetAR=0.4, immobilePoint=1,
903  Cv=None, badMethod="dumb", goodMethod="dumb", dumb_scale=0.1):
904  """update the parameters according to the isGood tuple and using the
905  specified methods"""
906 
907  newparams = list(params) # make a copy
908 
909  if immobilePoint != 1:
910  raise NotImplementedError
911 
912  if Cv is None and (badMethod != "dumb" or goodMethod != "dumb"):
913  raise RuntimeError("""Cv needs to be estimated if using other methods
914  than 'dumb' for updating!""")
915 
916  if goodMethod == "dumb":
917  update_good = update_good_dumb
918  elif goodMethod == "step":
919  update_good = update_any_cv_step
920  elif goodMethod == "sc":
921  update_good = update_any_cv_sc
922  elif goodMethod == "scfull":
923  update_good = update_any_cv_scfull
924  elif goodMethod == "nr":
925  update_good = update_any_cv_nr
926  else:
927  raise NotImplementedError(goodMethod)
928  if badMethod == "dumb":
929  update_bad = update_bad_dumb
930  elif badMethod == "step":
931  update_bad = update_any_cv_step
932  elif badMethod == "sc":
933  update_bad = update_any_cv_sc
934  elif badMethod == "scfull":
935  update_bad = update_any_cv_scfull
936  elif badMethod == "nr":
937  update_bad = update_any_cv_nr
938  else:
939  raise NotImplementedError(badMethod)
940 
941  # scan each position starting from the immobilePoint
942  for pos in range(len(params) - 1):
943  if isGood[pos]:
944  newparams[pos + 1] = update_good(
945  newparams[pos:pos + 2], params[pos:pos + 2],
946  indicators[pos], targetAR=targetAR, Cv=Cv)
947  else:
948  newparams[pos + 1] = update_bad(
949  newparams[pos:pos + 2], params[pos:pos + 2],
950  indicators[pos], targetAR=targetAR, Cv=Cv, scale=dumb_scale)
951 
952  return tuple(newparams)
953 
954 
955 def tune_params_flux(replicanums, params, subs=1, start=0, alpha=0.05,
956  testMethod='anova', meanMethod='binom', use_avgAR=False,
957  power=0.8, num=False):
958  # num is here if you want to add some more temperatures. indicate total
959  # number of replicas
960 
961  # TODO: do case where one estimates all based on target AR.
962  if use_avgAR is not False:
963  raise NotImplementedError
964 
965  prdb("computing mean first passage times")
966  tau0, tauN, chose_N, times0, timesN = mean_first_passage_times(replicanums,
967  subs=subs, start=start, use_avgAR=use_avgAR)
968 
969  prdb("average round trip time: %.2f (%d+%d events)" %
970  (tau0[-1] + tauN[0], len(times0[-1]), len(timesN[0])))
971  prdb("checking if the parameterset needs to be improved")
972  N = len(replicanums)
973  if chose_N is None:
974  nstar = N / 2
975  else:
976  nstar = N - sum([int(a) for a in chose_N]) - 1
977 
978  reduced = []
979  # no need to check for times?[0] or times?[N]
980  for n in range(1, N - 1):
981  if n > nstar:
982  reduced.append([i * 2.0 / ((N - n) * (N - n + 1))
983  for i in timesN[n]])
984  else:
985  reduced.append([i * 2.0 / (n * (n + 1)) for i in times0[n]])
986 
987  anova_result = are_equal(reduced, alpha=alpha, method=testMethod,
988  power=power)
989  if (anova_result['result']): # TODO test if equal to targetAR
990  prdb("flux is constant, nothing to do!")
991  min_n = minimum_n(anova_result, alpha)
992  prdb('Try to rerun this test with at least %d more samples.' %
993  ceil(min_n))
994  return (False, min_n)
995 
996  # the flux is not constant so the parameters need improvement.
997  # calculate the estimate of the effective fraction
998  prdb("parameterset not optimal, computing effective fraction")
999  pup = compute_effective_fraction(tau0, tauN, chose_N)
1000 
1001  # improve parameterset
1002  prdb("returning new parameterset")
1003  params = update_params_nonergodic(pup, params, num=num)
1004 
1005  return (True, params)
1006 
1007 
1008 def tune_params_ar(
1009  indicators, params, targetAR=0.4, alpha=0.05, immobilePoint=1, CvMethod="skip", badMethod="dumb", goodMethod="dumb",
1010  varMethod="skip", testMethod="anova", meanMethod="binom",
1011  energies=None, temps=None, power=0.8, dumb_scale=0.1):
1012  """Tune the replica-exchange parameters and return a new set.
1013 
1014  Arguments:
1015  indicators -- an (N-1)x(M-1) table, where entry at position (j,i)
1016  is True if replica j and j+1 performed an exchange between
1017  times i and i+1.
1018  params -- the current set of N parameters used in the simulation.
1019 
1020  Keyword arguments:
1021  targetAR -- the target AR which is wanted for the simulation
1022  (default: 0.4)
1023  alpha -- the type 1 error on the one-way ANOVA and subsequent
1024  t-tests (default: 5%)
1025  immobilePoint -- which replica should keep it's parameter fixed
1026  (default: 1st replica, e.g. 1)
1027  CvMethod -- the heat capacity estimation method (default:
1028  "skip", other options: "mbar", "spline", "constant")
1029  badMethod -- how to correct the (j+1)th parameter if the
1030  acceptance ratio between replicas j and j+1 is off the
1031  target value (default: "dumb", options: "step", "sc", "scfull", "nr")
1032  goodMethod -- how to update the value of the (j+1)th parameter
1033  in the case of a correctly exchanging couple, but if the jth
1034  parameter has been modified (default: "dumb",options: "step",
1035  "sc" self-consistent, "scfull" self-consistent using the exact equation,
1036  "nr" newton-raphson solver for the exact equation)
1037  dumb_scale -- (0.0-1.0) in the "dumb" method, scale wrong temperature
1038  intervals by this amount. (default: 0.1)
1039  testMethod -- how to test for the difference of the means,
1040  either "anova" for a one-way anova, or "kruskal" for a
1041  Kruskal-Wallis one-way non-parametric anova.
1042  meanMethod -- "ttest" for a two-sided one-sample t-test,
1043  "binom" for an exact binomial test of the probability.
1044  varMethod -- how to test for the equality of variances.
1045  "fligner" for the Fligner-Killeen non-parametric test,
1046  "bartlett" for Bartlett's test, "skip" to pass.
1047  energies -- if CvMethod is set to "mbar", the energies of each
1048  state as a function of time are used to estimate the heat capacity.
1049  temps -- the temperatures of the simulations, if estimating with "mbar".
1050 
1051  Return Value:
1052  returns a tuple: (bool, params). bool is True if params have
1053  changed, and params is the new set.
1054 
1055  """
1056 
1057  # perform ANOVA
1058  prdb("performing ANOVA")
1059  anova_result = are_equal(indicators, targetAR, alpha, method=testMethod,
1060  varMethod=varMethod, power=power)
1061  if (anova_result['result'] and
1062  are_equal_to_targetAR(indicators, targetAR, alpha, method=meanMethod)):
1063  prdb("all means are equal to target AR, nothing to do!")
1064  min_n = minimum_n(anova_result, alpha)
1065  prdb(
1066  'Try to rerun this test with at least %d more samples.' %
1067  ceil(min_n))
1068  return (False, min_n)
1069  prdb("some means are different, performing t-tests")
1070 
1071  # perform two-by-two t-tests
1072  isGood = find_good_ARs(indicators, targetAR, alpha, method=meanMethod)
1073  if not (False in isGood):
1074  prdb("""Bad luck: ANOVA says means are not identical, but t-tests
1075  can't find the bad rates...""")
1076  return (False, params)
1077  prdb(isGood)
1078 
1079  # check if data is stationnary by doing a block analysis
1080  prdb("performing stationarity test")
1081  if not are_stationnary(indicators, alpha):
1082  prdb("Warning: Some simulations are not stationary!")
1083 
1084  # interpolate the heat capacity from the data
1085  # TODO: use all previous data, not just from this run.
1086  prdb("launching Cv estimation or skipping it")
1087  if CvMethod == "skip":
1088  Cv = None
1089  elif CvMethod == "interpolate" or CvMethod == "constant":
1090  Cv = CvEstimator(params, indicators=indicators, method=CvMethod)
1091  elif CvMethod == "mbar":
1092  Cv = CvEstimator(
1093  params,
1094  energies=energies,
1095  temps=temps,
1096  method=CvMethod)
1097  else:
1098  raise NotImplementedError(CvMethod)
1099 
1100  # update parameters
1101  prdb('updating params')
1102  # update the current parameter set to match the target AR
1103  params = update_params(indicators, params, isGood, targetAR=targetAR,
1104  immobilePoint=immobilePoint, Cv=Cv,
1105  badMethod=badMethod, goodMethod=goodMethod, dumb_scale=dumb_scale)
1106 
1107  prdb('Done')
1108  return (True, params)
1109 
1110 if __name__ == '__main__':
1111  from numpy import *
1112  replicanums = []
1113  for i in range(1, 8):
1114  replicanums.append(tuple(fromfile('data/replica-indices/%d.rep' % i,
1115  dtype=int, sep='\n')))
1116 
1117  prdb("replicanums: %dx%d" % (len(replicanums), len(replicanums[0])))
1118  params = tuple(fromfile('data/temperatures', sep=' '))
1119 
1120  prdb(params)
1121  indicators = compute_indicators(replicanums, subs=1, start=0)
1122  prdb("indicators: %dx%d" % (len(indicators), len(indicators[0])))
1123  prdb("Exchange rate:")
1124  prdb(array([sum(ind) / float(len(ind)) for ind in indicators]))
1125  array([sum(ind) / float(len(ind)) for ind in
1126  indicators]).tofile('xchgs', sep='\n')
1127  changed, newparams = tune_params(indicators, params, targetAR=0.25,
1128  badMethod="dumb", goodMethod="dumb", CvMethod="skip", testMethod="anova", alpha=0.05)
1129  if not changed:
1130  print("Parameter set seems optimal.")
1131  else:
1132  if not True in [abs(newparams[i + 1] - newparams[i]) < 1e-3 for i in range(len(newparams) - 1)]:
1133  array(newparams).tofile('data/temperatures', sep=' ')
1134  else:
1135  print("PROBLEM IN NEW PARAMETERSET -> not saved")
1136  print("params :", params)
1137  print("new params:", newparams)
def estimate_cv_constant
try to guess which constant cv fits best
Definition: TuneRex.py:297
def estimate_cv_interpolate
interpolate using previous values, by reversing the approximate overlap function
Definition: TuneRex.py:264
When created, estimates the heat capacity from the energies or from the indicator functions using the...
Definition: TuneRex.py:221
def mean_interp
estimate the mean of Cv between two points.
Definition: TuneRex.py:350
def estimate_cv_mbar
use MBAR to get the heat capacity
Definition: TuneRex.py:309
def get_mbar
returns the point estimate of the first derivative of the energy with respect to the replica exchange...
Definition: TuneRex.py:342
bool isnan(const T &a)
Return true if a number is NaN.
Definition: math.h:24
def get_interp
returns the point estimate of the first derivative of the energy with respect to the replica exchange...
Definition: TuneRex.py:334