IMP  2.2.1
The Integrative Modeling Platform
TuneRex.py
1 #!/usr/bin/env python
2 
3 __doc__ = """
4 This module provides a few methods to improve the efficiency of a replica-exchange simulation
5 by tuning its parameters.
6 Author: Yannick Spill
7 """
8 
9 import sys
10 #from mpmath import erf, erfinv
11 #from math import sqrt
12 #from random import randint
13 from numpy import *
14 from numpy.random import randint
15 import rpy2.robjects as robjects
16 
17 kB = 1.3806503 * 6.0221415 / 4184.0 # Boltzmann constant in kcal/mol/K
18 # here for float comparison. Floats are equal if their difference
19 EPSILON = 1e-8
20  # is smaller than EPSILON
21 debug = False
22 
23 
24 def prdb(arg):
25  if debug:
26  print arg
27 
28 # R compatibility functions
29 
30 r = robjects.r
31 robjects.globalenv["kB"] = kB
32 _rinverf = r('invErf <- function(x) {qnorm((1 + x) /2) / sqrt(2)}')
33 _rerf = r('erf <- function(x) {2 * pnorm(x * sqrt(2)) - 1}')
34 _rinvF = r('qf')
35 _rinterp = None
36 
37 
38 def erfinv(x):
39  return _rinverf(x)[0]
40 
41 
42 def erf(x):
43  return _rerf(x)[0]
44 
45 
46 def Finv(x, d1, d2):
47  return _rinvF(x, d1, d2)[0]
48 
49 
50 def spline(xy, mean, method=None):
51  """spline interpolation of (x,y) coordinates. If interpolation goes
52  negative, replace by mean value.
53  """
54  x, y = zip(*xy)
55  robjects.globalenv["x"] = robjects.FloatVector(x)
56  robjects.globalenv["y"] = robjects.FloatVector(y)
57  global _rinterp
58  #_rinterp = r.splinefun(x,y)
59  if method is None:
60  r('cvsplinenonbounded <- splinefun(x,y)')
61  else:
62  r('cvsplinenonbounded <- splinefun(x,y,method="%s")' % method)
63  _rinterp = r(
64  'cvspline <- function(x) { tmp = cvsplinenonbounded(x); if (tmp>0) {tmp} else {%f}}' %
65  mean)
66 
67  def interpolated(x):
68  global _rinterp
69  return _rinterp(x)[0]
70  return interpolated
71 
72 
73 def linear_interpolation(xy, mean):
74  """linear interpolation of (x,y) coordinates. No extrapolation possible.
75  """
76  x, y = zip(*xy)
77  robjects.globalenv["x"] = robjects.FloatVector(x)
78  robjects.globalenv["y"] = robjects.FloatVector(y)
79  global _rinterp
80  #_rinterp = r.splinefun(x,y)
81  _rinterp = r('cvspline <- approxfun(x,y)')
82 
83  def interpolated(x):
84  global _rinterp
85  return _rinterp(x)[0]
86  return interpolated
87 
88 
89 # R testing functions
90 
91 def anova(*args):
92  """perform anova using R and return statistic, p-value, between and within variance"""
93  ngroups = len(args) # number of groups
94  # nreps = len(args[0]) #number of repetitions
95  #group = r.gl(ngroups,nreps)
96  reps = r.rep(0, len(args[0]))
97  weight = robjects.FloatVector(args[0])
98  for i in xrange(1, len(args)):
99  reps += r.rep(i, len(args[i]))
100  weight += robjects.FloatVector(args[i])
101  group = r.factor(reps)
102  robjects.globalenv["weight"] = weight
103  robjects.globalenv["group"] = group
104  lm = r.lm("weight ~ group")
105  aov = r.anova(lm)
106  prdb(aov)
107  # F statistic, p-value, between and within variance
108  anova_result = {'fstat': aov[3][0],
109  'pval': aov[4][0],
110  'between': aov[2][0],
111  'within': aov[2][1],
112  'nsteps': [len(i) for i in args],
113  'nreps': ngroups,
114  'test': 'anova'} # nreps: number of replicas
115  return anova_result
116 
117 
118 def kruskal(*args):
119  """perform kruskal-wallis rank test"""
120  ngroups = len(args)
121  #nreps = len(args[0])
122  #group = r.gl(ngroups,nreps)
123  reps = r.rep(0, len(args[0]))
124  weight = robjects.FloatVector(args[0])
125  for i in xrange(1, len(args)):
126  reps += r.rep(i, len(args[i]))
127  weight += robjects.FloatVector(args[i])
128  group = r.factor(reps)
129  aov = r('kruskal.test')(group, weight)
130  prdb(aov)
131  kruskal_result = {'fstat': aov[0][0],
132  'pval': aov[2][0],
133  'nsteps': [len(i) for i in args],
134  'nreps': ngroups,
135  'test': 'kruskal'}
136  return kruskal_result # F statistic and p-value
137 
138 
139 def ttest(obs, target):
140  """perform a one-sample two-sided t-test on obs against target mean"""
141  test = r('t.test')(robjects.IntVector(obs), mu=target)
142  return test[0][0], test[2][0] # stat and p-value
143 
144 
145 def binom(obs, target):
146  """perform an exact binomial test on the mean of obs against target"""
147  success = sum(obs)
148  trials = len(obs)
149  test = r('binom.test')(success, trials, p=target)
150  return test[0][0], test[2][0] # stat and p-value
151 
152 
153 def bartlett(*args):
154  """perform bartlett's test on the equality of variances of the observations"""
155  ngroups = len(args)
156  nreps = len(args[0])
157  group = r.gl(ngroups, nreps)
158  weight = robjects.IntVector(args[0])
159  for i in args[1:]:
160  weight += robjects.IntVector(i)
161  robjects.globalenv["weight"] = weight
162  robjects.globalenv["group"] = group
163  var = r('bartlett.test')(weight, group)
164  return var[0][0], var[2][0] # statistic and p-value
165 
166 
167 def fligner(*args):
168  """perform Fligner-Killeen non-parametric test of the variance equality"""
169  ngroups = len(args)
170  nreps = len(args[0])
171  group = r.gl(ngroups, nreps)
172  weight = robjects.IntVector(args[0])
173  for i in args[1:]:
174  weight += robjects.IntVector(i)
175  robjects.globalenv["weight"] = weight
176  robjects.globalenv["group"] = group
177  var = r('fligner.test')(weight, group)
178  return var[0][0], var[2][0] # statistic and p-value
179 
180 
181 def power_test(ar, power=0.8, alpha=0.05):
182  """perform an anova power test and return
183  - the power of the test with this input data
184  - the number of points that would be needed to achieve a default power of 0.8
185  ar: the ouput of anova()
186  """
187  result = r('power.anova.test')(groups=ar['nreps'], n=min(ar['nsteps']),
188  between=ar['between'], within=ar['within'], sig=alpha)
189  prdb('the power of this anova was: %.3f' % result[5][0])
190  result = r('power.anova.test')(groups=ar['nreps'],
191  between=ar['between'], within=ar['within'], sig=alpha, pow=power)
192  prdb(
193  'To have a power of %.3f, there should be at least %d exchange attemps.' %
194  (power, result[1][0]))
195  return
196 
197 
198 def minimum_n(ar, alpha=0.05):
199  """This routine tries to return an estimate of the additional number of exchange trials that
200  could lead to a positive result of the anova (e.g. the average ARs are not the same). It is
201  still very crude. It also assumes that a one-way anova was made.
202  ar: the output of anova()
203  alpha: type I error
204  """
205  nreps = ar['nreps']
206  nsteps = ar['nsteps']
207  try:
208  nsteps = min(nsteps)
209  except:
210  pass
211  fstat = ar['fstat']
212  return (
213  nsteps *
214  (sqrt(Finv(1 - alpha, nreps - 1, nreps * (nsteps - 1)) / fstat) - 1)
215  )
216 
217 
218 # Heat capacity class
220 
221  """When created, estimates the heat capacity from the energies or from the
222  indicator functions using the specified method. Two methods are then
223  available to guess the heat capacity at a given parameter value: get()
224  returns a single point, and mean() returns the linear average between two
225  points.
226  """
227 
228  def __init__(self, params, energies=None, indicators=None,
229  method="constant", temps=None, write_cv=False):
230 
231  kB = 1.3806503 * 6.0221415 / 4184.0 # Boltzmann constant in kcal/mol/K
232  self.__initialized = False
233  self.__cv = []
234  self.method = method
235 
236  if method == "interpolate":
237  self.estimate_cv_interpolate(params, indicators)
238  self.get = self.get_interp
239  self.mean = self.mean_interp
240  elif method == "constant":
241  self.estimate_cv_constant(params, indicators)
242  self.get = lambda p: self.__cv
243  self.mean = lambda p1, p2: self.__cv
244  self.__cvfun = self.get
245  r('cvspline <- function(x) {%f}' % self.__cv)
246  elif method == "mbar":
247  self.estimate_cv_mbar(params, energies, temps)
248  self.get = self.get_mbar
249  self.mean = self.mean_mbar
250  else:
251  raise NotImplementedError(method)
252 
253  self.__initialized = True
254 
255  # write the heat capacity to a file
256  if write_cv:
257  fl = open('cv', 'w')
258  fl.write("".join(["%f %f\n" % (x, self.get(x))
259  for x in linspace(params[0] / 2, 2 * params[-1])]))
260  fl.close()
261 
262  def estimate_cv_interpolate(self, params, indicators):
263  """interpolate using previous values, by reversing the approximate overlap
264  function
265  """
266  if self.__initialized:
267  return
268  if len(indicators) != len(params) - 1:
269  raise ValueError(
270  "the length of indicators and params does not match!")
271  if params != tuple(sorted(params)):
272  raise NotImplementedError(
273  "unable to work on parameters that do not change monotonically")
274 
275  prdb("storing params and means")
276  self.__params = params
277  self.__pmeans = [(params[i] + params[i + 1]) / 2.
278  for i in xrange(len(params) - 1)]
279  prdb("computing __cv")
280  for i, ind in enumerate(indicators):
281  mean = sum(ind) / float(len(ind))
282  Y2 = 2 * kB * erfinv(1 - mean) ** 2
283  p1 = params[i]
284  p2 = params[i + 1]
285  self.__cv.append(
286  (self.__pmeans[i], (p1 ** 2 + p2 ** 2) * float(Y2) / (p2 - p1) ** 2))
287  prdb(self.__params)
288  prdb(self.__cv)
289  self.__cvmean = sum([i[1] for i in self.__cv]) / float(len(self.__cv))
290  if self.__cvmean < 0:
291  raise ValueError("Cv mean is negative!")
292  self.__cvfun = spline(self.__cv, self.__cvmean)
293  return
294 
295  def estimate_cv_constant(self, params, indicators):
296  """try to guess which constant cv fits best"""
297  if self.__initialized:
298  return
299  self.estimate_cv_interpolate(params, indicators)
300  self.__cv = self.__cvmean
301  return
302 
303  def needs_init(self):
304  if not self.__initialized:
305  raise RuntimeError("Class was not initialized correctly!")
306 
307  def estimate_cv_mbar(params, energies, temps):
308  "use MBAR to get the heat capacity"
309  raise NotImplementedError(method)
310  if self.__initialized:
311  return
312 
313  def _isinbounds(self, p, params):
314  """returns True if p is within parms, else false. the params list must be sorted ascendingly."""
315  if p < params[0] - EPSILON or p > params[-1] + EPSILON:
316  #prdb("Warning: value %f is outside of bounds, extrapolating." % p)
317  return False
318  else:
319  return True
320 
321  def _interpolate(self, xval, xlist):
322  """return interpolation of Cv at point xval, and return the average instead
323  if this value is negative.
324  """
325  self._isinbounds(xval, xlist)
326  val = self.__cvfun(xval)
327  if val > 0:
328  return val
329  else:
330  return self.__cvmean
331 
332  def get_interp(self, param):
333  """returns the point estimate of the first derivative of the energy with
334  respect to the replica exchange parameter (usually T or q).
335  This version assumes that the means of cv are given.
336  """
337  self.needs_init()
338  return self._interpolate(param, self.__pmeans)
339 
340  def get_mbar(self, param):
341  """returns the point estimate of the first derivative of the energy with
342  respect to the replica exchange parameter (usually T or q).
343  This version assumes that the values of cv are given.
344  """
345  self.needs_init()
346  return self._interpolate(param, self.__params)
347 
348  def mean_interp(self, pa, pb):
349  """estimate the mean of Cv between two points. Here the means were
350  stored previously
351  """
352  self.needs_init()
353  return self._interpolate((pa + pb) / 2., self.__pmeans)
354 
355  def mean_mbar(self, pa, pb):
356  self.needs_init()
357  return (self.get_mbar(pb) + self.get_mbar(pa)) / 2.
358 
359 # Parameter updating methods
360 
361 
362 def update_good_dumb(newp, oldp, *args, **kwargs):
363  """Here the old parameters are oldp[0] and oldp[1], and the starting point
364  is newp[0]. We should modify newp[1] so that the AR in the following cycle
365  is equal to the targetAR.
366  In the "dumb" method, the Cv and targetAR keywords are ignored.
367  Here the newp[1] parameter is modified because prior changes have set
368  newp[0] to a different value than oldp[0]. Thus, we should move newp[1] by
369  minimizing the effect on the AR since it is supposedly equal to targetAR. In
370  this simple method, the parameter is just translated.
371  """
372  prdb(
373  "newp[0] has moved (%.3f -> %.3f), adjusting the position of newp[1]" %
374  (oldp[0], newp[0]))
375  return oldp[1] - (oldp[0] - newp[0])
376 
377 
378 def update_bad_dumb(newp, oldp, ind, targetAR=0.4, scale=0.1, **kwargs):
379  """Here the old parameters are oldp[0] and oldp[1], and the starting point
380  is newp[0]. We should modify newp[1] so that the AR in the following cycle
381  is equal to the targetAR.
382  In the "dumb" method, the Cv keyword is ignored. Here the newp[1]
383  parameter is modified to follow the possible translation of newp[0] by
384  calling update_good_dumb, and then newp[1] is added or substracted scale% of
385  (oldp[1] - oldp[0]) to adjust to targetAR.
386  """
387 
388  if newp[0] != oldp[0]:
389  newp[1] = update_good_dumb(newp, oldp)
390  if targetAR > sum(ind) / float(len(ind)):
391  prdb("""target AR is higher than expected, decreasing newp[1]""")
392  newp[1] -= scale * (oldp[1] - oldp[0])
393  else:
394  prdb("""target AR is lower than expected, increasing newp[1]""")
395  newp[1] += scale * (oldp[1] - oldp[0])
396  return newp[1]
397 
398 
399 def update_any_cv_step(newp, oldp, ind, targetAR=0.4, Cv=None, **kwargs):
400  """here we use the average AR formula of two gaussians to get newp[1] as a
401  function of newp[1], knowing the targetAR and estimating the Cv. If targetAR
402  is negative, consider that mean(ind) equals the target AR and skip any
403  calculation in the case that oldp[0] equals newp[0].
404  step: suppose the heat capacity is stepwise constant, i.e. use the heat
405  capacity at position newp[0] as an estimate of the mean of the heat capacity
406  between newp[0] and newp[1]. This does not require any self-consistent loop.
407  """
408 
409  global kB
410 
411  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
412  return oldp[1]
413  if targetAR < 0:
414  targetAR = sum(ind) / float(len(ind))
415  cv = Cv.get(newp[0])
416  Y = sqrt(2 * kB) * float(erfinv(1 - targetAR))
417  if Y ** 2 >= cv:
418  raise ValueError("""targetAR too small for this approximate method, use
419  the full self-consistent method instead.""")
420  return newp[0] * (cv + Y * sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2)
421 
422 
423 def update_any_cv_sc(newp, oldp, ind, targetAR=0.4, Cv=None,
424  tol=1e-6, maxiter=10000):
425  """self-consistent solver version"""
426 
427  global kB
428 
429  if abs(oldp[0] - newp[0]) < EPSILON and targetAR < 0:
430  return oldp[1]
431  if targetAR < 0:
432  targetAR = sum(ind) / float(len(ind))
433  cv = Cv.get(newp[0])
434  Y = sqrt(2 * kB) * float(erfinv(1 - targetAR))
435  if Y ** 2 >= cv:
436  raise ValueError("""targetAR too small for this approximate method, use
437  the full self-consistent method instead.""")
438  targetp = newp[0] * (cv + Y * sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2)
439  for i in xrange(maxiter):
440  cv = Cv.mean(newp[0], targetp)
441  (oldtargetp, targetp) = (
442  targetp, newp[0] * (cv + Y * sqrt(2 * cv - Y ** 2)) / (cv - Y ** 2))
443  if abs(targetp - oldtargetp) <= tol:
444  break
445  if isnan(targetp):
446  if Y ** 2 >= cv:
447  raise ValueError("""targetAR too small for this approximate method, use
448  the full self-consistent method instead.""")
449  else:
450  raise ValueError("""something unexpected happened""")
451  if i == maxiter - 1:
452  prdb("""Warning: unable to converge the self-consistent after %d
453  iterations and a tolerance of %f. Change the method or decrease the
454  tolerance!""" % (maxiter, tol))
455  prdb("converged after %d iterations and a tolerance of %f for x=%f" %
456  (i, tol, oldp[1]))
457  return targetp
458 
459 
460 def update_any_cv_scfull(newp, oldp, ind, targetAR=0.4, Cv=None,
461  tol=1e-6, maxiter=10000):
462  """self-consistent solver version, on the exact average AR equation"""
463 
464  # create helper functions and overlap function
465  #_ru21 = r('u21 <- function(t1,t2) { (t2-t1)*(cvspline(t2)-cvspline(t1))/2. }')
466  _ru21 = r(
467  'u21 <- function(t1,t2) { integrate(Vectorize(cvspline),t1,t2)$value }')
468  _rb21 = r('b21 <- function(t1,t2) { 1./(kB*t2) - 1./(kB*t1) }')
469  _rsigma2 = r('sigma2 <- function(t) {cvspline(t)*kB*t**2}')
470  _rovboltz = r('ovboltz <- function(t1,t2) {\
471  1/2*( 1-erf(\
472  u21(t1,t2)/sqrt(2*(sigma2(t1)+sigma2(t2))))\
473  + exp(b21(t1,t2)*(u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2))/2))\
474  * (1+erf((u21(t1,t2)+b21(t1,t2)*(sigma2(t1)+sigma2(t2)))/(sqrt(2*(sigma2(t1)+sigma2(t2))))))\
475  )}')
476  _rrootfn = r(
477  'rootfn <- function(t2) {ovboltz(%f,t2)-%f}' %
478  (newp[0], targetAR))
479 
480  # find upper bound for estimation, raise an error if cv is negative
481  if oldp[1] > oldp[0]:
482  tmp = newp[0] * 1.05
483  else:
484  tmp = newp[0] * 0.95
485  nloops = 0
486  while _rrootfn(tmp)[0] >= 0:
487  nloops += 1
488  tmp += (oldp[1] - oldp[0])
489  if Cv.get(tmp) < 0:
490  raise RuntimeError("heat capacity goes negative")
491  if nloops > maxiter:
492  raise RuntimeError('could not find zero of function!')
493 
494  # find root
495  _runiroot = r(
496  'uniroot(rootfn,c(%f,%f),f.lower = %f, f.upper = %f, tol = %f, maxiter = %d)' % (newp[0], tmp,
497  1 - targetAR, -targetAR, tol, maxiter))
498  prdb(
499  "self-consistent solver converged after %s iterations and an estimated precision of %s " %
500  (_runiroot[2][0], _runiroot[3][0]))
501  prdb(
502  ["root:",
503  _runiroot[0][0],
504  "overlap:",
505  _rovboltz(newp[0],
506  _runiroot[0][0])[0]])
507  return _runiroot[0][0]
508 
509 
510 def update_any_cv_nr(newp, oldp, ind, targetAR=0.4, Cv=None, **kwargs):
511  """newton-raphson solver version"""
512 
513  # use nlm
514  raise NotImplementedError
515 
516 # Testing methods
517 
518 
519 def are_equal_to_targetAR(
520  indicators,
521  targetAR=0.4,
522  alpha=0.05,
523  method="binom"):
524  """here, all indicators have same average, we want to know if it is equal to
525  targetAR
526  """
527 
528  # calculate sample mean deviation of each indicator function from targetAR
529  deviations = sorted([(abs(sum(ind) / float(len(ind)) - targetAR), ind)
530  for pos, ind in enumerate(indicators)])
531  deviant = deviations[-1]
532 
533  # perform t-test
534  if method == "ttest":
535  #from statlib.stats import ttest_1samp as ttest
536  ttest = ttest
537  elif method == "binom":
538  ttest = binom
539  else:
540  raise NotImplementedError
541 
542  try:
543  test, pval = ttest(deviant[1], targetAR)
544  except:
545  if abs(targetAR - sum(deviant[1]) / len(deviant[1])) > EPSILON:
546  pval = 0
547  else:
548  pval = 1
549  if pval < alpha:
550  return False
551  else:
552  return True
553 
554 
555 def are_stationnary(indicators, alpha=0.05, method="anova"):
556  """test on the stationarity of the observations (block analysis). Done so by
557  launching an anova on the difference between the two halves of each observations.
558  """
559 
560  if method == "kruskal":
561  test = kruskal
562  else:
563  test = anova
564 
565  tmp = array(indicators)
566  blocklen = len(indicators[0]) / 2
567  block = tmp[:, :blocklen] - tmp[:, blocklen:2 * blocklen]
568  if test(*block)['pval'] < alpha:
569  return False
570  else:
571  return True
572 
573 
574 def are_equal(indicators, targetAR=0.4, alpha=0.05,
575  method="anova", varMethod="skip", power=0.8):
576  """Perform a one-way ANOVA or kruskal-wallis test on the indicators set,
577  and return True if one cannot exclude with risk alpha that the indicators
578  AR are different (i.e. True = all means are equal). Also performs a test
579  on the variance (they must be equal).
580  """
581 
582  if min(targetAR, 1 - targetAR) * len(indicators[0]) <= 5 and \
583  (varMethod == "bartlett" or method == "anova"):
584  prdb("Warning: normal approximation to the binomial does not hold!")
585 
586  # test the variances
587  if varMethod == "skip":
588  pass
589  else:
590  if varMethod == "bartlett":
591  pval = bartlett(*indicators)[1]
592  elif varMethod == "fligner":
593  pval = killeen(*indicators)[1]
594  else:
595  raise NotImplementedError(
596  "variance testing method unknown: %s" %
597  varMethod)
598  if pval < alpha:
599  prdb("Warning: performing mean test with unequal variances.")
600 
601  if method == "kruskal":
602  test = kruskal
603  else:
604  test = anova
605 
606  tr = test(*indicators)
607  tr['alpha'] = alpha
608  # p-value < alpha => H0 rejected => result == False
609  tr['result'] = tr['pval'] >= alpha
610 
611  # if tr['test'] == 'anova':
612  # try:
613  # power_test(tr, power=power, alpha=alpha)
614  # except:
615  # prdb("power test failed")
616  # pass
617 
618  return tr
619 
620 
621 def find_good_ARs(indicators, targetAR=0.4, alpha=0.05, method="binom"):
622  """perform one-sample t-tests on each of the data sets, and return
623  a tuple of bool of size N-1, False if AR i is not equal to targetAR
624  at risk alpha.
625  """
626 
627  # calculate sample means of each indicator function
628  means = sorted([(sum(ind) / float(len(ind)), pos, ind)
629  for pos, ind in enumerate(indicators)])
630 
631  # perform t-test
632  if method == "ttest":
633  #from statlib.stats import ttest_1samp as ttest
634  ttest = ttest
635  elif method == "binom":
636  ttest = binom
637  else:
638  raise NotImplementedError
639 
640  isGoodTuple = []
641  # start from the lowest means and stop when they are ok
642  prdb("starting left")
643  for (i, (mean, pos, ind)) in enumerate(means):
644  prdb(
645  "performing t-test on couple %d having average AR %f, position %d" %
646  (pos, mean, i))
647  try:
648  test, pval = ttest(ind, targetAR)
649  except:
650  if abs(targetAR - mean) > EPSILON:
651  pval = 0
652  else:
653  pval = 1
654  if pval < alpha:
655  # means are different
656  isGoodTuple.append((pos, False))
657  else:
658  goodstart = i
659  break
660  # then start from the highest means
661  prdb("starting right")
662  for (i, (mean, pos, ind)) in enumerate(reversed(means)):
663  prdb("performing t-test on couple %d having average AR %f, position %d"
664  % (pos, mean, len(means) - 1 - i))
665  if ttest(ind, targetAR)[1] < alpha:
666  # means are different
667  isGoodTuple.append((pos, False))
668  else:
669  goodstop = len(means) - 1 - i
670  break
671 
672  # limiting cases: all different
673  if len(isGoodTuple) > len(indicators):
674  return tuple([False] * len(indicators))
675  # all equal
676  elif len(isGoodTuple) == 0:
677  return tuple([True] * len(indicators))
678  # intermediate
679  else:
680  isGoodTuple.extend([(means[i][1], True) for i in
681  xrange(goodstart, goodstop + 1)])
682  isGoodTuple.sort()
683  return tuple([tup[1] for tup in isGoodTuple])
684 
685 # Trebst, Katzgraber, Nadler and Hansmann optimum flux stuff
686 
687 
688 def mean_first_passage_times(
689  replicanums_ori,
690  subs=1,
691  start=0,
692  use_avgAR=False):
693  """compute mean first passage times as suggested in
694  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
695 
696  use_avgAR : if a list of average ARs is given computes everything from
697  average AR; if it is False, compute by direct counting of events.
698 
699  returns:
700  If use_avgAR == False:
701  tau0, tauN, chose_N, times0, timesN
702  else:
703  tau0, tauN, None, None, None
704  tau0[i]: average time to go from replica 0 to replica i
705  tauN[i]: average time to go from replica N to replica i
706  times0 and timesN are the corresponding lists of single events.
707 
708  """
709 
710  from numpy import array, zeros
711  from pprint import pprint, pformat
712  replicanums = array(replicanums_ori)[:, start::subs]
713  N = len(replicanums)
714  tauN = [0] * N
715  tau0 = [0] * N
716 
717  if use_avgAR:
718  tau0[0] = 0.0
719  tauN[-1] = 0.0
720  for state in xrange(1, N):
721  tau0[state] = tau0[state - 1] + \
722  state / (float(use_avgAR[state - 1]))
723  for state in reversed(xrange(1, N)):
724  tauN[state - 1] = tauN[state] \
725  + (N - (state - 1)) / (float(use_avgAR[state - 1]))
726 
727  return tau0, tauN, None, None, None
728 
729  else:
730  #prdb('not using average AR')
731  from itertools import izip
732  # the algorithm looks for replicas that start at the lowest temp, and
733  # records the farthest state it went to before returning to zero. Once
734  # back it increments the counter of all concerned replicas. Similar
735  # procedure if starting from N.
736  store0 = zeros((N, N), dtype=bool)
737  last0 = [0 for i in xrange(N)]
738  already0 = [False for i in xrange(N)]
739  times0 = [[] for i in xrange(N)]
740  storeN = zeros((N, N), dtype=bool)
741  lastN = [0 for i in xrange(N)]
742  alreadyN = [False for i in xrange(N)]
743  timesN = [[] for i in xrange(N)]
744 
745  #prdb('looping over replicanums')
746  for time, frame in enumerate(izip(*replicanums)):
747  # case of the replica in state 0
748  if not already0[frame[0]]:
749  last0[frame[0]] = time
750  store0[frame[0], :] = True
751  already0[frame[0]] = True
752  # case of the replica in state N
753  if not alreadyN[frame[-1]]:
754  lastN[frame[-1]] = time
755  storeN[frame[-1], :] = True
756  alreadyN[frame[-1]] = True
757  # set already flags to False when in state 1 or N-1
758  already0[frame[1]] = False
759  alreadyN[frame[-2]] = False
760  # loop over all states
761  for state, rep in enumerate(frame):
762  if store0[rep, state]:
763  # reached a state for the first time since 0
764  store0[rep, state] = False
765  # store time since this replica left state 0
766  times0[state].append(time - last0[rep])
767  if storeN[rep, state]:
768  # reached a state for the first time since N
769  storeN[rep, state] = False
770  # store time since this replica left state N
771  timesN[state].append(time - lastN[rep])
772  #prdb([replicanums.shape, len(storeN), len(last0)])
773  times = [[] for i in xrange(N)]
774  chose_N = [len(timesN[state]) > len(times0[state]) for state in
775  xrange(N)]
776  for state in xrange(N):
777  tauN[state] = sum(timesN[state]) / float(len(timesN[state]))
778  tau0[state] = sum(times0[state]) / float(len(times0[state]))
779  # prdb(len(chose_N))
780 
781  return tau0, tauN, chose_N, times0, timesN
782 
783 
784 def compute_effective_fraction(tau0, tauN, chose_N):
785  """input: tau0, tauN, chose_N
786  output: effective fraction f(T) (P_up(n)) as introduced in
787  Trebst S, Troyer M, Hansmann UHE, J Chem Phys *124* 174903 (2006).
788  formalized in
789  Nadler W, Hansmann UHE, Phys Rev E *75* 026109 (2007)
790  and whose calculation is enhanced in
791  Nadler W, Meinke J, Hansmann UHE, Phys Rev E *78* 061905 (2008)
792  the ideal value of f(n) should be 1 - n/N
793  """
794 
795  # nstar is the index of the last state where tau0 should be used.
796  N = len(tau0)
797  if chose_N is None:
798  nstar = N / 2
799  else:
800  nstar = N - sum([int(a) for a in chose_N]) - 1
801 
802  prdb("n* = %d" % nstar)
803  # compute helper functions h
804  h0 = [0] * N
805  h0[1] = tau0[1]
806  for state in xrange(2, nstar + 1):
807  h0[state] = h0[state - 1] + \
808  (tau0[state] - tau0[state - 1]) / float(state)
809 
810  hN = [0] * N
811  hN[-2] = tauN[-2]
812  for state in reversed(xrange(nstar, N - 1)):
813  hN[state] = hN[state + 1] + \
814  (tauN[state] - tauN[state + 1]) / float(N - state)
815 
816  # compute flow probabilities
817  pup = [0] * N
818  pup[0] = 1
819  for n in xrange(1, nstar + 1):
820  pup[n] = 1 - h0[n] / (h0[nstar] + hN[nstar])
821  for n in xrange(nstar + 1, N):
822  pup[n] = hN[n] / (h0[nstar] + hN[nstar])
823 
824  return pup
825 
826 
827 def spline_diffusivity(pup, params):
828  """spline interpolation of diffusivity: D = 1/(df/dT * heta)
829  """
830  from numpy import linspace
831  robjects.globalenv["hetay"] = \
832  robjects.FloatVector(linspace(0, 1, num=len(params)).tolist())
833  robjects.globalenv["hetax"] = robjects.FloatVector(params)
834  robjects.globalenv["pupx"] = robjects.FloatVector(params)
835  robjects.globalenv["pupy"] = robjects.FloatVector(pup)
836  heta = r('heta <- splinefun(hetax,hetay,method="monoH.FC")')
837  eff = r('eff <- splinefun(pupx,pupy,method="monoH.FC")')
838  diff = r('diff <- function(x) {-1/(heta(x,deriv=1)*eff(x,deriv=1))}')
839  return lambda x: diff(x)[0]
840 
841 
842 # Misc
843 def compute_indicators(replicanums, subs=1, start=0):
844  """input: replicanums : a list of N lists of size M, where N is the number of
845  states and M is the length of the simulation. Each element is an integer,
846  and corresponds to the label of a replica.
847  output: an indicator function of exchanges (size (N-1)x(M-1)), 1 if exchange and
848  0 if not.
849  """
850  def exchange(n, m):
851  if replicanums[n][m] == replicanums[n + 1][m + 1] \
852  and replicanums[n][m + 1] == replicanums[n + 1][m]:
853  return 1
854  else:
855  return 0
856 
857  indicators = []
858  for n in xrange(len(replicanums) - 1):
859  indicators.append([exchange(n, m)
860  for m in xrange(len(replicanums[n]) - 1)][start::subs])
861  return indicators
862 
863 # Main routines
864 
865 
866 def update_params_nonergodic(pup, params, write_g=False, num=False):
867 
868  from numpy import linspace
869  #g = spline(zip(pup,params),0,method='monoH.FC')
870  g = linear_interpolation(zip(pup, params), 0)
871  if write_g:
872  d = spline_diffusivity(pup, params)
873  fl = open('g', 'w')
874  fl.write("".join(["%f %f\n" % (x, g(x))
875  for x in linspace(0, 1, num=100)]))
876  fl.close()
877  fl = open('diffusivity', 'w')
878  fl.write(''.join(["%f %f\n" % (x, d(x)) for x in
879  linspace(params[0], params[-1], num=100)]))
880  fl.close()
881  fl = open('pup', 'w')
882  fl.write("".join(["%f %f\n" % (i, j) for (i, j) in zip(params, pup)]))
883  fl.close()
884 
885  if num is False:
886  newparams = [g(i) for i in reversed(linspace(0, 1, num=len(params)))]
887  else:
888  newparams = [g(i) for i in reversed(linspace(0, 1, num=num))]
889  # for numerical issues
890  newparams[0] = params[0]
891  newparams[-1] = params[-1]
892 
893  return newparams
894 
895 
896 def update_params(
897  indicators, params, isGood, targetAR=0.4, immobilePoint=1,
898  Cv=None, badMethod="dumb", goodMethod="dumb", dumb_scale=0.1):
899  """update the parameters according to the isGood tuple and using the
900  specified methods"""
901 
902  newparams = list(params) # make a copy
903 
904  if immobilePoint != 1:
905  raise NotImplementedError
906 
907  if Cv is None and (badMethod != "dumb" or goodMethod != "dumb"):
908  raise RuntimeError("""Cv needs to be estimated if using other methods
909  than 'dumb' for updating!""")
910 
911  if goodMethod == "dumb":
912  update_good = update_good_dumb
913  elif goodMethod == "step":
914  update_good = update_any_cv_step
915  elif goodMethod == "sc":
916  update_good = update_any_cv_sc
917  elif goodMethod == "scfull":
918  update_good = update_any_cv_scfull
919  elif goodMethod == "nr":
920  update_good = update_any_cv_nr
921  else:
922  raise NotImplementedError(goodMethod)
923  if badMethod == "dumb":
924  update_bad = update_bad_dumb
925  elif badMethod == "step":
926  update_bad = update_any_cv_step
927  elif badMethod == "sc":
928  update_bad = update_any_cv_sc
929  elif badMethod == "scfull":
930  update_bad = update_any_cv_scfull
931  elif badMethod == "nr":
932  update_bad = update_any_cv_nr
933  else:
934  raise NotImplementedError(badMethod)
935 
936  # scan each position starting from the immobilePoint
937  for pos in xrange(len(params) - 1):
938  if isGood[pos]:
939  newparams[pos + 1] = update_good(
940  newparams[pos:pos + 2], params[pos:pos + 2],
941  indicators[pos], targetAR=targetAR, Cv=Cv)
942  else:
943  newparams[pos + 1] = update_bad(
944  newparams[pos:pos + 2], params[pos:pos + 2],
945  indicators[pos], targetAR=targetAR, Cv=Cv, scale=dumb_scale)
946 
947  return tuple(newparams)
948 
949 
950 def tune_params_flux(replicanums, params, subs=1, start=0, alpha=0.05,
951  testMethod='anova', meanMethod='binom', use_avgAR=False,
952  power=0.8, num=False):
953  # num is here if you want to add some more temperatures. indicate total
954  # number of replicas
955 
956  # TODO: do case where one estimates all based on target AR.
957  if use_avgAR is not False:
958  raise NotImplementedError
959 
960  prdb("computing mean first passage times")
961  tau0, tauN, chose_N, times0, timesN = mean_first_passage_times(replicanums,
962  subs=subs, start=start, use_avgAR=use_avgAR)
963 
964  prdb("average round trip time: %.2f (%d+%d events)" %
965  (tau0[-1] + tauN[0], len(times0[-1]), len(timesN[0])))
966  prdb("checking if the parameterset needs to be improved")
967  N = len(replicanums)
968  if chose_N is None:
969  nstar = N / 2
970  else:
971  nstar = N - sum([int(a) for a in chose_N]) - 1
972 
973  reduced = []
974  # no need to check for times?[0] or times?[N]
975  for n in xrange(1, N - 1):
976  if n > nstar:
977  reduced.append([i * 2.0 / ((N - n) * (N - n + 1))
978  for i in timesN[n]])
979  else:
980  reduced.append([i * 2.0 / (n * (n + 1)) for i in times0[n]])
981 
982  anova_result = are_equal(reduced, alpha=alpha, method=testMethod,
983  power=power)
984  if (anova_result['result']): # TODO test if equal to targetAR
985  prdb("flux is constant, nothing to do!")
986  min_n = minimum_n(anova_result, alpha)
987  prdb('Try to rerun this test with at least %d more samples.' %
988  ceil(min_n))
989  return (False, min_n)
990 
991  # the flux is not constant so the parameters need improvement.
992  # calculate the estimate of the effective fraction
993  prdb("parameterset not optimal, computing effective fraction")
994  pup = compute_effective_fraction(tau0, tauN, chose_N)
995 
996  # improve parameterset
997  prdb("returning new parameterset")
998  params = update_params_nonergodic(pup, params, num=num)
999 
1000  return (True, params)
1001 
1002 
1003 def tune_params_ar(
1004  indicators, params, targetAR=0.4, alpha=0.05, immobilePoint=1, CvMethod="skip", badMethod="dumb", goodMethod="dumb",
1005  varMethod="skip", testMethod="anova", meanMethod="binom",
1006  energies=None, temps=None, power=0.8, dumb_scale=0.1):
1007  """Tune the replica-exchange parameters and return a new set.
1008 
1009  Arguments:
1010  indicators -- an (N-1)x(M-1) table, where entry at position (j,i)
1011  is True if replica j and j+1 performed an exchange between
1012  times i and i+1.
1013  params -- the current set of N parameters used in the simulation.
1014 
1015  Keyword arguments:
1016  targetAR -- the target AR which is wanted for the simulation
1017  (default: 0.4)
1018  alpha -- the type 1 error on the one-way ANOVA and subsequent
1019  t-tests (default: 5%)
1020  immobilePoint -- which replica should keep it's parameter fixed
1021  (default: 1st replica, e.g. 1)
1022  CvMethod -- the heat capacity estimation method (default:
1023  "skip", other options: "mbar", "spline", "constant")
1024  badMethod -- how to correct the (j+1)th parameter if the
1025  acceptance ratio between replicas j and j+1 is off the
1026  target value (default: "dumb", options: "step", "sc", "scfull", "nr")
1027  goodMethod -- how to update the value of the (j+1)th parameter
1028  in the case of a correctly exchanging couple, but if the jth
1029  parameter has been modified (default: "dumb",options: "step",
1030  "sc" self-consistent, "scfull" self-consistent using the exact equation,
1031  "nr" newton-raphson solver for the exact equation)
1032  dumb_scale -- (0.0-1.0) in the "dumb" method, scale wrong temperature
1033  intervals by this amount. (default: 0.1)
1034  testMethod -- how to test for the difference of the means,
1035  either "anova" for a one-way anova, or "kruskal" for a
1036  Kruskal-Wallis one-way non-parametric anova.
1037  meanMethod -- "ttest" for a two-sided one-sample t-test,
1038  "binom" for an exact binomial test of the probability.
1039  varMethod -- how to test for the equality of variances.
1040  "fligner" for the Fligner-Killeen non-parametric test,
1041  "bartlett" for Bartlett's test, "skip" to pass.
1042  energies -- if CvMethod is set to "mbar", the energies of each
1043  state as a function of time are used to estimate the heat capacity.
1044  temps -- the temperatures of the simulations, if estimating with "mbar".
1045 
1046  Return Value:
1047  returns a tuple: (bool, params). bool is True if params have
1048  changed, and params is the new set.
1049 
1050  """
1051 
1052  # perform ANOVA
1053  prdb("performing ANOVA")
1054  anova_result = are_equal(indicators, targetAR, alpha, method=testMethod,
1055  varMethod=varMethod, power=power)
1056  if (anova_result['result'] and
1057  are_equal_to_targetAR(indicators, targetAR, alpha, method=meanMethod)):
1058  prdb("all means are equal to target AR, nothing to do!")
1059  min_n = minimum_n(anova_result, alpha)
1060  prdb(
1061  'Try to rerun this test with at least %d more samples.' %
1062  ceil(min_n))
1063  return (False, min_n)
1064  prdb("some means are different, performing t-tests")
1065 
1066  # perform two-by-two t-tests
1067  isGood = find_good_ARs(indicators, targetAR, alpha, method=meanMethod)
1068  if not (False in isGood):
1069  prdb("""Bad luck: ANOVA says means are not identical, but t-tests
1070  can't find the bad rates...""")
1071  return (False, params)
1072  prdb(isGood)
1073 
1074  # check if data is stationnary by doing a block analysis
1075  prdb("performing stationarity test")
1076  if not are_stationnary(indicators, alpha):
1077  prdb("Warning: Some simulations are not stationary!")
1078 
1079  # interpolate the heat capacity from the data
1080  # TODO: use all previous data, not just from this run.
1081  prdb("launching Cv estimation or skipping it")
1082  if CvMethod == "skip":
1083  Cv = None
1084  elif CvMethod == "interpolate" or CvMethod == "constant":
1085  Cv = CvEstimator(params, indicators=indicators, method=CvMethod)
1086  elif CvMethod == "mbar":
1087  Cv = CvEstimator(
1088  params,
1089  energies=energies,
1090  temps=temps,
1091  method=CvMethod)
1092  else:
1093  raise NotImplementedError(CvMethod)
1094 
1095  # update parameters
1096  prdb('updating params')
1097  # update the current parameter set to match the target AR
1098  params = update_params(indicators, params, isGood, targetAR=targetAR,
1099  immobilePoint=immobilePoint, Cv=Cv,
1100  badMethod=badMethod, goodMethod=goodMethod, dumb_scale=dumb_scale)
1101 
1102  prdb('Done')
1103  return (True, params)
1104 
1105 if __name__ == '__main__':
1106  from numpy import *
1107  replicanums = []
1108  for i in xrange(1, 8):
1109  replicanums.append(tuple(fromfile('data/replica-indices/%d.rep' % i,
1110  dtype=int, sep='\n')))
1111 
1112  prdb("replicanums: %dx%d" % (len(replicanums), len(replicanums[0])))
1113  params = tuple(fromfile('data/temperatures', sep=' '))
1114 
1115  prdb(params)
1116  indicators = compute_indicators(replicanums, subs=1, start=0)
1117  prdb("indicators: %dx%d" % (len(indicators), len(indicators[0])))
1118  prdb("Exchange rate:")
1119  prdb(array([sum(ind) / float(len(ind)) for ind in indicators]))
1120  array([sum(ind) / float(len(ind)) for ind in
1121  indicators]).tofile('xchgs', sep='\n')
1122  changed, newparams = tune_params(indicators, params, targetAR=0.25,
1123  badMethod="dumb", goodMethod="dumb", CvMethod="skip", testMethod="anova", alpha=0.05)
1124  if not changed:
1125  print "Parameter set seems optimal."
1126  else:
1127  if not True in [abs(newparams[i + 1] - newparams[i]) < 1e-3 for i in xrange(len(newparams) - 1)]:
1128  array(newparams).tofile('data/temperatures', sep=' ')
1129  else:
1130  print "PROBLEM IN NEW PARAMETERSET -> not saved"
1131  print "params :", params
1132  print "new params:", newparams
def estimate_cv_constant
try to guess which constant cv fits best
Definition: TuneRex.py:295
def estimate_cv_interpolate
interpolate using previous values, by reversing the approximate overlap function
Definition: TuneRex.py:262
When created, estimates the heat capacity from the energies or from the indicator functions using the...
Definition: TuneRex.py:219
def mean_interp
estimate the mean of Cv between two points.
Definition: TuneRex.py:348
def estimate_cv_mbar
use MBAR to get the heat capacity
Definition: TuneRex.py:307
def get_mbar
returns the point estimate of the first derivative of the energy with respect to the replica exchange...
Definition: TuneRex.py:340
bool isnan(const T &a)
Return true if a number is NaN.
Definition: base/math.h:24
def get_interp
returns the point estimate of the first derivative of the energy with respect to the replica exchange...
Definition: TuneRex.py:332