IMP  2.0.1
The Integrative Modeling Platform
saxs_merge.py
1 #!/usr/bin/env python
2 
3 import sys,os
4 import optparse
5 import fractions
6 from random import sample
7 from numpy import *
8 import copy
9 from scipy.stats import t as student_t
10 import glob
11 import tempfile
12 
13 import IMP
14 import IMP.isd
15 import IMP.gsl
16 import IMP.saxs
17 
19 #fitno=0
20 
21 def subsample(idx, data, npoints):
22  """keep min(npoints,len(data)) out of data if npoints>0,
23  subsampling evenly along the idx (float) coordinate
24  """
25  #defined here because it's used in the class
26  Ntot = len(data)
27  if len(idx) != Ntot:
28  raise ValueError
29  if npoints >= Ntot or npoints <= 0:
30  return data
31  all = zip(idx,data)
32  all.sort()
33  qvals = array([i[0] for i in all])
34  newdata=[]
35  i=0
36  qmin = min(qvals)
37  qmax = max(qvals)
38  for q in linspace(qmin,qmax,num=npoints):
39  if q==qmax:
40  i=Ntot-1
41  else:
42  while qvals[i] <= q:
43  i += 1
44  i -= 1
45  newdata.append(all[i][1])
46  return newdata
47 
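# ---- Illustrative sketch (annotation, not part of the original saxs_merge.py) ----
# How subsample() can be used; the names q, data and picked are hypothetical.
# It keeps roughly npoints entries chosen evenly along the q coordinate, and
# returns the input unchanged when npoints <= 0 or npoints >= len(data).
q = [0.01*i for i in range(100)]
data = zip(q, range(100))          # any payload, paired with its q value
picked = subsample(q, data, 10)    # ~10 entries, evenly spread over q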
48 class FittingError(Exception):
49  pass
50 
51 class SAXSProfile:
52 
53  def __init__(self):
54  self.gamma = 1.0
55  self.offset = 0.0
56  self.Nreps = 10
57  self.flag_names=["q","I","err"]
58  self.flag_types=[float,float,float]
59  self.flag_dict={"q":0,"I":1,"err":2}
60  self.nflags=len(self.flag_names)
61  self.data = []
62  self.gp = None
63  self.particles = {}
64  self.intervals = {}
65  self.filename = None
66 
67  def add_data(self, input, offset=0, positive=False, err=True, scale=1):
68  """add experimental data to saxs profile.
69  offset=i means discard first i columns
70  positive=True means only keep intensities that are >0
71  err=True means keep only points that have >0 error bar
72  scale is a factor by which to multiply input I and err
73  """
74  if isinstance(input, str):
75  #read all lines
76  stuff = [i.split() for i in open(input).readlines()]
77  #keep the ones that have data
78  stuff = filter(lambda a: (len(a)>=self.nflags)
79  and not a[0].startswith('#'), stuff)
80  #drop offset, and first 3 columns must be of float type and not nan
81  stuff = map(lambda a:a[:self.nflags+offset], stuff)
82  s2=[]
83  for s in stuff:
84  try:
85  map(float,s[:3])
86  except ValueError:
87  continue
88  if 'nan' in s[:3]:
89  continue
90  s2.append(s)
91  stuff=s2
92  elif isinstance(input, list) or isinstance(input, tuple):
93  for i in input:
94  if len(i) != self.nflags + offset:
95  raise TypeError, "length of each entry should be %d" % \
96  self.nflags
97  stuff = input
98  else:
99  raise ValueError, "Unknown data type"
100  data = []
101  for a in stuff:
102  entry=[]
103  for f,z in zip(self.flag_types,a[offset:]):
104  if z is None:
105  entry.append(None)
106  else:
107  entry.append(f(z))
108  #keep positive data
109  if positive and entry[1] <= 0:
110  continue
111  #keep weighted points
112  if err and entry[2] <= 0:
113  continue
114  #multiply I and err by scale
115  entry[1] *= scale
116  entry[2] *= scale
117  data.append(entry)
118  self.data += copy.deepcopy(data)
119  self.data.sort(key=lambda a:a[0])
120 
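 # ---- Usage sketch (annotation, not part of the original file) ----
 # Building a profile from a 3-column (q, I, err) text file; the file name
 # 'profile.dat' is hypothetical and assumed to exist:
 #   p = SAXSProfile()
 #   p.add_data('profile.dat', positive=True)  # drop rows with I <= 0 or err <= 0
 #   p.add_data('profile.dat', scale=0.5)      # add the same file with I, err halved
 #   p.get_raw_data()                          # all points, merged and sorted by q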
121  def get_raw_data(self, colwise=False):
122  """return original q,I,err values, sorted"""
123  if colwise:
124  data = self.get_raw_data(colwise=False)
125  retval = {}
126  retval[self.flag_names[0]] = [d[0] for d in data]
127  retval[self.flag_names[1]] = [d[1] for d in data]
128  retval[self.flag_names[2]] = [d[2] for d in data]
129  return retval
130  return [i[:3] for i in self.data]
131 
132  def get_data(self, *args, **kwargs):
133  """get data, rescaled by self.gamma and self.offset
134  Returns a list of tuples, each of which is organized as:
135  Column 1 : unique id (increasing)
136  Column 2 : q (increasing)
137  Column 3 : I
138  Column 4 : err
139  Columns 5+ : flags, as indicated by self.get_flag_names()
140  Arguments / Options:
141  qmin : specify the starting q value
142  qmax : specify the ending q value
 143  if they are not specified, the data's min and max are used.
 144  Either both must be given or neither.
145  Options:
146  filter : string or list of strings corresponding to a boolean flag
147  name which should be used to filter the data before returning it.
148  Data is returned only if all of the provided flags are True.
149  colwise : instead of returning a list of tuples for each data point,
 150  return a dictionary with flag names as keys.
151  maxpoints : only return at most N points, subsampling if necessary.
152  """
153  if 'filter' in kwargs:
154  filt = kwargs.pop('filter')
155  else:
156  filt = None
157  if filt:
158  if isinstance(filt, str):
159  flagnos = [self.flag_dict[filt]]
160  else:
161  flagnos = [self.flag_dict[name] for name in filt]
162  else:
163  flagnos = []
164  maxpoints=-1
165  if 'maxpoints' in kwargs:
166  maxpoints=kwargs.pop('maxpoints')
167  if 'colwise' in kwargs:
168  colwise = kwargs.pop('colwise')
169  else:
170  colwise = False
171  if len(args) == 0 and len(kwargs) == 0:
172  qmin = self.data[0][0]
173  qmax = self.data[-1][0]
174  elif (len(args) == 2) ^ (len(kwargs) == 2):
175  if len(args) == 2:
176  try:
177  qmin=float(args[0])
178  qmax=float(args[1])
179  except:
180  raise TypeError, "arguments have incorrect type"
181  if qmin > qmax:
182  raise ValueError, "qmin > qmax !"
183  else:
184  if set(kwargs.keys()) != set(['qmax','qmin']):
185  raise TypeError, "incorrect keyword argument(s)"
186  qmin=kwargs['qmin']
187  qmax=kwargs['qmax']
188  else:
189  raise TypeError, "provide zero or two arguments (qmin, qmax)"
190  if colwise:
191  retval=dict.fromkeys(self.get_flag_names()+('id',))
192  for i in retval:
193  retval[i] = []
194  else:
195  retval = []
196  for i,d in enumerate(self.data):
197  if d[0] < qmin:
198  continue
199  if d[0] > qmax:
200  break
201  if len(flagnos) > 0 and False in [d[k] is True for k in flagnos]:
202  continue
203  cd = copy.copy(d)
204  cd[1]=self.gamma*(cd[1]+self.offset)
205  cd[2]=self.gamma*cd[2]
206  if colwise:
207  retval['id'].append(i)
208  for k in self.get_flag_names():
209  retval[k].append(cd[self.flag_dict[k]])
210  else:
211  retval.append(tuple([i,]+cd))
212  if colwise:
213  keys,values = zip(*retval.items())
214  values = zip(*self._subsample(retval['q'], zip(*values), maxpoints))
215  values = map(list, values)
216  return dict(zip(keys,values))
217  else:
218  return self._subsample([i[1] for i in retval], retval,maxpoints)
219 
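 # ---- Usage sketch (annotation, not part of the original file) ----
 # The two calling conventions of get_data(); p is a hypothetical profile and
 # 'agood' a hypothetical boolean flag previously added with new_flag():
 #   rows = p.get_data(0.01, 0.3)               # list of (id, q, I, err, ...) tuples
 #   cols = p.get_data(colwise=True, maxpoints=100)
 #   # cols is a dict: cols['q'], cols['I'], cols['err'], plus 'id' and one
 #   # list per extra flag
 #   good = p.get_data(colwise=True, filter='agood')  # only rows where agood is True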
220  def get_gamma(self):
221  return self.gamma
222 
223  def get_offset(self):
224  return self.offset
225 
226  def set_gamma(self, gamma):
227  self.gamma = gamma
228 
229  def set_offset(self, offset):
230  self.offset = offset
231 
232  def new_flag(self, name, tp):
233  """add extra flag to all data points. Initialized to None"""
234  self.flag_names.append(name)
235  self.flag_types.append(tp)
236  self.flag_dict[name] = self.nflags
237  self.nflags += 1
238  for d in self.data:
239  d.append(None)
240  self.intervals[name]=[]
241 
242  def set_flag(self, id, name, value):
243  "set the flag of a data point"
244  flagno = self.flag_dict[name]
245  flagtype = self.flag_types[flagno]
246  try:
247  converted = flagtype(value)
248  except:
249  raise TypeError, "unable to cast given value to %s" % flagtype
250  self.data[id][flagno]=converted
251 
252  def get_flag(self, val, flag, default = '---'):
253  """get the flag of a point in the profile
254  call with (id, flag) to get the flag of data point id
255  call with (qvalue, flag) to get the flag of an unobserved data point.
256  Uses the flag intervals for this.
257  If the flag is not defined for that point, return default
258  """
259  if isinstance(val, int):
260  try:
261  retval = self.data[val][self.flag_dict[flag]]
262  if retval is None:
263  raise ValueError
264  except ValueError:
265  return default
266  return retval
267  elif not isinstance(val, float):
268  raise ValueError, "first argument must be int or float"
269  if not flag in self.get_flag_names():
270  raise KeyError, "flag %s does not exist!" % flag
271  for qmin,qmax,flagval in self.get_flag_intervals(flag):
272  if qmin <= val <= qmax:
273  return flagval
274  return default
275 
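 # ---- Usage sketch (annotation, not part of the original file) ----
 # The flag API: per-point flags for observed data, and flag intervals for
 # arbitrary q values (see set_flag_interval below); p is hypothetical:
 #   p.new_flag('agood', bool)                 # every existing point gets None
 #   p.set_flag(0, 'agood', True)              # flag the data point with id 0
 #   p.get_flag(0, 'agood')                    # -> True
 #   p.set_flag_interval('agood', 0.0, 0.1, True)
 #   p.get_flag(0.05, 'agood')                 # float argument: interval lookup -> True
 #   p.get_flag(0.5, 'agood')                  # no interval covers it -> '---'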
276  def set_interpolant(self, gp, particles, functions, mean, model, bayes,
277  hessian=None):
278  """store a class that gives interpolated values,
279  usually an instance of the GaussianProcessInterpolation
280  """
281  if hessian is not None:
282  self._memoized_hessian = hessian
283  self.recompute_hessian = False
284  else:
285  self.recompute_hessian = True
286  if not hasattr(gp, "get_posterior_mean"):
287  raise TypeError, "interpolant.get_posterior_mean does not exist"
288  if not hasattr(gp, "get_posterior_covariance"):
289  raise TypeError, "interpolant.get_posterior_covariance "\
290  "does not exist"
291  if not isinstance(particles, dict):
292  raise TypeError, "second argument should be dict"
293  for p in particles.values():
295  raise TypeError, "particles should all be ISD Nuisances"
296  if not set(functions.keys()) == set(['mean','covariance']):
297  raise TypeError, "expected mean and covariance functions"
298  if not isinstance(model, IMP.Model):
299  raise TypeError, "fourth argument is expected to be an IMP.Model()"
300  self.gp = gp
301  self.particles = particles
302  self.functions=functions
303  self.model = model
304  self.bayes = bayes
305  self.mean = mean
306 
307  def set_flag_interval(self, flag, qmin, qmax, value):
308  if flag not in self.flag_names:
309  raise ValueError, "unknown flag %s" % flag
310  intervals = []
311  for imin,imax,ival in self.intervals[flag]:
312  #add non-overlapping intervals
313  if imax < qmin or imin > qmax:
314  intervals.append((imin,imax,ival))
315  else:
316  if value == ival:
317  #merge intervals with same value
318  #no need to start over since all other intervals
319  #don't overlap
320  qmin = min(qmin,imin)
321  qmax = max(qmax,imax)
322  else:
323  #discard new portion that disagrees
324  if imin < qmin and imax < qmax:
325  #discard qmin -> imax
326  imax = qmin
327  intervals.append((imin,imax,ival))
328  elif imin > qmin and imax > qmax:
329  #discard imin -> qmax
330  imin = qmax
331  intervals.append((imin,imax,ival))
332  elif imin < qmin and imax > qmax:
333  #split interval in two
334  intervals.append((imin,qmin,ival))
335  intervals.append((qmax,imax,ival))
336  else:
337  #just discard old interval
338  pass
339  #NoneType is accepted, but otherwise cast to target value
340  if value is None:
341  newinterval = (qmin,qmax,None)
342  else:
343  newinterval = (qmin,qmax,
344  self.flag_types[self.flag_dict[flag]](value))
345  intervals.append(newinterval)
346  #sort by ascending qmin value
347  intervals.sort(key=lambda a:a[0])
348  self.intervals[flag]=intervals
349 
350  def get_flag_intervals(self, flag):
351  """returns a list of [qmin, qmax, flag_value] lists"""
352  return self.intervals[flag]
353 
354  def _subsample(self, q, data, npoints):
355  return subsample(q, data, npoints)
356 
357  def _setup_gp(self, npoints):
358  if npoints <0:
359  return self.gp
360  q = self.gp.get_data_abscissa()
361  I = self.gp.get_data_mean()
362  S = self.gp.get_data_variance()
363  err = [S[i][i] for i in xrange(len(S))]
364  q,I,err = zip(*self._subsample(q,zip(q,I,err),npoints))
 365  gp = IMP.isd.GaussianProcessInterpolation(q, I, err,
 366  self.get_Nreps(), self.functions['mean'],
367  self.functions['covariance'], self.particles['sigma2'])
368  return gp
369 
370  def _get_hessian(self, gp):
371  self.particles['A'].set_nuisance_is_optimized(True)
372  if self.mean == 'Flat':
373  self.particles['G'].set_lower(0)
374  self.particles['G'].set_nuisance(0)
375  self.particles['G'].set_upper(0)
376  self.particles['G'].set_nuisance_is_optimized(False)
377  self.particles['Rg'].set_nuisance_is_optimized(False)
378  self.particles['d'].set_nuisance_is_optimized(False)
379  self.particles['s'].set_nuisance_is_optimized(False)
380  else:
381  self.particles['G'].set_nuisance_is_optimized(True)
382  self.particles['Rg'].set_nuisance_is_optimized(True)
383  if self.mean == 'Simple':
384  self.particles['d'].set_nuisance_is_optimized(False)
385  else:
386  self.particles['d'].set_nuisance_is_optimized(True)
387  if self.mean == 'Generalized':
388  self.particles['s'].set_nuisance_is_optimized(False)
389  else:
390  self.particles['s'].set_nuisance_is_optimized(True)
391  self.particles['tau'].set_nuisance_is_optimized(True)
392  self.particles['lambda'].set_nuisance_is_optimized(True)
393  self.particles['sigma2'].set_nuisance_is_optimized(True)
394  if self.recompute_hessian is False:
395  return self._memoized_hessian
 396  gpr = IMP.isd.GaussianProcessInterpolationRestraint(gp)
 397  self.model.add_restraint(gpr)
398  Hessian = matrix(gpr.get_hessian(False))
399  if linalg.det(Hessian) == 0:
400  print Hessian,[(i,j.get_nuisance())
401  for i,j in self.particles.items()
402  if j.get_nuisance_is_optimized()]
403  self.model.remove_restraint(gpr)
404  #fl=open('avcov','w')
405  #fl.write("#q I err err*lapl lapl det (Hessian= %f )\n"
406  # % linalg.det(Hessian))
407  #fl.close()
408  self._memoized_hessian = Hessian
409  self.recompute_hessian = False
410  return Hessian
411 
412  def get_mean(self, **kwargs):
413  """returns (q,I,err,mean,...) tuples for num points (default 200)
414  ranging from qmin to qmax (whole data range by default)
415  possible keyword arguments:
 416  num qmin qmax qvalues filter colwise average verbose
417  mean is the mean function of the gaussian process.
418  the length of the returned tuple is the same as the number of flags plus
419  one for the mean function.
420  You can also give specific values via the qvalues keyword argument.
421  see get_data for details on the filter argument. If num is used with
422  filter, the function returns at most num elements and possibly none.
423  average : if 0, just output maximum posterior estimate (faster).
424  if <0, compute hessians and average out parameter vector.
425  if >0, same as <0 but only take N data points instead of all
426  verbose : if True, display progress meter in %
427  """
428  rem = set(kwargs.keys()) \
429  - set(['qvalues','num','qmin','qmax','filter',
430  'colwise','average','verbose'])
431  if len(rem) > 0:
432  raise TypeError, "Unknown keyword(s): %s" % list(rem)
433  #
434  average=0
435  if 'average' in kwargs and kwargs['average'] != 0:
436  average=kwargs['average']
437  #compute hessian, which doesn't depend on q
438  gp = self._setup_gp(average)
439  Hessian = self._get_hessian(gp)
440  #
441  if 'qvalues' in kwargs:
442  qvals = kwargs['qvalues']
443  else:
444  if 'num' in kwargs:
445  num = kwargs['num']
446  else:
447  num = 200
448  if 'qmin' in kwargs:
449  qmin = kwargs['qmin']
450  else:
451  qmin = self.data[0][0]
452  if 'qmax' in kwargs:
453  qmax = kwargs['qmax']
454  else:
455  qmax = self.data[-1][0]
456  qvals = linspace(qmin,qmax,num)
457  #
458  verbose=False
459  if 'verbose' in kwargs:
460  verbose = kwargs['verbose']
461  #
462  if 'colwise' in kwargs:
463  colwise = kwargs['colwise']
464  else:
465  colwise = False
466  #
467  if 'filter' in kwargs:
468  filt = kwargs.pop('filter')
469  else:
470  filt = None
471  if filt:
472  if isinstance(filt, str):
473  flagnos = [self.flag_dict[filt]]
474  else:
475  flagnos = [self.flag_dict[name] for name in filt]
476  else:
477  flagnos = []
478  flagnames = list(self.get_flag_names())
479  flagnames = flagnames[:3]+['mean']+flagnames[3:]
480  flags = []
481  gamma = self.get_gamma()
482  offset = self.get_offset()
483  if colwise:
484  retval=dict.fromkeys(flagnames)
485  for i in retval:
486  retval[i] = []
487  else:
488  retval=[]
489  flagdict = copy.deepcopy(self.flag_dict)
490  for k in flagdict:
491  if flagdict[k] >= 3:
492  flagdict[k] += 1
493  flagdict['mean']=3
494  for i,q in enumerate(qvals):
495  if verbose:
496  percent=str(int(round((i+1)/float(len(qvals))*100)))
497  sys.stdout.write("%s%%" % (percent))
498  sys.stdout.flush()
499  sys.stdout.write('\b'*(len(percent)+1))
500  if len(flagnames) > 4:
501  flags = map(self.get_flag, [q]*len(flagnames[4:]),
502  flagnames[4:])
503  if False in [flags[i-3] for i in flagnos]:
504  continue
505  thing = [q,gamma*(self.gp.get_posterior_mean([q])+offset),
506  gamma*sqrt(self.gp.get_posterior_covariance([q],[q])),
507  gamma*(self.functions['mean']([q])[0]+offset) ]
508  if average != 0:
509  #compute hessian of -log(cov)
510  cov = thing[2]**2
511  Hder = matrix(gp.get_posterior_covariance_derivative(
512  [q],False)).T
513  Hcov = matrix(gp.get_posterior_covariance_hessian(
514  [q],False))
515  Hlcov = 1/cov*(-Hcov + 1/cov * Hder*Hder.T)
516  #fl=open('avcov','a')
517  #fl.write(" ".join(["%f" % i for i in thing[:3]]))
518  lapl = linalg.det(matrix(eye(Hessian.shape[0]))
519  + linalg.solve(Hessian,Hlcov))**(-0.5)
520  #fl.write(" %f" % sqrt(cov*lapl))
521  #fl.write(" %f" % lapl)
522  #fl.write(" %f\n" % linalg.det(Hlcov))
523  #fl.close()
524  #print thing[0],thing[1],cov,cov*lapl,lapl
525  thing[2] = sqrt(cov*lapl)
526  if flags:
527  thing.extend(flags)
528  if colwise:
529  for flag in flagnames:
530  retval[flag].append(thing[flagdict[flag]])
531  else:
532  retval.append(tuple(thing))
533  return retval
534 
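 # ---- Usage sketch (annotation, not part of the original file) ----
 # get_mean() needs set_interpolant() to have been called first; p is a
 # hypothetical, fitted profile:
 #   rows = p.get_mean(num=100)                     # 100 (q, I, err, mean, ...) tuples
 #   rows = p.get_mean(qvalues=[0.01, 0.02, 0.05])  # evaluate at given q values
 #   rows = p.get_mean(num=50, average=-1)          # average over parameter uncertainty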
535  def get_flag_names(self):
536  return tuple(self.flag_names)
537 
538  def get_params(self):
539  retval={}
540  for k,v in self.particles.items():
541  retval[k]=v.get_nuisance()
542  return retval
543 
544  def set_Nreps(self,nreps):
545  self.Nreps = nreps
546 
547  def get_Nreps(self):
548  return self.Nreps
549 
550  def write_data(self, filename, prefix='data_', suffix='', sep=' ',
551  bool_to_int=False, dir='./', header=True, flags=None,
552  float_fmt='%-15.14G', *args, **kwargs):
553  fl=open(os.path.join(dir,prefix+filename+suffix),'w')
554  allflags = self.get_flag_names()
555  if flags == None:
556  flags=allflags
557  if header:
558  fl.write('#')
559  for i,name in enumerate(flags):
560  fl.write("%d:%s%s" % (i+1,name,sep))
561  fl.write('\n')
562  for d in self.get_data(*args, **kwargs):
563  for flagname,ent in zip(allflags,d[1:]):
564  if flagname not in flags:
565  continue
566  if bool_to_int and isinstance(ent, bool):
567  fl.write('%d' % ent)
568  elif isinstance(ent, float):
569  fl.write(float_fmt % ent)
570  else:
571  fl.write('%s' % ent)
572  fl.write(sep)
573  fl.write('\n')
574  fl.close()
575 
576  def write_mean(self, filename, prefix='mean_', suffix='', sep=' ',
577  bool_to_int=False, dir='./', header=True, flags=None,
578  float_fmt="%-15.14G", *args, **kwargs):
579  """write gaussian process interpolation to a file named
580  dir+prefix+filename+suffix.
581  bool_to_int : convert booleans to 0 or 1 in output.
582  header : if True, first line starts with # and is a header
583  flags : specify which flags to write, if None write all.
584  float_fmt : the format for outputting floats
585  """
586  fl=open(os.path.join(dir,prefix+filename+suffix),'w')
587  allflags = list(self.get_flag_names())
588  allflags = allflags[:3]+['mean']+allflags[3:]
589  if flags == None:
590  flags=allflags
591  if header:
592  fl.write('#')
593  for i,name in enumerate(flags):
594  fl.write("%d:%s%s" % (i+1,name,sep))
595  fl.write('\n')
596  for d in self.get_mean(*args, **kwargs):
597  for flagname,ent in zip(allflags,d):
598  if flagname not in flags:
599  continue
600  if bool_to_int and isinstance(ent, bool):
601  fl.write('%d' % ent)
602  elif isinstance(ent, float):
603  fl.write(float_fmt % ent)
604  else:
605  fl.write('%s' % ent)
606  fl.write(sep)
607  fl.write('\n')
608  fl.close()
609 
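 # ---- Usage sketch (annotation, not part of the original file) ----
 # Both writers share the same column/flag machinery; the file name below is
 # hypothetical:
 #   p.write_data('lyz.dat', header=True)            # -> ./data_lyz.dat (raw points)
 #   p.write_mean('lyz.dat', num=200, header=True)   # -> ./mean_lyz.dat (GP mean curve)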
610  def set_filename(self, fname):
611  self.filename = fname
612 
613  def get_filename(self):
614  return self.filename
615 
616 class _RawEpilogFormatter(optparse.IndentedHelpFormatter):
617  """Output the epilog help text as-is"""
618  def format_epilog(self, epilog):
619  return epilog
620 
621 def parse_args():
622  parser = IMP.OptionParser(
623  description="Perform a statistical merge of the given SAXS curves. "
624  "file is a 3-column file that contains SAXS data. "
625  "To specify the number of repetitions of this experiment, "
626  "use the syntax file.txt=20 indicating that data "
627  "in file.txt is an average of 20 experiments. Default is "
628  "10. Note that in the case of different number of "
629  "repetitions, the minimum is taken for the final "
630  "fitting step (Step 5).",
631  formatter=_RawEpilogFormatter(),
632  usage="%prog [options] file [file ...]",
633  version='%prog 0.3',
634  epilog="""Output file legend:\n
635 Cleanup
636  agood (bool) : True if SNR is high enough
637  apvalue (float) : p-value of the student t test
638 Rescaling
639  cgood (bool) : True if data point is both valid (wrt SNR) and in the
640  validity domain of the gamma reference curve (the last
641  curve)
642 Classification
643  drefnum (int) : number of the reference profile for this point
644  drefname (string) : associated filename
645  dgood (bool) : True if this point is compatible with the reference and
646  False otherwise. Undefined if 'agood' is False for that
647  point.
 648  dselfref (bool) : True if this curve was its own reference, in which
649  case dgood is also True
650  dpvalue (float) : p-value for the classification test
651 Merging
652  eorigin (int) : profile index from which this point originates
653  eoriname (string) : associated filename
654  eextrapol (bool) : True if mean function is being extrapolated.
655 """)
656  parser.add_option('--verbose', '-v', action="count",
657  dest='verbose', default=0,
658  help="Increase verbosity. Can be repeated up to 3 times "
659  "for more output.")
660  #general
661  group = optparse.OptionGroup(parser, title="general")
662  parser.add_option_group(group)
663  group.add_option('--mergename', help="filename suffix for output "
664  "(default is merged.dat)", default='merged.dat', metavar='SUFFIX')
665  group.add_option('--sumname', metavar='NAME', default='summary.txt',
666  help="File to which the merge summary will be written."
667  " Default is summary.txt")
668  group.add_option('--destdir', default="./", metavar='DIR',
669  help="Destination folder in which files will be written")
670  group.add_option('--header', default=False, action='store_true',
671  help="First line of output files is a header (default False)")
672  group.add_option('--outlevel', default='normal', help="Set the output "
673  "level, 'sparse' is for q,I,err columns only, 'normal' adds "
674  "eorigin, eoriname and eextrapol (default), and 'full' outputs "
675  "all flags.", type="choice", choices=['normal','sparse','full'])
676  group.add_option('--allfiles', default=False, action='store_true',
677  help="Output data files for parsed input files as well (default "
678  "is only to output merge and summary files).")
679  group.add_option('--stop', type='choice', help="stop after the given step, "
680  "one of cleanup, fitting, rescaling, classification, merging "
681  "(default: merging)", choices=["cleanup","fitting","rescaling",
682  "classification", "merging"], default="merging")
683  group.add_option('--postpone_cleanup', action='store_true', default=False,
684  help="Cleanup step comes after rescaling step (default is False)")
685  group.add_option('--npoints', type="int", default=200, metavar="NUM",
686  help="Number of points to output for the mean function. Negative "
687  "values signify to take the same q values as the first data file. "
688  "In that case extrapolation flags are ignored, and extrapolation "
689  "is performed when the data file's q values fall outside of the "
690  "range of accepted data points. Default is NUM=200 points.")
691  group.add_option('--lambdamin', type="float", default=0.005, metavar="MIN",
692  help="lower bound for lambda parameter in steps 2 and 5")
693  #cleanup
694  group = optparse.OptionGroup(parser, title="Cleanup (Step 1)",
695  description="Discard or keep SAXS curves' "
696  "points based on their SNR. Points with an error"
697  " of zero are discarded as well")
698  parser.add_option_group(group)
699 
700  group.add_option('--aalpha', help='type I error (default 1e-4)',
701  type="float", default=1e-4, metavar='ALPHA')
702  group.add_option('--acutoff', help='when a value after CUT is discarded,'
703  ' the rest of the curve is discarded as well (default is 0.1)',
704  type="float", default=0.1, metavar='CUT')
705  #fitting
706  group = optparse.OptionGroup(parser, title="Fitting (Step 2)",
707  description="Estimate the mean function and the noise level "
708  "of each SAXS curve.")
709  parser.add_option_group(group)
710  group.add_option('--bd', help='Initial value for d (default 4)',
711  type="float", default=4., metavar='D')
712  group.add_option('--bs', help='Initial value for s (default 0)',
713  type="float", default=0., metavar='S')
714  group.add_option('--bmean', help='Defines the most complex mean '
715  'function that will be tried during model comparison.'
716  " One of Flat (the offset parameter A is optimized), "
717  "Simple (optimizes A, G and Rg), "
718  "Generalized (optimizes G, Rg and d), "
719  "Full (default, optimizes G, Rg, d and s) "
720  "If --bnocomp is given, will try to fit only with this model",
721  type="choice", default="Full",
722  choices=['Flat','Simple','Generalized','Full'])
723  group.add_option('--bnocomp', help='Don\'t perform model comparison. '
724  "Default: perform it.",
725  action='store_true', default=False)
726  group.add_option('--baverage', help='Average over all possible parameters '
727  'instead of just taking the most probable set of parameters. '
728  'Default is not to perform the averaging.',
729  action='store_true', default=False)
730  group.add_option('--blimit_fitting', metavar='NUM', default=-1, type='int',
 731  help='To save resources, set the maximum number of points used in '
732  'the fitting step. Dataset will be subsampled if it is bigger than'
733  ' NUM. If NUM=-1 (default), all points will be used.')
734  group.add_option('--blimit_hessian', metavar='NUM', default=-1, type='int',
735  help='To save resources, set the maximum number of points used in'
736  ' the Hessian calculation (model comparison, options --baverage, '
737  'and --berror ). Dataset will be subsampled if it is bigger than'
 738  ' NUM. If NUM=-1 (default), all points will be used.')
739  group.add_option('--berror', action='store_true', default=False,
740  help="Compute error bars on all parameters even in case where "
741  "model comparison was disabled. Involves the computation of a "
742  "Hessian. Default: no extra computation.")
743  #rescaling
744  group = optparse.OptionGroup(parser, title="Rescaling (Step 3)",
745  description="Find the most probable scaling factor of all "
746  "curves wrt the first curve.")
747  parser.add_option_group(group)
748  group.add_option('--creference', default='last', help="Define which "
749  "input curve the other curves will be rescaled to. Options are "
750  "first or last (default is last)", type="choice",
751  choices=['first','last'])
752  group.add_option('--cmodel', default='normal',
753  help="Which rescaling model to use to calculate gamma. "
754  "'normal-offset' for a normal model with offset, "
755  "'normal' (default) for a normal model with zero offset "
756  "and 'lognormal' for a lognormal model.",
757  choices=['normal','normal-offset','lognormal'], type='choice')
758  group.add_option('--cnpoints', type="int", default=200, metavar="NUM",
759  help="Number of points to use to compute gamma (default 200)")
760  #classification
761  group = optparse.OptionGroup(parser, title="Classification (Step 4)",
762  description="Classify the mean curves by comparing them using "
763  "a two-sided two-sample student t test")
764  parser.add_option_group(group)
765  group.add_option('--dalpha', help='type I error (default 0.05)',
766  type="float", default=0.05, metavar='ALPHA')
767  #merging
768  group = optparse.OptionGroup(parser, title="Merging (Step 5)",
769  description="Collect compatible data and produce best estimate "
770  "of mean function")
771  parser.add_option_group(group)
772  group.add_option('--eextrapolate', metavar="NUM", help='Extrapolate '
773  "NUM percent outside of the curve's bounds. Example: if NUM=50 "
774  "and the highest acceptable data point is at q=0.3, the mean will "
775  "be estimated up to q=0.45. Default is 0 (just extrapolate at low "
776  "angle).", type="int", default=0)
777  group.add_option('--enoextrapolate', action='store_true', default=False,
778  help="Don't extrapolate at all, even at low angle (default False)")
779  group.add_option('--emean', help='Which most complex mean function '
780  'to try for model comparison.'
781  ' See --bmean. Default is Full', type="choice",
782  default="Full", choices=['Simple','Generalized','Full','Flat'])
783  group.add_option('--enocomp', help='Don\'t perform model comparison, '
784  'see --bnocomp. Default is not to perform it.',
785  action='store_true', default=False)
786  group.add_option('--eaverage', help="Average over all possible parameters "
787  "instead of just taking the most probable set of parameters. "
788  "Default is not to perform the averaging.",
789  action='store_true', default=False)
790  group.add_option('--elimit_fitting', metavar='NUM', default=-1, type='int',
 791  help='To save resources, set the maximum number of points used in '
792  'the fitting step. Dataset will be subsampled if it is bigger than'
793  ' NUM. If NUM=-1 (default), all points will be used.')
794  group.add_option('--elimit_hessian', metavar='NUM', default=-1, type='int',
795  help='To save resources, set the maximum number of points used in'
796  ' the Hessian calculation (model comparison, options --eaverage, '
797  'and --eerror ). Dataset will be subsampled if it is bigger than'
 798  ' NUM. If NUM=-1 (default), all points will be used.')
799  group.add_option('--eerror', action='store_true', default=False,
800  help="Compute error bars on all parameters even in case where "
801  "model comparison was disabled. Involves the computation of a "
802  "Hessian. Default: no extra computation.")
803  (args, files) = parser.parse_args()
804  if len(files) == 0:
805  parser.error("No files specified")
806  return (args, files)
807 
808 def parse_filenames(fnames, defaultvalue=10):
809  files = []
810  Nreps = []
811  for fname in fnames:
812  if '=' in fname:
813  tokens = fname.split('=')
814  paths = glob.glob(tokens[0])
815  if len(paths) == 0:
816  sys.exit("File %s not found!" % fname)
817  files.extend(paths)
818  Nreps.append(int(tokens[1]))
819  else:
820  paths = glob.glob(fname)
821  if len(paths) == 0:
822  sys.exit("File %s not found!" % fname)
823  files.extend(paths)
824  Nreps.append(defaultvalue)
825  return files, Nreps
826 
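# ---- Illustrative sketch (annotation, not part of the original file) ----
# parse_filenames() pairs each matched file with its number of repetitions,
# falling back to defaultvalue (10) when no '=N' suffix is given. The file
# names below are hypothetical and assumed to exist:
#   files, Nreps = parse_filenames(['lyz_10mg.dat=20', 'lyz_5mg.dat'])
#   # files -> ['lyz_10mg.dat', 'lyz_5mg.dat'],  Nreps -> [20, 10]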
827 def get_global_scaling_factor(file):
828  """get an order of magnitude for 100/I(0)"""
829  p=SAXSProfile()
830  p.add_data(file, positive=True)
831  data=p.get_raw_data()
832  n=len(data)
833  #take the first 50 points or 10% whichever comes first
834  m=min(int(0.1*n),50)
835  return 100./mean([i[1] for i in data[:m]])
836 
837 def create_profile(file, nreps, scale=1):
838  p=SAXSProfile()
839  p.add_data(file, positive=True, scale=scale)
840  p.set_Nreps(nreps)
841  p.set_filename(file)
842  return p
843 
844 def ttest_one(mu,s,n):
845  """one-sample right-tailed t-test against 0
846  Returns: pval, t, nu
847  """
848  v = s**2/float(n)
849  t = mu/sqrt(v)
850  nu = n-1
851  pval = (1-student_t.cdf(t,nu))
852  return pval, t, nu
853 
854 def ttest_two(mu1,s1,n1,mu2,s2,n2):
855  """Welch's t-test
856  two-sample two-sided t-test
857  Returns: pval, t, nu
858  """
859  v1 = s1**2/float(n1)
860  v2 = s2**2/float(n2)
861  t = (mu1-mu2)/sqrt(v1+v2)
862  nu = (v1+v2)**2/(v1**2/(n1-1)+v2**2/(n2-1))
863  pval = 2*(1-student_t.cdf(abs(t),nu))
864  return pval, t, nu
865 
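# ---- Illustrative sketch (annotation, not part of the original file) ----
# Toy calls of the two tests above; the numbers are made up. ttest_two uses
# Welch's approximation for the degrees of freedom nu, so the two sample
# sizes need not be equal.
pval1, t1, nu1 = ttest_one(2.0, 1.0, 10)                 # is the mean > 0 ?
pval2, t2, nu2 = ttest_two(1.0, 0.5, 10, 1.2, 0.4, 12)   # do the two means differ ?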
866 def setup_particles(initvals):
867  """ initvals : dict of initial values for the parameters
868  returns: model, dict(*particles), dict(mean, covariance)
869  """
870 
871  model = IMP.Model()
872  #mean function
873  G=IMP.isd.Scale.setup_particle(IMP.Particle(model,"G"),initvals['G'])
874  #model.add_restraint(IMP.isd.JeffreysRestraint(G))
875  Rg=IMP.isd.Scale.setup_particle(IMP.Particle(model,"Rg"),initvals['Rg'])
876  d=IMP.isd.Scale.setup_particle(IMP.Particle(model,"d"),initvals['d'])
877  s=IMP.isd.Scale.setup_particle(IMP.Particle(model,"s"),initvals['s'])
878  A=IMP.isd.Nuisance.setup_particle(IMP.Particle(model,"A"),initvals['A'])
 879  m = IMP.isd.GeneralizedGuinierPorodFunction(G,Rg,d,s,A)
 880  #covariance function
881  tau=IMP.isd.Scale.setup_particle(IMP.Particle(model,"tau"),initvals['tau'])
882  lam=IMP.isd.Scale.setup_particle(IMP.Particle(model,"lambda"),
883  initvals['lambda'])
884  w = IMP.isd.Covariance1DFunction(tau,lam,2.0)
885  #sigma
886  sigma=IMP.isd.Scale.setup_particle(IMP.Particle(model,"sigma2"),
887  initvals['sigma2'])
888  #prior on scales
889  model.add_score_state(IMP.core.SingletonConstraint(
890  IMP.isd.NuisanceRangeModifier(),None,G,"Constrain_G"))
891  model.add_score_state(IMP.core.SingletonConstraint(
892  IMP.isd.NuisanceRangeModifier(),None,Rg,"Constrain_Rg"))
893  model.add_score_state(IMP.core.SingletonConstraint(
894  IMP.isd.NuisanceRangeModifier(),None,d,"Constrain_d"))
895  model.add_score_state(IMP.core.SingletonConstraint(
896  IMP.isd.NuisanceRangeModifier(),None,A,"Constrain_A"))
897  model.add_score_state(IMP.core.SingletonConstraint(
898  IMP.isd.NuisanceRangeModifier(),None,s,"Constrain_s"))
899  model.add_score_state(IMP.core.SingletonConstraint(
900  IMP.isd.NuisanceRangeModifier(),None,lam,"Constrain_lambda"))
901  #model.add_restraint(IMP.isd.JeffreysRestraint(tau))
902  model.add_score_state(IMP.core.SingletonConstraint(
903  IMP.isd.NuisanceRangeModifier(),None,tau,"Constrain_tau"))
904  model.add_restraint(IMP.isd.JeffreysRestraint(sigma))
905  model.add_score_state(IMP.core.SingletonConstraint(
906  IMP.isd.NuisanceRangeModifier(),None,sigma,"Constrain_sigma2"))
907  #set lower and upper bounds for Rg, d and s
908  Rg.set_lower(0.1)
909  d.set_lower(0.1)
910  d.set_upper(8.)
911  s.set_upper(3.)
912  s.set_upper(d)
913  #set lower bounds for cov particles
914  tau.set_lower(0.01)
915  sigma.set_lower(0.01)
916  sigma.set_upper(1000.)
917  #
918  particles = {}
919  particles['G'] = G
920  particles['Rg'] = Rg
921  particles['d'] = d
922  particles['s'] = s
923  particles['A'] = A
924  particles['tau'] = tau
925  particles['lambda'] = lam
926  particles['sigma2'] = sigma
927  functions = {}
928  functions['mean'] = m
929  functions['covariance'] = w
930  return model, particles, functions
931 
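# ---- Illustrative sketch (annotation, not part of the original file) ----
# setup_particles() expects one starting value per nuisance it creates; the
# numbers below are hypothetical:
#   initvals = {'G': 100.0, 'Rg': 30.0, 'd': 4.0, 's': 0.0, 'A': 0.0,
#               'tau': 1.0, 'lambda': 0.1, 'sigma2': 10.0}
#   model, particles, functions = setup_particles(initvals)
#   # particles maps the same keys to IMP.isd nuisance particles;
#   # functions['mean'] and functions['covariance'] are the GP mean and kernel.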
932 def do_conjugategradients(model,nsteps):
 933  cg = IMP.gsl.ConjugateGradients(model)
 934  cg.optimize(nsteps)
935 
936 def do_quasinewton(model,nsteps):
937  qn=IMP.gsl.QuasiNewton(model)
938  #fl=open('qn.txt','w')
939  #write_header(fl)
940  #write_params(fl,model,a,b,tau,lam,sigma)
941  #for i in xrange(nsteps):
942  # if print_steps >0 and i % print_steps == 0 :
943  # print i,
944  # sys.stdout.flush()
945  # IMP.base.set_log_level(IMP.base.TERSE)
946  # qn.optimize(1)
947  # IMP.base.set_log_level(0)
948  # write_params(fl,model,a,b,tau,lam,sigma)
949  qn.optimize(nsteps)
950 
951 def setup_process(data,initvals, subs, maxpoints=-1):
952  model,particles,functions = setup_particles(initvals)
953  q = [ [i] for i in data['q'][::subs]]
954  I = data['I'][::subs]
955  err = data['err'][::subs]
956  if maxpoints >0:
957  q,I,err = zip(*subsample(q,zip(q,I,err),maxpoints))
 958  gp = IMP.isd.GaussianProcessInterpolation(q, I, err,
 959  data['N'], functions['mean'], functions['covariance'],
960  particles['sigma2'])
961  return model, particles, functions, gp
962 
963 def get_initial_Rg(data):
964  fl, name = tempfile.mkstemp(text=True)
965  handle = os.fdopen(fl,'w')
966  for dat in zip(data['q'],data['I'],data['err']):
967  handle.write(' '.join(['%G' % i for i in dat]))
968  handle.write('\n')
969  handle.close()
970  profile = IMP.saxs.Profile(name)
971  Rg = profile.radius_of_gyration()
972  os.unlink(name)
973  return Rg
974 
975 def set_defaults_mean(data, particles, mean_function):
976  #set initial value for G to be a rough estimate of I(0)
 977  #take the first 10% of the points, at most 50
978  npoints=min(len(data['q'])/10,50)
979  Ivals = [data['I'][i] for i in xrange(len(data['q']))][:npoints]
980  particles['G'].set_nuisance(mean(Ivals))
981  particles['G'].set_lower(min(Ivals))
982  particles['G'].set_upper(2*max(Ivals))
983  #optimize mean particles only
984  particles['A'].set_nuisance_is_optimized(True)
985  if mean_function == 'Flat':
986  particles['G'].set_lower(0)
987  particles['G'].set_nuisance(0)
988  particles['G'].set_upper(0)
989  particles['G'].set_nuisance_is_optimized(False)
990  particles['Rg'].set_nuisance_is_optimized(False)
991  particles['d'].set_nuisance_is_optimized(False)
992  particles['s'].set_nuisance_is_optimized(False)
993  else:
994  particles['G'].set_nuisance_is_optimized(True)
995  particles['Rg'].set_nuisance_is_optimized(True)
996  particles['Rg'].set_nuisance(get_initial_Rg(data))
997  if mean_function == 'Simple':
998  particles['d'].set_nuisance_is_optimized(False)
999  else:
1000  particles['d'].set_nuisance_is_optimized(True)
1001  if mean_function == 'Generalized':
1002  particles['s'].set_nuisance_is_optimized(False)
1003  else:
1004  particles['s'].set_nuisance_is_optimized(True)
1005  particles['tau'].set_nuisance_is_optimized(False)
1006  particles['lambda'].set_nuisance_is_optimized(False)
1007  particles['sigma2'].set_nuisance_is_optimized(False)
1008 
1009 def find_fit_mean(data, initvals, verbose, mean_function):
1010  model, particles, functions, gp = \
1011  setup_process(data, initvals, 1)
1012  IMP.base.set_log_level(IMP.base.TERSE)
 1013  gpr = IMP.isd.GaussianProcessInterpolationRestraint(gp)
 1014  model.add_restraint(gpr)
1015  set_defaults_mean(data, particles, mean_function)
1016  #set tau to 0 allows for faster estimate (diagonal covariance matrix)
1017  #no need to store its value since it will be reset later on
1018  taulow = None
1019  if particles['tau'].has_lower():
1020  taulow = particles['tau'].get_lower()
1021  particles['tau'].set_nuisance(0.)
1022  particles['tau'].set_lower(0.)
1023  #particles['A'].set_nuisance(-2.88)
1024  #particles['G'].set_nuisance(5485.31)
1025  #particles['Rg'].set_nuisance(42.09)
1026  #particles['d'].set_nuisance(2.92)
1027  #for q in linspace(0.001,0.3):
1028  # print "cmp",q,gp.get_posterior_mean([q])
1029  #IMP.base.set_log_level(IMP.base.TERSE)
1030  #for q in linspace(0.001,0.3):
1031  # print "cmp",q,gp.get_posterior_mean([q])
1032  #sys.exit()
1033  #optimize mean particles for 3x100 steps
1034  #when getting initvals, calling model.evaluate() ensures constraints are met
1035  #print particles['A'].get_nuisance()
1036  #print [(a,b.get_nuisance()) for a,b in particles.items()]
1037  #dg=IMP.get_dependency_graph(model)
1038  #IMP.base.show_graphviz(dg)
1039  #print IMP.get_required_score_states(gpr, [], dg,
1040  # IMP.get_dependency_graph_vertex_index(dg))
1041  #sys.exit()
1042  rgopt = particles['Rg'].get_nuisance_is_optimized()
1043  particles['Rg'].set_nuisance_is_optimized(False)
1044  for i in xrange(2):
1045  model.evaluate(False)
1046  do_conjugategradients(model,100)
1047  particles['Rg'].set_nuisance_is_optimized(rgopt)
1048  for i in xrange(2):
1049  #print particles['A'].get_nuisance()
1050  #print [(k,v.get_nuisance()) for (k,v) in particles.items()]
1051  model.evaluate(False)
1052  do_conjugategradients(model,100)
1053  for i in xrange(2):
1054  #print particles['A'].get_nuisance()
1055  #print [(k,v.get_nuisance()) for (k,v) in particles.items()]
1056  model.evaluate(False)
1057  do_quasinewton(model,100)
1058  #set_defaults_mean(data, particles, 'Full')
1059  #for A in linspace(-100,100):
1060  # particles['A'].set_nuisance(A)
1061  # print "pyy",A,model.evaluate(False)
1062  #set_defaults_mean(data, particles, 'Generalized')
1063  #for A in linspace(-100,100):
1064  # particles['A'].set_nuisance(A)
1065  # print "pyy",A,model.evaluate(False)
1066  #sys.exit()
1067  #for q in linspace(0.001,0.3):
1068  # print "cmp",q,gp.get_posterior_mean([q])
1069  #for i in particles:
1070  # print i,particles[i].get_nuisance()
1071  #sys.exit()
1072  #reset tau bounds
1073  for i in particles.values():
1074  if i.get_nuisance_is_optimized():
1075  if isnan(i.get_nuisance()):
1076  raise FittingError
1077  if taulow:
1078  particles['tau'].set_lower(taulow)
1079  model.evaluate(False)
1080  newvals = dict([(k,v.get_nuisance())
1081  for (k,v) in particles.iteritems()])
1082  return newvals
1083 
1084 def set_defaults_covariance(data, particles, functions, args):
1085  #set sigma to be equal to 10x the noise level, assuming mean has been fitted
1086  chisquares = [functions['mean']([q])[0] for q in data['q']]
1087  chisquares = [(data['I'][i]-chisquares[i])**2 + data['err'][i]**2
1088  for i in xrange(len(data['q']))]
1089  noiselevel = (sum(chisquares)/len(chisquares))
1090  errorlevel = (sum([data['err'][i]**2 for i in xrange(len(data['q']))])
1091  /len(data['err']))
1092  sigmaval = 10*noiselevel/errorlevel
1093  particles['sigma2'].set_nuisance(sigmaval)
1094  #set tau to be equal to this value
1095  particles['tau'].set_nuisance(sigmaval)
1096 
1097 def find_fit_lambda(data, initvals, args, verbose):
1098  #optimize covariance, first on lambda by subsampling the curve
1099  #print "lambda opt"
1100  meandist = mean(array(data['q'][1:])-array(data['q'][:-1]))
1101  initial=True
1102  for subs,nsteps in [(10,1000),(5,500)]:
1103  model, particles, functions, gp = \
1104  setup_process(data, initvals, subs)
1105  if initial:
1106  initial=False
1107  set_defaults_covariance(data, particles, functions, args)
1108  model.evaluate(False)
1109  #initvals = dict([(k,v.get_nuisance())
1110  # for (k,v) in particles.iteritems()])
1111  #print " ".join(["%5s=%10G" % d for d in initvals.items()])
1112  #set lambda to be bigger than the distance between points
1113  particles['lambda'].set_lower(2*meandist)
1114  particles['G'].set_nuisance_is_optimized(False)
1115  particles['Rg'].set_nuisance_is_optimized(False)
1116  particles['d'].set_nuisance_is_optimized(False)
1117  particles['s'].set_nuisance_is_optimized(False)
1118  particles['A'].set_nuisance_is_optimized(False)
1119  particles['tau'].set_nuisance_is_optimized(False)
1120  particles['lambda'].set_nuisance_is_optimized(True)
1121  particles['sigma2'].set_nuisance_is_optimized(False)
 1122  gpr = IMP.isd.GaussianProcessInterpolationRestraint(gp)
 1123  model.add_restraint(gpr)
1124  do_quasinewton(model,nsteps)
1125  model.evaluate(False)
1126  initvals = dict([(k,v.get_nuisance())
1127  for (k,v) in particles.iteritems()])
1128  #print " ".join(["%5s=%10G" % d for d in initvals.items()])
1129  return initvals
1130 
1131 def find_fit_covariance(data, initvals, args, verbose):
1132  #then on all covariance particles
1133  #print "cov opt"
1134  model, particles, functions, gp = \
1135  setup_process(data, initvals, 1, args)
1136  set_defaults_covariance(data, particles, functions, args)
1137  meandist = mean(array(data['q'][1:])-array(data['q'][:-1]))
1138  particles['lambda'].set_lower(meandist)
1139  #optimize all covariance particles for 3x10 steps and all data points
1140  particles['G'].set_nuisance_is_optimized(False)
1141  particles['Rg'].set_nuisance_is_optimized(False)
1142  particles['d'].set_nuisance_is_optimized(False)
1143  particles['s'].set_nuisance_is_optimized(False)
1144  particles['A'].set_nuisance_is_optimized(False)
1145  particles['tau'].set_nuisance_is_optimized(True)
1146  particles['lambda'].set_nuisance_is_optimized(True)
1147  particles['sigma2'].set_nuisance_is_optimized(True)
 1148  gpr = IMP.isd.GaussianProcessInterpolationRestraint(gp)
 1149  model.add_restraint(gpr)
1150  for i in xrange(3):
1151  do_quasinewton(model,30)
1152  model.evaluate(False)
1153  initvals = dict([(k,v.get_nuisance()) for (k,v) in particles.iteritems()])
1154  #print " ".join(["%5s=%10G" % d for d in initvals.items()])
1155  if False:
1156  #plotrange
1157  minval=(model.evaluate(False),particles['lambda'].get_nuisance())
1158  initvals = dict([(k,v.get_nuisance()) for (k,v) in particles.iteritems()])
1159  particles['lambda'].set_lower(0.)
1160  for l in linspace(.01,10,num=100):
1161  particles['lambda'].set_nuisance(l)
1162  ene = model.evaluate(True)
 1163  deriv = particles['lambda'].get_nuisance_derivative()
1164  if minval[0] > ene:
1165  minval = (ene, l)
1166  print "plotrange",l,ene,deriv
1167  print "minimum: ene=%f lambda=%f" % minval
1168  if False:
1169  #residuals
1170  for k,v in particles.items():
1171  v.set_nuisance(initvals[k])
1172  for q,I,err in zip(data['q'],data['I'],data['err']):
1173  gpval=gp.get_posterior_mean([q])
1174  avg=functions['mean']([q])[0]
1175  print "residuals",q,I,err,gpval,avg,(gpval-avg),(I-avg)
1176  return initvals
1177 
1178 def find_fit_by_minimizing(data, initvals, verbose, lambdalow):
1179  model, particles, functions, gp = \
1180  setup_process(data, initvals, 1)
 1181  gpr = IMP.isd.GaussianProcessInterpolationRestraint(gp)
 1182  model.add_restraint(gpr)
1183  meandist = mean(array(data['q'][1:])-array(data['q'][:-1]))
1184  particles['lambda'].set_lower(max(2*meandist,lambdalow))
1185  particles['lambda'].set_upper(max(data['q'])/2.)
1186  particles['lambda'].set_nuisance(max(data['q'])/10.)
1187  lambdamin = particles['lambda'].get_lower()
1188  lambdamax = particles['lambda'].get_upper()
1189  particles['sigma2'].set_lower(1)
1190  particles['sigma2'].set_upper(1000)
1191  particles['sigma2'].set_nuisance(100.0)
1192  sigmamin = particles['sigma2'].get_lower()
1193  sigmamax = particles['sigma2'].get_upper()
1194  particles['tau'].set_lower(0.001)
1195  particles['tau'].set_nuisance(10.0)
1196  taumin = particles['tau'].get_lower()
1197  taumax = 100
1198  #print "minimizing"
1199  particles['lambda'].set_nuisance_is_optimized(True)
1200  particles['sigma2'].set_nuisance_is_optimized(False)
1201  particles['tau'].set_nuisance_is_optimized(True)
1202  minimas = []
1203  while len(minimas) < 5:
1204  initm = model.evaluate(False)
1205  initt = particles['tau'].get_nuisance()
1206  inits = particles['sigma2'].get_nuisance()
1207  initl = particles['lambda'].get_nuisance()
1208  #steepest descent
1209  sd = IMP.core.SteepestDescent(model)
1210  sd.optimize(500)
1211  #conjugate gradients
1212  cg = IMP.core.ConjugateGradients(model)
1213  cg.optimize(500)
1214  #add to minimas
1215  minimas.append((model.evaluate(False), initm,
1216  particles['tau'].get_nuisance(), initt,
1217  particles['sigma2'].get_nuisance(), inits,
1218  particles['lambda'].get_nuisance(), initl))
1219  print "--- new min"
1220  print " tau",initt, minimas[-1][2]
1221  print " sig",inits, minimas[-1][4]
1222  print " lam",initl, minimas[-1][6]
1223  print " mod",initm, minimas[-1][0]
1224  #draw new random starting position
1225  particles['tau'].set_nuisance(
1226  exp(random.uniform(log(taumin)+1,log(taumax)-1)))
1227  particles['sigma2'].set_nuisance(
1228  exp(random.uniform(log(sigmamin)+1,log(sigmamax)-1)))
1229  particles['lambda'].set_nuisance(
1230  exp(random.uniform(log(lambdamin)+1,log(lambdamax)-1)))
1231  minimas.sort(key=lambda a:a[0])
1232  print "best is ",minimas[0][0]
1233  particles['tau'].set_nuisance(minimas[0][2])
1234  particles['sigma2'].set_nuisance(minimas[0][4])
1235  particles['lambda'].set_nuisance(minimas[0][6])
1236  ene = model.evaluate(False)
1237  initvals = dict([(k,v.get_nuisance()) for (k,v) in particles.iteritems()])
1238  return initvals
1239 
1240 def find_fit_by_gridding(data, initvals, verbose, lambdalow):
1241  """use the fact that sigma2 can be factored out of the covariance matrix and
1242  drawn from a gamma distribution. Calculates a 2D grid on lambda (D1) and
1243  tau**2/sigma2 (D2) to get the minimum value.
1244  """
1245  model, particles, functions, gp = \
1246  setup_process(data, initvals, 1)
 1247  gpr = IMP.isd.GaussianProcessInterpolationRestraint(gp)
 1248  model.add_restraint(gpr)
1249  meandist = mean(array(data['q'][1:])-array(data['q'][:-1]))
1250  particles['lambda'].set_lower(max(2*meandist,lambdalow))
1251  lambdamin = particles['lambda'].get_lower()
1252  lambdamax = 100
1253  numpoints=30
1254  gridvalues = []
1255  particles['tau'].set_lower(0.)
1256  particles['sigma2'].set_lower(0.)
1257  #fl=open('grid.txt','w')
1258  particles['sigma2'].set_nuisance(100.0)
1259  particles['sigma2'].set_upper(1000.0)
1260  particles['tau'].set_nuisance(10.0)
1261  #print "gridding"
1262  experror=0
1263  for lambdaval in logspace(log(lambdamin),log(lambdamax),
1264  base=exp(1),num=numpoints):
1265  for rel in logspace(-2, 2, num=numpoints):
1266  particles['lambda'].set_nuisance(lambdaval)
1267  #set new value of tau**2/sigma
1268  tauval = particles['tau'].get_nuisance()
1269  if tauval**2/rel > particles['sigma2'].get_upper():
1270  continue
1271  particles['sigma2'].set_nuisance(tauval**2/rel)
1272  #print "sigma2 has val",particles['sigma2'].get_nuisance(), \
1273  # "tau has val",particles['tau'].get_nuisance()
1274  #get exponent and compute reduced exponent
1275  exponent = gpr.get_minus_exponent()
1276  if isnan(exponent) or exponent <=0:
1277  if experror < 2:
1278  print "got invalid exponent at grid point "\
1279  "lambda=%f rel=%f exp=%s" % (lambdaval,rel, exponent)
1280  experror += 1
1281  continue
1282  redexp = tauval * exponent
1283  #compute maximum posterior value for tau**2 assuming jeffreys prior
1284  tauval = (redexp/(len(data['q'])+2))**.5
1285  sigmaval = tauval**2/rel
1286  #print "sigma=",sigmaval,"tau=",tauval,\
1287  # "tau**2/sigma2=",rel,"lambda=",lambdaval
1288  if sigmaval > particles['sigma2'].get_upper():
1289  #skip value if outside of bounds for sigma
1290  continue
1291  particles['tau'].set_nuisance(tauval)
1292  #reset tau to correct value and get minimized energy
1293  if sigmaval > particles['sigma2'].get_upper():
1294  #skip value if outside of bounds for sigma
1295  continue
1296  particles['sigma2'].set_nuisance(sigmaval)
1297  ene = model.evaluate(False)
1298  gridvalues.append((lambdaval,rel,sigmaval,tauval,ene))
1299  #fl.write(" ".join(["%f" % i
1300  # for i in lambdaval,rel,sigmaval,tauval,ene]))
1301  #fl.write('\n')
1302  #fl.write('\n')
1303  #print "minimizing"
1304  if experror >=2:
1305  print "skipped another " + str(experror-2) + " similar errors"
1306  particles['tau'].set_lower(0.01)
1307  particles['tau'].set_upper(100)
1308  particles['sigma2'].set_lower(1.)
1309  particles['sigma2'].set_upper(1000)
1310  if len(gridvalues) == 0:
1311  raise FittingError
1312  minval = gridvalues[0][:]
1313  for i in gridvalues:
1314  if i[4] < minval[4]:
1315  minval = i[:]
1316  minval=list(minval)
1317  #minval[0]=minval[0]
1318  #minval[1]=minval[1]
1319  #print "minimum",minval
1320  particles['lambda'].set_nuisance_is_optimized(True)
1321  particles['sigma2'].set_nuisance_is_optimized(True)
1322  particles['tau'].set_nuisance_is_optimized(True)
1323  #do 3 independent minimizations, pick best run
1324  bestmin=[]
1325  for i in xrange(3):
1326  particles['lambda'].set_nuisance(minval[0])
1327  particles['sigma2'].set_nuisance(minval[2])
1328  particles['tau'].set_nuisance(minval[3])
1329  do_quasinewton(model,10)
1330  bestmin.append((model.evaluate(False),
1331  particles['lambda'].get_nuisance(),
1332  particles['sigma2'].get_nuisance(),
1333  particles['tau'].get_nuisance()))
1334  bestmin.sort()
1335  particles['lambda'].set_nuisance(bestmin[0][1])
1336  particles['sigma2'].set_nuisance(bestmin[0][2])
1337  particles['tau'].set_nuisance(bestmin[0][3])
1338  #print " "
1339  #for i in bestmin:
1340  # print i
1341  #for i,p in particles.items():
1342  # print i,p.get_nuisance(),p.get_lower(),p.get_upper()
1343  #print " "
1344  #global fitno
1345  #fl=open('fit_%d.dat' % fitno,'w')
1346  #fitno += 1
1347  #for i,q in enumerate(data['q']):
1348  # fl.write('%s %s %s %s %s\n' % (q,data['I'][i],data['err'][i],
1349  # gp.get_posterior_mean([q]),
1350  # sqrt(gp.get_posterior_covariance([q],[q]))))
1351  #ene = model.evaluate(False)
1352  #newmin = [particles['lambda'].get_nuisance(),None,
1353  # particles['sigma2'].get_nuisance(),particles['tau'].get_nuisance(),
1354  # ene]
1355  #newmin[1]=newmin[3]**2/newmin[2]
1356  #newmin=tuple(newmin)
1357  initvals = dict([(k,v.get_nuisance()) for (k,v) in particles.iteritems()])
1358  return initvals
1359 
1360 def bayes_factor(data, initvals, verbose, mean_func, maxpoints):
1361  """given optimized values for the parameters (in initvals), the data, and
1362  the model specification (in mean_func), return
1363  0 the number of parameters (Np) that have been optimized
1364  1 sqrt(2pi)^Np
1365  2 hessian (H) wrt parameter vector
1366  3 1/2*log(abs(det(H)))
1367  4 minus log maximum posterior value (MP)
1368  5 minus log maximum likelihood value
1369  6 minus log maximum prior value
1370  7 exp(-MP)*sqrt(2Pi)^Np*det(H)^(-1/2) (bayes factor)
1371  8 MP - Np/2*log(2Pi) + 1/2*log(det(H)) (minus log bayes factor)
1372  """
1373  model, particles, functions, gp = setup_process(data, initvals, 1,
1374  maxpoints=maxpoints)
 1375  gpr = IMP.isd.GaussianProcessInterpolationRestraint(gp)
 1376  model.add_restraint(gpr)
1377  particles['A'].set_nuisance_is_optimized(True)
1378  if mean_func == 'Flat':
1379  particles['G'].set_nuisance_is_optimized(False)
1380  particles['Rg'].set_nuisance_is_optimized(False)
1381  particles['d'].set_nuisance_is_optimized(False)
1382  particles['s'].set_nuisance_is_optimized(False)
1383  else:
1384  particles['G'].set_nuisance_is_optimized(True)
1385  particles['Rg'].set_nuisance_is_optimized(True)
1386  if mean_func == 'Simple':
1387  particles['d'].set_nuisance_is_optimized(False)
1388  else:
1389  particles['d'].set_nuisance_is_optimized(True)
1390  if mean_func == 'Generalized':
1391  particles['s'].set_nuisance_is_optimized(False)
1392  else:
1393  particles['s'].set_nuisance_is_optimized(True)
1394  particles['tau'].set_nuisance_is_optimized(True)
1395  particles['lambda'].set_nuisance_is_optimized(True)
1396  particles['sigma2'].set_nuisance_is_optimized(True)
1397 
1398  H = matrix(gpr.get_hessian(False))
1399  Np = H.shape[0]
1400  MP = model.evaluate(False)
1401  ML = gpr.unprotected_evaluate(None)
1402  try:
1403  retval = linalg.slogdet(H)
1404  if retval[0] == 0 and retval[1] == -inf:
1405  print "Warning: skipping model %s" % mean_func
1406  logdet = inf
1407  else:
1408  logdet = retval[1]/2.
1409  except AttributeError:
1410  #older numpy versions don't have slogdet, try built-in
1411  #at the cost of an extra hessian calculation
1412  try:
1413  retval = gpr.get_logdet_hessian()
1414  if isinf(retval):
1415  print "Warning: re-skipping model %s" % mean_func
1416  logdet = inf
1417  else:
1418  logdet = retval/2.
1419  except IMP.ModelException:
1420  print "Warning: Hessian is not positive definite"
1421  logdet=inf
1422  return (Np, (2*pi)**(Np/2.), H, logdet, MP, ML, MP-ML,
1423  exp(-MP)*(2*pi)**(Np/2.)*exp(-logdet),
1424  MP - Np/2.*log(2*pi) + logdet)
1425 
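# ---- Illustrative note (annotation, not part of the original file) ----
# The last element returned by bayes_factor() is the minus log Bayes factor
# under a Laplace approximation around the posterior mode, i.e. (reusing the
# names from the function body):
#   minus_log_evidence = MP - Np/2.*log(2*pi) + logdet    # logdet = log(det H)/2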
1426 def find_fit(data, initvals, verbose, model_comp=False, model_comp_maxpoints=-1,
1427  mean_function='Simple', lambdamin=0.005):
1428  #if verbose >2:
1429  # print " %d:%d" % (subs,nsteps),
1430  # sys.stdout.flush()
1431  #
1432  #compile a list of functions to fit, ordered by number of params
1433  functions=[]
1434  if model_comp:
1435  for i in ['Flat','Simple','Generalized','Full']:
1436  functions.append(i)
1437  if i==mean_function:
1438  break
1439  else:
1440  functions.append(mean_function)
1441  param_vals = {}
1442  #optimize parameters for each function
1443  if verbose > 2:
1444  print ""
1445  for mean_func in functions:
1446  if verbose > 2:
1447  sys.stdout.write(" "+mean_func+": mean ")
1448  sys.stdout.flush()
1449  try:
1450  param_vals[mean_func] = \
1451  find_fit_mean(data, initvals, verbose, mean_func)
1452  except FittingError:
1453  if model_comp:
1454  print "error"
1455  sys.stdout.flush()
1456  continue
1457  else:
1458  raise FittingError, "Error while fitting! Choose"\
1459  " another model or do model comparison"
1460  if verbose > 2:
1461  sys.stdout.write("A=%1.2f " % param_vals[mean_func]['A'])
1462  if mean_func in ["Simple","Generalized","Full"]:
1463  sys.stdout.write("G=%1.2f " % param_vals[mean_func]['G'])
1464  sys.stdout.write("Rg=%1.2f " % param_vals[mean_func]['Rg'])
1465  if mean_func in ["Generalized","Full"]:
1466  sys.stdout.write("d=%1.2f " % param_vals[mean_func]['d'])
1467  if mean_func == "Full":
1468  sys.stdout.write("s=%1.2f " % param_vals[mean_func]['s'])
1469  sys.stdout.write("covariance ")
1470  sys.stdout.flush()
1471  #initvals = find_fit_lambda(data, initvals, args, verbose)
1472  #initvals = find_fit_covariance(data, initvals, args, verbose)
1473  try:
1474  param_vals[mean_func] = \
1475  find_fit_by_gridding(data, param_vals[mean_func], verbose,
1476  lambdamin)
1477  except FittingError:
1478  if model_comp:
1479  print "error"
1480  sys.stdout.flush()
1481  continue
1482  else:
1483  raise FittingError, "Error while fitting! Choose"\
1484  " another model or do model comparison"
1485  if verbose > 2:
1486  for i in ['tau','lambda','sigma2']:
1487  sys.stdout.write("%s=%1.2f " % (i,param_vals[mean_func][i]))
1488  print ""
1489  sys.stdout.flush()
1490  #compare functions via model comparison
1491  if len(functions) == 1:
1492  return functions[0], param_vals[functions[0]], {}
1493  free_energies={}
1494  if verbose > 2:
1495  print " -log(Bayes Factor):",
1496  for f in functions:
1497  if verbose > 2:
1498  print f+" =",
1499  if f in param_vals:
1500  free_energies[f]=bayes_factor(data, param_vals[f], verbose, f,
1501  model_comp_maxpoints)
1502  else:
1503  free_energies[f] = [-1]+[inf]*8
1504  if verbose > 2:
1505  print "%.1f" % free_energies[f][8],
1506  sys.stdout.flush()
1507  if verbose > 2:
1508  print ""
1509  minf = sorted([(free_energies[i][8],i) for i in functions])[0][1]
1510  return minf,param_vals[minf],free_energies
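# Model comparison in find_fit(): candidate mean functions are tried in
# order of increasing complexity (Flat, Simple, Generalized, Full) up to the
# requested mean_function; a candidate whose fit raises FittingError keeps
# an infinite -log(evidence) so it can never win. The selected model is the
# one with the smallest element [8] of the bayes_factor() tuple, i.e. the
# smallest -log(evidence).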
1511 
1512 def create_intervals_from_data(profile, flag):
1513  """This function creates intervals for the mean function so that
1514  the flag for the mean is consistent with that of the data.
 1515  example: q1:True, q2:False, q3:True, q4:True
1516  creates [q1:q2[ (True), [q2:q3[ (False) and [q3:q4[ (True)
1517  """
1518  if flag not in profile.get_flag_names():
1519  raise KeyError, "No such flag: %s" % flag
1520  data = profile.get_data(colwise=True)
1521  new=True
1522  interval=[]
1523  oldb = None
1524  oldval = None
1525  for q,b in zip(data['q'],data[flag]):
1526  if new:
1527  new = False
1528  oldval = q
1529  oldb = b
1530  elif b != oldb:
1531  profile.set_flag_interval(flag, oldval, q, oldb)
1532  oldval = q
1533  oldb = b
1534  if q != oldval:
1535  profile.set_flag_interval(flag, oldval, q, oldb)
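# Worked example of the interval construction above (a sketch, assuming a
# flag 'agood' at q = 0.01, 0.02, 0.03, 0.04 with values True, True, False,
# False): the loop emits set_flag_interval('agood', 0.01, 0.03, True) at the
# first change of value, and the trailing check after the loop closes the
# last open interval with set_flag_interval('agood', 0.03, 0.04, False).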
1536 
1537 
1538 def rescale_curves(refdata, data, normal = False, offset = False):
1539  """Find gamma and (optionnally) c so that
1540  normal model: I0 = gamma*(I1 + c)
1541  lognormal model log I0 = log(gamma*I1)
1542  if offset==False we have c=0
1543  """
1544  I0=array(refdata['I'])
1545  s0=array(refdata['err'])
1546  I1=array(data['I'])
1547  s1=array(data['err'])
1548  if not offset:
1549  if normal:
1550  weights = (s0**2+(s1*I0/I1)**2)**(-1)
1551  return (weights*I0*I1).sum()/(weights*I1**2).sum(),0
1552  else:
1553  weights=(s0**2/I0**2 + s1**2/I1**2)**(-1)
1554  lg = (weights*log(I0/I1)).sum()/weights.sum()
1555  return exp(lg),0
1556  else:
1557  if normal:
1558  weights = (s0**2+(s1*I0/I1)**2)**(-1)
1559  iexp = (weights*I0).sum()/weights.sum()
1560  icalc = (weights*I1).sum()/weights.sum()
1561  icalc2 = (weights*I1*I1).sum()/weights.sum()
1562  icie = (weights*I1*I0).sum()/weights.sum()
1563  c = (icalc*icie-icalc2*iexp)/(icalc*iexp-icie)
1564  gamma = (icie-iexp*icalc)/(icalc2-icalc**2)
1565  return gamma,c
1566  else:
1567  raise NotImplementedError
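# The closed forms above are weighted least-squares estimates. Normal model
# without offset: gamma minimizes sum(w*(I0 - gamma*I1)**2) with weights
# w = 1/(s0**2 + (s1*I0/I1)**2), giving gamma = sum(w*I0*I1)/sum(w*I1**2).
# Lognormal model: log(gamma) is the weighted mean of log(I0/I1) with
# weights 1/(s0**2/I0**2 + s1**2/I1**2). With offset=True (normal model
# only), gamma and c solve the two normal equations built from the weighted
# moments iexp, icalc, icalc2 and icie computed above.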
1568 
1569 
1570 
1571 def write_individual_profile(prof, qvals, args):
1572  destname = os.path.basename(prof.get_filename())
1573  if args.outlevel == 'sparse':
1574  dflags = ['q','I','err']
1575  mflags = ['q','I','err']
1576  elif args.outlevel == 'normal':
1577  dflags = ['q','I','err','agood']
1578  mflags = ['q','I','err','mean','agood']
1579  else:
1580  dflags = None
1581  mflags = None
1582  prof.write_data(destname, bool_to_int=True, dir=args.destdir,
1583  header=args.header, flags=dflags)
 1584  if args.postpone_cleanup or args.stop != "cleanup":
1585  if args.npoints >0:
1586  prof.write_mean(destname, bool_to_int=True, dir=args.destdir,
1587  header=args.header, average=args.eaverage, num=args.npoints,
1588  flags=mflags)
1589  else:
1590  qvalues = prof.get_data(colwise=True)['q']
1591  qmin = min(qvalues)
1592  qmax = max(qvalues)
1593  qvalues = qvals[where(qvals >= qmin)]
1594  qvalues = qvalues[where(qvalues <= qmax)]
1595  prof.write_mean(destname, bool_to_int=True, dir=args.destdir,
1596  header=args.header, average=args.eaverage, qvalues=qvalues,
1597  flags=mflags)
1598 
1599 def write_merge_profile(merge,qvals, args):
1600  if args.outlevel == 'sparse':
1601  dflags = ['q','I','err']
1602  mflags = ['q','I','err']
1603  elif args.outlevel == 'normal':
1604  dflags = ['q','I','err','eorigin','eoriname']
1605  mflags = ['q','I','err','mean','eorigin','eoriname','eextrapol']
1606  else:
1607  dflags = None
1608  mflags = None
1609  if args.verbose > 2:
1610  print " data ",
1611  merge.write_data(merge.get_filename(), bool_to_int=True, dir=args.destdir,
1612  header=args.header, flags=dflags)
1613  qmax = max([i[1] for i in merge.get_flag_intervals('eextrapol')])
1614  if args.enoextrapolate:
1615  qmin = min([i[0] for i in merge.get_flag_intervals('eextrapol')])
1616  else:
1617  qmin=0
1618  if args.npoints >0:
1619  if args.verbose > 2:
1620  print " mean ",
1621  merge.write_mean(merge.get_filename(), bool_to_int=True,
1622  dir=args.destdir, qmin=qmin, qmax=qmax, header=args.header,
1623  flags=mflags, num=args.npoints, average=args.eaverage,
1624  verbose=(args.verbose > 2))
1625  else:
1626  if args.verbose > 2:
1627  print " mean ",
1628  qvalues = qvals[where(qvals >= qmin)]
 1629  qvalues = qvalues[where(qvalues <= qmax)]
1630  merge.write_mean(merge.get_filename(), bool_to_int=True,
1631  dir=args.destdir, qvalues=qvalues, header=args.header,
1632  flags=mflags, average=args.eaverage, verbose=(args.verbose > 2))
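# Output range of the merged mean curve: qmax is the largest end of the
# 'eextrapol' flag intervals, so it includes the high-q extrapolation region
# when one was created; qmin is 0 unless args.enoextrapolate is set, in
# which case the curve starts at the lowest measured q.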
1633 
1634 def get_hessian_stats(profile, nmax):
1635  gp = profile._setup_gp(nmax)
1636  hessian = profile._get_hessian(gp)
1637  hessian_names = []
1638  for k in ['G','Rg','d','s','A','sigma2','tau','lambda']:
1639  if profile.particles[k].get_nuisance_is_optimized():
1640  hessian_names.append(k)
1641  return array(hessian), hessian_names
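# The rows/columns of the returned Hessian follow hessian_names, i.e. the
# subset of ['G','Rg','d','s','A','sigma2','tau','lambda'] that is actually
# optimized for this profile; write_summary_file() reports
# sqrt(Hessian[i][i]) as the +- value next to each fitted parameter.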
1642 
1643 def write_summary_file(merge, profiles, args):
1644  fl=open(os.path.join(args.destdir,args.sumname),'w')
1645  #header
1646  fl.write("#STATISTICAL MERGE: SUMMARY\n\n")
1647  fl.write("Ran with the following arguments:\n")
1648  fl.write(os.path.basename(sys.argv[0]) + " "
1649  + " ".join(sys.argv[1:]) + "\n\n")
1650  #merge file
1651  if args.stop == "merging":
1652  fl.write("Merge file\n"
1653  " General\n"
1654  " Filename: " + merge.filename + "\n")
1655  data = merge.get_data(colwise=True)
1656  fl.write(" Number of points: %d\n" % len(data['q']) +
1657  " Data range: %.5f %.5f\n" % (data['q'][0],data['q'][-1]))
1658  for i in xrange(len(profiles)):
1659  fl.write(" %d points from profile %d (%s)\n" %
1660  (len([1 for k in data['eorigin'] if k == i]),
1661  i, profiles[i].filename))
1662  fl.write(" Gaussian Process parameters\n")
1663  fl.write(" mean function : %s\n" % merge.mean)
1664  data = merge.get_params()
1665  if args.eerror or (not args.enocomp):
1666  Hessian, Hess_names = get_hessian_stats(merge,args.elimit_hessian)
1667  else:
1668  Hessian = None
1669  if Hessian is not None:
1670  for i,k in enumerate(Hess_names):
1671  fl.write(" %s : %f +- %f\n" % (k,data[k],sqrt(Hessian[i][i])))
1672  else:
1673  for i in sorted(data.keys()):
1674  fl.write(" %s : %f\n" % (i,data[i]))
1675  fl.write(" Calculated Values\n")
1676  Q1Rg = sqrt((data['d']-data['s'])*(3-data['s'])/2)
1677  fl.write(" Q1 : %f\n" % (Q1Rg/data['Rg']))
1678  fl.write(" Q1.Rg : %f\n" % Q1Rg)
1679  fl.write(" I(0) : %f\n" % \
1680  (merge.get_mean(qvalues=[0],
1681  average=args.eaverage)[0][1]))
1682  if not args.enocomp:
1683  fl.write(" Model Comparison : num_params -log10(Bayes Factor) "
1684  "-log(Posterior) -log(Likelihood) BIC AIC\n")
1685  for i in merge.bayes:
1686  name = '*'+i if i==merge.mean else i
1687  fl.write(" %s : %d\t%f\t%f\t%f\t%f\t%f\n" %
1688  (name, merge.bayes[i][0], merge.bayes[i][8]/log(10),
1689  merge.bayes[i][4], merge.bayes[i][5],
 1690  2*merge.bayes[i][5]
 1691  +merge.bayes[i][0]*log(len(merge.get_raw_data())),
 1692  2*merge.bayes[i][5]+2*merge.bayes[i][0]))
1693  fl.write(" Model Comparison : best model\n")
1694  fl.write(" Name : %s\n" % merge.mean)
1695  fl.write(" Number of parameters : %d\n" %
1696  merge.bayes[merge.mean][0])
1697  fl.write(" -log(Posterior) : %f\n" %
1698  merge.bayes[merge.mean][4])
1699  fl.write(" -log(Likelihood) : %f\n" %
1700  merge.bayes[merge.mean][5])
1701  if Hessian is not None:
1702  fl.write(" Hessian matrix : ")
1703  fl.write(" ".join(Hess_names))
1704  fl.write("\n")
1705  for i in Hessian:
1706  fl.write(" ")
1707  fl.write(" ".join(['{0: <12}'.format(s) for s in i]))
1708  fl.write("\n")
1709  fl.write("\n")
1710  else:
1711  fl.write("Merge step skipped\n\n")
1712  #individual profiles
1713  for i,p in enumerate(profiles):
1714  #general stuff
1715  fl.write("Input file %d\n" % i +
1716  " General\n" +
1717  " Filename: " + p.filename + "\n")
1718  data = p.get_raw_data()
1719  fl.write(" Number of points: %d \n" % len(data) +
1720  " Data range: %.5f %.5f \n" % (data[0][0],data[-1][0]))
1721  #cleanup
1722  if not args.postpone_cleanup:
1723  data = p.get_data(filter='agood',colwise=True)
1724  fl.write(" 1. Cleanup\n" +
1725  " Number of significant points: %d \n" % \
1726  len(data['q']) +
1727  " Data range: %.5f %.5f \n" % \
1728  (data['q'][0],data['q'][-1]))
1729  if args.stop == "cleanup":
1730  fl.write(" Skipped further steps\n\n")
1731  continue
1732  #fitting
1733  data = p.get_params()
1734  fl.write(" 2. GP parameters (values for non-rescaled curve)\n")
1735  fl.write(" mean function : %s\n" % p.mean)
1736  if args.berror or (not args.bnocomp):
1737  Hessian, Hess_names = get_hessian_stats(p,args.blimit_hessian)
1738  else:
1739  Hessian = None
1740  if Hessian is not None:
1741  for i,k in enumerate(Hess_names):
1742  fl.write(" %s : %f +- %f\n" % (k,data[k],
1743  sqrt(Hessian[i][i])))
1744  else:
1745  for i in sorted(data.keys()):
1746  fl.write(" %s : %f\n" % (i,data[i]))
1747  fl.write(" Calculated Values\n")
1748  qrg = sqrt((data['d']-data['s'])*(3-data['s'])/2)
1749  fl.write(" Q1 : %f\n" % (qrg/data['Rg']))
1750  fl.write(" Q1.Rg : %f\n" % qrg)
1751  fl.write(" I(0) : %f\n" % (p.get_mean(qvalues=[0],
1752  average=args.eaverage)[0][1]))
1753  if Hessian is not None:
1754  fl.write(" Hessian matrix : ")
1755  fl.write(" ".join(Hess_names))
1756  fl.write("\n")
1757  for i in Hessian:
1758  fl.write(" ")
1759  fl.write(" ".join(['{0: <12}'.format(s) for s in i]))
1760  fl.write("\n")
1761  if args.stop == "fitting":
1762  fl.write(" Skipped further steps\n\n")
1763  continue
1764  #rescaling
1765  data = p.get_gamma()
1766  fl.write(" 3. Rescaling\n")
1767  if args.cmodel == 'normal-offset':
1768  fl.write(" normal model with offset\n")
1769  fl.write(" offset : %f\n" % p.get_offset())
1770  elif args.cmodel == 'normal':
1771  fl.write(" normal model\n")
1772  else:
1773  fl.write(" lognormal model\n")
1774  fl.write(" gamma : %s\n" % data)
1775  if args.stop == "rescaling":
1776  fl.write(" Skipped further steps\n\n")
1777  continue
1778  #cleanup (if it has been postponed)
1779  if args.postpone_cleanup:
1780  data = p.get_data(filter='agood',colwise=True)
1781  fl.write(" 1. Cleanup (postponed)\n" +
1782  " Number of significant points: %d \n" % \
1783  len(data['q']) +
1784  " Data range: %.5f %.5f \n" % \
1785  (data['q'][0],data['q'][-1]))
1786  if args.stop == "cleanup":
1787  fl.write(" Skipped further steps\n\n")
1788  continue
1789  #classification
1790  data = p.get_data(filter='dgood',colwise=True)
1791  if data == {}:
1792  fl.write(" 4. Classification\n" +
1793  " Number of valid points: 0\n")
1794  else:
1795  fl.write(" 4. Classification\n" +
1796  " Number of valid points: %d \n" % len(data['q']) +
1797  " Data range: %.5f %.5f \n" % (data['q'][0],data['q'][-1]))
1798  fl.write("\n")
1799  #command-line arguments
1800  fl.write("List of all options used for the merge:\n")
1801  fl.write(' name (type) = "value"\n')
1802  for name in [ i for i in dir(args) if
1803  (not i.startswith('_'))
1804  and (not i in ['ensure_value','read_file','read_module'] ) ] :
1805  fl.write(" %s " % name)
1806  val = getattr(args,name)
1807  fl.write("(%s)" % type(val).__name__)
1808  fl.write('= "%s"\n' % val)
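# Derived quantities reported above: Q1.Rg = sqrt((d-s)*(3-s)/2) is the
# Guinier-Porod crossover momentum transfer times Rg (Q1 = Q1.Rg/Rg), and
# I(0) is the GP mean evaluated at q=0. In the model comparison table, k is
# the number of optimized parameters (bayes[i][0]), n the number of raw
# merged points, and the last two columns are the BIC (k*log(n) - 2*log L)
# and AIC (2*k - 2*log L), built from k and the -log(Likelihood) entry.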
1809 
1810 
1811 def initialize():
1812  args, files = parse_args()
1813  if args.verbose >= 2 :
1814  print "Parsing files and creating profile classes"
1815  print ""
1816  filenames,Nreps = parse_filenames(files, defaultvalue=10)
1817  args.filenames = filenames
1818  args.Nreps = Nreps
1819  scale = get_global_scaling_factor(filenames[-1])
1820  profiles = map(create_profile, filenames, Nreps, [scale]*len(filenames))
1821  #args.bschedule = parse_schedule(args.bschedule)
1822  #args.eschedule = parse_schedule(args.eschedule)
1823  return profiles, args
1824 
1825 def cleanup(profiles, args):
1826  """first stage of merge: discard low SNR data
1827  Created flags:
1828  agood : True if SNR is high enough
 1829  apvalue : p-value of the Student t-test
1830  """
1831  verbose = args.verbose
1832  alpha = args.aalpha
1833  q_cutoff = args.acutoff
1834  if verbose >0:
1835  print "1. cleanup ( alpha = %2G %% )" % (alpha*100)
1836  #loop over profiles
1837  good_profiles=[]
1838  for p in profiles:
1839  if verbose > 1:
1840  print " ",p.filename,
1841  N = p.get_Nreps()
1842  p.new_flag('agood',bool)
1843  p.new_flag('apvalue',float)
1844  all_points_bad = True
1845  #loop over individual points
1846  had_outlier = False
1847  for datum in p.get_data():
1848  id,q,I,err = datum[:4]
1849  if err == 0:
1850  p.set_flag(id,'agood', False)
1851  p.set_flag(id, 'apvalue', -1)
1852  continue
1853  pval,t = ttest_one(I,err,N)[0:2]
1854  if pval > alpha or had_outlier: #the point is invalid
1855  p.set_flag(id, 'agood', False)
 1856  if q >= q_cutoff and not had_outlier:
1857  had_outlier = True
1858  else:
1859  p.set_flag(id, 'agood', True)
1860  if all_points_bad:
1861  all_points_bad = False
1862  p.set_flag(id, 'apvalue', pval)
1863  if all_points_bad:
1864  print "Warning: all points in file %s have been discarded"\
1865  " on cleanup" % p.filename
1866  else:
1867  if verbose > 2:
1868  qvals = p.get_data(filter="agood", colwise=True)['q']
1869  print "\tqmin = %.3f\tqmax = %.3f" % (qvals[0], qvals[-1])
1870  elif verbose == 2:
1871  print ""
1872  good_profiles.append(p)
1873  #need a continuous indicator of validity
1874  for p in good_profiles:
1875  create_intervals_from_data(p, 'agood')
1876  return good_profiles, args
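# Cleanup criterion: every point is tested with a one-sample Student t test
# (ttest_one(I, err, N), N = number of repetitions); the point is kept
# ('agood' = True) when the p-value is <= args.aalpha, i.e. when its
# intensity is deemed significantly nonzero (high SNR). Past
# q >= args.acutoff, the first rejected point also invalidates all following
# points (had_outlier), so the curve is truncated at the first low-SNR point
# of the high-q region.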
1877 
1878 def fitting(profiles, args):
1879  """second stage of merge: gp fitting
1880  sets and optimizes the interpolant and enables calls to get_mean()
1881  """
1882  #schedule = args.bschedule
1883  verbose = args.verbose
1884  maxpointsF = args.blimit_fitting
1885  maxpointsH = args.blimit_hessian
1886  mean_function = args.bmean
1887  model_comp = not args.bnocomp
1888  if verbose >0:
1889  print "2. fitting"
1890  for p in profiles:
1891  if verbose > 1:
1892  print " ",p.filename,
1893  try:
1894  data = p.get_data(filter='agood',colwise=True, maxpoints=maxpointsF)
1895  except KeyError:
1896  data = p.get_data(colwise=True, maxpoints=maxpointsF)
1897  if verbose > 1:
1898  if len(data['q']) == maxpointsF:
1899  print " (subsampled fitting: %d points) " % maxpointsF,
1900  if model_comp and maxpointsH > 0 and maxpointsH < len(data['q']):
1901  print " (subsampled hessian: %d points) " % maxpointsH,
1902  data['N'] = p.get_Nreps()
1903  initvals={}
1904  initvals['G']=1. #will be calculated properly
1905  initvals['Rg']=30. #same
1906  initvals['d']=args.bd
1907  initvals['s']=args.bs
1908  initvals['A']=0.
1909  initvals['tau']=1.
1910  initvals['lambda']=1.
1911  initvals['sigma2']=1.
1912  mean, initvals, bayes = find_fit(data, initvals, #schedule,
1913  verbose, model_comp=model_comp, model_comp_maxpoints=maxpointsH,
1914  mean_function=mean_function,
1915  lambdamin=args.lambdamin)
1916  model, particles, functions, gp = setup_process(data,initvals,1)
1917  if bayes:
1918  p.set_interpolant(gp, particles, functions, mean, model, bayes,
1919  hessian=bayes[mean][2])
1920  else:
1921  p.set_interpolant(gp, particles, functions, mean, model, bayes)
1922  if verbose > 1 and model_comp:
1923  print " => "+mean
1924  return profiles, args
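# The interpolant fitted above is a Gaussian process whose mean function is
# parametrized by G, Rg, d, s and A (which of them is optimized depends on
# the chosen mean function: Flat, Simple, Generalized or Full) and whose
# covariance is parametrized by tau, lambda and sigma2. The hard-coded
# initial values (G=1, Rg=30, A=0, tau=lambda=sigma2=1) are only starting
# points for find_fit(); fitting uses at most args.blimit_fitting points,
# and the Hessian for model comparison at most args.blimit_hessian points.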
1925 
1926 def rescaling(profiles, args):
1927  """third stage of merge: rescaling
1928  Created flag:
1929  cgood : True if data point is both valid (wrt SNR) and in the validity
1930  domain of the rescaling reference curve (option --creference)
1931  sets profile.gamma to the correct value
1932  """
1933  use_normal = args.cmodel.startswith('normal')
1934  use_offset = use_normal and args.cmodel.endswith('offset')
1935  numpoints = args.cnpoints
1936  verbose = args.verbose
1937  reference = args.creference
1938  average = args.baverage
1939  if verbose >0:
1940  print "3. rescaling",
1941  if use_normal:
1942  print "(normal model",
1943  else:
1944  print "(lognormal model",
1945  if use_offset:
1946  print "with constant offset)"
1947  else:
1948  print "without constant offset)"
 1949  #take the last profile as internal reference since there's usually good overlap
1950  pref = profiles[-1]
1951  gammas = []
1952  for ctr,p in enumerate(profiles):
1953  #find intervals where both functions are valid
1954  p.new_flag('cgood',bool)
1955  pdata = p.get_data(colwise=True)
1956  if 'agood' in pdata: #assumed true for pref also
1957  for id,q,flag in zip(pdata['id'],pdata['q'],pdata['agood']):
1958  refflag = pref.get_flag(q,'agood')
1959  p.set_flag(id, 'cgood', flag and refflag)
1960  else: #assumed false for pref also
1961  refq = pref.get_data(colwise=True)['q']
1962  for id,q in zip(pdata['id'],pdata['q']):
1963  refflag = (refq[0] <= q <= refq[-1])
1964  p.set_flag(id, 'cgood', refflag)
1965  create_intervals_from_data(p, 'cgood')
1966  #generate points in these intervals to compute gamma
1967  goodip = [i for i in p.get_flag_intervals('cgood') if i[2]]
1968  totaldist = array([i[1]-i[0] for i in goodip]).sum()
1969  qvalues = []
1970  for interval in goodip:
1971  qmin = interval[0]
1972  qmax = interval[1]
1973  dist = qmax - qmin
1974  numint = int(round(dist/totaldist*numpoints))
1975  if numint <= 1:
1976  continue
1977  for i in xrange(numint):
1978  qvalues.append((float(i)/(numint-1))*dist + qmin)
1979  pvalues = p.get_mean(qvalues=qvalues, colwise=True, average=average)
1980  if True in map(isnan,pvalues['I']):
1981  raise RuntimeError, "Got NAN in I"
1982  if True in map(isnan,pvalues['err']):
1983  raise RuntimeError, "Got NAN in err"
1984  prefvalues = pref.get_mean(qvalues=qvalues, colwise=True,
1985  average=average)
1986  if True in map(isnan,prefvalues['I']):
1987  raise RuntimeError, "Got NAN in ref I"
1988  if True in map(isnan,prefvalues['err']):
1989  raise RuntimeError, "Got NAN in ref err"
1990  gammas.append(rescale_curves(prefvalues, pvalues,
1991  normal = use_normal, offset = use_offset))
1992  #fl=open('rescale_%d.npy' % ctr, 'w')
1993  #import cPickle
1994  #cPickle.dump([pvalues,prefvalues],fl)
1995  #fl.close()
1996  #fl=open('rescale_%d.dat' % ctr, 'w')
1997  #for i in xrange(len(pvalues['q'])):
1998  # fl.write("%s " % pvalues['q'][i])
1999  # fl.write("%s " % pvalues['I'][i])
2000  # fl.write("%s " % pvalues['err'][i])
2001  # fl.write("%s " % pvalues['q'][i])
2002  # fl.write("%s " % (gammas[-1][0]*pvalues['I'][i]))
2003  # fl.write("%s " % (gammas[-1][0]*pvalues['err'][i]))
2004  # fl.write("%s " % prefvalues['q'][i])
2005  # fl.write("%s " % prefvalues['I'][i])
2006  # fl.write("%s\n" % prefvalues['err'][i])
2007  #set gammas wrt reference
2008  if reference == 'first':
2009  gr=gammas[0][0]
2010  cr=gammas[0][1]
2011  else:
2012  gr = gammas[-1][0]
2013  cr = gammas[-1][1]
2014  for p,g in zip(profiles,gammas):
2015  gamma = g[0]/gr
2016  c = g[1] - cr
2017  p.set_gamma(gamma)
2018  p.set_offset(c)
2019  if verbose >1:
2020  print " ",p.filename," gamma = %.3G" % gamma,
2021  if use_offset:
2022  print " offset = %.3G" % c,
2023  print ""
2024  if True in map(isnan,gammas[-1]):
2025  raise RuntimeError, "Got NAN in rescaling step, try another rescaling " \
2026  "or fitting model."
2027  return profiles,args
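# Each (gamma, offset) pair above is first estimated against the internal
# reference profiles[-1] and then re-expressed relative to the profile
# selected by args.creference, so the chosen reference curve ends up with
# gamma == 1 and offset == 0 and every other curve is scaled onto it.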
2028 
2029 def classification(profiles, args):
2030  """fourth stage of merge: classify mean functions
2031  Created flags:
2032  drefnum : number of the reference profile for this point
2033  drefname : associated filename
2034  dgood : True if this point is compatible with the reference
2035  and False otherwise. Undefined if 'agood' is False for
2036  that point.
 2037  dselfref : True if this curve was its own reference, in which case
2038  dgood is also True
2039  dpvalue : p-value for the classification test
2040  """
2041  alpha = args.dalpha
2042  verbose = args.verbose
2043  average=args.baverage
2044  if verbose >0:
2045  print "4. classification ( alpha = %2G %% )" % (alpha*100)
2046  for i in xrange(len(profiles)):
2047  p = profiles[i]
2048  if verbose >1:
2049  print " ",p.filename
2050  p.new_flag('drefnum',int)
2051  p.new_flag('drefname',str)
2052  p.new_flag('dgood',bool)
2053  p.new_flag('dselfref',bool)
2054  p.new_flag('dpvalue',float)
2055  N = p.get_Nreps()
2056  for entry in p.get_data(filter='agood'):
2057  #find out who is the reference here
2058  for refnum,pref in enumerate(profiles[:i+1]):
2059  if pref.get_flag(entry[1],'agood',default=None):
2060  break
2061  p.set_flag(entry[0], 'drefnum', refnum)
2062  p.set_flag(entry[0], 'drefname', pref.get_filename())
2063  p.set_flag(entry[0], 'dselfref', pref is p)
2064  #do classification
2065  data = p.get_mean(qvalues=[entry[1]],colwise=True, average=average)
2066  refdata = pref.get_mean(qvalues=[entry[1]],colwise=True,
2067  average=average)
2068  Nref = pref.get_Nreps()
2069  pval, t, nu = ttest_two(data['I'][0],data['err'][0],N,
2070  refdata['I'][0],refdata['err'][0],Nref)
2071  p.set_flag(entry[0], 'dgood', pval >= alpha)
2072  p.set_flag(entry[0], 'dpvalue', pval)
2073  create_intervals_from_data(p, 'drefnum')
2074  create_intervals_from_data(p, 'drefname')
2075  create_intervals_from_data(p, 'dselfref')
2076  create_intervals_from_data(p, 'dgood')
2077  return profiles, args
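# Classification details: for every 'agood' point of profile i the reference
# is the first profile (scanning profiles[0..i]) that is also valid at that
# q, so a curve can be its own reference (dselfref). Compatibility is then a
# two-sample t test (ttest_two) between the two GP means at that q, using
# each profile's number of repetitions; the point is flagged dgood when the
# p-value is >= args.dalpha.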
2078 
2079 def merging(profiles, args):
2080  """last stage of merge: collect valid data points and fit them
2081  Creates a profile that contains all compatible data points. This profile has
2082  the following flags:
2083  dselfref, drefnum, drefname (see above)
2084  eorigin : profile index from which this point originates
2085  eoriname : associated filename
2086  it then sets and optimizes the interpolant and enables calls to get_mean()
2087  """
2088  #schedule = args.eschedule
2089  verbose = args.verbose
2090  maxpointsF = args.elimit_fitting
2091  maxpointsH = args.elimit_hessian
2092  do_extrapolation = not args.enoextrapolate
2093  extrapolate = 1+args.eextrapolate/float(100)
2094  mean_function = args.emean
2095  model_comp = not args.enocomp
2096  if verbose > 0:
2097  print "5. merging"
2098  print " gathering data"
2099  merge = SAXSProfile()
2100  merge.set_filename(args.mergename)
2101  merge.new_flag('dselfref',bool)
2102  merge.new_flag('drefnum',int)
2103  merge.new_flag('drefname',str)
2104  merge.new_flag('eorigin',int)
2105  merge.new_flag('eoriname',str)
2106  merge.new_flag('eextrapol',bool)
2107  flags_to_keep=['q','I','err', 'dselfref','drefnum','drefname']
2108  #loop over profiles and add data
2109  for i,p in enumerate(profiles):
2110  if verbose >1:
2111  print " ",p.filename
2112  #get data and keep only relevant flags
2113  data = p.get_data(filter='dgood',colwise=True)
2114  if data == {}: #no good datapoints
2115  continue
2116  #flag_numbers = [p.flag_dict[k]+1 for k in flags_to_keep]
2117  #print len(flag_numbers)
2118  #cleaned = [[d for j,d in enumerate(dat) if j in flag_numbers]
2119  # + [i,p.filename] for dat in data]
2120  #print cleaned
2121  for k in data.keys():
2122  if k not in flags_to_keep:
2123  data.pop(k)
2124  data['eorigin']=[i]*len(data['q'])
2125  data['eoriname']=[p.filename]*len(data['q'])
2126  data['eextrapol']=[False]*len(data['q'])
2127  cleaned = []
2128  all_flags = flags_to_keep + ['eorigin','eoriname','eextrapol']
2129  for j in xrange(len(data['q'])):
2130  entry=[]
2131  for k in all_flags:
2132  entry.append(data[k][j])
2133  cleaned.append(entry)
2134  merge.add_data(cleaned)
2135  if verbose >0:
2136  print " calculating merged mean",
2137  #recompute all intervals
2138  for n,t in zip(merge.flag_names,merge.flag_types):
2139  if t != float and n not in ['q','I','err']:
2140  create_intervals_from_data(merge, n)
2141  #create interval for extrapolation
2142  data = merge.get_data(colwise=True)['q']
2143  merge.set_flag_interval('eextrapol', min(data), max(data), False)
2144  if do_extrapolation:
2145  merge.set_flag_interval('eextrapol',0,min(data),True)
2146  if args.eextrapolate > 0:
2147  merge.set_flag_interval('eextrapol',
2148  max(data), max(data)*extrapolate, True)
2149  #set Nreps to min of all
 2150  #it's the best we can do when fitting all points simultaneously
2151  merge.set_Nreps(min([p.get_Nreps() for p in profiles]))
2152  #fit curve
2153  data = merge.get_data(colwise=True, maxpoints=maxpointsF)
2154  if verbose > 1:
2155  if len(data['q']) == maxpointsF:
2156  print " (subsampled fitting: %d points) " % maxpointsF,
2157  if model_comp and maxpointsH > 0 and maxpointsH < len(data['q']):
2158  print " (subsampled hessian: %d points) " % maxpointsH,
2159  data['N'] = merge.get_Nreps()
2160  #take initial values from the curve which has gamma == 1
2161  initvals = profiles[-1].get_params()
2162  mean, initvals, bayes = find_fit(data, initvals, #schedule,
2163  verbose, model_comp=model_comp,
2164  model_comp_maxpoints=maxpointsH,
2165  mean_function=mean_function,
2166  lambdamin=args.lambdamin)
2167  if verbose > 1 and model_comp:
2168  print " => "+mean
2169  model, particles, functions, gp = setup_process(data,initvals,1)
2170  if bayes:
2171  merge.set_interpolant(gp, particles, functions, mean, model, bayes,
2172  hessian=bayes[mean][2])
2173  else:
2174  merge.set_interpolant(gp, particles, functions, mean, model, bayes)
2175  if verbose > 0:
2176  print ""
2177  return merge, profiles, args
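# Notes on the merge profile: only points flagged 'dgood' in each input are
# collected, tagged with their origin (eorigin/eoriname); Nreps is set to
# the minimum over all inputs, the conservative choice when all points are
# fitted together; the 'eextrapol' flag marks the extrapolated regions,
# [0, qmin] at low q and up to qmax*(1 + args.eextrapolate/100) at high q,
# when extrapolation is enabled.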
2178 
2179 def write_data(merge, profiles, args):
2180  if args.verbose > 0:
2181  print "writing data"
2182  if not os.path.isdir(args.destdir):
2183  os.mkdir(args.destdir)
2184  qvals = array(profiles[0].get_raw_data(colwise=True)['q'])
2185  #individual profiles
2186  if args.allfiles:
2187  if args.verbose > 1:
2188  print " individual profiles"
2189  for i in profiles:
2190  if args.verbose > 2:
2191  print " %s" % (i.get_filename())
2192  write_individual_profile(i, qvals, args)
2193  #merge profile
2194  if args.stop == 'merging':
2195  if args.verbose > 1:
2196  print " merge profile",
2197  write_merge_profile(merge, qvals, args)
2198  if args.verbose > 1:
2199  print ""
2200  #summary
2201  if args.verbose > 1:
2202  print " summary"
2203  write_summary_file(merge, profiles, args)
2204  if args.verbose > 0:
2205  print "Done."
2206 
2207 def main():
2208  #get arguments and create profiles
2209  profiles, args = initialize()
2210  #create list of steps to do, in order
2211  if args.postpone_cleanup:
2212  steps_to_go = ["fitting","rescaling","cleanup",
2213  "classification","merging"]
2214  else:
2215  steps_to_go = ["cleanup","fitting","rescaling",
2216  "classification","merging"]
2217  steps_to_go = steps_to_go[:steps_to_go.index(args.stop)+1]
2218  #call steps in turn
2219  merge = None
2220  for step in steps_to_go:
2221  if step != 'merging':
2222  profiles, args = globals()[step](profiles, args)
2223  else:
2224  merge, profiles, args = merging(profiles, args)
2225  #write output
2226  write_data(merge, profiles, args)
2227 
2228 if __name__ == "__main__":
2229  main()