from random import sample
from scipy.stats import t as student_t
def subsample(idx, data, npoints):
    """keep min(npoints,len(data)) out of data if npoints>0,
    subsampling evenly along the idx (float) coordinate
    """
    if npoints >= Ntot or npoints <= 0:
        return data
    qvals = array([i[0] for i in all])
    for q in linspace(qmin, qmax, num=npoints):
        newdata.append(all[i][1])
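
# Minimal standalone sketch (not the function above, whose body is partly
# elided) of what "even subsampling along a float coordinate" means: pick
# npoints q-values evenly spaced between qmin and qmax and keep, for each,
# the datum whose q is closest.  Names here (even_subsample, toy data) are
# illustrative only.
import numpy

def even_subsample(q, data, npoints):
    q = numpy.asarray(q, dtype=float)
    if npoints <= 0 or npoints >= len(data):
        return list(data)
    kept = []
    for target in numpy.linspace(q.min(), q.max(), num=npoints):
        i = int(numpy.abs(q - target).argmin())
        if not kept or kept[-1] is not data[i]:
            kept.append(data[i])
    return kept

# example: keep 3 of 5 points, evenly spread in q
print even_subsample([0.01, 0.02, 0.05, 0.2, 0.3], ['a', 'b', 'c', 'd', 'e'], 3)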
class FittingError(Exception):


        self.flag_names = ["q", "I", "err"]
        self.flag_types = [float, float, float]
        self.flag_dict = {"q": 0, "I": 1, "err": 2}
        self.nflags = len(self.flag_names)
    def add_data(self, input, offset=0, positive=False, err=True, scale=1):
        """add experimental data to saxs profile.
        offset=i means discard first i columns
        positive=True means only keep intensities that are >0
        err=True means keep only points that have >0 error bar
        scale is a factor by which to multiply input I and err
        """
        if isinstance(input, str):
            stuff = [i.split() for i in open(input).readlines()]
            stuff = filter(lambda a: (len(a) >= self.nflags)
                           and not a[0].startswith('#'), stuff)
            stuff = map(lambda a: a[:self.nflags + offset], stuff)
        elif isinstance(input, list) or isinstance(input, tuple):
            if len(i) != self.nflags + offset:
                raise TypeError, \
                    "length of each entry should be %d" % \
            raise ValueError, "Unknown data type"
            for f, z in zip(self.flag_types, a[offset:]):
            if positive and entry[1] <= 0:
            if err and entry[2] <= 0:
        self.data += copy.deepcopy(data)
        self.data.sort(key=lambda a: a[0])
    def get_raw_data(self, colwise=False):
        """return original q,I,err values, sorted"""
        if colwise:
            data = self.get_raw_data(colwise=False)
            retval = {}
            retval[self.flag_names[0]] = [d[0] for d in data]
            retval[self.flag_names[1]] = [d[1] for d in data]
            retval[self.flag_names[2]] = [d[2] for d in data]
            return retval
        return [i[:3] for i in self.data]
    def get_data(self, *args, **kwargs):
        """get data, rescaled by self.gamma and self.offset
        Returns a list of tuples, each of which is organized as:
        Column 1 : unique id (increasing)
        Column 2 : q (increasing)
        Columns 5+ : flags, as indicated by self.get_flag_names()
        qmin : specify the starting q value
        qmax : specify the ending q value
            If they are not specified, take the data's min and max.
            Both have to be specified or none of them.
        filter : string or list of strings corresponding to a boolean flag
            name which should be used to filter the data before returning it.
            Data is returned only if all of the provided flags are True.
        colwise : instead of returning a list of tuples for each data point,
            return a dictionary with flag names as keys.
        maxpoints : only return at most N points, subsampling if necessary.
        """
        if 'filter' in kwargs:
            filt = kwargs.pop('filter')
        if isinstance(filt, str):
            flagnos = [self.flag_dict[filt]]
        else:
            flagnos = [self.flag_dict[name] for name in filt]
        if 'maxpoints' in kwargs:
            maxpoints = kwargs.pop('maxpoints')
        if 'colwise' in kwargs:
            colwise = kwargs.pop('colwise')
        if len(args) == 0 and len(kwargs) == 0:
            qmin = self.data[0][0]
            qmax = self.data[-1][0]
        elif (len(args) == 2) ^ (len(kwargs) == 2):
                raise TypeError, "arguments have incorrect type"
                raise ValueError, "qmin > qmax !"
            if set(kwargs.keys()) != set(['qmax', 'qmin']):
                raise TypeError, "incorrect keyword argument(s)"
        else:
            raise TypeError, "provide zero or two arguments (qmin, qmax)"
        if colwise:
            retval = dict.fromkeys(self.get_flag_names() + ('id',))
        for i, d in enumerate(self.data):
            if len(flagnos) > 0 and False in [d[k] is True for k in flagnos]:
                continue
            cd[1] = self.gamma * (cd[1] + self.offset)
            cd[2] = self.gamma * cd[2]
            if colwise:
                retval['id'].append(i)
                for k in self.get_flag_names():
                    retval[k].append(cd[self.flag_dict[k]])
            else:
                retval.append(tuple([i, ] + cd))
        if colwise:
            keys, values = zip(*retval.items())
            values = zip(*self._subsample(retval['q'], zip(*values), maxpoints))
            values = map(list, values)
            return dict(zip(keys, values))
        else:
            return self._subsample([i[1] for i in retval], retval, maxpoints)
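
    # Illustrative aside (hypothetical helper, not part of the class API): the
    # colwise branch above relies on zip(*rows) to turn per-point tuples into
    # per-column lists and back.  Calling this sketch shows the round trip on
    # toy data.
    def _demo_colwise_roundtrip(self):
        rows = [(0, 0.01, 153.2, 4.1), (1, 0.02, 149.8, 3.9)]
        names = ('id', 'q', 'I', 'err')
        columns = dict(zip(names, map(list, zip(*rows))))
        print columns['q']                            # [0.01, 0.02]
        print zip(*[columns[n] for n in names])[0]    # (0, 0.01, 153.2, 4.1)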
    def get_offset(self):

    def set_gamma(self, gamma):

    def set_offset(self, offset):

    def new_flag(self, name, tp):
        """add extra flag to all data points. Initialized to None"""
        self.flag_names.append(name)
        self.flag_types.append(tp)
        self.flag_dict[name] = self.nflags
        self.intervals[name] = []

    def set_flag(self, id, name, value):
        "set the flag of a data point"
        flagno = self.flag_dict[name]
        flagtype = self.flag_types[flagno]
        try:
            converted = flagtype(value)
        except:
            raise TypeError, "unable to cast given value to %s" % flagtype
        self.data[id][flagno] = converted
    def get_flag(self, val, flag, default='---'):
        """get the flag of a point in the profile
        call with (id, flag) to get the flag of data point id
        call with (qvalue, flag) to get the flag of an unobserved data point.
        Uses the flag intervals for this.
        If the flag is not defined for that point, return default
        """
        if isinstance(val, int):
            retval = self.data[val][self.flag_dict[flag]]
        elif not isinstance(val, float):
            raise ValueError, "first argument must be int or float"
        if flag not in self.get_flag_names():
            raise KeyError, "flag %s does not exist!" % flag
        for qmin, qmax, flagval in self.get_flag_intervals(flag):
            if qmin <= val <= qmax:
                return flagval
        return default
    def set_interpolant(self, gp, particles, functions, mean, model, bayes,
                        hessian=None):
        """store a class that gives interpolated values,
        usually an instance of the GaussianProcessInterpolation
        """
        if hessian is not None:
            self._memoized_hessian = hessian
            self.recompute_hessian = False
        else:
            self.recompute_hessian = True
        if not hasattr(gp, "get_posterior_mean"):
            raise TypeError, "interpolant.get_posterior_mean does not exist"
        if not hasattr(gp, "get_posterior_covariance"):
            raise TypeError, "interpolant.get_posterior_covariance "\
        if not isinstance(particles, dict):
            raise TypeError, "second argument should be dict"
        for p in particles.values():
            raise TypeError, "particles should all be ISD Nuisances"
        if not set(functions.keys()) == set(['mean', 'covariance']):
            raise TypeError, "expected mean and covariance functions"
            raise TypeError, "fourth argument is expected to be an IMP.Model()"
        self.particles = particles
        self.functions = functions
    def set_flag_interval(self, flag, qmin, qmax, value):
        if flag not in self.flag_names:
            raise ValueError, "unknown flag %s" % flag
        for imin, imax, ival in self.intervals[flag]:
            if imax < qmin or imin > qmax:
                intervals.append((imin, imax, ival))
            qmin = min(qmin, imin)
            qmax = max(qmax, imax)
            if imin < qmin and imax < qmax:
                intervals.append((imin, imax, ival))
            elif imin > qmin and imax > qmax:
                intervals.append((imin, imax, ival))
            elif imin < qmin and imax > qmax:
                intervals.append((imin, qmin, ival))
                intervals.append((qmax, imax, ival))
        newinterval = (qmin, qmax, None)
        newinterval = (qmin, qmax,
                       self.flag_types[self.flag_dict[flag]](value))
        intervals.append(newinterval)
        intervals.sort(key=lambda a: a[0])
        self.intervals[flag] = intervals

    def get_flag_intervals(self, flag):
        """returns a list of [qmin, qmax, flag_value] lists"""
        return self.intervals[flag]
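
    # Illustrative aside (hypothetical helper, not part of the class API):
    # a flag that is not stored per data point can still be answered for any
    # q by scanning the sorted (qmin, qmax, value) intervals kept above, which
    # is essentially what get_flag() does for float arguments.
    def _demo_interval_lookup(self, q, flag, default='---'):
        for qmin, qmax, value in self.get_flag_intervals(flag):
            if qmin <= q <= qmax:
                return value
        return default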
    def _subsample(self, q, data, npoints):
        return subsample(q, data, npoints)

    def _setup_gp(self, npoints):
        q = self.gp.get_data_abscissa()
        I = self.gp.get_data_mean()
        S = self.gp.get_data_variance()
        err = [S[i][i] for i in xrange(len(S))]
        q, I, err = zip(*self._subsample(q, zip(q, I, err), npoints))
                self.get_Nreps(), self.functions['mean'],
                self.functions['covariance'], self.particles['sigma2'])
    def _get_hessian(self, gp):
        self.particles['A'].set_nuisance_is_optimized(True)
        if self.mean == 'Flat':
            self.particles['G'].set_lower(0)
            self.particles['G'].set_nuisance(0)
            self.particles['G'].set_upper(0)
            self.particles['G'].set_nuisance_is_optimized(False)
            self.particles['Rg'].set_nuisance_is_optimized(False)
            self.particles['d'].set_nuisance_is_optimized(False)
            self.particles['s'].set_nuisance_is_optimized(False)
        else:
            self.particles['G'].set_nuisance_is_optimized(True)
            self.particles['Rg'].set_nuisance_is_optimized(True)
            if self.mean == 'Simple':
                self.particles['d'].set_nuisance_is_optimized(False)
            else:
                self.particles['d'].set_nuisance_is_optimized(True)
                if self.mean == 'Generalized':
                    self.particles['s'].set_nuisance_is_optimized(False)
                else:
                    self.particles['s'].set_nuisance_is_optimized(True)
        self.particles['tau'].set_nuisance_is_optimized(True)
        self.particles['lambda'].set_nuisance_is_optimized(True)
        self.particles['sigma2'].set_nuisance_is_optimized(True)
        if self.recompute_hessian is False:
            return self._memoized_hessian
        self.model.add_restraint(gpr)
        Hessian = matrix(gpr.get_hessian(False))
        if linalg.det(Hessian) == 0:
            print Hessian, [(i, j.get_nuisance())
                            for i, j in self.particles.items()
                            if j.get_nuisance_is_optimized()]
        self.model.remove_restraint(gpr)
        self._memoized_hessian = Hessian
        self.recompute_hessian = False
    def get_mean(self, **kwargs):
        """returns (q,I,err,mean,...) tuples for num points (default 200)
        ranging from qmin to qmax (whole data range by default)
        possible keyword arguments:
            num qmin qmax filter average verbose
        mean is the mean function of the gaussian process.
        the length of the returned tuple is the same as the number of flags
        plus one for the mean function.
        You can also give specific values via the qvalues keyword argument.
        see get_data for details on the filter argument. If num is used with
        filter, the function returns at most num elements and possibly none.
        average : if 0, just output maximum posterior estimate (faster).
            if <0, compute hessians and average out parameter vector.
            if >0, same as <0 but only take N data points instead of all
        verbose : if True, display progress meter in %
        """
        rem = set(kwargs.keys()) \
            - set(['qvalues', 'num', 'qmin', 'qmax', 'filter',
                   'colwise', 'average', 'verbose'])
            raise TypeError, "Unknown keyword(s): %s" % list(rem)
        if 'average' in kwargs and kwargs['average'] != 0:
            average = kwargs['average']
            gp = self._setup_gp(average)
            Hessian = self._get_hessian(gp)
        if 'qvalues' in kwargs:
            qvals = kwargs['qvalues']
            qmin = kwargs['qmin']
            qmin = self.data[0][0]
            qmax = kwargs['qmax']
            qmax = self.data[-1][0]
            qvals = linspace(qmin, qmax, num)
        if 'verbose' in kwargs:
            verbose = kwargs['verbose']
        if 'colwise' in kwargs:
            colwise = kwargs['colwise']
        if 'filter' in kwargs:
            filt = kwargs.pop('filter')
        if isinstance(filt, str):
            flagnos = [self.flag_dict[filt]]
        else:
            flagnos = [self.flag_dict[name] for name in filt]
        flagnames = list(self.get_flag_names())
        flagnames = flagnames[:3] + ['mean'] + flagnames[3:]
        gamma = self.get_gamma()
        offset = self.get_offset()
        retval = dict.fromkeys(flagnames)
        flagdict = copy.deepcopy(self.flag_dict)
        for i, q in enumerate(qvals):
            if verbose:
                percent = str(int(round((i + 1) / float(len(qvals)) * 100)))
                sys.stdout.write("%s%%" % (percent))
                sys.stdout.write('\b' * (len(percent) + 1))
            if len(flagnames) > 4:
                flags = map(self.get_flag, [q] * len(flagnames[4:]),
                if False in [flags[i - 3] for i in flagnos]:
                    continue
            thing = [q, gamma * (self.gp.get_posterior_mean([q]) + offset),
                     gamma * sqrt(self.gp.get_posterior_covariance([q], [q])),
                     gamma * (self.functions['mean']([q])[0] + offset)]
            Hder = matrix(gp.get_posterior_covariance_derivative(
            Hcov = matrix(gp.get_posterior_covariance_hessian(
            Hlcov = 1 / cov * (-Hcov + 1 / cov * Hder * Hder.T)
            lapl = linalg.det(matrix(eye(Hessian.shape[0]))
                              + linalg.solve(Hessian, Hlcov)) ** (-0.5)
            thing[2] = sqrt(cov * lapl)
            if colwise:
                for flag in flagnames:
                    retval[flag].append(thing[flagdict[flag]])
            else:
                retval.append(tuple(thing))
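
    # Illustrative aside (hypothetical helper, not part of the class API):
    # numeric sketch of the error-bar correction used just above, where the
    # posterior variance cov is rescaled by det(I + Hessian^-1 * Hlcov)^(-1/2),
    # a Laplace approximation over the parameter vector.  Toy matrices only;
    # the real Hessian comes from the GP restraint.
    def _demo_laplace_correction(self):
        from numpy import matrix, eye, sqrt, linalg
        Hessian = matrix([[4.0, 0.5], [0.5, 3.0]])   # -log posterior curvature
        Hlcov = matrix([[0.2, 0.0], [0.0, 0.1]])     # covariance-term curvature
        cov = 2.5                                     # posterior variance at q
        lapl = linalg.det(matrix(eye(Hessian.shape[0]))
                          + linalg.solve(Hessian, Hlcov)) ** (-0.5)
        return sqrt(cov * lapl)                       # corrected error bar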
    def get_flag_names(self):
        return tuple(self.flag_names)

    def get_params(self):
        retval = {}
        for k, v in self.particles.items():
            retval[k] = v.get_nuisance()
        return retval
    def set_Nreps(self, nreps):

    def write_data(self, filename, prefix='data_', suffix='', sep=' ',
                   bool_to_int=False, dir='./', header=True, flags=None,
                   float_fmt='%-15.14G', *args, **kwargs):
        fl = open(os.path.join(dir, prefix + filename + suffix), 'w')
        allflags = self.get_flag_names()
        for i, name in enumerate(flags):
            fl.write("%d:%s%s" % (i + 1, name, sep))
        for d in self.get_data(*args, **kwargs):
            for flagname, ent in zip(allflags, d[1:]):
                if flagname not in flags:
                    continue
                if bool_to_int and isinstance(ent, bool):
                elif isinstance(ent, float):
                    fl.write(float_fmt % ent)
    def write_mean(self, filename, prefix='mean_', suffix='', sep=' ',
                   bool_to_int=False, dir='./', header=True, flags=None,
                   float_fmt="%-15.14G", *args, **kwargs):
        """write gaussian process interpolation to a file named
        dir+prefix+filename+suffix.
        bool_to_int : convert booleans to 0 or 1 in output.
        header : if True, first line starts with # and is a header
        flags : specify which flags to write, if None write all.
        float_fmt : the format for outputting floats
        """
        fl = open(os.path.join(dir, prefix + filename + suffix), 'w')
        allflags = list(self.get_flag_names())
        allflags = allflags[:3] + ['mean'] + allflags[3:]
        for i, name in enumerate(flags):
            fl.write("%d:%s%s" % (i + 1, name, sep))
        for d in self.get_mean(*args, **kwargs):
            for flagname, ent in zip(allflags, d):
                if flagname not in flags:
                    continue
                if bool_to_int and isinstance(ent, bool):
                elif isinstance(ent, float):
                    fl.write(float_fmt % ent)

    def set_filename(self, fname):
        self.filename = fname

    def get_filename(self):
        return self.filename
class _RawEpilogFormatter(optparse.IndentedHelpFormatter):
    """Output the epilog help text as-is"""
    def format_epilog(self, epilog):
        return epilog

        description="Perform a statistical merge of the given SAXS curves. "
                    "file is a 3-column file that contains SAXS data. "
                    "To specify the number of repetitions of this experiment, "
                    "use the syntax file.txt=20 indicating that data "
                    "in file.txt is an average of 20 experiments. Default is "
                    "10. Note that in the case of different number of "
                    "repetitions, the minimum is taken for the final "
                    "fitting step (Step 5).",
        formatter=_RawEpilogFormatter(),
        usage="%prog [options] file [file ...]",
        epilog="""Output file legend:\n
agood (bool) : True if SNR is high enough
apvalue (float) : p-value of the student t test
cgood (bool) : True if data point is both valid (wrt SNR) and in the
               validity domain of the gamma reference curve (the last
drefnum (int) : number of the reference profile for this point
drefname (string) : associated filename
dgood (bool) : True if this point is compatible with the reference and
               False otherwise. Undefined if 'agood' is False for that
dselfref (bool) : True if this curve was its own reference, in which
               case dgood is also True
dpvalue (float) : p-value for the classification test
eorigin (int) : profile index from which this point originates
eoriname (string) : associated filename
eextrapol (bool) : True if mean function is being extrapolated.
656 parser.add_option(
'--verbose',
'-v', action=
"count",
657 dest=
'verbose', default=0,
658 help=
"Increase verbosity. Can be repeated up to 3 times "
661 group = optparse.OptionGroup(parser, title=
"general")
662 parser.add_option_group(group)
663 group.add_option(
'--mergename', help=
"filename suffix for output "
664 "(default is merged.dat)", default=
'merged.dat', metavar=
'SUFFIX')
665 group.add_option(
'--sumname', metavar=
'NAME', default=
'summary.txt',
666 help=
"File to which the merge summary will be written."
667 " Default is summary.txt")
668 group.add_option(
'--destdir', default=
"./", metavar=
'DIR',
669 help=
"Destination folder in which files will be written")
670 group.add_option(
'--header', default=
False, action=
'store_true',
671 help=
"First line of output files is a header (default False)")
672 group.add_option(
'--outlevel', default=
'normal', help=
"Set the output "
673 "level, 'sparse' is for q,I,err columns only, 'normal' adds "
674 "eorigin, eoriname and eextrapol (default), and 'full' outputs "
675 "all flags.", type=
"choice", choices=[
'normal',
'sparse',
'full'])
676 group.add_option(
'--allfiles', default=
False, action=
'store_true',
677 help=
"Output data files for parsed input files as well (default "
678 "is only to output merge and summary files).")
679 group.add_option(
'--stop', type=
'choice', help=
"stop after the given step, "
680 "one of cleanup, fitting, rescaling, classification, merging "
681 "(default: merging)", choices=[
"cleanup",
"fitting",
"rescaling",
682 "classification",
"merging"], default=
"merging")
683 group.add_option(
'--postpone_cleanup', action=
'store_true', default=
False,
684 help=
"Cleanup step comes after rescaling step (default is False)")
685 group.add_option(
'--npoints', type=
"int", default=200, metavar=
"NUM",
686 help=
"Number of points to output for the mean function. Negative "
687 "values signify to take the same q values as the first data file. "
688 "In that case extrapolation flags are ignored, and extrapolation "
689 "is performed when the data file's q values fall outside of the "
690 "range of accepted data points. Default is NUM=200 points.")
691 group.add_option(
'--lambdamin', type=
"float", default=0.005, metavar=
"MIN",
692 help=
"lower bound for lambda parameter in steps 2 and 5")
694 group = optparse.OptionGroup(parser, title=
"Cleanup (Step 1)",
695 description=
"Discard or keep SAXS curves' "
696 "points based on their SNR. Points with an error"
697 " of zero are discarded as well")
698 parser.add_option_group(group)
700 group.add_option(
'--aalpha', help=
'type I error (default 1e-4)',
701 type=
"float", default=1e-4, metavar=
'ALPHA')
702 group.add_option(
'--acutoff', help=
'when a value after CUT is discarded,'
703 ' the rest of the curve is discarded as well (default is 0.1)',
704 type=
"float", default=0.1, metavar=
'CUT')
706 group = optparse.OptionGroup(parser, title=
"Fitting (Step 2)",
707 description=
"Estimate the mean function and the noise level "
708 "of each SAXS curve.")
709 parser.add_option_group(group)
710 group.add_option(
'--bd', help=
'Initial value for d (default 4)',
711 type=
"float", default=4., metavar=
'D')
712 group.add_option(
'--bs', help=
'Initial value for s (default 0)',
713 type=
"float", default=0., metavar=
'S')
714 group.add_option(
'--bmean', help=
'Defines the most complex mean '
715 'function that will be tried during model comparison.'
716 " One of Flat (the offset parameter A is optimized), "
717 "Simple (optimizes A, G and Rg), "
718 "Generalized (optimizes G, Rg and d), "
719 "Full (default, optimizes G, Rg, d and s) "
720 "If --bnocomp is given, will try to fit only with this model",
721 type=
"choice", default=
"Full",
722 choices=[
'Flat',
'Simple',
'Generalized',
'Full'])
723 group.add_option(
'--bnocomp', help=
'Don\'t perform model comparison. '
724 "Default: perform it.",
725 action=
'store_true', default=
False)
726 group.add_option(
'--baverage', help=
'Average over all possible parameters '
727 'instead of just taking the most probable set of parameters. '
728 'Default is not to perform the averaging.',
729 action=
'store_true', default=
False)
730 group.add_option(
'--blimit_fitting', metavar=
'NUM', default=-1, type=
'int',
731 help=
'To save resources, set the maximum number of points used in'
732 'the fitting step. Dataset will be subsampled if it is bigger than'
733 ' NUM. If NUM=-1 (default), all points will be used.')
734 group.add_option(
'--blimit_hessian', metavar=
'NUM', default=-1, type=
'int',
735 help=
'To save resources, set the maximum number of points used in'
736 ' the Hessian calculation (model comparison, options --baverage, '
737 'and --berror ). Dataset will be subsampled if it is bigger than'
738 'NUM. If NUM=-1 (default), all points will be used.')
739 group.add_option(
'--berror', action=
'store_true', default=
False,
740 help=
"Compute error bars on all parameters even in case where "
741 "model comparison was disabled. Involves the computation of a "
742 "Hessian. Default: no extra computation.")
744 group = optparse.OptionGroup(parser, title=
"Rescaling (Step 3)",
745 description=
"Find the most probable scaling factor of all "
746 "curves wrt the first curve.")
747 parser.add_option_group(group)
748 group.add_option(
'--creference', default=
'last', help=
"Define which "
749 "input curve the other curves will be rescaled to. Options are "
750 "first or last (default is last)", type=
"choice",
751 choices=[
'first',
'last'])
752 group.add_option(
'--cmodel', default=
'normal',
753 help=
"Which rescaling model to use to calculate gamma. "
754 "'normal-offset' for a normal model with offset, "
755 "'normal' (default) for a normal model with zero offset "
756 "and 'lognormal' for a lognormal model.",
757 choices=[
'normal',
'normal-offset',
'lognormal'], type=
'choice')
758 group.add_option(
'--cnpoints', type=
"int", default=200, metavar=
"NUM",
759 help=
"Number of points to use to compute gamma (default 200)")
761 group = optparse.OptionGroup(parser, title=
"Classification (Step 4)",
762 description=
"Classify the mean curves by comparing them using "
763 "a two-sided two-sample student t test")
764 parser.add_option_group(group)
765 group.add_option(
'--dalpha', help=
'type I error (default 0.05)',
766 type=
"float", default=0.05, metavar=
'ALPHA')
768 group = optparse.OptionGroup(parser, title=
"Merging (Step 5)",
769 description=
"Collect compatible data and produce best estimate "
771 parser.add_option_group(group)
772 group.add_option(
'--eextrapolate', metavar=
"NUM", help=
'Extrapolate '
773 "NUM percent outside of the curve's bounds. Example: if NUM=50 "
774 "and the highest acceptable data point is at q=0.3, the mean will "
775 "be estimated up to q=0.45. Default is 0 (just extrapolate at low "
776 "angle).", type=
"int", default=0)
777 group.add_option(
'--enoextrapolate', action=
'store_true', default=
False,
778 help=
"Don't extrapolate at all, even at low angle (default False)")
779 group.add_option(
'--emean', help=
'Which most complex mean function '
780 'to try for model comparison.'
781 ' See --bmean. Default is Full', type=
"choice",
782 default=
"Full", choices=[
'Simple',
'Generalized',
'Full',
'Flat'])
783 group.add_option(
'--enocomp', help=
'Don\'t perform model comparison, '
784 'see --bnocomp. Default is not to perform it.',
785 action=
'store_true', default=
False)
786 group.add_option(
'--eaverage', help=
"Average over all possible parameters "
787 "instead of just taking the most probable set of parameters. "
788 "Default is not to perform the averaging.",
789 action=
'store_true', default=
False)
790 group.add_option(
'--elimit_fitting', metavar=
'NUM', default=-1, type=
'int',
791 help=
'To save resources, set the maximum number of points used in'
792 'the fitting step. Dataset will be subsampled if it is bigger than'
793 ' NUM. If NUM=-1 (default), all points will be used.')
794 group.add_option(
'--elimit_hessian', metavar=
'NUM', default=-1, type=
'int',
795 help=
'To save resources, set the maximum number of points used in'
796 ' the Hessian calculation (model comparison, options --eaverage, '
797 'and --eerror ). Dataset will be subsampled if it is bigger than'
798 'NUM. If NUM=-1 (default), all points will be used.')
799 group.add_option(
'--eerror', action=
'store_true', default=
False,
800 help=
"Compute error bars on all parameters even in case where "
801 "model comparison was disabled. Involves the computation of a "
802 "Hessian. Default: no extra computation.")
    (args, files) = parser.parse_args()
        parser.error("No files specified")

def parse_filenames(fnames, defaultvalue=10):
    for fname in fnames:
            tokens = fname.split('=')
            paths = glob.glob(tokens[0])
            if len(paths) == 0:
                sys.exit("File %s not found!" % fname)
            Nreps.append(int(tokens[1]))
            paths = glob.glob(fname)
            if len(paths) == 0:
                sys.exit("File %s not found!" % fname)
            Nreps.append(defaultvalue)
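
# Standalone sketch of the "file.txt=20" syntax handled by parse_filenames
# above: an optional "=N" suffix gives the number of repetitions, otherwise
# the default is used.  Illustrative helper only (no globbing, no existence
# check).
def split_nreps(fname, defaultvalue=10):
    if '=' in fname:
        path, nreps = fname.split('=')
        return path, int(nreps)
    return fname, defaultvalue

print split_nreps('lyzexp_01.dat=20')   # ('lyzexp_01.dat', 20)
print split_nreps('lyzexp_02.dat')      # ('lyzexp_02.dat', 10)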
def get_global_scaling_factor(file):
    """get an order of magnitude for 100/I(0)"""
    p.add_data(file, positive=True)
    data = p.get_raw_data()
    return 100. / mean([i[1] for i in data[:m]])
def create_profile(file, nreps, scale=1):
    p.add_data(file, positive=True, scale=scale)
def ttest_one(mu, s, n):
    """one-sample right-tailed t-test against 0
    """
    pval = (1 - student_t.cdf(t, nu))
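
# Minimal sketch of a one-sample right-tailed t-test against 0, assuming mu is
# a sample mean, s its standard error and n the number of repeats; the elided
# body above may form t and nu from (mu, s, n) differently.
from scipy.stats import t as student_t

def ttest_one_sketch(mu, s, n):
    t = mu / s            # test statistic
    nu = n - 1            # degrees of freedom
    pval = 1 - student_t.cdf(t, nu)
    return pval, t, nu

print ttest_one_sketch(2.0, 0.5, 10)   # small p-value: mean is clearly > 0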
def ttest_two(mu1, s1, n1, mu2, s2, n2):
    """two-sample two-sided t-test
    """
    t = (mu1 - mu2) / sqrt(v1 + v2)
    nu = (v1 + v2) ** 2 / (v1 ** 2 / (n1 - 1) + v2 ** 2 / (n2 - 1))
    pval = 2 * (1 - student_t.cdf(abs(t), nu))
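
# Self-contained sketch of the Welch two-sample test above, assuming the
# elided v1, v2 are the squared standard errors s1**2/n1 and s2**2/n2
# (Welch-Satterthwaite approximation for the degrees of freedom nu).
from scipy.stats import t as student_t

def ttest_two_sketch(mu1, s1, n1, mu2, s2, n2):
    v1 = s1 ** 2 / n1
    v2 = s2 ** 2 / n2
    t = (mu1 - mu2) / (v1 + v2) ** 0.5
    nu = (v1 + v2) ** 2 / (v1 ** 2 / (n1 - 1) + v2 ** 2 / (n2 - 1))
    pval = 2 * (1 - student_t.cdf(abs(t), nu))
    return pval, t, nu

print ttest_two_sketch(10.0, 1.0, 10, 10.5, 1.2, 10)   # high p-value: compatible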
def setup_particles(initvals):
    """ initvals : dict of initial values for the parameters
    returns: model, dict(*particles), dict(mean, covariance)
    """
        IMP.isd.NuisanceRangeModifier(), None, G, "Constrain_G"))
        IMP.isd.NuisanceRangeModifier(), None, Rg, "Constrain_Rg"))
        IMP.isd.NuisanceRangeModifier(), None, d, "Constrain_d"))
        IMP.isd.NuisanceRangeModifier(), None, A, "Constrain_A"))
        IMP.isd.NuisanceRangeModifier(), None, s, "Constrain_s"))
        IMP.isd.NuisanceRangeModifier(), None, lam, "Constrain_lambda"))
        IMP.isd.NuisanceRangeModifier(), None, tau, "Constrain_tau"))
        IMP.isd.NuisanceRangeModifier(), None, sigma, "Constrain_sigma2"))
    sigma.set_lower(0.01)
    sigma.set_upper(1000.)
    particles['tau'] = tau
    particles['lambda'] = lam
    particles['sigma2'] = sigma
    functions['mean'] = m
    functions['covariance'] = w
    return model, particles, functions
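
# Rough sketch of what the "Simple" mean function's parameters represent,
# assuming a standard Guinier form I(q) ~ A + G*exp(-(q*Rg)**2/3); the actual
# mean functions used here live in IMP.isd, and the "Generalized"/"Full"
# models add the d and s exponents, so treat this only as an illustration of
# the roles of A, G and Rg.
from numpy import exp, linspace

def guinier_mean(q, A, G, Rg):
    return A + G * exp(-(q * Rg) ** 2 / 3.)

for q in linspace(0.01, 0.1, 4):
    print "%.3f %.2f" % (q, guinier_mean(q, A=0.0, G=100.0, Rg=20.0))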
def do_conjugategradients(model, nsteps):

def do_quasinewton(model, nsteps):

def setup_process(data, initvals, subs, maxpoints=-1):
    model, particles, functions = setup_particles(initvals)
    q = [[i] for i in data['q'][::subs]]
    I = data['I'][::subs]
    err = data['err'][::subs]
    q, I, err = zip(*subsample(q, zip(q, I, err), maxpoints))
            data['N'], functions['mean'], functions['covariance'],
    return model, particles, functions, gp
def get_initial_Rg(data):
    fl, name = tempfile.mkstemp(text=True)
    handle = os.fdopen(fl, 'w')
    for dat in zip(data['q'], data['I'], data['err']):
        handle.write(' '.join(['%G' % i for i in dat]))
    Rg = profile.radius_of_gyration()
    return Rg
975 def set_defaults_mean(data, particles, mean_function):
978 npoints=min(len(data[
'q'])/10,50)
979 Ivals = [data[
'I'][i]
for i
in xrange(len(data[
'q']))][:npoints]
980 particles[
'G'].set_nuisance(mean(Ivals))
981 particles[
'G'].set_lower(min(Ivals))
982 particles[
'G'].set_upper(2*max(Ivals))
984 particles[
'A'].set_nuisance_is_optimized(
True)
985 if mean_function ==
'Flat':
986 particles[
'G'].set_lower(0)
987 particles[
'G'].set_nuisance(0)
988 particles[
'G'].set_upper(0)
989 particles[
'G'].set_nuisance_is_optimized(
False)
990 particles[
'Rg'].set_nuisance_is_optimized(
False)
991 particles[
'd'].set_nuisance_is_optimized(
False)
992 particles[
's'].set_nuisance_is_optimized(
False)
994 particles[
'G'].set_nuisance_is_optimized(
True)
995 particles[
'Rg'].set_nuisance_is_optimized(
True)
996 particles[
'Rg'].set_nuisance(get_initial_Rg(data))
997 if mean_function ==
'Simple':
998 particles[
'd'].set_nuisance_is_optimized(
False)
1000 particles[
'd'].set_nuisance_is_optimized(
True)
1001 if mean_function ==
'Generalized':
1002 particles[
's'].set_nuisance_is_optimized(
False)
1004 particles[
's'].set_nuisance_is_optimized(
True)
1005 particles[
'tau'].set_nuisance_is_optimized(
False)
1006 particles[
'lambda'].set_nuisance_is_optimized(
False)
1007 particles[
'sigma2'].set_nuisance_is_optimized(
False)
1009 def find_fit_mean(data, initvals, verbose, mean_function):
1010 model, particles, functions, gp = \
1011 setup_process(data, initvals, 1)
1014 model.add_restraint(gpr)
1015 set_defaults_mean(data, particles, mean_function)
1019 if particles[
'tau'].has_lower():
1020 taulow = particles[
'tau'].get_lower()
1021 particles[
'tau'].set_nuisance(0.)
1022 particles[
'tau'].set_lower(0.)
1042 rgopt = particles[
'Rg'].get_nuisance_is_optimized()
1043 particles[
'Rg'].set_nuisance_is_optimized(
False)
1045 model.evaluate(
False)
1046 do_conjugategradients(model,100)
1047 particles[
'Rg'].set_nuisance_is_optimized(rgopt)
1051 model.evaluate(
False)
1052 do_conjugategradients(model,100)
1056 model.evaluate(
False)
1057 do_quasinewton(model,100)
1073 for i
in particles.values():
1074 if i.get_nuisance_is_optimized():
1075 if isnan(i.get_nuisance()):
1078 particles[
'tau'].set_lower(taulow)
1079 model.evaluate(
False)
1080 newvals = dict([(k,v.get_nuisance())
1081 for (k,v)
in particles.iteritems()])
1084 def set_defaults_covariance(data, particles, functions, args):
1086 chisquares = [functions[
'mean']([q])[0]
for q
in data[
'q']]
1087 chisquares = [(data[
'I'][i]-chisquares[i])**2 + data[
'err'][i]**2
1088 for i
in xrange(len(data[
'q']))]
1089 noiselevel = (sum(chisquares)/len(chisquares))
1090 errorlevel = (sum([data[
'err'][i]**2
for i
in xrange(len(data[
'q']))])
1092 sigmaval = 10*noiselevel/errorlevel
1093 particles[
'sigma2'].set_nuisance(sigmaval)
1095 particles[
'tau'].set_nuisance(sigmaval)
1097 def find_fit_lambda(data, initvals, args, verbose):
1100 meandist = mean(array(data[
'q'][1:])-array(data[
'q'][:-1]))
1102 for subs,nsteps
in [(10,1000),(5,500)]:
1103 model, particles, functions, gp = \
1104 setup_process(data, initvals, subs)
1107 set_defaults_covariance(data, particles, functions, args)
1108 model.evaluate(
False)
1113 particles[
'lambda'].set_lower(2*meandist)
1114 particles[
'G'].set_nuisance_is_optimized(
False)
1115 particles[
'Rg'].set_nuisance_is_optimized(
False)
1116 particles[
'd'].set_nuisance_is_optimized(
False)
1117 particles[
's'].set_nuisance_is_optimized(
False)
1118 particles[
'A'].set_nuisance_is_optimized(
False)
1119 particles[
'tau'].set_nuisance_is_optimized(
False)
1120 particles[
'lambda'].set_nuisance_is_optimized(
True)
1121 particles[
'sigma2'].set_nuisance_is_optimized(
False)
1123 model.add_restraint(gpr)
1124 do_quasinewton(model,nsteps)
1125 model.evaluate(
False)
1126 initvals = dict([(k,v.get_nuisance())
1127 for (k,v)
in particles.iteritems()])
1131 def find_fit_covariance(data, initvals, args, verbose):
1134 model, particles, functions, gp = \
1135 setup_process(data, initvals, 1, args)
1136 set_defaults_covariance(data, particles, functions, args)
1137 meandist = mean(array(data[
'q'][1:])-array(data[
'q'][:-1]))
1138 particles[
'lambda'].set_lower(meandist)
1140 particles[
'G'].set_nuisance_is_optimized(
False)
1141 particles[
'Rg'].set_nuisance_is_optimized(
False)
1142 particles[
'd'].set_nuisance_is_optimized(
False)
1143 particles[
's'].set_nuisance_is_optimized(
False)
1144 particles[
'A'].set_nuisance_is_optimized(
False)
1145 particles[
'tau'].set_nuisance_is_optimized(
True)
1146 particles[
'lambda'].set_nuisance_is_optimized(
True)
1147 particles[
'sigma2'].set_nuisance_is_optimized(
True)
1149 model.add_restraint(gpr)
1151 do_quasinewton(model,30)
1152 model.evaluate(
False)
1153 initvals = dict([(k,v.get_nuisance())
for (k,v)
in particles.iteritems()])
1157 minval=(model.evaluate(
False),particles[
'lambda'].get_nuisance())
1158 initvals = dict([(k,v.get_nuisance())
for (k,v)
in particles.iteritems()])
1159 particles[
'lambda'].set_lower(0.)
    for l in linspace(.01, 10, num=100):
        particles['lambda'].set_nuisance(l)
        ene = model.evaluate(True)
        deriv = particles['lambda'].get_nuisance_derivative()
        print "plotrange", l, ene, deriv
1167 print "minimum: ene=%f lambda=%f" % minval
1170 for k,v
in particles.items():
1171 v.set_nuisance(initvals[k])
1172 for q,I,err
in zip(data[
'q'],data[
'I'],data[
'err']):
1173 gpval=gp.get_posterior_mean([q])
1174 avg=functions[
'mean']([q])[0]
1175 print "residuals",q,I,err,gpval,avg,(gpval-avg),(I-avg)
1178 def find_fit_by_minimizing(data, initvals, verbose, lambdalow):
1179 model, particles, functions, gp = \
1180 setup_process(data, initvals, 1)
1182 model.add_restraint(gpr)
1183 meandist = mean(array(data[
'q'][1:])-array(data[
'q'][:-1]))
1184 particles[
'lambda'].set_lower(max(2*meandist,lambdalow))
1185 particles[
'lambda'].set_upper(max(data[
'q'])/2.)
1186 particles[
'lambda'].set_nuisance(max(data[
'q'])/10.)
1187 lambdamin = particles[
'lambda'].get_lower()
1188 lambdamax = particles[
'lambda'].get_upper()
1189 particles[
'sigma2'].set_lower(1)
1190 particles[
'sigma2'].set_upper(1000)
1191 particles[
'sigma2'].set_nuisance(100.0)
1192 sigmamin = particles[
'sigma2'].get_lower()
1193 sigmamax = particles[
'sigma2'].get_upper()
1194 particles[
'tau'].set_lower(0.001)
1195 particles[
'tau'].set_nuisance(10.0)
1196 taumin = particles[
'tau'].get_lower()
1199 particles[
'lambda'].set_nuisance_is_optimized(
True)
1200 particles[
'sigma2'].set_nuisance_is_optimized(
False)
1201 particles[
'tau'].set_nuisance_is_optimized(
True)
1203 while len(minimas) < 5:
1204 initm = model.evaluate(
False)
1205 initt = particles[
'tau'].get_nuisance()
1206 inits = particles[
'sigma2'].get_nuisance()
1207 initl = particles[
'lambda'].get_nuisance()
1215 minimas.append((model.evaluate(
False), initm,
1216 particles[
'tau'].get_nuisance(), initt,
1217 particles[
'sigma2'].get_nuisance(), inits,
1218 particles[
'lambda'].get_nuisance(), initl))
1220 print " tau",initt, minimas[-1][2]
1221 print " sig",inits, minimas[-1][4]
1222 print " lam",initl, minimas[-1][6]
1223 print " mod",initm, minimas[-1][0]
1225 particles[
'tau'].set_nuisance(
1226 exp(random.uniform(
log(taumin)+1,
log(taumax)-1)))
1227 particles[
'sigma2'].set_nuisance(
1228 exp(random.uniform(
log(sigmamin)+1,
log(sigmamax)-1)))
1229 particles[
'lambda'].set_nuisance(
1230 exp(random.uniform(
log(lambdamin)+1,
log(lambdamax)-1)))
1231 minimas.sort(key=
lambda a:a[0])
1232 print "best is ",minimas[0][0]
1233 particles[
'tau'].set_nuisance(minimas[0][2])
1234 particles[
'sigma2'].set_nuisance(minimas[0][4])
1235 particles[
'lambda'].set_nuisance(minimas[0][6])
1236 ene = model.evaluate(
False)
1237 initvals = dict([(k,v.get_nuisance())
for (k,v)
in particles.iteritems()])
1240 def find_fit_by_gridding(data, initvals, verbose, lambdalow):
1241 """use the fact that sigma2 can be factored out of the covariance matrix and
1242 drawn from a gamma distribution. Calculates a 2D grid on lambda (D1) and
1243 tau**2/sigma2 (D2) to get the minimum value.
1245 model, particles, functions, gp = \
1246 setup_process(data, initvals, 1)
1248 model.add_restraint(gpr)
1249 meandist = mean(array(data[
'q'][1:])-array(data[
'q'][:-1]))
1250 particles[
'lambda'].set_lower(max(2*meandist,lambdalow))
1251 lambdamin = particles[
'lambda'].get_lower()
1255 particles[
'tau'].set_lower(0.)
1256 particles[
'sigma2'].set_lower(0.)
1258 particles[
'sigma2'].set_nuisance(100.0)
1259 particles[
'sigma2'].set_upper(1000.0)
1260 particles[
'tau'].set_nuisance(10.0)
1263 for lambdaval
in logspace(
log(lambdamin),
log(lambdamax),
1264 base=exp(1),num=numpoints):
1265 for rel
in logspace(-2, 2, num=numpoints):
1266 particles[
'lambda'].set_nuisance(lambdaval)
1268 tauval = particles[
'tau'].get_nuisance()
1269 if tauval**2/rel > particles[
'sigma2'].get_upper():
1271 particles[
'sigma2'].set_nuisance(tauval**2/rel)
1275 exponent = gpr.get_minus_exponent()
1276 if isnan(exponent)
or exponent <=0:
1278 print "got invalid exponent at grid point "\
1279 "lambda=%f rel=%f exp=%s" % (lambdaval,rel, exponent)
1282 redexp = tauval * exponent
1284 tauval = (redexp/(len(data[
'q'])+2))**.5
1285 sigmaval = tauval**2/rel
1288 if sigmaval > particles[
'sigma2'].get_upper():
1291 particles[
'tau'].set_nuisance(tauval)
1293 if sigmaval > particles[
'sigma2'].get_upper():
1296 particles[
'sigma2'].set_nuisance(sigmaval)
1297 ene = model.evaluate(
False)
1298 gridvalues.append((lambdaval,rel,sigmaval,tauval,ene))
1305 print "skipped another " + str(experror-2) +
" similar errors"
1306 particles[
'tau'].set_lower(0.01)
1307 particles[
'tau'].set_upper(100)
1308 particles[
'sigma2'].set_lower(1.)
1309 particles[
'sigma2'].set_upper(1000)
1310 if len(gridvalues) == 0:
1312 minval = gridvalues[0][:]
1313 for i
in gridvalues:
1314 if i[4] < minval[4]:
1320 particles[
'lambda'].set_nuisance_is_optimized(
True)
1321 particles[
'sigma2'].set_nuisance_is_optimized(
True)
1322 particles[
'tau'].set_nuisance_is_optimized(
True)
1326 particles[
'lambda'].set_nuisance(minval[0])
1327 particles[
'sigma2'].set_nuisance(minval[2])
1328 particles[
'tau'].set_nuisance(minval[3])
1329 do_quasinewton(model,10)
1330 bestmin.append((model.evaluate(
False),
1331 particles[
'lambda'].get_nuisance(),
1332 particles[
'sigma2'].get_nuisance(),
1333 particles[
'tau'].get_nuisance()))
1335 particles[
'lambda'].set_nuisance(bestmin[0][1])
1336 particles[
'sigma2'].set_nuisance(bestmin[0][2])
1337 particles[
'tau'].set_nuisance(bestmin[0][3])
1357 initvals = dict([(k,v.get_nuisance())
for (k,v)
in particles.iteritems()])
def bayes_factor(data, initvals, verbose, mean_func, maxpoints):
    """given optimized values for the parameters (in initvals), the data, and
    the model specification (in mean_func), return
    0 the number of parameters (Np) that have been optimized
    2 hessian (H) wrt parameter vector
    3 1/2*log(abs(det(H)))
    4 minus log maximum posterior value (MP)
    5 minus log maximum likelihood value
    6 minus log maximum prior value
    7 exp(-MP)*sqrt(2Pi)^Np*det(H)^(-1/2) (bayes factor)
    8 MP - Np/2*log(2Pi) + 1/2*log(det(H)) (minus log bayes factor)
    """
    model, particles, functions, gp = setup_process(data, initvals, 1,
                                                    maxpoints=maxpoints)
    model.add_restraint(gpr)
    particles['A'].set_nuisance_is_optimized(True)
    if mean_func == 'Flat':
        particles['G'].set_nuisance_is_optimized(False)
        particles['Rg'].set_nuisance_is_optimized(False)
        particles['d'].set_nuisance_is_optimized(False)
        particles['s'].set_nuisance_is_optimized(False)
    else:
        particles['G'].set_nuisance_is_optimized(True)
        particles['Rg'].set_nuisance_is_optimized(True)
        if mean_func == 'Simple':
            particles['d'].set_nuisance_is_optimized(False)
        else:
            particles['d'].set_nuisance_is_optimized(True)
            if mean_func == 'Generalized':
                particles['s'].set_nuisance_is_optimized(False)
            else:
                particles['s'].set_nuisance_is_optimized(True)
    particles['tau'].set_nuisance_is_optimized(True)
    particles['lambda'].set_nuisance_is_optimized(True)
    particles['sigma2'].set_nuisance_is_optimized(True)
    H = matrix(gpr.get_hessian(False))
    MP = model.evaluate(False)
    ML = gpr.unprotected_evaluate(None)
    try:
        retval = linalg.slogdet(H)
        if retval[0] == 0 and retval[1] == -inf:
            print "Warning: skipping model %s" % mean_func
        logdet = retval[1] / 2.
    except AttributeError:
        retval = gpr.get_logdet_hessian()
        print "Warning: re-skipping model %s" % mean_func
    except IMP.ModelException:
        print "Warning: Hessian is not positive definite"
    return (Np, (2 * pi) ** (Np / 2.), H, logdet, MP, ML, MP - ML,
            exp(-MP) * (2 * pi) ** (Np / 2.) * exp(-logdet),
            MP - Np / 2. * log(2 * pi) + logdet)
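
# Sketch of how the tuple returned above is used for model comparison: entry 8
# is the minus-log Bayes factor, and the model with the smallest value wins
# (see find_fit below).  Toy numbers only.
free_energies = {'Flat': 152.3, 'Simple': 120.7, 'Generalized': 118.9,
                 'Full': 121.4}   # pretend these are entry 8 of bayes_factor()
best = sorted([(fe, name) for name, fe in free_energies.items()])[0][1]
print "best mean function:", best   # Generalized, in this made-up example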
1426 def find_fit(data, initvals, verbose, model_comp=False, model_comp_maxpoints=-1,
1427 mean_function=
'Simple', lambdamin=0.005):
1435 for i
in [
'Flat',
'Simple',
'Generalized',
'Full']:
1437 if i==mean_function:
1440 functions.append(mean_function)
1445 for mean_func
in functions:
1447 sys.stdout.write(
" "+mean_func+
": mean ")
1450 param_vals[mean_func] = \
1451 find_fit_mean(data, initvals, verbose, mean_func)
1452 except FittingError:
1458 raise FittingError,
"Error while fitting! Choose"\
1459 " another model or do model comparison"
1461 sys.stdout.write(
"A=%1.2f " % param_vals[mean_func][
'A'])
1462 if mean_func
in [
"Simple",
"Generalized",
"Full"]:
1463 sys.stdout.write(
"G=%1.2f " % param_vals[mean_func][
'G'])
1464 sys.stdout.write(
"Rg=%1.2f " % param_vals[mean_func][
'Rg'])
1465 if mean_func
in [
"Generalized",
"Full"]:
1466 sys.stdout.write(
"d=%1.2f " % param_vals[mean_func][
'd'])
1467 if mean_func ==
"Full":
1468 sys.stdout.write(
"s=%1.2f " % param_vals[mean_func][
's'])
1469 sys.stdout.write(
"covariance ")
1474 param_vals[mean_func] = \
1475 find_fit_by_gridding(data, param_vals[mean_func], verbose,
1477 except FittingError:
1483 raise FittingError,
"Error while fitting! Choose"\
1484 " another model or do model comparison"
1486 for i
in [
'tau',
'lambda',
'sigma2']:
1487 sys.stdout.write(
"%s=%1.2f " % (i,param_vals[mean_func][i]))
1491 if len(functions) == 1:
1492 return functions[0], param_vals[functions[0]], {}
1495 print " -log(Bayes Factor):",
1500 free_energies[f]=bayes_factor(data, param_vals[f], verbose, f,
1501 model_comp_maxpoints)
1503 free_energies[f] = [-1]+[inf]*8
1505 print "%.1f" % free_energies[f][8],
1509 minf = sorted([(free_energies[i][8],i)
for i
in functions])[0][1]
1510 return minf,param_vals[minf],free_energies
def create_intervals_from_data(profile, flag):
    """This function creates intervals for the mean function so that
    the flag for the mean is consistent with that of the data.
    example: q1:True, q2:False, q3:True q4:True
    creates [q1:q2[ (True), [q2:q3[ (False) and [q3:q4[ (True)
    """
    if flag not in profile.get_flag_names():
        raise KeyError, "No such flag: %s" % flag
    data = profile.get_data(colwise=True)
    for q, b in zip(data['q'], data[flag]):
            profile.set_flag_interval(flag, oldval, q, oldb)
    profile.set_flag_interval(flag, oldval, q, oldb)
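
# Standalone sketch of the interval construction described in the docstring
# above: consecutive points sharing a flag value collapse into one
# [q_start, q_change[ interval.  Toy helper, not the profile API.
def intervals_from_points(qs, flags):
    out = []
    start, current = qs[0], flags[0]
    for q, b in zip(qs[1:], flags[1:]):
        if b != current:
            out.append((start, q, current))
            start, current = q, b
    out.append((start, qs[-1], current))
    return out

print intervals_from_points([0.01, 0.02, 0.03, 0.04], [True, False, True, True])
# [(0.01, 0.02, True), (0.02, 0.03, False), (0.03, 0.04, True)]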
def rescale_curves(refdata, data, normal=False, offset=False):
    """Find gamma and (optionally) c so that
    normal model: I0 = gamma*(I1 + c)
    lognormal model: log I0 = log(gamma*I1)
    if offset==False we have c=0
    """
    I0 = array(refdata['I'])
    s0 = array(refdata['err'])
    s1 = array(data['err'])
    weights = (s0 ** 2 + (s1 * I0 / I1) ** 2) ** (-1)
    return (weights * I0 * I1).sum() / (weights * I1 ** 2).sum(), 0
    weights = (s0 ** 2 / I0 ** 2 + s1 ** 2 / I1 ** 2) ** (-1)
    lg = (weights * log(I0 / I1)).sum() / weights.sum()
    weights = (s0 ** 2 + (s1 * I0 / I1) ** 2) ** (-1)
    iexp = (weights * I0).sum() / weights.sum()
    icalc = (weights * I1).sum() / weights.sum()
    icalc2 = (weights * I1 * I1).sum() / weights.sum()
    icie = (weights * I1 * I0).sum() / weights.sum()
    c = (icalc * icie - icalc2 * iexp) / (icalc * iexp - icie)
    gamma = (icie - iexp * icalc) / (icalc2 - icalc ** 2)
    raise NotImplementedError
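
# Numeric sketch of the normal-model, zero-offset rescaling above:
# gamma = sum(w*I0*I1)/sum(w*I1**2) with w = 1/(s0**2 + (s1*I0/I1)**2) is the
# weighted least-squares estimate of gamma in I0 = gamma*I1.  Toy data with a
# known gamma=2.5 to show the estimate recovers it.
from numpy import array

I1 = array([100.0, 80.0, 60.0, 40.0])
I0 = 2.5 * I1
s0 = array([2.0, 2.0, 2.0, 2.0])
s1 = array([1.0, 1.0, 1.0, 1.0])
weights = (s0 ** 2 + (s1 * I0 / I1) ** 2) ** (-1)
gamma = (weights * I0 * I1).sum() / (weights * I1 ** 2).sum()
print gamma   # 2.5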
1571 def write_individual_profile(prof, qvals, args):
1572 destname = os.path.basename(prof.get_filename())
1573 if args.outlevel ==
'sparse':
1574 dflags = [
'q',
'I',
'err']
1575 mflags = [
'q',
'I',
'err']
1576 elif args.outlevel ==
'normal':
1577 dflags = [
'q',
'I',
'err',
'agood']
1578 mflags = [
'q',
'I',
'err',
'mean',
'agood']
1582 prof.write_data(destname, bool_to_int=
True, dir=args.destdir,
1583 header=args.header, flags=dflags)
1584 if args.postpone_cleanup
or args.stop !=
"cleanup" :
1586 prof.write_mean(destname, bool_to_int=
True, dir=args.destdir,
1587 header=args.header, average=args.eaverage, num=args.npoints,
1590 qvalues = prof.get_data(colwise=
True)[
'q']
1593 qvalues = qvals[where(qvals >= qmin)]
1594 qvalues = qvalues[where(qvalues <= qmax)]
1595 prof.write_mean(destname, bool_to_int=
True, dir=args.destdir,
1596 header=args.header, average=args.eaverage, qvalues=qvalues,
1599 def write_merge_profile(merge,qvals, args):
1600 if args.outlevel ==
'sparse':
1601 dflags = [
'q',
'I',
'err']
1602 mflags = [
'q',
'I',
'err']
1603 elif args.outlevel ==
'normal':
1604 dflags = [
'q',
'I',
'err',
'eorigin',
'eoriname']
1605 mflags = [
'q',
'I',
'err',
'mean',
'eorigin',
'eoriname',
'eextrapol']
1609 if args.verbose > 2:
1611 merge.write_data(merge.get_filename(), bool_to_int=
True, dir=args.destdir,
1612 header=args.header, flags=dflags)
1613 qmax = max([i[1]
for i
in merge.get_flag_intervals(
'eextrapol')])
1614 if args.enoextrapolate:
1615 qmin = min([i[0]
for i
in merge.get_flag_intervals(
'eextrapol')])
1619 if args.verbose > 2:
1621 merge.write_mean(merge.get_filename(), bool_to_int=
True,
1622 dir=args.destdir, qmin=qmin, qmax=qmax, header=args.header,
1623 flags=mflags, num=args.npoints, average=args.eaverage,
1624 verbose=(args.verbose > 2))
1626 if args.verbose > 2:
1628 qvalues = qvals[where(qvals >= qmin)]
1629 qvalues = qvals[where(qvalues <= qmax)]
1630 merge.write_mean(merge.get_filename(), bool_to_int=
True,
1631 dir=args.destdir, qvalues=qvalues, header=args.header,
1632 flags=mflags, average=args.eaverage, verbose=(args.verbose > 2))
def get_hessian_stats(profile, nmax):
    gp = profile._setup_gp(nmax)
    hessian = profile._get_hessian(gp)
    hessian_names = []
    for k in ['G', 'Rg', 'd', 's', 'A', 'sigma2', 'tau', 'lambda']:
        if profile.particles[k].get_nuisance_is_optimized():
            hessian_names.append(k)
    return array(hessian), hessian_names
1643 def write_summary_file(merge, profiles, args):
1644 fl=open(os.path.join(args.destdir,args.sumname),
'w')
1646 fl.write(
"#STATISTICAL MERGE: SUMMARY\n\n")
1647 fl.write(
"Ran with the following arguments:\n")
1648 fl.write(os.path.basename(sys.argv[0]) +
" "
1649 +
" ".join(sys.argv[1:]) +
"\n\n")
1651 if args.stop ==
"merging":
1652 fl.write(
"Merge file\n"
1654 " Filename: " + merge.filename +
"\n")
1655 data = merge.get_data(colwise=
True)
1656 fl.write(
" Number of points: %d\n" % len(data[
'q']) +
1657 " Data range: %.5f %.5f\n" % (data[
'q'][0],data[
'q'][-1]))
1658 for i
in xrange(len(profiles)):
1659 fl.write(
" %d points from profile %d (%s)\n" %
1660 (len([1
for k
in data[
'eorigin']
if k == i]),
1661 i, profiles[i].filename))
1662 fl.write(
" Gaussian Process parameters\n")
1663 fl.write(
" mean function : %s\n" % merge.mean)
1664 data = merge.get_params()
1665 if args.eerror
or (
not args.enocomp):
1666 Hessian, Hess_names = get_hessian_stats(merge,args.elimit_hessian)
1669 if Hessian
is not None:
1670 for i,k
in enumerate(Hess_names):
1671 fl.write(
" %s : %f +- %f\n" % (k,data[k],sqrt(Hessian[i][i])))
1673 for i
in sorted(data.keys()):
1674 fl.write(
" %s : %f\n" % (i,data[i]))
1675 fl.write(
" Calculated Values\n")
1676 Q1Rg = sqrt((data[
'd']-data[
's'])*(3-data[
's'])/2)
1677 fl.write(
" Q1 : %f\n" % (Q1Rg/data[
'Rg']))
1678 fl.write(
" Q1.Rg : %f\n" % Q1Rg)
1679 fl.write(
" I(0) : %f\n" % \
1680 (merge.get_mean(qvalues=[0],
1681 average=args.eaverage)[0][1]))
1682 if not args.enocomp:
1683 fl.write(
" Model Comparison : num_params -log10(Bayes Factor) "
1684 "-log(Posterior) -log(Likelihood) BIC AIC\n")
1685 for i
in merge.bayes:
1686 name =
'*'+i
if i==merge.mean
else i
1687 fl.write(
" %s : %d\t%f\t%f\t%f\t%f\t%f\n" %
1688 (name, merge.bayes[i][0], merge.bayes[i][8]/
log(10),
1689 merge.bayes[i][4], merge.bayes[i][5],
1690 -2*merge.bayes[i][5]
1691 +merge.bayes[i][0]*
log(len(merge.get_raw_data())),
1692 -2*merge.bayes[i][5]+2*merge.bayes[i][0]))
1693 fl.write(
" Model Comparison : best model\n")
1694 fl.write(
" Name : %s\n" % merge.mean)
1695 fl.write(
" Number of parameters : %d\n" %
1696 merge.bayes[merge.mean][0])
1697 fl.write(
" -log(Posterior) : %f\n" %
1698 merge.bayes[merge.mean][4])
1699 fl.write(
" -log(Likelihood) : %f\n" %
1700 merge.bayes[merge.mean][5])
1701 if Hessian
is not None:
1702 fl.write(
" Hessian matrix : ")
1703 fl.write(
" ".join(Hess_names))
1707 fl.write(
" ".join([
'{0: <12}'.format(s)
for s
in i]))
1711 fl.write(
"Merge step skipped\n\n")
1713 for i,p
in enumerate(profiles):
1715 fl.write(
"Input file %d\n" % i +
1717 " Filename: " + p.filename +
"\n")
1718 data = p.get_raw_data()
1719 fl.write(
" Number of points: %d \n" % len(data) +
1720 " Data range: %.5f %.5f \n" % (data[0][0],data[-1][0]))
1722 if not args.postpone_cleanup:
1723 data = p.get_data(filter=
'agood',colwise=
True)
1724 fl.write(
" 1. Cleanup\n" +
1725 " Number of significant points: %d \n" % \
1727 " Data range: %.5f %.5f \n" % \
1728 (data[
'q'][0],data[
'q'][-1]))
1729 if args.stop ==
"cleanup":
1730 fl.write(
" Skipped further steps\n\n")
1733 data = p.get_params()
1734 fl.write(
" 2. GP parameters (values for non-rescaled curve)\n")
1735 fl.write(
" mean function : %s\n" % p.mean)
1736 if args.berror
or (
not args.bnocomp):
1737 Hessian, Hess_names = get_hessian_stats(p,args.blimit_hessian)
1740 if Hessian
is not None:
1741 for i,k
in enumerate(Hess_names):
1742 fl.write(
" %s : %f +- %f\n" % (k,data[k],
1743 sqrt(Hessian[i][i])))
1745 for i
in sorted(data.keys()):
1746 fl.write(
" %s : %f\n" % (i,data[i]))
1747 fl.write(
" Calculated Values\n")
1748 qrg = sqrt((data[
'd']-data[
's'])*(3-data[
's'])/2)
1749 fl.write(
" Q1 : %f\n" % (qrg/data[
'Rg']))
1750 fl.write(
" Q1.Rg : %f\n" % qrg)
1751 fl.write(
" I(0) : %f\n" % (p.get_mean(qvalues=[0],
1752 average=args.eaverage)[0][1]))
1753 if Hessian
is not None:
1754 fl.write(
" Hessian matrix : ")
1755 fl.write(
" ".join(Hess_names))
1759 fl.write(
" ".join([
'{0: <12}'.format(s)
for s
in i]))
1761 if args.stop ==
"fitting":
1762 fl.write(
" Skipped further steps\n\n")
1765 data = p.get_gamma()
1766 fl.write(
" 3. Rescaling\n")
1767 if args.cmodel ==
'normal-offset':
1768 fl.write(
" normal model with offset\n")
1769 fl.write(
" offset : %f\n" % p.get_offset())
1770 elif args.cmodel ==
'normal':
1771 fl.write(
" normal model\n")
1773 fl.write(
" lognormal model\n")
1774 fl.write(
" gamma : %s\n" % data)
1775 if args.stop ==
"rescaling":
1776 fl.write(
" Skipped further steps\n\n")
1779 if args.postpone_cleanup:
1780 data = p.get_data(filter=
'agood',colwise=
True)
1781 fl.write(
" 1. Cleanup (postponed)\n" +
1782 " Number of significant points: %d \n" % \
1784 " Data range: %.5f %.5f \n" % \
1785 (data[
'q'][0],data[
'q'][-1]))
1786 if args.stop ==
"cleanup":
1787 fl.write(
" Skipped further steps\n\n")
1790 data = p.get_data(filter=
'dgood',colwise=
True)
1792 fl.write(
" 4. Classification\n" +
1793 " Number of valid points: 0\n")
1795 fl.write(
" 4. Classification\n" +
1796 " Number of valid points: %d \n" % len(data[
'q']) +
1797 " Data range: %.5f %.5f \n" % (data[
'q'][0],data[
'q'][-1]))
1800 fl.write(
"List of all options used for the merge:\n")
1801 fl.write(
' name (type) = "value"\n')
1802 for name
in [ i
for i
in dir(args)
if
1803 (
not i.startswith(
'_'))
1804 and (
not i
in [
'ensure_value',
'read_file',
'read_module'] ) ] :
1805 fl.write(
" %s " % name)
1806 val = getattr(args,name)
1807 fl.write(
"(%s)" % type(val).__name__)
1808 fl.write(
'= "%s"\n' % val)
    args, files = parse_args()
    if args.verbose >= 2:
        print "Parsing files and creating profile classes"
    filenames, Nreps = parse_filenames(files, defaultvalue=10)
    args.filenames = filenames
    scale = get_global_scaling_factor(filenames[-1])
    profiles = map(create_profile, filenames, Nreps, [scale] * len(filenames))
    return profiles, args
1825 def cleanup(profiles, args):
1826 """first stage of merge: discard low SNR data
1828 agood : True if SNR is high enough
1829 apvalue : p-value of the student t test
1831 verbose = args.verbose
1833 q_cutoff = args.acutoff
1835 print "1. cleanup ( alpha = %2G %% )" % (alpha*100)
1840 print " ",p.filename,
1842 p.new_flag(
'agood',bool)
1843 p.new_flag(
'apvalue',float)
1844 all_points_bad =
True
1847 for datum
in p.get_data():
1848 id,q,I,err = datum[:4]
1850 p.set_flag(id,
'agood',
False)
1851 p.set_flag(id,
'apvalue', -1)
1853 pval,t = ttest_one(I,err,N)[0:2]
1854 if pval > alpha
or had_outlier:
1855 p.set_flag(id,
'agood',
False)
1856 if q >= q_cutoff
and had_outlier ==
False:
1859 p.set_flag(id,
'agood',
True)
1861 all_points_bad =
False
1862 p.set_flag(id,
'apvalue', pval)
1864 print "Warning: all points in file %s have been discarded"\
1865 " on cleanup" % p.filename
1868 qvals = p.get_data(filter=
"agood", colwise=
True)[
'q']
1869 print "\tqmin = %.3f\tqmax = %.3f" % (qvals[0], qvals[-1])
1872 good_profiles.append(p)
1874 for p
in good_profiles:
1875 create_intervals_from_data(p,
'agood')
1876 return good_profiles, args
1878 def fitting(profiles, args):
1879 """second stage of merge: gp fitting
1880 sets and optimizes the interpolant and enables calls to get_mean()
1883 verbose = args.verbose
1884 maxpointsF = args.blimit_fitting
1885 maxpointsH = args.blimit_hessian
1886 mean_function = args.bmean
1887 model_comp =
not args.bnocomp
1892 print " ",p.filename,
1894 data = p.get_data(filter=
'agood',colwise=
True, maxpoints=maxpointsF)
1896 data = p.get_data(colwise=
True, maxpoints=maxpointsF)
1898 if len(data[
'q']) == maxpointsF:
1899 print " (subsampled fitting: %d points) " % maxpointsF,
1900 if model_comp
and maxpointsH > 0
and maxpointsH < len(data[
'q']):
1901 print " (subsampled hessian: %d points) " % maxpointsH,
1902 data[
'N'] = p.get_Nreps()
1906 initvals[
'd']=args.bd
1907 initvals[
's']=args.bs
1910 initvals[
'lambda']=1.
1911 initvals[
'sigma2']=1.
1912 mean, initvals, bayes = find_fit(data, initvals,
1913 verbose, model_comp=model_comp, model_comp_maxpoints=maxpointsH,
1914 mean_function=mean_function,
1915 lambdamin=args.lambdamin)
1916 model, particles, functions, gp = setup_process(data,initvals,1)
1918 p.set_interpolant(gp, particles, functions, mean, model, bayes,
1919 hessian=bayes[mean][2])
1921 p.set_interpolant(gp, particles, functions, mean, model, bayes)
1922 if verbose > 1
and model_comp:
1924 return profiles, args
1926 def rescaling(profiles, args):
1927 """third stage of merge: rescaling
1929 cgood : True if data point is both valid (wrt SNR) and in the validity
1930 domain of the rescaling reference curve (option --creference)
1931 sets profile.gamma to the correct value
1933 use_normal = args.cmodel.startswith(
'normal')
1934 use_offset = use_normal
and args.cmodel.endswith(
'offset')
1935 numpoints = args.cnpoints
1936 verbose = args.verbose
1937 reference = args.creference
1938 average = args.baverage
1940 print "3. rescaling",
1942 print "(normal model",
1944 print "(lognormal model",
1946 print "with constant offset)"
1948 print "without constant offset)"
1952 for ctr,p
in enumerate(profiles):
1954 p.new_flag(
'cgood',bool)
1955 pdata = p.get_data(colwise=
True)
1956 if 'agood' in pdata:
1957 for id,q,flag
in zip(pdata[
'id'],pdata[
'q'],pdata[
'agood']):
1958 refflag = pref.get_flag(q,
'agood')
1959 p.set_flag(id,
'cgood', flag
and refflag)
1961 refq = pref.get_data(colwise=
True)[
'q']
1962 for id,q
in zip(pdata[
'id'],pdata[
'q']):
1963 refflag = (refq[0] <= q <= refq[-1])
1964 p.set_flag(id,
'cgood', refflag)
1965 create_intervals_from_data(p,
'cgood')
1967 goodip = [i
for i
in p.get_flag_intervals(
'cgood')
if i[2]]
1968 totaldist = array([i[1]-i[0]
for i
in goodip]).sum()
1970 for interval
in goodip:
1974 numint = int(round(dist/totaldist*numpoints))
1977 for i
in xrange(numint):
1978 qvalues.append((float(i)/(numint-1))*dist + qmin)
1979 pvalues = p.get_mean(qvalues=qvalues, colwise=
True, average=average)
1980 if True in map(isnan,pvalues[
'I']):
1981 raise RuntimeError,
"Got NAN in I"
1982 if True in map(isnan,pvalues[
'err']):
1983 raise RuntimeError,
"Got NAN in err"
1984 prefvalues = pref.get_mean(qvalues=qvalues, colwise=
True,
1986 if True in map(isnan,prefvalues[
'I']):
1987 raise RuntimeError,
"Got NAN in ref I"
1988 if True in map(isnan,prefvalues[
'err']):
1989 raise RuntimeError,
"Got NAN in ref err"
1990 gammas.append(rescale_curves(prefvalues, pvalues,
1991 normal = use_normal, offset = use_offset))
2008 if reference ==
'first':
2014 for p,g
in zip(profiles,gammas):
2020 print " ",p.filename,
" gamma = %.3G" % gamma,
2022 print " offset = %.3G" % c,
2024 if True in map(isnan,gammas[-1]):
2025 raise RuntimeError,
"Got NAN in rescaling step, try another rescaling " \
2027 return profiles,args
2029 def classification(profiles, args):
2030 """fourth stage of merge: classify mean functions
2032 drefnum : number of the reference profile for this point
2033 drefname : associated filename
2034 dgood : True if this point is compatible with the reference
2035 and False otherwise. Undefined if 'agood' is False for
2037 dselfref : True if this curve was it's own reference, in which case
2039 dpvalue : p-value for the classification test
2042 verbose = args.verbose
2043 average=args.baverage
2045 print "4. classification ( alpha = %2G %% )" % (alpha*100)
2046 for i
in xrange(len(profiles)):
2049 print " ",p.filename
2050 p.new_flag(
'drefnum',int)
2051 p.new_flag(
'drefname',str)
2052 p.new_flag(
'dgood',bool)
2053 p.new_flag(
'dselfref',bool)
2054 p.new_flag(
'dpvalue',float)
2056 for entry
in p.get_data(filter=
'agood'):
2058 for refnum,pref
in enumerate(profiles[:i+1]):
2059 if pref.get_flag(entry[1],
'agood',default=
None):
2061 p.set_flag(entry[0],
'drefnum', refnum)
2062 p.set_flag(entry[0],
'drefname', pref.get_filename())
2063 p.set_flag(entry[0],
'dselfref', pref
is p)
2065 data = p.get_mean(qvalues=[entry[1]],colwise=
True, average=average)
2066 refdata = pref.get_mean(qvalues=[entry[1]],colwise=
True,
2068 Nref = pref.get_Nreps()
2069 pval, t, nu = ttest_two(data[
'I'][0],data[
'err'][0],N,
2070 refdata[
'I'][0],refdata[
'err'][0],Nref)
2071 p.set_flag(entry[0],
'dgood', pval >= alpha)
2072 p.set_flag(entry[0],
'dpvalue', pval)
2073 create_intervals_from_data(p,
'drefnum')
2074 create_intervals_from_data(p,
'drefname')
2075 create_intervals_from_data(p,
'dselfref')
2076 create_intervals_from_data(p,
'dgood')
2077 return profiles, args
2079 def merging(profiles, args):
2080 """last stage of merge: collect valid data points and fit them
2081 Creates a profile that contains all compatible data points. This profile has
2082 the following flags:
2083 dselfref, drefnum, drefname (see above)
2084 eorigin : profile index from which this point originates
2085 eoriname : associated filename
2086 it then sets and optimizes the interpolant and enables calls to get_mean()
2089 verbose = args.verbose
2090 maxpointsF = args.elimit_fitting
2091 maxpointsH = args.elimit_hessian
2092 do_extrapolation =
not args.enoextrapolate
2093 extrapolate = 1+args.eextrapolate/float(100)
2094 mean_function = args.emean
2095 model_comp =
not args.enocomp
2098 print " gathering data"
2099 merge = SAXSProfile()
2100 merge.set_filename(args.mergename)
2101 merge.new_flag(
'dselfref',bool)
2102 merge.new_flag(
'drefnum',int)
2103 merge.new_flag(
'drefname',str)
2104 merge.new_flag(
'eorigin',int)
2105 merge.new_flag(
'eoriname',str)
2106 merge.new_flag(
'eextrapol',bool)
2107 flags_to_keep=[
'q',
'I',
'err',
'dselfref',
'drefnum',
'drefname']
2109 for i,p
in enumerate(profiles):
2111 print " ",p.filename
2113 data = p.get_data(filter=
'dgood',colwise=
True)
2121 for k
in data.keys():
2122 if k
not in flags_to_keep:
2124 data[
'eorigin']=[i]*len(data[
'q'])
2125 data[
'eoriname']=[p.filename]*len(data[
'q'])
2126 data[
'eextrapol']=[
False]*len(data[
'q'])
2128 all_flags = flags_to_keep + [
'eorigin',
'eoriname',
'eextrapol']
2129 for j
in xrange(len(data[
'q'])):
2132 entry.append(data[k][j])
2133 cleaned.append(entry)
2134 merge.add_data(cleaned)
2136 print " calculating merged mean",
2138 for n,t
in zip(merge.flag_names,merge.flag_types):
2139 if t != float
and n
not in [
'q',
'I',
'err']:
2140 create_intervals_from_data(merge, n)
2142 data = merge.get_data(colwise=
True)[
'q']
2143 merge.set_flag_interval(
'eextrapol', min(data), max(data),
False)
2144 if do_extrapolation:
2145 merge.set_flag_interval(
'eextrapol',0,min(data),
True)
2146 if args.eextrapolate > 0:
2147 merge.set_flag_interval(
'eextrapol',
2148 max(data), max(data)*extrapolate,
True)
2151 merge.set_Nreps(min([p.get_Nreps()
for p
in profiles]))
2153 data = merge.get_data(colwise=
True, maxpoints=maxpointsF)
2155 if len(data[
'q']) == maxpointsF:
2156 print " (subsampled fitting: %d points) " % maxpointsF,
2157 if model_comp
and maxpointsH > 0
and maxpointsH < len(data[
'q']):
2158 print " (subsampled hessian: %d points) " % maxpointsH,
2159 data[
'N'] = merge.get_Nreps()
2161 initvals = profiles[-1].get_params()
2162 mean, initvals, bayes = find_fit(data, initvals,
2163 verbose, model_comp=model_comp,
2164 model_comp_maxpoints=maxpointsH,
2165 mean_function=mean_function,
2166 lambdamin=args.lambdamin)
2167 if verbose > 1
and model_comp:
2169 model, particles, functions, gp = setup_process(data,initvals,1)
2171 merge.set_interpolant(gp, particles, functions, mean, model, bayes,
2172 hessian=bayes[mean][2])
2174 merge.set_interpolant(gp, particles, functions, mean, model, bayes)
2177 return merge, profiles, args
2179 def write_data(merge, profiles, args):
2180 if args.verbose > 0:
2181 print "writing data"
2182 if not os.path.isdir(args.destdir):
2183 os.mkdir(args.destdir)
2184 qvals = array(profiles[0].get_raw_data(colwise=
True)[
'q'])
2187 if args.verbose > 1:
2188 print " individual profiles"
2190 if args.verbose > 2:
2191 print " %s" % (i.get_filename())
2192 write_individual_profile(i, qvals, args)
2194 if args.stop ==
'merging':
2195 if args.verbose > 1:
2196 print " merge profile",
2197 write_merge_profile(merge, qvals, args)
2198 if args.verbose > 1:
2201 if args.verbose > 1:
2203 write_summary_file(merge, profiles, args)
2204 if args.verbose > 0:
    profiles, args = initialize()
    if args.postpone_cleanup:
        steps_to_go = ["fitting", "rescaling", "cleanup",
                       "classification", "merging"]
    else:
        steps_to_go = ["cleanup", "fitting", "rescaling",
                       "classification", "merging"]
    steps_to_go = steps_to_go[:steps_to_go.index(args.stop) + 1]
    for step in steps_to_go:
        if step != 'merging':
            profiles, args = globals()[step](profiles, args)
        else:
            merge, profiles, args = merging(profiles, args)
    write_data(merge, profiles, args)

if __name__ == "__main__":