IMP  2.0.1
The Integrative Modeling Platform
Statistics.py
1 #!/usr/bin/env python
2 
3 from IMP.isd.Entry import Entry
4 import gzip
5 import os
6 
class Statistics:
    """Statistics gathering and printing class for ISD gibbs sampling.

    Also manages the restart file (TODO).

    Constructor arguments:
        - prefix: all outputted files will have this prefix
        - rate: print statistics every so many gibbs sampling steps
        - trajrate: print trajectories (pdb) every multiple of rate
            (default 1).
        - statfile: suffix of the statistics file
        - append: whether to append to a trajectory or to write multiple
            files. For the statistics class, a trajectory is just a string,
            you can stuff whatever you want in it. If append is False, files
            will be numbered according to the counter of their category.
        - num_entries_per_line: number of entries per line in the output.
            -1 to disable wrapping.
        - repeat_title: if 0 (default) only print it in the beginning. Else
            repeat it every 'repeat_title' outputted lines in the statistics
            file.
        - separate_lines: If False the entries are not separated (default).
            If True, the lines are separated with stars.
        - compress: If set to a positive number of steps, compress
            trajectories each time so many steps have elapsed, appending the
            current frame number to the filename. Only works in append mode,
            and when it is set to a multiple of rate.

    Raises ValueError if num_entries_per_line is 0 or smaller than -1.

    TODO: check if everything was updated nicely
    """

    def __init__(self, prefix='r01', rate=1, trajrate=1, statfile='_stats.txt',
                 append=True, num_entries_per_line=5, repeat_title=0,
                 separate_lines=False, compress=10000):
        # validate first, so that a bad argument never leaves a
        # half-initialised object behind.
        if num_entries_per_line == 0 or num_entries_per_line < -1:
            raise ValueError(
                "number of entries per line is >0 or equal to -1")
        self.prefix = prefix
        self.rate = rate
        self.trajrate = trajrate
        self.statfile = prefix + statfile
        self.append = append
        self.compress = compress
        # list of the things that will be printed to the stats file, in order.
        self.entries = []
        # list of coordinate entries, stored as (category key, name) pairs
        self.coordinates = []
        # internal: position in self.entries where the next counter goes
        self.__counter_pos = 0
        # the same entries but sorted by category
        self.categories = {}
        # create the global category along with the global counter
        self.add_category('global')
        # that makes it the first entry of the table.
        self.add_entry('global', name='counter')
        # output-specific flags
        self.write_title = True
        self.wrap_stats = (num_entries_per_line != -1)
        self.num_entries_per_line = num_entries_per_line
        self.add_numbers_to_titles = True
        self.separator = ' '
        self.repeat_title = repeat_title
        self.separate_lines = separate_lines
        self.comment_marker = '#'

    def _require_category(self, key):
        """raises ValueError if 'key' is not a known category."""
        if key not in self.categories:
            raise ValueError("unknown category: %s" % (key,))

    def _get_unique_category_name(self, name):
        """returns a category key that is not in use yet.
        - name: the requested name; may be None or empty.
        If name is given and free it is returned unchanged. Otherwise the
        smallest integer i >= 1 such that name + str(i) is free is appended
        (just str(i) when no name was given).
        """
        if name and name not in self.categories:
            return name
        base = name or ''
        i = 1
        while True:
            ncat = base + ('%d' % i)
            if ncat not in self.categories:
                return ncat
            i += 1

    def add_category(self, name=None):
        """creates a logging entry for a simulation substep of the gibbs
        sampler. Each category has its own counter, initialized to zero.
        The global category does not need to be created, it's already created
        by the init method, and its key is 'global'.
        - name: an optional name, must be string.
        Returns: a unique key to refer to this category, which will start with
        the optional name.
        """
        ncat = self._get_unique_category_name(name)
        self.categories[ncat] = {'counter': Entry('step', '%10d', 0)}
        return ncat

    def _append_to_stats(self, name, entry):
        """append to stats, or put in front if entry is a counter.
        Counters are kept together at the start of self.entries, in the
        order in which they were added.
        """
        if name == 'counter':
            self.entries.insert(self.__counter_pos, entry)
            self.__counter_pos += 1
        else:
            self.entries.append(entry)

    def add_entry(self, key, name=None, entry=None):
        """add an entry for the statistics file
        - key: which category it belongs to (key returned by add_category)
        You must specify at least one of the two following:
        - name: a name for this entry
        - entry: an instance of the Entry class.
        Arguments: - name only: must already have an entry by that name.
                   - entry only: name is set to the entry title and added. If
                     it didn't exist before it is stored as well.
                   - name and entry: name is used instead of the title.
                   - nothing: raises an error.
        Currently, not providing entry only makes sense for the counter since
        there is no method to create an entry without adding it to the
        statistics file.
        Raises: ValueError if neither name nor entry is given, or if only a
        name is given and the category has no entry by that name.
        KeyError if the category 'key' does not exist.
        """
        if not entry and not name:
            raise ValueError("Should specify at least one of name or entry")
        # look the category up before modifying anything, so that an unknown
        # key cannot leave a dangling entry in self.entries.
        category = self.categories[key]
        if entry:
            if not name:
                name = entry.get_title()
            category[name] = entry
        elif name not in category:
            raise ValueError("entry %s:%s does not exist!" % (key, name))
        self._append_to_stats(name, category[name])

    def update(self, key, name, value):
        """updates an entry and change its value to value.
        Raises ValueError if the category or the entry is unknown.
        """
        self._require_category(key)
        if name not in self.categories[key]:
            raise ValueError("unknown entry %s:%s" % (key, name))
        self.categories[key][name].set_value(value)

    def add_coordinates(self, key, name):
        """adds a placeholder for coordinates.
        The placeholder is None until update_coordinates is called.
        Raises ValueError if the category is unknown.
        """
        self._require_category(key)
        self.categories[key][name] = None
        self.coordinates.append((key, name))

    def update_coordinates(self, key, name, value):
        """updates the coordinates of key:name entry. Format should match with
        the format specified at init time (pdb or cdf)
        Raises ValueError if the category or the coordinates are unknown.
        """
        self._require_category(key)
        if name not in self.categories[key]:
            raise ValueError("unknown coordinates %s:%s" % (key, name))
        self.categories[key][name] = value

    def increment_counter(self, key, value):
        """increments the counter of category 'key' by 'value' steps.
        Raises ValueError if the category is unknown.
        """
        self._require_category(key)
        cnt = self.categories[key]['counter']
        cnt.set_value(cnt.get_raw_value() + value)

    def get_entry_category(self, entry):
        """returns the key of the first category that holds 'entry', or None
        if no category holds it.
        """
        # ugly, find something better
        for cat in self.categories:
            if entry in self.categories[cat].values():
                return cat
        return None

    def format_titles(self):
        """returns the list of column titles, one per entry, formatted as
        'category:title'. If add_numbers_to_titles is set, each title is
        prefixed with 'n:' where n is the 1-based position of the entry within
        its output line.
        """
        titles = []
        for (i, entry) in enumerate(self.entries):
            if self.add_numbers_to_titles:
                if self.wrap_stats:
                    number = (i % self.num_entries_per_line) + 1
                else:
                    # no wrapping: everything is on one line, so the position
                    # is simply the index (i % -1 would always give 0).
                    number = i + 1
                title = '%d:' % number
            else:
                title = ''
            cat = self.get_entry_category(entry)
            ti = entry.get_title()
            title += '%s:%s' % (cat, ti)
            titles.append(title)
        return titles

    def get_formatted_entries(self):
        """returns the result of get_value() for every entry, in order."""
        return [ent.get_value() for ent in self.entries]

    def should_wrap_line(self, pos, line):
        """returns True if a line break must follow token number 'pos' of
        'line', i.e. wrapping is enabled, 'pos' is the last slot of an output
        line and it is not the last token.
        """
        if self.wrap_stats:
            num = self.num_entries_per_line
            if pos % num == num - 1 and pos != len(line) - 1:
                return True
        return False

    def prepare_line(self, line, marker='L'):
        """joins the tokens of 'line' into the text written to the stats file.
        - line: list of strings.
        - marker: prefix of each output line; it is followed by the 1-based
            number of the output line.
        Every token is followed by the separator, except the last token of a
        wrapped output line, which is directly followed by the line break.
        The result always ends with a newline. An empty 'line' yields the
        marker, the separator and a newline.
        """
        parts = [marker, '1', self.separator]
        for i, tok in enumerate(line):
            parts.append(tok)
            if self.should_wrap_line(i, line):
                # floor division: number of the output line that starts here
                parts.append('\n%s%d'
                             % (marker, 2 + i // self.num_entries_per_line))
            parts.append(self.separator)
        # should_wrap_line is never true for the last token, so the final
        # newline is always needed.
        parts.append('\n')
        return ''.join(parts)

    def compress_file(self, fname):
        """gzips 'fname' into fname + '.gz' and removes the original file."""
        # open the source first: if it is missing no empty archive is created.
        fl = open(fname, 'rb')
        try:
            gz = gzip.open(fname + '.gz', 'wb')
            try:
                gz.writelines(fl)
            finally:
                gz.close()
        finally:
            fl.close()
        # os.remove instead of a shell 'rm': safe for any file name.
        os.remove(fname)

    def new_stage(self, name):
        """appends a '### STAGE name' line to the statistics file."""
        fl = open(self.statfile, 'a')
        try:
            fl.write("### STAGE %s\n" % name)
        finally:
            fl.close()

    def write_stats(self):
        """Writes statistics to the stats file and writes/appends
        trajectories. Only does that if the global step matches
        the output rate. Trajectories are written more sparsely, see trajrate.
        Returns: True if data was written, False if not.
        Raises: ValueError if a registered trajectory was never updated.
        """
        stepno = self.categories['global']['counter'].get_raw_value()
        if stepno % self.rate != 0:
            return False
        self._write_stats_line(stepno)
        if stepno % (self.rate * self.trajrate) == 0:
            self._write_trajectories(stepno)
        return True

    def _write_stats_line(self, stepno):
        """appends the title (when due) and the current entries to the
        statistics file.
        """
        fl = open(self.statfile, 'a')
        try:
            # do title if necessary
            if self.write_title:
                self.write_title = False
                titles = self.format_titles()
                fl.write(self.prepare_line(titles,
                                           marker=self.comment_marker))
            elif self.repeat_title > 0:
                # schedules the title for the next call, not this one
                if (stepno // self.rate) % self.repeat_title == 0:
                    self.write_title = True
            # write stats
            fl.write(self.prepare_line(self.get_formatted_entries()))
            if self.separate_lines:
                fl.write('*' * 80 + '\n')
        finally:
            fl.close()

    def _write_trajectories(self, stepno):
        """writes every registered coordinates entry, either appended to
        prefix + '_traj.pdb' or to one numbered file per entry.
        """
        if not self.coordinates:
            return
        # check everything first so that nothing is written on error
        for key, name in self.coordinates:
            if self.categories[key][name] is None:
                raise ValueError(
                    "The trajectory was not passed to the stats class!")
        pdbname = self.prefix + '_traj.pdb'
        if self.append:
            # rotate once per step, and only if there is something to rotate.
            # Doing it for each entry would overwrite the fresh archive.
            if (self.compress > 0 and stepno % self.compress == 0
                    and os.path.isfile(pdbname)):
                newname = "%s_traj_%d.pdb" % (self.prefix, stepno)
                os.rename(pdbname, newname)
                self.compress_file(newname)
        for key, name in self.coordinates:
            if self.append:
                fl = open(pdbname, 'a')
            else:
                num = self.categories[key]['counter'].get_raw_value()
                fl = open(self.prefix + ('_%s_%010d.pdb' % (name, num)), 'w')
            try:
                fl.write(self.categories[key][name])
            finally:
                fl.close()