IMP  2.3.0
The Integrative Modeling Platform
Statistics.py
1 #!/usr/bin/env python
2 
3 from IMP.isd.Entry import Entry
4 import gzip
5 import os
6 
7 
class Statistics:

    """Statistics gathering and printing class for ISD gibbs sampling.

    Also manages the restart file (TODO).

    Constructor arguments:
    - prefix: all outputted files will have this prefix
    - rate: print statistics every so many gibbs sampling steps
    - trajrate: print trajectories (pdb) every multiple of rate (default 1).
        Implies that update_coordinates() has been called by that time,
        otherwise writing will not occur.
    - statfile: suffix of the statistics file
    - num_entries_per_line: number of entries per line in the output.
        -1 to disable wrapping.
    - repeat_title: if 0 (default) only print it in the beginning. Else
        repeat it every 'repeat_title' outputted lines in the statistics
        file.
    - separate_lines: if False the entries are not separated (default).
        If True, the lines are separated with stars.
    - compress: if set to a positive number of steps, compress trajectories
        each time so many steps have elapsed, appending the current frame
        number to the filename. Only works in append mode, and when it
        is set to a multiple of rate.

    TODO: check if everything was updated nicely
    """

    def __init__(self, prefix='r01', rate=1, trajrate=1, statfile='_stats.txt',
                 num_entries_per_line=5, repeat_title=0,
                 separate_lines=False, compress=10000):
        """Set up the output options and create the 'global' category.

        Raises ValueError if num_entries_per_line is neither a positive
        number nor -1.
        """
        # validate first, so that a bad argument fails before any state
        # has been built
        if num_entries_per_line == 0 or num_entries_per_line < -1:
            raise ValueError("number of entries per line is >0 or equal to -1")
        self.prefix = prefix
        self.rate = rate
        self.trajrate = trajrate
        self.statfile = prefix + statfile
        self.compress = compress
        # list of the things that will be printed to the stats file, in order.
        self.entries = []
        # list of coordinate entries, as (key, name, format, args) tuples
        self.coordinates = []
        # internal: position at which the next counter entry is inserted
        self.__counter_pos = 0
        # the same entries but sorted by category
        self.categories = {}
        # create the global category along with the global counter
        self.add_category('global')
        # that makes it the first entry of the table.
        self.add_entry('global', name='counter')
        # output-specific flags
        self.write_title = True
        self.wrap_stats = (num_entries_per_line != -1)
        self.num_entries_per_line = num_entries_per_line
        self.add_numbers_to_titles = True
        self.separator = ' '
        self.repeat_title = repeat_title
        self.separate_lines = separate_lines
        self.comment_marker = '#'

    def _get_unique_category_name(self, name):
        """Return a category key that is not in use yet.

        If 'name' is given and free, it is returned unchanged. Otherwise the
        smallest integer suffix (starting at 1) that yields an unused key is
        appended; with no name, the key is just that integer as a string.
        """
        if name and name not in self.categories:
            return name
        base = name if name else ''
        i = 0
        while True:
            i += 1
            ncat = '%s%d' % (base, i)
            if ncat not in self.categories:
                return ncat

    def add_category(self, name=None):
        """creates a logging entry for a simulation substep of the gibbs
        sampler. Each category has its own counter, initialized to zero.
        The global category does not need to be created, it's already created
        by the init method, and its key is 'global'.
            - name: an optional name, must be string.
        Returns: a unique key to refer to this category, which will start with
        the optional name.
        """
        ncat = self._get_unique_category_name(name)
        self.categories[ncat] = {'counter': Entry('step', '%10d', 0)}
        return ncat

    def _append_to_stats(self, name, entry):
        """append to stats, or put in front if entry is a counter"""
        if name == 'counter':
            # counters are kept together at the beginning of the table,
            # in the order in which they were added
            self.entries.insert(self.__counter_pos, entry)
            self.__counter_pos += 1
        else:
            self.entries.append(entry)

    def add_entry(self, key, name=None, entry=None):
        """add an entry for the statistics file
        - key: which category it belongs to (key returned by add_category)
        You must specify at least one of the two following:
        - name: a name for this entry
        - entry: an instance of the Entry class.
        Arguments: - name only: must already have an entry by that name.
                   - entry only: name is set to the entry title and added. If
                       it didn't exist before it is stored as well.
                   - name and entry: name is used instead of the title.
                   - nothing: raises an error.
        Currently, not providing entry only makes sense for the counter since
        there is no method to create an entry without adding it to the
        statistics file.
        Raises ValueError if neither name nor entry is given, or if only a
        name is given and the category has no entry by that name.
        """
        # test against None explicitly so that the decision does not depend
        # on the truth value of the entry object
        if entry is None and not name:
            raise ValueError("Should specify at least one of name or entry")
        if entry is not None:
            if not name:
                name = entry.get_title()
            self._append_to_stats(name, entry)
            self.categories[key][name] = entry
        else:
            if name not in self.categories[key]:
                raise ValueError("entry %s:%s does not exist!" % (key, name))
            self._append_to_stats(name, self.categories[key][name])

    def update(self, key, name, value):
        """updates an entry and change its value to value

        Raises ValueError if the category or the entry is unknown.
        """
        if key not in self.categories:
            raise ValueError("unknown category: %s" % key)
        if name not in self.categories[key]:
            raise ValueError("unknown entry %s:%s" % (key, name))
        self.categories[key][name].set_value(value)

    def add_coordinates(self, key, name, format='raw', append=True,
                        extension='pdb', hierarchies=None, restraints=None):
        """adds a placeholder for coordinates
        - format = rmf3:
            will write the whole system as provided, in rmf3 format
            - hierarchies must contain protein hierarchies
            - restraints is a list of restraints
        - format = raw:
            will write provided data as-is
            - append: whether to append to a trajectory or to write multiple
              files. With this format, a trajectory is just a string, you can
              stuff whatever you want in it. If append is False, files will be
              numbered according to the counter of their category.
            - extension: the file extension to use
        Raises ValueError if the category is unknown, if format is neither
        'raw' nor 'rmf3', or if format is 'rmf3' and no hierarchies are given.
        """
        if key not in self.categories:
            raise ValueError("unknown category: %s" % key)
        # check the arguments before registering anything, so that a failed
        # call does not leave a dangling placeholder behind
        if format not in ('raw', 'rmf3'):
            raise ValueError("format can only be rmf3 or raw")
        if format == 'raw':
            args = (append, extension)
        else:
            if hierarchies is None:
                raise ValueError("hierarchies must be provided for rmf3")
            import RMF
            import IMP.rmf
            args = RMF.create_rmf_file(self.prefix + '_' + name + '_traj.rmf3')
            IMP.rmf.add_hierarchies(args, hierarchies)
            if restraints:
                IMP.rmf.add_restraints(args, restraints)
        # None means "nothing to write yet", see update_coordinates
        self.categories[key][name] = None
        self.coordinates.append((key, name, format, args))

    def update_coordinates(self, key, name, value=True):
        """updates the coordinates of key:name entry. Format should match with
        the format specified at init time (raw or rmf3)
        note that setting value to None is equivalent to not calling this
        function
        Raises ValueError if the category or the coordinates are unknown.
        """
        if key not in self.categories:
            raise ValueError("unknown category: %s" % key)
        if name not in self.categories[key]:
            raise ValueError("unknown coordinates %s:%s" % (key, name))
        self.categories[key][name] = value

    def increment_counter(self, key, value):
        """increments the counter of category 'key' by 'value' steps.

        Raises ValueError if the category is unknown.
        """
        if key not in self.categories:
            raise ValueError("unknown category: %s" % key)
        cnt = self.categories[key]['counter']
        cnt.set_value(cnt.get_raw_value() + value)

    def get_entry_category(self, entry):
        """Return the key of the first category containing 'entry'.

        Returns None if no category contains it.
        """
        # ugly, find something better
        for cat in self.categories:
            if entry in self.categories[cat].values():
                return cat
        return None

    def format_titles(self):
        """Return the list of column titles, one per entry, each of the form
        'category:title'. If add_numbers_to_titles is set, the title is
        prefixed by 'N:' where N is the 1-based position of the entry within
        its output line (or within the whole table if wrapping is disabled).
        """
        titles = []
        for (i, entry) in enumerate(self.entries):
            if self.add_numbers_to_titles:
                if self.num_entries_per_line > 0:
                    title = '%d:' % ((i % self.num_entries_per_line) + 1)
                else:
                    title = '%d:' % (i + 1)
            else:
                title = ''
            cat = self.get_entry_category(entry)
            title += '%s:%s' % (cat, entry.get_title())
            titles.append(title)
        return titles

    def get_formatted_entries(self):
        """Return the value of every entry, in output order, as given by the
        get_value() method of each entry."""
        return [ent.get_value() for ent in self.entries]

    def should_wrap_line(self, pos, line):
        """Return True if a line break must follow the token at index 'pos'
        of 'line': wrapping is enabled, 'pos' is the last slot of an output
        line, and it is not the very last token."""
        if self.wrap_stats:
            num = self.num_entries_per_line
            if pos % num == num - 1 and pos != len(line) - 1:
                return True
        return False

    def prepare_line(self, line, marker='L'):
        """Format a list of string tokens into the text written to the
        statistics file.

        Every output line starts with 'marker' followed by its 1-based line
        number; tokens are each followed by the separator, and the result
        always ends with a newline. An empty 'line' yields just the first
        line header.
        """
        parts = [marker + '1', self.separator]
        for i, tok in enumerate(line):
            parts.append(tok)
            if self.should_wrap_line(i, line):
                # integer division: number of the output line that starts
                # right after this token
                ln = 2 + (i // self.num_entries_per_line)
                parts.append('\n%s%d' % (marker, ln))
            parts.append(self.separator)
        # should_wrap_line() is never True for the last token, so the output
        # is always terminated here
        parts.append('\n')
        return ''.join(parts)

    def compress_file(self, fname):
        """Gzip 'fname' into 'fname.gz' and remove the uncompressed file."""
        # open the source first so that a missing file does not leave an
        # empty archive behind
        with open(fname, 'rb') as fl:
            with gzip.open(fname + '.gz', 'wb') as gz:
                gz.writelines(fl)
        os.remove(fname)

    def new_stage(self, name):
        """Append a '### STAGE name' marker line to the statistics file."""
        with open(self.statfile, 'a') as fl:
            fl.write("### STAGE %s\n" % name)

    def _write_raw_coordinates(self, key, name, data, stepno, args):
        """Write one frame of 'raw' coordinates for the key:name entry."""
        do_append, extension = args
        if do_append:
            # NOTE: the trajectory file name does not depend on 'name'
            fname = self.prefix + '_traj.' + extension
            # rotate and compress the trajectory written so far; nothing to
            # do if no trajectory file has been written yet
            if self.compress > 0 and stepno % self.compress == 0 \
                    and os.path.exists(fname):
                newname = "%s_traj_%d.%s" % (self.prefix, stepno, extension)
                os.rename(fname, newname)
                self.compress_file(newname)
            mode = 'a'
        else:
            num = self.categories[key]['counter'].get_raw_value()
            fname = self.prefix + ('_%s_%010d.%s' % (name, num, extension))
            mode = 'w'
        with open(fname, mode) as fl:
            fl.write(data)

    def write_stats(self):
        """Writes statistics to the stats file and writes/appends
        trajectories. Only does that if the global step matches
        the output rate. Trajectories are written more sparsely, see trajrate.
        Returns: True if data was written, False if not.
        """
        stepno = self.categories['global']['counter'].get_raw_value()
        if stepno % self.rate != 0:
            return False
        # stats file
        with open(self.statfile, 'a') as fl:
            # do title if necessary
            if self.write_title:
                self.write_title = False
                titles = self.format_titles()
                fl.write(self.prepare_line(titles,
                                           marker=self.comment_marker))
            elif self.repeat_title > 0:
                # schedule the title for the next call
                if (stepno // self.rate) % self.repeat_title == 0:
                    self.write_title = True
            # write stats
            fl.write(self.prepare_line(self.get_formatted_entries()))
            if self.separate_lines:
                fl.write('*' * 80 + '\n')
        # write trajs
        if stepno % (self.rate * self.trajrate) != 0:
            return True
        for key, name, fmt, args in self.coordinates:
            data = self.categories[key][name]
            # not updated since the last write
            if data is None:
                continue
            if fmt == 'raw':
                self._write_raw_coordinates(key, name, data, stepno, args)
            elif fmt == 'rmf3':
                import IMP.rmf
                IMP.rmf.save_frame(args)
                args.flush()
            else:
                raise RuntimeError("unknown coordinates format: %s" % fmt)
            # mark as written
            self.categories[key][name] = None
        return True
void save_frame(RMF::FileHandle file, unsigned int, std::string name="")
Definition: frames.h:42
void add_restraints(RMF::NodeHandle fh, const kernel::Restraints &hs)
Classes to handle ISD statistics files.
Definition: Entry.py:1
def increment_counter
increments the counter of category 'key' by 'value' steps.
Definition: Statistics.py:187
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def update
updates an entry and change its value to value
Definition: Statistics.py:135
def write_stats
Writes statistics to the stats file and writes/appends trajectories.
Definition: Statistics.py:253
def add_category
creates a logging entry for a simulation substep of the gibbs sampler.
Definition: Statistics.py:87
def update_coordinates
updates the coordinates of key:name entry.
Definition: Statistics.py:175
Support for the RMF file format for storing hierarchical molecular data and markup.
def add_coordinates
adds a placeholder for coordinates
Definition: Statistics.py:150
def add_entry
add an entry for the statistics file
Definition: Statistics.py:108