IMP logo
IMP Reference Guide  develop.63b38c487d,2024/12/21
The Integrative Modeling Platform
Statistics.py
1 #!/usr/bin/env python
2 
3 from IMP.isd.Entry import Entry
4 import gzip
5 import os
6 
7 
class Statistics:
    """Statistics gathering and printing class for ISD gibbs sampling.

    Also manages the restart file (TODO).

    Constructor arguments:
    - prefix: all outputted files will have this prefix
    - rate: print statistics every so many gibbs sampling steps
    - trajrate: print trajectories (pdb) every multiple of rate (default 1).
        Implies that update_coordinates() has been called by that time,
        otherwise writing will not occur.
    - statfile: suffix of the statistics file
    - num_entries_per_line: number of entries per line in the output.
        -1 to disable wrapping.
    - repeat_title: if 0 (default) only print it in the beginning. Else
        repeat it every 'repeat_title' outputted lines in the statistics
        file.
    - separate_lines: If False the entries are not separated (default).
        If True, the lines are separated with stars.
    - compress: If set to a positive number of steps, compress trajectories
        each time so many steps have elapsed, appending the current frame
        number to the filename. Only works in append mode, and when it is
        set to a multiple of rate.

    Entries are expected to provide get_title(), get_value(),
    get_raw_value() and set_value() (these are the only methods this class
    calls on them).

    TODO: check if everything was updated nicely
    """

    def __init__(self, prefix='r01', rate=1, trajrate=1, statfile='_stats.txt',
                 num_entries_per_line=5, repeat_title=0,
                 separate_lines=False, compress=10000):
        """Set up the output options and create the 'global' category.

        Raises ValueError if num_entries_per_line is 0 or smaller than -1.
        """
        # validate before building any state
        if num_entries_per_line == 0 or num_entries_per_line < -1:
            raise ValueError("number of entries per line is >0 or equal to -1")
        self.prefix = prefix
        self.rate = rate
        self.trajrate = trajrate
        self.statfile = prefix + statfile
        self.compress = compress
        # list of the things that will be printed to the stats file, in order.
        self.entries = []
        # list of coordinate entries, as (key, name, format, args) tuples
        self.coordinates = []
        # internal: position at which the next counter entry is inserted
        self.__counter_pos = 0
        # the same entries but sorted by category
        self.categories = {}
        # create the global category along with the global counter
        self.add_category('global')
        # that makes it the first entry of the table.
        self.add_entry('global', name='counter')
        # output-specific flags
        self.write_title = True
        self.wrap_stats = num_entries_per_line != -1
        self.num_entries_per_line = num_entries_per_line
        self.add_numbers_to_titles = True
        self.separator = ' '
        self.repeat_title = repeat_title
        self.separate_lines = separate_lines
        self.comment_marker = '#'

    def _get_unique_category_name(self, name):
        """Return a category key that is not in use yet.

        If name is given and free it is returned unchanged; otherwise the
        smallest positive integer suffix making it unique is appended. With
        no name (None or empty) the key is that integer alone.
        """
        if name and name not in self.categories:
            return name
        base = name if name else ''
        i = 1
        while '%s%d' % (base, i) in self.categories:
            i += 1
        return '%s%d' % (base, i)

    def add_category(self, name=None):
        """creates a logging entry for a simulation substep of the gibbs
        sampler. Each category has its own counter, initialized to zero.
        The global category does not need to be created, it's already created
        by the init method, and its key is 'global'.
        - name: an optional name, must be string.
        Returns: a unique key to refer to this category, which will start with
        the optional name.
        """
        ncat = self._get_unique_category_name(name)
        self.categories[ncat] = {'counter': Entry('step', '%10d', 0)}
        return ncat

    def _append_to_stats(self, name, entry):
        """append to stats, or put in front if entry is a counter"""
        if name == 'counter':
            # counters are kept together at the front, in insertion order
            self.entries.insert(self.__counter_pos, entry)
            self.__counter_pos += 1
        else:
            self.entries.append(entry)

    def add_entry(self, key, name=None, entry=None):
        """add an entry for the statistics file
        - key: which category it belongs to (key returned by add_category)
        You must specify at least one of the two following:
        - name: a name for this entry
        - entry: an instance of the Entry class.
        Arguments: - name only: must already have an entry by that name.
                   - entry only: name is set to the entry title and added. If
                     it didn't exist before it is stored as well.
                   - name and entry: name is used instead of the title.
                   - nothing: raises an error.
        Currently, not providing entry only makes sense for the counter since
        there is no method to create an entry without adding it to the
        statistics file.
        Raises ValueError if neither name nor entry is given, or if only a
        name is given and the category has no entry by that name.
        """
        if not entry and not name:
            raise ValueError("Should specify at least one of name or entry")
        if not entry:
            # name only: re-use the entry already stored in the category
            if name not in self.categories[key]:
                raise ValueError("entry %s:%s does not exist!" % (key, name))
            self._append_to_stats(name, self.categories[key][name])
            return
        if not name:
            name = entry.get_title()
        self._append_to_stats(name, entry)
        self.categories[key][name] = entry

    def update(self, key, name, value):
        """updates an entry and change its value to value

        Raises ValueError if the category or the entry is unknown.
        """
        if key not in self.categories:
            raise ValueError("unknown category: %s" % key)
        if name not in self.categories[key]:
            raise ValueError("unknown entry %s:%s" % (key, name))
        self.categories[key][name].set_value(value)

    def add_coordinates(self, key, name, format='raw', append=True,
                        extension='pdb', hierarchies=None, restraints=None):
        """adds a placeholder for coordinates
        - format = rmf3:
            will write the whole system as provided, in rmf3 format
            - hierarchies must contain protein hierarchies
            - restraints is a list of restraints
        - format = raw:
            will write provided data as-is
            - append: whether to append to a trajectory or to write multiple
              files. With this format, a trajectory is just a string, you can
              stuff whatever you want in it. If append is False, files will be
              numbered according to the counter of their category.
            - extension: the file extension to use
        Raises ValueError if the category is unknown or the format is
        neither 'raw' nor 'rmf3'.
        """
        if key not in self.categories:
            raise ValueError("unknown category: %s" % key)
        # reject a bad format before touching any state, so that a failed
        # call does not leave a stale placeholder behind
        if format not in ('raw', 'rmf3'):
            raise ValueError("format can only be rmf3 or raw")
        # None means "nothing to write yet", see update_coordinates()
        self.categories[key][name] = None
        if format == 'raw':
            self.coordinates.append((key, name, 'raw', (append, extension)))
            return
        # rmf3: imported lazily so that raw output works without RMF
        import RMF
        import IMP.rmf
        assert hierarchies is not None, "rmf3 format requires hierarchies"
        rh = RMF.create_rmf_file(self.prefix + '_' + name + '_traj.rmf3')
        IMP.rmf.add_hierarchies(rh, hierarchies)
        if restraints:
            IMP.rmf.add_restraints(rh, restraints)
        self.coordinates.append((key, name, 'rmf3', rh))

    def update_coordinates(self, key, name, value=True):
        """updates the coordinates of key:name entry. Format should match with
        the format specified at init time (raw or rmf3)
        note that setting value to None is equivalent to not calling this
        function
        Raises ValueError if the category or the coordinates are unknown.
        """
        if key not in self.categories:
            raise ValueError("unknown category: %s" % key)
        if name not in self.categories[key]:
            raise ValueError("unknown coordinates %s:%s" % (key, name))
        self.categories[key][name] = value

    def increment_counter(self, key, value):
        """increments the counter of category 'key' by 'value' steps.

        Raises ValueError if the category is unknown.
        """
        if key not in self.categories:
            raise ValueError("unknown category: %s" % key)
        cnt = self.categories[key]['counter']
        cnt.set_value(cnt.get_raw_value() + value)

    def get_entry_category(self, entry):
        """Return the key of the first category holding entry, else None."""
        # ugly, find something better
        for cat in self.categories:
            if entry in self.categories[cat].values():
                return cat
        return None

    def format_titles(self):
        """Return the list of column titles, one per entry, in output order.

        Each title reads 'category:title', prefixed with 'N:' when
        add_numbers_to_titles is set. N is the position within the wrapped
        line, or the overall position when wrapping is disabled.
        """
        titles = []
        for i, entry in enumerate(self.entries):
            if not self.add_numbers_to_titles:
                number = ''
            elif self.num_entries_per_line > 0:
                number = '%d:' % ((i % self.num_entries_per_line) + 1)
            else:
                number = '%d:' % (i + 1)
            cat = self.get_entry_category(entry)
            titles.append('%s%s:%s' % (number, cat, entry.get_title()))
        return titles

    def get_formatted_entries(self):
        """Return get_value() of every entry, in output order."""
        return [ent.get_value() for ent in self.entries]

    def should_wrap_line(self, pos, line):
        """Tell whether a line break must follow the token at index pos.

        True only when wrapping is enabled, pos is the last slot of a row
        of num_entries_per_line tokens and it is not the last token of line.
        """
        if not self.wrap_stats:
            return False
        num = self.num_entries_per_line
        return pos % num == num - 1 and pos != len(line) - 1

    def prepare_line(self, line, marker='L'):
        """Join the tokens of line into the text written to the stats file.

        Every physical row starts with marker followed by its row number
        (starting at 1) and every token is followed by the separator. The
        result always ends with a newline; an empty line yields just the
        first row header.
        """
        parts = [marker + '1', self.separator]
        for i, tok in enumerate(line):
            parts.append(tok)
            if self.should_wrap_line(i, line):
                # integer division: the row number must stay an int
                ln = 2 + i // self.num_entries_per_line
                parts.append('\n%s%d' % (marker, ln))
            parts.append(self.separator)
        # should_wrap_line() never wraps after the last token, so the
        # terminating newline is always needed (also for an empty line)
        parts.append('\n')
        return ''.join(parts)

    def compress_file(self, fname):
        """Gzip fname into fname + '.gz' and remove the uncompressed file.

        Raises IOError/OSError if fname cannot be read; in that case no
        archive is created.
        """
        # open the source first so that a missing file does not leave an
        # empty archive behind
        with open(fname, 'rb') as src:
            with gzip.open(fname + '.gz', 'wb') as gz:
                gz.writelines(src)
        # no shell involved: safe for names with spaces or metacharacters
        os.remove(fname)

    def new_stage(self, name):
        """Append a '### STAGE name' marker line to the statistics file."""
        with open(self.statfile, 'a') as fl:
            fl.write("### STAGE %s\n" % name)

    def _write_raw_coordinates(self, key, name, data, stepno, do_append,
                               extension):
        """Write one raw coordinate frame, rotating the trajectory if due."""
        if do_append:
            fname = self.prefix + '_traj.' + extension
            if self.compress > 0 and stepno % self.compress == 0:
                archived = "%s_traj_%d.%s" % (self.prefix, stepno, extension)
                # nothing to rotate if no frame has been written so far
                if os.path.exists(fname):
                    os.rename(fname, archived)
                    self.compress_file(archived)
            mode = 'a'
        else:
            # one file per frame, numbered by the category counter
            num = self.categories[key]['counter'].get_raw_value()
            fname = self.prefix + ('_%s_%010d.%s' % (name, num, extension))
            mode = 'w'
        with open(fname, mode) as fl:
            fl.write(data)

    def write_stats(self):
        """Writes statistics to the stats file and writes/appends
        trajectories. Only does that if the global step matches
        the output rate. Trajectories are written more sparsely, see trajrate.
        Returns: True if data was written, False if not.
        """
        stepno = self.categories['global']['counter'].get_raw_value()
        if stepno % self.rate != 0:
            return False
        # stats file
        with open(self.statfile, 'a') as fl:
            # do title if necessary
            if self.write_title:
                self.write_title = False
                titles = self.format_titles()
                fl.write(self.prepare_line(titles,
                                           marker=self.comment_marker))
            elif self.repeat_title > 0:
                # schedule the title for the next call
                if (stepno // self.rate) % self.repeat_title == 0:
                    self.write_title = True
            # write stats
            fl.write(self.prepare_line(self.get_formatted_entries()))
            if self.separate_lines:
                fl.write('*' * 80 + '\n')
        # write trajs
        if stepno % (self.rate * self.trajrate) != 0:
            return True
        for key, name, fmt, args in self.coordinates:
            data = self.categories[key][name]
            if data is None:
                # update_coordinates() was not called since the last write
                continue
            if fmt == 'raw':
                do_append, extension = args
                self._write_raw_coordinates(key, name, data, stepno,
                                            do_append, extension)
            elif fmt == 'rmf3':
                import IMP.rmf
                IMP.rmf.save_frame(args)
                args.flush()
            else:
                raise RuntimeError("unknown coordinates format: %s" % fmt)
            # mark as written
            self.categories[key][name] = None
        return True
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
Classes to handle ISD statistics files.
Definition: Entry.py:1
def increment_counter
increments the counter of category 'key' by 'value' steps.
Definition: Statistics.py:188
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def update
updates an entry and change its value to value
Definition: Statistics.py:136
def write_stats
Writes statistics to the stats file and writes/appends trajectories.
Definition: Statistics.py:254
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def add_category
creates a logging entry for a simulation substep of the gibbs sampler.
Definition: Statistics.py:88
def update_coordinates
updates the coordinates of key:name entry.
Definition: Statistics.py:176
Support for the RMF file format for storing hierarchical molecular data and markup.
def add_coordinates
adds a placeholder for coordinates
Definition: Statistics.py:144
def add_entry
add an entry for the statistics file
Definition: Statistics.py:109