IMP  2.3.0
The Integrative Modeling Platform
demux_trajs.py
1 #!/usr/bin/env python
2 
3 import sys
4 import os
5 import re
6 import shutil
7 
8 
class LogStep:

    """Record of everything known about one time step of one replica.

    Holds the step number, the raw stats line and the stats header, plus
    two dictionaries keyed by file category: `dumps` (category -> data
    item, as passed to add()) and `trajs` (category -> dict describing a
    trajectory entry).
    """

    def __init__(self, stepno, statline, header):
        self.stepno, self.stats, self.header = stepno, statline, header
        # both are filled in later through add()
        self.dumps = {}
        self.trajs = {}

    def add(self, ftype, category, *data):
        """Register a file for this step under the given category.

        ftype 'dump' stores data[0]; ftype 'traj' stores the first four
        items of data as 'ftype', 'fullpath', 'stepno' and 'tail'.
        Any other ftype raises ValueError.
        """
        if ftype == 'dump':
            self.dumps[category] = data[0]
            return
        if ftype == 'traj':
            entry = {}
            entry['ftype'] = data[0]
            entry['fullpath'] = data[1]
            entry['stepno'] = data[2]
            entry['tail'] = data[3]
            self.trajs[category] = entry
            return
        raise ValueError("unknown file type")

    def get_stats_header(self):
        """Return the stats header given at construction."""
        return self.header

    def get_stats(self):
        """Return the stats line given at construction."""
        return self.stats

    def get_dumps(self):
        """Return the category -> dump mapping."""
        return self.dumps

    def get_trajs(self):
        """Return the category -> trajectory-entry mapping."""
        return self.trajs
40 
41 
class LogHolder:

    """Manages information on a given simulation.
    Assumes the existence of a stats file, and handles more files if
    available.
    folder : the folder which contains the stats file
    prefix : the stats file is supposed to be prefix+'stats.txt'

    After construction:
    dumpfiles : category -> {index number: file tail} for categories made
                of several indexed files
    trajfiles : category -> (extension without dot, file tail) for
                categories made of a single .rmf* trajectory file
    where a "tail" is the file name with the prefix removed and the
    category is the part of the tail before the first underscore.
    """

    def __init__(self, folder, prefix):
        self.folder = folder
        self.prefix = prefix
        # verify that stats file exists
        self.stats_file = os.path.join(folder, prefix + 'stats.txt')
        if not os.path.isfile(self.stats_file):
            raise ValueError('cannot find stats file %s' % self.stats_file)
        # scan for other files and group their tails by category
        files = {}
        for fl in os.listdir(folder):
            # plain string comparison: the prefix is a literal, not a
            # regular expression, so it must not be interpreted as one
            if not fl.startswith(prefix):
                continue
            tail = fl[len(prefix):]
            if tail == 'stats.txt':
                continue
            category = tail.split('_')[0]
            files.setdefault(category, []).append(tail)
        # see if there are multiple files in the same category, and store them
        self.dumpfiles = {}
        self.trajfiles = {}
        for cat, fnames in files.items():
            if len(fnames) > 1 \
                    or os.path.splitext(fnames[0].split('_')[-1])[0].isdigit():
                # there are multiple files, no need to understand their
                # content: just index them by the number in their name
                indexed = {}
                for fname in fnames:
                    try:
                        indexno = int(
                            os.path.splitext(fname.split('_')[1])[0])
                    except (IndexError, ValueError):
                        raise ValueError(
                            "cannot parse index number of file %s%s in %s"
                            % (prefix, fname, folder))
                    # duplicates must be detected while filling the dict;
                    # once stored, a repeated key would silently overwrite
                    if indexno in indexed:
                        raise ValueError("found duplicates in %s %s %s"
                                         % (folder, prefix, fname))
                    indexed[indexno] = fname
                self.dumpfiles[cat] = indexed
            else:
                # this is a trajectory, need to be able to parse it
                fname = fnames[0]
                ext = os.path.splitext(fname)[1]
                if ext.startswith('.rmf'):
                    self.trajfiles[cat] = (ext[1:], fname)
                else:
                    raise ValueError("Unknown extension: %s in file %s"
                                     % (ext, fname))

    def get_stats_header(self):
        """Return the first line of the stats file.

        On first call the file is opened and left positioned just after
        the header, ready for iteration over the stats lines. Raises
        ValueError if the second line also starts with '#'.
        """
        if not hasattr(self, 'stats_handle'):
            handle = open(self.stats_file)
            # for now, be compatible with only one header line
            header = handle.readline()
            first_line = handle.readline()
            if first_line.startswith('#'):
                handle.close()
                raise ValueError('stats file must be 1-line only')
            # rewind and skip the header again instead of opening the
            # file a second time (which would leak the first handle)
            handle.seek(0)
            handle.readline()
            self.stats_header = header
            self.stats_first_line = first_line
            self.stats_handle = handle
        return self.stats_header

    def get_first_stats_line(self):
        """Return the line following the header in the stats file."""
        # make sure file is open, skip header
        self.get_stats_header()
        return self.stats_first_line

    def _get_next_stats(self):
        """Yield the remaining lines of the stats file, one by one."""
        # make sure file is open, skip header
        self.get_stats_header()
        for line in self.stats_handle:
            yield line

    def items(self):
        """iterate over all time steps

        Yields one LogStep per non-blank stats line. The step number is
        the second whitespace-separated token of the line.
        """
        # open stats file, store header and loop over stats file
        for stat in self._get_next_stats():
            # a blank line (e.g. at the end of the file) carries no step
            if not stat.strip():
                continue
            # extract step number and create LogStep
            stepno = int(stat.split()[1])
            step = LogStep(stepno, stat, self.get_stats_header())
            # get other files' entries at this step if available
            for cat, df in self.dumpfiles.items():
                if stepno in df:
                    fullpath = os.path.join(self.folder,
                                            self.prefix + df[stepno])
                    step.add('dump', cat, fullpath)
            for cat, tf in self.trajfiles.items():
                fullpath = os.path.join(self.folder,
                                        self.prefix + tf[1])
                step.add('traj', cat, tf[0], fullpath, stepno, tf[1])
            # yield a LogStep containing these entries
            yield step

    def close(self):
        """Close the stats file if it has been opened.

        The holder cannot be iterated any further afterwards.
        """
        handle = getattr(self, 'stats_handle', None)
        if handle is not None:
            handle.close()
145 
146 
class Demuxer:

    """uses column to demux a replica trajectory. Assumes column points to a
    float or integer type, which is allowed to change over time. Attribution is
    based on order of float params. State 0 will be lowest param etc. Use
    reverse=True to start with highest.

    logs      : sequence of log objects providing get_stats_header() and
                items(); all must share the same header
    outfolder : folder in which one 'p<state>' subfolder per log is created
    column    : substring identifying exactly one token of the header
    Raises ValueError if logs is empty, if headers differ, or if column
    matches zero or several header tokens.
    """

    def __init__(self, logs, outfolder, column, reverse=False):
        if len(logs) == 0:
            raise ValueError("no replica logs to demux")
        self.logs = logs
        self.reverse = reverse
        self.column = column
        self.outfolder = outfolder
        self.stat_handles = {}
        self.traj_handles_in = {}
        self.traj_handles_out = {}
        self.folders = {}
        # create needed folders
        if not os.path.isdir(outfolder):
            os.mkdir(outfolder)
        for stateno in range(len(self.logs)):
            fname = os.path.join(outfolder, 'p%d' % stateno)
            if not os.path.isdir(fname):
                os.mkdir(fname)
            self.folders[stateno] = fname
        # make sure every log has the same header
        h0 = self.logs[0].get_stats_header()
        for log in self.logs[1:]:
            if h0 != log.get_stats_header():
                raise ValueError("headers must be identical!")
        # get column number from header; column is a substring of a token
        tokens = [idx for idx, name in enumerate(h0.split())
                  if self.column in name]
        if len(tokens) == 0:
            raise ValueError("column %s not found in this header\n%s"
                             % (column, h0))
        elif len(tokens) > 1:
            raise ValueError("column %s found multiple times!\n%s"
                             % (column, h0))
        self.colno = tokens[0]

    def get_param(self, statline):
        """Return the demuxing column of a stats line as a float."""
        return float(statline.split()[self.colno])

    def _write_step_stats(self, stateno, lstep):
        """Append the stats line of lstep to the stats file of stateno."""
        # open the stats file and write its header on first use
        if stateno not in self.stat_handles:
            self.stat_handles[stateno] = open(
                os.path.join(self.folders[stateno],
                             str(stateno) + '_stats.txt'), 'w')
            self.stat_handles[stateno].write(lstep.get_stats_header())
        # write stats
        self.stat_handles[stateno].write(lstep.get_stats())

    def _write_step_dump(self, stateno, lstep):
        """Copy the dump files of lstep into the folder of stateno."""
        for cat, fname in lstep.get_dumps().items():
            # keep what follows the category in the file name. Only the
            # base name is examined, and only the first occurrence of the
            # category is split on, so that the same text appearing in a
            # directory name or in the extension does not garble the result
            suffix = os.path.basename(fname).split(cat, 1)[1]
            shutil.copyfile(fname, os.path.join(self.folders[stateno],
                            str(stateno) + '_' + cat + suffix))

    def _write_traj_rmf(self, infile, instep, outfile, stateno, cat):
        """Append frame number instep-1 of infile to outfile (RMF files)."""
        import RMF
        # make sure infile is open
        if infile not in self.traj_handles_in:
            src = RMF.open_rmf_file_read_only(infile)
            self.traj_handles_in[infile] = src
        src = self.traj_handles_in[infile]
        # make sure outfile is open
        if outfile not in self.traj_handles_out:
            dest = RMF.create_rmf_file(outfile)
            self.traj_handles_out[outfile] = dest
            RMF.clone_file_info(src, dest)
            RMF.clone_hierarchy(src, dest)
            RMF.clone_static_frame(src, dest)
        dest = self.traj_handles_out[outfile]
        # clone frame
        frameid = src.get_frames()[instep - 1]
        src.set_current_frame(frameid)
        dest.add_frame(src.get_name(frameid), src.get_type(frameid))
        RMF.clone_loaded_frame(src, dest)

    def _write_step_traj(self, stateno, lstep):
        """Append the trajectory frames of lstep to the files of stateno."""
        # loop over categories
        for cat, data in lstep.get_trajs().items():
            destfile = os.path.join(self.folders[stateno],
                                    str(stateno) + '_' + data['tail'])
            if data['ftype'].startswith('rmf'):
                self._write_traj_rmf(data['fullpath'], data['stepno'],
                                     destfile, stateno, cat)
            else:
                raise ValueError("unknown trajectory file type")

    def _write_step(self, stateno, lstep):
        """Write stats, dumps and trajectories of lstep for stateno."""
        self._write_step_stats(stateno, lstep)
        self._write_step_dump(stateno, lstep)
        self._write_step_traj(stateno, lstep)

    def close(self):
        """Close the output stats files and release trajectory handles."""
        for handle in self.stat_handles.values():
            handle.close()
        self.stat_handles.clear()
        # NOTE(review): RMF handles are only dereferenced here; this
        # presumes RMF closes a file once its handle is released - confirm
        self.traj_handles_in.clear()
        self.traj_handles_out.clear()

    def write(self):
        """Demux all logs in lockstep and write one output per state.

        Stops at the end of the shortest log. Output files are closed
        when done, even if an error occurs.
        """
        # loop over time steps
        log_iterators = [l.items() for l in self.logs]
        sys.stdout.write("Demuxing %d replicas\n" % len(log_iterators))
        try:
            for idx, steps in enumerate(zip(*log_iterators)):
                if idx % 10 == 0 and idx > 0:
                    sys.stdout.write("step %d \r" % idx)
                    sys.stdout.flush()
                # assign state numbers to these logs
                params = [(self.get_param(i.get_stats()), i) for i in steps]
                # sort on the parameter only: on ties the step objects
                # must not be compared, and replica order is kept instead
                params.sort(key=lambda p: p[0], reverse=self.reverse)
                # write them
                for stateno, (_, lstep) in enumerate(params):
                    self._write_step(stateno, lstep)
        finally:
            self.close()
        sys.stdout.write("Done\n")
257 
258 
def get_prefix(folder):
    """Return the prefix of the unique stats file found in folder.

    A stats file is a file whose name is exactly '<prefix>stats.txt' with
    the prefix ending in an underscore; the returned prefix includes that
    underscore. Raises ValueError unless exactly one such file exists.
    """
    # escape the dot and anchor the end, so that e.g. 'x_stats.txt.bak'
    # is not mistaken for a second stats file
    pattern = re.compile(r'(.*_)stats\.txt$')
    found = [m for m in (pattern.match(f) for f in os.listdir(folder)) if m]
    if len(found) != 1:
        raise ValueError("stats file not unique, found %d" % len(found))
    return found[0].group(1)
265 
def main(argv):
    """Command-line entry point: demux the replicas found in a folder.

    argv is the full argument vector (program name first):
    column [infolder [outfolder]]. Both folders default to './'.
    Exits with a usage message on a wrong number of arguments, and with
    an error message if no replica folder is found.
    """
    if len(argv) == 1 or len(argv) > 4:
        sys.exit("""demux_trajs.py column [infolder [outfolder]]
 expects r?? folders in infolder and will write p?? folders in outfolder.
 infolder must contain a _stats.txt file which will contain a header.
 column must be a substring matching to one of the columns in the
 _stats.txt files. It will typically be a temperature, or a state number.
 That column will be used for demuxing. Folders are optional and will be
 taken as ./ if not indicated.
 """)
    column = argv[1]
    infolder = argv[2] if len(argv) >= 3 else './'
    outfolder = argv[3] if len(argv) == 4 else './'
    # loop over infolder and collect replica folders
    replicas = []
    for name in os.listdir(infolder):
        match = re.match(r'r(\d+)', name)
        path = os.path.join(infolder, name)
        # plain files that happen to be named like a replica are skipped
        if match and os.path.isdir(path):
            replicas.append((int(match.group(1)), name, path))
    if not replicas:
        sys.exit("no r?? folders found in %s" % infolder)
    # os.listdir order is arbitrary: sort by replica number so that runs
    # are reproducible
    replicas.sort()
    folders = [path for _, _, path in replicas]
    # read stats files
    replica_logs = [LogHolder(f, get_prefix(f)) for f in folders]
    demux = Demuxer(replica_logs, outfolder, column, reverse=True)
    demux.write()


if __name__ == '__main__':
    main(sys.argv)
def items
iterate over all time steps
Definition: demux_trajs.py:126
Manages information on a given simulation.
Definition: demux_trajs.py:42
uses column to demux a replica trajectory.
Definition: demux_trajs.py:147