1 """@namespace IMP.pmi.io
2 Utility classes and functions for reading and storing PMI files
5 from __future__
import print_function
18 from collections
import defaultdict
21 def parse_dssp(dssp_fn, limit_to_chains='',name_map=None):
22 """Read a DSSP file, and return secondary structure elements (SSEs).
23 Values are all PDB residue numbering.
24 @param dssp_fn The file to read
25 @param limit_to_chains Only read/return these chain IDs
26 @param name_map If passed, return tuples organized by molecule name
27 (name_map should be a dictionary with chain IDs as keys and
28 molecule names as values).
30 @return a dictionary with keys 'helix', 'beta', 'loop'
31 Each contains a list of SSEs.
32 Each SSE is a list of elements (e.g. strands in a sheet)
33 Each element is a tuple (residue start, residue end, chain)
35 Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:
38 ret = { 'helix' : [ [ (5,7,'A') ],... ]
39 'beta' : [ [ (1,3,'A'),
41 'loop' : same format as helix
46 def convert_chain(ch):
58 loop_classes = [
' ',
'',
'T',
'S']
60 for h
in helix_classes:
62 for s
in strand_classes:
64 for l
in loop_classes:
78 with open(dssp_fn,
'r') as fh:
84 if fields[1] ==
"RESIDUE":
92 elif limit_to_chains !=
'' and line[11]
not in limit_to_chains:
97 pdb_res_num = int(line[5:10])
99 sstype = sse_dict[line[16]]
103 if prev_sstype
is None:
104 cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
105 elif sstype != prev_sstype
or chain_break:
107 if prev_sstype
in [
'helix',
'loop']:
108 sses[prev_sstype].append([cur_sse])
109 elif prev_sstype ==
'beta':
110 beta_dict[prev_beta_id].append(cur_sse)
111 cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
113 cur_sse[1] = pdb_res_num
119 prev_beta_id = beta_id
122 if not prev_sstype
is None:
123 if prev_sstype
in [
'helix',
'loop']:
124 sses[prev_sstype].append([cur_sse])
125 elif prev_sstype ==
'beta':
126 beta_dict[prev_beta_id].append(cur_sse)
128 for beta_sheet
in beta_dict:
129 sses[
'beta'].append(beta_dict[beta_sheet])
135 number_of_best_scoring_models=10,
137 score_key=
"SimplifiedModel_Total_Score_None",
139 rmf_file_key=
"rmf_file",
140 rmf_file_frame_key=
"rmf_frame_index",
141 override_rmf_dir=
None):
142 """Given a list of stat files, read them all and find the best models.
143 Save to a single RMF along with a stat file.
144 @param mdl The IMP Model
145 @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
146 @param stat_files List of all stat files to collect
147 @param number_of_best_scoring_models Num best models to gather
148 @param get_every Skip frames
149 @param score_key Used for the ranking
150 @param feature_keys Keys to keep around
151 @param rmf_file_key The key that says RMF file name
152 @param rmf_file_frame_key The key that says RMF frame number
153 @param override_rmf_dir For output, change the name of the RMF directory (experiment)
158 from mpi4py
import MPI
159 comm = MPI.COMM_WORLD
160 rank = comm.Get_rank()
161 number_of_processes = comm.size
164 number_of_processes = 1
165 my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
166 stat_files,number_of_processes)[rank]
169 out_stat_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".out")
170 out_rmf_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".rmf3")
174 for nsf,sf
in enumerate(my_stat_files):
177 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
178 print(
"getting data from file %s" % sf)
180 all_keys = [score_key,
183 for k
in po.get_keys():
184 for fk
in feature_keys:
187 fields = po.get_fields(all_keys,
191 length_set = set([len(fields[f])
for f
in fields])
192 minlen = min(length_set)
195 if len(length_set) > 1:
196 print(
"get_best_models: the statfile is not synchronous")
197 minlen = min(length_set)
199 fields[f] = fields[f][0:minlen]
204 all_fields[k]+=fields[k]
206 if override_rmf_dir
is not None:
207 for i
in range(minlen):
208 all_fields[rmf_file_key][i]=os.path.join(
209 override_rmf_dir,os.path.basename(all_fields[rmf_file_key][i]))
212 if number_of_processes!=1:
215 comm.send(all_fields, dest=0, tag=11)
217 for i
in range(1,number_of_processes):
218 data_tmp = comm.recv(source=i, tag=11)
220 all_fields[k]+=data_tmp[k]
223 order = sorted(range(len(all_fields[score_key])),
224 key=
lambda i: float(all_fields[score_key][i]))
227 stat = open(out_stat_fn,
'w')
228 rh0 = RMF.open_rmf_file_read_only(
229 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][0]))
232 outf = RMF.create_rmf_file(out_rmf_fn)
234 for nm,i
in enumerate(order[:number_of_best_scoring_models]):
235 dline=dict((k,all_fields[k][i])
for k
in all_fields)
236 dline[
'orig_rmf_file']=dline[rmf_file_key]
237 dline[
'orig_rmf_frame_index']=dline[rmf_file_frame_key]
238 dline[rmf_file_key]=out_rmf_fn
239 dline[rmf_file_frame_key]=nm
240 rh = RMF.open_rmf_file_read_only(
241 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][i]))
244 RMF.FrameID(all_fields[rmf_file_frame_key][i]))
247 stat.write(str(dline)+
'\n')
249 print(
'wrote stats to',out_stat_fn)
250 print(
'wrote rmfs to',out_rmf_fn)
252 class _TempProvenance(object):
253 """Placeholder to track provenance information added to the IMP model.
254 This is since we typically don't preserve the IMP::Model object
255 throughout a PMI protocol."""
256 def __init__(self, provclass, particle_name, *args, **kwargs):
257 self.provclass = provclass
258 self.particle_name = particle_name
262 def get_decorator(self, model):
263 """Instantiate and return an IMP Provenance object in the model."""
264 pi = model.add_particle(self.particle_name)
265 return self.provclass.setup_particle(model, pi, *self.args,
269 class ClusterProvenance(_TempProvenance):
270 def __init__(self, *args, **kwargs):
272 "clustering", *args, **kwargs)
275 class FilterProvenance(_TempProvenance):
276 def __init__(self, *args, **kwargs):
278 "filtering", *args, **kwargs)
281 class CombineProvenance(_TempProvenance):
282 def __init__(self, *args, **kwargs):
284 "combine runs", *args, **kwargs)
288 """Add provenance information in `prov` (a list of _TempProvenance
289 objects) to each of the IMP hierarchies provided.
290 Note that we do this all at once since we typically don't preserve
291 the IMP::Model object throughout a PMI protocol."""
293 IMP.pmi.tools._add_pmi_provenance(h)
299 score_key=
"SimplifiedModel_Total_Score_None",
301 rmf_file_key=
"rmf_file",
302 rmf_file_frame_key=
"rmf_frame_index",
304 get_every=1, provenance=
None):
305 """ Given a list of stat files, read them all and find the best models.
306 Returns the best rmf filenames, frame numbers, scores, and values for feature keywords
309 rmf_file_frame_list=[]
311 feature_keyword_list_dict=defaultdict(list)
313 for sf
in stat_files:
314 root_directory_of_stat_file = os.path.dirname(os.path.abspath(sf))
316 root_directory_of_stat_file = os.path.dirname(os.path.abspath(root_directory_of_stat_file))
317 print(
"getting data from file %s" % sf)
321 file_keywords = po.get_keys()
325 keywords = [score_key,
332 for requested_key
in feature_keys:
333 for file_k
in file_keywords:
334 if requested_key
in file_k:
335 if file_k
not in keywords:
336 keywords.append(file_k)
338 if prefiltervalue
is None:
339 fields = po.get_fields(keywords,
341 statistics=statistics)
343 fields = po.get_fields(keywords,
344 filtertuple=(score_key,
"<",prefiltervalue),
346 statistics=statistics)
351 length_set.add(len(fields[f]))
355 if len(length_set) > 1:
356 print(
"get_best_models: the statfile is not synchronous")
357 minlen = min(length_set)
359 fields[f] = fields[f][0:minlen]
362 score_list += fields[score_key]
363 for rmf
in fields[rmf_file_key]:
364 rmf=os.path.normpath(rmf)
365 if root_directory_of_stat_file
not in rmf:
366 rmf_local_path=os.path.join(os.path.basename(os.path.dirname(rmf)),os.path.basename(rmf))
367 rmf=os.path.join(root_directory_of_stat_file,rmf_local_path)
368 rmf_file_list.append(rmf)
370 rmf_file_frame_list += fields[rmf_file_frame_key]
373 feature_keyword_list_dict[k] += fields[k]
376 if provenance
is not None:
377 if len(stat_files) > 1:
378 provenance.append(CombineProvenance(len(stat_files),
381 provenance.append(FilterProvenance(
"Keep fraction",
382 0., statistics.passed_get_every))
383 if prefiltervalue
is not None:
384 provenance.append(FilterProvenance(
"Total score",
386 statistics.passed_filtertuple))
388 return rmf_file_list,rmf_file_frame_list,score_list,feature_keyword_list_dict
391 score_key=
"SimplifiedModel_Total_Score_None",
392 rmf_file_key=
"rmf_file",
393 rmf_file_frame_key=
"rmf_frame_index",
395 """ Given a list of stat files, read them all and find a trajectory of models.
396 Returns the rmf filenames, frame numbers, scores, and values for feature keywords
399 rmf_file_frame_list=[]
401 for sf
in stat_files:
402 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
403 print(
"getting data from file %s" % sf)
405 keywords = po.get_keys()
407 feature_keywords = [score_key,
411 fields = po.get_fields(feature_keywords,
417 length_set.add(len(fields[f]))
421 if len(length_set) > 1:
422 print(
"get_best_models: the statfile is not synchronous")
423 minlen = min(length_set)
425 fields[f] = fields[f][0:minlen]
428 score_list += fields[score_key]
429 for rmf
in fields[rmf_file_key]:
430 rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
432 rmf_file_frame_list += fields[rmf_file_frame_key]
434 return rmf_file_list,rmf_file_frame_list,score_list
439 alignment_components=
None,
440 rmsd_calculation_components=
None,
442 """ Read in coordinates of a set of RMF tuples.
443 Returns the coordinates split as requested (all, alignment only, rmsd only) as well as
444 RMF file names (as keys in a dictionary, with values being the rank number) and just a plain list
445 @param model The IMP model
446 @param rmf_tuples [score,filename,frame number,original order number, rank]
447 @param alignment_components Tuples to specify what you're aligning on
448 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
451 rmsd_coordinates = []
452 alignment_coordinates = []
453 all_rmf_file_names = []
454 rmf_file_name_index_dict = {}
456 for cnt, tpl
in enumerate(rmf_tuples):
458 frame_number = tpl[2]
460 prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
464 IMP.pmi.analysis.link_hiers_to_rmf(model,prots,frame_number,rmf_file)
469 states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
470 prot = states[state_number]
472 prot = prots[state_number]
476 all_particles = [pp
for key
in part_dict
for pp
in part_dict[key]]
477 all_ps_set = set(all_particles)
478 model_coordinate_dict = {}
479 template_coordinate_dict={}
480 rmsd_coordinate_dict={}
483 model_coordinate_dict[pr] = np.array(
484 [np.array(
IMP.core.XYZ(i).get_coordinates())
for i
in part_dict[pr]])
487 for tuple_dict,result_dict
in zip((alignment_components,rmsd_calculation_components),
488 (template_coordinate_dict,rmsd_coordinate_dict)):
490 if tuple_dict
is None:
495 for pr
in tuple_dict:
497 result_dict[pr] = [list(map(float,
IMP.core.XYZ(p).get_coordinates()))
500 for pr
in tuple_dict:
501 if type(tuple_dict[pr])
is str:
504 elif type(tuple_dict[pr])
is tuple:
505 name=tuple_dict[pr][2]
506 rend=tuple_dict[pr][1]
507 rbegin=tuple_dict[pr][0]
509 ps=s.get_selected_particles()
510 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
512 [list(map(float,
IMP.core.XYZ(p).get_coordinates()))
for p
in filtered_particles]
514 all_coordinates.append(model_coordinate_dict)
515 alignment_coordinates.append(template_coordinate_dict)
516 rmsd_coordinates.append(rmsd_coordinate_dict)
517 frame_name = rmf_file +
'|' + str(frame_number)
518 all_rmf_file_names.append(frame_name)
519 rmf_file_name_index_dict[frame_name] = tpl[4]
520 return all_coordinates,alignment_coordinates,rmsd_coordinates,rmf_file_name_index_dict,all_rmf_file_names
522 def get_bead_sizes(model,rmf_tuple,rmsd_calculation_components=None,state_number=0):
524 @param model The IMP model
525 @param rmf_tuple score,filename,frame number,original order number, rank
526 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
528 if rmsd_calculation_components
is None:
531 rmf_file = rmf_tuple[1]
532 frame_number = rmf_tuple[2]
533 prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
538 states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
539 prot = states[state_number]
541 prot = prots[state_number]
543 rmsd_bead_size_dict = {}
547 for pr
in rmsd_calculation_components:
554 all_particles=[pp
for key
in part_dict
for pp
in part_dict[key]]
555 all_ps_set=set(all_particles)
558 for pr
in rmsd_calculation_components:
559 if type(rmsd_calculation_components[pr])
is str:
560 name=rmsd_calculation_components[pr]
562 elif type(rmsd_calculation_components[pr])
is tuple:
563 name=rmsd_calculation_components[pr][2]
564 rend=rmsd_calculation_components[pr][1]
565 rbegin=rmsd_calculation_components[pr][0]
567 ps=s.get_selected_particles()
568 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
569 rmsd_bead_size_dict[pr] = \
572 return rmsd_bead_size_dict
575 """A helper output based on dist to initial coordinates"""
576 def __init__(self,ps,label,init_coords=None):
577 self.mdl = ps[0].get_model()
579 if init_coords
is None:
580 self.init_coords = [
IMP.core.XYZ(p).get_coordinates()
for p
in self.ps]
582 self.init_coords = init_coords
584 def get_output(self):
587 coords = [
IMP.core.XYZ(p).get_coordinates()
for p
in self.ps]
589 output[
"RMSD_"+self.label] = str(rmsd)
593 """A helper output for model evaluation"""
594 def __init__(self,mdl):
597 def get_output(self):
599 score = self.rs.evaluate(
False)
601 output[
"Total_Score"] = str(score)
A class for reading stat files (either rmf or ascii v1 and v2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def parse_dssp
Read a DSSP file, and return secondary structure elements (SSEs).
def get_best_models
Given a list of stat files, read them all and find the best models.
A helper output for model evaluation.
A helper output based on dist to initial coordinates.
Collect statistics from ProcessOutput.get_fields().
Track creation of a system fragment by combination.
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
Track creation of a system fragment by filtering.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Tools for clustering and cluster analysis.
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
Classes for writing output files and processing them.
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Track creation of a system fragment from clustering.
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.
Functionality for loading, creating, manipulating and scoring atomic structures.
void add_provenance(Model *m, ParticleIndex pi, Provenance p)
Add provenance to part of the model.
Select hierarchy particles identified by the biological name.
Support for the RMF file format for storing hierarchical molecular data and markup.
def add_provenance
Add provenance information in prov (a list of _TempProvenance objects) to each of the IMP hierarchies...