1 """@namespace IMP.pmi.io
2 Utility classes and functions for reading and storing PMI files
5 from __future__
import print_function
18 from collections
import defaultdict
21 def parse_dssp(dssp_fn, limit_to_chains='',name_map=None):
22 """Read a DSSP file, and return secondary structure elements (SSEs).
23 Values are all PDB residue numbering.
24 @param dssp_fn The file to read
25 @param limit_to_chains Only read/return these chain IDs
26 @param name_map If passed, return tuples organized by molecule name
27 (name_map should be a dictionary with chain IDs as keys and
28 molecule names as values).
30 @return a dictionary with keys 'helix', 'beta', 'loop'.
31 Each contains a list of SSEs.
32 Each SSE is a list of elements (e.g. strands in a sheet).
33 Each element is a tuple (residue start, residue end, chain).
35 Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:
38 ret = { 'helix' : [ [ (5,7,'A') ], ...]
39 'beta' : [ [ (1,3,'A'),
40 (9,11,'A'), ...], ...]
41 'loop' : same format as helix
46 def convert_chain(ch):
50 return name_map.get(ch, ch)
55 loop_classes = [
' ',
'',
'T',
'S']
57 for h
in helix_classes:
59 for s
in strand_classes:
61 for l
in loop_classes:
71 beta_dict = IMP.pmi.tools.OrderedDefaultDict(list)
75 with open(dssp_fn,
'r') as fh:
81 if fields[1] ==
"RESIDUE":
89 elif limit_to_chains !=
'' and line[11]
not in limit_to_chains:
94 pdb_res_num = int(line[5:10])
96 sstype = sse_dict[line[16]]
100 if prev_sstype
is None:
101 cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
102 elif sstype != prev_sstype
or chain_break:
104 if prev_sstype
in [
'helix',
'loop']:
105 sses[prev_sstype].append([cur_sse])
107 beta_dict[prev_beta_id].append(cur_sse)
108 cur_sse = [pdb_res_num,pdb_res_num,convert_chain(chain)]
110 cur_sse[1] = pdb_res_num
116 prev_beta_id = beta_id
119 if prev_sstype
in [
'helix',
'loop']:
120 sses[prev_sstype].append([cur_sse])
121 elif prev_sstype ==
'beta':
122 beta_dict[prev_beta_id].append(cur_sse)
124 for beta_sheet
in beta_dict:
125 sses[
'beta'].append(beta_dict[beta_sheet])
131 number_of_best_scoring_models=10,
133 score_key=
"SimplifiedModel_Total_Score_None",
135 rmf_file_key=
"rmf_file",
136 rmf_file_frame_key=
"rmf_frame_index",
137 override_rmf_dir=
None):
138 """Given a list of stat files, read them all and find the best models.
139 Save to a single RMF along with a stat file.
140 @param model The IMP Model
141 @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
142 @param stat_files List of all stat files to collect
143 @param number_of_best_scoring_models Num best models to gather
144 @param get_every Skip frames
145 @param score_key Used for the ranking
146 @param feature_keys Keys to keep around
147 @param rmf_file_key The key that says RMF file name
148 @param rmf_file_frame_key The key that says RMF frame number
149 @param override_rmf_dir For output, change the name of the RMF directory (experiment)
154 from mpi4py
import MPI
155 comm = MPI.COMM_WORLD
156 rank = comm.Get_rank()
157 number_of_processes = comm.size
160 number_of_processes = 1
161 my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
162 stat_files,number_of_processes)[rank]
165 out_stat_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".out")
166 out_rmf_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".rmf3")
170 for nsf,sf
in enumerate(my_stat_files):
173 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
174 print(
"getting data from file %s" % sf)
176 all_keys = [score_key,
179 for k
in po.get_keys():
180 for fk
in feature_keys:
183 fields = po.get_fields(all_keys,
187 length_set = set([len(fields[f])
for f
in fields])
188 minlen = min(length_set)
191 if len(length_set) > 1:
192 print(
"get_best_models: the statfile is not synchronous")
193 minlen = min(length_set)
195 fields[f] = fields[f][0:minlen]
200 all_fields[k]+=fields[k]
202 if override_rmf_dir
is not None:
203 for i
in range(minlen):
204 all_fields[rmf_file_key][i]=os.path.join(
205 override_rmf_dir,os.path.basename(all_fields[rmf_file_key][i]))
208 if number_of_processes!=1:
211 comm.send(all_fields, dest=0, tag=11)
213 for i
in range(1,number_of_processes):
214 data_tmp = comm.recv(source=i, tag=11)
216 all_fields[k]+=data_tmp[k]
219 order = sorted(range(len(all_fields[score_key])),
220 key=
lambda i: float(all_fields[score_key][i]))
223 stat = open(out_stat_fn,
'w')
224 rh0 = RMF.open_rmf_file_read_only(
225 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][0]))
228 outf = RMF.create_rmf_file(out_rmf_fn)
230 for nm,i
in enumerate(order[:number_of_best_scoring_models]):
231 dline=dict((k,all_fields[k][i])
for k
in all_fields)
232 dline[
'orig_rmf_file']=dline[rmf_file_key]
233 dline[
'orig_rmf_frame_index']=dline[rmf_file_frame_key]
234 dline[rmf_file_key]=out_rmf_fn
235 dline[rmf_file_frame_key]=nm
236 rh = RMF.open_rmf_file_read_only(
237 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][i]))
240 RMF.FrameID(all_fields[rmf_file_frame_key][i]))
243 stat.write(str(dline)+
'\n')
245 print(
'wrote stats to',out_stat_fn)
246 print(
'wrote rmfs to',out_rmf_fn)
248 class _TempProvenance(object):
249 """Placeholder to track provenance information added to the IMP model.
250 This is since we typically don't preserve the IMP::Model object
251 throughout a PMI protocol."""
252 def __init__(self, provclass, particle_name, *args, **kwargs):
253 self.provclass = provclass
254 self.particle_name = particle_name
258 def get_decorator(self, model):
259 """Instantiate and return an IMP Provenance object in the model."""
260 pi = model.add_particle(self.particle_name)
261 return self.provclass.setup_particle(model, pi, *self.args,
265 class ClusterProvenance(_TempProvenance):
266 def __init__(self, *args, **kwargs):
268 "clustering", *args, **kwargs)
271 class FilterProvenance(_TempProvenance):
272 def __init__(self, *args, **kwargs):
274 "filtering", *args, **kwargs)
277 class CombineProvenance(_TempProvenance):
278 def __init__(self, *args, **kwargs):
280 "combine runs", *args, **kwargs)
284 """Add provenance information in `prov` (a list of _TempProvenance
285 objects) to each of the IMP hierarchies provided.
286 Note that we do this all at once since we typically don't preserve
287 the IMP::Model object throughout a PMI protocol."""
289 IMP.pmi.tools._add_pmi_provenance(h)
295 score_key=
"SimplifiedModel_Total_Score_None",
297 rmf_file_key=
"rmf_file",
298 rmf_file_frame_key=
"rmf_frame_index",
300 get_every=1, provenance=
None):
301 """ Given a list of stat files, read them all and find the best models.
302 Returns the best rmf filenames, frame numbers, scores, and values for feature keywords
305 rmf_file_frame_list=[]
307 feature_keyword_list_dict=defaultdict(list)
309 for sf
in stat_files:
310 root_directory_of_stat_file = os.path.dirname(os.path.abspath(sf))
312 root_directory_of_stat_file = os.path.dirname(os.path.abspath(root_directory_of_stat_file))
313 print(
"getting data from file %s" % sf)
317 file_keywords = po.get_keys()
321 keywords = [score_key,
328 for requested_key
in feature_keys:
329 for file_k
in file_keywords:
330 if requested_key
in file_k:
331 if file_k
not in keywords:
332 keywords.append(file_k)
334 if prefiltervalue
is None:
335 fields = po.get_fields(keywords,
337 statistics=statistics)
339 fields = po.get_fields(keywords,
340 filtertuple=(score_key,
"<",prefiltervalue),
342 statistics=statistics)
347 length_set.add(len(fields[f]))
351 if len(length_set) > 1:
352 print(
"get_best_models: the statfile is not synchronous")
353 minlen = min(length_set)
355 fields[f] = fields[f][0:minlen]
358 score_list += fields[score_key]
359 for rmf
in fields[rmf_file_key]:
360 rmf=os.path.normpath(rmf)
361 if root_directory_of_stat_file
not in rmf:
362 rmf_local_path=os.path.join(os.path.basename(os.path.dirname(rmf)),os.path.basename(rmf))
363 rmf=os.path.join(root_directory_of_stat_file,rmf_local_path)
364 rmf_file_list.append(rmf)
366 rmf_file_frame_list += fields[rmf_file_frame_key]
369 feature_keyword_list_dict[k] += fields[k]
372 if provenance
is not None:
373 if len(stat_files) > 1:
374 provenance.append(CombineProvenance(len(stat_files),
377 provenance.append(FilterProvenance(
"Keep fraction",
378 0., statistics.passed_get_every))
379 if prefiltervalue
is not None:
380 provenance.append(FilterProvenance(
"Total score",
382 statistics.passed_filtertuple))
384 return rmf_file_list,rmf_file_frame_list,score_list,feature_keyword_list_dict
387 score_key=
"SimplifiedModel_Total_Score_None",
388 rmf_file_key=
"rmf_file",
389 rmf_file_frame_key=
"rmf_frame_index",
391 """ Given a list of stat files, read them all and find a trajectory of models.
392 Returns the rmf filenames, frame numbers, scores, and values for feature keywords
395 rmf_file_frame_list=[]
397 for sf
in stat_files:
398 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
399 print(
"getting data from file %s" % sf)
401 keywords = po.get_keys()
403 feature_keywords = [score_key,
407 fields = po.get_fields(feature_keywords,
413 length_set.add(len(fields[f]))
417 if len(length_set) > 1:
418 print(
"get_best_models: the statfile is not synchronous")
419 minlen = min(length_set)
421 fields[f] = fields[f][0:minlen]
424 score_list += fields[score_key]
425 for rmf
in fields[rmf_file_key]:
426 rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
428 rmf_file_frame_list += fields[rmf_file_frame_key]
430 return rmf_file_list,rmf_file_frame_list,score_list
435 alignment_components=
None,
436 rmsd_calculation_components=
None,
438 """ Read in coordinates of a set of RMF tuples.
439 Returns the coordinates split as requested (all, alignment only, rmsd only) as well as
440 RMF file names (as keys in a dictionary, with values being the rank number) and just a plain list
441 @param model The IMP model
442 @param rmf_tuples [score,filename,frame number,original order number, rank]
443 @param alignment_components Tuples to specify what you're aligning on
444 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
447 rmsd_coordinates = []
448 alignment_coordinates = []
449 all_rmf_file_names = []
450 rmf_file_name_index_dict = {}
452 for cnt, tpl
in enumerate(rmf_tuples):
454 frame_number = tpl[2]
456 prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
460 IMP.pmi.analysis.link_hiers_to_rmf(model,prots,frame_number,rmf_file)
465 states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
466 prot = states[state_number]
468 prot = prots[state_number]
472 all_particles = [pp
for key
in part_dict
for pp
in part_dict[key]]
473 all_ps_set = set(all_particles)
474 model_coordinate_dict = {}
475 template_coordinate_dict={}
476 rmsd_coordinate_dict={}
479 model_coordinate_dict[pr] = np.array(
480 [np.array(
IMP.core.XYZ(i).get_coordinates())
for i
in part_dict[pr]])
483 for tuple_dict,result_dict
in zip((alignment_components,rmsd_calculation_components),
484 (template_coordinate_dict,rmsd_coordinate_dict)):
486 if tuple_dict
is None:
491 for pr
in tuple_dict:
493 result_dict[pr] = [list(map(float,
IMP.core.XYZ(p).get_coordinates()))
496 for pr
in tuple_dict:
497 if type(tuple_dict[pr])
is str:
500 elif type(tuple_dict[pr])
is tuple:
501 name=tuple_dict[pr][2]
502 rend=tuple_dict[pr][1]
503 rbegin=tuple_dict[pr][0]
505 ps=s.get_selected_particles()
506 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
508 [list(map(float,
IMP.core.XYZ(p).get_coordinates()))
for p
in filtered_particles]
510 all_coordinates.append(model_coordinate_dict)
511 alignment_coordinates.append(template_coordinate_dict)
512 rmsd_coordinates.append(rmsd_coordinate_dict)
513 frame_name = rmf_file +
'|' + str(frame_number)
514 all_rmf_file_names.append(frame_name)
515 rmf_file_name_index_dict[frame_name] = tpl[4]
516 return all_coordinates,alignment_coordinates,rmsd_coordinates,rmf_file_name_index_dict,all_rmf_file_names
518 def get_bead_sizes(model,rmf_tuple,rmsd_calculation_components=None,state_number=0):
520 @param model The IMP model
521 @param rmf_tuple score,filename,frame number,original order number, rank
522 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
524 if rmsd_calculation_components
is None:
527 rmf_file = rmf_tuple[1]
528 frame_number = rmf_tuple[2]
529 prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
534 states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
535 prot = states[state_number]
537 prot = prots[state_number]
539 rmsd_bead_size_dict = {}
543 for pr
in rmsd_calculation_components:
550 all_particles=[pp
for key
in part_dict
for pp
in part_dict[key]]
551 all_ps_set=set(all_particles)
554 for pr
in rmsd_calculation_components:
555 if type(rmsd_calculation_components[pr])
is str:
556 name=rmsd_calculation_components[pr]
558 elif type(rmsd_calculation_components[pr])
is tuple:
559 name=rmsd_calculation_components[pr][2]
560 rend=rmsd_calculation_components[pr][1]
561 rbegin=rmsd_calculation_components[pr][0]
563 ps=s.get_selected_particles()
564 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
565 rmsd_bead_size_dict[pr] = \
568 return rmsd_bead_size_dict
571 "If you use this class please let the PMI developers know.")
572 class RMSDOutput(object):
573 """A helper output based on dist to initial coordinates"""
574 def __init__(self,ps,label,init_coords=None):
575 self.model = ps[0].get_model()
577 if init_coords
is None:
578 self.init_coords = [
IMP.core.XYZ(p).get_coordinates()
for p
in self.ps]
580 self.init_coords = init_coords
588 def get_output(self):
590 coords = [
IMP.core.XYZ(p).get_coordinates()
for p
in self.ps]
592 output[
"RMSD_"+self.label] = str(rmsd)
595 class TotalScoreOutput(object):
596 """A helper output for model evaluation"""
597 def __init__(self,model):
606 def get_output(self):
607 score = self.rs.evaluate(
False)
609 output[
"Total_Score"] = str(score)
A class for reading stat files (either rmf or ascii v1 and v2)
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def parse_dssp
Read a DSSP file, and return secondary structure elements (SSEs).
def get_best_models
Given a list of stat files, read them all and find the best models.
Collect statistics from ProcessOutput.get_fields().
Track creation of a system fragment by combination.
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def deprecated_method
Python decorator to mark a method as deprecated.
Track creation of a system fragment by filtering.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Tools for clustering and cluster analysis.
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
Classes for writing output files and processing them.
def deprecated_object
Python decorator to mark a class as deprecated.
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Track creation of a system fragment from clustering.
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.
Functionality for loading, creating, manipulating and scoring atomic structures.
void add_provenance(Model *m, ParticleIndex pi, Provenance p)
Add provenance to part of the model.
Select hierarchy particles identified by the biological name.
Support for the RMF file format for storing hierarchical molecular data and markup.
def add_provenance
Add provenance information in prov (a list of _TempProvenance objects) to each of the IMP hierarchies...