1 """@namespace IMP.pmi.io
2 Utility classes and functions for reading and storing PMI files
5 from __future__
import print_function
18 from collections
import defaultdict
21 """read dssp file, get SSEs. values are all PDB residue numbering.
22 Returns a SubsequenceData object containing labels helix, beta, loop.
23 Each one is a list of SelectionDictionaries
25 Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:
26 helix : [ [ {'chain':'A','residue_indexes': [5,6,7]} ] ]
27 beta : [ [ {'chain':'A','residue_indexes': [1,2,3]},
28 {'chain':'A','residue_indexes': [9,10,11]} ] ]
29 loop : same format as helix
34 loop_classes = [
' ',
'',
'T',
'S']
36 for h
in helix_classes:
38 for s
in strand_classes:
40 for l
in loop_classes:
47 beta_dict = defaultdict(list)
49 cur_sse = {
'chain':
'',
'residue_tuple':[-1,-1]}
51 for line
in open(dssp_fn,
'r'):
56 if fields[1] ==
"RESIDUE":
64 elif limit_to_chains !=
'' and line[11]
not in limit_to_chains:
69 pdb_res_num = int(line[5:10])
71 sstype = sse_dict[line[16]]
75 if prev_sstype
is None:
76 cur_sse = {
'chain':chain,
'residue_tuple':[pdb_res_num,pdb_res_num]}
77 elif sstype != prev_sstype
or chain_break:
79 if prev_sstype
in [
'helix',
'loop']:
80 sses.add_subsequence(prev_sstype,
Subsequence(**cur_sse))
81 elif prev_sstype ==
'beta':
82 beta_dict[prev_beta_id].append(cur_sse)
83 cur_sse = {
'chain':chain,
'residue_tuple':[pdb_res_num,pdb_res_num]}
85 cur_sse[
'residue_tuple'][1]=pdb_res_num
91 prev_beta_id = beta_id
94 if not prev_sstype
is None:
95 if prev_sstype
in [
'helix',
'loop']:
96 sses.add_subsequence(prev_sstype,
Subsequence(**cur_sse))
97 elif prev_sstype ==
'beta':
98 beta_dict[prev_beta_id].append(cur_sse)
100 for beta_sheet
in beta_dict:
102 for strand
in beta_dict[beta_sheet]:
103 seq.add_range(**strand)
104 sses.add_subsequence(
'beta',seq)
112 """ Format from Trisha Davis. Lines are:
113 ignore ignore seq1 seq2 >Name(res) >Name(res) score
114 @param model An IMP model
115 @param data_fn The data file name
116 @param max_num Maximum number of XL to read (-1 is all)
117 @param name_map Dictionary mapping text file names to the molecule name
118 @param named_offsets Integer offsets to apply to the indexing in the file
119 Output is a CrossLinkData object containing SelectionDictionaries
121 [ { 'r1': {'molecule':'A','residue_index':5},
122 'r2': {'molecule':'B','residue_index':100},
124 { 'r1': {'molecule':'C','residue_index':63},
125 'r2': {'molecule':'D','residue_index':94},
130 inf=open(data_fn,
'r')
131 data=defaultdict(list)
134 for nl,l
in enumerate(inf):
135 if max_num==-1
or nl<max_num:
136 ig1,ig2,seq1,seq2,s1,s2,score=l.split()
138 m1=re.search(
r'>([\w-]+)\((\w+)\)',s1)
139 m2=re.search(
r'>([\w-]+)\((\w+)\)',s2)
145 if n1
in named_offsets:
146 r1+=named_offsets[n1]
152 if n2
in named_offsets:
153 r2+=named_offsets[n2]
154 key=tuple(sorted([
'%s.%i'%(n1,r1),
'%s.%i'%(n2,r2)]))
156 print(
'skipping duplicated xl',key)
159 data.add_cross_link(nl,
160 {
'molecule':n1,
'residue_index':r1},
161 {
'molecule':n2,
'residue_index':r2},
168 """ A light class to store multiple not-necessarily-contiguous residue ranges."""
169 def __init__(self,chain=None,molecule=None,residue_tuple=None,subsequences=None):
170 """Create subsequence and optionally pass the first contiguous range.
171 @param chain The chain ID
172 @param molecule The molecule name
173 @param residue_tuple PDB-style inclusive residue range
174 @param subsequences A list of other subsequences to combine (not implemented)
177 if chain
or molecule
or residue_tuple:
178 self.
add_range(chain,molecule,residue_tuple)
181 def add_range(self,chain=None,molecule=None,residue_tuple=None):
182 """Add some stuff to this subsequence
183 @param chain The chain ID
184 @param molecule The molecule name
185 @param residue_tuple PDB-style inclusive residue range
187 self.seqs.append({
'chain':chain,
'molecule':molecule,
'residue_tuple':residue_tuple})
188 def join(self,new_subsequence):
189 for s
in new_subsequence:
193 """Create an IMP Selection from this subsequence
194 @param hier An IMP hierarchy or list of them
195 \note any additional keyword arguments will be appended to the selection
197 for nseq,seq
in enumerate(self.seqs):
200 args[
'chain']=seq[
'chain']
202 args[
'molecule']=seq[
'molecule']
203 if seq[
'residue_tuple']:
204 args[
'residue_indexes']=list(range(seq[
'residue_tuple'][0],
205 seq[
'residue_tuple'][1]+1))
214 for nseq,seq
in enumerate(self.seqs):
216 if seq[
'residue_tuple']
is not None:
217 this_str.append(
'%i-%i'%(seq[
'residue_tuple'][0],seq[
'residue_tuple'][1]))
218 if seq[
'molecule']
is not None:
219 this_str.append(
'%s'%seq[
'molecule'])
220 if seq[
'chain']
is not None:
221 this_str.append(
'%s'%seq[
'chain'])
222 rep+=
'.'.join(this_str)
223 if nseq < len(self.seqs)-1:
227 def __getitem__(self,key):
228 return self.data[key]
231 return self.seqs.__iter__()
233 def __add__(self,other):
238 """ Group a bunch of subsequences with certain labels
239 Use cases: storing lists of secondary structures from DSSP or PSIPRED
240 storing lists of molecules that should be made symmetric
243 """Setup groups of subsequences
245 self.data=defaultdict(list)
248 """ Append a new cross-link to a given unique ID
249 @param label label for this subsequence (e.g., 'helix')
250 @param subsequence a Subsequence object to store under that label
251 e.g. Subsequence(chain='A',residue_tuple=(10,15))
253 if type(subsequence)
is not Subsequence:
254 raise InputError(
'must provide a subsequence object')
255 self.data[label].append(subsequence)
257 def __getitem__(self,key):
258 return self.data[key]
261 return self.data.__repr__()
264 return self.data.keys()
267 """A class to store the selection commands for a single crosslink.
269 def __init__(self,unique_id,r1,r2,score):
271 @param unique_id The id is used to group crosslinks that are alternatives
272 @param r1 A dictionary of selection keywords for the first residue
273 @param r2 A dictionary of selection keywards for the second residue
274 @param score A score that might be used later for filtering
275 @Note The dictionaries can contain any Selection argument like
276 molecule or residue_index
278 self.unique_id = unique_id
284 return "CrossLink id: "+str(self.unique_id)+
" r1: "+repr(self.r1)+
", r2: "+repr(self.r2)
286 def intersects(self,other):
287 if self.r1==other.r1
or self.r1==other.r2
or self.r2==other.r1
or self.r2==other.r2:
293 """Return a list of atom pairs (particles) for this crosslink.
294 Found by selecting everything with r1 and r2 then returning the
296 @Note you may want to provide some atom specifiers like
297 atom_type=IMP.atom.AtomType("CA")
298 @param mh The hierarchy to select from
299 @param kwargs Any additional selection arguments
308 raise Exception(
"this selection is empty",rsel1)
310 raise Exception(
"this selection is empty",rsel2)
313 # Check no repeating copy numbers....not sure if this is general
314 for s in (sel1,sel2):
315 idxs=[IMP.atom.get_copy_index(p) for p in s]
316 if len(idxs)!=len(set(idxs)):
317 raise Exception("this XL is selecting more than one particle per copy")
320 for p1,p2
in itertools.product(sel1,sel2):
325 """A class for storing groups of crosslinks.
326 Acts like a dictionary where keys are unique IDs and values are CrossLinks
327 Equivalent (using objects instead of dicts) to a data structure like so:
329 [ { 'r1': {'molecule':'A','residue_index':5},
330 'r2': {'molecule':'B','residue_index':100},
332 { 'r1': {'molecule':'C','residue_index':63},
333 'r2': {'molecule':'D','residue_index':94},
338 """Setup a CrossLinkData object"""
339 self.data = defaultdict(list)
341 """Add a crosslink. They are organized by their unique_ids.
342 @param unique_id The id is used to group crosslinks that are alternatives
343 @param r1 A dictionary of selection keywords for the first residue
344 @param r2 A dictionary of selection keywards for the second residue
345 @param score A score that might be used later for filtering
346 @Note The dictionaries can contain any Selection argument like
347 molecule or residue_index
349 self.data[unique_id].append(
CrossLink(unique_id,kws1,kws2,score))
350 def copy_cross_link(self,xl):
351 if type(xl)
is not CrossLink:
352 raise Exception(
"CrossLinkData::copy_cross_link() requires a Crosslink object")
353 self.data[xl.unique_id].append(xl)
354 def __getitem__(self, key):
355 return self.data[key]
357 ret=
"CrossLinkData with these entries:\n"
359 for xl
in self.data[d]:
363 return self.data.keys()
365 return self.data.values()
366 def __contains__(self, item):
367 return item
in self.data
369 return iter(self.data)
371 return len(self.data)
376 number_of_best_scoring_models=10,
378 score_key=
"SimplifiedModel_Total_Score_None",
380 rmf_file_key=
"rmf_file",
381 rmf_file_frame_key=
"rmf_frame_index",
382 override_rmf_dir=
None):
383 """Given a list of stat files, read them all and find the best models.
384 Save to a single RMF along with a stat file.
385 @param mdl The IMP Model
386 @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
387 @param stat_files List of all stat files to collect
388 @param number_of_best_scoring_models Num best models to gather
389 @param get_every Skip frames
390 @param score_key Used for the ranking
391 @param feature_keys Keys to keep around
392 @param rmf_file_key The key that says RMF file name
393 @param rmf_file_frame_key The key that says RMF frame number
394 @param override_rmf_dir For output, change the name of the RMF directory (experiment)
399 from mpi4py
import MPI
400 comm = MPI.COMM_WORLD
401 rank = comm.Get_rank()
402 number_of_processes = comm.size
405 number_of_processes = 1
406 my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
407 stat_files,number_of_processes)[rank]
410 out_stat_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".out")
411 out_rmf_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".rmf3")
415 for nsf,sf
in enumerate(my_stat_files):
418 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
419 print(
"getting data from file %s" % sf)
421 all_keys = [score_key,
424 for k
in po.get_keys():
425 for fk
in feature_keys:
428 fields = po.get_fields(all_keys,
432 length_set = set([len(fields[f])
for f
in fields])
433 minlen = min(length_set)
436 if len(length_set) > 1:
437 print(
"get_best_models: the statfile is not synchronous")
438 minlen = min(length_set)
440 fields[f] = fields[f][0:minlen]
445 all_fields[k]+=fields[k]
447 if override_rmf_dir
is not None:
448 for i
in range(minlen):
449 all_fields[rmf_file_key][i]=os.path.join(
450 override_rmf_dir,os.path.basename(all_fields[rmf_file_key][i]))
453 if number_of_processes!=1:
456 comm.send(all_fields, dest=0, tag=11)
458 for i
in range(1,number_of_processes):
459 data_tmp = comm.recv(source=i, tag=11)
461 all_fields[k]+=data_tmp[k]
464 order = sorted(range(len(all_fields[score_key])),
465 key=
lambda i: float(all_fields[score_key][i]))
468 stat = open(out_stat_fn,
'w')
469 rh0 = RMF.open_rmf_file_read_only(
470 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][0]))
473 outf = RMF.create_rmf_file(out_rmf_fn)
475 for nm,i
in enumerate(order[:number_of_best_scoring_models]):
476 dline=dict((k,all_fields[k][i])
for k
in all_fields)
477 dline[
'orig_rmf_file']=dline[rmf_file_key]
478 dline[
'orig_rmf_frame_index']=dline[rmf_file_frame_key]
479 dline[rmf_file_key]=out_rmf_fn
480 dline[rmf_file_frame_key]=nm
481 rh = RMF.open_rmf_file_read_only(
482 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][i]))
487 stat.write(str(dline)+
'\n')
489 print(
'wrote stats to',out_stat_fn)
490 print(
'wrote rmfs to',out_rmf_fn)
496 score_key=
"SimplifiedModel_Total_Score_None",
498 rmf_file_key=
"rmf_file",
499 rmf_file_frame_key=
"rmf_frame_index",
502 """ Given a list of stat files, read them all and find the best models.
503 Returns the best rmf filenames, frame numbers, scores, and values for feature keywords
506 rmf_file_frame_list=[]
508 feature_keyword_list_dict=defaultdict(list)
509 for sf
in stat_files:
510 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
511 print(
"getting data from file %s" % sf)
513 keywords = po.get_keys()
515 feature_keywords = [score_key,
520 for fk
in feature_keys:
522 feature_keywords.append(k)
524 if prefiltervalue
is None:
525 fields = po.get_fields(feature_keywords,
528 fields = po.get_fields(feature_keywords,
529 filtertuple=(score_key,
"<",prefiltervalue),
535 length_set.add(len(fields[f]))
539 if len(length_set) > 1:
540 print(
"get_best_models: the statfile is not synchronous")
541 minlen = min(length_set)
543 fields[f] = fields[f][0:minlen]
546 score_list += fields[score_key]
547 for rmf
in fields[rmf_file_key]:
548 rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
550 rmf_file_frame_list += fields[rmf_file_frame_key]
552 if feature_keywords
is not None:
553 for k
in feature_keywords:
554 feature_keyword_list_dict[k] += fields[k]
556 return rmf_file_list,rmf_file_frame_list,score_list,feature_keyword_list_dict
559 score_key=
"SimplifiedModel_Total_Score_None",
560 rmf_file_key=
"rmf_file",
561 rmf_file_frame_key=
"rmf_frame_index",
563 """ Given a list of stat files, read them all and find a trajectory of models.
564 Returns the rmf filenames, frame numbers, scores, and values for feature keywords
567 rmf_file_frame_list=[]
569 for sf
in stat_files:
570 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
571 print(
"getting data from file %s" % sf)
573 keywords = po.get_keys()
575 feature_keywords = [score_key,
579 fields = po.get_fields(feature_keywords,
585 length_set.add(len(fields[f]))
589 if len(length_set) > 1:
590 print(
"get_best_models: the statfile is not synchronous")
591 minlen = min(length_set)
593 fields[f] = fields[f][0:minlen]
596 score_list += fields[score_key]
597 for rmf
in fields[rmf_file_key]:
598 rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
600 rmf_file_frame_list += fields[rmf_file_frame_key]
602 return rmf_file_list,rmf_file_frame_list,score_list
607 alignment_components=
None,
608 rmsd_calculation_components=
None):
609 """ Read in coordinates of a set of RMF tuples.
610 Returns the coordinates split as requested (all, alignment only, rmsd only) as well as
611 RMF file names (as keys in a dictionary, with values being the rank number) and just a plain list
612 @param model The IMP model
613 @param rmf_tuples [score,filename,frame number,original order number, rank]
614 @param alignment_components Tuples to specify what you're aligning on
615 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
618 rmsd_coordinates = []
619 alignment_coordinates = []
620 all_rmf_file_names = []
621 rmf_file_name_index_dict = {}
623 for cnt, tpl
in enumerate(rmf_tuples):
625 frame_number = tpl[2]
627 prot = IMP.pmi.analysis.get_hier_from_rmf(model,
635 all_particles=[pp
for key
in part_dict
for pp
in part_dict[key]]
636 all_ps_set=set(all_particles)
638 model_coordinate_dict = {}
639 template_coordinate_dict={}
640 rmsd_coordinate_dict={}
642 model_coordinate_dict[pr] = np.array(
643 [np.array(
IMP.core.XYZ(i).get_coordinates())
for i
in part_dict[pr]])
645 if alignment_components
is not None:
646 for pr
in alignment_components:
647 if type(alignment_components[pr])
is str:
648 name=alignment_components[pr]
650 elif type(alignment_components[pr])
is tuple:
651 name=alignment_components[pr][2]
652 rend=alignment_components[pr][1]
653 rbegin=alignment_components[pr][0]
655 ps=s.get_selected_particles()
656 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
657 template_coordinate_dict[pr] = \
658 [list(map(float,
IMP.core.XYZ(i).get_coordinates()))
for i
in filtered_particles]
660 if rmsd_calculation_components
is not None:
661 for pr
in rmsd_calculation_components:
662 if type(rmsd_calculation_components[pr])
is str:
663 name=rmsd_calculation_components[pr]
665 elif type(rmsd_calculation_components[pr])
is tuple:
666 name=rmsd_calculation_components[pr][2]
667 rend=rmsd_calculation_components[pr][1]
668 rbegin=rmsd_calculation_components[pr][0]
670 ps=s.get_selected_particles()
671 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
672 rmsd_coordinate_dict[pr] = \
673 [list(map(float,
IMP.core.XYZ(i).get_coordinates()))
for i
in filtered_particles]
675 all_coordinates.append(model_coordinate_dict)
676 alignment_coordinates.append(template_coordinate_dict)
677 rmsd_coordinates.append(rmsd_coordinate_dict)
678 frame_name = rmf_file +
'|' + str(frame_number)
679 all_rmf_file_names.append(frame_name)
680 rmf_file_name_index_dict[frame_name] = tpl[4]
681 return all_coordinates,alignment_coordinates,rmsd_coordinates,rmf_file_name_index_dict,all_rmf_file_names
685 @param model The IMP model
686 @param rmf_tuple score,filename,frame number,original order number, rank
687 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
689 rmf_file = rmf_tuple[1]
690 frame_number = rmf_tuple[2]
692 prot = IMP.pmi.analysis.get_hier_from_rmf(model,
698 all_particles=[pp
for key
in part_dict
for pp
in part_dict[key]]
699 all_ps_set=set(all_particles)
701 rmsd_bead_size_dict={}
703 if rmsd_calculation_components
is not None:
704 for pr
in rmsd_calculation_components:
705 if type(rmsd_calculation_components[pr])
is str:
706 name=rmsd_calculation_components[pr]
708 elif type(rmsd_calculation_components[pr])
is tuple:
709 name=rmsd_calculation_components[pr][2]
710 rend=rmsd_calculation_components[pr][1]
711 rbegin=rmsd_calculation_components[pr][0]
713 ps=s.get_selected_particles()
714 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
715 rmsd_bead_size_dict[pr] = \
719 return rmsd_bead_size_dict
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, kernel::Model *m)
void save_frame(RMF::FileHandle file, unsigned int, std::string name="")
A class for reading stat files.
def parse_dssp
read dssp file, get SSEs.
def get_best_models
Given a list of stat files, read them all and find the best models.
A class for storing groups of crosslinks.
void load_frame(RMF::FileConstHandle file, unsigned int frame)
def add_range
Add some stuff to this subsequence.
A light class to store multiple not-necessarily-contiguous residue ranges.
def add_cross_link
Add a crosslink.
def __init__
Setup a CrossLinkData object.
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def __init__
Create subsequence and optionally pass the first contiguous range.
def get_selection
Return a list of atom pairs (particles) for this crosslink.
def __init__
Setup groups of subsequences.
def parse_xlinks_davis
Format from Trisha Davis.
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Group a bunch of subsequences with certain labels Use cases: storing lists of secondary structures fr...
A class to store the selection commands for a single crosslink.
Tools for clustering and cluster analysis.
def get_selection
Create an IMP Selection from this subsequence.
Classes for writing output files and processing them.
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.
def __init__
Add a crosslink.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Support for the RMF file format for storing hierarchical molecular data and markup.
def add_subsequence
Append a new cross-link to a given unique ID.