1 """@namespace IMP.pmi.io
2 Utility classes and functions for reading and storing PMI files
5 from __future__
import print_function
18 from collections
import defaultdict
21 """read dssp file, get SSEs. values are all PDB residue numbering.
22 Returns a SubsequenceData object containing labels helix, beta, loop.
23 Each one is a list of SelectionDictionaries
25 Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:
26 helix : [ [ {'chain':'A','residue_indexes': [5,6,7]} ] ]
27 beta : [ [ {'chain':'A','residue_indexes': [1,2,3]},
28 {'chain':'A','residue_indexes': [9,10,11]} ] ]
29 loop : same format as helix
34 loop_classes = [
' ',
'',
'T',
'S']
36 for h
in helix_classes:
38 for s
in strand_classes:
40 for l
in loop_classes:
47 beta_dict = defaultdict(list)
49 cur_sse = {
'chain':
'',
'residue_tuple':[-1,-1]}
51 for line
in open(dssp_fn,
'r'):
56 if fields[1] ==
"RESIDUE":
64 elif limit_to_chains !=
'' and line[11]
not in limit_to_chains:
69 pdb_res_num = int(line[5:10])
71 sstype = sse_dict[line[16]]
75 if prev_sstype
is None:
76 cur_sse = {
'chain':chain,
'residue_tuple':[pdb_res_num,pdb_res_num]}
77 elif sstype != prev_sstype
or chain_break:
79 if prev_sstype
in [
'helix',
'loop']:
80 sses.add_subsequence(prev_sstype,
Subsequence(**cur_sse))
81 elif prev_sstype ==
'beta':
82 beta_dict[prev_beta_id].append(cur_sse)
83 cur_sse = {
'chain':chain,
'residue_tuple':[pdb_res_num,pdb_res_num]}
85 cur_sse[
'residue_tuple'][1]=pdb_res_num
91 prev_beta_id = beta_id
94 if not prev_sstype
is None:
95 if prev_sstype
in [
'helix',
'loop']:
96 sses.add_subsequence(prev_sstype,
Subsequence(**cur_sse))
97 elif prev_sstype ==
'beta':
98 beta_dict[prev_beta_id].append(cur_sse)
100 for beta_sheet
in beta_dict:
102 for strand
in beta_dict[beta_sheet]:
103 seq.add_range(**strand)
104 sses.add_subsequence(
'beta',seq)
112 """ Format from Trisha Davis. Lines are:
113 ignore ignore seq1 seq2 >Name(res) >Name(res) score
114 @param data_fn The data file name
115 @param max_num Maximum number of XL to read (-1 is all)
116 @param name_map Dictionary mapping text file names to the molecule name
117 @param named_offsets Integer offsets to apply to the indexing in the file
118 Output is a CrossLinkData object containing SelectionDictionaries
120 [ { 'r1': {'molecule':'A','residue_index':5},
121 'r2': {'molecule':'B','residue_index':100},
123 { 'r1': {'molecule':'C','residue_index':63},
124 'r2': {'molecule':'D','residue_index':94},
129 inf=open(data_fn,
'r')
130 data=defaultdict(list)
133 for nl,l
in enumerate(inf):
134 if max_num==-1
or nl<max_num:
135 ig1,ig2,seq1,seq2,s1,s2,score=l.split()
137 m1=re.search(
r'>([\w-]+)\((\w+)\)',s1)
138 m2=re.search(
r'>([\w-]+)\((\w+)\)',s2)
144 if n1
in named_offsets:
145 r1+=named_offsets[n1]
151 if n2
in named_offsets:
152 r2+=named_offsets[n2]
153 key=tuple(sorted([
'%s.%i'%(n1,r1),
'%s.%i'%(n2,r2)]))
155 print(
'skipping duplicated xl',key)
158 data.add_cross_link(nl,
159 {
'molecule':n1,
'residue_index':r1},
160 {
'molecule':n2,
'residue_index':r2},
167 """ A light class to store multiple not-necessarily-contiguous residue ranges."""
168 def __init__(self,chain=None,molecule=None,residue_tuple=None,subsequences=None):
169 """Create subsequence and optionally pass the first contiguous range.
170 @param chain The chain ID
171 @param molecule The molecule name
172 @param residue_tuple PDB-style inclusive residue range
173 @param subsequences A list of other subsequences to combine (not implemented)
176 if chain
or molecule
or residue_tuple:
177 self.
add_range(chain,molecule,residue_tuple)
180 def add_range(self,chain=None,molecule=None,residue_tuple=None):
181 """Add some stuff to this subsequence
182 @param chain The chain ID
183 @param molecule The molecule name
184 @param residue_tuple PDB-style inclusive residue range
186 self.seqs.append({
'chain':chain,
'molecule':molecule,
'residue_tuple':residue_tuple})
187 def join(self,new_subsequence):
188 for s
in new_subsequence:
192 """Create an IMP Selection from this subsequence
193 @param hier An IMP hierarchy or list of them
194 \note any additional keyword arguments will be appended to the selection
196 for nseq,seq
in enumerate(self.seqs):
199 args[
'chain']=seq[
'chain']
201 args[
'molecule']=seq[
'molecule']
202 if seq[
'residue_tuple']:
203 args[
'residue_indexes']=list(range(seq[
'residue_tuple'][0],
204 seq[
'residue_tuple'][1]+1))
213 for nseq,seq
in enumerate(self.seqs):
215 if seq[
'residue_tuple']
is not None:
216 this_str.append(
'%i-%i'%(seq[
'residue_tuple'][0],seq[
'residue_tuple'][1]))
217 if seq[
'molecule']
is not None:
218 this_str.append(
'%s'%seq[
'molecule'])
219 if seq[
'chain']
is not None:
220 this_str.append(
'%s'%seq[
'chain'])
221 rep+=
'.'.join(this_str)
222 if nseq < len(self.seqs)-1:
226 def __getitem__(self,key):
227 return self.data[key]
230 return self.seqs.__iter__()
232 def __add__(self,other):
237 """ Group a bunch of subsequences with certain labels
238 Use cases: storing lists of secondary structures from DSSP or PSIPRED
239 storing lists of molecules that should be made symmetric
242 """Setup groups of subsequences
244 self.data=defaultdict(list)
247 """ Append a new cross-link to a given unique ID
248 @param label label for this subsequence (e.g., 'helix')
249 @param subsequence a Subsequence object to store under that label
250 e.g. Subsequence(chain='A',residue_tuple=(10,15))
252 if type(subsequence)
is not Subsequence:
253 raise InputError(
'must provide a subsequence object')
254 self.data[label].append(subsequence)
256 def __getitem__(self,key):
257 return self.data[key]
260 return self.data.__repr__()
263 return self.data.keys()
266 """A class to store the selection commands for a single crosslink.
268 def __init__(self,unique_id,r1,r2,score):
270 @param unique_id The id is used to group crosslinks that are alternatives
271 @param r1 A dictionary of selection keywords for the first residue
272 @param r2 A dictionary of selection keywards for the second residue
273 @param score A score that might be used later for filtering
274 @note The dictionaries can contain any Selection argument like
275 molecule or residue_index
277 self.unique_id = unique_id
283 return "CrossLink id: "+str(self.unique_id)+
" r1: "+repr(self.r1)+
", r2: "+repr(self.r2)
285 def intersects(self,other):
286 if self.r1==other.r1
or self.r1==other.r2
or self.r2==other.r1
or self.r2==other.r2:
292 """Return a list of atom pairs (particles) for this crosslink.
293 Found by selecting everything with r1 and r2 then returning the
295 @note you may want to provide some atom specifiers like
296 atom_type=IMP.atom.AtomType("CA")
297 @param mh The hierarchy to select from
298 @param kwargs Any additional selection arguments
307 raise Exception(
"this selection is empty",rsel1)
309 raise Exception(
"this selection is empty",rsel2)
312 # Check no repeating copy numbers....not sure if this is general
313 for s in (sel1,sel2):
314 idxs=[IMP.atom.get_copy_index(p) for p in s]
315 if len(idxs)!=len(set(idxs)):
316 raise Exception("this XL is selecting more than one particle per copy")
319 for p1,p2
in itertools.product(sel1,sel2):
324 """A class for storing groups of crosslinks.
325 Acts like a dictionary where keys are unique IDs and values are CrossLinks
326 Equivalent (using objects instead of dicts) to a data structure like so:
328 [ { 'r1': {'molecule':'A','residue_index':5},
329 'r2': {'molecule':'B','residue_index':100},
331 { 'r1': {'molecule':'C','residue_index':63},
332 'r2': {'molecule':'D','residue_index':94},
337 """Setup a CrossLinkData object"""
338 self.data = defaultdict(list)
340 """Add a crosslink. They are organized by their unique_ids.
341 @param unique_id The id is used to group crosslinks that are alternatives
342 @param kws1 A dictionary of selection keywords for the first residue
343 @param kws2 A dictionary of selection keywards for the second residue
344 @param score A score that might be used later for filtering
345 @note The dictionaries can contain any Selection argument like
346 molecule or residue_index
348 self.data[unique_id].append(
CrossLink(unique_id,kws1,kws2,score))
349 def copy_cross_link(self,xl):
350 if type(xl)
is not CrossLink:
351 raise Exception(
"CrossLinkData::copy_cross_link() requires a Crosslink object")
352 self.data[xl.unique_id].append(xl)
353 def __getitem__(self, key):
354 return self.data[key]
356 ret=
"CrossLinkData with these entries:\n"
358 for xl
in self.data[d]:
362 return self.data.keys()
364 return self.data.values()
365 def __contains__(self, item):
366 return item
in self.data
368 return iter(self.data)
370 return len(self.data)
375 number_of_best_scoring_models=10,
377 score_key=
"SimplifiedModel_Total_Score_None",
379 rmf_file_key=
"rmf_file",
380 rmf_file_frame_key=
"rmf_frame_index",
381 override_rmf_dir=
None):
382 """Given a list of stat files, read them all and find the best models.
383 Save to a single RMF along with a stat file.
384 @param mdl The IMP Model
385 @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
386 @param stat_files List of all stat files to collect
387 @param number_of_best_scoring_models Num best models to gather
388 @param get_every Skip frames
389 @param score_key Used for the ranking
390 @param feature_keys Keys to keep around
391 @param rmf_file_key The key that says RMF file name
392 @param rmf_file_frame_key The key that says RMF frame number
393 @param override_rmf_dir For output, change the name of the RMF directory (experiment)
398 from mpi4py
import MPI
399 comm = MPI.COMM_WORLD
400 rank = comm.Get_rank()
401 number_of_processes = comm.size
404 number_of_processes = 1
405 my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
406 stat_files,number_of_processes)[rank]
409 out_stat_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".out")
410 out_rmf_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".rmf3")
414 for nsf,sf
in enumerate(my_stat_files):
417 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
418 print(
"getting data from file %s" % sf)
420 all_keys = [score_key,
423 for k
in po.get_keys():
424 for fk
in feature_keys:
427 fields = po.get_fields(all_keys,
431 length_set = set([len(fields[f])
for f
in fields])
432 minlen = min(length_set)
435 if len(length_set) > 1:
436 print(
"get_best_models: the statfile is not synchronous")
437 minlen = min(length_set)
439 fields[f] = fields[f][0:minlen]
444 all_fields[k]+=fields[k]
446 if override_rmf_dir
is not None:
447 for i
in range(minlen):
448 all_fields[rmf_file_key][i]=os.path.join(
449 override_rmf_dir,os.path.basename(all_fields[rmf_file_key][i]))
452 if number_of_processes!=1:
455 comm.send(all_fields, dest=0, tag=11)
457 for i
in range(1,number_of_processes):
458 data_tmp = comm.recv(source=i, tag=11)
460 all_fields[k]+=data_tmp[k]
463 order = sorted(range(len(all_fields[score_key])),
464 key=
lambda i: float(all_fields[score_key][i]))
467 stat = open(out_stat_fn,
'w')
468 rh0 = RMF.open_rmf_file_read_only(
469 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][0]))
472 outf = RMF.create_rmf_file(out_rmf_fn)
474 for nm,i
in enumerate(order[:number_of_best_scoring_models]):
475 dline=dict((k,all_fields[k][i])
for k
in all_fields)
476 dline[
'orig_rmf_file']=dline[rmf_file_key]
477 dline[
'orig_rmf_frame_index']=dline[rmf_file_frame_key]
478 dline[rmf_file_key]=out_rmf_fn
479 dline[rmf_file_frame_key]=nm
480 rh = RMF.open_rmf_file_read_only(
481 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][i]))
484 RMF.FrameID(all_fields[rmf_file_frame_key][i]))
487 stat.write(str(dline)+
'\n')
489 print(
'wrote stats to',out_stat_fn)
490 print(
'wrote rmfs to',out_rmf_fn)
496 score_key=
"SimplifiedModel_Total_Score_None",
498 rmf_file_key=
"rmf_file",
499 rmf_file_frame_key=
"rmf_frame_index",
502 """ Given a list of stat files, read them all and find the best models.
503 Returns the best rmf filenames, frame numbers, scores, and values for feature keywords
506 rmf_file_frame_list=[]
508 feature_keyword_list_dict=defaultdict(list)
509 for sf
in stat_files:
510 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
511 print(
"getting data from file %s" % sf)
513 keywords = po.get_keys()
515 feature_keywords = [score_key,
520 for fk
in feature_keys:
522 feature_keywords.append(k)
524 if prefiltervalue
is None:
525 fields = po.get_fields(feature_keywords,
528 fields = po.get_fields(feature_keywords,
529 filtertuple=(score_key,
"<",prefiltervalue),
535 length_set.add(len(fields[f]))
539 if len(length_set) > 1:
540 print(
"get_best_models: the statfile is not synchronous")
541 minlen = min(length_set)
543 fields[f] = fields[f][0:minlen]
546 score_list += fields[score_key]
547 for rmf
in fields[rmf_file_key]:
548 rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
550 rmf_file_frame_list += fields[rmf_file_frame_key]
552 if feature_keywords
is not None:
553 for k
in feature_keywords:
554 feature_keyword_list_dict[k] += fields[k]
556 return rmf_file_list,rmf_file_frame_list,score_list,feature_keyword_list_dict
559 score_key=
"SimplifiedModel_Total_Score_None",
560 rmf_file_key=
"rmf_file",
561 rmf_file_frame_key=
"rmf_frame_index",
563 """ Given a list of stat files, read them all and find a trajectory of models.
564 Returns the rmf filenames, frame numbers, scores, and values for feature keywords
567 rmf_file_frame_list=[]
569 for sf
in stat_files:
570 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
571 print(
"getting data from file %s" % sf)
573 keywords = po.get_keys()
575 feature_keywords = [score_key,
579 fields = po.get_fields(feature_keywords,
585 length_set.add(len(fields[f]))
589 if len(length_set) > 1:
590 print(
"get_best_models: the statfile is not synchronous")
591 minlen = min(length_set)
593 fields[f] = fields[f][0:minlen]
596 score_list += fields[score_key]
597 for rmf
in fields[rmf_file_key]:
598 rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
600 rmf_file_frame_list += fields[rmf_file_frame_key]
602 return rmf_file_list,rmf_file_frame_list,score_list
607 alignment_components=
None,
608 rmsd_calculation_components=
None,
610 """ Read in coordinates of a set of RMF tuples.
611 Returns the coordinates split as requested (all, alignment only, rmsd only) as well as
612 RMF file names (as keys in a dictionary, with values being the rank number) and just a plain list
613 @param model The IMP model
614 @param rmf_tuples [score,filename,frame number,original order number, rank]
615 @param alignment_components Tuples to specify what you're aligning on
616 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
619 rmsd_coordinates = []
620 alignment_coordinates = []
621 all_rmf_file_names = []
622 rmf_file_name_index_dict = {}
624 for cnt, tpl
in enumerate(rmf_tuples):
626 frame_number = tpl[2]
629 prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
633 IMP.pmi.analysis.link_hiers_to_rmf(model,prots,frame_number,rmf_file)
638 prot=prots[state_number]
641 all_particles=[pp
for key
in part_dict
for pp
in part_dict[key]]
642 all_ps_set=set(all_particles)
644 model_coordinate_dict = {}
645 template_coordinate_dict={}
646 rmsd_coordinate_dict={}
648 model_coordinate_dict[pr] = np.array(
649 [np.array(
IMP.core.XYZ(i).get_coordinates())
for i
in part_dict[pr]])
651 if alignment_components
is not None:
652 for pr
in alignment_components:
653 if type(alignment_components[pr])
is str:
654 name=alignment_components[pr]
656 elif type(alignment_components[pr])
is tuple:
657 name=alignment_components[pr][2]
658 rend=alignment_components[pr][1]
659 rbegin=alignment_components[pr][0]
661 ps=s.get_selected_particles()
662 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
663 template_coordinate_dict[pr] = \
664 [list(map(float,
IMP.core.XYZ(i).get_coordinates()))
for i
in filtered_particles]
666 if rmsd_calculation_components
is not None:
667 for pr
in rmsd_calculation_components:
668 if type(rmsd_calculation_components[pr])
is str:
669 name=rmsd_calculation_components[pr]
671 elif type(rmsd_calculation_components[pr])
is tuple:
672 name=rmsd_calculation_components[pr][2]
673 rend=rmsd_calculation_components[pr][1]
674 rbegin=rmsd_calculation_components[pr][0]
676 ps=s.get_selected_particles()
677 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
678 rmsd_coordinate_dict[pr] = \
679 [list(map(float,
IMP.core.XYZ(i).get_coordinates()))
for i
in filtered_particles]
681 all_coordinates.append(model_coordinate_dict)
682 alignment_coordinates.append(template_coordinate_dict)
683 rmsd_coordinates.append(rmsd_coordinate_dict)
684 frame_name = rmf_file +
'|' + str(frame_number)
685 all_rmf_file_names.append(frame_name)
686 rmf_file_name_index_dict[frame_name] = tpl[4]
687 return all_coordinates,alignment_coordinates,rmsd_coordinates,rmf_file_name_index_dict,all_rmf_file_names
689 def get_bead_sizes(model,rmf_tuple,rmsd_calculation_components=None,state_number=0):
691 @param model The IMP model
692 @param rmf_tuple score,filename,frame number,original order number, rank
693 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
695 rmf_file = rmf_tuple[1]
696 frame_number = rmf_tuple[2]
698 prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
702 prot=prots[state_number]
706 all_particles=[pp
for key
in part_dict
for pp
in part_dict[key]]
707 all_ps_set=set(all_particles)
709 rmsd_bead_size_dict={}
711 if rmsd_calculation_components
is not None:
712 for pr
in rmsd_calculation_components:
713 if type(rmsd_calculation_components[pr])
is str:
714 name=rmsd_calculation_components[pr]
716 elif type(rmsd_calculation_components[pr])
is tuple:
717 name=rmsd_calculation_components[pr][2]
718 rend=rmsd_calculation_components[pr][1]
719 rbegin=rmsd_calculation_components[pr][0]
721 ps=s.get_selected_particles()
722 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
723 rmsd_bead_size_dict[pr] = \
727 return rmsd_bead_size_dict
A class for reading stat files.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def parse_dssp
read dssp file, get SSEs.
def get_best_models
Given a list of stat files, read them all and find the best models.
A class for storing groups of crosslinks.
def add_range
Add some stuff to this subsequence.
A light class to store multiple not-necessarily-contiguous residue ranges.
def add_cross_link
Add a crosslink.
def __init__
Setup a CrossLinkData object.
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
def __init__
Create subsequence and optionally pass the first contiguous range.
def get_selection
Return a list of atom pairs (particles) for this crosslink.
def __init__
Setup groups of subsequences.
def parse_xlinks_davis
Format from Trisha Davis.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Group a bunch of subsequences with certain labels Use cases: storing lists of secondary structures fr...
A class to store the selection commands for a single crosslink.
Tools for clustering and cluster analysis.
def get_selection
Create an IMP Selection from this subsequence.
Classes for writing output files and processing them.
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.
def __init__
Add a crosslink.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Support for the RMF file format for storing hierarchical molecular data and markup.
def add_subsequence
Append a new cross-link to a given unique ID.