1 """@namespace IMP.pmi.io
2 Utility classes and functions for reading and storing PMI files
5 from __future__
import print_function
18 from collections
import defaultdict
22 """read dssp file, get SSEs. values are all PDB residue numbering.
23 Returns a dictionary with keys helix, beta, loop
24 Each contains a list of SSEs.
25 Each SSE is a list of elements (e.g. strands in a sheet)
26 Each element is a pair (chain,(residue_indexes))
28 Example for a structure with helix A:5-7 and Beta strands A:1-3,A:9-11:
29 ret = { 'helix' : [ [ ('A',5,7 ) ],... ]
30 'beta' : [ [ ('A',1,3),
32 'loop' : same format as helix
38 loop_classes = [
' ',
'',
'T',
'S']
40 for h
in helix_classes:
42 for s
in strand_classes:
44 for l
in loop_classes:
54 beta_dict = defaultdict(list)
58 for line
in open(dssp_fn,
'r'):
63 if fields[1] ==
"RESIDUE":
71 elif limit_to_chains !=
'' and line[11]
not in limit_to_chains:
76 pdb_res_num = int(line[5:10])
78 sstype = sse_dict[line[16]]
82 if prev_sstype
is None:
83 cur_sse = [chain,pdb_res_num,pdb_res_num]
84 elif sstype != prev_sstype
or chain_break:
86 if prev_sstype
in [
'helix',
'loop']:
87 sses[prev_sstype].append([cur_sse])
88 elif prev_sstype ==
'beta':
89 beta_dict[prev_beta_id].append(cur_sse)
90 cur_sse = [chain,pdb_res_num,pdb_res_num]
92 cur_sse[2] = pdb_res_num
98 prev_beta_id = beta_id
101 if not prev_sstype
is None:
102 if prev_sstype
in [
'helix',
'loop']:
103 sses[prev_sstype].append([cur_sse])
104 elif prev_sstype ==
'beta':
105 beta_dict[prev_beta_id].append(cur_sse)
107 for beta_sheet
in beta_dict:
108 sses[
'beta'].append(beta_dict[beta_sheet])
114 number_of_best_scoring_models=10,
116 score_key=
"SimplifiedModel_Total_Score_None",
118 rmf_file_key=
"rmf_file",
119 rmf_file_frame_key=
"rmf_frame_index",
120 override_rmf_dir=
None):
121 """Given a list of stat files, read them all and find the best models.
122 Save to a single RMF along with a stat file.
123 @param mdl The IMP Model
124 @param out_dir The output directory. Will save 3 files (RMF, stat, summary)
125 @param stat_files List of all stat files to collect
126 @param number_of_best_scoring_models Num best models to gather
127 @param get_every Skip frames
128 @param score_key Used for the ranking
129 @param feature_keys Keys to keep around
130 @param rmf_file_key The key that says RMF file name
131 @param rmf_file_frame_key The key that says RMF frame number
132 @param override_rmf_dir For output, change the name of the RMF directory (experiment)
137 from mpi4py
import MPI
138 comm = MPI.COMM_WORLD
139 rank = comm.Get_rank()
140 number_of_processes = comm.size
143 number_of_processes = 1
144 my_stat_files=IMP.pmi.tools.chunk_list_into_segments(
145 stat_files,number_of_processes)[rank]
148 out_stat_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".out")
149 out_rmf_fn = os.path.join(out_dir,
"top_"+str(number_of_best_scoring_models)+
".rmf3")
153 for nsf,sf
in enumerate(my_stat_files):
156 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
157 print(
"getting data from file %s" % sf)
159 all_keys = [score_key,
162 for k
in po.get_keys():
163 for fk
in feature_keys:
166 fields = po.get_fields(all_keys,
170 length_set = set([len(fields[f])
for f
in fields])
171 minlen = min(length_set)
174 if len(length_set) > 1:
175 print(
"get_best_models: the statfile is not synchronous")
176 minlen = min(length_set)
178 fields[f] = fields[f][0:minlen]
183 all_fields[k]+=fields[k]
185 if override_rmf_dir
is not None:
186 for i
in range(minlen):
187 all_fields[rmf_file_key][i]=os.path.join(
188 override_rmf_dir,os.path.basename(all_fields[rmf_file_key][i]))
191 if number_of_processes!=1:
194 comm.send(all_fields, dest=0, tag=11)
196 for i
in range(1,number_of_processes):
197 data_tmp = comm.recv(source=i, tag=11)
199 all_fields[k]+=data_tmp[k]
202 order = sorted(range(len(all_fields[score_key])),
203 key=
lambda i: float(all_fields[score_key][i]))
206 stat = open(out_stat_fn,
'w')
207 rh0 = RMF.open_rmf_file_read_only(
208 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][0]))
211 outf = RMF.create_rmf_file(out_rmf_fn)
213 for nm,i
in enumerate(order[:number_of_best_scoring_models]):
214 dline=dict((k,all_fields[k][i])
for k
in all_fields)
215 dline[
'orig_rmf_file']=dline[rmf_file_key]
216 dline[
'orig_rmf_frame_index']=dline[rmf_file_frame_key]
217 dline[rmf_file_key]=out_rmf_fn
218 dline[rmf_file_frame_key]=nm
219 rh = RMF.open_rmf_file_read_only(
220 os.path.join(root_directory_of_stat_file,all_fields[rmf_file_key][i]))
223 RMF.FrameID(all_fields[rmf_file_frame_key][i]))
226 stat.write(str(dline)+
'\n')
228 print(
'wrote stats to',out_stat_fn)
229 print(
'wrote rmfs to',out_rmf_fn)
235 score_key=
"SimplifiedModel_Total_Score_None",
237 rmf_file_key=
"rmf_file",
238 rmf_file_frame_key=
"rmf_frame_index",
241 """ Given a list of stat files, read them all and find the best models.
242 Returns the best rmf filenames, frame numbers, scores, and values for feature keywords
245 rmf_file_frame_list=[]
247 feature_keyword_list_dict=defaultdict(list)
248 for sf
in stat_files:
249 root_directory_of_stat_file = os.path.dirname(os.path.dirname(os.path.abspath(sf)))
250 print(
"getting data from file %s" % sf)
254 keywords = po.get_keys()
258 feature_keywords = [score_key,
263 for fk
in feature_keys:
265 feature_keywords.append(k)
267 if prefiltervalue
is None:
268 fields = po.get_fields(feature_keywords,
271 fields = po.get_fields(feature_keywords,
272 filtertuple=(score_key,
"<",prefiltervalue),
278 length_set.add(len(fields[f]))
282 if len(length_set) > 1:
283 print(
"get_best_models: the statfile is not synchronous")
284 minlen = min(length_set)
286 fields[f] = fields[f][0:minlen]
289 score_list += fields[score_key]
290 for rmf
in fields[rmf_file_key]:
291 rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
293 rmf_file_frame_list += fields[rmf_file_frame_key]
295 if feature_keywords
is not None:
296 for k
in feature_keywords:
297 feature_keyword_list_dict[k] += fields[k]
299 return rmf_file_list,rmf_file_frame_list,score_list,feature_keyword_list_dict
302 score_key=
"SimplifiedModel_Total_Score_None",
303 rmf_file_key=
"rmf_file",
304 rmf_file_frame_key=
"rmf_frame_index",
306 """ Given a list of stat files, read them all and find a trajectory of models.
307 Returns the rmf filenames, frame numbers, scores, and values for feature keywords
310 rmf_file_frame_list=[]
312 for sf
in stat_files:
313 root_directory_of_stat_file = os.path.dirname(os.path.dirname(sf))
314 print(
"getting data from file %s" % sf)
316 keywords = po.get_keys()
318 feature_keywords = [score_key,
322 fields = po.get_fields(feature_keywords,
328 length_set.add(len(fields[f]))
332 if len(length_set) > 1:
333 print(
"get_best_models: the statfile is not synchronous")
334 minlen = min(length_set)
336 fields[f] = fields[f][0:minlen]
339 score_list += fields[score_key]
340 for rmf
in fields[rmf_file_key]:
341 rmf_file_list.append(os.path.join(root_directory_of_stat_file,rmf))
343 rmf_file_frame_list += fields[rmf_file_frame_key]
345 return rmf_file_list,rmf_file_frame_list,score_list
350 alignment_components=
None,
351 rmsd_calculation_components=
None,
353 """ Read in coordinates of a set of RMF tuples.
354 Returns the coordinates split as requested (all, alignment only, rmsd only) as well as
355 RMF file names (as keys in a dictionary, with values being the rank number) and just a plain list
356 @param model The IMP model
357 @param rmf_tuples [score,filename,frame number,original order number, rank]
358 @param alignment_components Tuples to specify what you're aligning on
359 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
362 rmsd_coordinates = []
363 alignment_coordinates = []
364 all_rmf_file_names = []
365 rmf_file_name_index_dict = {}
367 for cnt, tpl
in enumerate(rmf_tuples):
369 frame_number = tpl[2]
371 prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
375 IMP.pmi.analysis.link_hiers_to_rmf(model,prots,frame_number,rmf_file)
380 states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
381 prot = states[state_number]
383 prot = prots[state_number]
387 all_particles = [pp
for key
in part_dict
for pp
in part_dict[key]]
388 all_ps_set = set(all_particles)
389 model_coordinate_dict = {}
390 template_coordinate_dict={}
391 rmsd_coordinate_dict={}
394 model_coordinate_dict[pr] = np.array(
395 [np.array(
IMP.core.XYZ(i).get_coordinates())
for i
in part_dict[pr]])
396 for tuple_dict,result_dict
in zip((alignment_components,rmsd_calculation_components),
397 (template_coordinate_dict,rmsd_coordinate_dict)):
399 if tuple_dict
is None:
404 for pr
in tuple_dict:
405 if type(tuple_dict[pr])
is str:
406 name = tuple_dict[pr]
408 elif type(tuple_dict[pr])
is tuple:
409 rend = tuple_dict[pr][1]
410 rbegin = tuple_dict[pr][0]
412 residue_indexes=range(rbegin,rend+1))
413 result_dict[pr] = [list(map(float,
IMP.core.XYZ(p).get_coordinates()))
414 for p
in s.get_selected_particles()]
416 for pr
in tuple_dict:
417 if type(tuple_dict[pr])
is str:
420 elif type(tuple_dict[pr])
is tuple:
421 name=tuple_dict[pr][2]
422 rend=tuple_dict[pr][1]
423 rbegin=tuple_dict[pr][0]
425 ps=s.get_selected_particles()
426 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
428 [list(map(float,
IMP.core.XYZ(p).get_coordinates()))
for p
in filtered_particles]
430 all_coordinates.append(model_coordinate_dict)
431 alignment_coordinates.append(template_coordinate_dict)
432 rmsd_coordinates.append(rmsd_coordinate_dict)
433 frame_name = rmf_file +
'|' + str(frame_number)
434 all_rmf_file_names.append(frame_name)
435 rmf_file_name_index_dict[frame_name] = tpl[4]
436 return all_coordinates,alignment_coordinates,rmsd_coordinates,rmf_file_name_index_dict,all_rmf_file_names
438 def get_bead_sizes(model,rmf_tuple,rmsd_calculation_components=None,state_number=0):
440 @param model The IMP model
441 @param rmf_tuple score,filename,frame number,original order number, rank
442 @param rmsd_calculation_components Tuples to specify what components are used for RMSD calc
444 if rmsd_calculation_components
is None:
447 rmf_file = rmf_tuple[1]
448 frame_number = rmf_tuple[2]
449 prots = IMP.pmi.analysis.get_hiers_from_rmf(model,
454 states = IMP.atom.get_by_type(prots[0],IMP.atom.STATE_TYPE)
455 prot = states[state_number]
457 prot = prots[state_number]
459 rmsd_bead_size_dict = {}
463 for pr
in rmsd_calculation_components:
464 if type(rmsd_calculation_components[pr])
is str:
465 name = rmsd_calculation_components[pr]
467 elif type(rmsd_calculation_components[pr])
is tuple:
468 rend = rmsd_calculation_components[pr][1]
469 rbegin = rmsd_calculation_components[pr][0]
471 residue_indexes=range(rbegin,rend+1))
473 for p
in s.get_selected_particles()]
477 all_particles=[pp
for key
in part_dict
for pp
in part_dict[key]]
478 all_ps_set=set(all_particles)
481 for pr
in rmsd_calculation_components:
482 if type(rmsd_calculation_components[pr])
is str:
483 name=rmsd_calculation_components[pr]
485 elif type(rmsd_calculation_components[pr])
is tuple:
486 name=rmsd_calculation_components[pr][2]
487 rend=rmsd_calculation_components[pr][1]
488 rbegin=rmsd_calculation_components[pr][0]
490 ps=s.get_selected_particles()
491 filtered_particles=[p
for p
in ps
if p
in all_ps_set]
492 rmsd_bead_size_dict[pr] = \
495 return rmsd_bead_size_dict
498 """A helper output based on dist to initial coordinates"""
499 def __init__(self,ps,label,init_coords=None):
500 self.mdl = ps[0].get_model()
502 if init_coords
is None:
503 self.init_coords = [
IMP.core.XYZ(p).get_coordinates()
for p
in self.ps]
505 self.init_coords = init_coords
507 def get_output(self):
510 coords = [
IMP.core.XYZ(p).get_coordinates()
for p
in self.ps]
512 output[
"RMSD_"+self.label] = str(rmsd)
516 """A helper output for model evaluation"""
517 def __init__(self,mdl):
520 def get_output(self):
522 score = self.rs.evaluate(
False)
524 output[
"Total_Score"] = str(score)
A class for reading stat files.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def parse_dssp
read dssp file, get SSEs.
def get_best_models
Given a list of stat files, read them all and find the best models.
A helper output for model evaluation.
A helper output based on dist to initial coordinates.
def get_trajectory_models
Given a list of stat files, read them all and find a trajectory of models.
double get_rmsd(const Vector3DsOrXYZs0 &m1, const Vector3DsOrXYZs1 &m2)
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
A decorator for a particle with x,y,z coordinates.
def get_particles_at_resolution_one
Get particles at res 1, or any beads, based on the name.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Tools for clustering and cluster analysis.
bool get_is_canonical(atom::Hierarchy h)
Walk up a PMI2 hierarchy/representations and check if the root is named System.
Classes for writing output files and processing them.
def save_best_models
Given a list of stat files, read them all and find the best models.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
void link_hierarchies(RMF::FileConstHandle fh, const atom::Hierarchies &hs)
def read_coordinates_of_rmfs
Read in coordinates of a set of RMF tuples.
Python classes to represent, score, sample and analyze models.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Support for the RMF file format for storing hierarchical molecular data and markup.