IMP logo
IMP Reference Guide  2.8.0
The Integrative Modeling Platform
crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 from __future__ import print_function
8 import IMP
9 import IMP.pmi
10 import IMP.atom
11 import IMP.core
12 import IMP.algebra
13 import IMP.rmf
14 import RMF
15 import IMP.display
16 import operator
17 import math
18 import sys
19 
20 # json default serializations
21 def set_json_default(obj):
22  if isinstance(obj, set):
23  return list(obj)
24  if isinstance(obj, IMP.pmi.topology.Molecule):
25  return str(obj)
26  raise TypeError
27 
28 # Handle and return data that must be a string
29 if sys.version_info[0] >= 3:
30  def _handle_string_input(inp):
31  if not isinstance(inp, str):
32  raise TypeError("expecting a string")
33  return inp
34 else:
35  def _handle_string_input(inp):
36  if not isinstance(inp, (str, unicode)):
37  raise TypeError("expecting a string or unicode")
38  # Coerce to non-unicode representation (str)
39  if isinstance(inp, unicode):
40  return str(inp)
41  else:
42  return inp
43 
44 class _CrossLinkDataBaseStandardKeys(object):
45  '''
46  This class setup all the standard keys needed to
47  identify the crosslink features from the data sets
48  '''
49  def __init__(self):
50  self.type={}
51  self.protein1_key="Protein1"
52  self.type[self.protein1_key]=str
53  self.protein2_key="Protein2"
54  self.type[self.protein2_key]=str
55  self.residue1_key="Residue1"
56  self.type[self.residue1_key]=int
57  self.residue2_key="Residue2"
58  self.type[self.residue2_key]=int
59  self.residue1_amino_acid_key="Residue1AminoAcid"
60  self.type[self.residue1_amino_acid_key]=str
61  self.residue2_amino_acid_key="Residue2AminoAcid"
62  self.type[self.residue2_amino_acid_key]=str
63  self.residue1_moiety_key="Residue1Moiety"
64  self.type[self.residue1_moiety_key]=str
65  self.residue2_moiety_key="Residue2Moiety"
66  self.type[self.residue2_moiety_key]=str
67  self.site_pairs_key="SitePairs"
68  self.type[self.site_pairs_key]=str
69  self.unique_id_key="XLUniqueID"
70  self.type[self.unique_id_key]=str
71  self.unique_sub_index_key="XLUniqueSubIndex"
72  self.type[self.unique_sub_index_key]=int
73  self.unique_sub_id_key="XLUniqueSubID"
74  self.type[self.unique_sub_id_key]=str
75  self.data_set_name_key="DataSetName"
76  self.type[self.data_set_name_key]=str
77  self.cross_linker_chemical_key="CrossLinkerChemical"
78  self.type[self.cross_linker_chemical_key]=str
79  self.id_score_key="IDScore"
80  self.type[self.id_score_key]=float
81  self.fdr_key="FDR"
82  self.type[self.fdr_key]=float
83  self.quantitation_key="Quantitation"
84  self.type[self.quantitation_key]=float
85  self.redundancy_key="Redundancy"
86  self.type[self.redundancy_key]=int
87  self.redundancy_list_key="RedundancyList"
88  self.type[self.redundancy_key]=list
89  self.ambiguity_key="Ambiguity"
90  self.type[self.ambiguity_key]=int
91  self.residue1_links_number_key="Residue1LinksNumber"
92  self.type[self.residue1_links_number_key]=int
93  self.residue2_links_number_key="Residue2LinksNumber"
94  self.type[self.residue2_links_number_key]=int
95  self.type[self.ambiguity_key]=int
96  self.state_key="State"
97  self.type[self.state_key]=int
98  self.sigma1_key="Sigma1"
99  self.type[self.sigma1_key]=str
100  self.sigma2_key="Sigma2"
101  self.type[self.sigma2_key]=str
102  self.psi_key="Psi"
103  self.type[self.psi_key]=str
104  self.distance_key="Distance"
105  self.type[self.distance_key]=float
106  self.min_ambiguous_distance_key="MinAmbiguousDistance"
107  self.type[self.distance_key]=float
108  #link types are either Monolink, Intralink or Interlink
109  self.link_type_key="LinkType"
110  self.type[self.link_type_key]=str
111 
112  self.ordered_key_list =[self.data_set_name_key,
113  self.unique_id_key,
114  self.unique_sub_index_key,
115  self.unique_sub_id_key,
116  self.protein1_key,
117  self.protein2_key,
118  self.residue1_key,
119  self.residue2_key,
120  self.residue1_amino_acid_key,
121  self.residue2_amino_acid_key,
122  self.residue1_moiety_key,
123  self.residue2_moiety_key,
124  self.cross_linker_chemical_key,
125  self.id_score_key,
126  self.fdr_key,
127  self.quantitation_key,
128  self.redundancy_key,
129  self.redundancy_list_key,
130  self.state_key,
131  self.sigma1_key,
132  self.sigma2_key,
133  self.psi_key,
134  self.distance_key,
135  self.min_ambiguous_distance_key,
136  self.link_type_key]
137 
138 
139 class _ProteinsResiduesArray(tuple):
140  '''
141  This class is inherits from tuple, and it is a shorthand for a cross-link
142  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
143  residue1 and residue2.
144  '''
145 
146  def __new__(self,input_data):
147  '''
148  @input_data can be a dict or a tuple
149  '''
150  self.cldbsk=_CrossLinkDataBaseStandardKeys()
151  if type(input_data) is dict:
152  monolink=False
153  p1=input_data[self.cldbsk.protein1_key]
154  try:
155  p2=input_data[self.cldbsk.protein2_key]
156  except KeyError:
157  monolink=True
158  r1=input_data[self.cldbsk.residue1_key]
159  try:
160  r2=input_data[self.cldbsk.residue2_key]
161  except KeyError:
162  monolink=True
163  if not monolink:
164  t=(p1,p2,r1,r2)
165  else:
166  t=(p1,"",r1,None)
167  elif type(input_data) is tuple:
168  if len(input_data)>4 or len(input_data)==3 or len(input_data)==1:
169  raise TypeError("_ProteinsResiduesArray: must have only 4 elements")
170  if len(input_data)==4:
171  p1 = _handle_string_input(input_data[0])
172  p2 = _handle_string_input(input_data[1])
173  r1=input_data[2]
174  r2=input_data[3]
175  if (not (type(r1) is int)) and (not (r1 is None)):
176  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
177  if (not (type(r2) is int)) and (not (r1 is None)):
178  raise TypeError("_ProteinsResiduesArray: residue2 must be a integer")
179  t=(p1,p2,r1,r2)
180  if len(input_data) == 2:
181  p1 = _handle_string_input(input_data[0])
182  r1 = input_data[1]
183  if type(r1) is not int:
184  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
185  t = (p1,"",r1,None)
186  else:
187  raise TypeError("_ProteinsResiduesArray: input must be a dict or tuple")
188  return tuple.__new__(_ProteinsResiduesArray, t)
189 
190  def get_inverted(self):
191  '''
192  Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
193  '''
194  return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
195 
196  def __str__(self):
197  outstr=self.cldbsk.protein1_key+" "+str(self[0])
198  outstr+=" "+self.cldbsk.protein2_key+" "+str(self[1])
199  outstr+=" "+self.cldbsk.residue1_key+" "+str(self[2])
200  outstr+=" "+self.cldbsk.residue2_key+" "+str(self[3])
201  return outstr
202 
203 class FilterOperator(object):
204  '''
205  This class allows to create filter functions that can be passed to the CrossLinkDataBase
206  in this way:
207 
208  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
209 
210  where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
211 
212  A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
213 
214  fo.evaluate(xl)
215 
216  and it is used to filter the database
217  '''
218 
219  def __init__(self, argument1, operator, argument2):
220  '''
221  (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
222  or (FilterOperator1,operator.or|and...,FilterOperator2)
223  '''
224  if isinstance(argument1, FilterOperator):
225  self.operations = [argument1, operator, argument2]
226  else:
227  self.operations = []
228  self.values = (argument1, operator, argument2)
229 
230  def __or__(self, FilterOperator2):
231  return FilterOperator(self, operator.or_, FilterOperator2)
232 
233  def __and__(self, FilterOperator2):
234  return FilterOperator(self, operator.and_, FilterOperator2)
235 
236  def __invert__(self):
237  return FilterOperator(self, operator.not_, None)
238 
239  def evaluate(self, xl_item):
240 
241  if len(self.operations) == 0:
242  keyword, operator, value = self.values
243  return operator(xl_item[keyword], value)
244  FilterOperator1, op, FilterOperator2 = self.operations
245 
246  if FilterOperator2 is None:
247  return op(FilterOperator1.evaluate(xl_item))
248  else:
249  return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
250 
251 '''
252 def filter_factory(xl_):
253 
254  class FilterOperator(object):
255  import operator
256  xl = xl_
257 
258  def __new__(self,key,value,oper=operator.eq):
259  return oper(self.xl[key],value)
260 
261  return FilterOperator
262 '''
263 
264 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
265  '''
266  This class is needed to convert the keywords from a generic database
267  to the standard ones
268  '''
269 
270  def __init__(self, list_parser=None):
271  '''
272  @param list_parser an instance of ResiduePairListParser, if any is needed
273  '''
274  self.converter={}
275  self.backward_converter={}
276  _CrossLinkDataBaseStandardKeys.__init__(self)
277  self.rplp = list_parser
278  if self.rplp is None:
279  # either you have protein1, protein2, residue1, residue2
280  self.compulsory_keys=set([self.protein1_key,
281  self.protein2_key,
282  self.residue1_key,
283  self.residue2_key])
284  else:
285  self.compulsory_keys=self.rplp.get_compulsory_keys()
286  self.setup_keys=set()
287 
288  def check_keys(self):
289  '''
290  Is a function that check whether necessary keys are setup
291  '''
292  setup_keys=set(self.get_setup_keys())
293  if self.compulsory_keys & setup_keys != self.compulsory_keys:
294  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
295 
296  def get_setup_keys(self):
297  '''
298  Returns the keys that have been setup so far
299  '''
300  return self.backward_converter.keys()
301 
302  def set_standard_keys(self):
303  """
304  This sets up the standard compulsory keys as defined by
305  _CrossLinkDataBaseStandardKeys
306  """
307  for ck in self.compulsory_keys:
308  self.converter[ck]=ck
309  self.backward_converter[ck]=ck
310 
311  def set_unique_id_key(self,origin_key):
312  self.converter[origin_key]=self.unique_id_key
313  self.backward_converter[self.unique_id_key]=origin_key
314 
315  def set_protein1_key(self,origin_key):
316  self.converter[origin_key]=self.protein1_key
317  self.backward_converter[self.protein1_key]=origin_key
318 
319  def set_protein2_key(self,origin_key):
320  self.converter[origin_key]=self.protein2_key
321  self.backward_converter[self.protein2_key]=origin_key
322 
323  def set_residue1_key(self,origin_key):
324  self.converter[origin_key]=self.residue1_key
325  self.backward_converter[self.residue1_key]=origin_key
326 
327  def set_residue2_key(self,origin_key):
328  self.converter[origin_key]=self.residue2_key
329  self.backward_converter[self.residue2_key]=origin_key
330 
331  def set_residue1_amino_acid_key(self, origin_key):
332  self.converter[origin_key] = self.residue1_amino_acid_key
333  self.backward_converter[self.residue1_amino_acid_key] = origin_key
334 
335  def set_residue2_amino_acid_key(self, origin_key):
336  self.converter[origin_key] = self.residue2_amino_acid_key
337  self.backward_converter[self.residue2_amino_acid_key] = origin_key
338 
339  def set_residue1_moiety_key(self, origin_key):
340  self.converter[origin_key] = self.residue1_moiety_key
341  self.backward_converter[self.residue1_moiety_key] = origin_key
342 
343  def set_residue2_moiety_key(self, origin_key):
344  self.converter[origin_key] = self.residue2_moiety_key
345  self.backward_converter[self.residue2_moiety_key] = origin_key
346 
347  def set_site_pairs_key(self,origin_key):
348  self.converter[origin_key]=self.site_pairs_key
349  self.backward_converter[self.site_pairs_key]=origin_key
350 
351  def set_id_score_key(self,origin_key):
352  self.converter[origin_key]=self.id_score_key
353  self.backward_converter[self.id_score_key]=origin_key
354 
355  def set_fdr_key(self,origin_key):
356  self.converter[origin_key]=self.fdr_key
357  self.backward_converter[self.fdr_key]=origin_key
358 
359  def set_quantitation_key(self,origin_key):
360  self.converter[origin_key]=self.quantitation_key
361  self.backward_converter[self.quantitation_key]=origin_key
362 
363  def set_psi_key(self,origin_key):
364  self.converter[origin_key]=self.psi_key
365  self.backward_converter[self.psi_key]=origin_key
366 
367  def set_link_type_key(self,link_type_key):
368  self.converter[link_type_key]=self.link_type_key
369  self.backward_converter[self.link_type_key]=link_type_key
370 
371  def get_converter(self):
372  '''
373  Returns the dictionary that convert the old keywords to the new ones
374  '''
375  self.check_keys()
376  return self.converter
377 
379  '''
380  Returns the dictionary that convert the new keywords to the old ones
381  '''
382  self.check_keys()
383  return self.backward_converter
384 
385 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
386  '''
387  A class to handle different styles of site pairs parsers.
388  Implemented styles:
389  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
390  [Y3-;K4-] for dead-ends
391  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
392  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
393  '''
394 
395  import re
396 
397  def __init__(self,style):
398 
399  _CrossLinkDataBaseStandardKeys.__init__(self)
400  if style is "MSSTUDIO":
401  self.style=style
402  self.compulsory_keys= set([self.protein1_key,
403  self.protein2_key,
404  self.site_pairs_key])
405  elif style is "XTRACT" or style is "QUANTITATION":
406  self.style=style
407  self.compulsory_keys= set([self.site_pairs_key])
408  else:
409  raise Error("ResiduePairListParser: unknown list parser style")
410 
411  def get_compulsory_keys(self):
412  return self.compulsory_keys
413 
414  def get_list(self,input_string):
415  '''
416  This function returns a list of cross-linked residues and the corresponding list of
417  cross-linked chains. The latter list can be empty, if the style doesn't have the
418  corresponding information.
419  '''
420  if self.style == "MSSTUDIO":
421  input_string=input_string.replace("[","")
422  input_string=input_string.replace("]","")
423  input_string_pairs=input_string.split(";")
424  residue_pair_indexes=[]
425  chain_pair_indexes=[]
426  for s in input_string_pairs:
427  m1=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
428  m2=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
429  if m1:
430  # cross link
431  residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
432  residue_pair_indexes.append((residue_index_1,residue_index_2))
433  elif m2:
434  # dead end
435  residue_type_1,residue_index_1=m2.group(1,2)
436  # at this stage chain_pair_indexes is empty
437  return residue_pair_indexes,chain_pair_indexes
438  if self.style is "XTRACT" or self.style is "QUANTITATION":
439  if ":x:" in input_string:
440  # if it is a crosslink....
441  input_strings=input_string.split(":x:")
442  first_peptides=input_strings[0].split(":|:")
443  second_peptides=input_strings[1].split(":|:")
444  first_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in first_peptides]
445  second_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in second_peptides]
446  residue_pair_indexes=[]
447  chain_pair_indexes=[]
448  for fpi in first_peptides_indentifiers:
449  for spi in second_peptides_indentifiers:
450  chain1=fpi[0]
451  chain2=spi[0]
452  residue1=fpi[1]
453  residue2=spi[1]
454  residue_pair_indexes.append((residue1,residue2))
455  chain_pair_indexes.append((chain1,chain2))
456  return residue_pair_indexes, chain_pair_indexes
457  else:
458  # if it is a monolink....
459  first_peptides = input_string.split(":|:")
460  first_peptides_indentifiers = [(f.split(":")[0], f.split(":")[1]) for f in first_peptides]
461  residue_indexes = []
462  chain_indexes = []
463  for fpi in first_peptides_indentifiers:
464  chain1=fpi[0]
465  residue1=fpi[1]
466  residue_indexes.append((residue1,))
467  chain_indexes.append((chain1,))
468  return residue_indexes, chain_indexes
469 
470 
471 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
472  '''
473  A class to handle different XL format with fixed format
474  currently support ProXL
475  '''
476  def __init__(self,format):
477 
478  _CrossLinkDataBaseStandardKeys.__init__(self)
479  if format is "PROXL":
480  self.format=format
481  else:
482  raise Error("FixedFormatParser: unknown list format name")
483 
484 
485  def get_data(self,input_string):
486  if self.format is "PROXL":
487  tockens=input_string.split("\t")
488  xl={}
489  if tockens[0]=="SEARCH ID(S)":
490  return None
491  else:
492  xl[self.protein1_key]=tockens[2]
493  xl[self.protein2_key]=tockens[4]
494  xl[self.residue1_key]=int(tockens[3])
495  xl[self.residue2_key]=int(tockens[5])
496  return xl
497 
498 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
499  import operator
500  '''
501  this class handles a cross-link dataset and do filtering
502  operations, adding cross-links, merge datasets...
503  '''
504 
505  def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=('K',)):
506  '''
507  Constructor.
508  @param converter an instance of CrossLinkDataBaseKeywordsConverter
509  @param data_base an instance of CrossLinkDataBase to build the new database on
510  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
511  crosslink consistency. If not given consistency will not be checked
512  @param linkable_aa a tuple containing one-letter amino acids which are linkable by the crosslinker;
513  only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
514  and if a fasta_seq is given
515  '''
516 
517  if data_base is None:
518  self.data_base = {}
519  else:
520  self.data_base=data_base
521 
522  _CrossLinkDataBaseStandardKeys.__init__(self)
523  if converter is not None:
524  self.cldbkc = converter #type: CrossLinkDataBaseKeywordsConverter
525  self.list_parser=self.cldbkc.rplp
526  self.converter = converter.get_converter()
527 
528  else:
529  self.cldbkc = None #type: CrossLinkDataBaseKeywordsConverter
530  self.list_parser=None
531  self.converter = None
532 
533  # default amino acids considered to be 'linkable' if none are given
534  self.def_aa_tuple = linkable_aa
535  self.fasta_seq = fasta_seq #type: IMP.pmi.topology.Sequences
536  self._update()
537 
538  def _update(self):
539  '''
540  Update the whole dataset after changes
541  '''
542  self.update_cross_link_unique_sub_index()
543  self.update_cross_link_redundancy()
544  self.update_residues_links_number()
545  self.check_cross_link_consistency()
546 
547 
548  def __iter__(self):
549  sorted_ids=sorted(self.data_base.keys())
550  for k in sorted_ids:
551  for xl in self.data_base[k]:
552  yield xl
553 
554  def xlid_iterator(self):
555  sorted_ids=sorted(self.data_base.keys())
556  for xlid in sorted_ids:
557  yield xlid
558 
559  def __getitem__(self,xlid):
560  return self.data_base[xlid]
561 
562  def __len__(self):
563  return len([xl for xl in self])
564 
565  def get_name(self):
566  return self.name
567 
568  def set_name(self,name):
569  new_data_base={}
570  for k in self.data_base:
571  new_data_base[k+"."+name]=self.data_base[k]
572  self.data_base=new_data_base
573  self.name=name
574  self._update()
575 
576  def get_number_of_xlid(self):
577  return len(self.data_base)
578 
579 
580  def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
581  '''
582  if FixedFormatParser is not specified, the file is comma-separated-values
583  @param file_name a txt file to be parsed
584  @param converter an instance of CrossLinkDataBaseKeywordsConverter
585  @param FixedFormatParser a parser for a fixed format
586  '''
587  if not FixedFormatParser:
588  xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
589 
590  if converter is not None:
591  self.cldbkc = converter
592  self.list_parser=self.cldbkc.rplp
593  self.converter = converter.get_converter()
594 
595 
596  if not self.list_parser:
597  # normal procedure without a list_parser
598  # each line is a cross-link
599  new_xl_dict={}
600  for nxl,xl in enumerate(xl_list):
601  new_xl={}
602  for k in xl:
603  if k in self.converter:
604  new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
605  else:
606  new_xl[k]=xl[k]
607  if self.unique_id_key in self.cldbkc.get_setup_keys():
608  if new_xl[self.unique_id_key] not in new_xl_dict:
609  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
610  else:
611  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
612  else:
613  if str(nxl) not in new_xl_dict:
614  new_xl_dict[str(nxl)]=[new_xl]
615  else:
616  new_xl_dict[str(nxl)].append(new_xl)
617 
618  else:
619  # with a list_parser, a line can be a list of ambiguous crosslinks
620  new_xl_dict={}
621  for nxl,entry in enumerate(xl_list):
622 
623  # first get the translated keywords
624  new_dict={}
625  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
626  raise Error("CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
627  for k in entry:
628  if k in self.converter:
629  new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
630  else:
631  new_dict[k]=entry[k]
632 
633  residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
634 
635  # then create the crosslinks
636  for n,p in enumerate(residue_pair_list):
637  is_monolink=False
638  if len(p)==1:
639  is_monolink=True
640 
641  new_xl={}
642  for k in new_dict:
643  new_xl[k]=new_dict[k]
644  new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
645  if not is_monolink:
646  new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
647 
648  if len(chain_pair_list)==len(residue_pair_list):
649  new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
650  if not is_monolink:
651  new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
652 
653  if not is_monolink:
654  new_xl[self.link_type_key]="CROSSLINK"
655  else:
656  new_xl[self.link_type_key]="MONOLINK"
657 
658  if self.unique_id_key in self.cldbkc.get_setup_keys():
659  if new_xl[self.unique_id_key] not in new_xl_dict:
660  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
661  else:
662  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
663  else:
664  if str(nxl) not in new_xl_dict:
665  new_xl[self.unique_id_key]=str(nxl+1)
666  new_xl_dict[str(nxl)]=[new_xl]
667  else:
668  new_xl[self.unique_id_key]=str(nxl+1)
669  new_xl_dict[str(nxl)].append(new_xl)
670 
671 
672  else:
673  '''
674  if FixedFormatParser is defined
675  '''
676 
677  new_xl_dict={}
678  f=open(file_name,"r")
679  nxl=0
680  for line in f:
681  xl=FixedFormatParser.get_data(line)
682  if xl:
683  xl[self.unique_id_key]=str(nxl+1)
684  new_xl_dict[str(nxl)]=[xl]
685  nxl+=1
686 
687 
688  self.data_base=new_xl_dict
689  self.name=file_name
690  self._update()
691 
692  def update_cross_link_unique_sub_index(self):
693  for k in self.data_base:
694  for n,xl in enumerate(self.data_base[k]):
695  xl[self.ambiguity_key]=len(self.data_base[k])
696  xl[self.unique_sub_index_key]=n+1
697  xl[self.unique_sub_id_key]=k+"."+str(n+1)
698 
699  def update_cross_link_redundancy(self):
700  redundancy_data_base={}
701  for xl in self:
702  pra=_ProteinsResiduesArray(xl)
703  if pra not in redundancy_data_base:
704  redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
705  redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
706  else:
707  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
708  redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
709  for xl in self:
710  pra=_ProteinsResiduesArray(xl)
711  xl[self.redundancy_key]=len(redundancy_data_base[pra])
712  xl[self.redundancy_list_key]=redundancy_data_base[pra]
713 
714  def update_residues_links_number(self):
715  residue_links={}
716  for xl in self:
717  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
718  if (p1,r1) not in residue_links:
719  residue_links[(p1,r1)]=set([(p2,r2)])
720  else:
721  residue_links[(p1,r1)].add((p2,r2))
722  if (p2,r2) not in residue_links:
723  residue_links[(p2,r2)]=set([(p1,r1)])
724  else:
725  residue_links[(p2,r2)].add((p1,r1))
726 
727  for xl in self:
728  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
729  xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
730  xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
731 
732  def check_cross_link_consistency(self):
733  if self.cldbkc and self.fasta_seq:
734  cnt_matched, cnt_matched_file = 0, 0
735  matched = {}
736  non_matched = {}
737  for xl in self:
738  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
739  b_matched_file = False
740  if self.residue1_amino_acid_key in xl:
741  # either you know the residue type and aa_tuple is a single entry
742  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
743  b_matched = self._match_xlinks(p1, r1, aa_from_file)
744  b_matched_file = b_matched
745  else:
746  # or pass the possible list of types that can be crosslinked
747  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
748 
749  matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
750 
751  if self.residue2_amino_acid_key in xl:
752  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
753  b_matched = self._match_xlinks(p2, r2, aa_from_file)
754  b_matched_file = b_matched
755  else:
756  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
757 
758  matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
759  if b_matched: cnt_matched += 1
760  if b_matched_file: cnt_matched_file += 1
761  if len(self) > 0:
762  percentage_matched = round(100*cnt_matched/len(self),1)
763  percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
764  #if matched: print "Matched xlinks:", matched
765  if matched or non_matched: print("check_cross_link_consistency: Out of %d crosslinks "
766  "%d were matched to the fasta sequence (%f %%).\n "
767  "%d were matched by using the crosslink file (%f %%)."%
768  (len(self),cnt_matched,percentage_matched,cnt_matched_file,
769  percentage_matched_file) )
770  if non_matched: print("check_cross_link_consistency: Warning: Non matched xlinks:",
771  [(prot_name, sorted(list(non_matched[prot_name]))) for prot_name in non_matched])
772  return matched,non_matched
773 
774  def _match_xlinks(self, prot_name, res_index, aa_tuple):
775  # returns Boolean whether given aa matches a position in the fasta file
776  # cross link files usually start counting at 1 and not 0; therefore subtract -1 to compare with fasta
777  amino_dict = IMP.pmi.tools.ThreeToOneConverter()
778  res_index -= 1
779  for amino_acid in aa_tuple:
780  if len(amino_acid) == 3:
781  amino_acid = amino_dict[amino_acid.upper()]
782  if prot_name in self.fasta_seq.sequences:
783  seq = self.fasta_seq.sequences[prot_name]
784  # if we are looking at the first amino acid in a given sequence always return a match
785  # the first aa should always be the n-terminal aa
786  # which may form a crosslink in any case (for BS3-like crosslinkers)
787  # for some data sets the first aa is at position 1; todo: check why this is the case
788  if res_index == 0 or res_index == 1:
789  return True
790  if res_index < len(seq):
791  if amino_acid == seq[res_index]:
792  return True
793  # else:
794  # print "Could not match", prot, res+1, amino_acid, seq[res]
795  return False
796 
797  def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
798  if b_matched:
799  if prot in matched:
800  matched[prot].add(res)
801  else:
802  matched[prot] = set([res])
803  else:
804  if prot in non_matched:
805  non_matched[prot].add(res)
806  else:
807  non_matched[prot] = set([res])
808  return matched, non_matched
809 
810 
811  def get_cross_link_string(self,xl):
812  string='|'
813  for k in self.ordered_key_list:
814  try:
815  string+=str(k)+":"+str(xl[k])+"|"
816  except KeyError:
817  continue
818 
819  for k in xl:
820  if k not in self.ordered_key_list:
821  string+=str(k)+":"+str(xl[k])+"|"
822 
823  return string
824 
825  def get_short_cross_link_string(self,xl):
826 
827  string='|'
828  list_of_keys=[self.data_set_name_key,
829  self.unique_sub_id_key,
830  self.protein1_key,
831  self.residue1_key,
832  self.protein2_key,
833  self.residue2_key,
834  self.state_key,
835  self.psi_key]
836 
837  for k in list_of_keys:
838  try:
839  string+=str(xl[k])+"|"
840  except KeyError:
841  continue
842 
843  return string
844 
845  def filter(self,FilterOperator):
846  new_xl_dict={}
847  for id in self.data_base.keys():
848  for xl in self.data_base[id]:
849  if FilterOperator.evaluate(xl):
850  if id not in new_xl_dict:
851  new_xl_dict[id]=[xl]
852  else:
853  new_xl_dict[id].append(xl)
854  self._update()
855  return CrossLinkDataBase(self.cldbkc,new_xl_dict)
856 
857 
858  def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
859  '''
860  This function merges two cross-link datasets so that if two conflicting crosslinks have the same
861  cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
862  with different SubIDs
863  '''
864  pass
865 
866  def append_database(self,CrossLinkDataBase2):
867  '''
868  This function append one cross-link dataset to another. Unique ids will be renamed to avoid
869  conflicts.
870  '''
871  name1=self.get_name()
872  name2=CrossLinkDataBase2.get_name()
873  if name1 == name2:
874  name1=id(self)
875  name2=id(CrossLinkDataBase2)
876 
877  #rename first database:
878  new_data_base={}
879  for k in self.data_base:
880  new_data_base[k]=self.data_base[k]
881  for k in CrossLinkDataBase2.data_base:
882  new_data_base[k]=CrossLinkDataBase2.data_base[k]
883  self.data_base=new_data_base
884  self._update()
885 
886  def set_value(self,key,new_value,FilterOperator=None):
887  '''
888  This function changes the value for a given key in the database
889  For instance one can change the name of a protein
890  @param key: the key in the database that must be changed
891  @param new_value: the new value of the key
892  @param FilterOperator: optional FilterOperator to change the value to
893  a subset of the database
894 
895  example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
896  '''
897 
898  for xl in self:
899  if FilterOperator is not None:
900  if FilterOperator.evaluate(xl):
901  xl[key]=new_value
902  else:
903  xl[key]=new_value
904  self._update()
905 
906  def get_values(self,key):
907  '''
908  this function returns the list of values for a given key in the database
909  alphanumerically sorted
910  '''
911  values=set()
912  for xl in self:
913  values.add(xl[key])
914  return sorted(list(values))
915 
916  def offset_residue_index(self,protein_name,offset):
917  '''
918  This function offset the residue indexes of a given protein by a specified value
919  @param protein_name: the protein name that need to be changed
920  @param offset: the offset value
921  '''
922 
923  for xl in self:
924  if xl[self.protein1_key] == protein_name:
925  xl[self.residue1_key]=xl[self.residue1_key]+offset
926  if xl[self.protein2_key] == protein_name:
927  xl[self.residue2_key]=xl[self.residue2_key]+offset
928  self._update()
929 
930  def create_new_keyword(self,keyword,values_from_keyword=None):
931  '''
932  This function creates a new keyword for the whole database and set the values from
933  and existing keyword (optional), otherwise the values are set to None
934  @param keyword the new keyword name:
935  @param values_from_keyword the keyword from which we are copying the values:
936  '''
937  for xl in self:
938  if values_from_keyword is not None:
939  xl[keyword] = xl[values_from_keyword]
940  else:
941  xl[keyword] = None
942  self._update()
943 
944  def rename_proteins(self,old_to_new_names_dictionary):
945  '''
946  This function renames all proteins contained in the input dictionary
947  from the old names (keys) to the new name (values)
948  '''
949 
950  for old_name in old_to_new_names_dictionary:
951  new_name=old_to_new_names_dictionary[old_name]
952  fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
953  self.set_value(self.protein1_key,new_name,fo2)
954  fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
955  self.set_value(self.protein2_key,new_name,fo2)
956 
957  def clone_protein(self,protein_name,new_protein_name):
958  new_xl_dict={}
959  for id in self.data_base.keys():
960  new_data_base=[]
961  for xl in self.data_base[id]:
962  new_data_base.append(xl)
963  if xl[self.protein1_key]==protein_name and xl[self.protein2_key]!=protein_name:
964  new_xl=dict(xl)
965  new_xl[self.protein1_key]=new_protein_name
966  new_data_base.append(new_xl)
967  elif xl[self.protein1_key]!=protein_name and xl[self.protein2_key]==protein_name:
968  new_xl=dict(xl)
969  new_xl[self.protein2_key]=new_protein_name
970  new_data_base.append(new_xl)
971  elif xl[self.protein1_key]==protein_name and xl[self.protein2_key]==protein_name:
972  new_xl=dict(xl)
973  new_xl[self.protein1_key]=new_protein_name
974  new_data_base.append(new_xl)
975  new_xl=dict(xl)
976  new_xl[self.protein2_key]=new_protein_name
977  new_data_base.append(new_xl)
978  new_xl=dict(xl)
979  new_xl[self.protein1_key]=new_protein_name
980  new_xl[self.protein2_key]=new_protein_name
981  new_data_base.append(new_xl)
982  self.data_base[id]=new_data_base
983  self._update()
984 
985  def filter_out_same_residues(self):
986  '''
987  This function remove cross-links applied to the same residue
988  (ie, same chain name and residue number)
989  '''
990  new_xl_dict={}
991  for id in self.data_base.keys():
992  new_data_base=[]
993  for xl in self.data_base[id]:
994  if xl[self.protein1_key]==xl[self.protein2_key] and xl[self.residue1_key]==xl[self.residue2_key]:
995  continue
996  else:
997  new_data_base.append(xl)
998  self.data_base[id]=new_data_base
999  self._update()
1000 
1001 
1002  def jackknife(self,percentage):
1003  '''
1004  this method returns a CrossLinkDataBase class containing
1005  a random subsample of the original cross-link database.
1006  @param percentage float between 0 and 1, is the percentage of
1007  of spectra taken from the original list
1008  '''
1009  import random
1010  if percentage > 1.0 or percentage < 0.0:
1011  raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
1012  nspectra=self.get_number_of_xlid()
1013  nrandom_spectra=int(nspectra*percentage)
1014  random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1015  new_data_base={}
1016  for k in random_keys:
1017  new_data_base[k]=self.data_base[k]
1018  return CrossLinkDataBase(self.cldbkc,new_data_base)
1019 
1020  def __str__(self):
1021  outstr=''
1022  sorted_ids=sorted(self.data_base.keys())
1023 
1024  for id in sorted_ids:
1025  outstr+=id+"\n"
1026  for xl in self.data_base[id]:
1027  for k in self.ordered_key_list:
1028  try:
1029  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1030  except KeyError:
1031  continue
1032 
1033  for k in xl:
1034  if k not in self.ordered_key_list:
1035  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1036  outstr+="-------------\n"
1037  return outstr
1038 
1039 
1040  def plot(self,filename,**kwargs):
1041  import matplotlib
1042  matplotlib.use('Agg')
1043  import matplotlib.pyplot as plt
1044  import matplotlib.colors
1045 
1046 
1047 
1048  if kwargs["type"] == "scatter":
1049  cmap=plt.get_cmap("rainbow")
1050  norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1051  xkey=kwargs["xkey"]
1052  ykey=kwargs["ykey"]
1053  if "colorkey" in kwargs:
1054  colorkey=kwargs["colorkey"]
1055  if "sizekey" in kwargs:
1056  sizekey=kwargs["sizekey"]
1057  if "logyscale" in kwargs:
1058  logyscale=kwargs["logyscale"]
1059  else:
1060  logyscale=False
1061  xs=[]
1062  ys=[]
1063  colors=[]
1064  for xl in self:
1065  try:
1066  xs.append(float(xl[xkey]))
1067  if logyscale:
1068  ys.append(math.log10(float(xl[ykey])))
1069  else:
1070  ys.append(float(xl[ykey]))
1071  colors.append(float(xl[colorkey]))
1072  except ValueError:
1073  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1074  continue
1075 
1076  cs=[]
1077  for color in colors:
1078  cindex=(color-min(colors))/(max(colors)-min(colors))
1079  cs.append(cmap(cindex))
1080 
1081  fig = plt.figure()
1082  ax = fig.add_subplot(111)
1083  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker="o")
1084  ax.set_xlabel(xkey)
1085  ax.set_ylabel(ykey)
1086  plt.savefig(filename)
1087  plt.show()
1088  plt.close()
1089 
1090  if kwargs["type"] == "residue_links":
1091  #plot the number of distinct links to a residue
1092  #in an histogram
1093  #molecule name
1094  molecule=kwargs["molecule"]
1095  if type(molecule) is IMP.pmi.topology.Molecule:
1096  length=len(molecule.sequence)
1097  molecule=molecule.get_name()
1098  else:
1099  #you need a IMP.pmi.topology.Sequences object
1100  sequences_object=kwargs["sequences_object"]
1101  sequence=sequences_object.sequences[molecule]
1102  length=len(sequence)
1103 
1104  histogram=[0]*length
1105  for xl in self:
1106  if xl[self.protein1_key]==molecule:
1107  histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1108  if xl[self.protein2_key]==molecule:
1109  histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1110  rects = plt.bar(range(1,length+1), histogram)
1111  #bar_width,
1112  #alpha=opacity,
1113  #color='b',
1114  #yerr=std_men,
1115  #error_kw=error_config,
1116  #label='Men')
1117  plt.savefig(filename)
1118  plt.show()
1119  plt.close()
1120 
1121 
1122 
1123  if kwargs["type"] == "histogram":
1124  valuekey=kwargs["valuekey"]
1125  reference_xline=kwargs["reference_xline"]
1126  valuename=valuekey
1127  bins=kwargs["bins"]
1128  values_list=[]
1129  for xl in self:
1130  try:
1131  values_list.append(float(xl[valuekey]))
1132  except ValueError:
1133  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1134  continue
1136  filename, [values_list], valuename=valuename, bins=bins,
1137  colors=None, format="pdf",
1138  reference_xline=None, yplotrange=None,
1139  xplotrange=None,normalized=True,
1140  leg_names=None)
1141 
1142  def dump(self,json_filename):
1143  import json
1144  with open(json_filename, 'w') as fp:
1145  json.dump(self.data_base, fp, sort_keys=True, indent=2, default=set_json_default)
1146 
1147  def load(self,json_filename):
1148  import json
1149  with open(json_filename, 'r') as fp:
1150  self.data_base = json.load(fp)
1151  self._update()
1152  #getting rid of unicode
1153  # (can't do this in Python 3, since *everything* is Unicode there)
1154  if sys.version_info[0] < 3:
1155  for xl in self:
1156  for k,v in xl.iteritems():
1157  if type(k) is unicode: k=str(k)
1158  if type(v) is unicode: v=str(v)
1159  xl[k]=v
1160 
1161  def save_csv(self,filename):
1162 
1163  import csv
1164 
1165  data=[]
1166  sorted_ids=None
1167  sorted_group_ids=sorted(self.data_base.keys())
1168  for group in sorted_group_ids:
1169  group_block=[]
1170  for xl in self.data_base[group]:
1171  if not sorted_ids:
1172  sorted_ids=sorted(xl.keys())
1173  values=[xl[k] for k in sorted_ids]
1174  group_block.append(values)
1175  data+=group_block
1176 
1177 
1178  with open(filename, 'w') as fp:
1179  a = csv.writer(fp, delimiter=',')
1180  a.writerow(sorted_ids)
1181  a.writerows(data)
1182 
1183 
1185  """This class allows to compute and plot the distance between datasets"""
1186 
1187  def __init__(self,cldb_dictionary):
1188  """Input a dictionary where keys are cldb names and values are cldbs"""
1189  import scipy.spatial.distance
1190  self.dbs=cldb_dictionary
1191  self.keylist=list(self.dbs.keys())
1192  self.distances=list()
1193 
1194 
1195  for i,key1 in enumerate(self.keylist):
1196  for key2 in self.keylist[i+1:]:
1197  distance=self.get_distance(key1,key2)
1198  self.distances.append(distance)
1199 
1200  self.distances=scipy.spatial.distance.squareform(self.distances)
1201 
1202  def get_distance(self,key1,key2):
1203  return 1.0-self.jaccard_index(self.dbs[key1],self.dbs[key2])
1204 
1205  def jaccard_index(self,CrossLinkDataBase1,CrossLinkDataBase2):
1206  """Similarity index between two datasets
1207  https://en.wikipedia.org/wiki/Jaccard_index"""
1208 
1209  set1=set()
1210  set2=set()
1211  for xl1 in CrossLinkDataBase1:
1212  a1f=_ProteinsResiduesArray(xl1)
1213  a1b=a1f.get_inverted()
1214  set1.add(a1f)
1215  set1.add(a1b)
1216  for xl2 in CrossLinkDataBase2:
1217  a2f=_ProteinsResiduesArray(xl2)
1218  a2b=a2f.get_inverted()
1219  set2.add(a2f)
1220  set2.add(a2b)
1221  return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1222 
1223  def plot_matrix(self,figurename="clustermatrix.pdf"):
1224  import matplotlib as mpl
1225  import numpy
1226  mpl.use('Agg')
1227  import matplotlib.pylab as pl
1228  from scipy.cluster import hierarchy as hrc
1229 
1230  raw_distance_matrix=self.distances
1231  labels=self.keylist
1232 
1233  fig = pl.figure()
1234  #fig.autolayout=True
1235 
1236  ax = fig.add_subplot(1,1,1)
1237  dendrogram = hrc.dendrogram(
1238  hrc.linkage(raw_distance_matrix),
1239  color_threshold=7,
1240  no_labels=True)
1241  leaves_order = dendrogram['leaves']
1242  ax.set_xlabel('Dataset')
1243  ax.set_ylabel('Jaccard Distance')
1244  pl.tight_layout()
1245  pl.savefig("dendrogram."+figurename, dpi=300)
1246  pl.close(fig)
1247 
1248  fig = pl.figure()
1249  #fig.autolayout=True
1250 
1251  ax = fig.add_subplot(1,1,1)
1252  cax = ax.imshow(
1253  raw_distance_matrix[leaves_order,
1254  :][:,
1255  leaves_order],
1256  interpolation='nearest')
1257  cb = fig.colorbar(cax)
1258  cb.set_label('Jaccard Distance')
1259  ax.set_xlabel('Dataset')
1260  ax.set_ylabel('Dataset')
1261  ax.set_xticks(range(len(labels)))
1262  ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation='vertical')
1263  ax.set_yticks(range(len(labels)))
1264  ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation='horizontal')
1265  pl.tight_layout()
1266  pl.savefig("matrix."+figurename, dpi=300)
1267  pl.close(fig)
1268 
1269 
1270 
1271 
1272 
1273 
1274 
1275 
1277  '''
1278  This class maps a CrossLinkDataBase on a given structure
1279  and save an rmf file with the color coded crosslinks
1280  '''
1281 
1282 
1283  def __init__(self,model,CrossLinkDataBase,rmf_name,frame_index):
1284  self.CrossLinkDataBase=CrossLinkDataBase
1285  rh = RMF.open_rmf_file_read_only(rmf_name)
1286  self.model=model
1287  self.prots = IMP.rmf.create_hierarchies(rh, self.model)
1288  IMP.rmf.load_frame(rh, RMF.FrameID(frame_index))
1289  self.compute_distances()
1290  self.model.update()
1291 
1292  def compute_distances(self):
1293  data=[]
1294  sorted_ids=None
1295  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1296  for group in sorted_group_ids:
1297  #group_block=[]
1298  group_dists=[]
1299  for xl in self.CrossLinkDataBase.data_base[group]:
1300  if not sorted_ids:
1301  sorted_ids=sorted(xl.keys())
1302  data.append(sorted_ids+["UniqueID","Distance","MinAmbiguousDistance"])
1303  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1304  try:
1305  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1306  except:
1307  mdist="None"
1308  state1="None"
1309  copy1="None"
1310  state2="None"
1311  copy2="None"
1312  values=[xl[k] for k in sorted_ids]
1313  values+=[group,mdist]
1314  group_dists.append(mdist)
1315  #group_block.append(values)
1316  xl["Distance"]=mdist
1317  xl["State1"]=state1
1318  xl["Copy1"]=copy1
1319  xl["State2"]=state2
1320  xl["Copy2"]=copy2
1321 
1322  for xl in self.CrossLinkDataBase.data_base[group]:
1323  xl["MinAmbiguousDistance"]=min(group_dists)
1324 
1325  def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1326  '''more robust and slower version of above'''
1327  sel=IMP.atom.Selection(self.prots,molecule=c1,residue_index=r1,resolution=1)
1328  selpart_1=sel.get_selected_particles()
1329  if len(selpart_1)==0:
1330  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1331  return None
1332  sel=IMP.atom.Selection(self.prots,molecule=c2,residue_index=r2,resolution=1)
1333  selpart_2=sel.get_selected_particles()
1334  if len(selpart_2)==0:
1335  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1336  return None
1337  results=[]
1338  for p1 in selpart_1:
1339  for p2 in selpart_2:
1340  if p1 == p2 and r1 == r2: continue
1341  d1=IMP.core.XYZ(p1)
1342  d2=IMP.core.XYZ(p2)
1343  #round distance to second decimal
1344  dist=float(int(IMP.core.get_distance(d1,d2)*100.0))/100.0
1345  h1=IMP.atom.Hierarchy(p1)
1346  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1347  h1=h1.get_parent()
1348  copy_index1=IMP.atom.Copy(h1).get_copy_index()
1349  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1350  h1=h1.get_parent()
1351  state_index1=IMP.atom.State(h1).get_state_index()
1352  h2=IMP.atom.Hierarchy(p2)
1353  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1354  h2=h2.get_parent()
1355  copy_index2=IMP.atom.Copy(h2).get_copy_index()
1356  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1357  h2=h2.get_parent()
1358  state_index2=IMP.atom.State(h2).get_state_index()
1359 
1360  results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1361  if len(results)==0: return None
1362  results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1363  return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1364 
1365  def save_rmf_snapshot(self,filename,color_id=None):
1366  import operator
1367 
1368  if color_id is None:
1369  color_id=self.CrossLinkDataBase.id_score_key
1370  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1371  list_of_pairs=[]
1372  color_scores=[]
1373  for group in sorted_group_ids:
1374  group_dists_particles=[]
1375  for xl in self.CrossLinkDataBase.data_base[group]:
1376  xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1377  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1378  print (c1,c2,r1,r2)
1379  try:
1380  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1381  except TypeError:
1382  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1383  continue
1384  print(xl[color_id])
1385  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1386  if group_dists_particles:
1387  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1388  color_scores.append(mincolor_score)
1389  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1390  else:
1391  continue
1392 
1393  linear = IMP.core.Linear(0, 0.0)
1394  linear.set_slope(1.0)
1395  dps2 = IMP.core.DistancePairScore(linear)
1396  rslin = IMP.RestraintSet(self.model, 'linear_dummy_restraints')
1397  sgs=[]
1398  offset=min(color_scores)
1399  maxvalue=max(color_scores)
1400  for pair in list_of_pairs:
1401  pr = IMP.core.PairRestraint(self.model, dps2, (pair[0], pair[1]))
1402  pr.set_name(pair[2])
1403  factor=(pair[3]-offset)/(maxvalue-offset)
1404  print(factor)
1405  c=IMP.display.get_rgb_color(factor)
1406  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1407  rslin.add_restraint(pr)
1408  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1409 
1410  rh = RMF.create_rmf_file(filename)
1411  IMP.rmf.add_hierarchies(rh, self.prots)
1412  IMP.rmf.add_restraints(rh,[rslin])
1413  IMP.rmf.add_geometries(rh, sgs)
1414  IMP.rmf.save_frame(rh)
1415  del rh
1416 
1417 
1419  '''
1420  This class generates a CrossLinkDataBase from a given structure
1421  '''
1422  def __init__(self,representation=None,
1423  system=None,
1424  residue_types_1=["K"],
1425  residue_types_2=["K"],
1426  reactivity_range=[0,1],
1427  kt=1.0):
1428 
1429  import numpy.random
1430  import math
1432  cldbkc.set_protein1_key("Protein1")
1433  cldbkc.set_protein2_key("Protein2")
1434  cldbkc.set_residue1_key("Residue1")
1435  cldbkc.set_residue2_key("Residue2")
1436  self.cldb=CrossLinkDataBase(cldbkc)
1437  if representation is not None:
1438  #PMI 1.0 mode
1439  self.mode="pmi1"
1440  self.representation=representation
1441  self.model=self.representation.m
1442  elif system is not None:
1443  #PMI 2.0 mode
1444  self.system=system
1445  self.model=self.system.mdl
1446  self.mode="pmi2"
1447  else:
1448  print("Argument error: please provide either a representation object or a IMP.Hierarchy")
1449  raise
1450  self.residue_types_1=residue_types_1
1451  self.residue_types_2=residue_types_2
1452  self.kt=kt
1453  self.indexes_dict1={}
1454  self.indexes_dict2={}
1455  self.protein_residue_dict={}
1456  self.reactivity_dictionary={}
1457  self.euclidean_interacting_pairs=None
1458  self.xwalk_interacting_pairs=None
1459  import random
1460 
1461  if self.mode=="pmi1":
1462  for protein in self.representation.sequence_dict.keys():
1463  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1464  seq=self.representation.sequence_dict[protein]
1465  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1466 
1467  for r in residues:
1468  # uniform random reactivities
1469  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1470  # getting reactivities from the CDF of an exponential distribution
1471  rexp=numpy.random.exponential(0.1)
1472  prob=1.0-math.exp(-rexp)
1473  self.reactivity_dictionary[(protein,r)]=prob
1474 
1475 
1476  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1477  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1478  for r in residues1:
1479  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1480  p=h.get_particle()
1481  index=p.get_index()
1482  self.indexes_dict1[index]=(protein,r)
1483  self.protein_residue_dict[(protein,r)]=index
1484  for r in residues2:
1485  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1486  p=h.get_particle()
1487  index=p.get_index()
1488  self.indexes_dict2[index]=(protein,r)
1489  self.protein_residue_dict[(protein,r)]=index
1490 
1491  if self.mode=="pmi2":
1492  for state in self.system.get_states():
1493  for moleculename,molecules in state.get_molecules().iteritems():
1494  for molecule in molecules:
1495  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1496  seq=molecule.sequence
1497  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1498 
1499  for r in residues:
1500  # uniform random reactivities
1501  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1502  # getting reactivities from the CDF of an exponential distribution
1503  rexp=numpy.random.exponential(0.00000001)
1504  prob=1.0-math.exp(-rexp)
1505  self.reactivity_dictionary[(molecule,r)]=prob
1506 
1507  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1508  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1509  for r in residues1:
1510  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1511  ps=s.get_selected_particles()
1512  for p in ps:
1513  index=p.get_index()
1514  self.indexes_dict1[index]=(molecule,r)
1515  self.protein_residue_dict[(molecule,r)]=index
1516  for r in residues2:
1517  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1518  ps=s.get_selected_particles()
1519  for p in ps:
1520  index=p.get_index()
1521  self.indexes_dict2[index]=(molecule,r)
1522  self.protein_residue_dict[(molecule,r)]=index
1523 
1524 
1525  def get_all_possible_pairs(self):
1526  n=float(len(self.protein_residue_dict.keys()))
1527  return n*(n-1.0)/2.0
1528 
1529  def get_all_feasible_pairs(self,distance=21):
1530  import itertools
1531  particle_index_pairs=[]
1532  nxl=0
1533  for a,b in itertools.combinations(self.protein_residue_dict.keys(),2):
1534 
1535  new_xl={}
1536  index1=self.protein_residue_dict[a]
1537  index2=self.protein_residue_dict[b]
1538  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1539  if particle_distance <= distance:
1540  particle_index_pairs.append((index1,index2))
1541  if self.mode=="pmi1":
1542  new_xl[self.cldb.protein1_key]=a[0]
1543  new_xl[self.cldb.protein2_key]=b[0]
1544  elif self.mode=="pmi2":
1545  new_xl[self.cldb.protein1_key]=a[0].get_name()
1546  new_xl[self.cldb.protein2_key]=b[0].get_name()
1547  new_xl["molecule_object1"]=a[0]
1548  new_xl["molecule_object2"]=b[0]
1549  new_xl[self.cldb.residue1_key]=a[1]
1550  new_xl[self.cldb.residue2_key]=b[1]
1551  self.cldb.data_base[str(nxl)]=[new_xl]
1552  nxl+=1
1553  self.cldb._update()
1554  return self.cldb
1555 
1556 
1557 
1558 
1559  def get_data_base(self,total_number_of_spectra,
1560  ambiguity_probability=0.1,
1561  noise=0.01,
1562  distance=21,
1563  max_delta_distance=10.0,
1564  xwalk_bin_path=None,
1565  confidence_false=0.75,
1566  confidence_true=0.75):
1567  import math
1568  from random import random,uniform
1569  import numpy as np
1570  number_of_spectra=1
1571 
1572  self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1573  self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1574  self.cldb.data_base[str(number_of_spectra)]=[]
1575  self.sites_weighted=None
1576 
1577  while number_of_spectra<total_number_of_spectra:
1578  if random() > ambiguity_probability and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1579  # new spectrum
1580  number_of_spectra+=1
1581  self.cldb.data_base[str(number_of_spectra)]=[]
1582  noisy=False
1583  if random() > noise:
1584  # not noisy crosslink
1585  pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1586  else:
1587  # noisy crosslink
1588  pra,dist=self.get_random_residue_pair(None,None,None)
1589  noisy=True
1590 
1591  new_xl={}
1592  if self.mode=="pmi1":
1593  new_xl[self.cldb.protein1_key]=pra[0]
1594  new_xl[self.cldb.protein2_key]=pra[1]
1595  elif self.mode=="pmi2":
1596  new_xl[self.cldb.protein1_key]=pra[0].get_name()
1597  new_xl[self.cldb.protein2_key]=pra[1].get_name()
1598  new_xl["molecule_object1"]=pra[0]
1599  new_xl["molecule_object2"]=pra[1]
1600  new_xl[self.cldb.residue1_key]=pra[2]
1601  new_xl[self.cldb.residue2_key]=pra[3]
1602  new_xl["Noisy"]=noisy
1603  # the reactivity is defined as r=1-exp(-k*Delta t)
1604  new_xl["Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1605  new_xl["Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1606  r1=new_xl["Reactivity_Residue1"]
1607  r2=new_xl["Reactivity_Residue2"]
1608  #combined reactivity 1-exp(-k12*Delta t),
1609  # k12=k1*k2/(k1+k2)
1610  #new_xl["Reactivity"]=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1611  if noisy:
1612  #new_xl["Score"]=uniform(-1.0,1.0)
1613  new_xl["Score"]=np.random.beta(1.0,self.beta_false)
1614  else:
1615  #new_xl["Score"]=new_xl["Reactivity"]+uniform(0.0,2.0)
1616  new_xl["Score"]=1.0-np.random.beta(1.0,self.beta_true)
1617  new_xl["TargetDistance"]=dist
1618  new_xl["NoiseProbability"]=noise
1619  new_xl["AmbiguityProbability"]=ambiguity_probability
1620  # getting if it is intra or inter rigid body
1621  (p1,p2)=IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1622  self.protein_residue_dict[(pra[1],pra[3])]])
1625  IMP.core.RigidMember(p1).get_rigid_body() ==
1626  IMP.core.RigidMember(p2).get_rigid_body()):
1627  new_xl["InterRigidBody"] = False
1628  elif (IMP.core.RigidMember.get_is_setup(p1) and
1630  IMP.core.RigidMember(p1).get_rigid_body() !=
1631  IMP.core.RigidMember(p2).get_rigid_body()):
1632  new_xl["InterRigidBody"] = True
1633  else:
1634  new_xl["InterRigidBody"] = None
1635 
1636  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1637  self.cldb._update()
1638  return self.cldb
1639 
1640 
1641  def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1642  import IMP.pmi.tools
1643  import math
1644  from random import choice,uniform
1645  if distance is None:
1646  # get a random pair
1647  while True:
1648  if self.mode=="pmi1":
1649  protein1=choice(self.representation.sequence_dict.keys())
1650  protein2=choice(self.representation.sequence_dict.keys())
1651  seq1=self.representation.sequence_dict[protein1]
1652  seq2=self.representation.sequence_dict[protein2]
1653  residue1=choice([i for i in range(1,len(seq1)+1) if seq1[i-1] in self.residue_types_1])
1654  residue2=choice([i for i in range(1,len(seq2)+1) if seq2[i-1] in self.residue_types_2])
1655  h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
1656  h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
1657  particle_distance=IMP.core.get_distance(IMP.core.XYZ(h1.get_particle()),IMP.core.XYZ(h2.get_particle()))
1658  if (protein1,residue1) != (protein2,residue2):
1659  break
1660  elif self.mode=="pmi2":
1661  (protein1,residue1)=choice(self.protein_residue_dict.keys())
1662  (protein2,residue2)=choice(self.protein_residue_dict.keys())
1663  index1=self.protein_residue_dict[(protein1,residue1)]
1664  index2=self.protein_residue_dict[(protein2,residue2)]
1665  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1666  if (protein1,residue1) != (protein2,residue2):
1667  break
1668  else:
1669  # get a pair of residues whose distance is below the threshold
1670  if not xwalk_bin_path:
1672  gcpf.set_distance(distance+max_delta_distance)
1673 
1674  while True:
1675  #setup the reaction rates lists
1676  if not self.sites_weighted:
1677  self.sites_weighted=[]
1678  for key in self.reactivity_dictionary:
1679  r=self.reactivity_dictionary[key]
1680  self.sites_weighted.append((key,r))
1681  #get a random reaction site
1682  first_site=self.weighted_choice(self.sites_weighted)
1683  #get all distances
1684  if not self.euclidean_interacting_pairs:
1685  self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1686  self.indexes_dict1.keys(),
1687  self.indexes_dict2.keys())
1688  #get the partner for the first reacted site
1689  first_site_pairs = [pair for pair in self.euclidean_interacting_pairs
1690  if self.indexes_dict1[pair[0]] == first_site or
1691  self.indexes_dict2[pair[1]] == first_site]
1692  if len(first_site_pairs)==0: continue
1693  #build the list of second reaction sites
1694  second_sites_weighted=[]
1695  for pair in first_site_pairs:
1696  if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1697  if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1698  r=self.reactivity_dictionary[second_site]
1699  second_sites_weighted.append((second_site,r))
1700  second_site=self.weighted_choice(second_sites_weighted)
1701  """
1702  interacting_pairs_weighted=[]
1703  for pair in self.euclidean_interacting_pairs:
1704  r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1705  r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1706  #combined reactivity 1-exp(-k12*Delta t),
1707  # k12=k1*k2/(k1+k2)
1708  #print(r1,r2,dist)
1709  r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1710  interacting_pairs_weighted.append((pair,r12))
1711  #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1712  #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1713  #interacting_pairs_weighted.append((pair,weight1*weight2))
1714 
1715  while True:
1716  pair=self.weighted_choice(interacting_pairs_weighted)
1717  protein1,residue1=self.indexes_dict1[pair[0]]
1718  protein2,residue2=self.indexes_dict2[pair[1]]
1719  particle_pair=IMP.get_particles(self.model,pair)
1720  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1721  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1722  break
1723  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1724  #allow some flexibility
1725  prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1726  if uniform(0.0,1.0)<prob: break
1727  """
1728  protein1,residue1=first_site
1729  protein2,residue2=second_site
1730  print("CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1731  "First site",first_site,self.reactivity_dictionary[first_site],
1732  "Second site",second_site,self.reactivity_dictionary[second_site])
1733  particle_pair=IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1734  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1735 
1736  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1737  break
1738  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1739  #allow some flexibility
1740  #prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1741  #if uniform(0.0,1.0)<prob: break
1742  if particle_distance-distance < max_delta_distance: break
1743 
1744 
1745 
1746  else:
1747  if not self.xwalk_interacting_pairs:
1748  self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1749  interacting_pairs_weighted=[]
1750 
1751  for pair in self.xwalk_interacting_pairs:
1752  protein1=pair[0]
1753  protein2=pair[1]
1754  residue1=pair[2]
1755  residue2=pair[3]
1756  weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1757  weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1758  interacting_pairs_weighted.append((pair,weight1*weight2))
1759 
1760  pair=self.weighted_choice(interacting_pairs_weighted)
1761  protein1=pair[0]
1762  protein2=pair[1]
1763  residue1=pair[2]
1764  residue2=pair[3]
1765  particle_distance=float(pair[4])
1766 
1767 
1768 
1769  return ((protein1,protein2,residue1,residue2)),particle_distance
1770 
1771  def get_xwalk_distances(self,xwalk_bin_path,distance):
1772  import IMP.pmi.output
1773  import os
1774  o=IMP.pmi.output.Output(atomistic=True)
1775  o.init_pdb("xwalk.pdb",self.representation.prot)
1776  o.write_pdb("xwalk.pdb")
1777  namechainiddict=o.dictchain["xwalk.pdb"]
1778  chainiddict={}
1779 
1780  for key in namechainiddict: chainiddict[namechainiddict[key]]=key
1781  xwalkout=os.popen('java -Xmx256m -cp ' + xwalk_bin_path +' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+' -bb').read()
1782 
1783  output_list_of_distance=[]
1784  for line in xwalkout.split("\n")[0:-2]:
1785  tockens=line.split()
1786  first=tockens[2]
1787  second=tockens[3]
1788  distance=float(tockens[6])
1789  fs=first.split("-")
1790  ss=second.split("-")
1791  chainid1=fs[2]
1792  chainid2=ss[2]
1793  protein1=chainiddict[chainid1]
1794  protein2=chainiddict[chainid2]
1795  residue1=int(fs[1])
1796  residue2=int(ss[1])
1797  output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1798  return output_list_of_distance
1799 
1800 
1801  def weighted_choice(self,choices):
1802  import random
1803  # from http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
1804  total = sum(w for c, w in choices)
1805  r = random.uniform(0, total)
1806  upto = 0
1807  for c, w in choices:
1808  if upto + w > r:
1809  return c
1810  upto += w
1811  assert False, "Shouldn't get here"
1812 
1813  def save_rmf_snapshot(self,filename,color_id=None):
1814  import IMP.rmf
1815  import RMF
1816  if color_id is None:
1817  color_id="Reactivity"
1818  sorted_ids=None
1819  sorted_group_ids=sorted(self.cldb.data_base.keys())
1820  list_of_pairs=[]
1821  color_scores=[]
1822  for group in sorted_group_ids:
1823  group_xls=[]
1824  group_dists_particles=[]
1825  for xl in self.cldb.data_base[group]:
1826  xllabel=self.cldb.get_short_cross_link_string(xl)
1827  (c1,c2,r1,r2)=(xl["molecule_object1"],xl["molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1828  try:
1829  index1=self.protein_residue_dict[(c1,r1)]
1830  index2=self.protein_residue_dict[(c2,r2)]
1831  p1,p2=IMP.get_particles(self.model,[index1])[0],IMP.get_particles(self.model,[index2])[0]
1832  mdist=xl["TargetDistance"]
1833  except TypeError:
1834  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1835  continue
1836  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1837  if group_dists_particles:
1838  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1839  color_scores.append(mincolor_score)
1840  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1841  else:
1842  continue
1843 
1844 
1845  m=self.model
1846  linear = IMP.core.Linear(0, 0.0)
1847  linear.set_slope(1.0)
1848  dps2 = IMP.core.DistancePairScore(linear)
1849  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
1850  sgs=[]
1851  offset=min(color_scores)
1852  maxvalue=max(color_scores)
1853  for pair in list_of_pairs:
1854  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
1855  pr.set_name(pair[2])
1856  factor=(pair[3]-offset)/(maxvalue-offset)
1857  c=IMP.display.get_rgb_color(factor)
1858  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1859  rslin.add_restraint(pr)
1860  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1861 
1862  rh = RMF.create_rmf_file(filename)
1863  IMP.rmf.add_hierarchies(rh, [self.system.hier])
1864  IMP.rmf.add_restraints(rh,[rslin])
1865  IMP.rmf.add_geometries(rh, sgs)
1866  IMP.rmf.save_frame(rh)
1867  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:963
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:619
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:36
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
This class converts three to one letter codes, and return X for any unknown codes.
Definition: tools.py:1749
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:47
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:19
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:24
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:29
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.