IMP logo
IMP Reference Guide  2.10.0
The Integrative Modeling Platform
io/crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 from __future__ import print_function
8 import IMP
9 import IMP.pmi
10 import IMP.pmi.output
11 import IMP.atom
12 import IMP.core
13 import IMP.algebra
14 import IMP.rmf
15 import RMF
16 import IMP.display
17 import operator
18 import math
19 import sys
20 import ihm.location
21 import ihm.dataset
22 from collections import defaultdict
23 
24 # json default serializations
25 def set_json_default(obj):
26  if isinstance(obj, set):
27  return list(obj)
28  if isinstance(obj, IMP.pmi.topology.Molecule):
29  return str(obj)
30  raise TypeError
31 
32 # Handle and return data that must be a string
33 if sys.version_info[0] >= 3:
34  def _handle_string_input(inp):
35  if not isinstance(inp, str):
36  raise TypeError("expecting a string")
37  return inp
38 else:
39  def _handle_string_input(inp):
40  if not isinstance(inp, (str, unicode)):
41  raise TypeError("expecting a string or unicode")
42  # Coerce to non-unicode representation (str)
43  if isinstance(inp, unicode):
44  return str(inp)
45  else:
46  return inp
47 
48 class _CrossLinkDataBaseStandardKeys(object):
49  '''
50  This class setup all the standard keys needed to
51  identify the crosslink features from the data sets
52  '''
53  def __init__(self):
54  self.type={}
55  self.protein1_key="Protein1"
56  self.type[self.protein1_key]=str
57  self.protein2_key="Protein2"
58  self.type[self.protein2_key]=str
59  self.residue1_key="Residue1"
60  self.type[self.residue1_key]=int
61  self.residue2_key="Residue2"
62  self.type[self.residue2_key]=int
63  self.residue1_amino_acid_key="Residue1AminoAcid"
64  self.type[self.residue1_amino_acid_key]=str
65  self.residue2_amino_acid_key="Residue2AminoAcid"
66  self.type[self.residue2_amino_acid_key]=str
67  self.residue1_moiety_key="Residue1Moiety"
68  self.type[self.residue1_moiety_key]=str
69  self.residue2_moiety_key="Residue2Moiety"
70  self.type[self.residue2_moiety_key]=str
71  self.site_pairs_key="SitePairs"
72  self.type[self.site_pairs_key]=str
73  self.unique_id_key="XLUniqueID"
74  self.type[self.unique_id_key]=str
75  self.unique_sub_index_key="XLUniqueSubIndex"
76  self.type[self.unique_sub_index_key]=int
77  self.unique_sub_id_key="XLUniqueSubID"
78  self.type[self.unique_sub_id_key]=str
79  self.data_set_name_key="DataSetName"
80  self.type[self.data_set_name_key]=str
81  self.cross_linker_chemical_key="CrossLinkerChemical"
82  self.type[self.cross_linker_chemical_key]=str
83  self.id_score_key="IDScore"
84  self.type[self.id_score_key]=float
85  self.fdr_key="FDR"
86  self.type[self.fdr_key]=float
87  self.quantitation_key="Quantitation"
88  self.type[self.quantitation_key]=float
89  self.redundancy_key="Redundancy"
90  self.type[self.redundancy_key]=int
91  self.redundancy_list_key="RedundancyList"
92  self.type[self.redundancy_key]=list
93  self.ambiguity_key="Ambiguity"
94  self.type[self.ambiguity_key]=int
95  self.residue1_links_number_key="Residue1LinksNumber"
96  self.type[self.residue1_links_number_key]=int
97  self.residue2_links_number_key="Residue2LinksNumber"
98  self.type[self.residue2_links_number_key]=int
99  self.type[self.ambiguity_key]=int
100  self.state_key="State"
101  self.type[self.state_key]=int
102  self.sigma1_key="Sigma1"
103  self.type[self.sigma1_key]=str
104  self.sigma2_key="Sigma2"
105  self.type[self.sigma2_key]=str
106  self.psi_key="Psi"
107  self.type[self.psi_key]=str
108  self.distance_key="Distance"
109  self.type[self.distance_key]=float
110  self.min_ambiguous_distance_key="MinAmbiguousDistance"
111  self.type[self.distance_key]=float
112  #link types are either Monolink, Intralink or Interlink
113  self.link_type_key="LinkType"
114  self.type[self.link_type_key]=str
115 
116  self.ordered_key_list =[self.data_set_name_key,
117  self.unique_id_key,
118  self.unique_sub_index_key,
119  self.unique_sub_id_key,
120  self.protein1_key,
121  self.protein2_key,
122  self.residue1_key,
123  self.residue2_key,
124  self.residue1_amino_acid_key,
125  self.residue2_amino_acid_key,
126  self.residue1_moiety_key,
127  self.residue2_moiety_key,
128  self.cross_linker_chemical_key,
129  self.id_score_key,
130  self.fdr_key,
131  self.quantitation_key,
132  self.redundancy_key,
133  self.redundancy_list_key,
134  self.state_key,
135  self.sigma1_key,
136  self.sigma2_key,
137  self.psi_key,
138  self.distance_key,
139  self.min_ambiguous_distance_key,
140  self.link_type_key]
141 
142 
143 class _ProteinsResiduesArray(tuple):
144  '''
145  This class is inherits from tuple, and it is a shorthand for a cross-link
146  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
147  residue1 and residue2.
148  '''
149 
150  def __new__(self,input_data):
151  '''
152  @input_data can be a dict or a tuple
153  '''
154  self.cldbsk=_CrossLinkDataBaseStandardKeys()
155  if type(input_data) is dict:
156  monolink=False
157  p1=input_data[self.cldbsk.protein1_key]
158  try:
159  p2=input_data[self.cldbsk.protein2_key]
160  except KeyError:
161  monolink=True
162  r1=input_data[self.cldbsk.residue1_key]
163  try:
164  r2=input_data[self.cldbsk.residue2_key]
165  except KeyError:
166  monolink=True
167  if not monolink:
168  t=(p1,p2,r1,r2)
169  else:
170  t=(p1,"",r1,None)
171  elif type(input_data) is tuple:
172  if len(input_data)>4 or len(input_data)==3 or len(input_data)==1:
173  raise TypeError("_ProteinsResiduesArray: must have only 4 elements")
174  if len(input_data)==4:
175  p1 = _handle_string_input(input_data[0])
176  p2 = _handle_string_input(input_data[1])
177  r1=input_data[2]
178  r2=input_data[3]
179  if (not (type(r1) is int)) and (not (r1 is None)):
180  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
181  if (not (type(r2) is int)) and (not (r1 is None)):
182  raise TypeError("_ProteinsResiduesArray: residue2 must be a integer")
183  t=(p1,p2,r1,r2)
184  if len(input_data) == 2:
185  p1 = _handle_string_input(input_data[0])
186  r1 = input_data[1]
187  if type(r1) is not int:
188  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
189  t = (p1,"",r1,None)
190  else:
191  raise TypeError("_ProteinsResiduesArray: input must be a dict or tuple")
192  return tuple.__new__(_ProteinsResiduesArray, t)
193 
194  def get_inverted(self):
195  '''
196  Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
197  '''
198  return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
199 
200  def __repr__(self):
201  outstr=self.cldbsk.protein1_key+" "+str(self[0])
202  outstr+=" "+self.cldbsk.protein2_key+" "+str(self[1])
203  outstr+=" "+self.cldbsk.residue1_key+" "+str(self[2])
204  outstr+=" "+self.cldbsk.residue2_key+" "+str(self[3])
205  return outstr
206 
207  def __str__(self):
208  outstr=str(self[0])+"."+str(self[2])+"-"+str(self[1])+"."+str(self[3])
209  return outstr
210 
211 class FilterOperator(object):
212  '''
213  This class allows to create filter functions that can be passed to the CrossLinkDataBase
214  in this way:
215 
216  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
217 
218  where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
219 
220  A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
221 
222  fo.evaluate(xl)
223 
224  and it is used to filter the database
225  '''
226 
227  def __init__(self, argument1, operator, argument2):
228  '''
229  (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
230  or (FilterOperator1,operator.or|and...,FilterOperator2)
231  '''
232  if isinstance(argument1, FilterOperator):
233  self.operations = [argument1, operator, argument2]
234  else:
235  self.operations = []
236  self.values = (argument1, operator, argument2)
237 
238  def __or__(self, FilterOperator2):
239  return FilterOperator(self, operator.or_, FilterOperator2)
240 
241  def __and__(self, FilterOperator2):
242  return FilterOperator(self, operator.and_, FilterOperator2)
243 
244  def __invert__(self):
245  return FilterOperator(self, operator.not_, None)
246 
247  def evaluate(self, xl_item):
248 
249  if len(self.operations) == 0:
250  keyword, operator, value = self.values
251  return operator(xl_item[keyword], value)
252  FilterOperator1, op, FilterOperator2 = self.operations
253 
254  if FilterOperator2 is None:
255  return op(FilterOperator1.evaluate(xl_item))
256  else:
257  return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
258 
259 '''
260 def filter_factory(xl_):
261 
262  class FilterOperator(object):
263  import operator
264  xl = xl_
265 
266  def __new__(self,key,value,oper=operator.eq):
267  return oper(self.xl[key],value)
268 
269  return FilterOperator
270 '''
271 
272 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
273  '''
274  This class is needed to convert the keywords from a generic database
275  to the standard ones
276  '''
277 
278  def __init__(self, list_parser=None):
279  '''
280  @param list_parser an instance of ResiduePairListParser, if any is needed
281  '''
282  self.converter={}
283  self.backward_converter={}
284  _CrossLinkDataBaseStandardKeys.__init__(self)
285  self.rplp = list_parser
286  if self.rplp is None:
287  # either you have protein1, protein2, residue1, residue2
288  self.compulsory_keys=set([self.protein1_key,
289  self.protein2_key,
290  self.residue1_key,
291  self.residue2_key])
292  else:
293  self.compulsory_keys=self.rplp.get_compulsory_keys()
294  self.setup_keys=set()
295 
296  def check_keys(self):
297  '''
298  Is a function that check whether necessary keys are setup
299  '''
300  setup_keys=set(self.get_setup_keys())
301  if self.compulsory_keys & setup_keys != self.compulsory_keys:
302  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
303 
304  def get_setup_keys(self):
305  '''
306  Returns the keys that have been setup so far
307  '''
308  return self.backward_converter.keys()
309 
310  def set_standard_keys(self):
311  """
312  This sets up the standard compulsory keys as defined by
313  _CrossLinkDataBaseStandardKeys
314  """
315  for ck in self.compulsory_keys:
316  self.converter[ck]=ck
317  self.backward_converter[ck]=ck
318 
319  def set_unique_id_key(self,origin_key):
320  self.converter[origin_key]=self.unique_id_key
321  self.backward_converter[self.unique_id_key]=origin_key
322 
323  def set_protein1_key(self,origin_key):
324  self.converter[origin_key]=self.protein1_key
325  self.backward_converter[self.protein1_key]=origin_key
326 
327  def set_protein2_key(self,origin_key):
328  self.converter[origin_key]=self.protein2_key
329  self.backward_converter[self.protein2_key]=origin_key
330 
331  def set_residue1_key(self,origin_key):
332  self.converter[origin_key]=self.residue1_key
333  self.backward_converter[self.residue1_key]=origin_key
334 
335  def set_residue2_key(self,origin_key):
336  self.converter[origin_key]=self.residue2_key
337  self.backward_converter[self.residue2_key]=origin_key
338 
339  def set_residue1_amino_acid_key(self, origin_key):
340  self.converter[origin_key] = self.residue1_amino_acid_key
341  self.backward_converter[self.residue1_amino_acid_key] = origin_key
342 
343  def set_residue2_amino_acid_key(self, origin_key):
344  self.converter[origin_key] = self.residue2_amino_acid_key
345  self.backward_converter[self.residue2_amino_acid_key] = origin_key
346 
347  def set_residue1_moiety_key(self, origin_key):
348  self.converter[origin_key] = self.residue1_moiety_key
349  self.backward_converter[self.residue1_moiety_key] = origin_key
350 
351  def set_residue2_moiety_key(self, origin_key):
352  self.converter[origin_key] = self.residue2_moiety_key
353  self.backward_converter[self.residue2_moiety_key] = origin_key
354 
355  def set_site_pairs_key(self,origin_key):
356  self.converter[origin_key]=self.site_pairs_key
357  self.backward_converter[self.site_pairs_key]=origin_key
358 
359  def set_id_score_key(self,origin_key):
360  self.converter[origin_key]=self.id_score_key
361  self.backward_converter[self.id_score_key]=origin_key
362 
363  def set_fdr_key(self,origin_key):
364  self.converter[origin_key]=self.fdr_key
365  self.backward_converter[self.fdr_key]=origin_key
366 
367  def set_quantitation_key(self,origin_key):
368  self.converter[origin_key]=self.quantitation_key
369  self.backward_converter[self.quantitation_key]=origin_key
370 
371  def set_psi_key(self,origin_key):
372  self.converter[origin_key]=self.psi_key
373  self.backward_converter[self.psi_key]=origin_key
374 
375  def set_link_type_key(self,link_type_key):
376  self.converter[link_type_key]=self.link_type_key
377  self.backward_converter[self.link_type_key]=link_type_key
378 
379  def get_converter(self):
380  '''
381  Returns the dictionary that convert the old keywords to the new ones
382  '''
383  self.check_keys()
384  return self.converter
385 
387  '''
388  Returns the dictionary that convert the new keywords to the old ones
389  '''
390  self.check_keys()
391  return self.backward_converter
392 
393 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
394  '''
395  A class to handle different styles of site pairs parsers.
396  Implemented styles:
397  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
398  [Y3-;K4-] for dead-ends
399  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
400  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
401  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
402  '''
403 
404  import re
405 
406  def __init__(self,style):
407 
408  _CrossLinkDataBaseStandardKeys.__init__(self)
409  if style is "MSSTUDIO":
410  self.style=style
411  self.compulsory_keys= set([self.protein1_key,
412  self.protein2_key,
413  self.site_pairs_key])
414  elif style is "XTRACT" or style is "QUANTITATION":
415  self.style=style
416  self.compulsory_keys= set([self.site_pairs_key])
417  elif style is "LAN_HUANG":
418  self.style=style
419  self.compulsory_keys= set([self.site_pairs_key])
420  else:
421  raise Error("ResiduePairListParser: unknown list parser style")
422 
423  def get_compulsory_keys(self):
424  return self.compulsory_keys
425 
426  def get_list(self,input_string):
427  '''
428  This function returns a list of cross-linked residues and the corresponding list of
429  cross-linked chains. The latter list can be empty, if the style doesn't have the
430  corresponding information.
431  '''
432  if self.style == "MSSTUDIO":
433  input_string=input_string.replace("[","")
434  input_string=input_string.replace("]","")
435  input_string_pairs=input_string.split(";")
436  residue_pair_indexes=[]
437  chain_pair_indexes=[]
438  for s in input_string_pairs:
439  m1=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
440  m2=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
441  if m1:
442  # cross link
443  residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
444  residue_pair_indexes.append((residue_index_1,residue_index_2))
445  elif m2:
446  # dead end
447  residue_type_1,residue_index_1=m2.group(1,2)
448  # at this stage chain_pair_indexes is empty
449  return residue_pair_indexes,chain_pair_indexes
450  if self.style is "XTRACT" or self.style is "QUANTITATION":
451  if ":x:" in input_string:
452  # if it is a crosslink....
453  input_strings=input_string.split(":x:")
454  first_peptides=input_strings[0].split(":|:")
455  second_peptides=input_strings[1].split(":|:")
456  first_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in first_peptides]
457  second_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in second_peptides]
458  residue_pair_indexes=[]
459  chain_pair_indexes=[]
460  for fpi in first_peptides_indentifiers:
461  for spi in second_peptides_indentifiers:
462  chain1=fpi[0]
463  chain2=spi[0]
464  residue1=fpi[1]
465  residue2=spi[1]
466  residue_pair_indexes.append((residue1,residue2))
467  chain_pair_indexes.append((chain1,chain2))
468  return residue_pair_indexes, chain_pair_indexes
469  else:
470  # if it is a monolink....
471  first_peptides = input_string.split(":|:")
472  first_peptides_indentifiers = [(f.split(":")[0], f.split(":")[1]) for f in first_peptides]
473  residue_indexes = []
474  chain_indexes = []
475  for fpi in first_peptides_indentifiers:
476  chain1=fpi[0]
477  residue1=fpi[1]
478  residue_indexes.append((residue1,))
479  chain_indexes.append((chain1,))
480  return residue_indexes, chain_indexes
481  if self.style is "LAN_HUANG":
482  input_strings=input_string.split("-")
483  chain1,first_series=input_strings[0].split(":")
484  chain2,second_series=input_strings[1].split(":")
485 
486  first_residues=first_series.replace(";","|").split("|")
487  second_residues=second_series.replace(";","|").split("|")
488  residue_pair_indexes=[]
489  chain_pair_indexes=[]
490  for fpi in first_residues:
491  for spi in second_residues:
492  residue1=self.re.sub("[^0-9]", "", fpi)
493  residue2=self.re.sub("[^0-9]", "", spi)
494  residue_pair_indexes.append((residue1,residue2))
495  chain_pair_indexes.append((chain1,chain2))
496  return residue_pair_indexes, chain_pair_indexes
497 
498 
499 
500 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
501  '''
502  A class to handle different XL format with fixed format
503  currently support ProXL
504  '''
505  def __init__(self,format):
506 
507  _CrossLinkDataBaseStandardKeys.__init__(self)
508  if format is "PROXL":
509  self.format=format
510  else:
511  raise Error("FixedFormatParser: unknown list format name")
512 
513 
514  def get_data(self,input_string):
515  if self.format is "PROXL":
516  tockens=input_string.split("\t")
517  xl={}
518  if tockens[0]=="SEARCH ID(S)":
519  return None
520  else:
521  xl[self.protein1_key]=tockens[2]
522  xl[self.protein2_key]=tockens[4]
523  xl[self.residue1_key]=int(tockens[3])
524  xl[self.residue2_key]=int(tockens[5])
525  return xl
526 
527 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
528  import operator
529  '''
530  this class handles a cross-link dataset and do filtering
531  operations, adding cross-links, merge datasets...
532  '''
533 
534  def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=('K',)):
535  '''
536  Constructor.
537  @param converter an instance of CrossLinkDataBaseKeywordsConverter
538  @param data_base an instance of CrossLinkDataBase to build the new database on
539  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
540  crosslink consistency. If not given consistency will not be checked
541  @param linkable_aa a tuple containing one-letter amino acids which are linkable by the crosslinker;
542  only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
543  and if a fasta_seq is given
544  '''
545 
546  if data_base is None:
547  self.data_base = {}
548  else:
549  self.data_base=data_base
550 
551  _CrossLinkDataBaseStandardKeys.__init__(self)
552  if converter is not None:
553  self.cldbkc = converter #type: CrossLinkDataBaseKeywordsConverter
554  self.list_parser=self.cldbkc.rplp
555  self.converter = converter.get_converter()
556 
557  else:
558  self.cldbkc = None #type: CrossLinkDataBaseKeywordsConverter
559  self.list_parser=None
560  self.converter = None
561 
562  # default amino acids considered to be 'linkable' if none are given
563  self.def_aa_tuple = linkable_aa
564  self.fasta_seq = fasta_seq #type: IMP.pmi.topology.Sequences
565  self.dataset = None
566  self._update()
567 
568  def _update(self):
569  '''
570  Update the whole dataset after changes
571  '''
572  self.update_cross_link_unique_sub_index()
573  self.update_cross_link_redundancy()
574  self.update_residues_links_number()
575  self.check_cross_link_consistency()
576 
577 
578  def __iter__(self):
579  sorted_ids=sorted(self.data_base.keys())
580  for k in sorted_ids:
581  for xl in self.data_base[k]:
582  yield xl
583 
584  def xlid_iterator(self):
585  sorted_ids=sorted(self.data_base.keys())
586  for xlid in sorted_ids:
587  yield xlid
588 
589  def __getitem__(self,xlid):
590  return self.data_base[xlid]
591 
592  def __len__(self):
593  return len([xl for xl in self])
594 
595  def get_name(self):
596  return self.name
597 
598  def set_name(self,name):
599  new_data_base={}
600  for k in self.data_base:
601  new_data_base[k+"."+name]=self.data_base[k]
602  self.data_base=new_data_base
603  self.name=name
604  self._update()
605 
606  def get_number_of_xlid(self):
607  return len(self.data_base)
608 
609 
610  def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
611  '''
612  if FixedFormatParser is not specified, the file is comma-separated-values
613  @param file_name a txt file to be parsed
614  @param converter an instance of CrossLinkDataBaseKeywordsConverter
615  @param FixedFormatParser a parser for a fixed format
616  '''
617  if not FixedFormatParser:
618  xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
619 
620  if converter is not None:
621  self.cldbkc = converter
622  self.list_parser=self.cldbkc.rplp
623  self.converter = converter.get_converter()
624 
625 
626  if not self.list_parser:
627  # normal procedure without a list_parser
628  # each line is a cross-link
629  new_xl_dict={}
630  for nxl,xl in enumerate(xl_list):
631  new_xl={}
632  for k in xl:
633  if k in self.converter:
634  new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
635  else:
636  new_xl[k]=xl[k]
637  if self.unique_id_key in self.cldbkc.get_setup_keys():
638  if new_xl[self.unique_id_key] not in new_xl_dict:
639  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
640  else:
641  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
642  else:
643  if str(nxl) not in new_xl_dict:
644  new_xl_dict[str(nxl)]=[new_xl]
645  else:
646  new_xl_dict[str(nxl)].append(new_xl)
647 
648  else:
649  # with a list_parser, a line can be a list of ambiguous crosslinks
650  new_xl_dict={}
651  for nxl,entry in enumerate(xl_list):
652 
653  # first get the translated keywords
654  new_dict={}
655  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
656  raise Error("CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
657  for k in entry:
658  if k in self.converter:
659  new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
660  else:
661  new_dict[k]=entry[k]
662 
663  residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
664 
665  # then create the crosslinks
666  for n,p in enumerate(residue_pair_list):
667  is_monolink=False
668  if len(p)==1:
669  is_monolink=True
670 
671  new_xl={}
672  for k in new_dict:
673  new_xl[k]=new_dict[k]
674  new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
675  if not is_monolink:
676  new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
677 
678  if len(chain_pair_list)==len(residue_pair_list):
679  new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
680  if not is_monolink:
681  new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
682 
683  if not is_monolink:
684  new_xl[self.link_type_key]="CROSSLINK"
685  else:
686  new_xl[self.link_type_key]="MONOLINK"
687 
688  if self.unique_id_key in self.cldbkc.get_setup_keys():
689  if new_xl[self.unique_id_key] not in new_xl_dict:
690  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
691  else:
692  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
693  else:
694  if str(nxl) not in new_xl_dict:
695  new_xl[self.unique_id_key]=str(nxl+1)
696  new_xl_dict[str(nxl)]=[new_xl]
697  else:
698  new_xl[self.unique_id_key]=str(nxl+1)
699  new_xl_dict[str(nxl)].append(new_xl)
700 
701 
702  else:
703  '''
704  if FixedFormatParser is defined
705  '''
706 
707  new_xl_dict={}
708  f=open(file_name,"r")
709  nxl=0
710  for line in f:
711  xl=FixedFormatParser.get_data(line)
712  if xl:
713  xl[self.unique_id_key]=str(nxl+1)
714  new_xl_dict[str(nxl)]=[xl]
715  nxl+=1
716 
717 
718  self.data_base=new_xl_dict
719  self.name=file_name
720  l = ihm.location.InputFileLocation(file_name, details='Crosslinks')
721  self.dataset = ihm.dataset.CXMSDataset(l)
722  self._update()
723 
724  def update_cross_link_unique_sub_index(self):
725  for k in self.data_base:
726  for n,xl in enumerate(self.data_base[k]):
727  xl[self.ambiguity_key]=len(self.data_base[k])
728  xl[self.unique_sub_index_key]=n+1
729  xl[self.unique_sub_id_key]=k+"."+str(n+1)
730 
731  def update_cross_link_redundancy(self):
732  redundancy_data_base={}
733  for xl in self:
734  pra=_ProteinsResiduesArray(xl)
735  if pra not in redundancy_data_base:
736  redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
737  redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
738  else:
739  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
740  redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
741  for xl in self:
742  pra=_ProteinsResiduesArray(xl)
743  xl[self.redundancy_key]=len(redundancy_data_base[pra])
744  xl[self.redundancy_list_key]=redundancy_data_base[pra]
745 
746  def update_residues_links_number(self):
747  residue_links={}
748  for xl in self:
749  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
750  if (p1,r1) not in residue_links:
751  residue_links[(p1,r1)]=set([(p2,r2)])
752  else:
753  residue_links[(p1,r1)].add((p2,r2))
754  if (p2,r2) not in residue_links:
755  residue_links[(p2,r2)]=set([(p1,r1)])
756  else:
757  residue_links[(p2,r2)].add((p1,r1))
758 
759  for xl in self:
760  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
761  xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
762  xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
763 
764  def check_cross_link_consistency(self):
765  """This function checks the consistency of the dataset with the amino acid sequence"""
766  if self.cldbkc and self.fasta_seq:
767  cnt_matched, cnt_matched_file = 0, 0
768  matched = {}
769  non_matched = {}
770  for xl in self:
771  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
772  b_matched_file = False
773  if self.residue1_amino_acid_key in xl:
774  # either you know the residue type and aa_tuple is a single entry
775  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
776  b_matched = self._match_xlinks(p1, r1, aa_from_file)
777  b_matched_file = b_matched
778  else:
779  # or pass the possible list of types that can be crosslinked
780  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
781 
782  matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
783 
784  if self.residue2_amino_acid_key in xl:
785  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
786  b_matched = self._match_xlinks(p2, r2, aa_from_file)
787  b_matched_file = b_matched
788  else:
789  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
790 
791  matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
792  if b_matched: cnt_matched += 1
793  if b_matched_file: cnt_matched_file += 1
794  if len(self) > 0:
795  percentage_matched = round(100*cnt_matched/len(self),1)
796  percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
797  #if matched: print "Matched xlinks:", matched
798  if matched or non_matched: print("check_cross_link_consistency: Out of %d crosslinks "
799  "%d were matched to the fasta sequence (%f %%).\n "
800  "%d were matched by using the crosslink file (%f %%)."%
801  (len(self),cnt_matched,percentage_matched,cnt_matched_file,
802  percentage_matched_file) )
803  if non_matched: print("check_cross_link_consistency: Warning: Non matched xlinks:",
804  [(prot_name, sorted(list(non_matched[prot_name]))) for prot_name in non_matched])
805  return matched,non_matched
806 
807  def _match_xlinks(self, prot_name, res_index, aa_tuple):
808  # returns Boolean whether given aa matches a position in the fasta file
809  # cross link files usually start counting at 1 and not 0; therefore subtract -1 to compare with fasta
810  amino_dict = IMP.pmi.tools.ThreeToOneConverter()
811  res_index -= 1
812  for amino_acid in aa_tuple:
813  if len(amino_acid) == 3:
814  amino_acid = amino_dict[amino_acid.upper()]
815  if prot_name in self.fasta_seq.sequences:
816  seq = self.fasta_seq.sequences[prot_name]
817  # if we are looking at the first amino acid in a given sequence always return a match
818  # the first aa should always be the n-terminal aa
819  # which may form a crosslink in any case (for BS3-like crosslinkers)
820  # for some data sets the first aa is at position 1; todo: check why this is the case
821  if res_index == 0 or res_index == 1:
822  return True
823  if res_index < len(seq):
824  if amino_acid == seq[res_index]:
825  return True
826  # else:
827  # print "Could not match", prot, res+1, amino_acid, seq[res]
828  return False
829 
830  def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
831  if b_matched:
832  if prot in matched:
833  matched[prot].add(res)
834  else:
835  matched[prot] = set([res])
836  else:
837  if prot in non_matched:
838  non_matched[prot].add(res)
839  else:
840  non_matched[prot] = set([res])
841  return matched, non_matched
842 
843 
844  def get_cross_link_string(self,xl):
845  string='|'
846  for k in self.ordered_key_list:
847  try:
848  string+=str(k)+":"+str(xl[k])+"|"
849  except KeyError:
850  continue
851 
852  for k in xl:
853  if k not in self.ordered_key_list:
854  string+=str(k)+":"+str(xl[k])+"|"
855 
856  return string
857 
858  def get_short_cross_link_string(self,xl):
859 
860  string='|'
861  list_of_keys=[self.data_set_name_key,
862  self.unique_sub_id_key,
863  self.protein1_key,
864  self.residue1_key,
865  self.protein2_key,
866  self.residue2_key,
867  self.state_key,
868  self.psi_key]
869 
870  for k in list_of_keys:
871  try:
872  string+=str(xl[k])+"|"
873  except KeyError:
874  continue
875 
876  return string
877 
878  def filter(self,FilterOperator):
879  new_xl_dict={}
880  for id in self.data_base.keys():
881  for xl in self.data_base[id]:
882  if FilterOperator.evaluate(xl):
883  if id not in new_xl_dict:
884  new_xl_dict[id]=[xl]
885  else:
886  new_xl_dict[id].append(xl)
887  self._update()
888  cdb = CrossLinkDataBase(self.cldbkc,new_xl_dict)
889  cdb.dataset = self.dataset
890  return cdb
891 
892  def filter_score(self,score):
893  '''Get all crosslinks with score greater than an input value'''
894  FilterOperator=IMP.pmi.io.crosslink.FilterOperator(self.id_score_key,operator.gt,score)
895  return self.filter(FilterOperator)
896 
897  def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
898  '''
899  This function merges two cross-link datasets so that if two conflicting crosslinks have the same
900  cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
901  with different SubIDs
902  '''
903  pass
904 
905  def append_database(self,CrossLinkDataBase2):
906  '''
907  This function append one cross-link dataset to another. Unique ids will be renamed to avoid
908  conflicts.
909  '''
910  name1=self.get_name()
911  name2=CrossLinkDataBase2.get_name()
912  if name1 == name2:
913  name1=id(self)
914  name2=id(CrossLinkDataBase2)
915 
916  #rename first database:
917  new_data_base={}
918  for k in self.data_base:
919  new_data_base[k]=self.data_base[k]
920  for k in CrossLinkDataBase2.data_base:
921  new_data_base[k]=CrossLinkDataBase2.data_base[k]
922  self.data_base=new_data_base
923  self._update()
924 
925  def set_value(self,key,new_value,FilterOperator=None):
926  '''
927  This function changes the value for a given key in the database
928  For instance one can change the name of a protein
929  @param key: the key in the database that must be changed
930  @param new_value: the new value of the key
931  @param FilterOperator: optional FilterOperator to change the value to
932  a subset of the database
933 
934  example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
935  '''
936 
937  for xl in self:
938  if FilterOperator is not None:
939  if FilterOperator.evaluate(xl):
940  xl[key]=new_value
941  else:
942  xl[key]=new_value
943  self._update()
944 
945  def get_values(self,key):
946  '''
947  this function returns the list of values for a given key in the database
948  alphanumerically sorted
949  '''
950  values=set()
951  for xl in self:
952  values.add(xl[key])
953  return sorted(list(values))
954 
955  def offset_residue_index(self,protein_name,offset):
956  '''
957  This function offset the residue indexes of a given protein by a specified value
958  @param protein_name: the protein name that need to be changed
959  @param offset: the offset value
960  '''
961 
962  for xl in self:
963  if xl[self.protein1_key] == protein_name:
964  xl[self.residue1_key]=xl[self.residue1_key]+offset
965  if xl[self.protein2_key] == protein_name:
966  xl[self.residue2_key]=xl[self.residue2_key]+offset
967  self._update()
968 
969  def create_new_keyword(self,keyword,values_from_keyword=None):
970  '''
971  This function creates a new keyword for the whole database and set the values from
972  and existing keyword (optional), otherwise the values are set to None
973  @param keyword the new keyword name:
974  @param values_from_keyword the keyword from which we are copying the values:
975  '''
976  for xl in self:
977  if values_from_keyword is not None:
978  xl[keyword] = xl[values_from_keyword]
979  else:
980  xl[keyword] = None
981  self._update()
982 
983  def rename_proteins(self,old_to_new_names_dictionary, protein_to_rename="both"):
984  '''
985  This function renames all proteins contained in the input dictionary
986  from the old names (keys) to the new name (values)
987  @param old_to_new_names_dictionary dictionary for converting old to new names
988  @param protein_to_rename specify whether to rename both or protein1 or protein2 only
989  '''
990 
991  for old_name in old_to_new_names_dictionary:
992  new_name=old_to_new_names_dictionary[old_name]
993  if protein_to_rename == "both" or protein_to_rename == "protein1":
994  fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
995  self.set_value(self.protein1_key,new_name,fo2)
996  if protein_to_rename == "both" or protein_to_rename == "protein2":
997  fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
998  self.set_value(self.protein2_key,new_name,fo2)
999 
1000  def clone_protein(self,protein_name,new_protein_name):
1001  new_xl_dict={}
1002  for id in self.data_base.keys():
1003  new_data_base=[]
1004  for xl in self.data_base[id]:
1005  new_data_base.append(xl)
1006  if xl[self.protein1_key]==protein_name and xl[self.protein2_key]!=protein_name:
1007  new_xl=dict(xl)
1008  new_xl[self.protein1_key]=new_protein_name
1009  new_data_base.append(new_xl)
1010  elif xl[self.protein1_key]!=protein_name and xl[self.protein2_key]==protein_name:
1011  new_xl=dict(xl)
1012  new_xl[self.protein2_key]=new_protein_name
1013  new_data_base.append(new_xl)
1014  elif xl[self.protein1_key]==protein_name and xl[self.protein2_key]==protein_name:
1015  new_xl=dict(xl)
1016  new_xl[self.protein1_key]=new_protein_name
1017  new_data_base.append(new_xl)
1018  new_xl=dict(xl)
1019  new_xl[self.protein2_key]=new_protein_name
1020  new_data_base.append(new_xl)
1021  new_xl=dict(xl)
1022  new_xl[self.protein1_key]=new_protein_name
1023  new_xl[self.protein2_key]=new_protein_name
1024  new_data_base.append(new_xl)
1025  self.data_base[id]=new_data_base
1026  self._update()
1027 
1028  def filter_out_same_residues(self):
1029  '''
1030  This function remove cross-links applied to the same residue
1031  (ie, same chain name and residue number)
1032  '''
1033  new_xl_dict={}
1034  for id in self.data_base.keys():
1035  new_data_base=[]
1036  for xl in self.data_base[id]:
1037  if xl[self.protein1_key]==xl[self.protein2_key] and xl[self.residue1_key]==xl[self.residue2_key]:
1038  continue
1039  else:
1040  new_data_base.append(xl)
1041  self.data_base[id]=new_data_base
1042  self._update()
1043 
1044 
1045  def jackknife(self,percentage):
1046  '''
1047  this method returns a CrossLinkDataBase class containing
1048  a random subsample of the original cross-link database.
1049  @param percentage float between 0 and 1, is the percentage of
1050  of spectra taken from the original list
1051  '''
1052  import random
1053  if percentage > 1.0 or percentage < 0.0:
1054  raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
1055  nspectra=self.get_number_of_xlid()
1056  nrandom_spectra=int(nspectra*percentage)
1057  random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1058  new_data_base={}
1059  for k in random_keys:
1060  new_data_base[k]=self.data_base[k]
1061  return CrossLinkDataBase(self.cldbkc,new_data_base)
1062 
1063  def __str__(self):
1064  outstr=''
1065  sorted_ids=sorted(self.data_base.keys())
1066 
1067  for id in sorted_ids:
1068  outstr+=id+"\n"
1069  for xl in self.data_base[id]:
1070  for k in self.ordered_key_list:
1071  try:
1072  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1073  except KeyError:
1074  continue
1075 
1076  for k in xl:
1077  if k not in self.ordered_key_list:
1078  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1079  outstr+="-------------\n"
1080  return outstr
1081 
1082 
1083  def plot(self,filename,**kwargs):
1084  import matplotlib
1085  matplotlib.use('Agg')
1086  import matplotlib.pyplot as plt
1087  import matplotlib.colors
1088 
1089 
1090 
1091  if kwargs["type"] == "scatter":
1092  cmap=plt.get_cmap("rainbow")
1093  norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1094  xkey=kwargs["xkey"]
1095  ykey=kwargs["ykey"]
1096  if "colorkey" in kwargs:
1097  colorkey=kwargs["colorkey"]
1098  if "sizekey" in kwargs:
1099  sizekey=kwargs["sizekey"]
1100  if "logyscale" in kwargs:
1101  logyscale=kwargs["logyscale"]
1102  else:
1103  logyscale=False
1104  xs=[]
1105  ys=[]
1106  colors=[]
1107  for xl in self:
1108  try:
1109  xs.append(float(xl[xkey]))
1110  if logyscale:
1111  ys.append(math.log10(float(xl[ykey])))
1112  else:
1113  ys.append(float(xl[ykey]))
1114  colors.append(float(xl[colorkey]))
1115  except ValueError:
1116  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1117  continue
1118 
1119  cs=[]
1120  for color in colors:
1121  cindex=(color-min(colors))/(max(colors)-min(colors))
1122  cs.append(cmap(cindex))
1123 
1124  fig = plt.figure()
1125  ax = fig.add_subplot(111)
1126  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker="o")
1127  ax.set_xlabel(xkey)
1128  ax.set_ylabel(ykey)
1129  plt.savefig(filename)
1130  plt.show()
1131  plt.close()
1132 
1133  if kwargs["type"] == "residue_links":
1134  #plot the number of distinct links to a residue
1135  #in an histogram
1136  #molecule name
1137  molecule=kwargs["molecule"]
1138  if type(molecule) is IMP.pmi.topology.Molecule:
1139  length=len(molecule.sequence)
1140  molecule=molecule.get_name()
1141  else:
1142  #you need a IMP.pmi.topology.Sequences object
1143  sequences_object=kwargs["sequences_object"]
1144  sequence=sequences_object.sequences[molecule]
1145  length=len(sequence)
1146 
1147  histogram=[0]*length
1148  for xl in self:
1149  if xl[self.protein1_key]==molecule:
1150  histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1151  if xl[self.protein2_key]==molecule:
1152  histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1153  rects = plt.bar(range(1,length+1), histogram)
1154  #bar_width,
1155  #alpha=opacity,
1156  #color='b',
1157  #yerr=std_men,
1158  #error_kw=error_config,
1159  #label='Men')
1160  plt.savefig(filename)
1161  plt.show()
1162  plt.close()
1163 
1164 
1165 
1166  if kwargs["type"] == "histogram":
1167  valuekey=kwargs["valuekey"]
1168  reference_xline=kwargs["reference_xline"]
1169  valuename=valuekey
1170  bins=kwargs["bins"]
1171  values_list=[]
1172  for xl in self:
1173  try:
1174  values_list.append(float(xl[valuekey]))
1175  except ValueError:
1176  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1177  continue
1179  filename, [values_list], valuename=valuename, bins=bins,
1180  colors=None, format="pdf",
1181  reference_xline=None, yplotrange=None,
1182  xplotrange=None,normalized=True,
1183  leg_names=None)
1184 
1185  def dump(self,json_filename):
1186  import json
1187  with open(json_filename, 'w') as fp:
1188  json.dump(self.data_base, fp, sort_keys=True, indent=2, default=set_json_default)
1189 
1190  def load(self,json_filename):
1191  import json
1192  with open(json_filename, 'r') as fp:
1193  self.data_base = json.load(fp)
1194  self._update()
1195  #getting rid of unicode
1196  # (can't do this in Python 3, since *everything* is Unicode there)
1197  if sys.version_info[0] < 3:
1198  for xl in self:
1199  for k,v in xl.iteritems():
1200  if type(k) is unicode: k=str(k)
1201  if type(v) is unicode: v=str(v)
1202  xl[k]=v
1203 
1204  def save_csv(self,filename):
1205 
1206  import csv
1207 
1208  data=[]
1209  sorted_ids=None
1210  sorted_group_ids=sorted(self.data_base.keys())
1211  for group in sorted_group_ids:
1212  group_block=[]
1213  for xl in self.data_base[group]:
1214  if not sorted_ids:
1215  sorted_ids=sorted(xl.keys())
1216  values=[xl[k] for k in sorted_ids]
1217  group_block.append(values)
1218  data+=group_block
1219 
1220 
1221  with open(filename, 'w') as fp:
1222  a = csv.writer(fp, delimiter=',')
1223  a.writerow(sorted_ids)
1224  a.writerows(data)
1225 
1226  def get_number_of_unique_crosslinked_sites(self):
1227  """
1228  Returns the number of non redundant crosslink sites
1229  """
1230  praset=set()
1231  for xl in self:
1232  pra=_ProteinsResiduesArray(xl)
1233  prai=pra.get_inverted()
1234  praset.add(pra)
1235  praset.add(prai)
1236  return len(list(praset))
1237 
1239  """This class allows to compute and plot the distance between datasets"""
1240 
1241  def __init__(self,cldb_dictionary):
1242  """Input a dictionary where keys are cldb names and values are cldbs"""
1243  import scipy.spatial.distance
1244  self.dbs=cldb_dictionary
1245  self.keylist=list(self.dbs.keys())
1246  self.distances=list()
1247 
1248 
1249  for i,key1 in enumerate(self.keylist):
1250  for key2 in self.keylist[i+1:]:
1251  distance=self.get_distance(key1,key2)
1252  self.distances.append(distance)
1253 
1254  self.distances=scipy.spatial.distance.squareform(self.distances)
1255 
1256  def get_distance(self,key1,key2):
1257  return 1.0-self.jaccard_index(self.dbs[key1],self.dbs[key2])
1258 
1259  def jaccard_index(self,CrossLinkDataBase1,CrossLinkDataBase2):
1260  """Similarity index between two datasets
1261  https://en.wikipedia.org/wiki/Jaccard_index"""
1262 
1263  set1=set()
1264  set2=set()
1265  for xl1 in CrossLinkDataBase1:
1266  a1f=_ProteinsResiduesArray(xl1)
1267  a1b=a1f.get_inverted()
1268  set1.add(a1f)
1269  set1.add(a1b)
1270  for xl2 in CrossLinkDataBase2:
1271  a2f=_ProteinsResiduesArray(xl2)
1272  a2b=a2f.get_inverted()
1273  set2.add(a2f)
1274  set2.add(a2b)
1275  return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1276 
1277  def plot_matrix(self,figurename="clustermatrix.pdf"):
1278  import matplotlib as mpl
1279  import numpy
1280  mpl.use('Agg')
1281  import matplotlib.pylab as pl
1282  from scipy.cluster import hierarchy as hrc
1283 
1284  raw_distance_matrix=self.distances
1285  labels=self.keylist
1286 
1287  fig = pl.figure()
1288  #fig.autolayout=True
1289 
1290  ax = fig.add_subplot(1,1,1)
1291  dendrogram = hrc.dendrogram(
1292  hrc.linkage(raw_distance_matrix),
1293  color_threshold=7,
1294  no_labels=True)
1295  leaves_order = dendrogram['leaves']
1296  ax.set_xlabel('Dataset')
1297  ax.set_ylabel('Jaccard Distance')
1298  pl.tight_layout()
1299  pl.savefig("dendrogram."+figurename, dpi=300)
1300  pl.close(fig)
1301 
1302  fig = pl.figure()
1303  #fig.autolayout=True
1304 
1305  ax = fig.add_subplot(1,1,1)
1306  cax = ax.imshow(
1307  raw_distance_matrix[leaves_order,
1308  :][:,
1309  leaves_order],
1310  interpolation='nearest')
1311  cb = fig.colorbar(cax)
1312  cb.set_label('Jaccard Distance')
1313  ax.set_xlabel('Dataset')
1314  ax.set_ylabel('Dataset')
1315  ax.set_xticks(range(len(labels)))
1316  ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation='vertical')
1317  ax.set_yticks(range(len(labels)))
1318  ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation='horizontal')
1319  pl.tight_layout()
1320  pl.savefig("matrix."+figurename, dpi=300)
1321  pl.close(fig)
1322 
1323 
1325  '''
1326  This class maps a CrossLinkDataBase on a given structure
1327  and save an rmf file with color-coded crosslinks
1328  '''
1329 
1330 
1331  def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1332  self.CrossLinkDataBase=CrossLinkDataBase
1333  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1334  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1335  self.prots=rmf_or_stat_handler
1336  self.distances=defaultdict(list)
1337  self.array_to_id={}
1338  self.id_to_array={}
1339 
1340  print("computing distances fro all crosslinks and all structures")
1341  for i in self.prots[::10]:
1342  self.compute_distances()
1343  for key,xl in enumerate(self.CrossLinkDataBase):
1344  array=_ProteinsResiduesArray(xl)
1345  self.array_to_id[array]=key
1346  self.id_to_array[key]=array
1347  if xl["MinAmbiguousDistance"] is not 'None':
1348  self.distances[key].append(xl["MinAmbiguousDistance"])
1349 
1350  def compute_distances(self):
1351  data=[]
1352  sorted_ids=None
1353  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1354  for group in sorted_group_ids:
1355  #group_block=[]
1356  group_dists=[]
1357  for xl in self.CrossLinkDataBase.data_base[group]:
1358  if not sorted_ids:
1359  sorted_ids=sorted(xl.keys())
1360  data.append(sorted_ids+["UniqueID","Distance","MinAmbiguousDistance"])
1361  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1362  try:
1363  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1364  except:
1365  mdist="None"
1366  state1="None"
1367  copy1="None"
1368  state2="None"
1369  copy2="None"
1370  try:
1371  # sometimes keys get "lost" in the database, not really sure why
1372  values=[xl[k] for k in sorted_ids]
1373  values += [group, mdist]
1374  except KeyError as e:
1375  print("MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1376  group_dists.append(mdist)
1377  #group_block.append(values)
1378  xl["Distance"]=mdist
1379  xl["State1"]=state1
1380  xl["Copy1"]=copy1
1381  xl["State2"]=state2
1382  xl["Copy2"]=copy2
1383 
1384  for xl in self.CrossLinkDataBase.data_base[group]:
1385  xl["MinAmbiguousDistance"]=min(group_dists)
1386 
1387  def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1388  '''more robust and slower version of above'''
1389  sel=IMP.atom.Selection(self.prots,molecule=c1,residue_index=r1,resolution=1)
1390  selpart_1=sel.get_selected_particles()
1391  if len(selpart_1)==0:
1392  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1393  return None
1394  sel=IMP.atom.Selection(self.prots,molecule=c2,residue_index=r2,resolution=1)
1395  selpart_2=sel.get_selected_particles()
1396  if len(selpart_2)==0:
1397  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1398  return None
1399  results=[]
1400  for p1 in selpart_1:
1401  for p2 in selpart_2:
1402  if p1 == p2 and r1 == r2: continue
1403  d1=IMP.core.XYZ(p1)
1404  d2=IMP.core.XYZ(p2)
1405  #round distance to second decimal
1406  dist=float(int(IMP.core.get_distance(d1,d2)*100.0))/100.0
1407  h1=IMP.atom.Hierarchy(p1)
1408  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1409  h1=h1.get_parent()
1410  copy_index1=IMP.atom.Copy(h1).get_copy_index()
1411  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1412  h1=h1.get_parent()
1413  state_index1=IMP.atom.State(h1).get_state_index()
1414  h2=IMP.atom.Hierarchy(p2)
1415  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1416  h2=h2.get_parent()
1417  copy_index2=IMP.atom.Copy(h2).get_copy_index()
1418  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1419  h2=h2.get_parent()
1420  state_index2=IMP.atom.State(h2).get_state_index()
1421 
1422  results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1423  if len(results)==0: return None
1424  results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1425  return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1426 
1427  def save_rmf_snapshot(self,filename,color_id=None):
1428  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1429  list_of_pairs=[]
1430  color_scores=[]
1431  for group in sorted_group_ids:
1432  group_dists_particles=[]
1433  for xl in self.CrossLinkDataBase.data_base[group]:
1434  xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1435  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1436  try:
1437  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1438  except TypeError:
1439  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1440  continue
1441  if color_id is not None:
1442  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1443  else:
1444  group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1445  if group_dists_particles:
1446  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1447  color_scores.append(mincolor_score)
1448  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1449  else:
1450  continue
1451 
1452  linear = IMP.core.Linear(0, 0.0)
1453  linear.set_slope(1.0)
1454  dps2 = IMP.core.DistancePairScore(linear)
1455  rslin = IMP.RestraintSet(self.prots.get_model(), 'linear_dummy_restraints')
1456  sgs=[]
1457  offset=min(color_scores)
1458  maxvalue=max(color_scores)
1459  for pair in list_of_pairs:
1460  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2, (pair[0], pair[1]))
1461  pr.set_name(pair[2])
1462  if offset<maxvalue:
1463  factor=(pair[3]-offset)/(maxvalue-offset)
1464  else:
1465  factor=0.0
1466  c=IMP.display.get_rgb_color(factor)
1467  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1468  rslin.add_restraint(pr)
1469  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1470 
1471  rh = RMF.create_rmf_file(filename)
1472  IMP.rmf.add_hierarchies(rh, [self.prots])
1473  IMP.rmf.add_restraints(rh,[rslin])
1474  IMP.rmf.add_geometries(rh, sgs)
1475  IMP.rmf.save_frame(rh)
1476  del rh
1477 
1478  def boxplot_crosslink_distances(self,filename):
1479  import numpy
1480  keys=[self.id_to_array[k] for k in self.distances.keys()]
1481  medians=[numpy.median(self.distances[self.array_to_id[k]]) for k in keys]
1482  dists=[self.distances[self.array_to_id[k]] for k in keys]
1483  distances_sorted_by_median=[x for _,x in sorted(zip(medians,dists))]
1484  keys_sorted_by_median=[x for _,x in sorted(zip(medians,keys))]
1486  distances_sorted_by_median,
1487  range(len(distances_sorted_by_median)),
1488  xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1489 
1490  def get_crosslink_violations(self,threshold):
1491  violations=defaultdict(list)
1492  for k in self.distances:
1493  #viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1494  viols=self.distances[k]
1495  violations[self.id_to_array[k]]=viols
1496  return violations
1497 
1498 
1500  '''
1501  This class generates a CrossLinkDataBase from a given structure
1502  '''
1503  def __init__(self,representation=None,
1504  system=None,
1505  residue_types_1=["K"],
1506  residue_types_2=["K"],
1507  reactivity_range=[0,1],
1508  kt=1.0):
1509 
1510  import numpy.random
1511  import math
1513  cldbkc.set_protein1_key("Protein1")
1514  cldbkc.set_protein2_key("Protein2")
1515  cldbkc.set_residue1_key("Residue1")
1516  cldbkc.set_residue2_key("Residue2")
1517  self.cldb=CrossLinkDataBase(cldbkc)
1518  if representation is not None:
1519  #PMI 1.0 mode
1520  self.mode="pmi1"
1521  self.representation=representation
1522  self.model=self.representation.model
1523  elif system is not None:
1524  #PMI 2.0 mode
1525  self.system=system
1526  self.model=self.system.model
1527  self.mode="pmi2"
1528  else:
1529  print("Argument error: please provide either a representation object or a IMP.Hierarchy")
1530  raise
1531  self.residue_types_1=residue_types_1
1532  self.residue_types_2=residue_types_2
1533  self.kt=kt
1534  self.indexes_dict1={}
1535  self.indexes_dict2={}
1536  self.protein_residue_dict={}
1537  self.reactivity_dictionary={}
1538  self.euclidean_interacting_pairs=None
1539  self.xwalk_interacting_pairs=None
1540  import random
1541 
1542  if self.mode=="pmi1":
1543  for protein in self.representation.sequence_dict.keys():
1544  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1545  seq=self.representation.sequence_dict[protein]
1546  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1547 
1548  for r in residues:
1549  # uniform random reactivities
1550  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1551  # getting reactivities from the CDF of an exponential distribution
1552  rexp=numpy.random.exponential(0.1)
1553  prob=1.0-math.exp(-rexp)
1554  self.reactivity_dictionary[(protein,r)]=prob
1555 
1556 
1557  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1558  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1559  for r in residues1:
1560  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1561  p=h.get_particle()
1562  index=p.get_index()
1563  self.indexes_dict1[index]=(protein,r)
1564  self.protein_residue_dict[(protein,r)]=index
1565  for r in residues2:
1566  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1567  p=h.get_particle()
1568  index=p.get_index()
1569  self.indexes_dict2[index]=(protein,r)
1570  self.protein_residue_dict[(protein,r)]=index
1571 
1572  if self.mode=="pmi2":
1573  for state in self.system.get_states():
1574  for moleculename,molecules in state.get_molecules().items():
1575  for molecule in molecules:
1576  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1577  seq=molecule.sequence
1578  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1579 
1580  for r in residues:
1581  # uniform random reactivities
1582  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1583  # getting reactivities from the CDF of an exponential distribution
1584  rexp=numpy.random.exponential(0.00000001)
1585  prob=1.0-math.exp(-rexp)
1586  self.reactivity_dictionary[(molecule,r)]=prob
1587 
1588  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1589  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1590  for r in residues1:
1591  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1592  ps=s.get_selected_particles()
1593  for p in ps:
1594  index=p.get_index()
1595  self.indexes_dict1[index]=(molecule,r)
1596  self.protein_residue_dict[(molecule,r)]=index
1597  for r in residues2:
1598  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1599  ps=s.get_selected_particles()
1600  for p in ps:
1601  index=p.get_index()
1602  self.indexes_dict2[index]=(molecule,r)
1603  self.protein_residue_dict[(molecule,r)]=index
1604 
1605 
1606  def get_all_possible_pairs(self):
1607  n=float(len(self.protein_residue_dict.keys()))
1608  return n*(n-1.0)/2.0
1609 
1610  def get_all_feasible_pairs(self,distance=21):
1611  import itertools
1612  particle_index_pairs=[]
1613  nxl=0
1614  for a,b in itertools.combinations(self.protein_residue_dict.keys(),2):
1615 
1616  new_xl={}
1617  index1=self.protein_residue_dict[a]
1618  index2=self.protein_residue_dict[b]
1619  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1620  if particle_distance <= distance:
1621  particle_index_pairs.append((index1,index2))
1622  if self.mode=="pmi1":
1623  new_xl[self.cldb.protein1_key]=a[0]
1624  new_xl[self.cldb.protein2_key]=b[0]
1625  elif self.mode=="pmi2":
1626  new_xl[self.cldb.protein1_key]=a[0].get_name()
1627  new_xl[self.cldb.protein2_key]=b[0].get_name()
1628  new_xl["molecule_object1"]=a[0]
1629  new_xl["molecule_object2"]=b[0]
1630  new_xl[self.cldb.residue1_key]=a[1]
1631  new_xl[self.cldb.residue2_key]=b[1]
1632  self.cldb.data_base[str(nxl)]=[new_xl]
1633  nxl+=1
1634  self.cldb._update()
1635  return self.cldb
1636 
1637 
1638 
1639 
1640  def get_data_base(self,total_number_of_spectra,
1641  ambiguity_probability=0.1,
1642  noise=0.01,
1643  distance=21,
1644  max_delta_distance=10.0,
1645  xwalk_bin_path=None,
1646  confidence_false=0.75,
1647  confidence_true=0.75):
1648  import math
1649  from random import random,uniform
1650  import numpy as np
1651  number_of_spectra=1
1652 
1653  self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1654  self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1655  self.cldb.data_base[str(number_of_spectra)]=[]
1656  self.sites_weighted=None
1657 
1658  while number_of_spectra<total_number_of_spectra:
1659  if random() > ambiguity_probability and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1660  # new spectrum
1661  number_of_spectra+=1
1662  self.cldb.data_base[str(number_of_spectra)]=[]
1663  noisy=False
1664  if random() > noise:
1665  # not noisy crosslink
1666  pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1667  else:
1668  # noisy crosslink
1669  pra,dist=self.get_random_residue_pair(None,None,None)
1670  noisy=True
1671 
1672  new_xl={}
1673  if self.mode=="pmi1":
1674  new_xl[self.cldb.protein1_key]=pra[0]
1675  new_xl[self.cldb.protein2_key]=pra[1]
1676  elif self.mode=="pmi2":
1677  new_xl[self.cldb.protein1_key]=pra[0].get_name()
1678  new_xl[self.cldb.protein2_key]=pra[1].get_name()
1679  new_xl["molecule_object1"]=pra[0]
1680  new_xl["molecule_object2"]=pra[1]
1681  new_xl[self.cldb.residue1_key]=pra[2]
1682  new_xl[self.cldb.residue2_key]=pra[3]
1683  new_xl["Noisy"]=noisy
1684  # the reactivity is defined as r=1-exp(-k*Delta t)
1685  new_xl["Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1686  new_xl["Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1687  r1=new_xl["Reactivity_Residue1"]
1688  r2=new_xl["Reactivity_Residue2"]
1689  #combined reactivity 1-exp(-k12*Delta t),
1690  # k12=k1*k2/(k1+k2)
1691  #new_xl["Reactivity"]=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1692  if noisy:
1693  #new_xl["Score"]=uniform(-1.0,1.0)
1694  new_xl["Score"]=np.random.beta(1.0,self.beta_false)
1695  else:
1696  #new_xl["Score"]=new_xl["Reactivity"]+uniform(0.0,2.0)
1697  new_xl["Score"]=1.0-np.random.beta(1.0,self.beta_true)
1698  new_xl["TargetDistance"]=dist
1699  new_xl["NoiseProbability"]=noise
1700  new_xl["AmbiguityProbability"]=ambiguity_probability
1701  # getting if it is intra or inter rigid body
1702  (p1,p2)=IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1703  self.protein_residue_dict[(pra[1],pra[3])]])
1706  IMP.core.RigidMember(p1).get_rigid_body() ==
1707  IMP.core.RigidMember(p2).get_rigid_body()):
1708  new_xl["InterRigidBody"] = False
1709  elif (IMP.core.RigidMember.get_is_setup(p1) and
1711  IMP.core.RigidMember(p1).get_rigid_body() !=
1712  IMP.core.RigidMember(p2).get_rigid_body()):
1713  new_xl["InterRigidBody"] = True
1714  else:
1715  new_xl["InterRigidBody"] = None
1716 
1717  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1718  self.cldb._update()
1719  return self.cldb
1720 
1721 
1722  def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1723  import IMP.pmi.tools
1724  import math
1725  from random import choice,uniform
1726  if distance is None:
1727  # get a random pair
1728  while True:
1729  if self.mode=="pmi1":
1730  protein1=choice(self.representation.sequence_dict.keys())
1731  protein2=choice(self.representation.sequence_dict.keys())
1732  seq1=self.representation.sequence_dict[protein1]
1733  seq2=self.representation.sequence_dict[protein2]
1734  residue1=choice([i for i in range(1,len(seq1)+1) if seq1[i-1] in self.residue_types_1])
1735  residue2=choice([i for i in range(1,len(seq2)+1) if seq2[i-1] in self.residue_types_2])
1736  h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
1737  h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
1738  particle_distance=IMP.core.get_distance(IMP.core.XYZ(h1.get_particle()),IMP.core.XYZ(h2.get_particle()))
1739  if (protein1,residue1) != (protein2,residue2):
1740  break
1741  elif self.mode=="pmi2":
1742  (protein1,residue1)=choice(self.protein_residue_dict.keys())
1743  (protein2,residue2)=choice(self.protein_residue_dict.keys())
1744  index1=self.protein_residue_dict[(protein1,residue1)]
1745  index2=self.protein_residue_dict[(protein2,residue2)]
1746  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1747  if (protein1,residue1) != (protein2,residue2):
1748  break
1749  else:
1750  # get a pair of residues whose distance is below the threshold
1751  if not xwalk_bin_path:
1753  gcpf.set_distance(distance+max_delta_distance)
1754 
1755  while True:
1756  #setup the reaction rates lists
1757  if not self.sites_weighted:
1758  self.sites_weighted=[]
1759  for key in self.reactivity_dictionary:
1760  r=self.reactivity_dictionary[key]
1761  self.sites_weighted.append((key,r))
1762  #get a random reaction site
1763  first_site=self.weighted_choice(self.sites_weighted)
1764  #get all distances
1765  if not self.euclidean_interacting_pairs:
1766  self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1767  self.indexes_dict1.keys(),
1768  self.indexes_dict2.keys())
1769  #get the partner for the first reacted site
1770  first_site_pairs = [pair for pair in self.euclidean_interacting_pairs
1771  if self.indexes_dict1[pair[0]] == first_site or
1772  self.indexes_dict2[pair[1]] == first_site]
1773  if len(first_site_pairs)==0: continue
1774  #build the list of second reaction sites
1775  second_sites_weighted=[]
1776  for pair in first_site_pairs:
1777  if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1778  if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1779  r=self.reactivity_dictionary[second_site]
1780  second_sites_weighted.append((second_site,r))
1781  second_site=self.weighted_choice(second_sites_weighted)
1782  """
1783  interacting_pairs_weighted=[]
1784  for pair in self.euclidean_interacting_pairs:
1785  r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1786  r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1787  #combined reactivity 1-exp(-k12*Delta t),
1788  # k12=k1*k2/(k1+k2)
1789  #print(r1,r2,dist)
1790  r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1791  interacting_pairs_weighted.append((pair,r12))
1792  #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1793  #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1794  #interacting_pairs_weighted.append((pair,weight1*weight2))
1795 
1796  while True:
1797  pair=self.weighted_choice(interacting_pairs_weighted)
1798  protein1,residue1=self.indexes_dict1[pair[0]]
1799  protein2,residue2=self.indexes_dict2[pair[1]]
1800  particle_pair=IMP.get_particles(self.model,pair)
1801  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1802  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1803  break
1804  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1805  #allow some flexibility
1806  prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1807  if uniform(0.0,1.0)<prob: break
1808  """
1809  protein1,residue1=first_site
1810  protein2,residue2=second_site
1811  print("CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1812  "First site",first_site,self.reactivity_dictionary[first_site],
1813  "Second site",second_site,self.reactivity_dictionary[second_site])
1814  particle_pair=IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1815  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1816 
1817  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1818  break
1819  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1820  #allow some flexibility
1821  #prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1822  #if uniform(0.0,1.0)<prob: break
1823  if particle_distance-distance < max_delta_distance: break
1824 
1825 
1826 
1827  else:
1828  if not self.xwalk_interacting_pairs:
1829  self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1830  interacting_pairs_weighted=[]
1831 
1832  for pair in self.xwalk_interacting_pairs:
1833  protein1=pair[0]
1834  protein2=pair[1]
1835  residue1=pair[2]
1836  residue2=pair[3]
1837  weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1838  weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1839  interacting_pairs_weighted.append((pair,weight1*weight2))
1840 
1841  pair=self.weighted_choice(interacting_pairs_weighted)
1842  protein1=pair[0]
1843  protein2=pair[1]
1844  residue1=pair[2]
1845  residue2=pair[3]
1846  particle_distance=float(pair[4])
1847 
1848 
1849 
1850  return ((protein1,protein2,residue1,residue2)),particle_distance
1851 
1852  def get_xwalk_distances(self,xwalk_bin_path,distance):
1853  import IMP.pmi.output
1854  import os
1855  o=IMP.pmi.output.Output(atomistic=True)
1856  o.init_pdb("xwalk.pdb",self.representation.prot)
1857  o.write_pdb("xwalk.pdb")
1858  namechainiddict=o.dictchain["xwalk.pdb"]
1859  chainiddict={}
1860 
1861  for key in namechainiddict: chainiddict[namechainiddict[key]]=key
1862  xwalkout=os.popen('java -Xmx256m -cp ' + xwalk_bin_path +' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+' -bb').read()
1863 
1864  output_list_of_distance=[]
1865  for line in xwalkout.split("\n")[0:-2]:
1866  tockens=line.split()
1867  first=tockens[2]
1868  second=tockens[3]
1869  distance=float(tockens[6])
1870  fs=first.split("-")
1871  ss=second.split("-")
1872  chainid1=fs[2]
1873  chainid2=ss[2]
1874  protein1=chainiddict[chainid1]
1875  protein2=chainiddict[chainid2]
1876  residue1=int(fs[1])
1877  residue2=int(ss[1])
1878  output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1879  return output_list_of_distance
1880 
1881 
1882  def weighted_choice(self,choices):
1883  import random
1884  # from http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
1885  total = sum(w for c, w in choices)
1886  r = random.uniform(0, total)
1887  upto = 0
1888  for c, w in choices:
1889  if upto + w > r:
1890  return c
1891  upto += w
1892  assert False, "Shouldn't get here"
1893 
1894  def save_rmf_snapshot(self,filename,color_id=None):
1895  import IMP.rmf
1896  import RMF
1897  if color_id is None:
1898  color_id="Reactivity"
1899  sorted_ids=None
1900  sorted_group_ids=sorted(self.cldb.data_base.keys())
1901  list_of_pairs=[]
1902  color_scores=[]
1903  for group in sorted_group_ids:
1904  group_xls=[]
1905  group_dists_particles=[]
1906  for xl in self.cldb.data_base[group]:
1907  xllabel=self.cldb.get_short_cross_link_string(xl)
1908  (c1,c2,r1,r2)=(xl["molecule_object1"],xl["molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1909  try:
1910  index1=self.protein_residue_dict[(c1,r1)]
1911  index2=self.protein_residue_dict[(c2,r2)]
1912  p1,p2=IMP.get_particles(self.model,[index1])[0],IMP.get_particles(self.model,[index2])[0]
1913  mdist=xl["TargetDistance"]
1914  except TypeError:
1915  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1916  continue
1917  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1918  if group_dists_particles:
1919  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1920  color_scores.append(mincolor_score)
1921  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1922  else:
1923  continue
1924 
1925 
1926  m=self.model
1927  linear = IMP.core.Linear(0, 0.0)
1928  linear.set_slope(1.0)
1929  dps2 = IMP.core.DistancePairScore(linear)
1930  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
1931  sgs=[]
1932  offset=min(color_scores)
1933  maxvalue=max(color_scores)
1934  for pair in list_of_pairs:
1935  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
1936  pr.set_name(pair[2])
1937  factor=(pair[3]-offset)/(maxvalue-offset)
1938  c=IMP.display.get_rgb_color(factor)
1939  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1940  rslin.add_restraint(pr)
1941  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1942 
1943  rh = RMF.create_rmf_file(filename)
1944  IMP.rmf.add_hierarchies(rh, [self.system.hier])
1945  IMP.rmf.add_restraints(rh,[rslin])
1946  IMP.rmf.add_geometries(rh, sgs)
1947  IMP.rmf.save_frame(rh)
1948  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1579
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1644
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:690
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:36
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:62
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:19
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:24
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
class to link stat files to several rmf files
Definition: output.py:1088
class to allow more advanced handling of RMF files.
Definition: output.py:960
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:29
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.