IMP logo
IMP Reference Guide  2.11.1
The Integrative Modeling Platform
io/crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 from __future__ import print_function
8 import IMP
9 import IMP.pmi
10 import IMP.pmi.output
11 import IMP.atom
12 import IMP.core
13 import IMP.algebra
14 import IMP.rmf
15 import RMF
16 import IMP.display
17 import operator
18 import math
19 import sys
20 import ihm.location
21 import ihm.dataset
22 from collections import defaultdict
23 
24 # json default serializations
25 def set_json_default(obj):
26  if isinstance(obj, set):
27  return list(obj)
28  if isinstance(obj, IMP.pmi.topology.Molecule):
29  return str(obj)
30  raise TypeError
31 
32 # Handle and return data that must be a string
33 if sys.version_info[0] >= 3:
34  def _handle_string_input(inp):
35  if not isinstance(inp, str):
36  raise TypeError("expecting a string")
37  return inp
38 else:
39  def _handle_string_input(inp):
40  if not isinstance(inp, (str, unicode)):
41  raise TypeError("expecting a string or unicode")
42  # Coerce to non-unicode representation (str)
43  if isinstance(inp, unicode):
44  return str(inp)
45  else:
46  return inp
47 
48 class _CrossLinkDataBaseStandardKeys(object):
49  '''
50  This class setup all the standard keys needed to
51  identify the crosslink features from the data sets
52  '''
53  def __init__(self):
54  self.type={}
55  self.protein1_key="Protein1"
56  self.type[self.protein1_key]=str
57  self.protein2_key="Protein2"
58  self.type[self.protein2_key]=str
59  self.residue1_key="Residue1"
60  self.type[self.residue1_key]=int
61  self.residue2_key="Residue2"
62  self.type[self.residue2_key]=int
63  self.residue1_amino_acid_key="Residue1AminoAcid"
64  self.type[self.residue1_amino_acid_key]=str
65  self.residue2_amino_acid_key="Residue2AminoAcid"
66  self.type[self.residue2_amino_acid_key]=str
67  self.residue1_moiety_key="Residue1Moiety"
68  self.type[self.residue1_moiety_key]=str
69  self.residue2_moiety_key="Residue2Moiety"
70  self.type[self.residue2_moiety_key]=str
71  self.site_pairs_key="SitePairs"
72  self.type[self.site_pairs_key]=str
73  self.unique_id_key="XLUniqueID"
74  self.type[self.unique_id_key]=str
75  self.unique_sub_index_key="XLUniqueSubIndex"
76  self.type[self.unique_sub_index_key]=int
77  self.unique_sub_id_key="XLUniqueSubID"
78  self.type[self.unique_sub_id_key]=str
79  self.data_set_name_key="DataSetName"
80  self.type[self.data_set_name_key]=str
81  self.cross_linker_chemical_key="CrossLinkerChemical"
82  self.type[self.cross_linker_chemical_key]=str
83  self.id_score_key="IDScore"
84  self.type[self.id_score_key]=float
85  self.fdr_key="FDR"
86  self.type[self.fdr_key]=float
87  self.quantitation_key="Quantitation"
88  self.type[self.quantitation_key]=float
89  self.redundancy_key="Redundancy"
90  self.type[self.redundancy_key]=int
91  self.redundancy_list_key="RedundancyList"
92  self.type[self.redundancy_key]=list
93  self.ambiguity_key="Ambiguity"
94  self.type[self.ambiguity_key]=int
95  self.residue1_links_number_key="Residue1LinksNumber"
96  self.type[self.residue1_links_number_key]=int
97  self.residue2_links_number_key="Residue2LinksNumber"
98  self.type[self.residue2_links_number_key]=int
99  self.type[self.ambiguity_key]=int
100  self.state_key="State"
101  self.type[self.state_key]=int
102  self.sigma1_key="Sigma1"
103  self.type[self.sigma1_key]=str
104  self.sigma2_key="Sigma2"
105  self.type[self.sigma2_key]=str
106  self.psi_key="Psi"
107  self.type[self.psi_key]=str
108  self.distance_key="Distance"
109  self.type[self.distance_key]=float
110  self.min_ambiguous_distance_key="MinAmbiguousDistance"
111  self.type[self.distance_key]=float
112  #link types are either Monolink, Intralink or Interlink
113  self.link_type_key="LinkType"
114  self.type[self.link_type_key]=str
115 
116  self.ordered_key_list =[self.data_set_name_key,
117  self.unique_id_key,
118  self.unique_sub_index_key,
119  self.unique_sub_id_key,
120  self.protein1_key,
121  self.protein2_key,
122  self.residue1_key,
123  self.residue2_key,
124  self.residue1_amino_acid_key,
125  self.residue2_amino_acid_key,
126  self.residue1_moiety_key,
127  self.residue2_moiety_key,
128  self.cross_linker_chemical_key,
129  self.id_score_key,
130  self.fdr_key,
131  self.quantitation_key,
132  self.redundancy_key,
133  self.redundancy_list_key,
134  self.state_key,
135  self.sigma1_key,
136  self.sigma2_key,
137  self.psi_key,
138  self.distance_key,
139  self.min_ambiguous_distance_key,
140  self.link_type_key]
141 
142 
143 class _ProteinsResiduesArray(tuple):
144  '''
145  This class is inherits from tuple, and it is a shorthand for a cross-link
146  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
147  residue1 and residue2.
148  '''
149 
150  def __new__(self,input_data):
151  '''
152  @input_data can be a dict or a tuple
153  '''
154  self.cldbsk=_CrossLinkDataBaseStandardKeys()
155  if type(input_data) is dict:
156  monolink=False
157  p1=input_data[self.cldbsk.protein1_key]
158  try:
159  p2=input_data[self.cldbsk.protein2_key]
160  except KeyError:
161  monolink=True
162  r1=input_data[self.cldbsk.residue1_key]
163  try:
164  r2=input_data[self.cldbsk.residue2_key]
165  except KeyError:
166  monolink=True
167  if not monolink:
168  t=(p1,p2,r1,r2)
169  else:
170  t=(p1,"",r1,None)
171  elif type(input_data) is tuple:
172  if len(input_data)>4 or len(input_data)==3 or len(input_data)==1:
173  raise TypeError("_ProteinsResiduesArray: must have only 4 elements")
174  if len(input_data)==4:
175  p1 = _handle_string_input(input_data[0])
176  p2 = _handle_string_input(input_data[1])
177  r1=input_data[2]
178  r2=input_data[3]
179  if (not (type(r1) is int)) and (not (r1 is None)):
180  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
181  if (not (type(r2) is int)) and (not (r1 is None)):
182  raise TypeError("_ProteinsResiduesArray: residue2 must be a integer")
183  t=(p1,p2,r1,r2)
184  if len(input_data) == 2:
185  p1 = _handle_string_input(input_data[0])
186  r1 = input_data[1]
187  if type(r1) is not int:
188  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
189  t = (p1,"",r1,None)
190  else:
191  raise TypeError("_ProteinsResiduesArray: input must be a dict or tuple")
192  return tuple.__new__(_ProteinsResiduesArray, t)
193 
194  def get_inverted(self):
195  '''
196  Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
197  '''
198  return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
199 
200  def __repr__(self):
201  outstr=self.cldbsk.protein1_key+" "+str(self[0])
202  outstr+=" "+self.cldbsk.protein2_key+" "+str(self[1])
203  outstr+=" "+self.cldbsk.residue1_key+" "+str(self[2])
204  outstr+=" "+self.cldbsk.residue2_key+" "+str(self[3])
205  return outstr
206 
207  def __str__(self):
208  outstr=str(self[0])+"."+str(self[2])+"-"+str(self[1])+"."+str(self[3])
209  return outstr
210 
211 class FilterOperator(object):
212  '''
213  This class allows to create filter functions that can be passed to the CrossLinkDataBase
214  in this way:
215 
216  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
217 
218  where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
219 
220  A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
221 
222  fo.evaluate(xl)
223 
224  and it is used to filter the database
225  '''
226 
227  def __init__(self, argument1, operator, argument2):
228  '''
229  (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
230  or (FilterOperator1,operator.or|and...,FilterOperator2)
231  '''
232  if isinstance(argument1, FilterOperator):
233  self.operations = [argument1, operator, argument2]
234  else:
235  self.operations = []
236  self.values = (argument1, operator, argument2)
237 
238  def __or__(self, FilterOperator2):
239  return FilterOperator(self, operator.or_, FilterOperator2)
240 
241  def __and__(self, FilterOperator2):
242  return FilterOperator(self, operator.and_, FilterOperator2)
243 
244  def __invert__(self):
245  return FilterOperator(self, operator.not_, None)
246 
247  def evaluate(self, xl_item):
248 
249  if len(self.operations) == 0:
250  keyword, operator, value = self.values
251  return operator(xl_item[keyword], value)
252  FilterOperator1, op, FilterOperator2 = self.operations
253 
254  if FilterOperator2 is None:
255  return op(FilterOperator1.evaluate(xl_item))
256  else:
257  return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
258 
259 '''
260 def filter_factory(xl_):
261 
262  class FilterOperator(object):
263  import operator
264  xl = xl_
265 
266  def __new__(self,key,value,oper=operator.eq):
267  return oper(self.xl[key],value)
268 
269  return FilterOperator
270 '''
271 
272 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
273  '''
274  This class is needed to convert the keywords from a generic database
275  to the standard ones
276  '''
277 
278  def __init__(self, list_parser=None):
279  '''
280  @param list_parser an instance of ResiduePairListParser, if any is needed
281  '''
282  self.converter={}
283  self.backward_converter={}
284  _CrossLinkDataBaseStandardKeys.__init__(self)
285  self.rplp = list_parser
286  if self.rplp is None:
287  # either you have protein1, protein2, residue1, residue2
288  self.compulsory_keys=set([self.protein1_key,
289  self.protein2_key,
290  self.residue1_key,
291  self.residue2_key])
292  else:
293  self.compulsory_keys=self.rplp.get_compulsory_keys()
294  self.setup_keys=set()
295 
296  def check_keys(self):
297  '''
298  Is a function that check whether necessary keys are setup
299  '''
300  setup_keys=set(self.get_setup_keys())
301  if self.compulsory_keys & setup_keys != self.compulsory_keys:
302  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
303 
304  def get_setup_keys(self):
305  '''
306  Returns the keys that have been setup so far
307  '''
308  return self.backward_converter.keys()
309 
310  def set_standard_keys(self):
311  """
312  This sets up the standard compulsory keys as defined by
313  _CrossLinkDataBaseStandardKeys
314  """
315  for ck in self.compulsory_keys:
316  self.converter[ck]=ck
317  self.backward_converter[ck]=ck
318 
319  def set_unique_id_key(self,origin_key):
320  self.converter[origin_key]=self.unique_id_key
321  self.backward_converter[self.unique_id_key]=origin_key
322 
323  def set_protein1_key(self,origin_key):
324  self.converter[origin_key]=self.protein1_key
325  self.backward_converter[self.protein1_key]=origin_key
326 
327  def set_protein2_key(self,origin_key):
328  self.converter[origin_key]=self.protein2_key
329  self.backward_converter[self.protein2_key]=origin_key
330 
331  def set_residue1_key(self,origin_key):
332  self.converter[origin_key]=self.residue1_key
333  self.backward_converter[self.residue1_key]=origin_key
334 
335  def set_residue2_key(self,origin_key):
336  self.converter[origin_key]=self.residue2_key
337  self.backward_converter[self.residue2_key]=origin_key
338 
339  def set_residue1_amino_acid_key(self, origin_key):
340  self.converter[origin_key] = self.residue1_amino_acid_key
341  self.backward_converter[self.residue1_amino_acid_key] = origin_key
342 
343  def set_residue2_amino_acid_key(self, origin_key):
344  self.converter[origin_key] = self.residue2_amino_acid_key
345  self.backward_converter[self.residue2_amino_acid_key] = origin_key
346 
347  def set_residue1_moiety_key(self, origin_key):
348  self.converter[origin_key] = self.residue1_moiety_key
349  self.backward_converter[self.residue1_moiety_key] = origin_key
350 
351  def set_residue2_moiety_key(self, origin_key):
352  self.converter[origin_key] = self.residue2_moiety_key
353  self.backward_converter[self.residue2_moiety_key] = origin_key
354 
355  def set_site_pairs_key(self,origin_key):
356  self.converter[origin_key]=self.site_pairs_key
357  self.backward_converter[self.site_pairs_key]=origin_key
358 
359  def set_id_score_key(self,origin_key):
360  self.converter[origin_key]=self.id_score_key
361  self.backward_converter[self.id_score_key]=origin_key
362 
363  def set_fdr_key(self,origin_key):
364  self.converter[origin_key]=self.fdr_key
365  self.backward_converter[self.fdr_key]=origin_key
366 
367  def set_quantitation_key(self,origin_key):
368  self.converter[origin_key]=self.quantitation_key
369  self.backward_converter[self.quantitation_key]=origin_key
370 
371  def set_psi_key(self,origin_key):
372  self.converter[origin_key]=self.psi_key
373  self.backward_converter[self.psi_key]=origin_key
374 
375  def set_link_type_key(self,link_type_key):
376  self.converter[link_type_key]=self.link_type_key
377  self.backward_converter[self.link_type_key]=link_type_key
378 
379  def get_converter(self):
380  '''
381  Returns the dictionary that convert the old keywords to the new ones
382  '''
383  self.check_keys()
384  return self.converter
385 
387  '''
388  Returns the dictionary that convert the new keywords to the old ones
389  '''
390  self.check_keys()
391  return self.backward_converter
392 
393 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
394  '''
395  A class to handle different styles of site pairs parsers.
396  Implemented styles:
397  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
398  [Y3-;K4-] for dead-ends
399  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
400  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
401  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
402  '''
403 
404  import re
405 
406  def __init__(self,style):
407 
408  _CrossLinkDataBaseStandardKeys.__init__(self)
409  if style is "MSSTUDIO":
410  self.style=style
411  self.compulsory_keys= set([self.protein1_key,
412  self.protein2_key,
413  self.site_pairs_key])
414  elif style is "XTRACT" or style is "QUANTITATION":
415  self.style=style
416  self.compulsory_keys= set([self.site_pairs_key])
417  elif style is "LAN_HUANG":
418  self.style=style
419  self.compulsory_keys= set([self.site_pairs_key])
420  else:
421  raise Error("ResiduePairListParser: unknown list parser style")
422 
423  def get_compulsory_keys(self):
424  return self.compulsory_keys
425 
426  def get_list(self,input_string):
427  '''
428  This function returns a list of cross-linked residues and the corresponding list of
429  cross-linked chains. The latter list can be empty, if the style doesn't have the
430  corresponding information.
431  '''
432  if self.style == "MSSTUDIO":
433  input_string=input_string.replace("[","")
434  input_string=input_string.replace("]","")
435  input_string_pairs=input_string.split(";")
436  residue_pair_indexes=[]
437  chain_pair_indexes=[]
438  for s in input_string_pairs:
439  m1=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
440  m2=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
441  if m1:
442  # cross link
443  residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
444  residue_pair_indexes.append((residue_index_1,residue_index_2))
445  elif m2:
446  # dead end
447  residue_type_1,residue_index_1=m2.group(1,2)
448  # at this stage chain_pair_indexes is empty
449  return residue_pair_indexes,chain_pair_indexes
450  if self.style is "XTRACT" or self.style is "QUANTITATION":
451  if ":x:" in input_string:
452  # if it is a crosslink....
453  input_strings=input_string.split(":x:")
454  first_peptides=input_strings[0].split(":|:")
455  second_peptides=input_strings[1].split(":|:")
456  first_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in first_peptides]
457  second_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in second_peptides]
458  residue_pair_indexes=[]
459  chain_pair_indexes=[]
460  for fpi in first_peptides_indentifiers:
461  for spi in second_peptides_indentifiers:
462  chain1=fpi[0]
463  chain2=spi[0]
464  residue1=fpi[1]
465  residue2=spi[1]
466  residue_pair_indexes.append((residue1,residue2))
467  chain_pair_indexes.append((chain1,chain2))
468  return residue_pair_indexes, chain_pair_indexes
469  else:
470  # if it is a monolink....
471  first_peptides = input_string.split(":|:")
472  first_peptides_indentifiers = [(f.split(":")[0], f.split(":")[1]) for f in first_peptides]
473  residue_indexes = []
474  chain_indexes = []
475  for fpi in first_peptides_indentifiers:
476  chain1=fpi[0]
477  residue1=fpi[1]
478  residue_indexes.append((residue1,))
479  chain_indexes.append((chain1,))
480  return residue_indexes, chain_indexes
481  if self.style is "LAN_HUANG":
482  input_strings=input_string.split("-")
483  chain1,first_series=input_strings[0].split(":")
484  chain2,second_series=input_strings[1].split(":")
485 
486  first_residues=first_series.replace(";","|").split("|")
487  second_residues=second_series.replace(";","|").split("|")
488  residue_pair_indexes=[]
489  chain_pair_indexes=[]
490  for fpi in first_residues:
491  for spi in second_residues:
492  residue1=self.re.sub("[^0-9]", "", fpi)
493  residue2=self.re.sub("[^0-9]", "", spi)
494  residue_pair_indexes.append((residue1,residue2))
495  chain_pair_indexes.append((chain1,chain2))
496  return residue_pair_indexes, chain_pair_indexes
497 
498 
499 
500 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
501  '''
502  A class to handle different XL format with fixed format
503  currently support ProXL
504  '''
505  def __init__(self,format):
506 
507  _CrossLinkDataBaseStandardKeys.__init__(self)
508  if format is "PROXL":
509  self.format=format
510  else:
511  raise Error("FixedFormatParser: unknown list format name")
512 
513 
514  def get_data(self,input_string):
515  if self.format is "PROXL":
516  tockens=input_string.split("\t")
517  xl={}
518  if tockens[0]=="SEARCH ID(S)":
519  return None
520  else:
521  xl[self.protein1_key]=tockens[2]
522  xl[self.protein2_key]=tockens[4]
523  xl[self.residue1_key]=int(tockens[3])
524  xl[self.residue2_key]=int(tockens[5])
525  return xl
526 
527 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
528  '''
529  this class handles a cross-link dataset and do filtering
530  operations, adding cross-links, merge datasets...
531  '''
532 
533  def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=('K',)):
534  '''
535  Constructor.
536  @param converter an instance of CrossLinkDataBaseKeywordsConverter
537  @param data_base an instance of CrossLinkDataBase to build the new database on
538  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
539  crosslink consistency. If not given consistency will not be checked
540  @param linkable_aa a tuple containing one-letter amino acids which are linkable by the crosslinker;
541  only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
542  and if a fasta_seq is given
543  '''
544 
545  if data_base is None:
546  self.data_base = {}
547  else:
548  self.data_base=data_base
549 
550  _CrossLinkDataBaseStandardKeys.__init__(self)
551  if converter is not None:
552  self.cldbkc = converter #type: CrossLinkDataBaseKeywordsConverter
553  self.list_parser=self.cldbkc.rplp
554  self.converter = converter.get_converter()
555 
556  else:
557  self.cldbkc = None #type: CrossLinkDataBaseKeywordsConverter
558  self.list_parser=None
559  self.converter = None
560 
561  # default amino acids considered to be 'linkable' if none are given
562  self.def_aa_tuple = linkable_aa
563  self.fasta_seq = fasta_seq #type: IMP.pmi.topology.Sequences
564  self.dataset = None
565  self._update()
566 
567  def _update(self):
568  '''
569  Update the whole dataset after changes
570  '''
571  self.update_cross_link_unique_sub_index()
572  self.update_cross_link_redundancy()
573  self.update_residues_links_number()
575 
576 
577  def __iter__(self):
578  sorted_ids=sorted(self.data_base.keys())
579  for k in sorted_ids:
580  for xl in self.data_base[k]:
581  yield xl
582 
583  def xlid_iterator(self):
584  sorted_ids=sorted(self.data_base.keys())
585  for xlid in sorted_ids:
586  yield xlid
587 
588  def __getitem__(self,xlid):
589  return self.data_base[xlid]
590 
591  def __len__(self):
592  return len([xl for xl in self])
593 
594  def get_name(self):
595  return self.name
596 
597  def set_name(self,name):
598  new_data_base={}
599  for k in self.data_base:
600  new_data_base[k+"."+name]=self.data_base[k]
601  self.data_base=new_data_base
602  self.name=name
603  self._update()
604 
605  def get_number_of_xlid(self):
606  return len(self.data_base)
607 
608 
609  def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
610  '''
611  if FixedFormatParser is not specified, the file is comma-separated-values
612  @param file_name a txt file to be parsed
613  @param converter an instance of CrossLinkDataBaseKeywordsConverter
614  @param FixedFormatParser a parser for a fixed format
615  '''
616  if not FixedFormatParser:
617  xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
618 
619  if converter is not None:
620  self.cldbkc = converter
621  self.list_parser=self.cldbkc.rplp
622  self.converter = converter.get_converter()
623 
624 
625  if not self.list_parser:
626  # normal procedure without a list_parser
627  # each line is a cross-link
628  new_xl_dict={}
629  for nxl,xl in enumerate(xl_list):
630  new_xl={}
631  for k in xl:
632  if k in self.converter:
633  new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
634  else:
635  new_xl[k]=xl[k]
636  if self.unique_id_key in self.cldbkc.get_setup_keys():
637  if new_xl[self.unique_id_key] not in new_xl_dict:
638  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
639  else:
640  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
641  else:
642  if str(nxl) not in new_xl_dict:
643  new_xl_dict[str(nxl)]=[new_xl]
644  else:
645  new_xl_dict[str(nxl)].append(new_xl)
646 
647  else:
648  # with a list_parser, a line can be a list of ambiguous crosslinks
649  new_xl_dict={}
650  for nxl,entry in enumerate(xl_list):
651 
652  # first get the translated keywords
653  new_dict={}
654  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
655  raise Error("CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
656  for k in entry:
657  if k in self.converter:
658  new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
659  else:
660  new_dict[k]=entry[k]
661 
662  residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
663 
664  # then create the crosslinks
665  for n,p in enumerate(residue_pair_list):
666  is_monolink=False
667  if len(p)==1:
668  is_monolink=True
669 
670  new_xl={}
671  for k in new_dict:
672  new_xl[k]=new_dict[k]
673  new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
674  if not is_monolink:
675  new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
676 
677  if len(chain_pair_list)==len(residue_pair_list):
678  new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
679  if not is_monolink:
680  new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
681 
682  if not is_monolink:
683  new_xl[self.link_type_key]="CROSSLINK"
684  else:
685  new_xl[self.link_type_key]="MONOLINK"
686 
687  if self.unique_id_key in self.cldbkc.get_setup_keys():
688  if new_xl[self.unique_id_key] not in new_xl_dict:
689  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
690  else:
691  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
692  else:
693  if str(nxl) not in new_xl_dict:
694  new_xl[self.unique_id_key]=str(nxl+1)
695  new_xl_dict[str(nxl)]=[new_xl]
696  else:
697  new_xl[self.unique_id_key]=str(nxl+1)
698  new_xl_dict[str(nxl)].append(new_xl)
699 
700 
701  else:
702  '''
703  if FixedFormatParser is defined
704  '''
705 
706  new_xl_dict={}
707  f=open(file_name,"r")
708  nxl=0
709  for line in f:
710  xl=FixedFormatParser.get_data(line)
711  if xl:
712  xl[self.unique_id_key]=str(nxl+1)
713  new_xl_dict[str(nxl)]=[xl]
714  nxl+=1
715 
716 
717  self.data_base=new_xl_dict
718  self.name=file_name
719  l = ihm.location.InputFileLocation(file_name, details='Crosslinks')
720  self.dataset = ihm.dataset.CXMSDataset(l)
721  self._update()
722 
723  def update_cross_link_unique_sub_index(self):
724  for k in self.data_base:
725  for n,xl in enumerate(self.data_base[k]):
726  xl[self.ambiguity_key]=len(self.data_base[k])
727  xl[self.unique_sub_index_key]=n+1
728  xl[self.unique_sub_id_key]=k+"."+str(n+1)
729 
730  def update_cross_link_redundancy(self):
731  redundancy_data_base={}
732  for xl in self:
733  pra=_ProteinsResiduesArray(xl)
734  if pra not in redundancy_data_base:
735  redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
736  redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
737  else:
738  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
739  redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
740  for xl in self:
741  pra=_ProteinsResiduesArray(xl)
742  xl[self.redundancy_key]=len(redundancy_data_base[pra])
743  xl[self.redundancy_list_key]=redundancy_data_base[pra]
744 
745  def update_residues_links_number(self):
746  residue_links={}
747  for xl in self:
748  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
749  if (p1,r1) not in residue_links:
750  residue_links[(p1,r1)]=set([(p2,r2)])
751  else:
752  residue_links[(p1,r1)].add((p2,r2))
753  if (p2,r2) not in residue_links:
754  residue_links[(p2,r2)]=set([(p1,r1)])
755  else:
756  residue_links[(p2,r2)].add((p1,r1))
757 
758  for xl in self:
759  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
760  xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
761  xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
762 
764  """This function checks the consistency of the dataset with the amino acid sequence"""
765  if self.cldbkc and self.fasta_seq:
766  cnt_matched, cnt_matched_file = 0, 0
767  matched = {}
768  non_matched = {}
769  for xl in self:
770  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
771  b_matched_file = False
772  if self.residue1_amino_acid_key in xl:
773  # either you know the residue type and aa_tuple is a single entry
774  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
775  b_matched = self._match_xlinks(p1, r1, aa_from_file)
776  b_matched_file = b_matched
777  else:
778  # or pass the possible list of types that can be crosslinked
779  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
780 
781  matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
782 
783  if self.residue2_amino_acid_key in xl:
784  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
785  b_matched = self._match_xlinks(p2, r2, aa_from_file)
786  b_matched_file = b_matched
787  else:
788  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
789 
790  matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
791  if b_matched: cnt_matched += 1
792  if b_matched_file: cnt_matched_file += 1
793  if len(self) > 0:
794  percentage_matched = round(100*cnt_matched/len(self),1)
795  percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
796  #if matched: print "Matched xlinks:", matched
797  if matched or non_matched: print("check_cross_link_consistency: Out of %d crosslinks "
798  "%d were matched to the fasta sequence (%f %%).\n "
799  "%d were matched by using the crosslink file (%f %%)."%
800  (len(self),cnt_matched,percentage_matched,cnt_matched_file,
801  percentage_matched_file) )
802  if non_matched: print("check_cross_link_consistency: Warning: Non matched xlinks:",
803  [(prot_name, sorted(list(non_matched[prot_name]))) for prot_name in non_matched])
804  return matched,non_matched
805 
806  def _match_xlinks(self, prot_name, res_index, aa_tuple):
807  # returns Boolean whether given aa matches a position in the fasta file
808  # cross link files usually start counting at 1 and not 0; therefore subtract -1 to compare with fasta
809  amino_dict = IMP.pmi.tools.ThreeToOneConverter()
810  res_index -= 1
811  for amino_acid in aa_tuple:
812  if len(amino_acid) == 3:
813  amino_acid = amino_dict[amino_acid.upper()]
814  if prot_name in self.fasta_seq.sequences:
815  seq = self.fasta_seq.sequences[prot_name]
816  # if we are looking at the first amino acid in a given sequence always return a match
817  # the first aa should always be the n-terminal aa
818  # which may form a crosslink in any case (for BS3-like crosslinkers)
819  # for some data sets the first aa is at position 1; todo: check why this is the case
820  if res_index == 0 or res_index == 1:
821  return True
822  if res_index < len(seq):
823  if amino_acid == seq[res_index]:
824  return True
825  # else:
826  # print "Could not match", prot, res+1, amino_acid, seq[res]
827  return False
828 
829  def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
830  if b_matched:
831  if prot in matched:
832  matched[prot].add(res)
833  else:
834  matched[prot] = set([res])
835  else:
836  if prot in non_matched:
837  non_matched[prot].add(res)
838  else:
839  non_matched[prot] = set([res])
840  return matched, non_matched
841 
842 
843  def get_cross_link_string(self,xl):
844  string='|'
845  for k in self.ordered_key_list:
846  try:
847  string+=str(k)+":"+str(xl[k])+"|"
848  except KeyError:
849  continue
850 
851  for k in xl:
852  if k not in self.ordered_key_list:
853  string+=str(k)+":"+str(xl[k])+"|"
854 
855  return string
856 
857  def get_short_cross_link_string(self,xl):
858 
859  string='|'
860  list_of_keys=[self.data_set_name_key,
861  self.unique_sub_id_key,
862  self.protein1_key,
863  self.residue1_key,
864  self.protein2_key,
865  self.residue2_key,
866  self.state_key,
867  self.psi_key]
868 
869  for k in list_of_keys:
870  try:
871  string+=str(xl[k])+"|"
872  except KeyError:
873  continue
874 
875  return string
876 
877  def filter(self,FilterOperator):
878  new_xl_dict={}
879  for id in self.data_base.keys():
880  for xl in self.data_base[id]:
881  if FilterOperator.evaluate(xl):
882  if id not in new_xl_dict:
883  new_xl_dict[id]=[xl]
884  else:
885  new_xl_dict[id].append(xl)
886  self._update()
887  cdb = CrossLinkDataBase(self.cldbkc,new_xl_dict)
888  cdb.dataset = self.dataset
889  return cdb
890 
891  def filter_score(self,score):
892  '''Get all crosslinks with score greater than an input value'''
893  FilterOperator=IMP.pmi.io.crosslink.FilterOperator(self.id_score_key,operator.gt,score)
894  return self.filter(FilterOperator)
895 
896  def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
897  '''
898  This function merges two cross-link datasets so that if two conflicting crosslinks have the same
899  cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
900  with different SubIDs
901  '''
902  pass
903 
904  def append_database(self,CrossLinkDataBase2):
905  '''
906  This function append one cross-link dataset to another. Unique ids will be renamed to avoid
907  conflicts.
908  '''
909  name1=self.get_name()
910  name2=CrossLinkDataBase2.get_name()
911  if name1 == name2:
912  name1=id(self)
913  name2=id(CrossLinkDataBase2)
914 
915  #rename first database:
916  new_data_base={}
917  for k in self.data_base:
918  new_data_base[k]=self.data_base[k]
919  for k in CrossLinkDataBase2.data_base:
920  new_data_base[k]=CrossLinkDataBase2.data_base[k]
921  self.data_base=new_data_base
922  self._update()
923 
924  def set_value(self,key,new_value,FilterOperator=None):
925  '''
926  This function changes the value for a given key in the database
927  For instance one can change the name of a protein
928  @param key: the key in the database that must be changed
929  @param new_value: the new value of the key
930  @param FilterOperator: optional FilterOperator to change the value to
931  a subset of the database
932 
933  example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
934  '''
935 
936  for xl in self:
937  if FilterOperator is not None:
938  if FilterOperator.evaluate(xl):
939  xl[key]=new_value
940  else:
941  xl[key]=new_value
942  self._update()
943 
944  def get_values(self,key):
945  '''
946  this function returns the list of values for a given key in the database
947  alphanumerically sorted
948  '''
949  values=set()
950  for xl in self:
951  values.add(xl[key])
952  return sorted(list(values))
953 
954  def offset_residue_index(self,protein_name,offset):
955  '''
956  This function offset the residue indexes of a given protein by a specified value
957  @param protein_name: the protein name that need to be changed
958  @param offset: the offset value
959  '''
960 
961  for xl in self:
962  if xl[self.protein1_key] == protein_name:
963  xl[self.residue1_key]=xl[self.residue1_key]+offset
964  if xl[self.protein2_key] == protein_name:
965  xl[self.residue2_key]=xl[self.residue2_key]+offset
966  self._update()
967 
968  def create_new_keyword(self,keyword,values_from_keyword=None):
969  '''
970  This function creates a new keyword for the whole database and set the values from
971  and existing keyword (optional), otherwise the values are set to None
972  @param keyword the new keyword name:
973  @param values_from_keyword the keyword from which we are copying the values:
974  '''
975  for xl in self:
976  if values_from_keyword is not None:
977  xl[keyword] = xl[values_from_keyword]
978  else:
979  xl[keyword] = None
980  self._update()
981 
982  def rename_proteins(self,old_to_new_names_dictionary, protein_to_rename="both"):
983  '''
984  This function renames all proteins contained in the input dictionary
985  from the old names (keys) to the new name (values)
986  @param old_to_new_names_dictionary dictionary for converting old to new names
987  @param protein_to_rename specify whether to rename both or protein1 or protein2 only
988  '''
989 
990  for old_name in old_to_new_names_dictionary:
991  new_name=old_to_new_names_dictionary[old_name]
992  if protein_to_rename == "both" or protein_to_rename == "protein1":
993  fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
994  self.set_value(self.protein1_key,new_name,fo2)
995  if protein_to_rename == "both" or protein_to_rename == "protein2":
996  fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
997  self.set_value(self.protein2_key,new_name,fo2)
998 
999  def clone_protein(self,protein_name,new_protein_name):
1000  new_xl_dict={}
1001  for id in self.data_base.keys():
1002  new_data_base=[]
1003  for xl in self.data_base[id]:
1004  new_data_base.append(xl)
1005  if xl[self.protein1_key]==protein_name and xl[self.protein2_key]!=protein_name:
1006  new_xl=dict(xl)
1007  new_xl[self.protein1_key]=new_protein_name
1008  new_data_base.append(new_xl)
1009  elif xl[self.protein1_key]!=protein_name and xl[self.protein2_key]==protein_name:
1010  new_xl=dict(xl)
1011  new_xl[self.protein2_key]=new_protein_name
1012  new_data_base.append(new_xl)
1013  elif xl[self.protein1_key]==protein_name and xl[self.protein2_key]==protein_name:
1014  new_xl=dict(xl)
1015  new_xl[self.protein1_key]=new_protein_name
1016  new_data_base.append(new_xl)
1017  new_xl=dict(xl)
1018  new_xl[self.protein2_key]=new_protein_name
1019  new_data_base.append(new_xl)
1020  new_xl=dict(xl)
1021  new_xl[self.protein1_key]=new_protein_name
1022  new_xl[self.protein2_key]=new_protein_name
1023  new_data_base.append(new_xl)
1024  self.data_base[id]=new_data_base
1025  self._update()
1026 
1028  '''
1029  This function remove cross-links applied to the same residue
1030  (ie, same chain name and residue number)
1031  '''
1032  new_xl_dict={}
1033  for id in self.data_base.keys():
1034  new_data_base=[]
1035  for xl in self.data_base[id]:
1036  if xl[self.protein1_key]==xl[self.protein2_key] and xl[self.residue1_key]==xl[self.residue2_key]:
1037  continue
1038  else:
1039  new_data_base.append(xl)
1040  self.data_base[id]=new_data_base
1041  self._update()
1042 
1043 
1044  def jackknife(self,percentage):
1045  '''
1046  this method returns a CrossLinkDataBase class containing
1047  a random subsample of the original cross-link database.
1048  @param percentage float between 0 and 1, is the percentage of
1049  of spectra taken from the original list
1050  '''
1051  import random
1052  if percentage > 1.0 or percentage < 0.0:
1053  raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
1054  nspectra=self.get_number_of_xlid()
1055  nrandom_spectra=int(nspectra*percentage)
1056  random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1057  new_data_base={}
1058  for k in random_keys:
1059  new_data_base[k]=self.data_base[k]
1060  return CrossLinkDataBase(self.cldbkc,new_data_base)
1061 
1062  def __str__(self):
1063  outstr=''
1064  sorted_ids=sorted(self.data_base.keys())
1065 
1066  for id in sorted_ids:
1067  outstr+=id+"\n"
1068  for xl in self.data_base[id]:
1069  for k in self.ordered_key_list:
1070  try:
1071  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1072  except KeyError:
1073  continue
1074 
1075  for k in xl:
1076  if k not in self.ordered_key_list:
1077  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1078  outstr+="-------------\n"
1079  return outstr
1080 
1081 
1082  def plot(self,filename,**kwargs):
1083  import matplotlib
1084  matplotlib.use('Agg')
1085  import matplotlib.pyplot as plt
1086  import matplotlib.colors
1087 
1088 
1089 
1090  if kwargs["type"] == "scatter":
1091  cmap=plt.get_cmap("rainbow")
1092  norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1093  xkey=kwargs["xkey"]
1094  ykey=kwargs["ykey"]
1095  if "colorkey" in kwargs:
1096  colorkey=kwargs["colorkey"]
1097  if "sizekey" in kwargs:
1098  sizekey=kwargs["sizekey"]
1099  if "logyscale" in kwargs:
1100  logyscale=kwargs["logyscale"]
1101  else:
1102  logyscale=False
1103  xs=[]
1104  ys=[]
1105  colors=[]
1106  for xl in self:
1107  try:
1108  xs.append(float(xl[xkey]))
1109  if logyscale:
1110  ys.append(math.log10(float(xl[ykey])))
1111  else:
1112  ys.append(float(xl[ykey]))
1113  colors.append(float(xl[colorkey]))
1114  except ValueError:
1115  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1116  continue
1117 
1118  cs=[]
1119  for color in colors:
1120  cindex=(color-min(colors))/(max(colors)-min(colors))
1121  cs.append(cmap(cindex))
1122 
1123  fig = plt.figure()
1124  ax = fig.add_subplot(111)
1125  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker="o")
1126  ax.set_xlabel(xkey)
1127  ax.set_ylabel(ykey)
1128  plt.savefig(filename)
1129  plt.show()
1130  plt.close()
1131 
1132  if kwargs["type"] == "residue_links":
1133  #plot the number of distinct links to a residue
1134  #in an histogram
1135  #molecule name
1136  molecule=kwargs["molecule"]
1137  if type(molecule) is IMP.pmi.topology.Molecule:
1138  length=len(molecule.sequence)
1139  molecule=molecule.get_name()
1140  else:
1141  #you need a IMP.pmi.topology.Sequences object
1142  sequences_object=kwargs["sequences_object"]
1143  sequence=sequences_object.sequences[molecule]
1144  length=len(sequence)
1145 
1146  histogram=[0]*length
1147  for xl in self:
1148  if xl[self.protein1_key]==molecule:
1149  histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1150  if xl[self.protein2_key]==molecule:
1151  histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1152  rects = plt.bar(range(1,length+1), histogram)
1153  #bar_width,
1154  #alpha=opacity,
1155  #color='b',
1156  #yerr=std_men,
1157  #error_kw=error_config,
1158  #label='Men')
1159  plt.savefig(filename)
1160  plt.show()
1161  plt.close()
1162 
1163 
1164 
1165  if kwargs["type"] == "histogram":
1166  valuekey=kwargs["valuekey"]
1167  reference_xline=kwargs["reference_xline"]
1168  valuename=valuekey
1169  bins=kwargs["bins"]
1170  values_list=[]
1171  for xl in self:
1172  try:
1173  values_list.append(float(xl[valuekey]))
1174  except ValueError:
1175  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1176  continue
1178  filename, [values_list], valuename=valuename, bins=bins,
1179  colors=None, format="pdf",
1180  reference_xline=None, yplotrange=None,
1181  xplotrange=None,normalized=True,
1182  leg_names=None)
1183 
1184  def dump(self,json_filename):
1185  import json
1186  with open(json_filename, 'w') as fp:
1187  json.dump(self.data_base, fp, sort_keys=True, indent=2, default=set_json_default)
1188 
1189  def load(self,json_filename):
1190  import json
1191  with open(json_filename, 'r') as fp:
1192  self.data_base = json.load(fp)
1193  self._update()
1194  #getting rid of unicode
1195  # (can't do this in Python 3, since *everything* is Unicode there)
1196  if sys.version_info[0] < 3:
1197  for xl in self:
1198  for k,v in xl.iteritems():
1199  if type(k) is unicode: k=str(k)
1200  if type(v) is unicode: v=str(v)
1201  xl[k]=v
1202 
1203  def save_csv(self,filename):
1204 
1205  import csv
1206 
1207  data=[]
1208  sorted_ids=None
1209  sorted_group_ids=sorted(self.data_base.keys())
1210  for group in sorted_group_ids:
1211  group_block=[]
1212  for xl in self.data_base[group]:
1213  if not sorted_ids:
1214  sorted_ids=sorted(xl.keys())
1215  values=[xl[k] for k in sorted_ids]
1216  group_block.append(values)
1217  data+=group_block
1218 
1219 
1220  with open(filename, 'w') as fp:
1221  a = csv.writer(fp, delimiter=',')
1222  a.writerow(sorted_ids)
1223  a.writerows(data)
1224 
1226  """
1227  Returns the number of non redundant crosslink sites
1228  """
1229  praset=set()
1230  for xl in self:
1231  pra=_ProteinsResiduesArray(xl)
1232  prai=pra.get_inverted()
1233  praset.add(pra)
1234  praset.add(prai)
1235  return len(list(praset))
1236 
1238  """This class allows to compute and plot the distance between datasets"""
1239 
1240  def __init__(self,cldb_dictionary):
1241  """Input a dictionary where keys are cldb names and values are cldbs"""
1242  import scipy.spatial.distance
1243  self.dbs=cldb_dictionary
1244  self.keylist=list(self.dbs.keys())
1245  self.distances=list()
1246 
1247 
1248  for i,key1 in enumerate(self.keylist):
1249  for key2 in self.keylist[i+1:]:
1250  distance=self.get_distance(key1,key2)
1251  self.distances.append(distance)
1252 
1253  self.distances=scipy.spatial.distance.squareform(self.distances)
1254 
1255  def get_distance(self,key1,key2):
1256  return 1.0-self.jaccard_index(self.dbs[key1],self.dbs[key2])
1257 
1258  def jaccard_index(self,CrossLinkDataBase1,CrossLinkDataBase2):
1259  """Similarity index between two datasets
1260  https://en.wikipedia.org/wiki/Jaccard_index"""
1261 
1262  set1=set()
1263  set2=set()
1264  for xl1 in CrossLinkDataBase1:
1265  a1f=_ProteinsResiduesArray(xl1)
1266  a1b=a1f.get_inverted()
1267  set1.add(a1f)
1268  set1.add(a1b)
1269  for xl2 in CrossLinkDataBase2:
1270  a2f=_ProteinsResiduesArray(xl2)
1271  a2b=a2f.get_inverted()
1272  set2.add(a2f)
1273  set2.add(a2b)
1274  return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1275 
1276  def plot_matrix(self,figurename="clustermatrix.pdf"):
1277  import matplotlib as mpl
1278  import numpy
1279  mpl.use('Agg')
1280  import matplotlib.pylab as pl
1281  from scipy.cluster import hierarchy as hrc
1282 
1283  raw_distance_matrix=self.distances
1284  labels=self.keylist
1285 
1286  fig = pl.figure()
1287  #fig.autolayout=True
1288 
1289  ax = fig.add_subplot(1,1,1)
1290  dendrogram = hrc.dendrogram(
1291  hrc.linkage(raw_distance_matrix),
1292  color_threshold=7,
1293  no_labels=True)
1294  leaves_order = dendrogram['leaves']
1295  ax.set_xlabel('Dataset')
1296  ax.set_ylabel('Jaccard Distance')
1297  pl.tight_layout()
1298  pl.savefig("dendrogram."+figurename, dpi=300)
1299  pl.close(fig)
1300 
1301  fig = pl.figure()
1302  #fig.autolayout=True
1303 
1304  ax = fig.add_subplot(1,1,1)
1305  cax = ax.imshow(
1306  raw_distance_matrix[leaves_order,
1307  :][:,
1308  leaves_order],
1309  interpolation='nearest')
1310  cb = fig.colorbar(cax)
1311  cb.set_label('Jaccard Distance')
1312  ax.set_xlabel('Dataset')
1313  ax.set_ylabel('Dataset')
1314  ax.set_xticks(range(len(labels)))
1315  ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation='vertical')
1316  ax.set_yticks(range(len(labels)))
1317  ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation='horizontal')
1318  pl.tight_layout()
1319  pl.savefig("matrix."+figurename, dpi=300)
1320  pl.close(fig)
1321 
1322 
1324  '''
1325  This class maps a CrossLinkDataBase on a given structure
1326  and save an rmf file with color-coded crosslinks
1327  '''
1328 
1329 
1330  def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1331  self.CrossLinkDataBase=CrossLinkDataBase
1332  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1333  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1334  self.prots=rmf_or_stat_handler
1335  self.distances=defaultdict(list)
1336  self.array_to_id={}
1337  self.id_to_array={}
1338 
1339  print("computing distances fro all crosslinks and all structures")
1340  for i in self.prots[::10]:
1341  self.compute_distances()
1342  for key,xl in enumerate(self.CrossLinkDataBase):
1343  array=_ProteinsResiduesArray(xl)
1344  self.array_to_id[array]=key
1345  self.id_to_array[key]=array
1346  if xl["MinAmbiguousDistance"] is not 'None':
1347  self.distances[key].append(xl["MinAmbiguousDistance"])
1348 
1349  def compute_distances(self):
1350  data=[]
1351  sorted_ids=None
1352  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1353  for group in sorted_group_ids:
1354  #group_block=[]
1355  group_dists=[]
1356  for xl in self.CrossLinkDataBase.data_base[group]:
1357  if not sorted_ids:
1358  sorted_ids=sorted(xl.keys())
1359  data.append(sorted_ids+["UniqueID","Distance","MinAmbiguousDistance"])
1360  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1361  try:
1362  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1363  except:
1364  mdist="None"
1365  state1="None"
1366  copy1="None"
1367  state2="None"
1368  copy2="None"
1369  try:
1370  # sometimes keys get "lost" in the database, not really sure why
1371  values=[xl[k] for k in sorted_ids]
1372  values += [group, mdist]
1373  except KeyError as e:
1374  print("MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1375  group_dists.append(mdist)
1376  #group_block.append(values)
1377  xl["Distance"]=mdist
1378  xl["State1"]=state1
1379  xl["Copy1"]=copy1
1380  xl["State2"]=state2
1381  xl["Copy2"]=copy2
1382 
1383  for xl in self.CrossLinkDataBase.data_base[group]:
1384  xl["MinAmbiguousDistance"]=min(group_dists)
1385 
1386  def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1387  '''more robust and slower version of above'''
1388  sel=IMP.atom.Selection(self.prots,molecule=c1,residue_index=r1,resolution=1)
1389  selpart_1=sel.get_selected_particles()
1390  if len(selpart_1)==0:
1391  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1392  return None
1393  sel=IMP.atom.Selection(self.prots,molecule=c2,residue_index=r2,resolution=1)
1394  selpart_2=sel.get_selected_particles()
1395  if len(selpart_2)==0:
1396  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1397  return None
1398  results=[]
1399  for p1 in selpart_1:
1400  for p2 in selpart_2:
1401  if p1 == p2 and r1 == r2: continue
1402  d1=IMP.core.XYZ(p1)
1403  d2=IMP.core.XYZ(p2)
1404  #round distance to second decimal
1405  dist=float(int(IMP.core.get_distance(d1,d2)*100.0))/100.0
1406  h1=IMP.atom.Hierarchy(p1)
1407  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1408  h1=h1.get_parent()
1409  copy_index1=IMP.atom.Copy(h1).get_copy_index()
1410  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1411  h1=h1.get_parent()
1412  state_index1=IMP.atom.State(h1).get_state_index()
1413  h2=IMP.atom.Hierarchy(p2)
1414  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1415  h2=h2.get_parent()
1416  copy_index2=IMP.atom.Copy(h2).get_copy_index()
1417  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1418  h2=h2.get_parent()
1419  state_index2=IMP.atom.State(h2).get_state_index()
1420 
1421  results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1422  if len(results)==0: return None
1423  results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1424  return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1425 
1426  def save_rmf_snapshot(self,filename,color_id=None):
1427  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1428  list_of_pairs=[]
1429  color_scores=[]
1430  for group in sorted_group_ids:
1431  group_dists_particles=[]
1432  for xl in self.CrossLinkDataBase.data_base[group]:
1433  xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1434  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1435  try:
1436  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1437  except TypeError:
1438  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1439  continue
1440  if color_id is not None:
1441  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1442  else:
1443  group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1444  if group_dists_particles:
1445  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1446  color_scores.append(mincolor_score)
1447  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1448  else:
1449  continue
1450 
1451  linear = IMP.core.Linear(0, 0.0)
1452  linear.set_slope(1.0)
1453  dps2 = IMP.core.DistancePairScore(linear)
1454  rslin = IMP.RestraintSet(self.prots.get_model(), 'linear_dummy_restraints')
1455  sgs=[]
1456  offset=min(color_scores)
1457  maxvalue=max(color_scores)
1458  for pair in list_of_pairs:
1459  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2, (pair[0], pair[1]))
1460  pr.set_name(pair[2])
1461  if offset<maxvalue:
1462  factor=(pair[3]-offset)/(maxvalue-offset)
1463  else:
1464  factor=0.0
1465  c=IMP.display.get_rgb_color(factor)
1466  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1467  rslin.add_restraint(pr)
1468  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1469 
1470  rh = RMF.create_rmf_file(filename)
1471  IMP.rmf.add_hierarchies(rh, [self.prots])
1472  IMP.rmf.add_restraints(rh,[rslin])
1473  IMP.rmf.add_geometries(rh, sgs)
1474  IMP.rmf.save_frame(rh)
1475  del rh
1476 
1477  def boxplot_crosslink_distances(self,filename):
1478  import numpy
1479  keys=[self.id_to_array[k] for k in self.distances.keys()]
1480  medians=[numpy.median(self.distances[self.array_to_id[k]]) for k in keys]
1481  dists=[self.distances[self.array_to_id[k]] for k in keys]
1482  distances_sorted_by_median=[x for _,x in sorted(zip(medians,dists))]
1483  keys_sorted_by_median=[x for _,x in sorted(zip(medians,keys))]
1485  distances_sorted_by_median,
1486  range(len(distances_sorted_by_median)),
1487  xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1488 
1489  def get_crosslink_violations(self,threshold):
1490  violations=defaultdict(list)
1491  for k in self.distances:
1492  #viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1493  viols=self.distances[k]
1494  violations[self.id_to_array[k]]=viols
1495  return violations
1496 
1497 
1499  '''
1500  This class generates a CrossLinkDataBase from a given structure
1501  '''
1502  def __init__(self,representation=None,
1503  system=None,
1504  residue_types_1=["K"],
1505  residue_types_2=["K"],
1506  reactivity_range=[0,1],
1507  kt=1.0):
1508 
1509  import numpy.random
1510  import math
1512  cldbkc.set_protein1_key("Protein1")
1513  cldbkc.set_protein2_key("Protein2")
1514  cldbkc.set_residue1_key("Residue1")
1515  cldbkc.set_residue2_key("Residue2")
1516  self.cldb=CrossLinkDataBase(cldbkc)
1517  if representation is not None:
1518  #PMI 1.0 mode
1519  self.mode="pmi1"
1520  self.representation=representation
1521  self.model=self.representation.model
1522  elif system is not None:
1523  #PMI 2.0 mode
1524  self.system=system
1525  self.model=self.system.model
1526  self.mode="pmi2"
1527  else:
1528  print("Argument error: please provide either a representation object or a IMP.Hierarchy")
1529  raise
1530  self.residue_types_1=residue_types_1
1531  self.residue_types_2=residue_types_2
1532  self.kt=kt
1533  self.indexes_dict1={}
1534  self.indexes_dict2={}
1535  self.protein_residue_dict={}
1536  self.reactivity_dictionary={}
1537  self.euclidean_interacting_pairs=None
1538  self.xwalk_interacting_pairs=None
1539  import random
1540 
1541  if self.mode=="pmi1":
1542  for protein in self.representation.sequence_dict.keys():
1543  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1544  seq=self.representation.sequence_dict[protein]
1545  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1546 
1547  for r in residues:
1548  # uniform random reactivities
1549  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1550  # getting reactivities from the CDF of an exponential distribution
1551  rexp=numpy.random.exponential(0.1)
1552  prob=1.0-math.exp(-rexp)
1553  self.reactivity_dictionary[(protein,r)]=prob
1554 
1555 
1556  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1557  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1558  for r in residues1:
1559  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1560  p=h.get_particle()
1561  index=p.get_index()
1562  self.indexes_dict1[index]=(protein,r)
1563  self.protein_residue_dict[(protein,r)]=index
1564  for r in residues2:
1565  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1566  p=h.get_particle()
1567  index=p.get_index()
1568  self.indexes_dict2[index]=(protein,r)
1569  self.protein_residue_dict[(protein,r)]=index
1570 
1571  if self.mode=="pmi2":
1572  for state in self.system.get_states():
1573  for moleculename,molecules in state.get_molecules().items():
1574  for molecule in molecules:
1575  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1576  seq=molecule.sequence
1577  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1578 
1579  for r in residues:
1580  # uniform random reactivities
1581  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1582  # getting reactivities from the CDF of an exponential distribution
1583  rexp=numpy.random.exponential(0.00000001)
1584  prob=1.0-math.exp(-rexp)
1585  self.reactivity_dictionary[(molecule,r)]=prob
1586 
1587  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1588  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1589  for r in residues1:
1590  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1591  ps=s.get_selected_particles()
1592  for p in ps:
1593  index=p.get_index()
1594  self.indexes_dict1[index]=(molecule,r)
1595  self.protein_residue_dict[(molecule,r)]=index
1596  for r in residues2:
1597  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1598  ps=s.get_selected_particles()
1599  for p in ps:
1600  index=p.get_index()
1601  self.indexes_dict2[index]=(molecule,r)
1602  self.protein_residue_dict[(molecule,r)]=index
1603 
1604 
1605  def get_all_possible_pairs(self):
1606  n=float(len(self.protein_residue_dict.keys()))
1607  return n*(n-1.0)/2.0
1608 
1609  def get_all_feasible_pairs(self,distance=21):
1610  import itertools
1611  particle_index_pairs=[]
1612  nxl=0
1613  for a,b in itertools.combinations(self.protein_residue_dict.keys(),2):
1614 
1615  new_xl={}
1616  index1=self.protein_residue_dict[a]
1617  index2=self.protein_residue_dict[b]
1618  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1619  if particle_distance <= distance:
1620  particle_index_pairs.append((index1,index2))
1621  if self.mode=="pmi1":
1622  new_xl[self.cldb.protein1_key]=a[0]
1623  new_xl[self.cldb.protein2_key]=b[0]
1624  elif self.mode=="pmi2":
1625  new_xl[self.cldb.protein1_key]=a[0].get_name()
1626  new_xl[self.cldb.protein2_key]=b[0].get_name()
1627  new_xl["molecule_object1"]=a[0]
1628  new_xl["molecule_object2"]=b[0]
1629  new_xl[self.cldb.residue1_key]=a[1]
1630  new_xl[self.cldb.residue2_key]=b[1]
1631  self.cldb.data_base[str(nxl)]=[new_xl]
1632  nxl+=1
1633  self.cldb._update()
1634  return self.cldb
1635 
1636 
1637 
1638 
1639  def get_data_base(self,total_number_of_spectra,
1640  ambiguity_probability=0.1,
1641  noise=0.01,
1642  distance=21,
1643  max_delta_distance=10.0,
1644  xwalk_bin_path=None,
1645  confidence_false=0.75,
1646  confidence_true=0.75):
1647  import math
1648  from random import random,uniform
1649  import numpy as np
1650  number_of_spectra=1
1651 
1652  self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1653  self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1654  self.cldb.data_base[str(number_of_spectra)]=[]
1655  self.sites_weighted=None
1656 
1657  while number_of_spectra<total_number_of_spectra:
1658  if random() > ambiguity_probability and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1659  # new spectrum
1660  number_of_spectra+=1
1661  self.cldb.data_base[str(number_of_spectra)]=[]
1662  noisy=False
1663  if random() > noise:
1664  # not noisy crosslink
1665  pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1666  else:
1667  # noisy crosslink
1668  pra,dist=self.get_random_residue_pair(None,None,None)
1669  noisy=True
1670 
1671  new_xl={}
1672  if self.mode=="pmi1":
1673  new_xl[self.cldb.protein1_key]=pra[0]
1674  new_xl[self.cldb.protein2_key]=pra[1]
1675  elif self.mode=="pmi2":
1676  new_xl[self.cldb.protein1_key]=pra[0].get_name()
1677  new_xl[self.cldb.protein2_key]=pra[1].get_name()
1678  new_xl["molecule_object1"]=pra[0]
1679  new_xl["molecule_object2"]=pra[1]
1680  new_xl[self.cldb.residue1_key]=pra[2]
1681  new_xl[self.cldb.residue2_key]=pra[3]
1682  new_xl["Noisy"]=noisy
1683  # the reactivity is defined as r=1-exp(-k*Delta t)
1684  new_xl["Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1685  new_xl["Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1686  r1=new_xl["Reactivity_Residue1"]
1687  r2=new_xl["Reactivity_Residue2"]
1688  #combined reactivity 1-exp(-k12*Delta t),
1689  # k12=k1*k2/(k1+k2)
1690  #new_xl["Reactivity"]=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1691  if noisy:
1692  #new_xl["Score"]=uniform(-1.0,1.0)
1693  new_xl["Score"]=np.random.beta(1.0,self.beta_false)
1694  else:
1695  #new_xl["Score"]=new_xl["Reactivity"]+uniform(0.0,2.0)
1696  new_xl["Score"]=1.0-np.random.beta(1.0,self.beta_true)
1697  new_xl["TargetDistance"]=dist
1698  new_xl["NoiseProbability"]=noise
1699  new_xl["AmbiguityProbability"]=ambiguity_probability
1700  # getting if it is intra or inter rigid body
1701  (p1,p2)=IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1702  self.protein_residue_dict[(pra[1],pra[3])]])
1705  IMP.core.RigidMember(p1).get_rigid_body() ==
1706  IMP.core.RigidMember(p2).get_rigid_body()):
1707  new_xl["InterRigidBody"] = False
1708  elif (IMP.core.RigidMember.get_is_setup(p1) and
1710  IMP.core.RigidMember(p1).get_rigid_body() !=
1711  IMP.core.RigidMember(p2).get_rigid_body()):
1712  new_xl["InterRigidBody"] = True
1713  else:
1714  new_xl["InterRigidBody"] = None
1715 
1716  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1717  self.cldb._update()
1718  return self.cldb
1719 
1720 
1721  def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1722  import IMP.pmi.tools
1723  import math
1724  from random import choice,uniform
1725  if distance is None:
1726  # get a random pair
1727  while True:
1728  if self.mode=="pmi1":
1729  protein1=choice(self.representation.sequence_dict.keys())
1730  protein2=choice(self.representation.sequence_dict.keys())
1731  seq1=self.representation.sequence_dict[protein1]
1732  seq2=self.representation.sequence_dict[protein2]
1733  residue1=choice([i for i in range(1,len(seq1)+1) if seq1[i-1] in self.residue_types_1])
1734  residue2=choice([i for i in range(1,len(seq2)+1) if seq2[i-1] in self.residue_types_2])
1735  h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
1736  h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
1737  particle_distance=IMP.core.get_distance(IMP.core.XYZ(h1.get_particle()),IMP.core.XYZ(h2.get_particle()))
1738  if (protein1,residue1) != (protein2,residue2):
1739  break
1740  elif self.mode=="pmi2":
1741  (protein1,residue1)=choice(self.protein_residue_dict.keys())
1742  (protein2,residue2)=choice(self.protein_residue_dict.keys())
1743  index1=self.protein_residue_dict[(protein1,residue1)]
1744  index2=self.protein_residue_dict[(protein2,residue2)]
1745  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1746  if (protein1,residue1) != (protein2,residue2):
1747  break
1748  else:
1749  # get a pair of residues whose distance is below the threshold
1750  if not xwalk_bin_path:
1752  gcpf.set_distance(distance+max_delta_distance)
1753 
1754  while True:
1755  #setup the reaction rates lists
1756  if not self.sites_weighted:
1757  self.sites_weighted=[]
1758  for key in self.reactivity_dictionary:
1759  r=self.reactivity_dictionary[key]
1760  self.sites_weighted.append((key,r))
1761  #get a random reaction site
1762  first_site=self.weighted_choice(self.sites_weighted)
1763  #get all distances
1764  if not self.euclidean_interacting_pairs:
1765  self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1766  self.indexes_dict1.keys(),
1767  self.indexes_dict2.keys())
1768  #get the partner for the first reacted site
1769  first_site_pairs = [pair for pair in self.euclidean_interacting_pairs
1770  if self.indexes_dict1[pair[0]] == first_site or
1771  self.indexes_dict2[pair[1]] == first_site]
1772  if len(first_site_pairs)==0: continue
1773  #build the list of second reaction sites
1774  second_sites_weighted=[]
1775  for pair in first_site_pairs:
1776  if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1777  if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1778  r=self.reactivity_dictionary[second_site]
1779  second_sites_weighted.append((second_site,r))
1780  second_site=self.weighted_choice(second_sites_weighted)
1781  """
1782  interacting_pairs_weighted=[]
1783  for pair in self.euclidean_interacting_pairs:
1784  r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1785  r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1786  #combined reactivity 1-exp(-k12*Delta t),
1787  # k12=k1*k2/(k1+k2)
1788  #print(r1,r2,dist)
1789  r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1790  interacting_pairs_weighted.append((pair,r12))
1791  #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1792  #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1793  #interacting_pairs_weighted.append((pair,weight1*weight2))
1794 
1795  while True:
1796  pair=self.weighted_choice(interacting_pairs_weighted)
1797  protein1,residue1=self.indexes_dict1[pair[0]]
1798  protein2,residue2=self.indexes_dict2[pair[1]]
1799  particle_pair=IMP.get_particles(self.model,pair)
1800  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1801  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1802  break
1803  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1804  #allow some flexibility
1805  prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1806  if uniform(0.0,1.0)<prob: break
1807  """
1808  protein1,residue1=first_site
1809  protein2,residue2=second_site
1810  print("CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1811  "First site",first_site,self.reactivity_dictionary[first_site],
1812  "Second site",second_site,self.reactivity_dictionary[second_site])
1813  particle_pair=IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1814  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1815 
1816  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1817  break
1818  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1819  #allow some flexibility
1820  #prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1821  #if uniform(0.0,1.0)<prob: break
1822  if particle_distance-distance < max_delta_distance: break
1823 
1824 
1825 
1826  else:
1827  if not self.xwalk_interacting_pairs:
1828  self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1829  interacting_pairs_weighted=[]
1830 
1831  for pair in self.xwalk_interacting_pairs:
1832  protein1=pair[0]
1833  protein2=pair[1]
1834  residue1=pair[2]
1835  residue2=pair[3]
1836  weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1837  weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1838  interacting_pairs_weighted.append((pair,weight1*weight2))
1839 
1840  pair=self.weighted_choice(interacting_pairs_weighted)
1841  protein1=pair[0]
1842  protein2=pair[1]
1843  residue1=pair[2]
1844  residue2=pair[3]
1845  particle_distance=float(pair[4])
1846 
1847 
1848 
1849  return ((protein1,protein2,residue1,residue2)),particle_distance
1850 
1851  def get_xwalk_distances(self,xwalk_bin_path,distance):
1852  import IMP.pmi.output
1853  import os
1854  o=IMP.pmi.output.Output(atomistic=True)
1855  o.init_pdb("xwalk.pdb",self.representation.prot)
1856  o.write_pdb("xwalk.pdb")
1857  namechainiddict=o.dictchain["xwalk.pdb"]
1858  chainiddict={}
1859 
1860  for key in namechainiddict: chainiddict[namechainiddict[key]]=key
1861  xwalkout=os.popen('java -Xmx256m -cp ' + xwalk_bin_path +' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+' -bb').read()
1862 
1863  output_list_of_distance=[]
1864  for line in xwalkout.split("\n")[0:-2]:
1865  tockens=line.split()
1866  first=tockens[2]
1867  second=tockens[3]
1868  distance=float(tockens[6])
1869  fs=first.split("-")
1870  ss=second.split("-")
1871  chainid1=fs[2]
1872  chainid2=ss[2]
1873  protein1=chainiddict[chainid1]
1874  protein2=chainiddict[chainid2]
1875  residue1=int(fs[1])
1876  residue2=int(ss[1])
1877  output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1878  return output_list_of_distance
1879 
1880 
1881  def weighted_choice(self,choices):
1882  import random
1883  # from http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
1884  total = sum(w for c, w in choices)
1885  r = random.uniform(0, total)
1886  upto = 0
1887  for c, w in choices:
1888  if upto + w > r:
1889  return c
1890  upto += w
1891  assert False, "Shouldn't get here"
1892 
1893  def save_rmf_snapshot(self,filename,color_id=None):
1894  import IMP.rmf
1895  import RMF
1896  if color_id is None:
1897  color_id="Reactivity"
1898  sorted_ids=None
1899  sorted_group_ids=sorted(self.cldb.data_base.keys())
1900  list_of_pairs=[]
1901  color_scores=[]
1902  for group in sorted_group_ids:
1903  group_xls=[]
1904  group_dists_particles=[]
1905  for xl in self.cldb.data_base[group]:
1906  xllabel=self.cldb.get_short_cross_link_string(xl)
1907  (c1,c2,r1,r2)=(xl["molecule_object1"],xl["molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1908  try:
1909  index1=self.protein_residue_dict[(c1,r1)]
1910  index2=self.protein_residue_dict[(c2,r2)]
1911  p1,p2=IMP.get_particles(self.model,[index1])[0],IMP.get_particles(self.model,[index2])[0]
1912  mdist=xl["TargetDistance"]
1913  except TypeError:
1914  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1915  continue
1916  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1917  if group_dists_particles:
1918  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1919  color_scores.append(mincolor_score)
1920  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1921  else:
1922  continue
1923 
1924 
1925  m=self.model
1926  linear = IMP.core.Linear(0, 0.0)
1927  linear.set_slope(1.0)
1928  dps2 = IMP.core.DistancePairScore(linear)
1929  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
1930  sgs=[]
1931  offset=min(color_scores)
1932  maxvalue=max(color_scores)
1933  for pair in list_of_pairs:
1934  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
1935  pr.set_name(pair[2])
1936  factor=(pair[3]-offset)/(maxvalue-offset)
1937  c=IMP.display.get_rgb_color(factor)
1938  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1939  rslin.add_restraint(pr)
1940  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1941 
1942  rh = RMF.create_rmf_file(filename)
1943  IMP.rmf.add_hierarchies(rh, [self.system.hier])
1944  IMP.rmf.add_restraints(rh,[rslin])
1945  IMP.rmf.add_geometries(rh, sgs)
1946  IMP.rmf.save_frame(rh)
1947  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1557
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1622
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:690
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:36
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
This class converts three to one letter codes, and return X for any unknown codes.
Definition: tools.py:1449
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:62
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:19
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:24
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
class to link stat files to several rmf files
Definition: output.py:1081
class to allow more advanced handling of RMF files.
Definition: output.py:956
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:29
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.