IMP logo
IMP Reference Guide  2.13.0
The Integrative Modeling Platform
io/crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 from __future__ import print_function
8 import IMP
9 import IMP.pmi
10 import IMP.pmi.output
11 import IMP.pmi.alphabets
12 import IMP.atom
13 import IMP.core
14 import IMP.algebra
15 import IMP.rmf
16 import RMF
17 import IMP.display
18 import operator
19 import math
20 import sys
21 import ihm.location
22 import ihm.dataset
23 from collections import defaultdict
24 import numpy
25 
26 # json default serializations
27 def set_json_default(obj):
28  if isinstance(obj, set):
29  return list(obj)
30  if isinstance(obj, IMP.pmi.topology.Molecule):
31  return str(obj)
32  raise TypeError
33 
34 # Handle and return data that must be a string
35 if sys.version_info[0] >= 3:
36  def _handle_string_input(inp):
37  if not isinstance(inp, str):
38  raise TypeError("expecting a string")
39  return inp
40 else:
41  def _handle_string_input(inp):
42  if not isinstance(inp, (str, unicode)):
43  raise TypeError("expecting a string or unicode")
44  # Coerce to non-unicode representation (str)
45  if isinstance(inp, unicode):
46  return str(inp)
47  else:
48  return inp
49 
50 class _CrossLinkDataBaseStandardKeys(object):
51  '''
52  This class setup all the standard keys needed to
53  identify the cross-link features from the data sets
54  '''
55  def __init__(self):
56  self.type={}
57  self.protein1_key="Protein1"
58  self.type[self.protein1_key]=str
59  self.protein2_key="Protein2"
60  self.type[self.protein2_key]=str
61  self.residue1_key="Residue1"
62  self.type[self.residue1_key]=int
63  self.residue2_key="Residue2"
64  self.type[self.residue2_key]=int
65  self.residue1_amino_acid_key="Residue1AminoAcid"
66  self.type[self.residue1_amino_acid_key]=str
67  self.residue2_amino_acid_key="Residue2AminoAcid"
68  self.type[self.residue2_amino_acid_key]=str
69  self.residue1_moiety_key="Residue1Moiety"
70  self.type[self.residue1_moiety_key]=str
71  self.residue2_moiety_key="Residue2Moiety"
72  self.type[self.residue2_moiety_key]=str
73  self.site_pairs_key="SitePairs"
74  self.type[self.site_pairs_key]=str
75  self.unique_id_key="XLUniqueID"
76  self.type[self.unique_id_key]=str
77  self.unique_sub_index_key="XLUniqueSubIndex"
78  self.type[self.unique_sub_index_key]=int
79  self.unique_sub_id_key="XLUniqueSubID"
80  self.type[self.unique_sub_id_key]=str
81  self.data_set_name_key="DataSetName"
82  self.type[self.data_set_name_key]=str
83  self.cross_linker_chemical_key="CrossLinkerChemical"
84  self.type[self.cross_linker_chemical_key]=str
85  self.id_score_key="IDScore"
86  self.type[self.id_score_key]=float
87  self.fdr_key="FDR"
88  self.type[self.fdr_key]=float
89  self.quantitation_key="Quantitation"
90  self.type[self.quantitation_key]=float
91  self.redundancy_key="Redundancy"
92  self.type[self.redundancy_key]=int
93  self.redundancy_list_key="RedundancyList"
94  self.type[self.redundancy_key]=list
95  self.ambiguity_key="Ambiguity"
96  self.type[self.ambiguity_key]=int
97  self.residue1_links_number_key="Residue1LinksNumber"
98  self.type[self.residue1_links_number_key]=int
99  self.residue2_links_number_key="Residue2LinksNumber"
100  self.type[self.residue2_links_number_key]=int
101  self.type[self.ambiguity_key]=int
102  self.state_key="State"
103  self.type[self.state_key]=int
104  self.sigma1_key="Sigma1"
105  self.type[self.sigma1_key]=str
106  self.sigma2_key="Sigma2"
107  self.type[self.sigma2_key]=str
108  self.psi_key="Psi"
109  self.type[self.psi_key]=str
110  self.distance_key="Distance"
111  self.type[self.distance_key]=float
112  self.min_ambiguous_distance_key="MinAmbiguousDistance"
113  self.type[self.distance_key]=float
114  #link types are either Monolink, Intralink or Interlink
115  self.link_type_key="LinkType"
116  self.type[self.link_type_key]=str
117 
118  self.ordered_key_list =[self.data_set_name_key,
119  self.unique_id_key,
120  self.unique_sub_index_key,
121  self.unique_sub_id_key,
122  self.protein1_key,
123  self.protein2_key,
124  self.residue1_key,
125  self.residue2_key,
126  self.residue1_amino_acid_key,
127  self.residue2_amino_acid_key,
128  self.residue1_moiety_key,
129  self.residue2_moiety_key,
130  self.cross_linker_chemical_key,
131  self.id_score_key,
132  self.fdr_key,
133  self.quantitation_key,
134  self.redundancy_key,
135  self.redundancy_list_key,
136  self.state_key,
137  self.sigma1_key,
138  self.sigma2_key,
139  self.psi_key,
140  self.distance_key,
141  self.min_ambiguous_distance_key,
142  self.link_type_key]
143 
144 
145 class _ProteinsResiduesArray(tuple):
146  '''
147  This class is inherits from tuple, and it is a shorthand for a cross-link
148  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
149  residue1 and residue2.
150  '''
151 
152  def __new__(self,input_data):
153  '''
154  @input_data can be a dict or a tuple
155  '''
156  self.cldbsk=_CrossLinkDataBaseStandardKeys()
157  if type(input_data) is dict:
158  monolink=False
159  p1=input_data[self.cldbsk.protein1_key]
160  try:
161  p2=input_data[self.cldbsk.protein2_key]
162  except KeyError:
163  monolink=True
164  r1=input_data[self.cldbsk.residue1_key]
165  try:
166  r2=input_data[self.cldbsk.residue2_key]
167  except KeyError:
168  monolink=True
169  if not monolink:
170  t=(p1,p2,r1,r2)
171  else:
172  t=(p1,"",r1,None)
173  elif type(input_data) is tuple:
174  if len(input_data)>4 or len(input_data)==3 or len(input_data)==1:
175  raise TypeError("_ProteinsResiduesArray: must have only 4 elements")
176  if len(input_data)==4:
177  p1 = _handle_string_input(input_data[0])
178  p2 = _handle_string_input(input_data[1])
179  r1=input_data[2]
180  r2=input_data[3]
181  if (not (type(r1) is int)) and (not (r1 is None)):
182  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
183  if (not (type(r2) is int)) and (not (r1 is None)):
184  raise TypeError("_ProteinsResiduesArray: residue2 must be a integer")
185  t=(p1,p2,r1,r2)
186  if len(input_data) == 2:
187  p1 = _handle_string_input(input_data[0])
188  r1 = input_data[1]
189  if type(r1) is not int:
190  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
191  t = (p1,"",r1,None)
192  else:
193  raise TypeError("_ProteinsResiduesArray: input must be a dict or tuple")
194  return tuple.__new__(_ProteinsResiduesArray, t)
195 
196  def get_inverted(self):
197  '''
198  Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
199  '''
200  return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
201 
202  def __repr__(self):
203  outstr=self.cldbsk.protein1_key+" "+str(self[0])
204  outstr+=" "+self.cldbsk.protein2_key+" "+str(self[1])
205  outstr+=" "+self.cldbsk.residue1_key+" "+str(self[2])
206  outstr+=" "+self.cldbsk.residue2_key+" "+str(self[3])
207  return outstr
208 
209  def __str__(self):
210  outstr=str(self[0])+"."+str(self[2])+"-"+str(self[1])+"."+str(self[3])
211  return outstr
212 
213 class FilterOperator(object):
214  '''
215  This class allows to create filter functions that can be passed to the CrossLinkDataBase
216  in this way:
217 
218  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
219 
220  where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
221 
222  A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
223 
224  fo.evaluate(xl)
225 
226  and it is used to filter the database
227  '''
228 
229  def __init__(self, argument1, operator, argument2):
230  '''
231  (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
232  or (FilterOperator1,operator.or|and...,FilterOperator2)
233  '''
234  if isinstance(argument1, FilterOperator):
235  self.operations = [argument1, operator, argument2]
236  else:
237  self.operations = []
238  self.values = (argument1, operator, argument2)
239 
240  def __or__(self, FilterOperator2):
241  return FilterOperator(self, operator.or_, FilterOperator2)
242 
243  def __and__(self, FilterOperator2):
244  return FilterOperator(self, operator.and_, FilterOperator2)
245 
246  def __invert__(self):
247  return FilterOperator(self, operator.not_, None)
248 
249  def evaluate(self, xl_item):
250 
251  if len(self.operations) == 0:
252  keyword, operator, value = self.values
253  return operator(xl_item[keyword], value)
254  FilterOperator1, op, FilterOperator2 = self.operations
255 
256  if FilterOperator2 is None:
257  return op(FilterOperator1.evaluate(xl_item))
258  else:
259  return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
260 
261 '''
262 def filter_factory(xl_):
263 
264  class FilterOperator(object):
265  import operator
266  xl = xl_
267 
268  def __new__(self,key,value,oper=operator.eq):
269  return oper(self.xl[key],value)
270 
271  return FilterOperator
272 '''
273 
274 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
275  '''
276  This class is needed to convert the keywords from a generic database
277  to the standard ones
278  '''
279 
280  def __init__(self, list_parser=None):
281  '''
282  @param list_parser an instance of ResiduePairListParser, if any is needed
283  '''
284  self.converter={}
285  self.backward_converter={}
286  _CrossLinkDataBaseStandardKeys.__init__(self)
287  self.rplp = list_parser
288  if self.rplp is None:
289  # either you have protein1, protein2, residue1, residue2
290  self.compulsory_keys=set([self.protein1_key,
291  self.protein2_key,
292  self.residue1_key,
293  self.residue2_key])
294  else:
295  self.compulsory_keys=self.rplp.get_compulsory_keys()
296  self.setup_keys=set()
297 
298  def check_keys(self):
299  '''
300  Is a function that check whether necessary keys are setup
301  '''
302  setup_keys=set(self.get_setup_keys())
303  if self.compulsory_keys & setup_keys != self.compulsory_keys:
304  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
305 
306  def get_setup_keys(self):
307  '''
308  Returns the keys that have been setup so far
309  '''
310  return self.backward_converter.keys()
311 
312  def set_standard_keys(self):
313  """
314  This sets up the standard compulsory keys as defined by
315  _CrossLinkDataBaseStandardKeys
316  """
317  for ck in self.compulsory_keys:
318  self.converter[ck]=ck
319  self.backward_converter[ck]=ck
320 
321  def set_unique_id_key(self,origin_key):
322  self.converter[origin_key]=self.unique_id_key
323  self.backward_converter[self.unique_id_key]=origin_key
324 
325  def set_protein1_key(self,origin_key):
326  self.converter[origin_key]=self.protein1_key
327  self.backward_converter[self.protein1_key]=origin_key
328 
329  def set_protein2_key(self,origin_key):
330  self.converter[origin_key]=self.protein2_key
331  self.backward_converter[self.protein2_key]=origin_key
332 
333  def set_residue1_key(self,origin_key):
334  self.converter[origin_key]=self.residue1_key
335  self.backward_converter[self.residue1_key]=origin_key
336 
337  def set_residue2_key(self,origin_key):
338  self.converter[origin_key]=self.residue2_key
339  self.backward_converter[self.residue2_key]=origin_key
340 
341  def set_residue1_amino_acid_key(self, origin_key):
342  self.converter[origin_key] = self.residue1_amino_acid_key
343  self.backward_converter[self.residue1_amino_acid_key] = origin_key
344 
345  def set_residue2_amino_acid_key(self, origin_key):
346  self.converter[origin_key] = self.residue2_amino_acid_key
347  self.backward_converter[self.residue2_amino_acid_key] = origin_key
348 
349  def set_residue1_moiety_key(self, origin_key):
350  self.converter[origin_key] = self.residue1_moiety_key
351  self.backward_converter[self.residue1_moiety_key] = origin_key
352 
353  def set_residue2_moiety_key(self, origin_key):
354  self.converter[origin_key] = self.residue2_moiety_key
355  self.backward_converter[self.residue2_moiety_key] = origin_key
356 
357  def set_site_pairs_key(self,origin_key):
358  self.converter[origin_key]=self.site_pairs_key
359  self.backward_converter[self.site_pairs_key]=origin_key
360 
361  def set_id_score_key(self,origin_key):
362  self.converter[origin_key]=self.id_score_key
363  self.backward_converter[self.id_score_key]=origin_key
364 
365  def set_fdr_key(self,origin_key):
366  self.converter[origin_key]=self.fdr_key
367  self.backward_converter[self.fdr_key]=origin_key
368 
369  def set_quantitation_key(self,origin_key):
370  self.converter[origin_key]=self.quantitation_key
371  self.backward_converter[self.quantitation_key]=origin_key
372 
373  def set_psi_key(self,origin_key):
374  self.converter[origin_key]=self.psi_key
375  self.backward_converter[self.psi_key]=origin_key
376 
377  def set_link_type_key(self,link_type_key):
378  self.converter[link_type_key]=self.link_type_key
379  self.backward_converter[self.link_type_key]=link_type_key
380 
381  def get_converter(self):
382  '''
383  Returns the dictionary that convert the old keywords to the new ones
384  '''
385  self.check_keys()
386  return self.converter
387 
389  '''
390  Returns the dictionary that convert the new keywords to the old ones
391  '''
392  self.check_keys()
393  return self.backward_converter
394 
395 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
396  '''
397  A class to handle different styles of site pairs parsers.
398  Implemented styles:
399  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
400  [Y3-;K4-] for dead-ends
401  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
402  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
403  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
404  '''
405 
406  import re
407 
408  def __init__(self,style):
409 
410  _CrossLinkDataBaseStandardKeys.__init__(self)
411  if style == "MSSTUDIO":
412  self.style=style
413  self.compulsory_keys= set([self.protein1_key,
414  self.protein2_key,
415  self.site_pairs_key])
416  elif style == "XTRACT" or style == "QUANTITATION":
417  self.style=style
418  self.compulsory_keys= set([self.site_pairs_key])
419  elif style == "LAN_HUANG":
420  self.style=style
421  self.compulsory_keys= set([self.site_pairs_key])
422  else:
423  raise Error("ResiduePairListParser: unknown list parser style")
424 
425  def get_compulsory_keys(self):
426  return self.compulsory_keys
427 
428  def get_list(self,input_string):
429  '''
430  This function returns a list of cross-linked residues and the corresponding list of
431  cross-linked chains. The latter list can be empty, if the style doesn't have the
432  corresponding information.
433  '''
434  if self.style == "MSSTUDIO":
435  input_string=input_string.replace("[","")
436  input_string=input_string.replace("]","")
437  input_string_pairs=input_string.split(";")
438  residue_pair_indexes=[]
439  chain_pair_indexes=[]
440  for s in input_string_pairs:
441  m1=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
442  m2=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
443  if m1:
444  # cross-link
445  residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
446  residue_pair_indexes.append((residue_index_1,residue_index_2))
447  elif m2:
448  # dead end
449  residue_type_1,residue_index_1=m2.group(1,2)
450  # at this stage chain_pair_indexes is empty
451  return residue_pair_indexes,chain_pair_indexes
452  if self.style == "XTRACT" or self.style == "QUANTITATION":
453  if ":x:" in input_string:
454  # if it is a cross-link....
455  input_strings=input_string.split(":x:")
456  first_peptides=input_strings[0].split(":|:")
457  second_peptides=input_strings[1].split(":|:")
458  first_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in first_peptides]
459  second_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in second_peptides]
460  residue_pair_indexes=[]
461  chain_pair_indexes=[]
462  for fpi in first_peptides_indentifiers:
463  for spi in second_peptides_indentifiers:
464  chain1=fpi[0]
465  chain2=spi[0]
466  residue1=fpi[1]
467  residue2=spi[1]
468  residue_pair_indexes.append((residue1,residue2))
469  chain_pair_indexes.append((chain1,chain2))
470  return residue_pair_indexes, chain_pair_indexes
471  else:
472  # if it is a monolink....
473  first_peptides = input_string.split(":|:")
474  first_peptides_indentifiers = [(f.split(":")[0], f.split(":")[1]) for f in first_peptides]
475  residue_indexes = []
476  chain_indexes = []
477  for fpi in first_peptides_indentifiers:
478  chain1=fpi[0]
479  residue1=fpi[1]
480  residue_indexes.append((residue1,))
481  chain_indexes.append((chain1,))
482  return residue_indexes, chain_indexes
483  if self.style == "LAN_HUANG":
484  input_strings=input_string.split("-")
485  chain1,first_series=input_strings[0].split(":")
486  chain2,second_series=input_strings[1].split(":")
487 
488  first_residues=first_series.replace(";","|").split("|")
489  second_residues=second_series.replace(";","|").split("|")
490  residue_pair_indexes=[]
491  chain_pair_indexes=[]
492  for fpi in first_residues:
493  for spi in second_residues:
494  residue1=self.re.sub("[^0-9]", "", fpi)
495  residue2=self.re.sub("[^0-9]", "", spi)
496  residue_pair_indexes.append((residue1,residue2))
497  chain_pair_indexes.append((chain1,chain2))
498  return residue_pair_indexes, chain_pair_indexes
499 
500 
501 
502 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
503  '''
504  A class to handle different XL format with fixed format
505  currently support ProXL
506  '''
507  def __init__(self,format):
508 
509  _CrossLinkDataBaseStandardKeys.__init__(self)
510  if format == "PROXL":
511  self.format=format
512  else:
513  raise Error("FixedFormatParser: unknown list format name")
514 
515 
516  def get_data(self,input_string):
517  if self.format == "PROXL":
518  tokens=input_string.split("\t")
519  xl={}
520  if tokens[0]=="SEARCH ID(S)":
521  return None
522  else:
523  xl[self.protein1_key]=tokens[2]
524  xl[self.protein2_key]=tokens[4]
525  xl[self.residue1_key]=int(tokens[3])
526  xl[self.residue2_key]=int(tokens[5])
527  return xl
528 
529 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
530  '''
531  this class handles a cross-link dataset and do filtering
532  operations, adding cross-links, merge datasets...
533  '''
534 
535  def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=('K',)):
536  '''
537  Constructor.
538  @param converter an instance of CrossLinkDataBaseKeywordsConverter
539  @param data_base an instance of CrossLinkDataBase to build the new database on
540  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
541  cross-link consistency. If not given consistency will not be checked
542  @param linkable_aa a tuple containing one-letter amino acids which are linkable by the cross-linker;
543  only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
544  and if a fasta_seq is given
545  '''
546 
547  if data_base is None:
548  self.data_base = {}
549  else:
550  self.data_base=data_base
551 
552  _CrossLinkDataBaseStandardKeys.__init__(self)
553  if converter is not None:
554  self.cldbkc = converter #type: CrossLinkDataBaseKeywordsConverter
555  self.list_parser=self.cldbkc.rplp
556  self.converter = converter.get_converter()
557 
558  else:
559  self.cldbkc = None #type: CrossLinkDataBaseKeywordsConverter
560  self.list_parser=None
561  self.converter = None
562 
563  # default amino acids considered to be 'linkable' if none are given
564  self.def_aa_tuple = linkable_aa
565  self.fasta_seq = fasta_seq #type: IMP.pmi.topology.Sequences
566  self.dataset = None
567  self.name = None
568  self._update()
569 
570  def _update(self):
571  '''
572  Update the whole dataset after changes
573  '''
574  self.update_cross_link_unique_sub_index()
575  self.update_cross_link_redundancy()
576  self.update_residues_links_number()
578 
579 
580  def __iter__(self):
581  sorted_ids=sorted(self.data_base.keys())
582  for k in sorted_ids:
583  for xl in self.data_base[k]:
584  yield xl
585 
586  def xlid_iterator(self):
587  sorted_ids=sorted(self.data_base.keys())
588  for xlid in sorted_ids:
589  yield xlid
590 
591  def __getitem__(self,xlid):
592  return self.data_base[xlid]
593 
594  def __len__(self):
595  return len([xl for xl in self])
596 
597  def get_name(self):
598  return self.name
599 
600  def set_name(self,name):
601  new_data_base={}
602  for k in self.data_base:
603  new_data_base[k+"."+name]=self.data_base[k]
604  self.data_base=new_data_base
605  self.name=name
606  self._update()
607 
608  def get_number_of_xlid(self):
609  return len(self.data_base)
610 
611 
612  def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
613  '''
614  if FixedFormatParser is not specified, the file is comma-separated-values
615  @param file_name a txt file to be parsed
616  @param converter an instance of CrossLinkDataBaseKeywordsConverter
617  @param FixedFormatParser a parser for a fixed format
618  '''
619  if not FixedFormatParser:
620  xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
621 
622  if converter is not None:
623  self.cldbkc = converter
624  self.list_parser=self.cldbkc.rplp
625  self.converter = converter.get_converter()
626 
627 
628  if not self.list_parser:
629  # normal procedure without a list_parser
630  # each line is a cross-link
631  new_xl_dict={}
632  for nxl,xl in enumerate(xl_list):
633  new_xl={}
634  for k in xl:
635  if k in self.converter:
636  new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
637  else:
638  new_xl[k]=xl[k]
639  if self.unique_id_key in self.cldbkc.get_setup_keys():
640  if new_xl[self.unique_id_key] not in new_xl_dict:
641  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
642  else:
643  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
644  else:
645  if str(nxl) not in new_xl_dict:
646  new_xl_dict[str(nxl)]=[new_xl]
647  else:
648  new_xl_dict[str(nxl)].append(new_xl)
649 
650  else:
651  # with a list_parser, a line can be a list of ambiguous cross-links
652  new_xl_dict={}
653  for nxl,entry in enumerate(xl_list):
654 
655  # first get the translated keywords
656  new_dict={}
657  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
658  raise Error("CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
659  for k in entry:
660  if k in self.converter:
661  new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
662  else:
663  new_dict[k]=entry[k]
664 
665  residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
666 
667  # then create the cross-links
668  for n,p in enumerate(residue_pair_list):
669  is_monolink=False
670  if len(p)==1:
671  is_monolink=True
672 
673  new_xl={}
674  for k in new_dict:
675  new_xl[k]=new_dict[k]
676  new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
677  if not is_monolink:
678  new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
679 
680  if len(chain_pair_list)==len(residue_pair_list):
681  new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
682  if not is_monolink:
683  new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
684 
685  if not is_monolink:
686  new_xl[self.link_type_key]="CROSSLINK"
687  else:
688  new_xl[self.link_type_key]="MONOLINK"
689 
690  if self.unique_id_key in self.cldbkc.get_setup_keys():
691  if new_xl[self.unique_id_key] not in new_xl_dict:
692  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
693  else:
694  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
695  else:
696  if str(nxl) not in new_xl_dict:
697  new_xl[self.unique_id_key]=str(nxl+1)
698  new_xl_dict[str(nxl)]=[new_xl]
699  else:
700  new_xl[self.unique_id_key]=str(nxl+1)
701  new_xl_dict[str(nxl)].append(new_xl)
702 
703 
704  else:
705  '''
706  if FixedFormatParser is defined
707  '''
708 
709  new_xl_dict={}
710  f=open(file_name,"r")
711  nxl=0
712  for line in f:
713  xl=FixedFormatParser.get_data(line)
714  if xl:
715  xl[self.unique_id_key]=str(nxl+1)
716  new_xl_dict[str(nxl)]=[xl]
717  nxl+=1
718 
719 
720  self.data_base=new_xl_dict
721  self.name=file_name
722  l = ihm.location.InputFileLocation(file_name, details='Crosslinks')
723  self.dataset = ihm.dataset.CXMSDataset(l)
724  self._update()
725 
726  def update_cross_link_unique_sub_index(self):
727  for k in self.data_base:
728  for n,xl in enumerate(self.data_base[k]):
729  xl[self.ambiguity_key]=len(self.data_base[k])
730  xl[self.unique_sub_index_key]=n+1
731  xl[self.unique_sub_id_key]=k+"."+str(n+1)
732 
733  def update_cross_link_redundancy(self):
734  redundancy_data_base={}
735  for xl in self:
736  pra=_ProteinsResiduesArray(xl)
737  if pra not in redundancy_data_base:
738  redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
739  redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
740  else:
741  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
742  redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
743  for xl in self:
744  pra=_ProteinsResiduesArray(xl)
745  xl[self.redundancy_key]=len(redundancy_data_base[pra])
746  xl[self.redundancy_list_key]=redundancy_data_base[pra]
747 
748  def update_residues_links_number(self):
749  residue_links={}
750  for xl in self:
751  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
752  if (p1,r1) not in residue_links:
753  residue_links[(p1,r1)]=set([(p2,r2)])
754  else:
755  residue_links[(p1,r1)].add((p2,r2))
756  if (p2,r2) not in residue_links:
757  residue_links[(p2,r2)]=set([(p1,r1)])
758  else:
759  residue_links[(p2,r2)].add((p1,r1))
760 
761  for xl in self:
762  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
763  xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
764  xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
765 
767  """This function checks the consistency of the dataset with the amino acid sequence"""
768  if self.cldbkc and self.fasta_seq:
769  cnt_matched, cnt_matched_file = 0, 0
770  matched = {}
771  non_matched = {}
772  for xl in self:
773  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
774  b_matched_file = False
775  if self.residue1_amino_acid_key in xl:
776  # either you know the residue type and aa_tuple is a single entry
777  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
778  b_matched = self._match_xlinks(p1, r1, aa_from_file)
779  b_matched_file = b_matched
780  else:
781  # or pass the possible list of types that can be cross-linked
782  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
783 
784  matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
785 
786  if self.residue2_amino_acid_key in xl:
787  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
788  b_matched = self._match_xlinks(p2, r2, aa_from_file)
789  b_matched_file = b_matched
790  else:
791  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
792 
793  matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
794  if b_matched: cnt_matched += 1
795  if b_matched_file: cnt_matched_file += 1
796  if len(self) > 0:
797  percentage_matched = round(100*cnt_matched/len(self),1)
798  percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
799  #if matched: print "Matched xlinks:", matched
800  if matched or non_matched: print("check_cross_link_consistency: Out of %d cross-links "
801  "%d were matched to the fasta sequence (%f %%).\n "
802  "%d were matched by using the cross-link file (%f %%)."%
803  (len(self),cnt_matched,percentage_matched,cnt_matched_file,
804  percentage_matched_file) )
805  if non_matched: print("check_cross_link_consistency: Warning: Non matched xlinks:",
806  [(prot_name, sorted(list(non_matched[prot_name]))) for prot_name in non_matched])
807  return matched,non_matched
808 
809  def _match_xlinks(self, prot_name, res_index, aa_tuple):
810  # returns Boolean whether given aa matches a position in the fasta file
811  # cross-link files usually start counting at 1 and not 0; therefore subtract -1 to compare with fasta
812  amino_dict = IMP.pmi.alphabets.amino_acid
813  res_index -= 1
814  for amino_acid in aa_tuple:
815  if len(amino_acid) == 3:
816  amino_acid = amino_dict.get_one_letter_code_from_residue_type(
817  amino_acid.upper())
818  if prot_name in self.fasta_seq.sequences:
819  seq = self.fasta_seq.sequences[prot_name]
820  # if we are looking at the first amino acid in a given sequence always return a match
821  # the first aa should always be the n-terminal aa
822  # which may form a cross-link in any case (for BS3-like cross-linkers)
823  # for some data sets the first aa is at position 1; todo: check why this is the case
824  if res_index == 0 or res_index == 1:
825  return True
826  if res_index < len(seq):
827  if amino_acid == seq[res_index]:
828  return True
829  # else:
830  # print "Could not match", prot, res+1, amino_acid, seq[res]
831  return False
832 
833  def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
834  if b_matched:
835  if prot in matched:
836  matched[prot].add(res)
837  else:
838  matched[prot] = set([res])
839  else:
840  if prot in non_matched:
841  non_matched[prot].add(res)
842  else:
843  non_matched[prot] = set([res])
844  return matched, non_matched
845 
846 
847  def get_cross_link_string(self,xl):
848  string='|'
849  for k in self.ordered_key_list:
850  try:
851  string+=str(k)+":"+str(xl[k])+"|"
852  except KeyError:
853  continue
854 
855  for k in xl:
856  if k not in self.ordered_key_list:
857  string+=str(k)+":"+str(xl[k])+"|"
858 
859  return string
860 
861  def get_short_cross_link_string(self,xl):
862 
863  string='|'
864  list_of_keys=[self.data_set_name_key,
865  self.unique_sub_id_key,
866  self.protein1_key,
867  self.residue1_key,
868  self.protein2_key,
869  self.residue2_key,
870  self.state_key,
871  self.psi_key]
872 
873  for k in list_of_keys:
874  try:
875  string+=str(xl[k])+"|"
876  except KeyError:
877  continue
878 
879  return string
880 
881  def filter(self,FilterOperator):
882  new_xl_dict={}
883  for id in self.data_base.keys():
884  for xl in self.data_base[id]:
885  if FilterOperator.evaluate(xl):
886  if id not in new_xl_dict:
887  new_xl_dict[id]=[xl]
888  else:
889  new_xl_dict[id].append(xl)
890  self._update()
891  cdb = CrossLinkDataBase(self.cldbkc,new_xl_dict)
892  cdb.dataset = self.dataset
893  return cdb
894 
895  def filter_score(self,score):
896  '''Get all cross-links with score greater than an input value'''
897  FilterOperator=IMP.pmi.io.crosslink.FilterOperator(self.id_score_key,operator.gt,score)
898  return self.filter(FilterOperator)
899 
900  def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
901  '''
902  This function merges two cross-link datasets so that if two conflicting cross-links have the same
903  cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
904  with different SubIDs
905  '''
906  pass
907 
908  def append_database(self,CrossLinkDataBase2):
909  '''
910  This function append one cross-link dataset to another. Unique ids will be renamed to avoid
911  conflicts.
912  '''
913  name1=self.get_name()
914  name2=CrossLinkDataBase2.get_name()
915  if name1 == name2:
916  name1=id(self)
917  name2=id(CrossLinkDataBase2)
918  self.set_name(name1)
919  CrossLinkDataBase2.set_name(name2)
920 
921  #rename first database:
922  new_data_base={}
923  for k in self.data_base:
924  new_data_base[k]=self.data_base[k]
925  for k in CrossLinkDataBase2.data_base:
926  new_data_base[k]=CrossLinkDataBase2.data_base[k]
927  self.data_base=new_data_base
928  self._update()
929 
930  def set_value(self,key,new_value,FilterOperator=None):
931  '''
932  This function changes the value for a given key in the database
933  For instance one can change the name of a protein
934  @param key: the key in the database that must be changed
935  @param new_value: the new value of the key
936  @param FilterOperator: optional FilterOperator to change the value to
937  a subset of the database
938 
939  example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
940  '''
941 
942  for xl in self:
943  if FilterOperator is not None:
944  if FilterOperator.evaluate(xl):
945  xl[key]=new_value
946  else:
947  xl[key]=new_value
948  self._update()
949 
950  def get_values(self,key):
951  '''
952  this function returns the list of values for a given key in the database
953  alphanumerically sorted
954  '''
955  values=set()
956  for xl in self:
957  values.add(xl[key])
958  return sorted(list(values))
959 
960  def offset_residue_index(self,protein_name,offset):
961  '''
962  This function offset the residue indexes of a given protein by a specified value
963  @param protein_name: the protein name that need to be changed
964  @param offset: the offset value
965  '''
966 
967  for xl in self:
968  if xl[self.protein1_key] == protein_name:
969  xl[self.residue1_key]=xl[self.residue1_key]+offset
970  if xl[self.protein2_key] == protein_name:
971  xl[self.residue2_key]=xl[self.residue2_key]+offset
972  self._update()
973 
974  def create_new_keyword(self,keyword,values_from_keyword=None):
975  '''
976  This function creates a new keyword for the whole database and set the values from
977  and existing keyword (optional), otherwise the values are set to None
978  @param keyword the new keyword name:
979  @param values_from_keyword the keyword from which we are copying the values:
980  '''
981  for xl in self:
982  if values_from_keyword is not None:
983  xl[keyword] = xl[values_from_keyword]
984  else:
985  xl[keyword] = None
986  self._update()
987 
988  def rename_proteins(self,old_to_new_names_dictionary, protein_to_rename="both"):
989  '''
990  This function renames all proteins contained in the input dictionary
991  from the old names (keys) to the new name (values)
992  @param old_to_new_names_dictionary dictionary for converting old to new names
993  @param protein_to_rename specify whether to rename both or protein1 or protein2 only
994  '''
995 
996  for old_name in old_to_new_names_dictionary:
997  new_name=old_to_new_names_dictionary[old_name]
998  if protein_to_rename == "both" or protein_to_rename == "protein1":
999  fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
1000  self.set_value(self.protein1_key,new_name,fo2)
1001  if protein_to_rename == "both" or protein_to_rename == "protein2":
1002  fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
1003  self.set_value(self.protein2_key,new_name,fo2)
1004 
1005  def classify_crosslinks_by_score(self,number_of_classes):
1006  '''
1007  This function creates as many classes as in the input (number_of_classes: integer)
1008  and partition cross-links according to their identification scores. Classes are defined in the psi key.
1009  '''
1010 
1011  if self.id_score_key is not None:
1012  scores=self.get_values(self.id_score_key)
1013  else:
1014  raise ValueError('The cross-link database does not contain score values')
1015  minscore=min(scores)
1016  maxscore=max(scores)
1017  scoreclasses=numpy.linspace(minscore, maxscore, number_of_classes+1)
1018  if self.psi_key is None:
1019  self.create_new_keyword(self.psi_key,values_from_keyword=None)
1020  for xl in self:
1021  score=xl[self.id_score_key]
1022  for n,classmin in enumerate(scoreclasses[0:-1]):
1023  if score>=classmin and score<=scoreclasses[n+1]:
1024  xl[self.psi_key]=str("CLASS_"+str(n))
1025  self._update()
1026 
1027  def clone_protein(self,protein_name,new_protein_name):
1028  new_xl_dict={}
1029  for id in self.data_base.keys():
1030  new_data_base=[]
1031  for xl in self.data_base[id]:
1032  new_data_base.append(xl)
1033  if xl[self.protein1_key]==protein_name and xl[self.protein2_key]!=protein_name:
1034  new_xl=dict(xl)
1035  new_xl[self.protein1_key]=new_protein_name
1036  new_data_base.append(new_xl)
1037  elif xl[self.protein1_key]!=protein_name and xl[self.protein2_key]==protein_name:
1038  new_xl=dict(xl)
1039  new_xl[self.protein2_key]=new_protein_name
1040  new_data_base.append(new_xl)
1041  elif xl[self.protein1_key]==protein_name and xl[self.protein2_key]==protein_name:
1042  new_xl=dict(xl)
1043  new_xl[self.protein1_key]=new_protein_name
1044  new_data_base.append(new_xl)
1045  new_xl=dict(xl)
1046  new_xl[self.protein2_key]=new_protein_name
1047  new_data_base.append(new_xl)
1048  new_xl=dict(xl)
1049  new_xl[self.protein1_key]=new_protein_name
1050  new_xl[self.protein2_key]=new_protein_name
1051  new_data_base.append(new_xl)
1052  self.data_base[id]=new_data_base
1053  self._update()
1054 
1056  '''
1057  This function remove cross-links applied to the same residue
1058  (ie, same chain name and residue number)
1059  '''
1060  new_xl_dict={}
1061  for id in self.data_base.keys():
1062  new_data_base=[]
1063  for xl in self.data_base[id]:
1064  if xl[self.protein1_key]==xl[self.protein2_key] and xl[self.residue1_key]==xl[self.residue2_key]:
1065  continue
1066  else:
1067  new_data_base.append(xl)
1068  self.data_base[id]=new_data_base
1069  self._update()
1070 
1071 
1072  def jackknife(self,percentage):
1073  '''
1074  this method returns a CrossLinkDataBase class containing
1075  a random subsample of the original cross-link database.
1076  @param percentage float between 0 and 1, is the percentage of
1077  of spectra taken from the original list
1078  '''
1079  import random
1080  if percentage > 1.0 or percentage < 0.0:
1081  raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
1082  nspectra=self.get_number_of_xlid()
1083  nrandom_spectra=int(nspectra*percentage)
1084  random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1085  new_data_base={}
1086  for k in random_keys:
1087  new_data_base[k]=self.data_base[k]
1088  return CrossLinkDataBase(self.cldbkc,new_data_base)
1089 
1090  def __str__(self):
1091  outstr=''
1092  sorted_ids=sorted(self.data_base.keys())
1093 
1094  for id in sorted_ids:
1095  outstr+=id+"\n"
1096  for xl in self.data_base[id]:
1097  for k in self.ordered_key_list:
1098  try:
1099  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1100  except KeyError:
1101  continue
1102 
1103  for k in xl:
1104  if k not in self.ordered_key_list:
1105  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1106  outstr+="-------------\n"
1107  return outstr
1108 
1109 
1110  def plot(self,filename,**kwargs):
1111  import matplotlib
1112  matplotlib.use('Agg')
1113  import matplotlib.pyplot as plt
1114  import matplotlib.colors
1115 
1116 
1117 
1118  if kwargs["type"] == "scatter":
1119  cmap=plt.get_cmap("rainbow")
1120  norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1121  xkey=kwargs["xkey"]
1122  ykey=kwargs["ykey"]
1123  if "colorkey" in kwargs:
1124  colorkey=kwargs["colorkey"]
1125  if "sizekey" in kwargs:
1126  sizekey=kwargs["sizekey"]
1127  if "logyscale" in kwargs:
1128  logyscale=kwargs["logyscale"]
1129  else:
1130  logyscale=False
1131  xs=[]
1132  ys=[]
1133  colors=[]
1134  for xl in self:
1135  try:
1136  xs.append(float(xl[xkey]))
1137  if logyscale:
1138  ys.append(math.log10(float(xl[ykey])))
1139  else:
1140  ys.append(float(xl[ykey]))
1141  colors.append(float(xl[colorkey]))
1142  except ValueError:
1143  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1144  continue
1145 
1146  cs=[]
1147  for color in colors:
1148  cindex=(color-min(colors))/(max(colors)-min(colors))
1149  cs.append(cmap(cindex))
1150 
1151  fig = plt.figure()
1152  ax = fig.add_subplot(111)
1153  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker="o")
1154  ax.set_xlabel(xkey)
1155  ax.set_ylabel(ykey)
1156  plt.savefig(filename)
1157  plt.show()
1158  plt.close()
1159 
1160  if kwargs["type"] == "residue_links":
1161  #plot the number of distinct links to a residue
1162  #in an histogram
1163  #molecule name
1164  molecule=kwargs["molecule"]
1165  if type(molecule) is IMP.pmi.topology.Molecule:
1166  length=len(molecule.sequence)
1167  molecule=molecule.get_name()
1168  else:
1169  #you need a IMP.pmi.topology.Sequences object
1170  sequences_object=kwargs["sequences_object"]
1171  sequence=sequences_object.sequences[molecule]
1172  length=len(sequence)
1173 
1174  histogram=[0]*length
1175  for xl in self:
1176  if xl[self.protein1_key]==molecule:
1177  histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1178  if xl[self.protein2_key]==molecule:
1179  histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1180  rects = plt.bar(range(1,length+1), histogram)
1181  #bar_width,
1182  #alpha=opacity,
1183  #color='b',
1184  #yerr=std_men,
1185  #error_kw=error_config,
1186  #label='Men')
1187  plt.savefig(filename)
1188  plt.show()
1189  plt.close()
1190 
1191 
1192 
1193  if kwargs["type"] == "histogram":
1194  valuekey=kwargs["valuekey"]
1195  reference_xline=kwargs["reference_xline"]
1196  valuename=valuekey
1197  bins=kwargs["bins"]
1198  values_list=[]
1199  for xl in self:
1200  try:
1201  values_list.append(float(xl[valuekey]))
1202  except ValueError:
1203  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1204  continue
1206  filename, [values_list], valuename=valuename, bins=bins,
1207  colors=None, format="pdf",
1208  reference_xline=None, yplotrange=None,
1209  xplotrange=None,normalized=True,
1210  leg_names=None)
1211 
1212  def dump(self,json_filename):
1213  import json
1214  with open(json_filename, 'w') as fp:
1215  json.dump(self.data_base, fp, sort_keys=True, indent=2, default=set_json_default)
1216 
1217  def load(self,json_filename):
1218  import json
1219  with open(json_filename, 'r') as fp:
1220  self.data_base = json.load(fp)
1221  self._update()
1222  #getting rid of unicode
1223  # (can't do this in Python 3, since *everything* is Unicode there)
1224  if sys.version_info[0] < 3:
1225  for xl in self:
1226  for k,v in xl.iteritems():
1227  if type(k) is unicode: k=str(k)
1228  if type(v) is unicode: v=str(v)
1229  xl[k]=v
1230 
1231  def save_csv(self,filename):
1232 
1233  import csv
1234 
1235  data=[]
1236  sorted_ids=None
1237  sorted_group_ids=sorted(self.data_base.keys())
1238  for group in sorted_group_ids:
1239  group_block=[]
1240  for xl in self.data_base[group]:
1241  if not sorted_ids:
1242  sorted_ids=sorted(xl.keys())
1243  values=[xl[k] for k in sorted_ids]
1244  group_block.append(values)
1245  data+=group_block
1246 
1247 
1248  with open(filename, 'w') as fp:
1249  a = csv.writer(fp, delimiter=',')
1250  a.writerow(sorted_ids)
1251  a.writerows(data)
1252 
1254  """
1255  Returns the number of non redundant cross-link sites
1256  """
1257  praset=set()
1258  for xl in self:
1259  pra=_ProteinsResiduesArray(xl)
1260  prai=pra.get_inverted()
1261  praset.add(pra)
1262  praset.add(prai)
1263  return len(list(praset))
1264 
1266  """This class allows to compute and plot the distance between datasets"""
1267 
1268  def __init__(self,cldb_dictionary):
1269  """Input a dictionary where keys are cldb names and values are cldbs"""
1270  import scipy.spatial.distance
1271  self.dbs=cldb_dictionary
1272  self.keylist=list(self.dbs.keys())
1273  self.distances=list()
1274 
1275 
1276  for i,key1 in enumerate(self.keylist):
1277  for key2 in self.keylist[i+1:]:
1278  distance=self.get_distance(key1,key2)
1279  self.distances.append(distance)
1280 
1281  self.distances=scipy.spatial.distance.squareform(self.distances)
1282 
1283  def get_distance(self,key1,key2):
1284  return 1.0-self.jaccard_index(self.dbs[key1],self.dbs[key2])
1285 
1286  def jaccard_index(self,CrossLinkDataBase1,CrossLinkDataBase2):
1287  """Similarity index between two datasets
1288  https://en.wikipedia.org/wiki/Jaccard_index"""
1289 
1290  set1=set()
1291  set2=set()
1292  for xl1 in CrossLinkDataBase1:
1293  a1f=_ProteinsResiduesArray(xl1)
1294  a1b=a1f.get_inverted()
1295  set1.add(a1f)
1296  set1.add(a1b)
1297  for xl2 in CrossLinkDataBase2:
1298  a2f=_ProteinsResiduesArray(xl2)
1299  a2b=a2f.get_inverted()
1300  set2.add(a2f)
1301  set2.add(a2b)
1302  return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1303 
1304  def plot_matrix(self,figurename="clustermatrix.pdf"):
1305  import matplotlib as mpl
1306  import numpy
1307  mpl.use('Agg')
1308  import matplotlib.pylab as pl
1309  from scipy.cluster import hierarchy as hrc
1310 
1311  raw_distance_matrix=self.distances
1312  labels=self.keylist
1313 
1314  fig = pl.figure()
1315  #fig.autolayout=True
1316 
1317  ax = fig.add_subplot(1,1,1)
1318  dendrogram = hrc.dendrogram(
1319  hrc.linkage(raw_distance_matrix),
1320  color_threshold=7,
1321  no_labels=True)
1322  leaves_order = dendrogram['leaves']
1323  ax.set_xlabel('Dataset')
1324  ax.set_ylabel('Jaccard Distance')
1325  pl.tight_layout()
1326  pl.savefig("dendrogram."+figurename, dpi=300)
1327  pl.close(fig)
1328 
1329  fig = pl.figure()
1330  #fig.autolayout=True
1331 
1332  ax = fig.add_subplot(1,1,1)
1333  cax = ax.imshow(
1334  raw_distance_matrix[leaves_order,
1335  :][:,
1336  leaves_order],
1337  interpolation='nearest')
1338  cb = fig.colorbar(cax)
1339  cb.set_label('Jaccard Distance')
1340  ax.set_xlabel('Dataset')
1341  ax.set_ylabel('Dataset')
1342  ax.set_xticks(range(len(labels)))
1343  ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation='vertical')
1344  ax.set_yticks(range(len(labels)))
1345  ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation='horizontal')
1346  pl.tight_layout()
1347  pl.savefig("matrix."+figurename, dpi=300)
1348  pl.close(fig)
1349 
1350 
1352  '''
1353  This class maps a CrossLinkDataBase on a given structure
1354  and save an rmf file with color-coded cross-links
1355  '''
1356 
1357 
1358  def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1359  self.CrossLinkDataBase=CrossLinkDataBase
1360  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1361  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1362  self.prots=rmf_or_stat_handler
1363  self.distances=defaultdict(list)
1364  self.array_to_id={}
1365  self.id_to_array={}
1366 
1367  print("computing distances fro all cross-links and all structures")
1368  for i in self.prots[::10]:
1369  self.compute_distances()
1370  for key,xl in enumerate(self.CrossLinkDataBase):
1371  array=_ProteinsResiduesArray(xl)
1372  self.array_to_id[array]=key
1373  self.id_to_array[key]=array
1374  if xl["MinAmbiguousDistance"] != 'None':
1375  self.distances[key].append(xl["MinAmbiguousDistance"])
1376 
1377  def compute_distances(self):
1378  data=[]
1379  sorted_ids=None
1380  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1381  for group in sorted_group_ids:
1382  #group_block=[]
1383  group_dists=[]
1384  for xl in self.CrossLinkDataBase.data_base[group]:
1385  if not sorted_ids:
1386  sorted_ids=sorted(xl.keys())
1387  data.append(sorted_ids+["UniqueID","Distance","MinAmbiguousDistance"])
1388  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1389  try:
1390  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1391  except:
1392  mdist="None"
1393  state1="None"
1394  copy1="None"
1395  state2="None"
1396  copy2="None"
1397  try:
1398  # sometimes keys get "lost" in the database, not really sure why
1399  values=[xl[k] for k in sorted_ids]
1400  values += [group, mdist]
1401  except KeyError as e:
1402  print("MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1403  group_dists.append(mdist)
1404  #group_block.append(values)
1405  xl["Distance"]=mdist
1406  xl["State1"]=state1
1407  xl["Copy1"]=copy1
1408  xl["State2"]=state2
1409  xl["Copy2"]=copy2
1410 
1411  for xl in self.CrossLinkDataBase.data_base[group]:
1412  xl["MinAmbiguousDistance"]=min(group_dists)
1413 
1414  def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1415  '''more robust and slower version of above'''
1416  sel=IMP.atom.Selection(self.prots,molecule=c1,residue_index=r1,resolution=1)
1417  selpart_1=sel.get_selected_particles()
1418  if len(selpart_1)==0:
1419  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1420  return None
1421  sel=IMP.atom.Selection(self.prots,molecule=c2,residue_index=r2,resolution=1)
1422  selpart_2=sel.get_selected_particles()
1423  if len(selpart_2)==0:
1424  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1425  return None
1426  results=[]
1427  for p1 in selpart_1:
1428  for p2 in selpart_2:
1429  if p1 == p2 and r1 == r2: continue
1430  d1=IMP.core.XYZ(p1)
1431  d2=IMP.core.XYZ(p2)
1432  #round distance to second decimal
1433  dist=float(int(IMP.core.get_distance(d1,d2)*100.0))/100.0
1434  h1=IMP.atom.Hierarchy(p1)
1435  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1436  h1=h1.get_parent()
1437  copy_index1=IMP.atom.Copy(h1).get_copy_index()
1438  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1439  h1=h1.get_parent()
1440  state_index1=IMP.atom.State(h1).get_state_index()
1441  h2=IMP.atom.Hierarchy(p2)
1442  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1443  h2=h2.get_parent()
1444  copy_index2=IMP.atom.Copy(h2).get_copy_index()
1445  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1446  h2=h2.get_parent()
1447  state_index2=IMP.atom.State(h2).get_state_index()
1448 
1449  results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1450  if len(results)==0: return None
1451  results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1452  return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1453 
1454  def save_rmf_snapshot(self,filename,color_id=None):
1455  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1456  list_of_pairs=[]
1457  color_scores=[]
1458  for group in sorted_group_ids:
1459  group_dists_particles=[]
1460  for xl in self.CrossLinkDataBase.data_base[group]:
1461  xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1462  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1463  try:
1464  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1465  except TypeError:
1466  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1467  continue
1468  if color_id is not None:
1469  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1470  else:
1471  group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1472  if group_dists_particles:
1473  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1474  color_scores.append(mincolor_score)
1475  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1476  else:
1477  continue
1478 
1479  linear = IMP.core.Linear(0, 0.0)
1480  linear.set_slope(1.0)
1481  dps2 = IMP.core.DistancePairScore(linear)
1482  rslin = IMP.RestraintSet(self.prots.get_model(), 'linear_dummy_restraints')
1483  sgs=[]
1484  offset=min(color_scores)
1485  maxvalue=max(color_scores)
1486  for pair in list_of_pairs:
1487  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2, (pair[0], pair[1]))
1488  pr.set_name(pair[2])
1489  if offset<maxvalue:
1490  factor=(pair[3]-offset)/(maxvalue-offset)
1491  else:
1492  factor=0.0
1493  c=IMP.display.get_rgb_color(factor)
1494  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1495  rslin.add_restraint(pr)
1496  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1497 
1498  rh = RMF.create_rmf_file(filename)
1499  IMP.rmf.add_hierarchies(rh, [self.prots])
1500  IMP.rmf.add_restraints(rh,[rslin])
1501  IMP.rmf.add_geometries(rh, sgs)
1502  IMP.rmf.save_frame(rh)
1503  del rh
1504 
1505  def boxplot_crosslink_distances(self,filename):
1506  import numpy
1507  keys=[self.id_to_array[k] for k in self.distances.keys()]
1508  medians=[numpy.median(self.distances[self.array_to_id[k]]) for k in keys]
1509  dists=[self.distances[self.array_to_id[k]] for k in keys]
1510  distances_sorted_by_median=[x for _,x in sorted(zip(medians,dists))]
1511  keys_sorted_by_median=[x for _,x in sorted(zip(medians,keys))]
1513  distances_sorted_by_median,
1514  range(len(distances_sorted_by_median)),
1515  xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1516 
1517  def get_crosslink_violations(self,threshold):
1518  violations=defaultdict(list)
1519  for k in self.distances:
1520  #viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1521  viols=self.distances[k]
1522  violations[self.id_to_array[k]]=viols
1523  return violations
1524 
1525 
1527  '''
1528  This class generates a CrossLinkDataBase from a given structure
1529  '''
1530  def __init__(self, system, residue_types_1=["K"],
1531  residue_types_2=["K"], reactivity_range=[0,1], kt=1.0):
1532 
1533  import numpy.random
1534  import math
1536  cldbkc.set_protein1_key("Protein1")
1537  cldbkc.set_protein2_key("Protein2")
1538  cldbkc.set_residue1_key("Residue1")
1539  cldbkc.set_residue2_key("Residue2")
1540  self.cldb=CrossLinkDataBase(cldbkc)
1541  self.system=system
1542  self.model=self.system.model
1543  self.residue_types_1=residue_types_1
1544  self.residue_types_2=residue_types_2
1545  self.kt=kt
1546  self.indexes_dict1={}
1547  self.indexes_dict2={}
1548  self.protein_residue_dict={}
1549  self.reactivity_dictionary={}
1550  self.euclidean_interacting_pairs=None
1551  self.xwalk_interacting_pairs=None
1552  import random
1553 
1554  for state in self.system.get_states():
1555  for moleculename,molecules in state.get_molecules().items():
1556  for molecule in molecules:
1557  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1558  seq=molecule.sequence
1559  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1560 
1561  for r in residues:
1562  # uniform random reactivities
1563  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1564  # getting reactivities from the CDF of an exponential distribution
1565  rexp=numpy.random.exponential(0.00000001)
1566  prob=1.0-math.exp(-rexp)
1567  self.reactivity_dictionary[(molecule,r)]=prob
1568 
1569  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1570  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1571  for r in residues1:
1572  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1573  ps=s.get_selected_particles()
1574  for p in ps:
1575  index=p.get_index()
1576  self.indexes_dict1[index]=(molecule,r)
1577  self.protein_residue_dict[(molecule,r)]=index
1578  for r in residues2:
1579  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1580  ps=s.get_selected_particles()
1581  for p in ps:
1582  index=p.get_index()
1583  self.indexes_dict2[index]=(molecule,r)
1584  self.protein_residue_dict[(molecule,r)]=index
1585 
1586 
1587  def get_all_possible_pairs(self):
1588  n=float(len(self.protein_residue_dict.keys()))
1589  return n*(n-1.0)/2.0
1590 
1591  def get_all_feasible_pairs(self,distance=21):
1592  import itertools
1593  particle_index_pairs=[]
1594  nxl=0
1595  for a,b in itertools.combinations(self.protein_residue_dict.keys(),2):
1596 
1597  new_xl={}
1598  index1=self.protein_residue_dict[a]
1599  index2=self.protein_residue_dict[b]
1600  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1601  if particle_distance <= distance:
1602  particle_index_pairs.append((index1,index2))
1603  new_xl[self.cldb.protein1_key]=a[0].get_name()
1604  new_xl[self.cldb.protein2_key]=b[0].get_name()
1605  new_xl["molecule_object1"]=a[0]
1606  new_xl["molecule_object2"]=b[0]
1607  new_xl[self.cldb.residue1_key]=a[1]
1608  new_xl[self.cldb.residue2_key]=b[1]
1609  self.cldb.data_base[str(nxl)]=[new_xl]
1610  nxl+=1
1611  self.cldb._update()
1612  return self.cldb
1613 
1614 
1615 
1616 
1617  def get_data_base(self,total_number_of_spectra,
1618  ambiguity_probability=0.1,
1619  noise=0.01,
1620  distance=21,
1621  max_delta_distance=10.0,
1622  xwalk_bin_path=None,
1623  confidence_false=0.75,
1624  confidence_true=0.75):
1625  import math
1626  from random import random,uniform
1627  import numpy as np
1628  number_of_spectra=1
1629 
1630  self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1631  self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1632  self.cldb.data_base[str(number_of_spectra)]=[]
1633  self.sites_weighted=None
1634 
1635  while number_of_spectra<total_number_of_spectra:
1636  if random() > ambiguity_probability and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1637  # new spectrum
1638  number_of_spectra+=1
1639  self.cldb.data_base[str(number_of_spectra)]=[]
1640  noisy=False
1641  if random() > noise:
1642  # not noisy cross-link
1643  pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1644  else:
1645  # noisy cross-link
1646  pra,dist=self.get_random_residue_pair(None,None,None)
1647  noisy=True
1648 
1649  new_xl={}
1650  new_xl[self.cldb.protein1_key]=pra[0].get_name()
1651  new_xl[self.cldb.protein2_key]=pra[1].get_name()
1652  new_xl["molecule_object1"]=pra[0]
1653  new_xl["molecule_object2"]=pra[1]
1654  new_xl[self.cldb.residue1_key]=pra[2]
1655  new_xl[self.cldb.residue2_key]=pra[3]
1656  new_xl["Noisy"]=noisy
1657  # the reactivity is defined as r=1-exp(-k*Delta t)
1658  new_xl["Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1659  new_xl["Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1660  r1=new_xl["Reactivity_Residue1"]
1661  r2=new_xl["Reactivity_Residue2"]
1662  #combined reactivity 1-exp(-k12*Delta t),
1663  # k12=k1*k2/(k1+k2)
1664  #new_xl["Reactivity"]=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1665  if noisy:
1666  #new_xl["Score"]=uniform(-1.0,1.0)
1667  new_xl["Score"]=np.random.beta(1.0,self.beta_false)
1668  else:
1669  #new_xl["Score"]=new_xl["Reactivity"]+uniform(0.0,2.0)
1670  new_xl["Score"]=1.0-np.random.beta(1.0,self.beta_true)
1671  new_xl["TargetDistance"]=dist
1672  new_xl["NoiseProbability"]=noise
1673  new_xl["AmbiguityProbability"]=ambiguity_probability
1674  # getting if it is intra or inter rigid body
1675  (p1,p2)=IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1676  self.protein_residue_dict[(pra[1],pra[3])]])
1679  IMP.core.RigidMember(p1).get_rigid_body() ==
1680  IMP.core.RigidMember(p2).get_rigid_body()):
1681  new_xl["InterRigidBody"] = False
1682  elif (IMP.core.RigidMember.get_is_setup(p1) and
1684  IMP.core.RigidMember(p1).get_rigid_body() !=
1685  IMP.core.RigidMember(p2).get_rigid_body()):
1686  new_xl["InterRigidBody"] = True
1687  else:
1688  new_xl["InterRigidBody"] = None
1689 
1690  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1691  self.cldb._update()
1692  return self.cldb
1693 
1694 
1695  def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1696  import IMP.pmi.tools
1697  import math
1698  from random import choice,uniform
1699  if distance is None:
1700  # get a random pair
1701  while True:
1702  (protein1,residue1)=choice(self.protein_residue_dict.keys())
1703  (protein2,residue2)=choice(self.protein_residue_dict.keys())
1704  index1=self.protein_residue_dict[(protein1,residue1)]
1705  index2=self.protein_residue_dict[(protein2,residue2)]
1706  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1707  if (protein1,residue1) != (protein2,residue2):
1708  break
1709  else:
1710  # get a pair of residues whose distance is below the threshold
1711  if not xwalk_bin_path:
1713  gcpf.set_distance(distance+max_delta_distance)
1714 
1715  while True:
1716  #setup the reaction rates lists
1717  if not self.sites_weighted:
1718  self.sites_weighted=[]
1719  for key in self.reactivity_dictionary:
1720  r=self.reactivity_dictionary[key]
1721  self.sites_weighted.append((key,r))
1722  #get a random reaction site
1723  first_site=self.weighted_choice(self.sites_weighted)
1724  #get all distances
1725  if not self.euclidean_interacting_pairs:
1726  self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1727  self.indexes_dict1.keys(),
1728  self.indexes_dict2.keys())
1729  #get the partner for the first reacted site
1730  first_site_pairs = [pair for pair in self.euclidean_interacting_pairs
1731  if self.indexes_dict1[pair[0]] == first_site or
1732  self.indexes_dict2[pair[1]] == first_site]
1733  if len(first_site_pairs)==0: continue
1734  #build the list of second reaction sites
1735  second_sites_weighted=[]
1736  for pair in first_site_pairs:
1737  if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1738  if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1739  r=self.reactivity_dictionary[second_site]
1740  second_sites_weighted.append((second_site,r))
1741  second_site=self.weighted_choice(second_sites_weighted)
1742  """
1743  interacting_pairs_weighted=[]
1744  for pair in self.euclidean_interacting_pairs:
1745  r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1746  r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1747  #combined reactivity 1-exp(-k12*Delta t),
1748  # k12=k1*k2/(k1+k2)
1749  #print(r1,r2,dist)
1750  r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1751  interacting_pairs_weighted.append((pair,r12))
1752  #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1753  #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1754  #interacting_pairs_weighted.append((pair,weight1*weight2))
1755 
1756  while True:
1757  pair=self.weighted_choice(interacting_pairs_weighted)
1758  protein1,residue1=self.indexes_dict1[pair[0]]
1759  protein2,residue2=self.indexes_dict2[pair[1]]
1760  particle_pair=IMP.get_particles(self.model,pair)
1761  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1762  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1763  break
1764  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1765  #allow some flexibility
1766  prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1767  if uniform(0.0,1.0)<prob: break
1768  """
1769  protein1,residue1=first_site
1770  protein2,residue2=second_site
1771  print("CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1772  "First site",first_site,self.reactivity_dictionary[first_site],
1773  "Second site",second_site,self.reactivity_dictionary[second_site])
1774  particle_pair=IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1775  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1776 
1777  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1778  break
1779  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1780  #allow some flexibility
1781  #prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1782  #if uniform(0.0,1.0)<prob: break
1783  if particle_distance-distance < max_delta_distance: break
1784 
1785 
1786 
1787  else:
1788  if not self.xwalk_interacting_pairs:
1789  self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1790  interacting_pairs_weighted=[]
1791 
1792  for pair in self.xwalk_interacting_pairs:
1793  protein1=pair[0]
1794  protein2=pair[1]
1795  residue1=pair[2]
1796  residue2=pair[3]
1797  weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1798  weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1799  interacting_pairs_weighted.append((pair,weight1*weight2))
1800 
1801  pair=self.weighted_choice(interacting_pairs_weighted)
1802  protein1=pair[0]
1803  protein2=pair[1]
1804  residue1=pair[2]
1805  residue2=pair[3]
1806  particle_distance=float(pair[4])
1807 
1808 
1809 
1810  return ((protein1,protein2,residue1,residue2)),particle_distance
1811 
1812  def get_xwalk_distances(self,xwalk_bin_path,distance):
1813  import IMP.pmi.output
1814  import os
1815  o=IMP.pmi.output.Output(atomistic=True)
1816  o.init_pdb("xwalk.pdb",self.representation.prot)
1817  o.write_pdb("xwalk.pdb")
1818  namechainiddict=o.dictchain["xwalk.pdb"]
1819  chainiddict={}
1820 
1821  for key in namechainiddict: chainiddict[namechainiddict[key]]=key
1822  xwalkout=os.popen('java -Xmx256m -cp ' + xwalk_bin_path +' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+' -bb').read()
1823 
1824  output_list_of_distance=[]
1825  for line in xwalkout.split("\n")[0:-2]:
1826  tokens=line.split()
1827  first=tokens[2]
1828  second=tokens[3]
1829  distance=float(tokens[6])
1830  fs=first.split("-")
1831  ss=second.split("-")
1832  chainid1=fs[2]
1833  chainid2=ss[2]
1834  protein1=chainiddict[chainid1]
1835  protein2=chainiddict[chainid2]
1836  residue1=int(fs[1])
1837  residue2=int(ss[1])
1838  output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1839  return output_list_of_distance
1840 
1841 
1842  def weighted_choice(self,choices):
1843  import random
1844  # from http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
1845  total = sum(w for c, w in choices)
1846  r = random.uniform(0, total)
1847  upto = 0
1848  for c, w in choices:
1849  if upto + w > r:
1850  return c
1851  upto += w
1852  assert False, "Shouldn't get here"
1853 
1854  def save_rmf_snapshot(self,filename,color_id=None):
1855  import IMP.rmf
1856  import RMF
1857  if color_id is None:
1858  color_id="Reactivity"
1859  sorted_ids=None
1860  sorted_group_ids=sorted(self.cldb.data_base.keys())
1861  list_of_pairs=[]
1862  color_scores=[]
1863  for group in sorted_group_ids:
1864  group_xls=[]
1865  group_dists_particles=[]
1866  for xl in self.cldb.data_base[group]:
1867  xllabel=self.cldb.get_short_cross_link_string(xl)
1868  (c1,c2,r1,r2)=(xl["molecule_object1"],xl["molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1869  try:
1870  index1=self.protein_residue_dict[(c1,r1)]
1871  index2=self.protein_residue_dict[(c2,r2)]
1872  p1,p2=IMP.get_particles(self.model,[index1])[0],IMP.get_particles(self.model,[index2])[0]
1873  mdist=xl["TargetDistance"]
1874  except TypeError:
1875  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1876  continue
1877  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1878  if group_dists_particles:
1879  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1880  color_scores.append(mincolor_score)
1881  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1882  else:
1883  continue
1884 
1885 
1886  m=self.model
1887  linear = IMP.core.Linear(0, 0.0)
1888  linear.set_slope(1.0)
1889  dps2 = IMP.core.DistancePairScore(linear)
1890  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
1891  sgs=[]
1892  offset=min(color_scores)
1893  maxvalue=max(color_scores)
1894  for pair in list_of_pairs:
1895  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
1896  pr.set_name(pair[2])
1897  factor=(pair[3]-offset)/(maxvalue-offset)
1898  c=IMP.display.get_rgb_color(factor)
1899  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1900  rslin.add_restraint(pr)
1901  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1902 
1903  rh = RMF.create_rmf_file(filename)
1904  IMP.rmf.add_hierarchies(rh, [self.system.hier])
1905  IMP.rmf.add_restraints(rh,[rslin])
1906  IMP.rmf.add_geometries(rh, sgs)
1907  IMP.rmf.save_frame(rh)
1908  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1695
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1758
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:731
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:36
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:198
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:19
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:24
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
class to link stat files to several rmf files
Definition: output.py:1216
class to allow more advanced handling of RMF files.
Definition: output.py:1091
Mapping between FASTA one-letter codes and residue types.
Definition: alphabets.py:1
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:29
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.