IMP logo
IMP Reference Guide  2.12.0
The Integrative Modeling Platform
io/crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 from __future__ import print_function
8 import IMP
9 import IMP.pmi
10 import IMP.pmi.output
11 import IMP.atom
12 import IMP.core
13 import IMP.algebra
14 import IMP.rmf
15 import RMF
16 import IMP.display
17 import operator
18 import math
19 import sys
20 import ihm.location
21 import ihm.dataset
22 from collections import defaultdict
23 import numpy
24 
25 # json default serializations
26 def set_json_default(obj):
27  if isinstance(obj, set):
28  return list(obj)
29  if isinstance(obj, IMP.pmi.topology.Molecule):
30  return str(obj)
31  raise TypeError
32 
33 # Handle and return data that must be a string
34 if sys.version_info[0] >= 3:
35  def _handle_string_input(inp):
36  if not isinstance(inp, str):
37  raise TypeError("expecting a string")
38  return inp
39 else:
40  def _handle_string_input(inp):
41  if not isinstance(inp, (str, unicode)):
42  raise TypeError("expecting a string or unicode")
43  # Coerce to non-unicode representation (str)
44  if isinstance(inp, unicode):
45  return str(inp)
46  else:
47  return inp
48 
49 class _CrossLinkDataBaseStandardKeys(object):
50  '''
51  This class setup all the standard keys needed to
52  identify the crosslink features from the data sets
53  '''
54  def __init__(self):
55  self.type={}
56  self.protein1_key="Protein1"
57  self.type[self.protein1_key]=str
58  self.protein2_key="Protein2"
59  self.type[self.protein2_key]=str
60  self.residue1_key="Residue1"
61  self.type[self.residue1_key]=int
62  self.residue2_key="Residue2"
63  self.type[self.residue2_key]=int
64  self.residue1_amino_acid_key="Residue1AminoAcid"
65  self.type[self.residue1_amino_acid_key]=str
66  self.residue2_amino_acid_key="Residue2AminoAcid"
67  self.type[self.residue2_amino_acid_key]=str
68  self.residue1_moiety_key="Residue1Moiety"
69  self.type[self.residue1_moiety_key]=str
70  self.residue2_moiety_key="Residue2Moiety"
71  self.type[self.residue2_moiety_key]=str
72  self.site_pairs_key="SitePairs"
73  self.type[self.site_pairs_key]=str
74  self.unique_id_key="XLUniqueID"
75  self.type[self.unique_id_key]=str
76  self.unique_sub_index_key="XLUniqueSubIndex"
77  self.type[self.unique_sub_index_key]=int
78  self.unique_sub_id_key="XLUniqueSubID"
79  self.type[self.unique_sub_id_key]=str
80  self.data_set_name_key="DataSetName"
81  self.type[self.data_set_name_key]=str
82  self.cross_linker_chemical_key="CrossLinkerChemical"
83  self.type[self.cross_linker_chemical_key]=str
84  self.id_score_key="IDScore"
85  self.type[self.id_score_key]=float
86  self.fdr_key="FDR"
87  self.type[self.fdr_key]=float
88  self.quantitation_key="Quantitation"
89  self.type[self.quantitation_key]=float
90  self.redundancy_key="Redundancy"
91  self.type[self.redundancy_key]=int
92  self.redundancy_list_key="RedundancyList"
93  self.type[self.redundancy_key]=list
94  self.ambiguity_key="Ambiguity"
95  self.type[self.ambiguity_key]=int
96  self.residue1_links_number_key="Residue1LinksNumber"
97  self.type[self.residue1_links_number_key]=int
98  self.residue2_links_number_key="Residue2LinksNumber"
99  self.type[self.residue2_links_number_key]=int
100  self.type[self.ambiguity_key]=int
101  self.state_key="State"
102  self.type[self.state_key]=int
103  self.sigma1_key="Sigma1"
104  self.type[self.sigma1_key]=str
105  self.sigma2_key="Sigma2"
106  self.type[self.sigma2_key]=str
107  self.psi_key="Psi"
108  self.type[self.psi_key]=str
109  self.distance_key="Distance"
110  self.type[self.distance_key]=float
111  self.min_ambiguous_distance_key="MinAmbiguousDistance"
112  self.type[self.distance_key]=float
113  #link types are either Monolink, Intralink or Interlink
114  self.link_type_key="LinkType"
115  self.type[self.link_type_key]=str
116 
117  self.ordered_key_list =[self.data_set_name_key,
118  self.unique_id_key,
119  self.unique_sub_index_key,
120  self.unique_sub_id_key,
121  self.protein1_key,
122  self.protein2_key,
123  self.residue1_key,
124  self.residue2_key,
125  self.residue1_amino_acid_key,
126  self.residue2_amino_acid_key,
127  self.residue1_moiety_key,
128  self.residue2_moiety_key,
129  self.cross_linker_chemical_key,
130  self.id_score_key,
131  self.fdr_key,
132  self.quantitation_key,
133  self.redundancy_key,
134  self.redundancy_list_key,
135  self.state_key,
136  self.sigma1_key,
137  self.sigma2_key,
138  self.psi_key,
139  self.distance_key,
140  self.min_ambiguous_distance_key,
141  self.link_type_key]
142 
143 
144 class _ProteinsResiduesArray(tuple):
145  '''
146  This class is inherits from tuple, and it is a shorthand for a cross-link
147  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
148  residue1 and residue2.
149  '''
150 
151  def __new__(self,input_data):
152  '''
153  @input_data can be a dict or a tuple
154  '''
155  self.cldbsk=_CrossLinkDataBaseStandardKeys()
156  if type(input_data) is dict:
157  monolink=False
158  p1=input_data[self.cldbsk.protein1_key]
159  try:
160  p2=input_data[self.cldbsk.protein2_key]
161  except KeyError:
162  monolink=True
163  r1=input_data[self.cldbsk.residue1_key]
164  try:
165  r2=input_data[self.cldbsk.residue2_key]
166  except KeyError:
167  monolink=True
168  if not monolink:
169  t=(p1,p2,r1,r2)
170  else:
171  t=(p1,"",r1,None)
172  elif type(input_data) is tuple:
173  if len(input_data)>4 or len(input_data)==3 or len(input_data)==1:
174  raise TypeError("_ProteinsResiduesArray: must have only 4 elements")
175  if len(input_data)==4:
176  p1 = _handle_string_input(input_data[0])
177  p2 = _handle_string_input(input_data[1])
178  r1=input_data[2]
179  r2=input_data[3]
180  if (not (type(r1) is int)) and (not (r1 is None)):
181  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
182  if (not (type(r2) is int)) and (not (r1 is None)):
183  raise TypeError("_ProteinsResiduesArray: residue2 must be a integer")
184  t=(p1,p2,r1,r2)
185  if len(input_data) == 2:
186  p1 = _handle_string_input(input_data[0])
187  r1 = input_data[1]
188  if type(r1) is not int:
189  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
190  t = (p1,"",r1,None)
191  else:
192  raise TypeError("_ProteinsResiduesArray: input must be a dict or tuple")
193  return tuple.__new__(_ProteinsResiduesArray, t)
194 
195  def get_inverted(self):
196  '''
197  Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
198  '''
199  return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
200 
201  def __repr__(self):
202  outstr=self.cldbsk.protein1_key+" "+str(self[0])
203  outstr+=" "+self.cldbsk.protein2_key+" "+str(self[1])
204  outstr+=" "+self.cldbsk.residue1_key+" "+str(self[2])
205  outstr+=" "+self.cldbsk.residue2_key+" "+str(self[3])
206  return outstr
207 
208  def __str__(self):
209  outstr=str(self[0])+"."+str(self[2])+"-"+str(self[1])+"."+str(self[3])
210  return outstr
211 
212 class FilterOperator(object):
213  '''
214  This class allows to create filter functions that can be passed to the CrossLinkDataBase
215  in this way:
216 
217  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
218 
219  where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
220 
221  A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
222 
223  fo.evaluate(xl)
224 
225  and it is used to filter the database
226  '''
227 
228  def __init__(self, argument1, operator, argument2):
229  '''
230  (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
231  or (FilterOperator1,operator.or|and...,FilterOperator2)
232  '''
233  if isinstance(argument1, FilterOperator):
234  self.operations = [argument1, operator, argument2]
235  else:
236  self.operations = []
237  self.values = (argument1, operator, argument2)
238 
239  def __or__(self, FilterOperator2):
240  return FilterOperator(self, operator.or_, FilterOperator2)
241 
242  def __and__(self, FilterOperator2):
243  return FilterOperator(self, operator.and_, FilterOperator2)
244 
245  def __invert__(self):
246  return FilterOperator(self, operator.not_, None)
247 
248  def evaluate(self, xl_item):
249 
250  if len(self.operations) == 0:
251  keyword, operator, value = self.values
252  return operator(xl_item[keyword], value)
253  FilterOperator1, op, FilterOperator2 = self.operations
254 
255  if FilterOperator2 is None:
256  return op(FilterOperator1.evaluate(xl_item))
257  else:
258  return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
259 
260 '''
261 def filter_factory(xl_):
262 
263  class FilterOperator(object):
264  import operator
265  xl = xl_
266 
267  def __new__(self,key,value,oper=operator.eq):
268  return oper(self.xl[key],value)
269 
270  return FilterOperator
271 '''
272 
273 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
274  '''
275  This class is needed to convert the keywords from a generic database
276  to the standard ones
277  '''
278 
279  def __init__(self, list_parser=None):
280  '''
281  @param list_parser an instance of ResiduePairListParser, if any is needed
282  '''
283  self.converter={}
284  self.backward_converter={}
285  _CrossLinkDataBaseStandardKeys.__init__(self)
286  self.rplp = list_parser
287  if self.rplp is None:
288  # either you have protein1, protein2, residue1, residue2
289  self.compulsory_keys=set([self.protein1_key,
290  self.protein2_key,
291  self.residue1_key,
292  self.residue2_key])
293  else:
294  self.compulsory_keys=self.rplp.get_compulsory_keys()
295  self.setup_keys=set()
296 
297  def check_keys(self):
298  '''
299  Is a function that check whether necessary keys are setup
300  '''
301  setup_keys=set(self.get_setup_keys())
302  if self.compulsory_keys & setup_keys != self.compulsory_keys:
303  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
304 
305  def get_setup_keys(self):
306  '''
307  Returns the keys that have been setup so far
308  '''
309  return self.backward_converter.keys()
310 
311  def set_standard_keys(self):
312  """
313  This sets up the standard compulsory keys as defined by
314  _CrossLinkDataBaseStandardKeys
315  """
316  for ck in self.compulsory_keys:
317  self.converter[ck]=ck
318  self.backward_converter[ck]=ck
319 
320  def set_unique_id_key(self,origin_key):
321  self.converter[origin_key]=self.unique_id_key
322  self.backward_converter[self.unique_id_key]=origin_key
323 
324  def set_protein1_key(self,origin_key):
325  self.converter[origin_key]=self.protein1_key
326  self.backward_converter[self.protein1_key]=origin_key
327 
328  def set_protein2_key(self,origin_key):
329  self.converter[origin_key]=self.protein2_key
330  self.backward_converter[self.protein2_key]=origin_key
331 
332  def set_residue1_key(self,origin_key):
333  self.converter[origin_key]=self.residue1_key
334  self.backward_converter[self.residue1_key]=origin_key
335 
336  def set_residue2_key(self,origin_key):
337  self.converter[origin_key]=self.residue2_key
338  self.backward_converter[self.residue2_key]=origin_key
339 
340  def set_residue1_amino_acid_key(self, origin_key):
341  self.converter[origin_key] = self.residue1_amino_acid_key
342  self.backward_converter[self.residue1_amino_acid_key] = origin_key
343 
344  def set_residue2_amino_acid_key(self, origin_key):
345  self.converter[origin_key] = self.residue2_amino_acid_key
346  self.backward_converter[self.residue2_amino_acid_key] = origin_key
347 
348  def set_residue1_moiety_key(self, origin_key):
349  self.converter[origin_key] = self.residue1_moiety_key
350  self.backward_converter[self.residue1_moiety_key] = origin_key
351 
352  def set_residue2_moiety_key(self, origin_key):
353  self.converter[origin_key] = self.residue2_moiety_key
354  self.backward_converter[self.residue2_moiety_key] = origin_key
355 
356  def set_site_pairs_key(self,origin_key):
357  self.converter[origin_key]=self.site_pairs_key
358  self.backward_converter[self.site_pairs_key]=origin_key
359 
360  def set_id_score_key(self,origin_key):
361  self.converter[origin_key]=self.id_score_key
362  self.backward_converter[self.id_score_key]=origin_key
363 
364  def set_fdr_key(self,origin_key):
365  self.converter[origin_key]=self.fdr_key
366  self.backward_converter[self.fdr_key]=origin_key
367 
368  def set_quantitation_key(self,origin_key):
369  self.converter[origin_key]=self.quantitation_key
370  self.backward_converter[self.quantitation_key]=origin_key
371 
372  def set_psi_key(self,origin_key):
373  self.converter[origin_key]=self.psi_key
374  self.backward_converter[self.psi_key]=origin_key
375 
376  def set_link_type_key(self,link_type_key):
377  self.converter[link_type_key]=self.link_type_key
378  self.backward_converter[self.link_type_key]=link_type_key
379 
380  def get_converter(self):
381  '''
382  Returns the dictionary that convert the old keywords to the new ones
383  '''
384  self.check_keys()
385  return self.converter
386 
388  '''
389  Returns the dictionary that convert the new keywords to the old ones
390  '''
391  self.check_keys()
392  return self.backward_converter
393 
394 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
395  '''
396  A class to handle different styles of site pairs parsers.
397  Implemented styles:
398  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
399  [Y3-;K4-] for dead-ends
400  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
401  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
402  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
403  '''
404 
405  import re
406 
407  def __init__(self,style):
408 
409  _CrossLinkDataBaseStandardKeys.__init__(self)
410  if style is "MSSTUDIO":
411  self.style=style
412  self.compulsory_keys= set([self.protein1_key,
413  self.protein2_key,
414  self.site_pairs_key])
415  elif style is "XTRACT" or style is "QUANTITATION":
416  self.style=style
417  self.compulsory_keys= set([self.site_pairs_key])
418  elif style is "LAN_HUANG":
419  self.style=style
420  self.compulsory_keys= set([self.site_pairs_key])
421  else:
422  raise Error("ResiduePairListParser: unknown list parser style")
423 
424  def get_compulsory_keys(self):
425  return self.compulsory_keys
426 
427  def get_list(self,input_string):
428  '''
429  This function returns a list of cross-linked residues and the corresponding list of
430  cross-linked chains. The latter list can be empty, if the style doesn't have the
431  corresponding information.
432  '''
433  if self.style == "MSSTUDIO":
434  input_string=input_string.replace("[","")
435  input_string=input_string.replace("]","")
436  input_string_pairs=input_string.split(";")
437  residue_pair_indexes=[]
438  chain_pair_indexes=[]
439  for s in input_string_pairs:
440  m1=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
441  m2=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
442  if m1:
443  # cross link
444  residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
445  residue_pair_indexes.append((residue_index_1,residue_index_2))
446  elif m2:
447  # dead end
448  residue_type_1,residue_index_1=m2.group(1,2)
449  # at this stage chain_pair_indexes is empty
450  return residue_pair_indexes,chain_pair_indexes
451  if self.style is "XTRACT" or self.style is "QUANTITATION":
452  if ":x:" in input_string:
453  # if it is a crosslink....
454  input_strings=input_string.split(":x:")
455  first_peptides=input_strings[0].split(":|:")
456  second_peptides=input_strings[1].split(":|:")
457  first_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in first_peptides]
458  second_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in second_peptides]
459  residue_pair_indexes=[]
460  chain_pair_indexes=[]
461  for fpi in first_peptides_indentifiers:
462  for spi in second_peptides_indentifiers:
463  chain1=fpi[0]
464  chain2=spi[0]
465  residue1=fpi[1]
466  residue2=spi[1]
467  residue_pair_indexes.append((residue1,residue2))
468  chain_pair_indexes.append((chain1,chain2))
469  return residue_pair_indexes, chain_pair_indexes
470  else:
471  # if it is a monolink....
472  first_peptides = input_string.split(":|:")
473  first_peptides_indentifiers = [(f.split(":")[0], f.split(":")[1]) for f in first_peptides]
474  residue_indexes = []
475  chain_indexes = []
476  for fpi in first_peptides_indentifiers:
477  chain1=fpi[0]
478  residue1=fpi[1]
479  residue_indexes.append((residue1,))
480  chain_indexes.append((chain1,))
481  return residue_indexes, chain_indexes
482  if self.style is "LAN_HUANG":
483  input_strings=input_string.split("-")
484  chain1,first_series=input_strings[0].split(":")
485  chain2,second_series=input_strings[1].split(":")
486 
487  first_residues=first_series.replace(";","|").split("|")
488  second_residues=second_series.replace(";","|").split("|")
489  residue_pair_indexes=[]
490  chain_pair_indexes=[]
491  for fpi in first_residues:
492  for spi in second_residues:
493  residue1=self.re.sub("[^0-9]", "", fpi)
494  residue2=self.re.sub("[^0-9]", "", spi)
495  residue_pair_indexes.append((residue1,residue2))
496  chain_pair_indexes.append((chain1,chain2))
497  return residue_pair_indexes, chain_pair_indexes
498 
499 
500 
501 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
502  '''
503  A class to handle different XL format with fixed format
504  currently support ProXL
505  '''
506  def __init__(self,format):
507 
508  _CrossLinkDataBaseStandardKeys.__init__(self)
509  if format is "PROXL":
510  self.format=format
511  else:
512  raise Error("FixedFormatParser: unknown list format name")
513 
514 
515  def get_data(self,input_string):
516  if self.format is "PROXL":
517  tokens=input_string.split("\t")
518  xl={}
519  if tokens[0]=="SEARCH ID(S)":
520  return None
521  else:
522  xl[self.protein1_key]=tokens[2]
523  xl[self.protein2_key]=tokens[4]
524  xl[self.residue1_key]=int(tokens[3])
525  xl[self.residue2_key]=int(tokens[5])
526  return xl
527 
528 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
529  '''
530  this class handles a cross-link dataset and do filtering
531  operations, adding cross-links, merge datasets...
532  '''
533 
534  def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=('K',)):
535  '''
536  Constructor.
537  @param converter an instance of CrossLinkDataBaseKeywordsConverter
538  @param data_base an instance of CrossLinkDataBase to build the new database on
539  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
540  crosslink consistency. If not given consistency will not be checked
541  @param linkable_aa a tuple containing one-letter amino acids which are linkable by the crosslinker;
542  only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
543  and if a fasta_seq is given
544  '''
545 
546  if data_base is None:
547  self.data_base = {}
548  else:
549  self.data_base=data_base
550 
551  _CrossLinkDataBaseStandardKeys.__init__(self)
552  if converter is not None:
553  self.cldbkc = converter #type: CrossLinkDataBaseKeywordsConverter
554  self.list_parser=self.cldbkc.rplp
555  self.converter = converter.get_converter()
556 
557  else:
558  self.cldbkc = None #type: CrossLinkDataBaseKeywordsConverter
559  self.list_parser=None
560  self.converter = None
561 
562  # default amino acids considered to be 'linkable' if none are given
563  self.def_aa_tuple = linkable_aa
564  self.fasta_seq = fasta_seq #type: IMP.pmi.topology.Sequences
565  self.dataset = None
566  self._update()
567 
568  def _update(self):
569  '''
570  Update the whole dataset after changes
571  '''
572  self.update_cross_link_unique_sub_index()
573  self.update_cross_link_redundancy()
574  self.update_residues_links_number()
576 
577 
578  def __iter__(self):
579  sorted_ids=sorted(self.data_base.keys())
580  for k in sorted_ids:
581  for xl in self.data_base[k]:
582  yield xl
583 
584  def xlid_iterator(self):
585  sorted_ids=sorted(self.data_base.keys())
586  for xlid in sorted_ids:
587  yield xlid
588 
589  def __getitem__(self,xlid):
590  return self.data_base[xlid]
591 
592  def __len__(self):
593  return len([xl for xl in self])
594 
595  def get_name(self):
596  return self.name
597 
598  def set_name(self,name):
599  new_data_base={}
600  for k in self.data_base:
601  new_data_base[k+"."+name]=self.data_base[k]
602  self.data_base=new_data_base
603  self.name=name
604  self._update()
605 
606  def get_number_of_xlid(self):
607  return len(self.data_base)
608 
609 
610  def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
611  '''
612  if FixedFormatParser is not specified, the file is comma-separated-values
613  @param file_name a txt file to be parsed
614  @param converter an instance of CrossLinkDataBaseKeywordsConverter
615  @param FixedFormatParser a parser for a fixed format
616  '''
617  if not FixedFormatParser:
618  xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
619 
620  if converter is not None:
621  self.cldbkc = converter
622  self.list_parser=self.cldbkc.rplp
623  self.converter = converter.get_converter()
624 
625 
626  if not self.list_parser:
627  # normal procedure without a list_parser
628  # each line is a cross-link
629  new_xl_dict={}
630  for nxl,xl in enumerate(xl_list):
631  new_xl={}
632  for k in xl:
633  if k in self.converter:
634  new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
635  else:
636  new_xl[k]=xl[k]
637  if self.unique_id_key in self.cldbkc.get_setup_keys():
638  if new_xl[self.unique_id_key] not in new_xl_dict:
639  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
640  else:
641  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
642  else:
643  if str(nxl) not in new_xl_dict:
644  new_xl_dict[str(nxl)]=[new_xl]
645  else:
646  new_xl_dict[str(nxl)].append(new_xl)
647 
648  else:
649  # with a list_parser, a line can be a list of ambiguous crosslinks
650  new_xl_dict={}
651  for nxl,entry in enumerate(xl_list):
652 
653  # first get the translated keywords
654  new_dict={}
655  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
656  raise Error("CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
657  for k in entry:
658  if k in self.converter:
659  new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
660  else:
661  new_dict[k]=entry[k]
662 
663  residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
664 
665  # then create the crosslinks
666  for n,p in enumerate(residue_pair_list):
667  is_monolink=False
668  if len(p)==1:
669  is_monolink=True
670 
671  new_xl={}
672  for k in new_dict:
673  new_xl[k]=new_dict[k]
674  new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
675  if not is_monolink:
676  new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
677 
678  if len(chain_pair_list)==len(residue_pair_list):
679  new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
680  if not is_monolink:
681  new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
682 
683  if not is_monolink:
684  new_xl[self.link_type_key]="CROSSLINK"
685  else:
686  new_xl[self.link_type_key]="MONOLINK"
687 
688  if self.unique_id_key in self.cldbkc.get_setup_keys():
689  if new_xl[self.unique_id_key] not in new_xl_dict:
690  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
691  else:
692  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
693  else:
694  if str(nxl) not in new_xl_dict:
695  new_xl[self.unique_id_key]=str(nxl+1)
696  new_xl_dict[str(nxl)]=[new_xl]
697  else:
698  new_xl[self.unique_id_key]=str(nxl+1)
699  new_xl_dict[str(nxl)].append(new_xl)
700 
701 
702  else:
703  '''
704  if FixedFormatParser is defined
705  '''
706 
707  new_xl_dict={}
708  f=open(file_name,"r")
709  nxl=0
710  for line in f:
711  xl=FixedFormatParser.get_data(line)
712  if xl:
713  xl[self.unique_id_key]=str(nxl+1)
714  new_xl_dict[str(nxl)]=[xl]
715  nxl+=1
716 
717 
718  self.data_base=new_xl_dict
719  self.name=file_name
720  l = ihm.location.InputFileLocation(file_name, details='Crosslinks')
721  self.dataset = ihm.dataset.CXMSDataset(l)
722  self._update()
723 
724  def update_cross_link_unique_sub_index(self):
725  for k in self.data_base:
726  for n,xl in enumerate(self.data_base[k]):
727  xl[self.ambiguity_key]=len(self.data_base[k])
728  xl[self.unique_sub_index_key]=n+1
729  xl[self.unique_sub_id_key]=k+"."+str(n+1)
730 
731  def update_cross_link_redundancy(self):
732  redundancy_data_base={}
733  for xl in self:
734  pra=_ProteinsResiduesArray(xl)
735  if pra not in redundancy_data_base:
736  redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
737  redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
738  else:
739  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
740  redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
741  for xl in self:
742  pra=_ProteinsResiduesArray(xl)
743  xl[self.redundancy_key]=len(redundancy_data_base[pra])
744  xl[self.redundancy_list_key]=redundancy_data_base[pra]
745 
746  def update_residues_links_number(self):
747  residue_links={}
748  for xl in self:
749  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
750  if (p1,r1) not in residue_links:
751  residue_links[(p1,r1)]=set([(p2,r2)])
752  else:
753  residue_links[(p1,r1)].add((p2,r2))
754  if (p2,r2) not in residue_links:
755  residue_links[(p2,r2)]=set([(p1,r1)])
756  else:
757  residue_links[(p2,r2)].add((p1,r1))
758 
759  for xl in self:
760  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
761  xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
762  xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
763 
765  """This function checks the consistency of the dataset with the amino acid sequence"""
766  if self.cldbkc and self.fasta_seq:
767  cnt_matched, cnt_matched_file = 0, 0
768  matched = {}
769  non_matched = {}
770  for xl in self:
771  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
772  b_matched_file = False
773  if self.residue1_amino_acid_key in xl:
774  # either you know the residue type and aa_tuple is a single entry
775  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
776  b_matched = self._match_xlinks(p1, r1, aa_from_file)
777  b_matched_file = b_matched
778  else:
779  # or pass the possible list of types that can be crosslinked
780  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
781 
782  matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
783 
784  if self.residue2_amino_acid_key in xl:
785  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
786  b_matched = self._match_xlinks(p2, r2, aa_from_file)
787  b_matched_file = b_matched
788  else:
789  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
790 
791  matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
792  if b_matched: cnt_matched += 1
793  if b_matched_file: cnt_matched_file += 1
794  if len(self) > 0:
795  percentage_matched = round(100*cnt_matched/len(self),1)
796  percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
797  #if matched: print "Matched xlinks:", matched
798  if matched or non_matched: print("check_cross_link_consistency: Out of %d crosslinks "
799  "%d were matched to the fasta sequence (%f %%).\n "
800  "%d were matched by using the crosslink file (%f %%)."%
801  (len(self),cnt_matched,percentage_matched,cnt_matched_file,
802  percentage_matched_file) )
803  if non_matched: print("check_cross_link_consistency: Warning: Non matched xlinks:",
804  [(prot_name, sorted(list(non_matched[prot_name]))) for prot_name in non_matched])
805  return matched,non_matched
806 
807  def _match_xlinks(self, prot_name, res_index, aa_tuple):
808  # returns Boolean whether given aa matches a position in the fasta file
809  # cross link files usually start counting at 1 and not 0; therefore subtract -1 to compare with fasta
810  amino_dict = IMP.pmi.tools.ThreeToOneConverter()
811  res_index -= 1
812  for amino_acid in aa_tuple:
813  if len(amino_acid) == 3:
814  amino_acid = amino_dict[amino_acid.upper()]
815  if prot_name in self.fasta_seq.sequences:
816  seq = self.fasta_seq.sequences[prot_name]
817  # if we are looking at the first amino acid in a given sequence always return a match
818  # the first aa should always be the n-terminal aa
819  # which may form a crosslink in any case (for BS3-like crosslinkers)
820  # for some data sets the first aa is at position 1; todo: check why this is the case
821  if res_index == 0 or res_index == 1:
822  return True
823  if res_index < len(seq):
824  if amino_acid == seq[res_index]:
825  return True
826  # else:
827  # print "Could not match", prot, res+1, amino_acid, seq[res]
828  return False
829 
830  def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
831  if b_matched:
832  if prot in matched:
833  matched[prot].add(res)
834  else:
835  matched[prot] = set([res])
836  else:
837  if prot in non_matched:
838  non_matched[prot].add(res)
839  else:
840  non_matched[prot] = set([res])
841  return matched, non_matched
842 
843 
844  def get_cross_link_string(self,xl):
845  string='|'
846  for k in self.ordered_key_list:
847  try:
848  string+=str(k)+":"+str(xl[k])+"|"
849  except KeyError:
850  continue
851 
852  for k in xl:
853  if k not in self.ordered_key_list:
854  string+=str(k)+":"+str(xl[k])+"|"
855 
856  return string
857 
858  def get_short_cross_link_string(self,xl):
859 
860  string='|'
861  list_of_keys=[self.data_set_name_key,
862  self.unique_sub_id_key,
863  self.protein1_key,
864  self.residue1_key,
865  self.protein2_key,
866  self.residue2_key,
867  self.state_key,
868  self.psi_key]
869 
870  for k in list_of_keys:
871  try:
872  string+=str(xl[k])+"|"
873  except KeyError:
874  continue
875 
876  return string
877 
878  def filter(self,FilterOperator):
879  new_xl_dict={}
880  for id in self.data_base.keys():
881  for xl in self.data_base[id]:
882  if FilterOperator.evaluate(xl):
883  if id not in new_xl_dict:
884  new_xl_dict[id]=[xl]
885  else:
886  new_xl_dict[id].append(xl)
887  self._update()
888  cdb = CrossLinkDataBase(self.cldbkc,new_xl_dict)
889  cdb.dataset = self.dataset
890  return cdb
891 
892  def filter_score(self,score):
893  '''Get all crosslinks with score greater than an input value'''
894  FilterOperator=IMP.pmi.io.crosslink.FilterOperator(self.id_score_key,operator.gt,score)
895  return self.filter(FilterOperator)
896 
897  def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
898  '''
899  This function merges two cross-link datasets so that if two conflicting crosslinks have the same
900  cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
901  with different SubIDs
902  '''
903  pass
904 
905  def append_database(self,CrossLinkDataBase2):
906  '''
907  This function append one cross-link dataset to another. Unique ids will be renamed to avoid
908  conflicts.
909  '''
910  name1=self.get_name()
911  name2=CrossLinkDataBase2.get_name()
912  if name1 == name2:
913  name1=id(self)
914  name2=id(CrossLinkDataBase2)
915  self.set_name(name1)
916  CrossLinkDataBase2.set_name(name2)
917 
918  #rename first database:
919  new_data_base={}
920  for k in self.data_base:
921  new_data_base[k]=self.data_base[k]
922  for k in CrossLinkDataBase2.data_base:
923  new_data_base[k]=CrossLinkDataBase2.data_base[k]
924  self.data_base=new_data_base
925  self._update()
926 
927  def set_value(self,key,new_value,FilterOperator=None):
928  '''
929  This function changes the value for a given key in the database
930  For instance one can change the name of a protein
931  @param key: the key in the database that must be changed
932  @param new_value: the new value of the key
933  @param FilterOperator: optional FilterOperator to change the value to
934  a subset of the database
935 
936  example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
937  '''
938 
939  for xl in self:
940  if FilterOperator is not None:
941  if FilterOperator.evaluate(xl):
942  xl[key]=new_value
943  else:
944  xl[key]=new_value
945  self._update()
946 
947  def get_values(self,key):
948  '''
949  this function returns the list of values for a given key in the database
950  alphanumerically sorted
951  '''
952  values=set()
953  for xl in self:
954  values.add(xl[key])
955  return sorted(list(values))
956 
957  def offset_residue_index(self,protein_name,offset):
958  '''
959  This function offset the residue indexes of a given protein by a specified value
960  @param protein_name: the protein name that need to be changed
961  @param offset: the offset value
962  '''
963 
964  for xl in self:
965  if xl[self.protein1_key] == protein_name:
966  xl[self.residue1_key]=xl[self.residue1_key]+offset
967  if xl[self.protein2_key] == protein_name:
968  xl[self.residue2_key]=xl[self.residue2_key]+offset
969  self._update()
970 
971  def create_new_keyword(self,keyword,values_from_keyword=None):
972  '''
973  This function creates a new keyword for the whole database and set the values from
974  and existing keyword (optional), otherwise the values are set to None
975  @param keyword the new keyword name:
976  @param values_from_keyword the keyword from which we are copying the values:
977  '''
978  for xl in self:
979  if values_from_keyword is not None:
980  xl[keyword] = xl[values_from_keyword]
981  else:
982  xl[keyword] = None
983  self._update()
984 
985  def rename_proteins(self,old_to_new_names_dictionary, protein_to_rename="both"):
986  '''
987  This function renames all proteins contained in the input dictionary
988  from the old names (keys) to the new name (values)
989  @param old_to_new_names_dictionary dictionary for converting old to new names
990  @param protein_to_rename specify whether to rename both or protein1 or protein2 only
991  '''
992 
993  for old_name in old_to_new_names_dictionary:
994  new_name=old_to_new_names_dictionary[old_name]
995  if protein_to_rename == "both" or protein_to_rename == "protein1":
996  fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
997  self.set_value(self.protein1_key,new_name,fo2)
998  if protein_to_rename == "both" or protein_to_rename == "protein2":
999  fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
1000  self.set_value(self.protein2_key,new_name,fo2)
1001 
1002  def classify_crosslinks_by_score(self,number_of_classes):
1003  '''
1004  This function creates as many classes as in the input (number_of_classes: integer)
1005  and partition crosslinks according to their identification scores. Classes are defined in the psi key.
1006  '''
1007 
1008  if self.id_score_key is not None:
1009  scores=self.get_values(self.id_score_key)
1010  else:
1011  raise ValueError('The crosslink database does not contain score values')
1012  minscore=min(scores)
1013  maxscore=max(scores)
1014  scoreclasses=numpy.linspace(minscore, maxscore, number_of_classes+1)
1015  if self.psi_key is None:
1016  self.create_new_keyword(self.psi_key,values_from_keyword=None)
1017  for xl in self:
1018  score=xl[self.id_score_key]
1019  for n,classmin in enumerate(scoreclasses[0:-1]):
1020  if score>=classmin and score<=scoreclasses[n+1]:
1021  xl[self.psi_key]=str("CLASS_"+str(n))
1022  self._update()
1023 
1024  def clone_protein(self,protein_name,new_protein_name):
1025  new_xl_dict={}
1026  for id in self.data_base.keys():
1027  new_data_base=[]
1028  for xl in self.data_base[id]:
1029  new_data_base.append(xl)
1030  if xl[self.protein1_key]==protein_name and xl[self.protein2_key]!=protein_name:
1031  new_xl=dict(xl)
1032  new_xl[self.protein1_key]=new_protein_name
1033  new_data_base.append(new_xl)
1034  elif xl[self.protein1_key]!=protein_name and xl[self.protein2_key]==protein_name:
1035  new_xl=dict(xl)
1036  new_xl[self.protein2_key]=new_protein_name
1037  new_data_base.append(new_xl)
1038  elif xl[self.protein1_key]==protein_name and xl[self.protein2_key]==protein_name:
1039  new_xl=dict(xl)
1040  new_xl[self.protein1_key]=new_protein_name
1041  new_data_base.append(new_xl)
1042  new_xl=dict(xl)
1043  new_xl[self.protein2_key]=new_protein_name
1044  new_data_base.append(new_xl)
1045  new_xl=dict(xl)
1046  new_xl[self.protein1_key]=new_protein_name
1047  new_xl[self.protein2_key]=new_protein_name
1048  new_data_base.append(new_xl)
1049  self.data_base[id]=new_data_base
1050  self._update()
1051 
1053  '''
1054  This function remove cross-links applied to the same residue
1055  (ie, same chain name and residue number)
1056  '''
1057  new_xl_dict={}
1058  for id in self.data_base.keys():
1059  new_data_base=[]
1060  for xl in self.data_base[id]:
1061  if xl[self.protein1_key]==xl[self.protein2_key] and xl[self.residue1_key]==xl[self.residue2_key]:
1062  continue
1063  else:
1064  new_data_base.append(xl)
1065  self.data_base[id]=new_data_base
1066  self._update()
1067 
1068 
1069  def jackknife(self,percentage):
1070  '''
1071  this method returns a CrossLinkDataBase class containing
1072  a random subsample of the original cross-link database.
1073  @param percentage float between 0 and 1, is the percentage of
1074  of spectra taken from the original list
1075  '''
1076  import random
1077  if percentage > 1.0 or percentage < 0.0:
1078  raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
1079  nspectra=self.get_number_of_xlid()
1080  nrandom_spectra=int(nspectra*percentage)
1081  random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1082  new_data_base={}
1083  for k in random_keys:
1084  new_data_base[k]=self.data_base[k]
1085  return CrossLinkDataBase(self.cldbkc,new_data_base)
1086 
1087  def __str__(self):
1088  outstr=''
1089  sorted_ids=sorted(self.data_base.keys())
1090 
1091  for id in sorted_ids:
1092  outstr+=id+"\n"
1093  for xl in self.data_base[id]:
1094  for k in self.ordered_key_list:
1095  try:
1096  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1097  except KeyError:
1098  continue
1099 
1100  for k in xl:
1101  if k not in self.ordered_key_list:
1102  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1103  outstr+="-------------\n"
1104  return outstr
1105 
1106 
1107  def plot(self,filename,**kwargs):
1108  import matplotlib
1109  matplotlib.use('Agg')
1110  import matplotlib.pyplot as plt
1111  import matplotlib.colors
1112 
1113 
1114 
1115  if kwargs["type"] == "scatter":
1116  cmap=plt.get_cmap("rainbow")
1117  norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1118  xkey=kwargs["xkey"]
1119  ykey=kwargs["ykey"]
1120  if "colorkey" in kwargs:
1121  colorkey=kwargs["colorkey"]
1122  if "sizekey" in kwargs:
1123  sizekey=kwargs["sizekey"]
1124  if "logyscale" in kwargs:
1125  logyscale=kwargs["logyscale"]
1126  else:
1127  logyscale=False
1128  xs=[]
1129  ys=[]
1130  colors=[]
1131  for xl in self:
1132  try:
1133  xs.append(float(xl[xkey]))
1134  if logyscale:
1135  ys.append(math.log10(float(xl[ykey])))
1136  else:
1137  ys.append(float(xl[ykey]))
1138  colors.append(float(xl[colorkey]))
1139  except ValueError:
1140  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1141  continue
1142 
1143  cs=[]
1144  for color in colors:
1145  cindex=(color-min(colors))/(max(colors)-min(colors))
1146  cs.append(cmap(cindex))
1147 
1148  fig = plt.figure()
1149  ax = fig.add_subplot(111)
1150  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker="o")
1151  ax.set_xlabel(xkey)
1152  ax.set_ylabel(ykey)
1153  plt.savefig(filename)
1154  plt.show()
1155  plt.close()
1156 
1157  if kwargs["type"] == "residue_links":
1158  #plot the number of distinct links to a residue
1159  #in an histogram
1160  #molecule name
1161  molecule=kwargs["molecule"]
1162  if type(molecule) is IMP.pmi.topology.Molecule:
1163  length=len(molecule.sequence)
1164  molecule=molecule.get_name()
1165  else:
1166  #you need a IMP.pmi.topology.Sequences object
1167  sequences_object=kwargs["sequences_object"]
1168  sequence=sequences_object.sequences[molecule]
1169  length=len(sequence)
1170 
1171  histogram=[0]*length
1172  for xl in self:
1173  if xl[self.protein1_key]==molecule:
1174  histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1175  if xl[self.protein2_key]==molecule:
1176  histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1177  rects = plt.bar(range(1,length+1), histogram)
1178  #bar_width,
1179  #alpha=opacity,
1180  #color='b',
1181  #yerr=std_men,
1182  #error_kw=error_config,
1183  #label='Men')
1184  plt.savefig(filename)
1185  plt.show()
1186  plt.close()
1187 
1188 
1189 
1190  if kwargs["type"] == "histogram":
1191  valuekey=kwargs["valuekey"]
1192  reference_xline=kwargs["reference_xline"]
1193  valuename=valuekey
1194  bins=kwargs["bins"]
1195  values_list=[]
1196  for xl in self:
1197  try:
1198  values_list.append(float(xl[valuekey]))
1199  except ValueError:
1200  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1201  continue
1203  filename, [values_list], valuename=valuename, bins=bins,
1204  colors=None, format="pdf",
1205  reference_xline=None, yplotrange=None,
1206  xplotrange=None,normalized=True,
1207  leg_names=None)
1208 
1209  def dump(self,json_filename):
1210  import json
1211  with open(json_filename, 'w') as fp:
1212  json.dump(self.data_base, fp, sort_keys=True, indent=2, default=set_json_default)
1213 
1214  def load(self,json_filename):
1215  import json
1216  with open(json_filename, 'r') as fp:
1217  self.data_base = json.load(fp)
1218  self._update()
1219  #getting rid of unicode
1220  # (can't do this in Python 3, since *everything* is Unicode there)
1221  if sys.version_info[0] < 3:
1222  for xl in self:
1223  for k,v in xl.iteritems():
1224  if type(k) is unicode: k=str(k)
1225  if type(v) is unicode: v=str(v)
1226  xl[k]=v
1227 
1228  def save_csv(self,filename):
1229 
1230  import csv
1231 
1232  data=[]
1233  sorted_ids=None
1234  sorted_group_ids=sorted(self.data_base.keys())
1235  for group in sorted_group_ids:
1236  group_block=[]
1237  for xl in self.data_base[group]:
1238  if not sorted_ids:
1239  sorted_ids=sorted(xl.keys())
1240  values=[xl[k] for k in sorted_ids]
1241  group_block.append(values)
1242  data+=group_block
1243 
1244 
1245  with open(filename, 'w') as fp:
1246  a = csv.writer(fp, delimiter=',')
1247  a.writerow(sorted_ids)
1248  a.writerows(data)
1249 
1251  """
1252  Returns the number of non redundant crosslink sites
1253  """
1254  praset=set()
1255  for xl in self:
1256  pra=_ProteinsResiduesArray(xl)
1257  prai=pra.get_inverted()
1258  praset.add(pra)
1259  praset.add(prai)
1260  return len(list(praset))
1261 
1263  """This class allows to compute and plot the distance between datasets"""
1264 
1265  def __init__(self,cldb_dictionary):
1266  """Input a dictionary where keys are cldb names and values are cldbs"""
1267  import scipy.spatial.distance
1268  self.dbs=cldb_dictionary
1269  self.keylist=list(self.dbs.keys())
1270  self.distances=list()
1271 
1272 
1273  for i,key1 in enumerate(self.keylist):
1274  for key2 in self.keylist[i+1:]:
1275  distance=self.get_distance(key1,key2)
1276  self.distances.append(distance)
1277 
1278  self.distances=scipy.spatial.distance.squareform(self.distances)
1279 
1280  def get_distance(self,key1,key2):
1281  return 1.0-self.jaccard_index(self.dbs[key1],self.dbs[key2])
1282 
1283  def jaccard_index(self,CrossLinkDataBase1,CrossLinkDataBase2):
1284  """Similarity index between two datasets
1285  https://en.wikipedia.org/wiki/Jaccard_index"""
1286 
1287  set1=set()
1288  set2=set()
1289  for xl1 in CrossLinkDataBase1:
1290  a1f=_ProteinsResiduesArray(xl1)
1291  a1b=a1f.get_inverted()
1292  set1.add(a1f)
1293  set1.add(a1b)
1294  for xl2 in CrossLinkDataBase2:
1295  a2f=_ProteinsResiduesArray(xl2)
1296  a2b=a2f.get_inverted()
1297  set2.add(a2f)
1298  set2.add(a2b)
1299  return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1300 
1301  def plot_matrix(self,figurename="clustermatrix.pdf"):
1302  import matplotlib as mpl
1303  import numpy
1304  mpl.use('Agg')
1305  import matplotlib.pylab as pl
1306  from scipy.cluster import hierarchy as hrc
1307 
1308  raw_distance_matrix=self.distances
1309  labels=self.keylist
1310 
1311  fig = pl.figure()
1312  #fig.autolayout=True
1313 
1314  ax = fig.add_subplot(1,1,1)
1315  dendrogram = hrc.dendrogram(
1316  hrc.linkage(raw_distance_matrix),
1317  color_threshold=7,
1318  no_labels=True)
1319  leaves_order = dendrogram['leaves']
1320  ax.set_xlabel('Dataset')
1321  ax.set_ylabel('Jaccard Distance')
1322  pl.tight_layout()
1323  pl.savefig("dendrogram."+figurename, dpi=300)
1324  pl.close(fig)
1325 
1326  fig = pl.figure()
1327  #fig.autolayout=True
1328 
1329  ax = fig.add_subplot(1,1,1)
1330  cax = ax.imshow(
1331  raw_distance_matrix[leaves_order,
1332  :][:,
1333  leaves_order],
1334  interpolation='nearest')
1335  cb = fig.colorbar(cax)
1336  cb.set_label('Jaccard Distance')
1337  ax.set_xlabel('Dataset')
1338  ax.set_ylabel('Dataset')
1339  ax.set_xticks(range(len(labels)))
1340  ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation='vertical')
1341  ax.set_yticks(range(len(labels)))
1342  ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation='horizontal')
1343  pl.tight_layout()
1344  pl.savefig("matrix."+figurename, dpi=300)
1345  pl.close(fig)
1346 
1347 
1349  '''
1350  This class maps a CrossLinkDataBase on a given structure
1351  and save an rmf file with color-coded crosslinks
1352  '''
1353 
1354 
1355  def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1356  self.CrossLinkDataBase=CrossLinkDataBase
1357  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1358  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1359  self.prots=rmf_or_stat_handler
1360  self.distances=defaultdict(list)
1361  self.array_to_id={}
1362  self.id_to_array={}
1363 
1364  print("computing distances fro all crosslinks and all structures")
1365  for i in self.prots[::10]:
1366  self.compute_distances()
1367  for key,xl in enumerate(self.CrossLinkDataBase):
1368  array=_ProteinsResiduesArray(xl)
1369  self.array_to_id[array]=key
1370  self.id_to_array[key]=array
1371  if xl["MinAmbiguousDistance"] is not 'None':
1372  self.distances[key].append(xl["MinAmbiguousDistance"])
1373 
1374  def compute_distances(self):
1375  data=[]
1376  sorted_ids=None
1377  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1378  for group in sorted_group_ids:
1379  #group_block=[]
1380  group_dists=[]
1381  for xl in self.CrossLinkDataBase.data_base[group]:
1382  if not sorted_ids:
1383  sorted_ids=sorted(xl.keys())
1384  data.append(sorted_ids+["UniqueID","Distance","MinAmbiguousDistance"])
1385  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1386  try:
1387  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1388  except:
1389  mdist="None"
1390  state1="None"
1391  copy1="None"
1392  state2="None"
1393  copy2="None"
1394  try:
1395  # sometimes keys get "lost" in the database, not really sure why
1396  values=[xl[k] for k in sorted_ids]
1397  values += [group, mdist]
1398  except KeyError as e:
1399  print("MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1400  group_dists.append(mdist)
1401  #group_block.append(values)
1402  xl["Distance"]=mdist
1403  xl["State1"]=state1
1404  xl["Copy1"]=copy1
1405  xl["State2"]=state2
1406  xl["Copy2"]=copy2
1407 
1408  for xl in self.CrossLinkDataBase.data_base[group]:
1409  xl["MinAmbiguousDistance"]=min(group_dists)
1410 
1411  def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1412  '''more robust and slower version of above'''
1413  sel=IMP.atom.Selection(self.prots,molecule=c1,residue_index=r1,resolution=1)
1414  selpart_1=sel.get_selected_particles()
1415  if len(selpart_1)==0:
1416  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1417  return None
1418  sel=IMP.atom.Selection(self.prots,molecule=c2,residue_index=r2,resolution=1)
1419  selpart_2=sel.get_selected_particles()
1420  if len(selpart_2)==0:
1421  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1422  return None
1423  results=[]
1424  for p1 in selpart_1:
1425  for p2 in selpart_2:
1426  if p1 == p2 and r1 == r2: continue
1427  d1=IMP.core.XYZ(p1)
1428  d2=IMP.core.XYZ(p2)
1429  #round distance to second decimal
1430  dist=float(int(IMP.core.get_distance(d1,d2)*100.0))/100.0
1431  h1=IMP.atom.Hierarchy(p1)
1432  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1433  h1=h1.get_parent()
1434  copy_index1=IMP.atom.Copy(h1).get_copy_index()
1435  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1436  h1=h1.get_parent()
1437  state_index1=IMP.atom.State(h1).get_state_index()
1438  h2=IMP.atom.Hierarchy(p2)
1439  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1440  h2=h2.get_parent()
1441  copy_index2=IMP.atom.Copy(h2).get_copy_index()
1442  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1443  h2=h2.get_parent()
1444  state_index2=IMP.atom.State(h2).get_state_index()
1445 
1446  results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1447  if len(results)==0: return None
1448  results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1449  return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1450 
1451  def save_rmf_snapshot(self,filename,color_id=None):
1452  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1453  list_of_pairs=[]
1454  color_scores=[]
1455  for group in sorted_group_ids:
1456  group_dists_particles=[]
1457  for xl in self.CrossLinkDataBase.data_base[group]:
1458  xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1459  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1460  try:
1461  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1462  except TypeError:
1463  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1464  continue
1465  if color_id is not None:
1466  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1467  else:
1468  group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1469  if group_dists_particles:
1470  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1471  color_scores.append(mincolor_score)
1472  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1473  else:
1474  continue
1475 
1476  linear = IMP.core.Linear(0, 0.0)
1477  linear.set_slope(1.0)
1478  dps2 = IMP.core.DistancePairScore(linear)
1479  rslin = IMP.RestraintSet(self.prots.get_model(), 'linear_dummy_restraints')
1480  sgs=[]
1481  offset=min(color_scores)
1482  maxvalue=max(color_scores)
1483  for pair in list_of_pairs:
1484  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2, (pair[0], pair[1]))
1485  pr.set_name(pair[2])
1486  if offset<maxvalue:
1487  factor=(pair[3]-offset)/(maxvalue-offset)
1488  else:
1489  factor=0.0
1490  c=IMP.display.get_rgb_color(factor)
1491  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1492  rslin.add_restraint(pr)
1493  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1494 
1495  rh = RMF.create_rmf_file(filename)
1496  IMP.rmf.add_hierarchies(rh, [self.prots])
1497  IMP.rmf.add_restraints(rh,[rslin])
1498  IMP.rmf.add_geometries(rh, sgs)
1499  IMP.rmf.save_frame(rh)
1500  del rh
1501 
1502  def boxplot_crosslink_distances(self,filename):
1503  import numpy
1504  keys=[self.id_to_array[k] for k in self.distances.keys()]
1505  medians=[numpy.median(self.distances[self.array_to_id[k]]) for k in keys]
1506  dists=[self.distances[self.array_to_id[k]] for k in keys]
1507  distances_sorted_by_median=[x for _,x in sorted(zip(medians,dists))]
1508  keys_sorted_by_median=[x for _,x in sorted(zip(medians,keys))]
1510  distances_sorted_by_median,
1511  range(len(distances_sorted_by_median)),
1512  xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1513 
1514  def get_crosslink_violations(self,threshold):
1515  violations=defaultdict(list)
1516  for k in self.distances:
1517  #viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1518  viols=self.distances[k]
1519  violations[self.id_to_array[k]]=viols
1520  return violations
1521 
1522 
1524  '''
1525  This class generates a CrossLinkDataBase from a given structure
1526  '''
1527  def __init__(self, system, residue_types_1=["K"],
1528  residue_types_2=["K"], reactivity_range=[0,1], kt=1.0):
1529 
1530  import numpy.random
1531  import math
1533  cldbkc.set_protein1_key("Protein1")
1534  cldbkc.set_protein2_key("Protein2")
1535  cldbkc.set_residue1_key("Residue1")
1536  cldbkc.set_residue2_key("Residue2")
1537  self.cldb=CrossLinkDataBase(cldbkc)
1538  self.system=system
1539  self.model=self.system.model
1540  self.residue_types_1=residue_types_1
1541  self.residue_types_2=residue_types_2
1542  self.kt=kt
1543  self.indexes_dict1={}
1544  self.indexes_dict2={}
1545  self.protein_residue_dict={}
1546  self.reactivity_dictionary={}
1547  self.euclidean_interacting_pairs=None
1548  self.xwalk_interacting_pairs=None
1549  import random
1550 
1551  for state in self.system.get_states():
1552  for moleculename,molecules in state.get_molecules().items():
1553  for molecule in molecules:
1554  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1555  seq=molecule.sequence
1556  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1557 
1558  for r in residues:
1559  # uniform random reactivities
1560  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1561  # getting reactivities from the CDF of an exponential distribution
1562  rexp=numpy.random.exponential(0.00000001)
1563  prob=1.0-math.exp(-rexp)
1564  self.reactivity_dictionary[(molecule,r)]=prob
1565 
1566  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1567  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1568  for r in residues1:
1569  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1570  ps=s.get_selected_particles()
1571  for p in ps:
1572  index=p.get_index()
1573  self.indexes_dict1[index]=(molecule,r)
1574  self.protein_residue_dict[(molecule,r)]=index
1575  for r in residues2:
1576  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1577  ps=s.get_selected_particles()
1578  for p in ps:
1579  index=p.get_index()
1580  self.indexes_dict2[index]=(molecule,r)
1581  self.protein_residue_dict[(molecule,r)]=index
1582 
1583 
1584  def get_all_possible_pairs(self):
1585  n=float(len(self.protein_residue_dict.keys()))
1586  return n*(n-1.0)/2.0
1587 
1588  def get_all_feasible_pairs(self,distance=21):
1589  import itertools
1590  particle_index_pairs=[]
1591  nxl=0
1592  for a,b in itertools.combinations(self.protein_residue_dict.keys(),2):
1593 
1594  new_xl={}
1595  index1=self.protein_residue_dict[a]
1596  index2=self.protein_residue_dict[b]
1597  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1598  if particle_distance <= distance:
1599  particle_index_pairs.append((index1,index2))
1600  new_xl[self.cldb.protein1_key]=a[0].get_name()
1601  new_xl[self.cldb.protein2_key]=b[0].get_name()
1602  new_xl["molecule_object1"]=a[0]
1603  new_xl["molecule_object2"]=b[0]
1604  new_xl[self.cldb.residue1_key]=a[1]
1605  new_xl[self.cldb.residue2_key]=b[1]
1606  self.cldb.data_base[str(nxl)]=[new_xl]
1607  nxl+=1
1608  self.cldb._update()
1609  return self.cldb
1610 
1611 
1612 
1613 
1614  def get_data_base(self,total_number_of_spectra,
1615  ambiguity_probability=0.1,
1616  noise=0.01,
1617  distance=21,
1618  max_delta_distance=10.0,
1619  xwalk_bin_path=None,
1620  confidence_false=0.75,
1621  confidence_true=0.75):
1622  import math
1623  from random import random,uniform
1624  import numpy as np
1625  number_of_spectra=1
1626 
1627  self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1628  self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1629  self.cldb.data_base[str(number_of_spectra)]=[]
1630  self.sites_weighted=None
1631 
1632  while number_of_spectra<total_number_of_spectra:
1633  if random() > ambiguity_probability and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1634  # new spectrum
1635  number_of_spectra+=1
1636  self.cldb.data_base[str(number_of_spectra)]=[]
1637  noisy=False
1638  if random() > noise:
1639  # not noisy crosslink
1640  pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1641  else:
1642  # noisy crosslink
1643  pra,dist=self.get_random_residue_pair(None,None,None)
1644  noisy=True
1645 
1646  new_xl={}
1647  new_xl[self.cldb.protein1_key]=pra[0].get_name()
1648  new_xl[self.cldb.protein2_key]=pra[1].get_name()
1649  new_xl["molecule_object1"]=pra[0]
1650  new_xl["molecule_object2"]=pra[1]
1651  new_xl[self.cldb.residue1_key]=pra[2]
1652  new_xl[self.cldb.residue2_key]=pra[3]
1653  new_xl["Noisy"]=noisy
1654  # the reactivity is defined as r=1-exp(-k*Delta t)
1655  new_xl["Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1656  new_xl["Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1657  r1=new_xl["Reactivity_Residue1"]
1658  r2=new_xl["Reactivity_Residue2"]
1659  #combined reactivity 1-exp(-k12*Delta t),
1660  # k12=k1*k2/(k1+k2)
1661  #new_xl["Reactivity"]=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1662  if noisy:
1663  #new_xl["Score"]=uniform(-1.0,1.0)
1664  new_xl["Score"]=np.random.beta(1.0,self.beta_false)
1665  else:
1666  #new_xl["Score"]=new_xl["Reactivity"]+uniform(0.0,2.0)
1667  new_xl["Score"]=1.0-np.random.beta(1.0,self.beta_true)
1668  new_xl["TargetDistance"]=dist
1669  new_xl["NoiseProbability"]=noise
1670  new_xl["AmbiguityProbability"]=ambiguity_probability
1671  # getting if it is intra or inter rigid body
1672  (p1,p2)=IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1673  self.protein_residue_dict[(pra[1],pra[3])]])
1676  IMP.core.RigidMember(p1).get_rigid_body() ==
1677  IMP.core.RigidMember(p2).get_rigid_body()):
1678  new_xl["InterRigidBody"] = False
1679  elif (IMP.core.RigidMember.get_is_setup(p1) and
1681  IMP.core.RigidMember(p1).get_rigid_body() !=
1682  IMP.core.RigidMember(p2).get_rigid_body()):
1683  new_xl["InterRigidBody"] = True
1684  else:
1685  new_xl["InterRigidBody"] = None
1686 
1687  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1688  self.cldb._update()
1689  return self.cldb
1690 
1691 
1692  def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1693  import IMP.pmi.tools
1694  import math
1695  from random import choice,uniform
1696  if distance is None:
1697  # get a random pair
1698  while True:
1699  (protein1,residue1)=choice(self.protein_residue_dict.keys())
1700  (protein2,residue2)=choice(self.protein_residue_dict.keys())
1701  index1=self.protein_residue_dict[(protein1,residue1)]
1702  index2=self.protein_residue_dict[(protein2,residue2)]
1703  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1704  if (protein1,residue1) != (protein2,residue2):
1705  break
1706  else:
1707  # get a pair of residues whose distance is below the threshold
1708  if not xwalk_bin_path:
1710  gcpf.set_distance(distance+max_delta_distance)
1711 
1712  while True:
1713  #setup the reaction rates lists
1714  if not self.sites_weighted:
1715  self.sites_weighted=[]
1716  for key in self.reactivity_dictionary:
1717  r=self.reactivity_dictionary[key]
1718  self.sites_weighted.append((key,r))
1719  #get a random reaction site
1720  first_site=self.weighted_choice(self.sites_weighted)
1721  #get all distances
1722  if not self.euclidean_interacting_pairs:
1723  self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1724  self.indexes_dict1.keys(),
1725  self.indexes_dict2.keys())
1726  #get the partner for the first reacted site
1727  first_site_pairs = [pair for pair in self.euclidean_interacting_pairs
1728  if self.indexes_dict1[pair[0]] == first_site or
1729  self.indexes_dict2[pair[1]] == first_site]
1730  if len(first_site_pairs)==0: continue
1731  #build the list of second reaction sites
1732  second_sites_weighted=[]
1733  for pair in first_site_pairs:
1734  if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1735  if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1736  r=self.reactivity_dictionary[second_site]
1737  second_sites_weighted.append((second_site,r))
1738  second_site=self.weighted_choice(second_sites_weighted)
1739  """
1740  interacting_pairs_weighted=[]
1741  for pair in self.euclidean_interacting_pairs:
1742  r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1743  r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1744  #combined reactivity 1-exp(-k12*Delta t),
1745  # k12=k1*k2/(k1+k2)
1746  #print(r1,r2,dist)
1747  r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1748  interacting_pairs_weighted.append((pair,r12))
1749  #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1750  #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1751  #interacting_pairs_weighted.append((pair,weight1*weight2))
1752 
1753  while True:
1754  pair=self.weighted_choice(interacting_pairs_weighted)
1755  protein1,residue1=self.indexes_dict1[pair[0]]
1756  protein2,residue2=self.indexes_dict2[pair[1]]
1757  particle_pair=IMP.get_particles(self.model,pair)
1758  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1759  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1760  break
1761  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1762  #allow some flexibility
1763  prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1764  if uniform(0.0,1.0)<prob: break
1765  """
1766  protein1,residue1=first_site
1767  protein2,residue2=second_site
1768  print("CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1769  "First site",first_site,self.reactivity_dictionary[first_site],
1770  "Second site",second_site,self.reactivity_dictionary[second_site])
1771  particle_pair=IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1772  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1773 
1774  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1775  break
1776  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1777  #allow some flexibility
1778  #prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1779  #if uniform(0.0,1.0)<prob: break
1780  if particle_distance-distance < max_delta_distance: break
1781 
1782 
1783 
1784  else:
1785  if not self.xwalk_interacting_pairs:
1786  self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1787  interacting_pairs_weighted=[]
1788 
1789  for pair in self.xwalk_interacting_pairs:
1790  protein1=pair[0]
1791  protein2=pair[1]
1792  residue1=pair[2]
1793  residue2=pair[3]
1794  weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1795  weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1796  interacting_pairs_weighted.append((pair,weight1*weight2))
1797 
1798  pair=self.weighted_choice(interacting_pairs_weighted)
1799  protein1=pair[0]
1800  protein2=pair[1]
1801  residue1=pair[2]
1802  residue2=pair[3]
1803  particle_distance=float(pair[4])
1804 
1805 
1806 
1807  return ((protein1,protein2,residue1,residue2)),particle_distance
1808 
1809  def get_xwalk_distances(self,xwalk_bin_path,distance):
1810  import IMP.pmi.output
1811  import os
1812  o=IMP.pmi.output.Output(atomistic=True)
1813  o.init_pdb("xwalk.pdb",self.representation.prot)
1814  o.write_pdb("xwalk.pdb")
1815  namechainiddict=o.dictchain["xwalk.pdb"]
1816  chainiddict={}
1817 
1818  for key in namechainiddict: chainiddict[namechainiddict[key]]=key
1819  xwalkout=os.popen('java -Xmx256m -cp ' + xwalk_bin_path +' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+' -bb').read()
1820 
1821  output_list_of_distance=[]
1822  for line in xwalkout.split("\n")[0:-2]:
1823  tokens=line.split()
1824  first=tokens[2]
1825  second=tokens[3]
1826  distance=float(tokens[6])
1827  fs=first.split("-")
1828  ss=second.split("-")
1829  chainid1=fs[2]
1830  chainid2=ss[2]
1831  protein1=chainiddict[chainid1]
1832  protein2=chainiddict[chainid2]
1833  residue1=int(fs[1])
1834  residue2=int(ss[1])
1835  output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1836  return output_list_of_distance
1837 
1838 
1839  def weighted_choice(self,choices):
1840  import random
1841  # from http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
1842  total = sum(w for c, w in choices)
1843  r = random.uniform(0, total)
1844  upto = 0
1845  for c, w in choices:
1846  if upto + w > r:
1847  return c
1848  upto += w
1849  assert False, "Shouldn't get here"
1850 
1851  def save_rmf_snapshot(self,filename,color_id=None):
1852  import IMP.rmf
1853  import RMF
1854  if color_id is None:
1855  color_id="Reactivity"
1856  sorted_ids=None
1857  sorted_group_ids=sorted(self.cldb.data_base.keys())
1858  list_of_pairs=[]
1859  color_scores=[]
1860  for group in sorted_group_ids:
1861  group_xls=[]
1862  group_dists_particles=[]
1863  for xl in self.cldb.data_base[group]:
1864  xllabel=self.cldb.get_short_cross_link_string(xl)
1865  (c1,c2,r1,r2)=(xl["molecule_object1"],xl["molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1866  try:
1867  index1=self.protein_residue_dict[(c1,r1)]
1868  index2=self.protein_residue_dict[(c2,r2)]
1869  p1,p2=IMP.get_particles(self.model,[index1])[0],IMP.get_particles(self.model,[index2])[0]
1870  mdist=xl["TargetDistance"]
1871  except TypeError:
1872  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1873  continue
1874  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1875  if group_dists_particles:
1876  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1877  color_scores.append(mincolor_score)
1878  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1879  else:
1880  continue
1881 
1882 
1883  m=self.model
1884  linear = IMP.core.Linear(0, 0.0)
1885  linear.set_slope(1.0)
1886  dps2 = IMP.core.DistancePairScore(linear)
1887  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
1888  sgs=[]
1889  offset=min(color_scores)
1890  maxvalue=max(color_scores)
1891  for pair in list_of_pairs:
1892  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
1893  pr.set_name(pair[2])
1894  factor=(pair[3]-offset)/(maxvalue-offset)
1895  c=IMP.display.get_rgb_color(factor)
1896  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1897  rslin.add_restraint(pr)
1898  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1899 
1900  rh = RMF.create_rmf_file(filename)
1901  IMP.rmf.add_hierarchies(rh, [self.system.hier])
1902  IMP.rmf.add_restraints(rh,[rslin])
1903  IMP.rmf.add_geometries(rh, sgs)
1904  IMP.rmf.save_frame(rh)
1905  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1557
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1622
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:731
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:36
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
This class converts three to one letter codes, and return X for any unknown codes.
Definition: tools.py:1092
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:60
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:19
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:24
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
class to link stat files to several rmf files
Definition: output.py:1081
class to allow more advanced handling of RMF files.
Definition: output.py:956
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:29
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.