IMP logo
IMP Reference Guide  2.14.0
The Integrative Modeling Platform
io/crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 from __future__ import print_function
8 import IMP
9 import IMP.pmi
10 import IMP.pmi.output
11 import IMP.pmi.alphabets
12 import IMP.atom
13 import IMP.core
14 import IMP.algebra
15 import IMP.rmf
16 import RMF
17 import IMP.display
18 import operator
19 import math
20 import sys
21 import ihm.location
22 import ihm.dataset
23 from collections import defaultdict
24 import numpy
25 
26 # json default serializations
27 def set_json_default(obj):
28  if isinstance(obj, set):
29  return list(obj)
30  if isinstance(obj, IMP.pmi.topology.Molecule):
31  return str(obj)
32  raise TypeError
33 
34 # Handle and return data that must be a string
35 if sys.version_info[0] >= 3:
36  def _handle_string_input(inp):
37  if not isinstance(inp, str):
38  raise TypeError("expecting a string")
39  return inp
40 else:
41  def _handle_string_input(inp):
42  if not isinstance(inp, (str, unicode)):
43  raise TypeError("expecting a string or unicode")
44  # Coerce to non-unicode representation (str)
45  if isinstance(inp, unicode):
46  return str(inp)
47  else:
48  return inp
49 
50 class _CrossLinkDataBaseStandardKeys(object):
51  '''
52  This class setup all the standard keys needed to
53  identify the cross-link features from the data sets
54  '''
55  def __init__(self):
56  self.type={}
57  self.protein1_key="Protein1"
58  self.type[self.protein1_key]=str
59  self.protein2_key="Protein2"
60  self.type[self.protein2_key]=str
61  self.residue1_key="Residue1"
62  self.type[self.residue1_key]=int
63  self.residue2_key="Residue2"
64  self.type[self.residue2_key]=int
65  self.residue1_amino_acid_key="Residue1AminoAcid"
66  self.type[self.residue1_amino_acid_key]=str
67  self.residue2_amino_acid_key="Residue2AminoAcid"
68  self.type[self.residue2_amino_acid_key]=str
69  self.residue1_moiety_key="Residue1Moiety"
70  self.type[self.residue1_moiety_key]=str
71  self.residue2_moiety_key="Residue2Moiety"
72  self.type[self.residue2_moiety_key]=str
73  self.site_pairs_key="SitePairs"
74  self.type[self.site_pairs_key]=str
75  self.unique_id_key="XLUniqueID"
76  self.type[self.unique_id_key]=str
77  self.unique_sub_index_key="XLUniqueSubIndex"
78  self.type[self.unique_sub_index_key]=int
79  self.unique_sub_id_key="XLUniqueSubID"
80  self.type[self.unique_sub_id_key]=str
81  self.data_set_name_key="DataSetName"
82  self.type[self.data_set_name_key]=str
83  self.cross_linker_chemical_key="CrossLinkerChemical"
84  self.type[self.cross_linker_chemical_key]=str
85  self.id_score_key="IDScore"
86  self.type[self.id_score_key]=float
87  self.fdr_key="FDR"
88  self.type[self.fdr_key]=float
89  self.quantitation_key="Quantitation"
90  self.type[self.quantitation_key]=float
91  self.redundancy_key="Redundancy"
92  self.type[self.redundancy_key]=int
93  self.redundancy_list_key="RedundancyList"
94  self.type[self.redundancy_key]=list
95  self.ambiguity_key="Ambiguity"
96  self.type[self.ambiguity_key]=int
97  self.residue1_links_number_key="Residue1LinksNumber"
98  self.type[self.residue1_links_number_key]=int
99  self.residue2_links_number_key="Residue2LinksNumber"
100  self.type[self.residue2_links_number_key]=int
101  self.type[self.ambiguity_key]=int
102  self.state_key="State"
103  self.type[self.state_key]=int
104  self.sigma1_key="Sigma1"
105  self.type[self.sigma1_key]=str
106  self.sigma2_key="Sigma2"
107  self.type[self.sigma2_key]=str
108  self.psi_key="Psi"
109  self.type[self.psi_key]=str
110  self.distance_key="Distance"
111  self.type[self.distance_key]=float
112  self.min_ambiguous_distance_key="MinAmbiguousDistance"
113  self.type[self.distance_key]=float
114  #link types are either Monolink, Intralink or Interlink
115  self.link_type_key="LinkType"
116  self.type[self.link_type_key]=str
117 
118  self.ordered_key_list =[self.data_set_name_key,
119  self.unique_id_key,
120  self.unique_sub_index_key,
121  self.unique_sub_id_key,
122  self.protein1_key,
123  self.protein2_key,
124  self.residue1_key,
125  self.residue2_key,
126  self.residue1_amino_acid_key,
127  self.residue2_amino_acid_key,
128  self.residue1_moiety_key,
129  self.residue2_moiety_key,
130  self.cross_linker_chemical_key,
131  self.id_score_key,
132  self.fdr_key,
133  self.quantitation_key,
134  self.redundancy_key,
135  self.redundancy_list_key,
136  self.state_key,
137  self.sigma1_key,
138  self.sigma2_key,
139  self.psi_key,
140  self.distance_key,
141  self.min_ambiguous_distance_key,
142  self.link_type_key]
143 
144 
145 class _ProteinsResiduesArray(tuple):
146  '''
147  This class is inherits from tuple, and it is a shorthand for a cross-link
148  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
149  residue1 and residue2.
150  '''
151 
152  def __new__(self,input_data):
153  '''
154  @input_data can be a dict or a tuple
155  '''
156  self.cldbsk=_CrossLinkDataBaseStandardKeys()
157  if type(input_data) is dict:
158  monolink=False
159  p1=input_data[self.cldbsk.protein1_key]
160  try:
161  p2=input_data[self.cldbsk.protein2_key]
162  except KeyError:
163  monolink=True
164  r1=input_data[self.cldbsk.residue1_key]
165  try:
166  r2=input_data[self.cldbsk.residue2_key]
167  except KeyError:
168  monolink=True
169  if not monolink:
170  t=(p1,p2,r1,r2)
171  else:
172  t=(p1,"",r1,None)
173  elif type(input_data) is tuple:
174  if len(input_data)>4 or len(input_data)==3 or len(input_data)==1:
175  raise TypeError("_ProteinsResiduesArray: must have only 4 elements")
176  if len(input_data)==4:
177  p1 = _handle_string_input(input_data[0])
178  p2 = _handle_string_input(input_data[1])
179  r1=input_data[2]
180  r2=input_data[3]
181  if (not (type(r1) is int)) and (not (r1 is None)):
182  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
183  if (not (type(r2) is int)) and (not (r1 is None)):
184  raise TypeError("_ProteinsResiduesArray: residue2 must be a integer")
185  t=(p1,p2,r1,r2)
186  if len(input_data) == 2:
187  p1 = _handle_string_input(input_data[0])
188  r1 = input_data[1]
189  if type(r1) is not int:
190  raise TypeError("_ProteinsResiduesArray: residue1 must be a integer")
191  t = (p1,"",r1,None)
192  else:
193  raise TypeError("_ProteinsResiduesArray: input must be a dict or tuple")
194  return tuple.__new__(_ProteinsResiduesArray, t)
195 
196  def get_inverted(self):
197  '''
198  Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
199  '''
200  return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
201 
202  def __repr__(self):
203  outstr=self.cldbsk.protein1_key+" "+str(self[0])
204  outstr+=" "+self.cldbsk.protein2_key+" "+str(self[1])
205  outstr+=" "+self.cldbsk.residue1_key+" "+str(self[2])
206  outstr+=" "+self.cldbsk.residue2_key+" "+str(self[3])
207  return outstr
208 
209  def __str__(self):
210  outstr=str(self[0])+"."+str(self[2])+"-"+str(self[1])+"."+str(self[3])
211  return outstr
212 
213 class FilterOperator(object):
214  '''
215  This class allows to create filter functions that can be passed to the CrossLinkDataBase
216  in this way:
217 
218  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
219 
220  where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
221 
222  A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
223 
224  fo.evaluate(xl)
225 
226  and it is used to filter the database
227  '''
228 
229  def __init__(self, argument1, operator, argument2):
230  '''
231  (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
232  or (FilterOperator1,operator.or|and...,FilterOperator2)
233  '''
234  if isinstance(argument1, FilterOperator):
235  self.operations = [argument1, operator, argument2]
236  else:
237  self.operations = []
238  self.values = (argument1, operator, argument2)
239 
240  def __or__(self, FilterOperator2):
241  return FilterOperator(self, operator.or_, FilterOperator2)
242 
243  def __and__(self, FilterOperator2):
244  return FilterOperator(self, operator.and_, FilterOperator2)
245 
246  def __invert__(self):
247  return FilterOperator(self, operator.not_, None)
248 
249  def evaluate(self, xl_item):
250 
251  if len(self.operations) == 0:
252  keyword, operator, value = self.values
253  return operator(xl_item[keyword], value)
254  FilterOperator1, op, FilterOperator2 = self.operations
255 
256  if FilterOperator2 is None:
257  return op(FilterOperator1.evaluate(xl_item))
258  else:
259  return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
260 
261 '''
262 def filter_factory(xl_):
263 
264  class FilterOperator(object):
265  import operator
266  xl = xl_
267 
268  def __new__(self,key,value,oper=operator.eq):
269  return oper(self.xl[key],value)
270 
271  return FilterOperator
272 '''
273 
274 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
275  '''
276  This class is needed to convert the keywords from a generic database
277  to the standard ones
278  '''
279 
280  def __init__(self, list_parser=None):
281  '''
282  @param list_parser an instance of ResiduePairListParser, if any is needed
283  '''
284  self.converter={}
285  self.backward_converter={}
286  _CrossLinkDataBaseStandardKeys.__init__(self)
287  self.rplp = list_parser
288  if self.rplp is None:
289  # either you have protein1, protein2, residue1, residue2
290  self.compulsory_keys=set([self.protein1_key,
291  self.protein2_key,
292  self.residue1_key,
293  self.residue2_key])
294  else:
295  self.compulsory_keys=self.rplp.get_compulsory_keys()
296  self.setup_keys=set()
297 
298  def check_keys(self):
299  '''
300  Is a function that check whether necessary keys are setup
301  '''
302  setup_keys=set(self.get_setup_keys())
303  if self.compulsory_keys & setup_keys != self.compulsory_keys:
304  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
305 
306  def get_setup_keys(self):
307  '''
308  Returns the keys that have been setup so far
309  '''
310  return self.backward_converter.keys()
311 
312  def set_standard_keys(self):
313  """
314  This sets up the standard compulsory keys as defined by
315  _CrossLinkDataBaseStandardKeys
316  """
317  for ck in self.compulsory_keys:
318  self.converter[ck]=ck
319  self.backward_converter[ck]=ck
320 
321  def set_unique_id_key(self,origin_key):
322  self.converter[origin_key]=self.unique_id_key
323  self.backward_converter[self.unique_id_key]=origin_key
324 
325  def set_protein1_key(self,origin_key):
326  self.converter[origin_key]=self.protein1_key
327  self.backward_converter[self.protein1_key]=origin_key
328 
329  def set_protein2_key(self,origin_key):
330  self.converter[origin_key]=self.protein2_key
331  self.backward_converter[self.protein2_key]=origin_key
332 
333  def set_residue1_key(self,origin_key):
334  self.converter[origin_key]=self.residue1_key
335  self.backward_converter[self.residue1_key]=origin_key
336 
337  def set_residue2_key(self,origin_key):
338  self.converter[origin_key]=self.residue2_key
339  self.backward_converter[self.residue2_key]=origin_key
340 
341  def set_residue1_amino_acid_key(self, origin_key):
342  self.converter[origin_key] = self.residue1_amino_acid_key
343  self.backward_converter[self.residue1_amino_acid_key] = origin_key
344 
345  def set_residue2_amino_acid_key(self, origin_key):
346  self.converter[origin_key] = self.residue2_amino_acid_key
347  self.backward_converter[self.residue2_amino_acid_key] = origin_key
348 
349  def set_residue1_moiety_key(self, origin_key):
350  self.converter[origin_key] = self.residue1_moiety_key
351  self.backward_converter[self.residue1_moiety_key] = origin_key
352 
353  def set_residue2_moiety_key(self, origin_key):
354  self.converter[origin_key] = self.residue2_moiety_key
355  self.backward_converter[self.residue2_moiety_key] = origin_key
356 
357  def set_site_pairs_key(self,origin_key):
358  self.converter[origin_key]=self.site_pairs_key
359  self.backward_converter[self.site_pairs_key]=origin_key
360 
361  def set_id_score_key(self,origin_key):
362  self.converter[origin_key]=self.id_score_key
363  self.backward_converter[self.id_score_key]=origin_key
364 
365  def set_fdr_key(self,origin_key):
366  self.converter[origin_key]=self.fdr_key
367  self.backward_converter[self.fdr_key]=origin_key
368 
369  def set_quantitation_key(self,origin_key):
370  self.converter[origin_key]=self.quantitation_key
371  self.backward_converter[self.quantitation_key]=origin_key
372 
373  def set_psi_key(self,origin_key):
374  self.converter[origin_key]=self.psi_key
375  self.backward_converter[self.psi_key]=origin_key
376 
377  def set_link_type_key(self,link_type_key):
378  self.converter[link_type_key]=self.link_type_key
379  self.backward_converter[self.link_type_key]=link_type_key
380 
381  def get_converter(self):
382  '''
383  Returns the dictionary that convert the old keywords to the new ones
384  '''
385  self.check_keys()
386  return self.converter
387 
389  '''
390  Returns the dictionary that convert the new keywords to the old ones
391  '''
392  self.check_keys()
393  return self.backward_converter
394 
395 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
396  '''
397  A class to handle different styles of site pairs parsers.
398  Implemented styles:
399  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
400  [Y3-;K4-] for dead-ends
401  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
402  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
403  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
404  '''
405 
406  import re
407 
408  def __init__(self,style):
409 
410  _CrossLinkDataBaseStandardKeys.__init__(self)
411  if style == "MSSTUDIO":
412  self.style=style
413  self.compulsory_keys= set([self.protein1_key,
414  self.protein2_key,
415  self.site_pairs_key])
416  elif style == "XTRACT" or style == "QUANTITATION":
417  self.style=style
418  self.compulsory_keys= set([self.site_pairs_key])
419  elif style == "LAN_HUANG":
420  self.style=style
421  self.compulsory_keys= set([self.site_pairs_key])
422  else:
423  raise Error("ResiduePairListParser: unknown list parser style")
424 
425  def get_compulsory_keys(self):
426  return self.compulsory_keys
427 
428  def get_list(self,input_string):
429  '''
430  This function returns a list of cross-linked residues and the corresponding list of
431  cross-linked chains. The latter list can be empty, if the style doesn't have the
432  corresponding information.
433  '''
434  if self.style == "MSSTUDIO":
435  input_string=input_string.replace("[","")
436  input_string=input_string.replace("]","")
437  input_string_pairs=input_string.split(";")
438  residue_pair_indexes=[]
439  chain_pair_indexes=[]
440  for s in input_string_pairs:
441  m1=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
442  m2=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
443  if m1:
444  # cross-link
445  residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
446  residue_pair_indexes.append((residue_index_1,residue_index_2))
447  elif m2:
448  # dead end
449  residue_type_1,residue_index_1=m2.group(1,2)
450  # at this stage chain_pair_indexes is empty
451  return residue_pair_indexes,chain_pair_indexes
452  if self.style == "XTRACT" or self.style == "QUANTITATION":
453  if ":x:" in input_string:
454  # if it is a cross-link....
455  input_strings=input_string.split(":x:")
456  first_peptides=input_strings[0].split(":|:")
457  second_peptides=input_strings[1].split(":|:")
458  first_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in first_peptides]
459  second_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in second_peptides]
460  residue_pair_indexes=[]
461  chain_pair_indexes=[]
462  for fpi in first_peptides_indentifiers:
463  for spi in second_peptides_indentifiers:
464  chain1=fpi[0]
465  chain2=spi[0]
466  residue1=fpi[1]
467  residue2=spi[1]
468  residue_pair_indexes.append((residue1,residue2))
469  chain_pair_indexes.append((chain1,chain2))
470  return residue_pair_indexes, chain_pair_indexes
471  else:
472  # if it is a monolink....
473  first_peptides = input_string.split(":|:")
474  first_peptides_indentifiers = [(f.split(":")[0], f.split(":")[1]) for f in first_peptides]
475  residue_indexes = []
476  chain_indexes = []
477  for fpi in first_peptides_indentifiers:
478  chain1=fpi[0]
479  residue1=fpi[1]
480  residue_indexes.append((residue1,))
481  chain_indexes.append((chain1,))
482  return residue_indexes, chain_indexes
483  if self.style == "LAN_HUANG":
484  input_strings=input_string.split("-")
485  chain1,first_series=input_strings[0].split(":")
486  chain2,second_series=input_strings[1].split(":")
487 
488  first_residues=first_series.replace(";","|").split("|")
489  second_residues=second_series.replace(";","|").split("|")
490  residue_pair_indexes=[]
491  chain_pair_indexes=[]
492  for fpi in first_residues:
493  for spi in second_residues:
494  residue1=self.re.sub("[^0-9]", "", fpi)
495  residue2=self.re.sub("[^0-9]", "", spi)
496  residue_pair_indexes.append((residue1,residue2))
497  chain_pair_indexes.append((chain1,chain2))
498  return residue_pair_indexes, chain_pair_indexes
499 
500 
501 
502 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
503  '''
504  A class to handle different XL format with fixed format
505  currently support ProXL
506  '''
507  def __init__(self,format):
508 
509  _CrossLinkDataBaseStandardKeys.__init__(self)
510  if format == "PROXL":
511  self.format=format
512  else:
513  raise Error("FixedFormatParser: unknown list format name")
514 
515 
516  def get_data(self,input_string):
517  if self.format == "PROXL":
518  tokens=input_string.split("\t")
519  xl={}
520  if tokens[0]=="SEARCH ID(S)":
521  return None
522  else:
523  xl[self.protein1_key]=tokens[2]
524  xl[self.protein2_key]=tokens[4]
525  xl[self.residue1_key]=int(tokens[3])
526  xl[self.residue2_key]=int(tokens[5])
527  return xl
528 
529 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
530  '''
531  this class handles a cross-link dataset and do filtering
532  operations, adding cross-links, merge datasets...
533  '''
534 
535  def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=('K',)):
536  '''
537  Constructor.
538  @param converter an instance of CrossLinkDataBaseKeywordsConverter
539  @param data_base an instance of CrossLinkDataBase to build the new database on
540  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
541  cross-link consistency. If not given consistency will not be checked
542  @param linkable_aa a tuple containing one-letter amino acids which are linkable by the cross-linker;
543  only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
544  and if a fasta_seq is given
545  '''
546 
547  if data_base is None:
548  self.data_base = {}
549  else:
550  self.data_base=data_base
551 
552  _CrossLinkDataBaseStandardKeys.__init__(self)
553  if converter is not None:
554  self.cldbkc = converter #type: CrossLinkDataBaseKeywordsConverter
555  self.list_parser=self.cldbkc.rplp
556  self.converter = converter.get_converter()
557 
558  else:
559  self.cldbkc = None #type: CrossLinkDataBaseKeywordsConverter
560  self.list_parser=None
561  self.converter = None
562 
563  # default amino acids considered to be 'linkable' if none are given
564  self.def_aa_tuple = linkable_aa
565  self.fasta_seq = fasta_seq #type: IMP.pmi.topology.Sequences
566  self.dataset = None
567  self.name = None
568  self._update()
569 
570  def _update(self):
571  '''
572  Update the whole dataset after changes
573  '''
574  self.update_cross_link_unique_sub_index()
575  self.update_cross_link_redundancy()
576  self.update_residues_links_number()
578 
579 
580  def __iter__(self):
581  sorted_ids=sorted(self.data_base.keys())
582  for k in sorted_ids:
583  for xl in self.data_base[k]:
584  yield xl
585 
586  def xlid_iterator(self):
587  sorted_ids=sorted(self.data_base.keys())
588  for xlid in sorted_ids:
589  yield xlid
590 
591  def __getitem__(self,xlid):
592  return self.data_base[xlid]
593 
594  def __len__(self):
595  return len([xl for xl in self])
596 
597  def get_name(self):
598  return self.name
599 
600  def set_name(self,name):
601  new_data_base={}
602  for k in self.data_base:
603  new_data_base[k+"."+name]=self.data_base[k]
604  self.data_base=new_data_base
605  self.name=name
606  self._update()
607 
608  def get_number_of_xlid(self):
609  return len(self.data_base)
610 
611 
612  def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
613  '''
614  if FixedFormatParser is not specified, the file is comma-separated-values
615  @param file_name a txt file to be parsed
616  @param converter an instance of CrossLinkDataBaseKeywordsConverter
617  @param FixedFormatParser a parser for a fixed format
618  '''
619  if not FixedFormatParser:
620  xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
621 
622  if converter is not None:
623  self.cldbkc = converter
624  self.list_parser=self.cldbkc.rplp
625  self.converter = converter.get_converter()
626 
627 
628  if not self.list_parser:
629  # normal procedure without a list_parser
630  # each line is a cross-link
631  new_xl_dict={}
632  for nxl,xl in enumerate(xl_list):
633  new_xl={}
634  for k in xl:
635  if k in self.converter:
636  new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
637  else:
638  new_xl[k]=xl[k]
639  if self.unique_id_key in self.cldbkc.get_setup_keys():
640  if new_xl[self.unique_id_key] not in new_xl_dict:
641  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
642  else:
643  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
644  else:
645  if str(nxl) not in new_xl_dict:
646  new_xl_dict[str(nxl)]=[new_xl]
647  else:
648  new_xl_dict[str(nxl)].append(new_xl)
649 
650  else:
651  # with a list_parser, a line can be a list of ambiguous cross-links
652  new_xl_dict={}
653  for nxl,entry in enumerate(xl_list):
654 
655  # first get the translated keywords
656  new_dict={}
657  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
658  raise Error("CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
659  for k in entry:
660  if k in self.converter:
661  new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
662  else:
663  new_dict[k]=entry[k]
664 
665  residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
666 
667  # then create the cross-links
668  for n,p in enumerate(residue_pair_list):
669  is_monolink=False
670  if len(p)==1:
671  is_monolink=True
672 
673  new_xl={}
674  for k in new_dict:
675  new_xl[k]=new_dict[k]
676  new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
677  if not is_monolink:
678  new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
679 
680  if len(chain_pair_list)==len(residue_pair_list):
681  new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
682  if not is_monolink:
683  new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
684 
685  if not is_monolink:
686  new_xl[self.link_type_key]="CROSSLINK"
687  else:
688  new_xl[self.link_type_key]="MONOLINK"
689 
690  if self.unique_id_key in self.cldbkc.get_setup_keys():
691  if new_xl[self.unique_id_key] not in new_xl_dict:
692  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
693  else:
694  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
695  else:
696  if str(nxl) not in new_xl_dict:
697  new_xl[self.unique_id_key]=str(nxl+1)
698  new_xl_dict[str(nxl)]=[new_xl]
699  else:
700  new_xl[self.unique_id_key]=str(nxl+1)
701  new_xl_dict[str(nxl)].append(new_xl)
702 
703 
704  else:
705  '''
706  if FixedFormatParser is defined
707  '''
708 
709  new_xl_dict={}
710  f=open(file_name,"r")
711  nxl=0
712  for line in f:
713  xl=FixedFormatParser.get_data(line)
714  if xl:
715  xl[self.unique_id_key]=str(nxl+1)
716  new_xl_dict[str(nxl)]=[xl]
717  nxl+=1
718 
719 
720  self.data_base=new_xl_dict
721  self.name=file_name
722  l = ihm.location.InputFileLocation(file_name, details='Crosslinks')
723  self.dataset = ihm.dataset.CXMSDataset(l)
724  self._update()
725 
726  def update_cross_link_unique_sub_index(self):
727  for k in self.data_base:
728  for n,xl in enumerate(self.data_base[k]):
729  xl[self.ambiguity_key]=len(self.data_base[k])
730  xl[self.unique_sub_index_key]=n+1
731  xl[self.unique_sub_id_key]=k+"."+str(n+1)
732 
733  def update_cross_link_redundancy(self):
734  redundancy_data_base={}
735  for xl in self:
736  pra=_ProteinsResiduesArray(xl)
737  if pra not in redundancy_data_base:
738  redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
739  redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
740  else:
741  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
742  redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
743  for xl in self:
744  pra=_ProteinsResiduesArray(xl)
745  xl[self.redundancy_key]=len(redundancy_data_base[pra])
746  xl[self.redundancy_list_key]=redundancy_data_base[pra]
747 
748  def update_residues_links_number(self):
749  residue_links={}
750  for xl in self:
751  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
752  if (p1,r1) not in residue_links:
753  residue_links[(p1,r1)]=set([(p2,r2)])
754  else:
755  residue_links[(p1,r1)].add((p2,r2))
756  if (p2,r2) not in residue_links:
757  residue_links[(p2,r2)]=set([(p1,r1)])
758  else:
759  residue_links[(p2,r2)].add((p1,r1))
760 
761  for xl in self:
762  (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
763  xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
764  xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
765 
767  """This function checks the consistency of the dataset with the amino acid sequence"""
768  if self.cldbkc and self.fasta_seq:
769  cnt_matched, cnt_matched_file = 0, 0
770  matched = {}
771  non_matched = {}
772  for xl in self:
773  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
774  b_matched_file = False
775  if self.residue1_amino_acid_key in xl:
776  # either you know the residue type and aa_tuple is a single entry
777  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
778  b_matched = self._match_xlinks(p1, r1, aa_from_file)
779  b_matched_file = b_matched
780  else:
781  # or pass the possible list of types that can be cross-linked
782  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
783 
784  matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
785 
786  if self.residue2_amino_acid_key in xl:
787  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
788  b_matched = self._match_xlinks(p2, r2, aa_from_file)
789  b_matched_file = b_matched
790  else:
791  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
792 
793  matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
794  if b_matched: cnt_matched += 1
795  if b_matched_file: cnt_matched_file += 1
796  if len(self) > 0:
797  percentage_matched = round(100*cnt_matched/len(self),1)
798  percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
799  #if matched: print "Matched xlinks:", matched
800  if matched or non_matched: print("check_cross_link_consistency: Out of %d cross-links "
801  "%d were matched to the fasta sequence (%f %%).\n "
802  "%d were matched by using the cross-link file (%f %%)."%
803  (len(self),cnt_matched,percentage_matched,cnt_matched_file,
804  percentage_matched_file) )
805  if non_matched: print("check_cross_link_consistency: Warning: Non matched xlinks:",
806  [(prot_name, sorted(list(non_matched[prot_name]))) for prot_name in non_matched])
807  return matched,non_matched
808 
809  def _match_xlinks(self, prot_name, res_index, aa_tuple):
810  # returns Boolean whether given aa matches a position in the fasta file
811  # cross-link files usually start counting at 1 and not 0; therefore subtract -1 to compare with fasta
812  amino_dict = IMP.pmi.alphabets.amino_acid
813  res_index -= 1
814  for amino_acid in aa_tuple:
815  if len(amino_acid) == 3:
816  amino_acid = amino_dict.get_one_letter_code_from_residue_type(
817  amino_acid.upper())
818  if prot_name in self.fasta_seq.sequences:
819  seq = self.fasta_seq.sequences[prot_name]
820  # if we are looking at the first amino acid in a given sequence always return a match
821  # the first aa should always be the n-terminal aa
822  # which may form a cross-link in any case (for BS3-like cross-linkers)
823  # for some data sets the first aa is at position 1; todo: check why this is the case
824  if res_index == 0 or res_index == 1:
825  return True
826  if res_index < len(seq):
827  if amino_acid == seq[res_index]:
828  return True
829  # else:
830  # print "Could not match", prot, res+1, amino_acid, seq[res]
831  return False
832 
833  def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
834  if b_matched:
835  if prot in matched:
836  matched[prot].add(res)
837  else:
838  matched[prot] = set([res])
839  else:
840  if prot in non_matched:
841  non_matched[prot].add(res)
842  else:
843  non_matched[prot] = set([res])
844  return matched, non_matched
845 
846 
847  def get_cross_link_string(self,xl):
848  string='|'
849  for k in self.ordered_key_list:
850  try:
851  string+=str(k)+":"+str(xl[k])+"|"
852  except KeyError:
853  continue
854 
855  for k in xl:
856  if k not in self.ordered_key_list:
857  string+=str(k)+":"+str(xl[k])+"|"
858 
859  return string
860 
861  def get_short_cross_link_string(self,xl):
862 
863  string='|'
864  list_of_keys=[self.data_set_name_key,
865  self.unique_sub_id_key,
866  self.protein1_key,
867  self.residue1_key,
868  self.protein2_key,
869  self.residue2_key,
870  self.state_key,
871  self.psi_key]
872 
873  for k in list_of_keys:
874  try:
875  string+=str(xl[k])+"|"
876  except KeyError:
877  continue
878 
879  return string
880 
881  def filter(self,FilterOperator):
882  new_xl_dict={}
883  for id in self.data_base.keys():
884  for xl in self.data_base[id]:
885  if FilterOperator.evaluate(xl):
886  if id not in new_xl_dict:
887  new_xl_dict[id]=[xl]
888  else:
889  new_xl_dict[id].append(xl)
890  self._update()
891  cdb = CrossLinkDataBase(self.cldbkc,new_xl_dict)
892  cdb.dataset = self.dataset
893  return cdb
894 
895  def filter_score(self,score):
896  '''Get all cross-links with score greater than an input value'''
897  FilterOperator=IMP.pmi.io.crosslink.FilterOperator(self.id_score_key,operator.gt,score)
898  return self.filter(FilterOperator)
899 
900  def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
901  '''
902  This function merges two cross-link datasets so that if two conflicting cross-links have the same
903  cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
904  with different SubIDs
905  '''
906  raise NotImplementedError()
907 
908  def append_database(self, db):
909  """Append cross-link dataset to this one."""
910  new_data_base = {}
911  for k in self.data_base:
912  new_data_base[k] = self.data_base[k]
913  for k in db.data_base:
914  new_data_base[k] = db.data_base[k]
915  self.data_base = new_data_base
916  self._update()
917 
918  def __iadd__(self, db):
919  self.append_database(db)
920  return self
921 
922  def set_value(self, key, new_value, filter_operator=None):
923  '''
924  This function changes the value for a given key in the database
925  For instance one can change the name of a protein
926  @param key: the key in the database that must be changed
927  @param new_value: the new value of the key
928  @param filter_operator: optional FilterOperator to change the value to
929  a subset of the database
930 
931  example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
932  '''
933 
934  for xl in self:
935  if filter_operator is not None:
936  if filter_operator.evaluate(xl):
937  xl[key]=new_value
938  else:
939  xl[key]=new_value
940  self._update()
941 
942  def get_values(self,key):
943  '''
944  this function returns the list of values for a given key in the database
945  alphanumerically sorted
946  '''
947  values=set()
948  for xl in self:
949  values.add(xl[key])
950  return sorted(list(values))
951 
952  def offset_residue_index(self,protein_name,offset):
953  '''
954  This function offset the residue indexes of a given protein by a specified value
955  @param protein_name: the protein name that need to be changed
956  @param offset: the offset value
957  '''
958 
959  for xl in self:
960  if xl[self.protein1_key] == protein_name:
961  xl[self.residue1_key]=xl[self.residue1_key]+offset
962  if xl[self.protein2_key] == protein_name:
963  xl[self.residue2_key]=xl[self.residue2_key]+offset
964  self._update()
965 
966  def create_new_keyword(self,keyword,values_from_keyword=None):
967  '''
968  This function creates a new keyword for the whole database and set the values from
969  and existing keyword (optional), otherwise the values are set to None
970  @param keyword the new keyword name:
971  @param values_from_keyword the keyword from which we are copying the values:
972  '''
973  for xl in self:
974  if values_from_keyword is not None:
975  xl[keyword] = xl[values_from_keyword]
976  else:
977  xl[keyword] = None
978  self._update()
979 
980  def rename_proteins(self,old_to_new_names_dictionary, protein_to_rename="both"):
981  '''
982  This function renames all proteins contained in the input dictionary
983  from the old names (keys) to the new name (values)
984  @param old_to_new_names_dictionary dictionary for converting old to new names
985  @param protein_to_rename specify whether to rename both or protein1 or protein2 only
986  '''
987 
988  for old_name in old_to_new_names_dictionary:
989  new_name=old_to_new_names_dictionary[old_name]
990  if protein_to_rename == "both" or protein_to_rename == "protein1":
991  fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
992  self.set_value(self.protein1_key,new_name,fo2)
993  if protein_to_rename == "both" or protein_to_rename == "protein2":
994  fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
995  self.set_value(self.protein2_key,new_name,fo2)
996 
997  def classify_crosslinks_by_score(self,number_of_classes):
998  '''
999  This function creates as many classes as in the input (number_of_classes: integer)
1000  and partition cross-links according to their identification scores. Classes are defined in the psi key.
1001  '''
1002 
1003  if self.id_score_key is not None:
1004  scores=self.get_values(self.id_score_key)
1005  else:
1006  raise ValueError('The cross-link database does not contain score values')
1007  minscore=min(scores)
1008  maxscore=max(scores)
1009  scoreclasses=numpy.linspace(minscore, maxscore, number_of_classes+1)
1010  if self.psi_key is None:
1011  self.create_new_keyword(self.psi_key,values_from_keyword=None)
1012  for xl in self:
1013  score=xl[self.id_score_key]
1014  for n,classmin in enumerate(scoreclasses[0:-1]):
1015  if score>=classmin and score<=scoreclasses[n+1]:
1016  xl[self.psi_key]=str("CLASS_"+str(n))
1017  self._update()
1018 
1019  def clone_protein(self,protein_name,new_protein_name):
1020  new_xl_dict={}
1021  for id in self.data_base.keys():
1022  new_data_base=[]
1023  for xl in self.data_base[id]:
1024  new_data_base.append(xl)
1025  if xl[self.protein1_key]==protein_name and xl[self.protein2_key]!=protein_name:
1026  new_xl=dict(xl)
1027  new_xl[self.protein1_key]=new_protein_name
1028  new_data_base.append(new_xl)
1029  elif xl[self.protein1_key]!=protein_name and xl[self.protein2_key]==protein_name:
1030  new_xl=dict(xl)
1031  new_xl[self.protein2_key]=new_protein_name
1032  new_data_base.append(new_xl)
1033  elif xl[self.protein1_key]==protein_name and xl[self.protein2_key]==protein_name:
1034  new_xl=dict(xl)
1035  new_xl[self.protein1_key]=new_protein_name
1036  new_data_base.append(new_xl)
1037  new_xl=dict(xl)
1038  new_xl[self.protein2_key]=new_protein_name
1039  new_data_base.append(new_xl)
1040  new_xl=dict(xl)
1041  new_xl[self.protein1_key]=new_protein_name
1042  new_xl[self.protein2_key]=new_protein_name
1043  new_data_base.append(new_xl)
1044  self.data_base[id]=new_data_base
1045  self._update()
1046 
1048  '''
1049  This function remove cross-links applied to the same residue
1050  (ie, same chain name and residue number)
1051  '''
1052  new_xl_dict={}
1053  for id in self.data_base.keys():
1054  new_data_base=[]
1055  for xl in self.data_base[id]:
1056  if xl[self.protein1_key]==xl[self.protein2_key] and xl[self.residue1_key]==xl[self.residue2_key]:
1057  continue
1058  else:
1059  new_data_base.append(xl)
1060  self.data_base[id]=new_data_base
1061  self._update()
1062 
1063 
1064  def jackknife(self,percentage):
1065  '''
1066  this method returns a CrossLinkDataBase class containing
1067  a random subsample of the original cross-link database.
1068  @param percentage float between 0 and 1, is the percentage of
1069  of spectra taken from the original list
1070  '''
1071  import random
1072  if percentage > 1.0 or percentage < 0.0:
1073  raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
1074  nspectra=self.get_number_of_xlid()
1075  nrandom_spectra=int(nspectra*percentage)
1076  random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1077  new_data_base={}
1078  for k in random_keys:
1079  new_data_base[k]=self.data_base[k]
1080  return CrossLinkDataBase(self.cldbkc,new_data_base)
1081 
1082  def __str__(self):
1083  outstr=''
1084  sorted_ids=sorted(self.data_base.keys())
1085 
1086  for id in sorted_ids:
1087  outstr+=id+"\n"
1088  for xl in self.data_base[id]:
1089  for k in self.ordered_key_list:
1090  try:
1091  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1092  except KeyError:
1093  continue
1094 
1095  for k in xl:
1096  if k not in self.ordered_key_list:
1097  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
1098  outstr+="-------------\n"
1099  return outstr
1100 
1101 
1102  def plot(self,filename,**kwargs):
1103  import matplotlib
1104  matplotlib.use('Agg')
1105  import matplotlib.pyplot as plt
1106  import matplotlib.colors
1107 
1108 
1109 
1110  if kwargs["type"] == "scatter":
1111  cmap=plt.get_cmap("rainbow")
1112  norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1113  xkey=kwargs["xkey"]
1114  ykey=kwargs["ykey"]
1115  if "colorkey" in kwargs:
1116  colorkey=kwargs["colorkey"]
1117  if "sizekey" in kwargs:
1118  sizekey=kwargs["sizekey"]
1119  if "logyscale" in kwargs:
1120  logyscale=kwargs["logyscale"]
1121  else:
1122  logyscale=False
1123  xs=[]
1124  ys=[]
1125  colors=[]
1126  for xl in self:
1127  try:
1128  xs.append(float(xl[xkey]))
1129  if logyscale:
1130  ys.append(math.log10(float(xl[ykey])))
1131  else:
1132  ys.append(float(xl[ykey]))
1133  colors.append(float(xl[colorkey]))
1134  except ValueError:
1135  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1136  continue
1137 
1138  cs=[]
1139  for color in colors:
1140  cindex=(color-min(colors))/(max(colors)-min(colors))
1141  cs.append(cmap(cindex))
1142 
1143  fig = plt.figure()
1144  ax = fig.add_subplot(111)
1145  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker="o")
1146  ax.set_xlabel(xkey)
1147  ax.set_ylabel(ykey)
1148  plt.savefig(filename)
1149  plt.show()
1150  plt.close()
1151 
1152  if kwargs["type"] == "residue_links":
1153  #plot the number of distinct links to a residue
1154  #in an histogram
1155  #molecule name
1156  molecule=kwargs["molecule"]
1157  if type(molecule) is IMP.pmi.topology.Molecule:
1158  length=len(molecule.sequence)
1159  molecule=molecule.get_name()
1160  else:
1161  #you need a IMP.pmi.topology.Sequences object
1162  sequences_object=kwargs["sequences_object"]
1163  sequence=sequences_object.sequences[molecule]
1164  length=len(sequence)
1165 
1166  histogram=[0]*length
1167  for xl in self:
1168  if xl[self.protein1_key]==molecule:
1169  histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1170  if xl[self.protein2_key]==molecule:
1171  histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1172  rects = plt.bar(range(1,length+1), histogram)
1173  #bar_width,
1174  #alpha=opacity,
1175  #color='b',
1176  #yerr=std_men,
1177  #error_kw=error_config,
1178  #label='Men')
1179  plt.savefig(filename)
1180  plt.show()
1181  plt.close()
1182 
1183 
1184 
1185  if kwargs["type"] == "histogram":
1186  valuekey=kwargs["valuekey"]
1187  reference_xline=kwargs["reference_xline"]
1188  valuename=valuekey
1189  bins=kwargs["bins"]
1190  values_list=[]
1191  for xl in self:
1192  try:
1193  values_list.append(float(xl[valuekey]))
1194  except ValueError:
1195  print("CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1196  continue
1198  filename, [values_list], valuename=valuename, bins=bins,
1199  colors=None, format="pdf",
1200  reference_xline=None, yplotrange=None,
1201  xplotrange=None,normalized=True,
1202  leg_names=None)
1203 
1204  def dump(self,json_filename):
1205  import json
1206  with open(json_filename, 'w') as fp:
1207  json.dump(self.data_base, fp, sort_keys=True, indent=2, default=set_json_default)
1208 
1209  def load(self,json_filename):
1210  import json
1211  with open(json_filename, 'r') as fp:
1212  self.data_base = json.load(fp)
1213  self._update()
1214  #getting rid of unicode
1215  # (can't do this in Python 3, since *everything* is Unicode there)
1216  if sys.version_info[0] < 3:
1217  for xl in self:
1218  for k,v in xl.iteritems():
1219  if type(k) is unicode: k=str(k)
1220  if type(v) is unicode: v=str(v)
1221  xl[k]=v
1222 
1223  def save_csv(self,filename):
1224 
1225  import csv
1226 
1227  data=[]
1228  sorted_ids=None
1229  sorted_group_ids=sorted(self.data_base.keys())
1230  for group in sorted_group_ids:
1231  group_block=[]
1232  for xl in self.data_base[group]:
1233  if not sorted_ids:
1234  sorted_ids=sorted(xl.keys())
1235  values=[xl[k] for k in sorted_ids]
1236  group_block.append(values)
1237  data+=group_block
1238 
1239 
1240  with open(filename, 'w') as fp:
1241  a = csv.writer(fp, delimiter=',')
1242  a.writerow(sorted_ids)
1243  a.writerows(data)
1244 
1246  """
1247  Returns the number of non redundant cross-link sites
1248  """
1249  praset=set()
1250  for xl in self:
1251  pra=_ProteinsResiduesArray(xl)
1252  prai=pra.get_inverted()
1253  praset.add(pra)
1254  praset.add(prai)
1255  return len(list(praset))
1256 
1258  """This class allows to compute and plot the distance between datasets"""
1259 
1260  def __init__(self,cldb_dictionary):
1261  """Input a dictionary where keys are cldb names and values are cldbs"""
1262  import scipy.spatial.distance
1263  self.dbs=cldb_dictionary
1264  self.keylist=list(self.dbs.keys())
1265  self.distances=list()
1266 
1267 
1268  for i,key1 in enumerate(self.keylist):
1269  for key2 in self.keylist[i+1:]:
1270  distance=self.get_distance(key1,key2)
1271  self.distances.append(distance)
1272 
1273  self.distances=scipy.spatial.distance.squareform(self.distances)
1274 
1275  def get_distance(self,key1,key2):
1276  return 1.0-self.jaccard_index(self.dbs[key1],self.dbs[key2])
1277 
1278  def jaccard_index(self,CrossLinkDataBase1,CrossLinkDataBase2):
1279  """Similarity index between two datasets
1280  https://en.wikipedia.org/wiki/Jaccard_index"""
1281 
1282  set1=set()
1283  set2=set()
1284  for xl1 in CrossLinkDataBase1:
1285  a1f=_ProteinsResiduesArray(xl1)
1286  a1b=a1f.get_inverted()
1287  set1.add(a1f)
1288  set1.add(a1b)
1289  for xl2 in CrossLinkDataBase2:
1290  a2f=_ProteinsResiduesArray(xl2)
1291  a2b=a2f.get_inverted()
1292  set2.add(a2f)
1293  set2.add(a2b)
1294  return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1295 
1296  def plot_matrix(self,figurename="clustermatrix.pdf"):
1297  import matplotlib as mpl
1298  import numpy
1299  mpl.use('Agg')
1300  import matplotlib.pylab as pl
1301  from scipy.cluster import hierarchy as hrc
1302 
1303  raw_distance_matrix=self.distances
1304  labels=self.keylist
1305 
1306  fig = pl.figure()
1307  #fig.autolayout=True
1308 
1309  ax = fig.add_subplot(1,1,1)
1310  dendrogram = hrc.dendrogram(
1311  hrc.linkage(raw_distance_matrix),
1312  color_threshold=7,
1313  no_labels=True)
1314  leaves_order = dendrogram['leaves']
1315  ax.set_xlabel('Dataset')
1316  ax.set_ylabel('Jaccard Distance')
1317  pl.tight_layout()
1318  pl.savefig("dendrogram."+figurename, dpi=300)
1319  pl.close(fig)
1320 
1321  fig = pl.figure()
1322  #fig.autolayout=True
1323 
1324  ax = fig.add_subplot(1,1,1)
1325  cax = ax.imshow(
1326  raw_distance_matrix[leaves_order,
1327  :][:,
1328  leaves_order],
1329  interpolation='nearest')
1330  cb = fig.colorbar(cax)
1331  cb.set_label('Jaccard Distance')
1332  ax.set_xlabel('Dataset')
1333  ax.set_ylabel('Dataset')
1334  ax.set_xticks(range(len(labels)))
1335  ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation='vertical')
1336  ax.set_yticks(range(len(labels)))
1337  ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation='horizontal')
1338  pl.tight_layout()
1339  pl.savefig("matrix."+figurename, dpi=300)
1340  pl.close(fig)
1341 
1342 
1344  '''
1345  This class maps a CrossLinkDataBase on a given structure
1346  and save an rmf file with color-coded cross-links
1347  '''
1348 
1349 
1350  def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1351  self.CrossLinkDataBase=CrossLinkDataBase
1352  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1353  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1354  self.prots=rmf_or_stat_handler
1355  self.distances=defaultdict(list)
1356  self.array_to_id={}
1357  self.id_to_array={}
1358 
1359  print("computing distances fro all cross-links and all structures")
1360  for i in self.prots[::10]:
1361  self.compute_distances()
1362  for key,xl in enumerate(self.CrossLinkDataBase):
1363  array=_ProteinsResiduesArray(xl)
1364  self.array_to_id[array]=key
1365  self.id_to_array[key]=array
1366  if xl["MinAmbiguousDistance"] != 'None':
1367  self.distances[key].append(xl["MinAmbiguousDistance"])
1368 
1369  def compute_distances(self):
1370  data=[]
1371  sorted_ids=None
1372  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1373  for group in sorted_group_ids:
1374  #group_block=[]
1375  group_dists=[]
1376  for xl in self.CrossLinkDataBase.data_base[group]:
1377  if not sorted_ids:
1378  sorted_ids=sorted(xl.keys())
1379  data.append(sorted_ids+["UniqueID","Distance","MinAmbiguousDistance"])
1380  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1381  try:
1382  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1383  except:
1384  mdist="None"
1385  state1="None"
1386  copy1="None"
1387  state2="None"
1388  copy2="None"
1389  try:
1390  # sometimes keys get "lost" in the database, not really sure why
1391  values=[xl[k] for k in sorted_ids]
1392  values += [group, mdist]
1393  except KeyError as e:
1394  print("MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1395  group_dists.append(mdist)
1396  #group_block.append(values)
1397  xl["Distance"]=mdist
1398  xl["State1"]=state1
1399  xl["Copy1"]=copy1
1400  xl["State2"]=state2
1401  xl["Copy2"]=copy2
1402 
1403  for xl in self.CrossLinkDataBase.data_base[group]:
1404  xl["MinAmbiguousDistance"]=min(group_dists)
1405 
1406  def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1407  '''more robust and slower version of above'''
1408  sel=IMP.atom.Selection(self.prots,molecule=c1,residue_index=r1,resolution=1)
1409  selpart_1=sel.get_selected_particles()
1410  if len(selpart_1)==0:
1411  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1412  return None
1413  sel=IMP.atom.Selection(self.prots,molecule=c2,residue_index=r2,resolution=1)
1414  selpart_2=sel.get_selected_particles()
1415  if len(selpart_2)==0:
1416  print("MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1417  return None
1418  results=[]
1419  for p1 in selpart_1:
1420  for p2 in selpart_2:
1421  if p1 == p2 and r1 == r2: continue
1422  d1=IMP.core.XYZ(p1)
1423  d2=IMP.core.XYZ(p2)
1424  #round distance to second decimal
1425  dist=float(int(IMP.core.get_distance(d1,d2)*100.0))/100.0
1426  h1=IMP.atom.Hierarchy(p1)
1427  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1428  h1=h1.get_parent()
1429  copy_index1=IMP.atom.Copy(h1).get_copy_index()
1430  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1431  h1=h1.get_parent()
1432  state_index1=IMP.atom.State(h1).get_state_index()
1433  h2=IMP.atom.Hierarchy(p2)
1434  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1435  h2=h2.get_parent()
1436  copy_index2=IMP.atom.Copy(h2).get_copy_index()
1437  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1438  h2=h2.get_parent()
1439  state_index2=IMP.atom.State(h2).get_state_index()
1440 
1441  results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1442  if len(results)==0: return None
1443  results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1444  return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1445 
1446  def save_rmf_snapshot(self,filename,color_id=None):
1447  sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1448  list_of_pairs=[]
1449  color_scores=[]
1450  for group in sorted_group_ids:
1451  group_dists_particles=[]
1452  for xl in self.CrossLinkDataBase.data_base[group]:
1453  xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1454  (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1455  try:
1456  (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1457  except TypeError:
1458  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1459  continue
1460  if color_id is not None:
1461  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1462  else:
1463  group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1464  if group_dists_particles:
1465  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1466  color_scores.append(mincolor_score)
1467  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1468  else:
1469  continue
1470 
1471  linear = IMP.core.Linear(0, 0.0)
1472  linear.set_slope(1.0)
1473  dps2 = IMP.core.DistancePairScore(linear)
1474  rslin = IMP.RestraintSet(self.prots.get_model(), 'linear_dummy_restraints')
1475  sgs=[]
1476  offset=min(color_scores)
1477  maxvalue=max(color_scores)
1478  for pair in list_of_pairs:
1479  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2, (pair[0], pair[1]))
1480  pr.set_name(pair[2])
1481  if offset<maxvalue:
1482  factor=(pair[3]-offset)/(maxvalue-offset)
1483  else:
1484  factor=0.0
1485  c=IMP.display.get_rgb_color(factor)
1486  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1487  rslin.add_restraint(pr)
1488  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1489 
1490  rh = RMF.create_rmf_file(filename)
1491  IMP.rmf.add_hierarchies(rh, [self.prots])
1492  IMP.rmf.add_restraints(rh,[rslin])
1493  IMP.rmf.add_geometries(rh, sgs)
1494  IMP.rmf.save_frame(rh)
1495  del rh
1496 
1497  def boxplot_crosslink_distances(self,filename):
1498  import numpy
1499  keys=[self.id_to_array[k] for k in self.distances.keys()]
1500  medians=[numpy.median(self.distances[self.array_to_id[k]]) for k in keys]
1501  dists=[self.distances[self.array_to_id[k]] for k in keys]
1502  distances_sorted_by_median=[x for _,x in sorted(zip(medians,dists))]
1503  keys_sorted_by_median=[x for _,x in sorted(zip(medians,keys))]
1505  distances_sorted_by_median,
1506  range(len(distances_sorted_by_median)),
1507  xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1508 
1509  def get_crosslink_violations(self,threshold):
1510  violations=defaultdict(list)
1511  for k in self.distances:
1512  #viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1513  viols=self.distances[k]
1514  violations[self.id_to_array[k]]=viols
1515  return violations
1516 
1517 
1519  '''
1520  This class generates a CrossLinkDataBase from a given structure
1521  '''
1522  def __init__(self, system, residue_types_1=["K"],
1523  residue_types_2=["K"], reactivity_range=[0,1], kt=1.0):
1524 
1525  import numpy.random
1526  import math
1528  cldbkc.set_protein1_key("Protein1")
1529  cldbkc.set_protein2_key("Protein2")
1530  cldbkc.set_residue1_key("Residue1")
1531  cldbkc.set_residue2_key("Residue2")
1532  self.cldb=CrossLinkDataBase(cldbkc)
1533  self.system=system
1534  self.model=self.system.model
1535  self.residue_types_1=residue_types_1
1536  self.residue_types_2=residue_types_2
1537  self.kt=kt
1538  self.indexes_dict1={}
1539  self.indexes_dict2={}
1540  self.protein_residue_dict={}
1541  self.reactivity_dictionary={}
1542  self.euclidean_interacting_pairs=None
1543  self.xwalk_interacting_pairs=None
1544  import random
1545 
1546  for state in self.system.get_states():
1547  for moleculename,molecules in state.get_molecules().items():
1548  for molecule in molecules:
1549  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1550  seq=molecule.sequence
1551  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1552 
1553  for r in residues:
1554  # uniform random reactivities
1555  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1556  # getting reactivities from the CDF of an exponential distribution
1557  rexp=numpy.random.exponential(0.00000001)
1558  prob=1.0-math.exp(-rexp)
1559  self.reactivity_dictionary[(molecule,r)]=prob
1560 
1561  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1562  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1563  for r in residues1:
1564  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1565  ps=s.get_selected_particles()
1566  for p in ps:
1567  index=p.get_index()
1568  self.indexes_dict1[index]=(molecule,r)
1569  self.protein_residue_dict[(molecule,r)]=index
1570  for r in residues2:
1571  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1572  ps=s.get_selected_particles()
1573  for p in ps:
1574  index=p.get_index()
1575  self.indexes_dict2[index]=(molecule,r)
1576  self.protein_residue_dict[(molecule,r)]=index
1577 
1578 
1579  def get_all_possible_pairs(self):
1580  n=float(len(self.protein_residue_dict.keys()))
1581  return n*(n-1.0)/2.0
1582 
1583  def get_all_feasible_pairs(self,distance=21):
1584  import itertools
1585  particle_index_pairs=[]
1586  nxl=0
1587  for a,b in itertools.combinations(self.protein_residue_dict.keys(),2):
1588 
1589  new_xl={}
1590  index1=self.protein_residue_dict[a]
1591  index2=self.protein_residue_dict[b]
1592  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1593  if particle_distance <= distance:
1594  particle_index_pairs.append((index1,index2))
1595  new_xl[self.cldb.protein1_key]=a[0].get_name()
1596  new_xl[self.cldb.protein2_key]=b[0].get_name()
1597  new_xl["molecule_object1"]=a[0]
1598  new_xl["molecule_object2"]=b[0]
1599  new_xl[self.cldb.residue1_key]=a[1]
1600  new_xl[self.cldb.residue2_key]=b[1]
1601  self.cldb.data_base[str(nxl)]=[new_xl]
1602  nxl+=1
1603  self.cldb._update()
1604  return self.cldb
1605 
1606 
1607 
1608 
1609  def get_data_base(self,total_number_of_spectra,
1610  ambiguity_probability=0.1,
1611  noise=0.01,
1612  distance=21,
1613  max_delta_distance=10.0,
1614  xwalk_bin_path=None,
1615  confidence_false=0.75,
1616  confidence_true=0.75):
1617  import math
1618  from random import random,uniform
1619  import numpy as np
1620  number_of_spectra=1
1621 
1622  self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1623  self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1624  self.cldb.data_base[str(number_of_spectra)]=[]
1625  self.sites_weighted=None
1626 
1627  while number_of_spectra<total_number_of_spectra:
1628  if random() > ambiguity_probability and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1629  # new spectrum
1630  number_of_spectra+=1
1631  self.cldb.data_base[str(number_of_spectra)]=[]
1632  noisy=False
1633  if random() > noise:
1634  # not noisy cross-link
1635  pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1636  else:
1637  # noisy cross-link
1638  pra,dist=self.get_random_residue_pair(None,None,None)
1639  noisy=True
1640 
1641  new_xl={}
1642  new_xl[self.cldb.protein1_key]=pra[0].get_name()
1643  new_xl[self.cldb.protein2_key]=pra[1].get_name()
1644  new_xl["molecule_object1"]=pra[0]
1645  new_xl["molecule_object2"]=pra[1]
1646  new_xl[self.cldb.residue1_key]=pra[2]
1647  new_xl[self.cldb.residue2_key]=pra[3]
1648  new_xl["Noisy"]=noisy
1649  # the reactivity is defined as r=1-exp(-k*Delta t)
1650  new_xl["Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1651  new_xl["Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1652  r1=new_xl["Reactivity_Residue1"]
1653  r2=new_xl["Reactivity_Residue2"]
1654  #combined reactivity 1-exp(-k12*Delta t),
1655  # k12=k1*k2/(k1+k2)
1656  #new_xl["Reactivity"]=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1657  if noisy:
1658  #new_xl["Score"]=uniform(-1.0,1.0)
1659  new_xl["Score"]=np.random.beta(1.0,self.beta_false)
1660  else:
1661  #new_xl["Score"]=new_xl["Reactivity"]+uniform(0.0,2.0)
1662  new_xl["Score"]=1.0-np.random.beta(1.0,self.beta_true)
1663  new_xl["TargetDistance"]=dist
1664  new_xl["NoiseProbability"]=noise
1665  new_xl["AmbiguityProbability"]=ambiguity_probability
1666  # getting if it is intra or inter rigid body
1667  (p1,p2)=IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1668  self.protein_residue_dict[(pra[1],pra[3])]])
1671  IMP.core.RigidMember(p1).get_rigid_body() ==
1672  IMP.core.RigidMember(p2).get_rigid_body()):
1673  new_xl["InterRigidBody"] = False
1674  elif (IMP.core.RigidMember.get_is_setup(p1) and
1676  IMP.core.RigidMember(p1).get_rigid_body() !=
1677  IMP.core.RigidMember(p2).get_rigid_body()):
1678  new_xl["InterRigidBody"] = True
1679  else:
1680  new_xl["InterRigidBody"] = None
1681 
1682  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1683  self.cldb._update()
1684  return self.cldb
1685 
1686 
1687  def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1688  import IMP.pmi.tools
1689  import math
1690  from random import choice,uniform
1691  if distance is None:
1692  # get a random pair
1693  while True:
1694  (protein1,residue1)=choice(self.protein_residue_dict.keys())
1695  (protein2,residue2)=choice(self.protein_residue_dict.keys())
1696  index1=self.protein_residue_dict[(protein1,residue1)]
1697  index2=self.protein_residue_dict[(protein2,residue2)]
1698  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1699  if (protein1,residue1) != (protein2,residue2):
1700  break
1701  else:
1702  # get a pair of residues whose distance is below the threshold
1703  if not xwalk_bin_path:
1705  gcpf.set_distance(distance+max_delta_distance)
1706 
1707  while True:
1708  #setup the reaction rates lists
1709  if not self.sites_weighted:
1710  self.sites_weighted=[]
1711  for key in self.reactivity_dictionary:
1712  r=self.reactivity_dictionary[key]
1713  self.sites_weighted.append((key,r))
1714  #get a random reaction site
1715  first_site=self.weighted_choice(self.sites_weighted)
1716  #get all distances
1717  if not self.euclidean_interacting_pairs:
1718  self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1719  self.indexes_dict1.keys(),
1720  self.indexes_dict2.keys())
1721  #get the partner for the first reacted site
1722  first_site_pairs = [pair for pair in self.euclidean_interacting_pairs
1723  if self.indexes_dict1[pair[0]] == first_site or
1724  self.indexes_dict2[pair[1]] == first_site]
1725  if len(first_site_pairs)==0: continue
1726  #build the list of second reaction sites
1727  second_sites_weighted=[]
1728  for pair in first_site_pairs:
1729  if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1730  if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1731  r=self.reactivity_dictionary[second_site]
1732  second_sites_weighted.append((second_site,r))
1733  second_site=self.weighted_choice(second_sites_weighted)
1734  """
1735  interacting_pairs_weighted=[]
1736  for pair in self.euclidean_interacting_pairs:
1737  r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1738  r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1739  #combined reactivity 1-exp(-k12*Delta t),
1740  # k12=k1*k2/(k1+k2)
1741  #print(r1,r2,dist)
1742  r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1743  interacting_pairs_weighted.append((pair,r12))
1744  #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1745  #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1746  #interacting_pairs_weighted.append((pair,weight1*weight2))
1747 
1748  while True:
1749  pair=self.weighted_choice(interacting_pairs_weighted)
1750  protein1,residue1=self.indexes_dict1[pair[0]]
1751  protein2,residue2=self.indexes_dict2[pair[1]]
1752  particle_pair=IMP.get_particles(self.model,pair)
1753  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1754  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1755  break
1756  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1757  #allow some flexibility
1758  prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1759  if uniform(0.0,1.0)<prob: break
1760  """
1761  protein1,residue1=first_site
1762  protein2,residue2=second_site
1763  print("CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1764  "First site",first_site,self.reactivity_dictionary[first_site],
1765  "Second site",second_site,self.reactivity_dictionary[second_site])
1766  particle_pair=IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1767  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1768 
1769  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1770  break
1771  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1772  #allow some flexibility
1773  #prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1774  #if uniform(0.0,1.0)<prob: break
1775  if particle_distance-distance < max_delta_distance: break
1776 
1777 
1778 
1779  else:
1780  if not self.xwalk_interacting_pairs:
1781  self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1782  interacting_pairs_weighted=[]
1783 
1784  for pair in self.xwalk_interacting_pairs:
1785  protein1=pair[0]
1786  protein2=pair[1]
1787  residue1=pair[2]
1788  residue2=pair[3]
1789  weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1790  weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1791  interacting_pairs_weighted.append((pair,weight1*weight2))
1792 
1793  pair=self.weighted_choice(interacting_pairs_weighted)
1794  protein1=pair[0]
1795  protein2=pair[1]
1796  residue1=pair[2]
1797  residue2=pair[3]
1798  particle_distance=float(pair[4])
1799 
1800 
1801 
1802  return ((protein1,protein2,residue1,residue2)),particle_distance
1803 
1804  def get_xwalk_distances(self,xwalk_bin_path,distance):
1805  import IMP.pmi.output
1806  import os
1807  o=IMP.pmi.output.Output(atomistic=True)
1808  o.init_pdb("xwalk.pdb",self.representation.prot)
1809  o.write_pdb("xwalk.pdb")
1810  namechainiddict=o.dictchain["xwalk.pdb"]
1811  chainiddict={}
1812 
1813  for key in namechainiddict: chainiddict[namechainiddict[key]]=key
1814  xwalkout=os.popen('java -Xmx256m -cp ' + xwalk_bin_path +' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+' -bb').read()
1815 
1816  output_list_of_distance=[]
1817  for line in xwalkout.split("\n")[0:-2]:
1818  tokens=line.split()
1819  first=tokens[2]
1820  second=tokens[3]
1821  distance=float(tokens[6])
1822  fs=first.split("-")
1823  ss=second.split("-")
1824  chainid1=fs[2]
1825  chainid2=ss[2]
1826  protein1=chainiddict[chainid1]
1827  protein2=chainiddict[chainid2]
1828  residue1=int(fs[1])
1829  residue2=int(ss[1])
1830  output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1831  return output_list_of_distance
1832 
1833 
1834  def weighted_choice(self,choices):
1835  import random
1836  # from http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
1837  total = sum(w for c, w in choices)
1838  r = random.uniform(0, total)
1839  upto = 0
1840  for c, w in choices:
1841  if upto + w > r:
1842  return c
1843  upto += w
1844  assert False, "Shouldn't get here"
1845 
1846  def save_rmf_snapshot(self,filename,color_id=None):
1847  import IMP.rmf
1848  import RMF
1849  if color_id is None:
1850  color_id="Reactivity"
1851  sorted_ids=None
1852  sorted_group_ids=sorted(self.cldb.data_base.keys())
1853  list_of_pairs=[]
1854  color_scores=[]
1855  for group in sorted_group_ids:
1856  group_xls=[]
1857  group_dists_particles=[]
1858  for xl in self.cldb.data_base[group]:
1859  xllabel=self.cldb.get_short_cross_link_string(xl)
1860  (c1,c2,r1,r2)=(xl["molecule_object1"],xl["molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1861  try:
1862  index1=self.protein_residue_dict[(c1,r1)]
1863  index2=self.protein_residue_dict[(c2,r2)]
1864  p1,p2=IMP.get_particles(self.model,[index1])[0],IMP.get_particles(self.model,[index2])[0]
1865  mdist=xl["TargetDistance"]
1866  except TypeError:
1867  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1868  continue
1869  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1870  if group_dists_particles:
1871  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1872  color_scores.append(mincolor_score)
1873  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1874  else:
1875  continue
1876 
1877 
1878  m=self.model
1879  linear = IMP.core.Linear(0, 0.0)
1880  linear.set_slope(1.0)
1881  dps2 = IMP.core.DistancePairScore(linear)
1882  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
1883  sgs=[]
1884  offset=min(color_scores)
1885  maxvalue=max(color_scores)
1886  for pair in list_of_pairs:
1887  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
1888  pr.set_name(pair[2])
1889  factor=(pair[3]-offset)/(maxvalue-offset)
1890  c=IMP.display.get_rgb_color(factor)
1891  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1892  rslin.add_restraint(pr)
1893  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1894 
1895  rh = RMF.create_rmf_file(filename)
1896  IMP.rmf.add_hierarchies(rh, [self.system.hier])
1897  IMP.rmf.add_restraints(rh,[rslin])
1898  IMP.rmf.add_geometries(rh, sgs)
1899  IMP.rmf.save_frame(rh)
1900  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1695
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1758
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:731
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:36
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:198
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:19
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:24
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
class to link stat files to several rmf files
Definition: output.py:1216
class to allow more advanced handling of RMF files.
Definition: output.py:1091
Mapping between FASTA one-letter codes and residue types.
Definition: alphabets.py:1
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:29
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.