IMP logo
IMP Reference Guide  develop.50fdd7fa33,2025/08/31
The Integrative Modeling Platform
crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 
8 import IMP
9 import IMP.pmi
10 import IMP.pmi.output
11 import IMP.pmi.alphabets
12 import IMP.atom
13 import IMP.core
14 import IMP.algebra
15 import IMP.rmf
16 import RMF
17 import IMP.display
18 import operator
19 import math
20 import ihm.location
21 import ihm.dataset
22 from collections import defaultdict
23 import numpy
24 
25 
26 # json default serializations
27 def set_json_default(obj):
28  if isinstance(obj, set):
29  return list(obj)
30  if isinstance(obj, IMP.pmi.topology.Molecule):
31  return str(obj)
32  raise TypeError
33 
34 
35 # Handle and return data that must be a string
36 def _handle_string_input(inp):
37  if not isinstance(inp, str):
38  raise TypeError("expecting a string")
39  return inp
40 
41 
42 class _CrossLinkDataBaseStandardKeys:
43  '''
44  This class setup all the standard keys needed to
45  identify the cross-link features from the data sets
46  '''
47  def __init__(self):
48  self.type = {}
49  self.protein1_key = "Protein1"
50  self.type[self.protein1_key] = str
51  self.protein2_key = "Protein2"
52  self.type[self.protein2_key] = str
53  self.residue1_key = "Residue1"
54  self.type[self.residue1_key] = int
55  self.residue2_key = "Residue2"
56  self.type[self.residue2_key] = int
57  self.residue1_amino_acid_key = "Residue1AminoAcid"
58  self.type[self.residue1_amino_acid_key] = str
59  self.residue2_amino_acid_key = "Residue2AminoAcid"
60  self.type[self.residue2_amino_acid_key] = str
61  self.residue1_moiety_key = "Residue1Moiety"
62  self.type[self.residue1_moiety_key] = str
63  self.residue2_moiety_key = "Residue2Moiety"
64  self.type[self.residue2_moiety_key] = str
65  self.site_pairs_key = "SitePairs"
66  self.type[self.site_pairs_key] = str
67  self.unique_id_key = "XLUniqueID"
68  self.type[self.unique_id_key] = str
69  self.unique_sub_index_key = "XLUniqueSubIndex"
70  self.type[self.unique_sub_index_key] = int
71  self.unique_sub_id_key = "XLUniqueSubID"
72  self.type[self.unique_sub_id_key] = str
73  self.data_set_name_key = "DataSetName"
74  self.type[self.data_set_name_key] = str
75  self.cross_linker_chemical_key = "CrossLinkerChemical"
76  self.type[self.cross_linker_chemical_key] = str
77  self.id_score_key = "IDScore"
78  self.type[self.id_score_key] = float
79  self.fdr_key = "FDR"
80  self.type[self.fdr_key] = float
81  self.quantitation_key = "Quantitation"
82  self.type[self.quantitation_key] = float
83  self.redundancy_key = "Redundancy"
84  self.type[self.redundancy_key] = int
85  self.redundancy_list_key = "RedundancyList"
86  self.type[self.redundancy_key] = list
87  self.ambiguity_key = "Ambiguity"
88  self.type[self.ambiguity_key] = int
89  self.residue1_links_number_key = "Residue1LinksNumber"
90  self.type[self.residue1_links_number_key] = int
91  self.residue2_links_number_key = "Residue2LinksNumber"
92  self.type[self.residue2_links_number_key] = int
93  self.type[self.ambiguity_key] = int
94  self.state_key = "State"
95  self.type[self.state_key] = int
96  self.sigma1_key = "Sigma1"
97  self.type[self.sigma1_key] = str
98  self.sigma2_key = "Sigma2"
99  self.type[self.sigma2_key] = str
100  self.psi_key = "Psi"
101  self.type[self.psi_key] = str
102  self.alpha_key = "Alpha"
103  self.type[self.alpha_key] = str
104  self.beta_key = "Beta"
105  self.type[self.beta_key] = str
106  self.distance_key = "Distance"
107  self.type[self.distance_key] = float
108  self.min_ambiguous_distance_key = "MinAmbiguousDistance"
109  self.type[self.distance_key] = float
110  # link types are either Monolink, Intralink or Interlink
111  self.link_type_key = "LinkType"
112  self.type[self.link_type_key] = str
113 
114  self.ordered_key_list = [
115  self.data_set_name_key, self.unique_id_key,
116  self.unique_sub_index_key, self.unique_sub_id_key,
117  self.protein1_key, self.protein2_key, self.residue1_key,
118  self.residue2_key, self.residue1_amino_acid_key,
119  self.residue2_amino_acid_key, self.residue1_moiety_key,
120  self.residue2_moiety_key, self.cross_linker_chemical_key,
121  self.id_score_key, self.fdr_key, self.quantitation_key,
122  self.redundancy_key, self.redundancy_list_key, self.state_key,
123  self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
124  self.min_ambiguous_distance_key, self.link_type_key]
125 
126 
127 class _ProteinsResiduesArray(tuple):
128  '''
129  This class is inherits from tuple, and it is a shorthand for a cross-link
130  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1
131  and protein2, r1 and r2 are residue1 and residue2.
132  '''
133 
134  def __new__(self, input_data):
135  '''
136  @input_data can be a dict or a tuple
137  '''
138  self.cldbsk = _CrossLinkDataBaseStandardKeys()
139  if type(input_data) is dict:
140  monolink = False
141  p1 = input_data[self.cldbsk.protein1_key]
142  try:
143  p2 = input_data[self.cldbsk.protein2_key]
144  except KeyError:
145  monolink = True
146  r1 = input_data[self.cldbsk.residue1_key]
147  try:
148  r2 = input_data[self.cldbsk.residue2_key]
149  except KeyError:
150  monolink = True
151  if not monolink:
152  t = (p1, p2, r1, r2)
153  else:
154  t = (p1, "", r1, None)
155  elif type(input_data) is tuple:
156  if len(input_data) > 4 or len(input_data) == 3 \
157  or len(input_data) == 1:
158  raise TypeError(
159  "_ProteinsResiduesArray: must have only 4 elements")
160  if len(input_data) == 4:
161  p1 = _handle_string_input(input_data[0])
162  p2 = _handle_string_input(input_data[1])
163  r1 = input_data[2]
164  r2 = input_data[3]
165  if (type(r1) is not int) and (r1 is not None):
166  raise TypeError(
167  "_ProteinsResiduesArray: residue1 must be a integer")
168  if (type(r2) is not int) and (r1 is not None):
169  raise TypeError(
170  "_ProteinsResiduesArray: residue2 must be a integer")
171  t = (p1, p2, r1, r2)
172  if len(input_data) == 2:
173  p1 = _handle_string_input(input_data[0])
174  r1 = input_data[1]
175  if type(r1) is not int:
176  raise TypeError(
177  "_ProteinsResiduesArray: residue1 must be a integer")
178  t = (p1, "", r1, None)
179  else:
180  raise TypeError(
181  "_ProteinsResiduesArray: input must be a dict or tuple")
182  return tuple.__new__(_ProteinsResiduesArray, t)
183 
184  def get_inverted(self):
185  '''
186  Returns a _ProteinsResiduesArray instance with protein1 and
187  protein2 inverted
188  '''
189  return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
190 
191  def __repr__(self):
192  outstr = self.cldbsk.protein1_key + " " + str(self[0])
193  outstr += " " + self.cldbsk.protein2_key + " " + str(self[1])
194  outstr += " " + self.cldbsk.residue1_key + " " + str(self[2])
195  outstr += " " + self.cldbsk.residue2_key + " " + str(self[3])
196  return outstr
197 
198  def __str__(self):
199  outstr = str(self[0]) + "." + str(self[2]) + "-" + str(self[1]) \
200  + "." + str(self[3])
201  return outstr
202 
203 
205  '''
206  This class allows to create filter functions that can be passed to
207  the CrossLinkDataBase in this way:
208 
209  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
210 
211  where cldb is CrossLinkDataBase instance and it is only needed to get
212  the standard keywords
213 
214  A filter operator can be evaluate on a CrossLinkDataBase item xl and
215  returns a boolean
216 
217  fo.evaluate(xl)
218 
219  and it is used to filter the database
220  '''
221 
222  def __init__(self, argument1, operator, argument2):
223  '''
224  (argument1,operator,argument2) can be either a
225  (keyword,operator.eq|lt|gt...,value)
226  or (FilterOperator1,operator.or|and...,FilterOperator2)
227  '''
228  if isinstance(argument1, FilterOperator):
229  self.operations = [argument1, operator, argument2]
230  else:
231  self.operations = []
232  self.values = (argument1, operator, argument2)
233 
234  def __or__(self, FilterOperator2):
235  return FilterOperator(self, operator.or_, FilterOperator2)
236 
237  def __and__(self, FilterOperator2):
238  return FilterOperator(self, operator.and_, FilterOperator2)
239 
240  def __invert__(self):
241  return FilterOperator(self, operator.not_, None)
242 
243  def evaluate(self, xl_item):
244 
245  if len(self.operations) == 0:
246  keyword, operator, value = self.values
247  return operator(xl_item[keyword], value)
248  FilterOperator1, op, FilterOperator2 = self.operations
249 
250  if FilterOperator2 is None:
251  return op(FilterOperator1.evaluate(xl_item))
252  else:
253  return op(FilterOperator1.evaluate(xl_item),
254  FilterOperator2.evaluate(xl_item))
255 
256 
257 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
258  '''
259  This class is needed to convert the keywords from a generic database
260  to the standard ones
261  '''
262 
263  def __init__(self, list_parser=None):
264  '''
265  @param list_parser an instance of ResiduePairListParser, if any
266  is needed
267  '''
268  self.converter = {}
269  self.backward_converter = {}
270  _CrossLinkDataBaseStandardKeys.__init__(self)
271  self.rplp = list_parser
272  if self.rplp is None:
273  # either you have protein1, protein2, residue1, residue2
274  self.compulsory_keys = set([self.protein1_key,
275  self.protein2_key,
276  self.residue1_key,
277  self.residue2_key])
278  else:
279  self.compulsory_keys = self.rplp.get_compulsory_keys()
280  self.setup_keys = set()
281 
282  def check_keys(self):
283  '''
284  Is a function that check whether necessary keys are setup
285  '''
286  setup_keys = set(self.get_setup_keys())
287  if self.compulsory_keys & setup_keys != self.compulsory_keys:
288  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup "
289  "all necessary keys")
290 
291  def get_setup_keys(self):
292  '''
293  Returns the keys that have been setup so far
294  '''
295  return list(self.backward_converter.keys())
296 
297  def set_standard_keys(self):
298  """
299  This sets up the standard compulsory keys as defined by
300  _CrossLinkDataBaseStandardKeys
301  """
302  for ck in self.compulsory_keys:
303  self.converter[ck] = ck
304  self.backward_converter[ck] = ck
305 
306  def set_unique_id_key(self, origin_key):
307  self.converter[origin_key] = self.unique_id_key
308  self.backward_converter[self.unique_id_key] = origin_key
309 
310  def set_protein1_key(self, origin_key):
311  self.converter[origin_key] = self.protein1_key
312  self.backward_converter[self.protein1_key] = origin_key
313 
314  def set_protein2_key(self, origin_key):
315  self.converter[origin_key] = self.protein2_key
316  self.backward_converter[self.protein2_key] = origin_key
317 
318  def set_residue1_key(self, origin_key):
319  self.converter[origin_key] = self.residue1_key
320  self.backward_converter[self.residue1_key] = origin_key
321 
322  def set_residue2_key(self, origin_key):
323  self.converter[origin_key] = self.residue2_key
324  self.backward_converter[self.residue2_key] = origin_key
325 
326  def set_residue1_amino_acid_key(self, origin_key):
327  self.converter[origin_key] = self.residue1_amino_acid_key
328  self.backward_converter[self.residue1_amino_acid_key] = origin_key
329 
330  def set_residue2_amino_acid_key(self, origin_key):
331  self.converter[origin_key] = self.residue2_amino_acid_key
332  self.backward_converter[self.residue2_amino_acid_key] = origin_key
333 
334  def set_residue1_moiety_key(self, origin_key):
335  self.converter[origin_key] = self.residue1_moiety_key
336  self.backward_converter[self.residue1_moiety_key] = origin_key
337 
338  def set_residue2_moiety_key(self, origin_key):
339  self.converter[origin_key] = self.residue2_moiety_key
340  self.backward_converter[self.residue2_moiety_key] = origin_key
341 
342  def set_site_pairs_key(self, origin_key):
343  self.converter[origin_key] = self.site_pairs_key
344  self.backward_converter[self.site_pairs_key] = origin_key
345 
346  def set_id_score_key(self, origin_key):
347  self.converter[origin_key] = self.id_score_key
348  self.backward_converter[self.id_score_key] = origin_key
349 
350  def set_fdr_key(self, origin_key):
351  self.converter[origin_key] = self.fdr_key
352  self.backward_converter[self.fdr_key] = origin_key
353 
354  def set_quantitation_key(self, origin_key):
355  self.converter[origin_key] = self.quantitation_key
356  self.backward_converter[self.quantitation_key] = origin_key
357 
358  def set_psi_key(self, origin_key):
359  self.converter[origin_key] = self.psi_key
360  self.backward_converter[self.psi_key] = origin_key
361 
362  def set_link_type_key(self, link_type_key):
363  self.converter[link_type_key] = self.link_type_key
364  self.backward_converter[self.link_type_key] = link_type_key
365 
366  def get_converter(self):
367  '''
368  Returns the dictionary that convert the old keywords to the new ones
369  '''
370  self.check_keys()
371  return self.converter
372 
374  '''
375  Returns the dictionary that convert the new keywords to the old ones
376  '''
377  self.check_keys()
378  return self.backward_converter
379 
380 
381 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
382  '''
383  A class to handle different styles of site pairs parsers.
384  Implemented styles:
385  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
386  [Y3-;K4-] for dead-ends
387  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
388  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
389  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
390  ''' # noqa: E501
391 
392  import re
393 
394  def __init__(self, style):
395  _CrossLinkDataBaseStandardKeys.__init__(self)
396  if style == "MSSTUDIO":
397  self.style = style
398  self.compulsory_keys = set([self.protein1_key,
399  self.protein2_key,
400  self.site_pairs_key])
401  elif style == "XTRACT" or style == "QUANTITATION":
402  self.style = style
403  self.compulsory_keys = set([self.site_pairs_key])
404  elif style == "LAN_HUANG":
405  self.style = style
406  self.compulsory_keys = set([self.site_pairs_key])
407  else:
408  raise Exception("ResiduePairListParser: unknown list parser style")
409 
410  def get_compulsory_keys(self):
411  return self.compulsory_keys
412 
413  def get_list(self, input_string):
414  '''
415  This function returns a list of cross-linked residues and the
416  corresponding list of cross-linked chains. The latter list can be
417  empty, if the style doesn't have the corresponding information.
418  '''
419  if self.style == "MSSTUDIO":
420  input_string = input_string.replace("[", "")
421  input_string = input_string.replace("]", "")
422  input_string_pairs = input_string.split(";")
423  residue_pair_indexes = []
424  chain_pair_indexes = []
425  for s in input_string_pairs:
426  m1 = self.re.search(
427  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
428  r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
429  s)
430  m2 = self.re.search(
431  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
432  if m1:
433  # cross-link
434  (residue_type_1, residue_index_1, residue_type_2,
435  residue_index_2) = m1.group(1, 2, 3, 4)
436  residue_pair_indexes.append((residue_index_1,
437  residue_index_2))
438  elif m2:
439  # dead end
440  residue_type_1, residue_index_1 = m2.group(1, 2)
441  # at this stage chain_pair_indexes is empty
442  return residue_pair_indexes, chain_pair_indexes
443  if self.style == "XTRACT" or self.style == "QUANTITATION":
444  if ":x:" in input_string:
445  # if it is a cross-link....
446  input_strings = input_string.split(":x:")
447  first_peptides = input_strings[0].split(":|:")
448  second_peptides = input_strings[1].split(":|:")
449  first_peptides_identifiers = [
450  (f.split(":")[0],
451  f.split(":")[1]) for f in first_peptides]
452  second_peptides_identifiers = [
453  (f.split(":")[0],
454  f.split(":")[1]) for f in second_peptides]
455  residue_pair_indexes = []
456  chain_pair_indexes = []
457  for fpi in first_peptides_identifiers:
458  for spi in second_peptides_identifiers:
459  chain1 = fpi[0]
460  chain2 = spi[0]
461  residue1 = fpi[1]
462  residue2 = spi[1]
463  residue_pair_indexes.append((residue1, residue2))
464  chain_pair_indexes.append((chain1, chain2))
465  return residue_pair_indexes, chain_pair_indexes
466  else:
467  # if it is a monolink....
468  first_peptides = input_string.split(":|:")
469  first_peptides_identifiers = [
470  (f.split(":")[0], f.split(":")[1]) for f in first_peptides]
471  residue_indexes = []
472  chain_indexes = []
473  for fpi in first_peptides_identifiers:
474  chain1 = fpi[0]
475  residue1 = fpi[1]
476  residue_indexes.append((residue1,))
477  chain_indexes.append((chain1,))
478  return residue_indexes, chain_indexes
479  if self.style == "LAN_HUANG":
480  input_strings = input_string.split("-")
481  chain1, first_series = input_strings[0].split(":")
482  chain2, second_series = input_strings[1].split(":")
483 
484  first_residues = first_series.replace(";", "|").split("|")
485  second_residues = second_series.replace(";", "|").split("|")
486  residue_pair_indexes = []
487  chain_pair_indexes = []
488  for fpi in first_residues:
489  for spi in second_residues:
490  residue1 = self.re.sub("[^0-9]", "", fpi)
491  residue2 = self.re.sub("[^0-9]", "", spi)
492  residue_pair_indexes.append((residue1, residue2))
493  chain_pair_indexes.append((chain1, chain2))
494  return residue_pair_indexes, chain_pair_indexes
495 
496 
497 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
498  '''
499  A class to handle different XL format with fixed format
500  currently support ProXL
501  '''
502  def __init__(self, format):
503  _CrossLinkDataBaseStandardKeys.__init__(self)
504  if format == "PROXL":
505  self.format = format
506  else:
507  raise Exception("FixedFormatParser: unknown list format name")
508 
509  def get_data(self, input_string):
510  if self.format == "PROXL":
511  tokens = input_string.split("\t")
512  xl = {}
513  if tokens[0] == "SEARCH ID(S)":
514  return None
515  else:
516  xl[self.protein1_key] = tokens[2]
517  xl[self.protein2_key] = tokens[4]
518  xl[self.residue1_key] = int(tokens[3])
519  xl[self.residue2_key] = int(tokens[5])
520  return xl
521 
522 
523 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
524  '''
525  this class handles a cross-link dataset and do filtering
526  operations, adding cross-links, merge datasets...
527  '''
528 
529  def __init__(self, converter=None, data_base=None, fasta_seq=None,
530  linkable_aa=('K',)):
531  '''
532  Constructor.
533  @param converter an instance of CrossLinkDataBaseKeywordsConverter
534  @param data_base an instance of CrossLinkDataBase to build the
535  new database on
536  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
537  protein fasta sequences to check cross-link consistency.
538  If not given consistency will not be checked
539  @param linkable_aa a tuple containing one-letter amino acids which
540  are linkable by the cross-linker; only used if the database
541  DOES NOT provide a value for a certain residueX_amino_acid_key
542  and if a fasta_seq is given
543  '''
544 
545  if data_base is None:
546  self.data_base = {}
547  else:
548  self.data_base = data_base
549 
550  _CrossLinkDataBaseStandardKeys.__init__(self)
551  if converter is not None:
552  self.cldbkc = converter # type: CrossLinkDataBaseKeywordsConverter
553  self.list_parser = self.cldbkc.rplp
554  self.converter = converter.get_converter()
555 
556  else:
557  self.cldbkc = None # type: CrossLinkDataBaseKeywordsConverter
558  self.list_parser = None
559  self.converter = None
560 
561  # default amino acids considered to be 'linkable' if none are given
562  self.def_aa_tuple = linkable_aa
563  self.fasta_seq = fasta_seq # type: IMP.pmi.topology.Sequences
564  self.dataset = None
565  self.name = None
566  self._update()
567 
568  def _update(self):
569  '''
570  Update the whole dataset after changes
571  '''
572  self.update_cross_link_unique_sub_index()
573  self.update_cross_link_redundancy()
574  self.update_residues_links_number()
576 
577  def __iter__(self):
578  sorted_ids = sorted(list(self.data_base.keys()))
579  for k in sorted_ids:
580  for xl in self.data_base[k]:
581  yield xl
582 
583  def xlid_iterator(self):
584  sorted_ids = sorted(list(self.data_base.keys()))
585  for xlid in sorted_ids:
586  yield xlid
587 
588  def __getitem__(self, xlid):
589  return self.data_base[xlid]
590 
591  def __len__(self):
592  return len([xl for xl in self])
593 
594  def get_name(self):
595  return self.name
596 
597  def set_name(self, name):
598  new_data_base = {}
599  for k in self.data_base:
600  new_data_base[k + "." + name] = self.data_base[k]
601  self.data_base = new_data_base
602  self.name = name
603  self._update()
604 
605  def get_number_of_xlid(self):
606  return len(self.data_base)
607 
608  def create_set_from_file(self, file_name, converter=None,
609  FixedFormatParser=None, encoding=None):
610  '''
611  if FixedFormatParser is not specified, the file is
612  comma-separated-values
613 
614  @param file_name a txt file to be parsed
615  @param converter an instance of CrossLinkDataBaseKeywordsConverter
616  @param FixedFormatParser a parser for a fixed format
617  @param encoding the encoding of the file, if not the locale
618  default (usually UTF-8)
619  '''
620  if not FixedFormatParser:
621  xl_list = IMP.pmi.tools.get_db_from_csv(file_name, encoding)
622 
623  if converter is not None:
624  self.cldbkc = converter
625  self.list_parser = self.cldbkc.rplp
626  self.converter = converter.get_converter()
627 
628  if not self.list_parser:
629  # normal procedure without a list_parser
630  # each line is a cross-link
631  new_xl_dict = {}
632  for nxl, xl in enumerate(xl_list):
633  new_xl = {}
634  for k in xl:
635  if k in self.converter:
636  new_xl[self.converter[k]] = \
637  self.type[self.converter[k]](xl[k])
638  else:
639  new_xl[k] = xl[k]
640  if self.unique_id_key in self.cldbkc.get_setup_keys():
641  if new_xl[self.unique_id_key] not in new_xl_dict:
642  new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
643  else:
644  new_xl_dict[new_xl[self.unique_id_key]].append(
645  new_xl)
646  else:
647  if str(nxl) not in new_xl_dict:
648  new_xl_dict[str(nxl)] = [new_xl]
649  else:
650  new_xl_dict[str(nxl)].append(new_xl)
651 
652  else:
653  # with a list_parser, a line can be a list of ambiguous
654  # cross-links
655  new_xl_dict = {}
656  for nxl, entry in enumerate(xl_list):
657 
658  # first get the translated keywords
659  new_dict = {}
660  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
661  raise Exception(
662  "CrossLinkDataBase: expecting a site_pairs_key "
663  "for the site pair list parser")
664  for k in entry:
665  if k in self.converter:
666  new_dict[self.converter[k]] = \
667  self.type[self.converter[k]](entry[k])
668  else:
669  new_dict[k] = entry[k]
670 
671  (residue_pair_list,
672  chain_pair_list) = self.list_parser.get_list(
673  new_dict[self.site_pairs_key])
674 
675  # then create the cross-links
676  for n, p in enumerate(residue_pair_list):
677  is_monolink = False
678  if len(p) == 1:
679  is_monolink = True
680 
681  new_xl = {}
682  for k in new_dict:
683  new_xl[k] = new_dict[k]
684  new_xl[self.residue1_key] = \
685  self.type[self.residue1_key](p[0])
686  if not is_monolink:
687  new_xl[self.residue2_key] = \
688  self.type[self.residue2_key](p[1])
689 
690  if len(chain_pair_list) == len(residue_pair_list):
691  new_xl[self.protein1_key] = \
692  self.type[self.protein1_key](
693  chain_pair_list[n][0])
694  if not is_monolink:
695  new_xl[self.protein2_key] = \
696  self.type[self.protein2_key](
697  chain_pair_list[n][1])
698 
699  if not is_monolink:
700  new_xl[self.link_type_key] = "CROSSLINK"
701  else:
702  new_xl[self.link_type_key] = "MONOLINK"
703 
704  if self.unique_id_key in self.cldbkc.get_setup_keys():
705  if new_xl[self.unique_id_key] not in new_xl_dict:
706  new_xl_dict[new_xl[self.unique_id_key]] = \
707  [new_xl]
708  else:
709  new_xl_dict[new_xl[self.unique_id_key]].append(
710  new_xl)
711  else:
712  if str(nxl) not in new_xl_dict:
713  new_xl[self.unique_id_key] = str(nxl+1)
714  new_xl_dict[str(nxl)] = [new_xl]
715  else:
716  new_xl[self.unique_id_key] = str(nxl+1)
717  new_xl_dict[str(nxl)].append(new_xl)
718 
719  else:
720  '''
721  if FixedFormatParser is defined
722  '''
723 
724  new_xl_dict = {}
725  nxl = 0
726  with open(file_name, "r", encoding=encoding) as f:
727  for line in f:
728  xl = FixedFormatParser.get_data(line)
729  if xl:
730  xl[self.unique_id_key] = str(nxl+1)
731  new_xl_dict[str(nxl)] = [xl]
732  nxl += 1
733 
734  self.data_base = new_xl_dict
735  self.name = file_name
736  loc = ihm.location.InputFileLocation(file_name, details='Crosslinks')
737  self.dataset = ihm.dataset.CXMSDataset(loc)
738  self._update()
739 
740  def update_cross_link_unique_sub_index(self):
741  for k in self.data_base:
742  for n, xl in enumerate(self.data_base[k]):
743  xl[self.ambiguity_key] = len(self.data_base[k])
744  xl[self.unique_sub_index_key] = n+1
745  xl[self.unique_sub_id_key] = k + "." + str(n+1)
746 
747  def update_cross_link_redundancy(self):
748  redundancy_data_base = {}
749  for xl in self:
750  pra = _ProteinsResiduesArray(xl)
751  if pra not in redundancy_data_base:
752  redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
753  redundancy_data_base[pra.get_inverted()] = \
754  [xl[self.unique_sub_id_key]]
755  else:
756  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
757  redundancy_data_base[pra.get_inverted()].append(
758  xl[self.unique_sub_id_key])
759  for xl in self:
760  pra = _ProteinsResiduesArray(xl)
761  xl[self.redundancy_key] = len(redundancy_data_base[pra])
762  xl[self.redundancy_list_key] = redundancy_data_base[pra]
763 
764  def update_residues_links_number(self):
765  residue_links = {}
766  for xl in self:
767  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
768  if (p1, r1) not in residue_links:
769  residue_links[(p1, r1)] = set([(p2, r2)])
770  else:
771  residue_links[(p1, r1)].add((p2, r2))
772  if (p2, r2) not in residue_links:
773  residue_links[(p2, r2)] = set([(p1, r1)])
774  else:
775  residue_links[(p2, r2)].add((p1, r1))
776 
777  for xl in self:
778  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
779  xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
780  xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
781 
783  """This function checks the consistency of the dataset with the
784  amino acid sequence"""
785  if self.cldbkc and self.fasta_seq:
786  cnt_matched, cnt_matched_file = 0, 0
787  matched = {}
788  non_matched = {}
789  for xl in self:
790  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
791  b_matched_file = False
792  if self.residue1_amino_acid_key in xl:
793  # either you know the residue type and aa_tuple is
794  # a single entry
795  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
796  b_matched = self._match_xlinks(p1, r1, aa_from_file)
797  b_matched_file = b_matched
798  else:
799  # or pass the possible list of types that can be
800  # cross-linked
801  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
802 
803  matched, non_matched = self._update_matched_xlinks(
804  b_matched, p1, r1, matched, non_matched)
805 
806  if self.residue2_amino_acid_key in xl:
807  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
808  b_matched = self._match_xlinks(p2, r2, aa_from_file)
809  b_matched_file = b_matched
810  else:
811  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
812 
813  matched, non_matched = self._update_matched_xlinks(
814  b_matched, p2, r2, matched, non_matched)
815  if b_matched:
816  cnt_matched += 1
817  if b_matched_file:
818  cnt_matched_file += 1
819  if len(self) > 0:
820  percentage_matched = round(100*cnt_matched/len(self), 1)
821  percentage_matched_file = round(
822  100 * cnt_matched_file / len(self), 1)
823  if matched or non_matched:
824  print(
825  "check_cross_link_consistency: Out of %d cross-links "
826  "%d were matched to the fasta sequence (%f %%).\n "
827  "%d were matched by using the cross-link file (%f %%)."
828  % (len(self), cnt_matched, percentage_matched,
829  cnt_matched_file, percentage_matched_file))
830  if non_matched:
831  print("check_cross_link_consistency: Warning: Non "
832  "matched xlinks:",
833  [(prot_name, sorted(list(non_matched[prot_name])))
834  for prot_name in non_matched])
835  return matched, non_matched
836 
837  def _match_xlinks(self, prot_name, res_index, aa_tuple):
838  # returns Boolean whether given aa matches a position in the fasta file
839  # cross-link files usually start counting at 1 and not 0; therefore
840  # subtract -1 to compare with fasta
841  amino_dict = IMP.pmi.alphabets.amino_acid
842  res_index -= 1
843  for amino_acid in aa_tuple:
844  if len(amino_acid) == 3:
845  amino_acid = amino_dict.get_one_letter_code_from_residue_type(
846  amino_acid.upper())
847  if prot_name in self.fasta_seq.sequences:
848  seq = self.fasta_seq.sequences[prot_name]
849  # if we are looking at the first amino acid in a given
850  # sequence always return a match
851  # the first aa should always be the n-terminal aa
852  # which may form a cross-link in any case (for BS3-like
853  # cross-linkers)
854  # for some data sets the first aa is at position 1;
855  # todo: check why this is the case
856  if res_index == 0 or res_index == 1:
857  return True
858  if res_index < len(seq):
859  if amino_acid == seq[res_index]:
860  return True
861  return False
862 
863  def _update_matched_xlinks(self, b_matched, prot, res, matched,
864  non_matched):
865  if b_matched:
866  if prot in matched:
867  matched[prot].add(res)
868  else:
869  matched[prot] = set([res])
870  else:
871  if prot in non_matched:
872  non_matched[prot].add(res)
873  else:
874  non_matched[prot] = set([res])
875  return matched, non_matched
876 
877  def get_cross_link_string(self, xl):
878  string = '|'
879  for k in self.ordered_key_list:
880  try:
881  string += str(k) + ":" + str(xl[k]) + "|"
882  except KeyError:
883  continue
884 
885  for k in xl:
886  if k not in self.ordered_key_list:
887  string += str(k) + ":" + str(xl[k]) + "|"
888 
889  return string
890 
891  def get_short_cross_link_string(self, xl):
892  string = '|'
893  list_of_keys = [self.data_set_name_key,
894  self.unique_sub_id_key,
895  self.protein1_key,
896  self.residue1_key,
897  self.protein2_key,
898  self.residue2_key,
899  self.state_key,
900  self.psi_key]
901 
902  for k in list_of_keys:
903  try:
904  string += str(xl[k]) + "|"
905  except KeyError:
906  continue
907 
908  return string
909 
910  def filter(self, FilterOperator):
911  new_xl_dict = {}
912  for id in list(self.data_base.keys()):
913  for xl in self.data_base[id]:
914  if FilterOperator.evaluate(xl):
915  if id not in new_xl_dict:
916  new_xl_dict[id] = [xl]
917  else:
918  new_xl_dict[id].append(xl)
919  self._update()
920  cdb = CrossLinkDataBase(self.cldbkc, new_xl_dict)
921  cdb.dataset = self.dataset
922  cdb.name = self.name
923  return cdb
924 
925  def filter_score(self, score):
926  '''Get all cross-links with score greater than an input value'''
927  FilterOperator = IMP.pmi.io.crosslink.FilterOperator(
928  self.id_score_key, operator.gt, score)
929  return self.filter(FilterOperator)
930 
931  def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
932  '''
933  This function merges two cross-link datasets so that if two
934  conflicting cross-links have the same cross-link UniqueIDS,
935  the cross-links will be appended under the same UniqueID slots
936  with different SubIDs
937  '''
938  raise NotImplementedError()
939 
940  def append_database(self, db):
941  """Append cross-link dataset to this one."""
942  new_data_base = {}
943  for k in self.data_base:
944  new_data_base[k] = self.data_base[k]
945  for k in db.data_base:
946  new_data_base[k] = db.data_base[k]
947  self.data_base = new_data_base
948  self._update()
949 
950  def __iadd__(self, db):
951  self.append_database(db)
952  return self
953 
954  def set_value(self, key, new_value, filter_operator=None):
955  '''
956  This function changes the value for a given key in the database
957  For instance one can change the name of a protein
958  @param key: the key in the database that must be changed
959  @param new_value: the new value of the key
960  @param filter_operator: optional FilterOperator to change the value to
961  a subset of the database
962 
963  example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
964  FO(cldb.protein1_key, operator.eq, "AAA"))`
965  '''
966 
967  for xl in self:
968  if filter_operator is not None:
969  if filter_operator.evaluate(xl):
970  xl[key] = new_value
971  else:
972  xl[key] = new_value
973  self._update()
974 
975  def get_values(self, key):
976  '''
977  this function returns the list of values for a given key in the
978  database alphanumerically sorted
979  '''
980  values = set()
981  for xl in self:
982  values.add(xl[key])
983  return sorted(list(values))
984 
985  def offset_residue_index(self, protein_name, offset):
986  '''
987  This function offset the residue indexes of a given protein by
988  a specified value
989  @param protein_name: the protein name that need to be changed
990  @param offset: the offset value
991  '''
992 
993  for xl in self:
994  if xl[self.protein1_key] == protein_name:
995  xl[self.residue1_key] = xl[self.residue1_key] + offset
996  if xl[self.protein2_key] == protein_name:
997  xl[self.residue2_key] = xl[self.residue2_key] + offset
998  self._update()
999 
1000  def create_new_keyword(self, keyword, values_from_keyword=None):
1001  '''
1002  This function creates a new keyword for the whole database and
1003  set the values from and existing keyword (optional), otherwise the
1004  values are set to None
1005  @param keyword the new keyword name:
1006  @param values_from_keyword the keyword from which we are copying
1007  the values:
1008  '''
1009  for xl in self:
1010  if values_from_keyword is not None:
1011  xl[keyword] = xl[values_from_keyword]
1012  else:
1013  xl[keyword] = None
1014  self._update()
1015 
1016  def rename_proteins(self, old_to_new_names_dictionary,
1017  protein_to_rename="both"):
1018  '''
1019  This function renames all proteins contained in the input dictionary
1020  from the old names (keys) to the new name (values)
1021  @param old_to_new_names_dictionary dictionary for converting old
1022  to new names
1023  @param protein_to_rename specify whether to rename both or protein1
1024  or protein2 only
1025  '''
1026 
1027  for old_name in old_to_new_names_dictionary:
1028  new_name = old_to_new_names_dictionary[old_name]
1029  if protein_to_rename == "both" or protein_to_rename == "protein1":
1030  fo2 = FilterOperator(self.protein1_key, operator.eq, old_name)
1031  self.set_value(self.protein1_key, new_name, fo2)
1032  if protein_to_rename == "both" or protein_to_rename == "protein2":
1033  fo2 = FilterOperator(self.protein2_key, operator.eq, old_name)
1034  self.set_value(self.protein2_key, new_name, fo2)
1035 
1036  def align_sequence(self, alignfile):
1037  """
1038  This function parses an alignment file obtained previously, to map
1039  crosslink residues onto the sequence of a homolog proteins. It is
1040  useful if you want to map the crosslinks onto a known, homolog
1041  structure without building a comparative model.
1042  The txt file of the alignment should be structured with repeating
1043  blocks, as follows:
1044 
1045  >NAME_CROSSLINKED_PROTEIN
1046  >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1047  one-letter code sequence for cross-linked protein (one line)
1048  one-letter code sequence for homolog protein with known structure (one line)
1049  """ # noqa: E501
1050 
1051  f = open(alignfile, 'r')
1052 
1053  def _assign_residues(seq1, seq2):
1054  d = {}
1055  resinds1 = []
1056  nres = 0
1057  contiguousgap = 0
1058  for r1 in seq1:
1059  if r1 != "-":
1060  nres += 1
1061  resinds1.append(nres)
1062  else:
1063  resinds1.append(None)
1064 
1065  resinds2 = [None for i in resinds1]
1066  nres = 0
1067  lastresind = 1
1068  for pos, r2 in enumerate(seq2):
1069  if r2 != "-":
1070  nres += 1
1071  resinds2[pos] = nres
1072  lastresind = nres
1073  if contiguousgap > 1:
1074  for i in range(pos-int(contiguousgap/2), pos):
1075  resinds2[i] = nres
1076  contiguousgap = 0
1077  else:
1078  contiguousgap += 1
1079  resinds2[pos] = lastresind
1080 
1081  for n, r in enumerate(resinds1):
1082  if r is not None:
1083  d[r] = resinds2[n]
1084  return d
1085 
1086  aa = True
1087  sequences1 = {}
1088  sequences2 = {}
1089  with open(alignfile, 'r') as f:
1090  while aa:
1091  aa = f.readline()
1092  bb = f.readline()
1093  cc = f.readline()
1094  dd = f.readline()
1095  protname = aa.replace(">", "").replace("\n", "")
1096  seq1 = cc.replace(",", "").replace("\n", "")
1097  seq2 = dd.replace(",", "").replace("\n", "")
1098 
1099  n1 = 0
1100  n2 = 0
1101  tmpl1 = []
1102  tmpl2 = []
1103  for i in range(len(seq1)):
1104  if seq1[i] != "-":
1105  n1 += 1
1106  c1 = n1
1107  else:
1108  c1 = None
1109  if seq2[i] != "-":
1110  n2 += 1
1111  c2 = n2
1112  else:
1113  c2 = None
1114 
1115  tmpl1.append([i, c1, seq1[i]])
1116  tmpl2.append([i, c2, seq2[i]])
1117  sequences1[protname] = tmpl1
1118  sequences2[protname] = tmpl2
1119 
1120  d = _assign_residues(seq1, seq2)
1121 
1122  fo11 = FilterOperator(self.protein1_key, operator.eq, protname)
1123  fo12 = FilterOperator(self.protein2_key, operator.eq, protname)
1124 
1125  for xl in self:
1126  if fo11.evaluate(xl):
1127  t = [item for item in sequences1[protname]
1128  if item[1] == xl[self.residue1_key]][0]
1129  sequences1[protname][t[0]] = [
1130  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1131  t = [item for item in sequences2[protname]
1132  if item[1] == d[xl[self.residue1_key]]][0]
1133  sequences2[protname][t[0]] = [
1134  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1135 
1136  xl[self.residue1_key] = d[xl[self.residue1_key]]
1137 
1138  if fo12.evaluate(xl):
1139 
1140  t = [item for item in sequences1[protname]
1141  if item[1] == xl[self.residue2_key]][0]
1142  sequences1[protname][t[0]] = [
1143  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1144  t = [item for item in sequences2[protname]
1145  if item[1] == d[xl[self.residue2_key]]][0]
1146  sequences2[protname][t[0]] = [
1147  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1148 
1149  xl[self.residue2_key] = d[xl[self.residue2_key]]
1150 
1151  print("")
1152  print(aa.replace("\n", ""))
1153  print("".join([seq[2] for seq in sequences1[protname]]))
1154  print(bb.replace("\n", ""))
1155  print("".join([seq[2] for seq in sequences2[protname]]))
1156 
1157  def classify_crosslinks_by_score(self, number_of_classes):
1158  '''
1159  This function creates as many classes as in the input
1160  (number_of_classes: integer) and partition cross-links according
1161  to their identification scores. Classes are defined in the psi key.
1162  '''
1163 
1164  if self.id_score_key is not None:
1165  scores = self.get_values(self.id_score_key)
1166  else:
1167  raise ValueError(
1168  'The cross-link database does not contain score values')
1169  minscore = min(scores)
1170  maxscore = max(scores)
1171  scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1172  if self.psi_key is None:
1173  self.create_new_keyword(self.psi_key, values_from_keyword=None)
1174  for xl in self:
1175  score = xl[self.id_score_key]
1176  for n, classmin in enumerate(scoreclasses[0:-1]):
1177  if score >= classmin and score <= scoreclasses[n+1]:
1178  xl[self.psi_key] = str("CLASS_"+str(n))
1179  self._update()
1180 
1181  def clone_protein(self, protein_name, new_protein_name):
1182  for id in list(self.data_base.keys()):
1183  new_data_base = []
1184  for xl in self.data_base[id]:
1185  new_data_base.append(xl)
1186  if xl[self.protein1_key] == protein_name \
1187  and xl[self.protein2_key] != protein_name:
1188  new_xl = dict(xl)
1189  new_xl[self.protein1_key] = new_protein_name
1190  new_data_base.append(new_xl)
1191  elif xl[self.protein1_key] != protein_name \
1192  and xl[self.protein2_key] == protein_name:
1193  new_xl = dict(xl)
1194  new_xl[self.protein2_key] = new_protein_name
1195  new_data_base.append(new_xl)
1196  elif xl[self.protein1_key] == protein_name \
1197  and xl[self.protein2_key] == protein_name:
1198  new_xl = dict(xl)
1199  new_xl[self.protein1_key] = new_protein_name
1200  new_data_base.append(new_xl)
1201  new_xl = dict(xl)
1202  new_xl[self.protein2_key] = new_protein_name
1203  new_data_base.append(new_xl)
1204  new_xl = dict(xl)
1205  new_xl[self.protein1_key] = new_protein_name
1206  new_xl[self.protein2_key] = new_protein_name
1207  new_data_base.append(new_xl)
1208  self.data_base[id] = new_data_base
1209  self._update()
1210 
1212  '''
1213  This function remove cross-links applied to the same residue
1214  (ie, same chain name and residue number)
1215  '''
1216  for id in list(self.data_base.keys()):
1217  new_data_base = []
1218  for xl in self.data_base[id]:
1219  if xl[self.protein1_key] == xl[self.protein2_key] \
1220  and xl[self.residue1_key] == xl[self.residue2_key]:
1221  continue
1222  else:
1223  new_data_base.append(xl)
1224  self.data_base[id] = new_data_base
1225  self._update()
1226 
1227  def jackknife(self, percentage):
1228  '''
1229  this method returns a CrossLinkDataBase class containing
1230  a random subsample of the original cross-link database.
1231  @param percentage float between 0 and 1, is the percentage of
1232  of spectra taken from the original list
1233  '''
1234  import random
1235  if percentage > 1.0 or percentage < 0.0:
1236  raise ValueError('the percentage of random cross-link spectra '
1237  'should be between 0 and 1')
1238  nspectra = self.get_number_of_xlid()
1239  nrandom_spectra = int(nspectra*percentage)
1240  random_keys = random.sample(sorted(list(self.data_base.keys())),
1241  nrandom_spectra)
1242  new_data_base = {}
1243  for k in random_keys:
1244  new_data_base[k] = self.data_base[k]
1245  return CrossLinkDataBase(self.cldbkc, new_data_base)
1246 
1247  def __str__(self):
1248  outstr = ''
1249  sorted_ids = sorted(list(self.data_base.keys()))
1250 
1251  for id in sorted_ids:
1252  outstr += id + "\n"
1253  for xl in self.data_base[id]:
1254  for k in self.ordered_key_list:
1255  try:
1256  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1257  except KeyError:
1258  continue
1259 
1260  for k in xl:
1261  if k not in self.ordered_key_list:
1262  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1263  outstr += "-------------\n"
1264  return outstr
1265 
1266  def plot(self, filename, **kwargs):
1267  import matplotlib
1268  matplotlib.use('Agg')
1269  import matplotlib.pyplot as plt
1270  import matplotlib.colors
1271 
1272  if kwargs["type"] == "scatter":
1273  cmap = plt.get_cmap("rainbow")
1274  _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1275  xkey = kwargs["xkey"]
1276  ykey = kwargs["ykey"]
1277  if "colorkey" in kwargs:
1278  colorkey = kwargs["colorkey"]
1279  if "logyscale" in kwargs:
1280  logyscale = kwargs["logyscale"]
1281  else:
1282  logyscale = False
1283  xs = []
1284  ys = []
1285  colors = []
1286  for xl in self:
1287  try:
1288  xs.append(float(xl[xkey]))
1289  if logyscale:
1290  ys.append(math.log10(float(xl[ykey])))
1291  else:
1292  ys.append(float(xl[ykey]))
1293  colors.append(float(xl[colorkey]))
1294  except ValueError:
1295  print("CrossLinkDataBase.plot: Value error for "
1296  "cross-link %s" % (xl[self.unique_id_key]))
1297  continue
1298 
1299  cs = []
1300  for color in colors:
1301  cindex = (color-min(colors))/(max(colors)-min(colors))
1302  cs.append(cmap(cindex))
1303 
1304  fig = plt.figure()
1305  ax = fig.add_subplot(111)
1306  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker="o")
1307  ax.set_xlabel(xkey)
1308  ax.set_ylabel(ykey)
1309  plt.savefig(filename)
1310  plt.show()
1311  plt.close()
1312 
1313  if kwargs["type"] == "residue_links":
1314  # plot the number of distinct links to a residue
1315  # in an histogram
1316  # molecule name
1317  molecule = kwargs["molecule"]
1318  if type(molecule) is IMP.pmi.topology.Molecule:
1319  length = len(molecule.sequence)
1320  molecule = molecule.get_name()
1321  else:
1322  # you need a IMP.pmi.topology.Sequences object
1323  sequences_object = kwargs["sequences_object"]
1324  sequence = sequences_object.sequences[molecule]
1325  length = len(sequence)
1326 
1327  histogram = [0]*length
1328  for xl in self:
1329  if xl[self.protein1_key] == molecule:
1330  histogram[xl[self.residue1_key]-1] = \
1331  xl[self.residue1_links_number_key]
1332  if xl[self.protein2_key] == molecule:
1333  histogram[xl[self.residue2_key]-1] = \
1334  xl[self.residue2_links_number_key]
1335  plt.bar(range(1, length+1), histogram)
1336  plt.savefig(filename)
1337  plt.show()
1338  plt.close()
1339 
1340  if kwargs["type"] == "histogram":
1341  valuekey = kwargs["valuekey"]
1342  valuename = valuekey
1343  bins = kwargs["bins"]
1344  values_list = []
1345  for xl in self:
1346  try:
1347  values_list.append(float(xl[valuekey]))
1348  except ValueError:
1349  print("CrossLinkDataBase.plot: Value error for cross-link "
1350  "%s" % (xl[self.unique_id_key]))
1351  continue
1353  filename, [values_list], valuename=valuename, bins=bins,
1354  colors=None, format="pdf",
1355  reference_xline=None, yplotrange=None,
1356  xplotrange=None, normalized=True,
1357  leg_names=None)
1358 
1359  def dump(self, json_filename):
1360  import json
1361  with open(json_filename, 'w') as fp:
1362  json.dump(self.data_base, fp, sort_keys=True, indent=2,
1363  default=set_json_default)
1364 
1365  def load(self, json_filename):
1366  import json
1367  with open(json_filename, 'r') as fp:
1368  self.data_base = json.load(fp)
1369  self._update()
1370 
1371  def save_csv(self, filename):
1372  import csv
1373 
1374  data = []
1375  sorted_ids = None
1376  sorted_group_ids = sorted(list(self.data_base.keys()))
1377  for group in sorted_group_ids:
1378  group_block = []
1379  for xl in self.data_base[group]:
1380  if not sorted_ids:
1381  sorted_ids = sorted(list(xl.keys()))
1382  values = [xl[k] for k in sorted_ids]
1383  group_block.append(values)
1384  data += group_block
1385 
1386  with open(filename, 'w') as fp:
1387  a = csv.writer(fp, delimiter=',')
1388  a.writerow(sorted_ids)
1389  a.writerows(data)
1390 
1392  """
1393  Returns the number of non redundant cross-link sites
1394  """
1395  praset = set()
1396  for xl in self:
1397  pra = _ProteinsResiduesArray(xl)
1398  prai = pra.get_inverted()
1399  praset.add(pra)
1400  praset.add(prai)
1401  return len(list(praset))
1402 
1403 
1405  """This class allows to compute and plot the distance between datasets"""
1406 
1407  def __init__(self, cldb_dictionary):
1408  """Input a dictionary where keys are cldb names and values are cldbs"""
1409  import scipy.spatial.distance
1410  self.dbs = cldb_dictionary
1411  self.keylist = list(self.dbs.keys())
1412  self.distances = list()
1413 
1414  for i, key1 in enumerate(self.keylist):
1415  for key2 in self.keylist[i+1:]:
1416  distance = self.get_distance(key1, key2)
1417  self.distances.append(distance)
1418 
1419  self.distances = scipy.spatial.distance.squareform(self.distances)
1420 
1421  def get_distance(self, key1, key2):
1422  return 1.0 - self.jaccard_index(self.dbs[key1], self.dbs[key2])
1423 
1424  def jaccard_index(self, CrossLinkDataBase1, CrossLinkDataBase2):
1425  """Similarity index between two datasets
1426  https://en.wikipedia.org/wiki/Jaccard_index"""
1427 
1428  set1 = set()
1429  set2 = set()
1430  for xl1 in CrossLinkDataBase1:
1431  a1f = _ProteinsResiduesArray(xl1)
1432  a1b = a1f.get_inverted()
1433  set1.add(a1f)
1434  set1.add(a1b)
1435  for xl2 in CrossLinkDataBase2:
1436  a2f = _ProteinsResiduesArray(xl2)
1437  a2b = a2f.get_inverted()
1438  set2.add(a2f)
1439  set2.add(a2b)
1440  return (float(len(set1 & set2)/2)
1441  / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1442 
1443  def plot_matrix(self, figurename="clustermatrix.pdf"):
1444  import matplotlib as mpl
1445  import numpy
1446  mpl.use('Agg')
1447  import matplotlib.pylab as pl
1448  from scipy.cluster import hierarchy as hrc
1449 
1450  raw_distance_matrix = self.distances
1451  labels = self.keylist
1452 
1453  fig = pl.figure()
1454 
1455  ax = fig.add_subplot(1, 1, 1)
1456  dendrogram = hrc.dendrogram(
1457  hrc.linkage(raw_distance_matrix),
1458  color_threshold=7,
1459  no_labels=True)
1460  leaves_order = dendrogram['leaves']
1461  ax.set_xlabel('Dataset')
1462  ax.set_ylabel('Jaccard Distance')
1463  pl.tight_layout()
1464  pl.savefig("dendrogram."+figurename, dpi=300)
1465  pl.close(fig)
1466 
1467  fig = pl.figure()
1468 
1469  ax = fig.add_subplot(1, 1, 1)
1470  cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1471  interpolation='nearest')
1472  cb = fig.colorbar(cax)
1473  cb.set_label('Jaccard Distance')
1474  ax.set_xlabel('Dataset')
1475  ax.set_ylabel('Dataset')
1476  ax.set_xticks(range(len(labels)))
1477  ax.set_xticklabels(numpy.array(labels)[leaves_order],
1478  rotation='vertical')
1479  ax.set_yticks(range(len(labels)))
1480  ax.set_yticklabels(numpy.array(labels)[leaves_order],
1481  rotation='horizontal')
1482  pl.tight_layout()
1483  pl.savefig("matrix." + figurename, dpi=300)
1484  pl.close(fig)
1485 
1486 
1488  '''
1489  This class maps a CrossLinkDataBase on a given structure
1490  and save an rmf file with color-coded cross-links
1491  '''
1492 
1493  def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1494  self.CrossLinkDataBase = CrossLinkDataBase
1495  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1496  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1497  self.prots = rmf_or_stat_handler
1498  self.distances = defaultdict(list)
1499  self.array_to_id = {}
1500  self.id_to_array = {}
1501 
1502  print("computing distances for all cross-links and all structures")
1503  for i in self.prots[::10]:
1504  self.compute_distances()
1505  for key, xl in enumerate(self.CrossLinkDataBase):
1506  array = _ProteinsResiduesArray(xl)
1507  self.array_to_id[array] = key
1508  self.id_to_array[key] = array
1509  if xl["MinAmbiguousDistance"] != 'None':
1510  self.distances[key].append(xl["MinAmbiguousDistance"])
1511 
1512  def compute_distances(self):
1513  data = []
1514  sorted_ids = None
1515  sorted_group_ids = \
1516  sorted(list(self.CrossLinkDataBase.data_base.keys()))
1517  for group in sorted_group_ids:
1518  group_dists = []
1519  for xl in self.CrossLinkDataBase.data_base[group]:
1520  if not sorted_ids:
1521  sorted_ids = sorted(list(xl.keys()))
1522  data.append(
1523  sorted_ids
1524  + ["UniqueID", "Distance", "MinAmbiguousDistance"])
1525  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1526  try:
1527  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1528  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1529  except: # noqa: E722
1530  mdist = "None"
1531  state1 = "None"
1532  copy1 = "None"
1533  state2 = "None"
1534  copy2 = "None"
1535  try:
1536  # sometimes keys get "lost" in the database,
1537  # not really sure why
1538  values = [xl[k] for k in sorted_ids]
1539  values += [group, mdist]
1540  except KeyError as e:
1541  print("MapCrossLinkDataBaseOnStructure "
1542  "KeyError: {0} in {1}".format(e, xl))
1543  group_dists.append(mdist)
1544  xl["Distance"] = mdist
1545  xl["State1"] = state1
1546  xl["Copy1"] = copy1
1547  xl["State2"] = state2
1548  xl["Copy2"] = copy2
1549 
1550  for xl in self.CrossLinkDataBase.data_base[group]:
1551  xl["MinAmbiguousDistance"] = min(group_dists)
1552 
1553  def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1554  '''more robust and slower version of above'''
1555  sel = IMP.atom.Selection(self.prots, molecule=c1, residue_index=r1,
1556  resolution=1)
1557  selpart_1 = sel.get_selected_particles()
1558  if len(selpart_1) == 0:
1559  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1560  "selected for first site")
1561  return None
1562  sel = IMP.atom.Selection(self.prots, molecule=c2, residue_index=r2,
1563  resolution=1)
1564  selpart_2 = sel.get_selected_particles()
1565  if len(selpart_2) == 0:
1566  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1567  "selected for second site")
1568  return None
1569  results = []
1570  for p1 in selpart_1:
1571  for p2 in selpart_2:
1572  if p1 == p2 and r1 == r2:
1573  continue
1574  d1 = IMP.core.XYZ(p1)
1575  d2 = IMP.core.XYZ(p2)
1576  # round distance to second decimal
1577  dist = float(int(IMP.core.get_distance(d1, d2)*100.0))/100.0
1578  h1 = IMP.atom.Hierarchy(p1)
1579  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1580  h1 = h1.get_parent()
1581  copy_index1 = IMP.atom.Copy(h1).get_copy_index()
1582  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1583  h1 = h1.get_parent()
1584  state_index1 = IMP.atom.State(h1).get_state_index()
1585  h2 = IMP.atom.Hierarchy(p2)
1586  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1587  h2 = h2.get_parent()
1588  copy_index2 = IMP.atom.Copy(h2).get_copy_index()
1589  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1590  h2 = h2.get_parent()
1591  state_index2 = IMP.atom.State(h2).get_state_index()
1592 
1593  results.append((dist, state_index1, copy_index1,
1594  state_index2, copy_index2, p1, p2))
1595  if len(results) == 0:
1596  return None
1597  results_sorted = sorted(results,
1598  key=operator.itemgetter(0, 1, 2, 3, 4))
1599  return ((results_sorted[0][0],
1600  results_sorted[0][5],
1601  results_sorted[0][6]),
1602  (results_sorted[0][1],
1603  results_sorted[0][2],
1604  results_sorted[0][3],
1605  results_sorted[0][4]))
1606 
1607  def save_rmf_snapshot(self, filename, color_id=None):
1608  sorted_group_ids = \
1609  sorted(list(self.CrossLinkDataBase.data_base.keys()))
1610  list_of_pairs = []
1611  color_scores = []
1612  for group in sorted_group_ids:
1613  group_dists_particles = []
1614  for xl in self.CrossLinkDataBase.data_base[group]:
1615  xllabel = \
1616  self.CrossLinkDataBase.get_short_cross_link_string(xl)
1617  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1618  try:
1619  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1620  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1621  except TypeError:
1622  print("TypeError or missing chain/residue ",
1623  r1, c1, r2, c2)
1624  continue
1625  if color_id is not None:
1626  group_dists_particles.append((mdist, p1, p2, xllabel,
1627  float(xl[color_id])))
1628  else:
1629  group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1630  if group_dists_particles:
1631  (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1632  min(group_dists_particles, key=lambda t: t[0])
1633  color_scores.append(mincolor_score)
1634  list_of_pairs.append((minp1, minp2, minxllabel,
1635  mincolor_score))
1636  else:
1637  continue
1638 
1639  linear = IMP.core.Linear(0, 0.0)
1640  linear.set_slope(1.0)
1641  dps2 = IMP.core.DistancePairScore(linear)
1642  rslin = IMP.RestraintSet(self.prots.get_model(),
1643  'linear_dummy_restraints')
1644  sgs = []
1645  offset = min(color_scores)
1646  maxvalue = max(color_scores)
1647  for pair in list_of_pairs:
1648  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2,
1649  (pair[0], pair[1]))
1650  pr.set_name(pair[2])
1651  if offset < maxvalue:
1652  factor = (pair[3]-offset)/(maxvalue-offset)
1653  else:
1654  factor = 0.0
1655  c = IMP.display.get_rgb_color(factor)
1656  seg = IMP.algebra.Segment3D(
1657  IMP.core.XYZ(pair[0]).get_coordinates(),
1658  IMP.core.XYZ(pair[1]).get_coordinates())
1659  rslin.add_restraint(pr)
1660  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
1661 
1662  rh = RMF.create_rmf_file(filename)
1663  IMP.rmf.add_hierarchies(rh, [self.prots])
1664  IMP.rmf.add_restraints(rh, [rslin])
1665  IMP.rmf.add_geometries(rh, sgs)
1666  IMP.rmf.save_frame(rh)
1667  del rh
1668 
1669  def boxplot_crosslink_distances(self, filename):
1670  import numpy
1671  keys = [self.id_to_array[k] for k in list(self.distances.keys())]
1672  medians = [numpy.median(self.distances[self.array_to_id[k]])
1673  for k in keys]
1674  dists = [self.distances[self.array_to_id[k]] for k in keys]
1675  distances_sorted_by_median = \
1676  [x for _, x in sorted(zip(medians, dists))]
1677  keys_sorted_by_median = [x for _, x in sorted(zip(medians, keys))]
1679  filename, distances_sorted_by_median,
1680  range(len(distances_sorted_by_median)),
1681  xlabels=keys_sorted_by_median,
1682  scale_plot_length=0.2)
1683 
1684  def get_crosslink_violations(self, threshold):
1685  violations = defaultdict(list)
1686  for k in self.distances:
1687  # viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1688  viols = self.distances[k]
1689  violations[self.id_to_array[k]] = viols
1690  return violations
1691 
1692 
1694  '''
1695  This class generates a CrossLinkDataBase from a given structure
1696  '''
1697  def __init__(self, system, residue_types_1=["K"],
1698  residue_types_2=["K"], reactivity_range=[0, 1], kt=1.0):
1699  import numpy.random
1700  import math
1702  cldbkc.set_protein1_key("Protein1")
1703  cldbkc.set_protein2_key("Protein2")
1704  cldbkc.set_residue1_key("Residue1")
1705  cldbkc.set_residue2_key("Residue2")
1706  self.cldb = CrossLinkDataBase(cldbkc)
1707  self.system = system
1708  self.model = self.system.model
1709  self.residue_types_1 = residue_types_1
1710  self.residue_types_2 = residue_types_2
1711  self.kt = kt
1712  self.indexes_dict1 = {}
1713  self.indexes_dict2 = {}
1714  self.protein_residue_dict = {}
1715  self.reactivity_dictionary = {}
1716  self.euclidean_interacting_pairs = None
1717  self.xwalk_interacting_pairs = None
1718 
1719  for state in self.system.get_states():
1720  for moleculename, molecules in state.get_molecules().items():
1721  for molecule in molecules:
1722  # we are saving a dictionary with protein name,
1723  # residue number and random reactivity of the residue
1724  seq = molecule.sequence
1725  residues = [i for i in range(1, len(seq)+1)
1726  if ((seq[i-1] in self.residue_types_1)
1727  or (seq[i-1] in self.residue_types_2))]
1728 
1729  for r in residues:
1730  # uniform random reactivities
1731  # getting reactivities from the CDF of an
1732  # exponential distribution
1733  rexp = numpy.random.exponential(0.00000001)
1734  prob = 1.0-math.exp(-rexp)
1735  self.reactivity_dictionary[(molecule, r)] = prob
1736 
1737  residues1 = [i for i in range(1, len(seq)+1)
1738  if seq[i-1] in self.residue_types_1]
1739  residues2 = [i for i in range(1, len(seq)+1)
1740  if seq[i-1] in self.residue_types_2]
1741  for r in residues1:
1742  s = IMP.atom.Selection(
1743  molecule.hier, residue_index=r, resolution=1)
1744  ps = s.get_selected_particles()
1745  for p in ps:
1746  index = p.get_index()
1747  self.indexes_dict1[index] = (molecule, r)
1748  self.protein_residue_dict[(molecule, r)] = index
1749  for r in residues2:
1750  s = IMP.atom.Selection(
1751  molecule.hier, residue_index=r, resolution=1)
1752  ps = s.get_selected_particles()
1753  for p in ps:
1754  index = p.get_index()
1755  self.indexes_dict2[index] = (molecule, r)
1756  self.protein_residue_dict[(molecule, r)] = index
1757 
1758  def get_all_possible_pairs(self):
1759  n = float(len(list(self.protein_residue_dict.keys())))
1760  return n*(n-1.0)/2.0
1761 
1762  def get_all_feasible_pairs(self, distance=21):
1763  import itertools
1764  particle_index_pairs = []
1765  nxl = 0
1766  for a, b in itertools.combinations(
1767  list(self.protein_residue_dict.keys()), 2):
1768  new_xl = {}
1769  index1 = self.protein_residue_dict[a]
1770  index2 = self.protein_residue_dict[b]
1771  particle_distance = IMP.core.get_distance(
1772  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1773  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1774  if particle_distance <= distance:
1775  particle_index_pairs.append((index1, index2))
1776  new_xl[self.cldb.protein1_key] = a[0].get_name()
1777  new_xl[self.cldb.protein2_key] = b[0].get_name()
1778  new_xl["molecule_object1"] = a[0]
1779  new_xl["molecule_object2"] = b[0]
1780  new_xl[self.cldb.residue1_key] = a[1]
1781  new_xl[self.cldb.residue2_key] = b[1]
1782  self.cldb.data_base[str(nxl)] = [new_xl]
1783  nxl += 1
1784  self.cldb._update()
1785  return self.cldb
1786 
1787  def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1788  noise=0.01, distance=21, max_delta_distance=10.0,
1789  xwalk_bin_path=None, confidence_false=0.75,
1790  confidence_true=0.75):
1791  import math
1792  from random import random
1793  import numpy as np
1794  number_of_spectra = 1
1795 
1796  self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1797  self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1798  self.cldb.data_base[str(number_of_spectra)] = []
1799  self.sites_weighted = None
1800 
1801  while number_of_spectra < total_number_of_spectra:
1802  if (random() > ambiguity_probability
1803  and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1804  # new spectrum
1805  number_of_spectra += 1
1806  self.cldb.data_base[str(number_of_spectra)] = []
1807  noisy = False
1808  if random() > noise:
1809  # not noisy cross-link
1810  pra, dist = self.get_random_residue_pair(
1811  distance, xwalk_bin_path, max_delta_distance)
1812  else:
1813  # noisy cross-link
1814  pra, dist = self.get_random_residue_pair(None, None, None)
1815  noisy = True
1816 
1817  new_xl = {}
1818  new_xl[self.cldb.protein1_key] = pra[0].get_name()
1819  new_xl[self.cldb.protein2_key] = pra[1].get_name()
1820  new_xl["molecule_object1"] = pra[0]
1821  new_xl["molecule_object2"] = pra[1]
1822  new_xl[self.cldb.residue1_key] = pra[2]
1823  new_xl[self.cldb.residue2_key] = pra[3]
1824  new_xl["Noisy"] = noisy
1825  # the reactivity is defined as r=1-exp(-k*Delta t)
1826  new_xl["Reactivity_Residue1"] = \
1827  self.reactivity_dictionary[(pra[0], pra[2])]
1828  new_xl["Reactivity_Residue2"] = \
1829  self.reactivity_dictionary[(pra[1], pra[3])]
1830  new_xl["Reactivity"] = \
1831  new_xl["Reactivity_Residue1"]+new_xl["Reactivity_Residue2"]
1832  if noisy:
1833  new_xl["IDScore"] = np.random.beta(1.0, self.beta_false)
1834  else:
1835  new_xl["IDScore"] = 1.0-np.random.beta(1.0, self.beta_true)
1836  new_xl["TargetDistance"] = dist
1837  new_xl["NoiseProbability"] = noise
1838  new_xl["AmbiguityProbability"] = ambiguity_probability
1839  # getting if it is intra or inter rigid body
1840  (p1, p2) = IMP.get_particles(
1841  self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1842  self.protein_residue_dict[(pra[1], pra[3])]])
1845  IMP.core.RigidMember(p1).get_rigid_body() ==
1846  IMP.core.RigidMember(p2).get_rigid_body()):
1847  new_xl["InterRigidBody"] = False
1848  elif (IMP.core.RigidMember.get_is_setup(p1) and
1850  IMP.core.RigidMember(p1).get_rigid_body() !=
1851  IMP.core.RigidMember(p2).get_rigid_body()):
1852  new_xl["InterRigidBody"] = True
1853  else:
1854  new_xl["InterRigidBody"] = None
1855 
1856  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1857  self.cldb._update()
1858  return self.cldb
1859 
1860  def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1861  max_delta_distance=None):
1862  import IMP.pmi.tools
1863  import math
1864  from random import choice
1865  if distance is None:
1866  # get a random pair
1867  while True:
1868  protein1, residue1 = \
1869  choice(list(self.protein_residue_dict.keys()))
1870  protein2, residue2 = \
1871  choice(list(self.protein_residue_dict.keys()))
1872  index1 = self.protein_residue_dict[(protein1, residue1)]
1873  index2 = self.protein_residue_dict[(protein2, residue2)]
1874  particle_distance = IMP.core.get_distance(
1875  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1876  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1877  if (protein1, residue1) != (protein2, residue2):
1878  break
1879  else:
1880  # get a pair of residues whose distance is below the threshold
1881  if not xwalk_bin_path:
1883  gcpf.set_distance(distance+max_delta_distance)
1884 
1885  while True:
1886  # setup the reaction rates lists
1887  if not self.sites_weighted:
1888  self.sites_weighted = []
1889  for key in self.reactivity_dictionary:
1890  r = self.reactivity_dictionary[key]
1891  self.sites_weighted.append((key, r))
1892  # get a random reaction site
1893  first_site = self.weighted_choice(self.sites_weighted)
1894  # get all distances
1895  if self.euclidean_interacting_pairs is None:
1896  self.euclidean_interacting_pairs = \
1897  gcpf.get_close_pairs(
1898  self.model,
1899  list(self.indexes_dict1.keys()),
1900  list(self.indexes_dict2.keys()))
1901  # get the partner for the first reacted site
1902  first_site_pairs = \
1903  [pair for pair in self.euclidean_interacting_pairs
1904  if self.indexes_dict1[pair[0]] == first_site or
1905  self.indexes_dict2[pair[1]] == first_site]
1906  if len(first_site_pairs) == 0:
1907  continue
1908  # build the list of second reaction sites
1909  second_sites_weighted = []
1910  for pair in first_site_pairs:
1911  if self.indexes_dict1[pair[0]] == first_site:
1912  second_site = self.indexes_dict2[pair[1]]
1913  if self.indexes_dict2[pair[1]] == first_site:
1914  second_site = self.indexes_dict1[pair[0]]
1915  r = self.reactivity_dictionary[second_site]
1916  second_sites_weighted.append((second_site, r))
1917  second_site = self.weighted_choice(second_sites_weighted)
1918  protein1, residue1 = first_site
1919  protein2, residue2 = second_site
1920  print("CrossLinkDataBaseFromStructure."
1921  "get_random_residue_pair:",
1922  "First site", first_site,
1923  self.reactivity_dictionary[first_site],
1924  "Second site", second_site,
1925  self.reactivity_dictionary[second_site])
1926  particle_pair = IMP.get_particles(
1927  self.model,
1928  [self.protein_residue_dict[first_site],
1929  self.protein_residue_dict[second_site]])
1930  particle_distance = IMP.core.get_distance(
1931  IMP.core.XYZ(particle_pair[0]),
1932  IMP.core.XYZ(particle_pair[1]))
1933 
1934  if particle_distance < distance \
1935  and (protein1, residue1) != (protein2, residue2):
1936  break
1937  elif (particle_distance >= distance
1938  and (protein1, residue1) != (protein2, residue2)
1939  and max_delta_distance):
1940  # allow some flexibility
1941  if particle_distance-distance < max_delta_distance:
1942  break
1943 
1944  else:
1945  if not self.xwalk_interacting_pairs:
1946  self.xwalk_interacting_pairs = \
1947  self.get_xwalk_distances(xwalk_bin_path, distance)
1948  interacting_pairs_weighted = []
1949 
1950  for pair in self.xwalk_interacting_pairs:
1951  protein1 = pair[0]
1952  protein2 = pair[1]
1953  residue1 = pair[2]
1954  residue2 = pair[3]
1955  weight1 = math.exp(
1956  -self.reactivity_dictionary[(protein1, residue1)]
1957  / self.kt)
1958  weight2 = math.exp(
1959  -self.reactivity_dictionary[(protein2, residue2)]
1960  / self.kt)
1961  interacting_pairs_weighted.append((pair, weight1*weight2))
1962 
1963  pair = self.weighted_choice(interacting_pairs_weighted)
1964  protein1 = pair[0]
1965  protein2 = pair[1]
1966  residue1 = pair[2]
1967  residue2 = pair[3]
1968  particle_distance = float(pair[4])
1969 
1970  return ((protein1, protein2, residue1, residue2)), particle_distance
1971 
1972  def get_xwalk_distances(self, xwalk_bin_path, distance):
1973  import IMP.pmi.output
1974  import os
1975  o = IMP.pmi.output.Output(atomistic=True)
1976  o.init_pdb("xwalk.pdb", self.representation.prot)
1977  o.write_pdb("xwalk.pdb")
1978  namechainiddict = o.dictchain["xwalk.pdb"]
1979  chainiddict = {}
1980 
1981  for key in namechainiddict:
1982  chainiddict[namechainiddict[key]] = key
1983  xwalkout = os.popen('java -Xmx256m -cp ' + xwalk_bin_path
1984  + ' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
1985  '-a1 cb -a2 cb -max '
1986  + str(distance) + ' -bb').read()
1987 
1988  output_list_of_distance = []
1989  for line in xwalkout.split("\n")[0:-2]:
1990  tokens = line.split()
1991  first = tokens[2]
1992  second = tokens[3]
1993  distance = float(tokens[6])
1994  fs = first.split("-")
1995  ss = second.split("-")
1996  chainid1 = fs[2]
1997  chainid2 = ss[2]
1998  protein1 = chainiddict[chainid1]
1999  protein2 = chainiddict[chainid2]
2000  residue1 = int(fs[1])
2001  residue2 = int(ss[1])
2002  output_list_of_distance.append(
2003  (protein1, protein2, residue1, residue2, distance))
2004  return output_list_of_distance
2005 
2006  def weighted_choice(self, choices):
2007  import random
2008  # http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
2009  total = sum(w for c, w in choices)
2010  r = random.uniform(0, total)
2011  upto = 0
2012  for c, w in choices:
2013  if upto + w > r:
2014  return c
2015  upto += w
2016  assert False, "Shouldn't get here"
2017 
2018  def save_rmf_snapshot(self, filename, color_id=None):
2019  import IMP.rmf
2020  import RMF
2021  if color_id is None:
2022  color_id = "Reactivity"
2023  sorted_group_ids = sorted(list(self.cldb.data_base.keys()))
2024  list_of_pairs = []
2025  color_scores = []
2026  for group in sorted_group_ids:
2027  group_dists_particles = []
2028  for xl in self.cldb.data_base[group]:
2029  xllabel = self.cldb.get_short_cross_link_string(xl)
2030  (c1, c2, r1, r2) = (xl["molecule_object1"],
2031  xl["molecule_object2"],
2032  xl[self.cldb.residue1_key],
2033  xl[self.cldb.residue2_key])
2034  try:
2035  index1 = self.protein_residue_dict[(c1, r1)]
2036  index2 = self.protein_residue_dict[(c2, r2)]
2037  p1, p2 = (IMP.get_particles(self.model, [index1])[0],
2038  IMP.get_particles(self.model, [index2])[0])
2039  mdist = xl["TargetDistance"]
2040  except TypeError:
2041  print("TypeError or missing chain/residue ",
2042  r1, c1, r2, c2)
2043  continue
2044  group_dists_particles.append((mdist, p1, p2, xllabel,
2045  float(xl[color_id])))
2046  if group_dists_particles:
2047  (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2048  = min(group_dists_particles, key=lambda t: t[0])
2049  color_scores.append(mincolor_score)
2050  list_of_pairs.append((minp1, minp2, minxllabel,
2051  mincolor_score))
2052  else:
2053  continue
2054 
2055  m = self.model
2056  linear = IMP.core.Linear(0, 0.0)
2057  linear.set_slope(1.0)
2058  dps2 = IMP.core.DistancePairScore(linear)
2059  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
2060  sgs = []
2061  offset = min(color_scores)
2062  maxvalue = max(color_scores)
2063  for pair in list_of_pairs:
2064  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
2065  pr.set_name(pair[2])
2066  factor = (pair[3]-offset)/(maxvalue-offset)
2067  c = IMP.display.get_rgb_color(factor)
2068  seg = IMP.algebra.Segment3D(
2069  IMP.core.XYZ(pair[0]).get_coordinates(),
2070  IMP.core.XYZ(pair[1]).get_coordinates())
2071  rslin.add_restraint(pr)
2072  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
2073 
2074  rh = RMF.create_rmf_file(filename)
2075  IMP.rmf.add_hierarchies(rh, [self.system.hier])
2076  IMP.rmf.add_restraints(rh, [rslin])
2077  IMP.rmf.add_geometries(rh, sgs)
2078  IMP.rmf.save_frame(rh)
2079  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1754
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1819
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: pmi/tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
Get the particles from a list of indexes.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:749
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:41
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:199
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:22
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Score a pair of particles based on the distance between their centers.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:25
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Definition: exception.h:48
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
class to link stat files to several rmf files
Definition: output.py:1253
class to allow more advanced handling of RMF files.
Definition: output.py:1129
Mapping between FASTA one-letter codes and residue types.
Definition: alphabets.py:1
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:31
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.