IMP Reference Guide develop.d4e9f3251e, 2024/04/26
The Integrative Modeling Platform
io/crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 
8 from __future__ import print_function
9 import IMP
10 import IMP.pmi
11 import IMP.pmi.output
12 import IMP.pmi.alphabets
13 import IMP.atom
14 import IMP.core
15 import IMP.algebra
16 import IMP.rmf
17 import RMF
18 import IMP.display
19 import operator
20 import math
21 import sys
22 import ihm.location
23 import ihm.dataset
24 from collections import defaultdict
25 import numpy
26 
27 
28 # json default serializations
29 def set_json_default(obj):
30  if isinstance(obj, set):
31  return list(obj)
32  if isinstance(obj, IMP.pmi.topology.Molecule):
33  return str(obj)
34  raise TypeError
35 
36 
37 # Handle and return data that must be a string
38 if sys.version_info[0] >= 3:
39  def _handle_string_input(inp):
40  if not isinstance(inp, str):
41  raise TypeError("expecting a string")
42  return inp
43 else:
44  def _handle_string_input(inp):
45  if not isinstance(inp, (str, unicode)): # noqa: F821
46  raise TypeError("expecting a string or unicode")
47  # Coerce to non-unicode representation (str)
48  if isinstance(inp, unicode): # noqa: F821
49  return str(inp)
50  else:
51  return inp
52 
53 
54 class _CrossLinkDataBaseStandardKeys(object):
55  '''
56  This class sets up all the standard keys needed to
57  identify the cross-link features in the data sets
58  '''
59  def __init__(self):
60  self.type = {}
61  self.protein1_key = "Protein1"
62  self.type[self.protein1_key] = str
63  self.protein2_key = "Protein2"
64  self.type[self.protein2_key] = str
65  self.residue1_key = "Residue1"
66  self.type[self.residue1_key] = int
67  self.residue2_key = "Residue2"
68  self.type[self.residue2_key] = int
69  self.residue1_amino_acid_key = "Residue1AminoAcid"
70  self.type[self.residue1_amino_acid_key] = str
71  self.residue2_amino_acid_key = "Residue2AminoAcid"
72  self.type[self.residue2_amino_acid_key] = str
73  self.residue1_moiety_key = "Residue1Moiety"
74  self.type[self.residue1_moiety_key] = str
75  self.residue2_moiety_key = "Residue2Moiety"
76  self.type[self.residue2_moiety_key] = str
77  self.site_pairs_key = "SitePairs"
78  self.type[self.site_pairs_key] = str
79  self.unique_id_key = "XLUniqueID"
80  self.type[self.unique_id_key] = str
81  self.unique_sub_index_key = "XLUniqueSubIndex"
82  self.type[self.unique_sub_index_key] = int
83  self.unique_sub_id_key = "XLUniqueSubID"
84  self.type[self.unique_sub_id_key] = str
85  self.data_set_name_key = "DataSetName"
86  self.type[self.data_set_name_key] = str
87  self.cross_linker_chemical_key = "CrossLinkerChemical"
88  self.type[self.cross_linker_chemical_key] = str
89  self.id_score_key = "IDScore"
90  self.type[self.id_score_key] = float
91  self.fdr_key = "FDR"
92  self.type[self.fdr_key] = float
93  self.quantitation_key = "Quantitation"
94  self.type[self.quantitation_key] = float
95  self.redundancy_key = "Redundancy"
96  self.type[self.redundancy_key] = int
97  self.redundancy_list_key = "RedundancyList"
98  self.type[self.redundancy_list_key] = list
99  self.ambiguity_key = "Ambiguity"
100  self.type[self.ambiguity_key] = int
101  self.residue1_links_number_key = "Residue1LinksNumber"
102  self.type[self.residue1_links_number_key] = int
103  self.residue2_links_number_key = "Residue2LinksNumber"
104  self.type[self.residue2_links_number_key] = int
105  self.type[self.ambiguity_key] = int
106  self.state_key = "State"
107  self.type[self.state_key] = int
108  self.sigma1_key = "Sigma1"
109  self.type[self.sigma1_key] = str
110  self.sigma2_key = "Sigma2"
111  self.type[self.sigma2_key] = str
112  self.psi_key = "Psi"
113  self.type[self.psi_key] = str
114  self.distance_key = "Distance"
115  self.type[self.distance_key] = float
116  self.min_ambiguous_distance_key = "MinAmbiguousDistance"
117  self.type[self.min_ambiguous_distance_key] = float
118  # link types are either Monolink, Intralink or Interlink
119  self.link_type_key = "LinkType"
120  self.type[self.link_type_key] = str
121 
122  self.ordered_key_list = [
123  self.data_set_name_key, self.unique_id_key,
124  self.unique_sub_index_key, self.unique_sub_id_key,
125  self.protein1_key, self.protein2_key, self.residue1_key,
126  self.residue2_key, self.residue1_amino_acid_key,
127  self.residue2_amino_acid_key, self.residue1_moiety_key,
128  self.residue2_moiety_key, self.cross_linker_chemical_key,
129  self.id_score_key, self.fdr_key, self.quantitation_key,
130  self.redundancy_key, self.redundancy_list_key, self.state_key,
131  self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
132  self.min_ambiguous_distance_key, self.link_type_key]
133 
134 
135 class _ProteinsResiduesArray(tuple):
136  '''
137  This class inherits from tuple and is a shorthand for a cross-link
138  (p1,p2,r1,r2) or a monolink (p1,r1), where p1 and p2 are protein1
139  and protein2, and r1 and r2 are residue1 and residue2.
140  '''
141 
142  def __new__(self, input_data):
143  '''
144  @param input_data can be a dict or a tuple
145  '''
146  self.cldbsk = _CrossLinkDataBaseStandardKeys()
147  if type(input_data) is dict:
148  monolink = False
149  p1 = input_data[self.cldbsk.protein1_key]
150  try:
151  p2 = input_data[self.cldbsk.protein2_key]
152  except KeyError:
153  monolink = True
154  r1 = input_data[self.cldbsk.residue1_key]
155  try:
156  r2 = input_data[self.cldbsk.residue2_key]
157  except KeyError:
158  monolink = True
159  if not monolink:
160  t = (p1, p2, r1, r2)
161  else:
162  t = (p1, "", r1, None)
163  elif type(input_data) is tuple:
164  if len(input_data) > 4 or len(input_data) == 3 \
165  or len(input_data) == 1:
166  raise TypeError(
167  "_ProteinsResiduesArray: must have either 2 or 4 elements")
168  if len(input_data) == 4:
169  p1 = _handle_string_input(input_data[0])
170  p2 = _handle_string_input(input_data[1])
171  r1 = input_data[2]
172  r2 = input_data[3]
173  if (type(r1) is not int) and (r1 is not None):
174  raise TypeError(
175  "_ProteinsResiduesArray: residue1 must be an integer")
176  if (type(r2) is not int) and (r2 is not None):
177  raise TypeError(
178  "_ProteinsResiduesArray: residue2 must be an integer")
179  t = (p1, p2, r1, r2)
180  if len(input_data) == 2:
181  p1 = _handle_string_input(input_data[0])
182  r1 = input_data[1]
183  if type(r1) is not int:
184  raise TypeError(
185  "_ProteinsResiduesArray: residue1 must be an integer")
186  t = (p1, "", r1, None)
187  else:
188  raise TypeError(
189  "_ProteinsResiduesArray: input must be a dict or tuple")
190  return tuple.__new__(_ProteinsResiduesArray, t)
191 
192  def get_inverted(self):
193  '''
194  Returns a _ProteinsResiduesArray instance with protein1 and
195  protein2 inverted
196  '''
197  return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
198 
199  def __repr__(self):
200  outstr = self.cldbsk.protein1_key + " " + str(self[0])
201  outstr += " " + self.cldbsk.protein2_key + " " + str(self[1])
202  outstr += " " + self.cldbsk.residue1_key + " " + str(self[2])
203  outstr += " " + self.cldbsk.residue2_key + " " + str(self[3])
204  return outstr
205 
206  def __str__(self):
207  outstr = str(self[0]) + "." + str(self[2]) + "-" + str(self[1]) \
208  + "." + str(self[3])
209  return outstr
210 
211 
212 class FilterOperator(object):
213  '''
214  This class allows the creation of filter functions that can be passed to
215  the CrossLinkDataBase in this way:
216 
217  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
218 
219  where cldb is a CrossLinkDataBase instance and is only needed to get
220  the standard keywords.
221 
222  A filter operator can be evaluated on a CrossLinkDataBase item xl and
223  returns a boolean:
224 
225  fo.evaluate(xl)
226 
227  which is used to filter the database.
228  '''
229 
230  def __init__(self, argument1, operator, argument2):
231  '''
232  (argument1,operator,argument2) can be either a
233  (keyword,operator.eq|lt|gt...,value)
234  or (FilterOperator1,operator.or|and...,FilterOperator2)
235  '''
236  if isinstance(argument1, FilterOperator):
237  self.operations = [argument1, operator, argument2]
238  else:
239  self.operations = []
240  self.values = (argument1, operator, argument2)
241 
242  def __or__(self, FilterOperator2):
243  return FilterOperator(self, operator.or_, FilterOperator2)
244 
245  def __and__(self, FilterOperator2):
246  return FilterOperator(self, operator.and_, FilterOperator2)
247 
248  def __invert__(self):
249  return FilterOperator(self, operator.not_, None)
250 
251  def evaluate(self, xl_item):
252 
253  if len(self.operations) == 0:
254  keyword, operator, value = self.values
255  return operator(xl_item[keyword], value)
256  FilterOperator1, op, FilterOperator2 = self.operations
257 
258  if FilterOperator2 is None:
259  return op(FilterOperator1.evaluate(xl_item))
260  else:
261  return op(FilterOperator1.evaluate(xl_item),
262  FilterOperator2.evaluate(xl_item))
263 
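As an illustration of the composition described in the docstring above, the sketch below builds a composite filter over a tiny in-memory database constructed directly from a dictionary (the same structure filter() uses internally); the protein names, scores and the 0.5 threshold are purely illustrative.

import operator
import IMP.pmi.io.crosslink

FO = IMP.pmi.io.crosslink.FilterOperator
xl1 = {"Protein1": "AAA", "Protein2": "BBB",
       "Residue1": 10, "Residue2": 20, "IDScore": 0.9}
xl2 = {"Protein1": "CCC", "Protein2": "DDD",
       "Residue1": 5, "Residue2": 7, "IDScore": 0.2}
cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(
    data_base={"1": [xl1], "2": [xl2]})
# cross-links involving "AAA" on either side AND with IDScore above 0.5
fo = ((FO(cldb.protein1_key, operator.eq, "AAA")
       | FO(cldb.protein2_key, operator.eq, "AAA"))
      & FO(cldb.id_score_key, operator.gt, 0.5))
print([fo.evaluate(xl) for xl in cldb])   # -> [True, False]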
264 
265 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
266  '''
267  This class is needed to convert the keywords from a generic database
268  to the standard ones
269  '''
270 
271  def __init__(self, list_parser=None):
272  '''
273  @param list_parser an instance of ResiduePairListParser, if any
274  is needed
275  '''
276  self.converter = {}
277  self.backward_converter = {}
278  _CrossLinkDataBaseStandardKeys.__init__(self)
279  self.rplp = list_parser
280  if self.rplp is None:
281  # either you have protein1, protein2, residue1, residue2
282  self.compulsory_keys = set([self.protein1_key,
283  self.protein2_key,
284  self.residue1_key,
285  self.residue2_key])
286  else:
287  self.compulsory_keys = self.rplp.get_compulsory_keys()
288  self.setup_keys = set()
289 
290  def check_keys(self):
291  '''
292  Check whether all the necessary keys have been set up
293  '''
294  setup_keys = set(self.get_setup_keys())
295  if self.compulsory_keys & setup_keys != self.compulsory_keys:
296  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup "
297  "all necessary keys")
298 
299  def get_setup_keys(self):
300  '''
301  Returns the keys that have been set up so far
302  '''
303  return self.backward_converter.keys()
304 
305  def set_standard_keys(self):
306  """
307  This sets up the standard compulsory keys as defined by
308  _CrossLinkDataBaseStandardKeys
309  """
310  for ck in self.compulsory_keys:
311  self.converter[ck] = ck
312  self.backward_converter[ck] = ck
313 
314  def set_unique_id_key(self, origin_key):
315  self.converter[origin_key] = self.unique_id_key
316  self.backward_converter[self.unique_id_key] = origin_key
317 
318  def set_protein1_key(self, origin_key):
319  self.converter[origin_key] = self.protein1_key
320  self.backward_converter[self.protein1_key] = origin_key
321 
322  def set_protein2_key(self, origin_key):
323  self.converter[origin_key] = self.protein2_key
324  self.backward_converter[self.protein2_key] = origin_key
325 
326  def set_residue1_key(self, origin_key):
327  self.converter[origin_key] = self.residue1_key
328  self.backward_converter[self.residue1_key] = origin_key
329 
330  def set_residue2_key(self, origin_key):
331  self.converter[origin_key] = self.residue2_key
332  self.backward_converter[self.residue2_key] = origin_key
333 
334  def set_residue1_amino_acid_key(self, origin_key):
335  self.converter[origin_key] = self.residue1_amino_acid_key
336  self.backward_converter[self.residue1_amino_acid_key] = origin_key
337 
338  def set_residue2_amino_acid_key(self, origin_key):
339  self.converter[origin_key] = self.residue2_amino_acid_key
340  self.backward_converter[self.residue2_amino_acid_key] = origin_key
341 
342  def set_residue1_moiety_key(self, origin_key):
343  self.converter[origin_key] = self.residue1_moiety_key
344  self.backward_converter[self.residue1_moiety_key] = origin_key
345 
346  def set_residue2_moiety_key(self, origin_key):
347  self.converter[origin_key] = self.residue2_moiety_key
348  self.backward_converter[self.residue2_moiety_key] = origin_key
349 
350  def set_site_pairs_key(self, origin_key):
351  self.converter[origin_key] = self.site_pairs_key
352  self.backward_converter[self.site_pairs_key] = origin_key
353 
354  def set_id_score_key(self, origin_key):
355  self.converter[origin_key] = self.id_score_key
356  self.backward_converter[self.id_score_key] = origin_key
357 
358  def set_fdr_key(self, origin_key):
359  self.converter[origin_key] = self.fdr_key
360  self.backward_converter[self.fdr_key] = origin_key
361 
362  def set_quantitation_key(self, origin_key):
363  self.converter[origin_key] = self.quantitation_key
364  self.backward_converter[self.quantitation_key] = origin_key
365 
366  def set_psi_key(self, origin_key):
367  self.converter[origin_key] = self.psi_key
368  self.backward_converter[self.psi_key] = origin_key
369 
370  def set_link_type_key(self, link_type_key):
371  self.converter[link_type_key] = self.link_type_key
372  self.backward_converter[self.link_type_key] = link_type_key
373 
374  def get_converter(self):
375  '''
376  Returns the dictionary that converts the old keywords to the new ones
377  '''
378  self.check_keys()
379  return self.converter
380 
381  def get_backward_converter(self):
382  '''
383  Returns the dictionary that converts the new keywords to the old ones
384  '''
385  self.check_keys()
386  return self.backward_converter
387 
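As a usage sketch, a converter for a comma-separated file whose columns are named, say, "prot1", "prot2", "res1", "res2" and "score" (hypothetical column names) could be set up as follows.

import IMP.pmi.io.crosslink

cldbkc = IMP.pmi.io.crosslink.CrossLinkDataBaseKeywordsConverter()
cldbkc.set_protein1_key("prot1")
cldbkc.set_protein2_key("prot2")
cldbkc.set_residue1_key("res1")
cldbkc.set_residue2_key("res2")
cldbkc.set_id_score_key("score")
# get_converter() calls check_keys() and raises KeyError if any
# compulsory key (here protein1/2 and residue1/2) has not been set
converter = cldbkc.get_converter()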
388 
389 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
390  '''
391  A class to handle different styles of site-pair parsers.
392  Implemented styles:
393  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
394  [Y3-;K4-] for dead-ends
395  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
396  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
397  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
398  ''' # noqa: E501
399 
400  import re
401 
402  def __init__(self, style):
403  _CrossLinkDataBaseStandardKeys.__init__(self)
404  if style == "MSSTUDIO":
405  self.style = style
406  self.compulsory_keys = set([self.protein1_key,
407  self.protein2_key,
408  self.site_pairs_key])
409  elif style == "XTRACT" or style == "QUANTITATION":
410  self.style = style
411  self.compulsory_keys = set([self.site_pairs_key])
412  elif style == "LAN_HUANG":
413  self.style = style
414  self.compulsory_keys = set([self.site_pairs_key])
415  else:
416  raise Exception("ResiduePairListParser: unknown list parser style")
417 
418  def get_compulsory_keys(self):
419  return self.compulsory_keys
420 
421  def get_list(self, input_string):
422  '''
423  This function returns a list of cross-linked residues and the
424  corresponding list of cross-linked chains. The latter list can be
425  empty, if the style doesn't have the corresponding information.
426  '''
427  if self.style == "MSSTUDIO":
428  input_string = input_string.replace("[", "")
429  input_string = input_string.replace("]", "")
430  input_string_pairs = input_string.split(";")
431  residue_pair_indexes = []
432  chain_pair_indexes = []
433  for s in input_string_pairs:
434  m1 = self.re.search(
435  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
436  r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
437  s)
438  m2 = self.re.search(
439  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
440  if m1:
441  # cross-link
442  (residue_type_1, residue_index_1, residue_type_2,
443  residue_index_2) = m1.group(1, 2, 3, 4)
444  residue_pair_indexes.append((residue_index_1,
445  residue_index_2))
446  elif m2:
447  # dead end
448  residue_type_1, residue_index_1 = m2.group(1, 2)
449  # at this stage chain_pair_indexes is empty
450  return residue_pair_indexes, chain_pair_indexes
451  if self.style == "XTRACT" or self.style == "QUANTITATION":
452  if ":x:" in input_string:
453  # if it is a cross-link....
454  input_strings = input_string.split(":x:")
455  first_peptides = input_strings[0].split(":|:")
456  second_peptides = input_strings[1].split(":|:")
457  first_peptides_identifiers = [
458  (f.split(":")[0],
459  f.split(":")[1]) for f in first_peptides]
460  second_peptides_identifiers = [
461  (f.split(":")[0],
462  f.split(":")[1]) for f in second_peptides]
463  residue_pair_indexes = []
464  chain_pair_indexes = []
465  for fpi in first_peptides_identifiers:
466  for spi in second_peptides_identifiers:
467  chain1 = fpi[0]
468  chain2 = spi[0]
469  residue1 = fpi[1]
470  residue2 = spi[1]
471  residue_pair_indexes.append((residue1, residue2))
472  chain_pair_indexes.append((chain1, chain2))
473  return residue_pair_indexes, chain_pair_indexes
474  else:
475  # if it is a monolink....
476  first_peptides = input_string.split(":|:")
477  first_peptides_identifiers = [
478  (f.split(":")[0], f.split(":")[1]) for f in first_peptides]
479  residue_indexes = []
480  chain_indexes = []
481  for fpi in first_peptides_identifiers:
482  chain1 = fpi[0]
483  residue1 = fpi[1]
484  residue_indexes.append((residue1,))
485  chain_indexes.append((chain1,))
486  return residue_indexes, chain_indexes
487  if self.style == "LAN_HUANG":
488  input_strings = input_string.split("-")
489  chain1, first_series = input_strings[0].split(":")
490  chain2, second_series = input_strings[1].split(":")
491 
492  first_residues = first_series.replace(";", "|").split("|")
493  second_residues = second_series.replace(";", "|").split("|")
494  residue_pair_indexes = []
495  chain_pair_indexes = []
496  for fpi in first_residues:
497  for spi in second_residues:
498  residue1 = self.re.sub("[^0-9]", "", fpi)
499  residue2 = self.re.sub("[^0-9]", "", spi)
500  residue_pair_indexes.append((residue1, residue2))
501  chain_pair_indexes.append((chain1, chain2))
502  return residue_pair_indexes, chain_pair_indexes
503 
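For instance, the QUANTITATION example string from the docstring above is parsed into residue and chain pairs like this (a minimal sketch; note that the residue indexes are returned as strings).

import IMP.pmi.io.crosslink

rplp = IMP.pmi.io.crosslink.ResiduePairListParser("QUANTITATION")
pairs, chains = rplp.get_list(
    "sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337")
# pairs  -> [('280', '337')]
# chains -> [('sp|P33298|PRS6B_YEAST', 'sp|P33298|PRS6B_YEAST')]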
504 
505 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
506  '''
507  A class to handle cross-link data files with a fixed format;
508  currently only ProXL is supported
509  '''
510  def __init__(self, format):
511  _CrossLinkDataBaseStandardKeys.__init__(self)
512  if format == "PROXL":
513  self.format = format
514  else:
515  raise Exception("FixedFormatParser: unknown list format name")
516 
517  def get_data(self, input_string):
518  if self.format == "PROXL":
519  tokens = input_string.split("\t")
520  xl = {}
521  if tokens[0] == "SEARCH ID(S)":
522  return None
523  else:
524  xl[self.protein1_key] = tokens[2]
525  xl[self.protein2_key] = tokens[4]
526  xl[self.residue1_key] = int(tokens[3])
527  xl[self.residue2_key] = int(tokens[5])
528  return xl
529 
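A minimal sketch of how the ProXL parser consumes one tab-separated row; the row below is made up, only the columns read by get_data() above are meaningful, and header rows starting with "SEARCH ID(S)" return None.

import IMP.pmi.io.crosslink

ffp = IMP.pmi.io.crosslink.FixedFormatParser("PROXL")
row = "1\tPEPTIDE\tProtA\t34\tProtB\t127"   # hypothetical ProXL line
xl = ffp.get_data(row)
# xl -> {'Protein1': 'ProtA', 'Protein2': 'ProtB',
#        'Residue1': 34, 'Residue2': 127}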
530 
531 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
532  '''
533  This class handles a cross-link dataset and performs filtering
534  operations, adding cross-links, merging datasets, etc.
535  '''
536 
537  def __init__(self, converter=None, data_base=None, fasta_seq=None,
538  linkable_aa=('K',)):
539  '''
540  Constructor.
541  @param converter an instance of CrossLinkDataBaseKeywordsConverter
542  @param data_base an instance of CrossLinkDataBase to build the
543  new database on
544  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
545  protein fasta sequences to check cross-link consistency.
546  If not given consistency will not be checked
547  @param linkable_aa a tuple containing one-letter amino acids which
548  are linkable by the cross-linker; only used if the database
549  DOES NOT provide a value for a certain residueX_amino_acid_key
550  and if a fasta_seq is given
551  '''
552 
553  if data_base is None:
554  self.data_base = {}
555  else:
556  self.data_base = data_base
557 
558  _CrossLinkDataBaseStandardKeys.__init__(self)
559  if converter is not None:
560  self.cldbkc = converter # type: CrossLinkDataBaseKeywordsConverter
561  self.list_parser = self.cldbkc.rplp
562  self.converter = converter.get_converter()
563 
564  else:
565  self.cldbkc = None # type: CrossLinkDataBaseKeywordsConverter
566  self.list_parser = None
567  self.converter = None
568 
569  # default amino acids considered to be 'linkable' if none are given
570  self.def_aa_tuple = linkable_aa
571  self.fasta_seq = fasta_seq # type: IMP.pmi.topology.Sequences
572  self.dataset = None
573  self.name = None
574  self._update()
575 
576  def _update(self):
577  '''
578  Update the whole dataset after changes
579  '''
580  self.update_cross_link_unique_sub_index()
581  self.update_cross_link_redundancy()
582  self.update_residues_links_number()
583  self.check_cross_link_consistency()
584 
585  def __iter__(self):
586  sorted_ids = sorted(self.data_base.keys())
587  for k in sorted_ids:
588  for xl in self.data_base[k]:
589  yield xl
590 
591  def xlid_iterator(self):
592  sorted_ids = sorted(self.data_base.keys())
593  for xlid in sorted_ids:
594  yield xlid
595 
596  def __getitem__(self, xlid):
597  return self.data_base[xlid]
598 
599  def __len__(self):
600  return len([xl for xl in self])
601 
602  def get_name(self):
603  return self.name
604 
605  def set_name(self, name):
606  new_data_base = {}
607  for k in self.data_base:
608  new_data_base[k + "." + name] = self.data_base[k]
609  self.data_base = new_data_base
610  self.name = name
611  self._update()
612 
613  def get_number_of_xlid(self):
614  return len(self.data_base)
615 
616  def create_set_from_file(self, file_name, converter=None,
617  FixedFormatParser=None, encoding=None):
618  '''
619  if FixedFormatParser is not specified, the file is assumed to be
620  in comma-separated-values format
621 
622  @param file_name a txt file to be parsed
623  @param converter an instance of CrossLinkDataBaseKeywordsConverter
624  @param FixedFormatParser a parser for a fixed format
625  @param encoding the encoding of the file, if not the locale
626  default (usually UTF-8)
627  '''
628  if not FixedFormatParser:
629  xl_list = IMP.pmi.tools.get_db_from_csv(file_name, encoding)
630 
631  if converter is not None:
632  self.cldbkc = converter
633  self.list_parser = self.cldbkc.rplp
634  self.converter = converter.get_converter()
635 
636  if not self.list_parser:
637  # normal procedure without a list_parser
638  # each line is a cross-link
639  new_xl_dict = {}
640  for nxl, xl in enumerate(xl_list):
641  new_xl = {}
642  for k in xl:
643  if k in self.converter:
644  new_xl[self.converter[k]] = \
645  self.type[self.converter[k]](xl[k])
646  else:
647  new_xl[k] = xl[k]
648  if self.unique_id_key in self.cldbkc.get_setup_keys():
649  if new_xl[self.unique_id_key] not in new_xl_dict:
650  new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
651  else:
652  new_xl_dict[new_xl[self.unique_id_key]].append(
653  new_xl)
654  else:
655  if str(nxl) not in new_xl_dict:
656  new_xl_dict[str(nxl)] = [new_xl]
657  else:
658  new_xl_dict[str(nxl)].append(new_xl)
659 
660  else:
661  # with a list_parser, a line can be a list of ambiguous
662  # cross-links
663  new_xl_dict = {}
664  for nxl, entry in enumerate(xl_list):
665 
666  # first get the translated keywords
667  new_dict = {}
668  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
669  raise Exception(
670  "CrossLinkDataBase: expecting a site_pairs_key "
671  "for the site pair list parser")
672  for k in entry:
673  if k in self.converter:
674  new_dict[self.converter[k]] = \
675  self.type[self.converter[k]](entry[k])
676  else:
677  new_dict[k] = entry[k]
678 
679  (residue_pair_list,
680  chain_pair_list) = self.list_parser.get_list(
681  new_dict[self.site_pairs_key])
682 
683  # then create the cross-links
684  for n, p in enumerate(residue_pair_list):
685  is_monolink = False
686  if len(p) == 1:
687  is_monolink = True
688 
689  new_xl = {}
690  for k in new_dict:
691  new_xl[k] = new_dict[k]
692  new_xl[self.residue1_key] = \
693  self.type[self.residue1_key](p[0])
694  if not is_monolink:
695  new_xl[self.residue2_key] = \
696  self.type[self.residue2_key](p[1])
697 
698  if len(chain_pair_list) == len(residue_pair_list):
699  new_xl[self.protein1_key] = \
700  self.type[self.protein1_key](
701  chain_pair_list[n][0])
702  if not is_monolink:
703  new_xl[self.protein2_key] = \
704  self.type[self.protein2_key](
705  chain_pair_list[n][1])
706 
707  if not is_monolink:
708  new_xl[self.link_type_key] = "CROSSLINK"
709  else:
710  new_xl[self.link_type_key] = "MONOLINK"
711 
712  if self.unique_id_key in self.cldbkc.get_setup_keys():
713  if new_xl[self.unique_id_key] not in new_xl_dict:
714  new_xl_dict[new_xl[self.unique_id_key]] = \
715  [new_xl]
716  else:
717  new_xl_dict[new_xl[self.unique_id_key]].append(
718  new_xl)
719  else:
720  if str(nxl) not in new_xl_dict:
721  new_xl[self.unique_id_key] = str(nxl+1)
722  new_xl_dict[str(nxl)] = [new_xl]
723  else:
724  new_xl[self.unique_id_key] = str(nxl+1)
725  new_xl_dict[str(nxl)].append(new_xl)
726 
727  else:
728  '''
729  if FixedFormatParser is defined
730  '''
731  if sys.version_info[0] == 2:
732  def open_with_encoding(fname, mode, encoding):
733  return open(fname, mode)
734  else:
735  open_with_encoding = open
736 
737  new_xl_dict = {}
738  nxl = 0
739  with open_with_encoding(file_name, "r", encoding=encoding) as f:
740  for line in f:
741  xl = FixedFormatParser.get_data(line)
742  if xl:
743  xl[self.unique_id_key] = str(nxl+1)
744  new_xl_dict[str(nxl)] = [xl]
745  nxl += 1
746 
747  self.data_base = new_xl_dict
748  self.name = file_name
749  loc = ihm.location.InputFileLocation(file_name, details='Crosslinks')
750  self.dataset = ihm.dataset.CXMSDataset(loc)
751  self._update()
752 
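A typical loading pipeline combines a keyword converter with create_set_from_file(); a minimal sketch, where the file name and the column names are illustrative.

import IMP.pmi.io.crosslink

cldbkc = IMP.pmi.io.crosslink.CrossLinkDataBaseKeywordsConverter()
cldbkc.set_protein1_key("prot1")
cldbkc.set_protein2_key("prot2")
cldbkc.set_residue1_key("res1")
cldbkc.set_residue2_key("res2")

cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(cldbkc)
cldb.create_set_from_file("crosslinks.csv")
print(cldb.get_number_of_xlid(), len(cldb))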
753  def update_cross_link_unique_sub_index(self):
754  for k in self.data_base:
755  for n, xl in enumerate(self.data_base[k]):
756  xl[self.ambiguity_key] = len(self.data_base[k])
757  xl[self.unique_sub_index_key] = n+1
758  xl[self.unique_sub_id_key] = k + "." + str(n+1)
759 
760  def update_cross_link_redundancy(self):
761  redundancy_data_base = {}
762  for xl in self:
763  pra = _ProteinsResiduesArray(xl)
764  if pra not in redundancy_data_base:
765  redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
766  redundancy_data_base[pra.get_inverted()] = \
767  [xl[self.unique_sub_id_key]]
768  else:
769  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
770  redundancy_data_base[pra.get_inverted()].append(
771  xl[self.unique_sub_id_key])
772  for xl in self:
773  pra = _ProteinsResiduesArray(xl)
774  xl[self.redundancy_key] = len(redundancy_data_base[pra])
775  xl[self.redundancy_list_key] = redundancy_data_base[pra]
776 
777  def update_residues_links_number(self):
778  residue_links = {}
779  for xl in self:
780  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
781  if (p1, r1) not in residue_links:
782  residue_links[(p1, r1)] = set([(p2, r2)])
783  else:
784  residue_links[(p1, r1)].add((p2, r2))
785  if (p2, r2) not in residue_links:
786  residue_links[(p2, r2)] = set([(p1, r1)])
787  else:
788  residue_links[(p2, r2)].add((p1, r1))
789 
790  for xl in self:
791  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
792  xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
793  xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
794 
795  def check_cross_link_consistency(self):
796  """This function checks the consistency of the dataset with the
797  amino acid sequence"""
798  if self.cldbkc and self.fasta_seq:
799  cnt_matched, cnt_matched_file = 0, 0
800  matched = {}
801  non_matched = {}
802  for xl in self:
803  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
804  b_matched_file = False
805  if self.residue1_amino_acid_key in xl:
806  # either you know the residue type and aa_tuple is
807  # a single entry
808  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
809  b_matched = self._match_xlinks(p1, r1, aa_from_file)
810  b_matched_file = b_matched
811  else:
812  # or pass the possible list of types that can be
813  # cross-linked
814  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
815 
816  matched, non_matched = self._update_matched_xlinks(
817  b_matched, p1, r1, matched, non_matched)
818 
819  if self.residue2_amino_acid_key in xl:
820  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
821  b_matched = self._match_xlinks(p2, r2, aa_from_file)
822  b_matched_file = b_matched
823  else:
824  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
825 
826  matched, non_matched = self._update_matched_xlinks(
827  b_matched, p2, r2, matched, non_matched)
828  if b_matched:
829  cnt_matched += 1
830  if b_matched_file:
831  cnt_matched_file += 1
832  if len(self) > 0:
833  percentage_matched = round(100*cnt_matched/len(self), 1)
834  percentage_matched_file = round(
835  100 * cnt_matched_file / len(self), 1)
836  if matched or non_matched:
837  print(
838  "check_cross_link_consistency: Out of %d cross-links "
839  "%d were matched to the fasta sequence (%f %%).\n "
840  "%d were matched by using the cross-link file (%f %%)."
841  % (len(self), cnt_matched, percentage_matched,
842  cnt_matched_file, percentage_matched_file))
843  if non_matched:
844  print("check_cross_link_consistency: Warning: Non "
845  "matched xlinks:",
846  [(prot_name, sorted(list(non_matched[prot_name])))
847  for prot_name in non_matched])
848  return matched, non_matched
849 
850  def _match_xlinks(self, prot_name, res_index, aa_tuple):
851  # returns Boolean whether given aa matches a position in the fasta file
852  # cross-link files usually start counting at 1 and not 0; therefore
853  # subtract 1 to compare with fasta
854  amino_dict = IMP.pmi.alphabets.amino_acid
855  res_index -= 1
856  for amino_acid in aa_tuple:
857  if len(amino_acid) == 3:
858  amino_acid = amino_dict.get_one_letter_code_from_residue_type(
859  amino_acid.upper())
860  if prot_name in self.fasta_seq.sequences:
861  seq = self.fasta_seq.sequences[prot_name]
862  # if we are looking at the first amino acid in a given
863  # sequence always return a match
864  # the first aa should always be the n-terminal aa
865  # which may form a cross-link in any case (for BS3-like
866  # cross-linkers)
867  # for some data sets the first aa is at position 1;
868  # todo: check why this is the case
869  if res_index == 0 or res_index == 1:
870  return True
871  if res_index < len(seq):
872  if amino_acid == seq[res_index]:
873  return True
874  return False
875 
876  def _update_matched_xlinks(self, b_matched, prot, res, matched,
877  non_matched):
878  if b_matched:
879  if prot in matched:
880  matched[prot].add(res)
881  else:
882  matched[prot] = set([res])
883  else:
884  if prot in non_matched:
885  non_matched[prot].add(res)
886  else:
887  non_matched[prot] = set([res])
888  return matched, non_matched
889 
890  def get_cross_link_string(self, xl):
891  string = '|'
892  for k in self.ordered_key_list:
893  try:
894  string += str(k) + ":" + str(xl[k]) + "|"
895  except KeyError:
896  continue
897 
898  for k in xl:
899  if k not in self.ordered_key_list:
900  string += str(k) + ":" + str(xl[k]) + "|"
901 
902  return string
903 
904  def get_short_cross_link_string(self, xl):
905  string = '|'
906  list_of_keys = [self.data_set_name_key,
907  self.unique_sub_id_key,
908  self.protein1_key,
909  self.residue1_key,
910  self.protein2_key,
911  self.residue2_key,
912  self.state_key,
913  self.psi_key]
914 
915  for k in list_of_keys:
916  try:
917  string += str(xl[k]) + "|"
918  except KeyError:
919  continue
920 
921  return string
922 
923  def filter(self, FilterOperator):
924  new_xl_dict = {}
925  for id in self.data_base.keys():
926  for xl in self.data_base[id]:
927  if FilterOperator.evaluate(xl):
928  if id not in new_xl_dict:
929  new_xl_dict[id] = [xl]
930  else:
931  new_xl_dict[id].append(xl)
932  self._update()
933  cdb = CrossLinkDataBase(self.cldbkc, new_xl_dict)
934  cdb.dataset = self.dataset
935  cdb.name = self.name
936  return cdb
937 
938  def filter_score(self, score):
939  '''Get all cross-links with score greater than an input value'''
940  FilterOperator = IMP.pmi.io.crosslink.FilterOperator(
941  self.id_score_key, operator.gt, score)
942  return self.filter(FilterOperator)
943 
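The sketch below shows both filtering routes on a small in-memory database built directly from a dictionary; the values and the 30.0 threshold are illustrative.

import operator
import IMP.pmi.io.crosslink

FO = IMP.pmi.io.crosslink.FilterOperator
xl1 = {"Protein1": "AAA", "Protein2": "BBB",
       "Residue1": 10, "Residue2": 20, "IDScore": 35.0}
xl2 = {"Protein1": "CCC", "Protein2": "AAA",
       "Residue1": 5, "Residue2": 7, "IDScore": 12.0}
cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(
    data_base={"1": [xl1], "2": [xl2]})
cldb_aaa = cldb.filter(FO(cldb.protein1_key, operator.eq, "AAA"))
cldb_good = cldb.filter_score(30.0)
print(len(cldb_aaa), len(cldb_good))   # -> 1 1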
944  def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
945  '''
946  This function merges two cross-link datasets so that if two
947  conflicting cross-links have the same cross-link UniqueIDs,
948  the cross-links will be appended under the same UniqueID slots
949  with different SubIDs
950  '''
951  raise NotImplementedError()
952 
953  def append_database(self, db):
954  """Append cross-link dataset to this one."""
955  new_data_base = {}
956  for k in self.data_base:
957  new_data_base[k] = self.data_base[k]
958  for k in db.data_base:
959  new_data_base[k] = db.data_base[k]
960  self.data_base = new_data_base
961  self._update()
962 
963  def __iadd__(self, db):
964  self.append_database(db)
965  return self
966 
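The append operator is typically combined with set_name() so that unique IDs coming from different datasets do not collide; a minimal sketch (the dataset names "dss" and "edc" are illustrative).

import IMP.pmi.io.crosslink

xl_a = {"Protein1": "A", "Protein2": "B", "Residue1": 1, "Residue2": 2}
xl_b = {"Protein1": "C", "Protein2": "D", "Residue1": 3, "Residue2": 4}
db1 = IMP.pmi.io.crosslink.CrossLinkDataBase(data_base={"1": [xl_a]})
db2 = IMP.pmi.io.crosslink.CrossLinkDataBase(data_base={"1": [xl_b]})
db1.set_name("dss")   # keys become "1.dss"
db2.set_name("edc")   # keys become "1.edc"
db1 += db2            # calls append_database()
print(db1.get_number_of_xlid())   # -> 2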
967  def set_value(self, key, new_value, filter_operator=None):
968  '''
969  This function changes the value for a given key in the database.
970  For instance, one can change the name of a protein.
971  @param key: the key in the database that must be changed
972  @param new_value: the new value of the key
973  @param filter_operator: optional FilterOperator to change the value to
974  a subset of the database
975 
976  example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
977  FO(cldb.protein1_key, operator.eq, "AAA"))`
978  '''
979 
980  for xl in self:
981  if filter_operator is not None:
982  if filter_operator.evaluate(xl):
983  xl[key] = new_value
984  else:
985  xl[key] = new_value
986  self._update()
987 
988  def get_values(self, key):
989  '''
990  This function returns the list of values for a given key in the
991  database, alphanumerically sorted
992  '''
993  values = set()
994  for xl in self:
995  values.add(xl[key])
996  return sorted(list(values))
997 
998  def offset_residue_index(self, protein_name, offset):
999  '''
1000  This function offsets the residue indexes of a given protein by
1001  a specified value
1002  @param protein_name: the protein name that needs to be changed
1003  @param offset: the offset value
1004  '''
1005 
1006  for xl in self:
1007  if xl[self.protein1_key] == protein_name:
1008  xl[self.residue1_key] = xl[self.residue1_key] + offset
1009  if xl[self.protein2_key] == protein_name:
1010  xl[self.residue2_key] = xl[self.residue2_key] + offset
1011  self._update()
1012 
1013  def create_new_keyword(self, keyword, values_from_keyword=None):
1014  '''
1015  This function creates a new keyword for the whole database and
1016  sets the values from an existing keyword (optional); otherwise the
1017  values are set to None
1018  @param keyword the new keyword name
1019  @param values_from_keyword the keyword from which the values
1020  are copied
1021  '''
1022  for xl in self:
1023  if values_from_keyword is not None:
1024  xl[keyword] = xl[values_from_keyword]
1025  else:
1026  xl[keyword] = None
1027  self._update()
1028 
1029  def rename_proteins(self, old_to_new_names_dictionary,
1030  protein_to_rename="both"):
1031  '''
1032  This function renames all proteins contained in the input dictionary
1033  from the old names (keys) to the new names (values)
1034  @param old_to_new_names_dictionary dictionary for converting old
1035  to new names
1036  @param protein_to_rename specify whether to rename both or protein1
1037  or protein2 only
1038  '''
1039 
1040  for old_name in old_to_new_names_dictionary:
1041  new_name = old_to_new_names_dictionary[old_name]
1042  if protein_to_rename == "both" or protein_to_rename == "protein1":
1043  fo2 = FilterOperator(self.protein1_key, operator.eq, old_name)
1044  self.set_value(self.protein1_key, new_name, fo2)
1045  if protein_to_rename == "both" or protein_to_rename == "protein2":
1046  fo2 = FilterOperator(self.protein2_key, operator.eq, old_name)
1047  self.set_value(self.protein2_key, new_name, fo2)
1048 
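A short sketch of renaming and re-indexing on an in-memory database; the protein names and the offset of 100 are illustrative.

import IMP.pmi.io.crosslink

xl = {"Protein1": "ProtA", "Protein2": "ProtB",
      "Residue1": 10, "Residue2": 20}
cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(data_base={"1": [xl]})
cldb.offset_residue_index("ProtA", 100)   # Residue1 becomes 110
cldb.rename_proteins({"ProtA": "ProtA.1", "ProtB": "ProtB.1"})
print(cldb.get_values(cldb.protein1_key))   # -> ['ProtA.1']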
1049  def align_sequence(self, alignfile):
1050  """
1051  This function parses an alignment file obtained previously, to map
1052  cross-link residues onto the sequence of a homologous protein. It is
1053  useful if you want to map the cross-links onto a known homologous
1054  structure without building a comparative model.
1055  The txt file of the alignment should be structured with repeating
1056  blocks, as follows:
1057 
1058  >NAME_CROSSLINKED_PROTEIN
1059  >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1060  one-letter code sequence for cross-linked protein (one line)
1061  one-letter code sequence for homolog protein with known structure (one line)
1062  """ # noqa: E501
1063 
1064  f = open(alignfile, 'r')
1065 
1066  def _assign_residues(seq1, seq2):
1067  d = {}
1068  resinds1 = []
1069  nres = 0
1070  contiguousgap = 0
1071  for r1 in seq1:
1072  if r1 != "-":
1073  nres += 1
1074  resinds1.append(nres)
1075  else:
1076  resinds1.append(None)
1077 
1078  resinds2 = [None for i in resinds1]
1079  nres = 0
1080  lastresind = 1
1081  for pos, r2 in enumerate(seq2):
1082  if r2 != "-":
1083  nres += 1
1084  resinds2[pos] = nres
1085  lastresind = nres
1086  if contiguousgap > 1:
1087  for i in range(pos-int(contiguousgap/2), pos):
1088  resinds2[i] = nres
1089  contiguousgap = 0
1090  else:
1091  contiguousgap += 1
1092  resinds2[pos] = lastresind
1093 
1094  for n, r in enumerate(resinds1):
1095  if r is not None:
1096  d[r] = resinds2[n]
1097  return d
1098 
1099  aa = True
1100  sequences1 = {}
1101  sequences2 = {}
1102  with open(alignfile, 'r') as f:
1103  while aa:
1104  aa = f.readline()
1105  bb = f.readline()
1106  cc = f.readline()
1107  dd = f.readline()
1108  protname = aa.replace(">", "").replace("\n", "")
1109  seq1 = cc.replace(",", "").replace("\n", "")
1110  seq2 = dd.replace(",", "").replace("\n", "")
1111 
1112  n1 = 0
1113  n2 = 0
1114  tmpl1 = []
1115  tmpl2 = []
1116  for i in range(len(seq1)):
1117  if seq1[i] != "-":
1118  n1 += 1
1119  c1 = n1
1120  else:
1121  c1 = None
1122  if seq2[i] != "-":
1123  n2 += 1
1124  c2 = n2
1125  else:
1126  c2 = None
1127 
1128  tmpl1.append([i, c1, seq1[i]])
1129  tmpl2.append([i, c2, seq2[i]])
1130  sequences1[protname] = tmpl1
1131  sequences2[protname] = tmpl2
1132 
1133  d = _assign_residues(seq1, seq2)
1134 
1135  fo11 = FilterOperator(self.protein1_key, operator.eq, protname)
1136  fo12 = FilterOperator(self.protein2_key, operator.eq, protname)
1137 
1138  for xl in self:
1139  if fo11.evaluate(xl):
1140  t = [item for item in sequences1[protname]
1141  if item[1] == xl[self.residue1_key]][0]
1142  sequences1[protname][t[0]] = [
1143  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1144  t = [item for item in sequences2[protname]
1145  if item[1] == d[xl[self.residue1_key]]][0]
1146  sequences2[protname][t[0]] = [
1147  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1148 
1149  xl[self.residue1_key] = d[xl[self.residue1_key]]
1150 
1151  if fo12.evaluate(xl):
1152 
1153  t = [item for item in sequences1[protname]
1154  if item[1] == xl[self.residue2_key]][0]
1155  sequences1[protname][t[0]] = [
1156  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1157  t = [item for item in sequences2[protname]
1158  if item[1] == d[xl[self.residue2_key]]][0]
1159  sequences2[protname][t[0]] = [
1160  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1161 
1162  xl[self.residue2_key] = d[xl[self.residue2_key]]
1163 
1164  print("")
1165  print(aa.replace("\n", ""))
1166  print("".join([seq[2] for seq in sequences1[protname]]))
1167  print(bb.replace("\n", ""))
1168  print("".join([seq[2] for seq in sequences2[protname]]))
1169 
1170  def classify_crosslinks_by_score(self, number_of_classes):
1171  '''
1172  This function creates as many classes as given in the input
1173  (number_of_classes: integer) and partitions cross-links according
1174  to their identification scores. Classes are defined in the psi key.
1175  '''
1176 
1177  if self.id_score_key is not None:
1178  scores = self.get_values(self.id_score_key)
1179  else:
1180  raise ValueError(
1181  'The cross-link database does not contain score values')
1182  minscore = min(scores)
1183  maxscore = max(scores)
1184  scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1185  if self.psi_key is None:
1186  self.create_new_keyword(self.psi_key, values_from_keyword=None)
1187  for xl in self:
1188  score = xl[self.id_score_key]
1189  for n, classmin in enumerate(scoreclasses[0:-1]):
1190  if score >= classmin and score <= scoreclasses[n+1]:
1191  xl[self.psi_key] = str("CLASS_"+str(n))
1192  self._update()
1193 
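For example, partitioning a two-entry in-memory database into two score classes assigns the class labels to the Psi keyword; the scores are illustrative.

import IMP.pmi.io.crosslink

xls = {"1": [{"Protein1": "A", "Protein2": "B", "Residue1": 1,
              "Residue2": 2, "IDScore": 10.0}],
       "2": [{"Protein1": "A", "Protein2": "B", "Residue1": 3,
              "Residue2": 4, "IDScore": 30.0}]}
cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(data_base=xls)
cldb.classify_crosslinks_by_score(2)
print(cldb.get_values(cldb.psi_key))   # -> ['CLASS_0', 'CLASS_1']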
1194  def clone_protein(self, protein_name, new_protein_name):
1195  for id in self.data_base.keys():
1196  new_data_base = []
1197  for xl in self.data_base[id]:
1198  new_data_base.append(xl)
1199  if xl[self.protein1_key] == protein_name \
1200  and xl[self.protein2_key] != protein_name:
1201  new_xl = dict(xl)
1202  new_xl[self.protein1_key] = new_protein_name
1203  new_data_base.append(new_xl)
1204  elif xl[self.protein1_key] != protein_name \
1205  and xl[self.protein2_key] == protein_name:
1206  new_xl = dict(xl)
1207  new_xl[self.protein2_key] = new_protein_name
1208  new_data_base.append(new_xl)
1209  elif xl[self.protein1_key] == protein_name \
1210  and xl[self.protein2_key] == protein_name:
1211  new_xl = dict(xl)
1212  new_xl[self.protein1_key] = new_protein_name
1213  new_data_base.append(new_xl)
1214  new_xl = dict(xl)
1215  new_xl[self.protein2_key] = new_protein_name
1216  new_data_base.append(new_xl)
1217  new_xl = dict(xl)
1218  new_xl[self.protein1_key] = new_protein_name
1219  new_xl[self.protein2_key] = new_protein_name
1220  new_data_base.append(new_xl)
1221  self.data_base[id] = new_data_base
1222  self._update()
1223 
1224  def filter_out_same_residues(self):
1225  '''
1226  This function removes cross-links applied to the same residue
1227  (i.e., same chain name and residue number)
1228  '''
1229  for id in self.data_base.keys():
1230  new_data_base = []
1231  for xl in self.data_base[id]:
1232  if xl[self.protein1_key] == xl[self.protein2_key] \
1233  and xl[self.residue1_key] == xl[self.residue2_key]:
1234  continue
1235  else:
1236  new_data_base.append(xl)
1237  self.data_base[id] = new_data_base
1238  self._update()
1239 
1240  def jackknife(self, percentage):
1241  '''
1242  This method returns a CrossLinkDataBase instance containing
1243  a random subsample of the original cross-link database.
1244  @param percentage float between 0 and 1; the fraction of
1245  spectra taken from the original list
1246  '''
1247  import random
1248  if percentage > 1.0 or percentage < 0.0:
1249  raise ValueError('the percentage of random cross-link spectra '
1250  'should be between 0 and 1')
1251  nspectra = self.get_number_of_xlid()
1252  nrandom_spectra = int(nspectra*percentage)
1253  random_keys = random.sample(sorted(self.data_base.keys()),
1254  nrandom_spectra)
1255  new_data_base = {}
1256  for k in random_keys:
1257  new_data_base[k] = self.data_base[k]
1258  return CrossLinkDataBase(self.cldbkc, new_data_base)
1259 
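For instance, sub-sampling half of the spectra of a four-entry in-memory database (a sketch; which entries are kept is random by construction).

import IMP.pmi.io.crosslink

xls = {str(i): [{"Protein1": "A", "Protein2": "B",
                 "Residue1": i, "Residue2": i + 10}]
       for i in range(1, 5)}
cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(data_base=xls)
half = cldb.jackknife(0.5)
print(half.get_number_of_xlid())   # -> 2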
1260  def __str__(self):
1261  outstr = ''
1262  sorted_ids = sorted(self.data_base.keys())
1263 
1264  for id in sorted_ids:
1265  outstr += id + "\n"
1266  for xl in self.data_base[id]:
1267  for k in self.ordered_key_list:
1268  try:
1269  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1270  except KeyError:
1271  continue
1272 
1273  for k in xl:
1274  if k not in self.ordered_key_list:
1275  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1276  outstr += "-------------\n"
1277  return outstr
1278 
1279  def plot(self, filename, **kwargs):
1280  import matplotlib
1281  matplotlib.use('Agg')
1282  import matplotlib.pyplot as plt
1283  import matplotlib.colors
1284 
1285  if kwargs["type"] == "scatter":
1286  cmap = plt.get_cmap("rainbow")
1287  _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1288  xkey = kwargs["xkey"]
1289  ykey = kwargs["ykey"]
1290  if "colorkey" in kwargs:
1291  colorkey = kwargs["colorkey"]
1292  if "logyscale" in kwargs:
1293  logyscale = kwargs["logyscale"]
1294  else:
1295  logyscale = False
1296  xs = []
1297  ys = []
1298  colors = []
1299  for xl in self:
1300  try:
1301  xs.append(float(xl[xkey]))
1302  if logyscale:
1303  ys.append(math.log10(float(xl[ykey])))
1304  else:
1305  ys.append(float(xl[ykey]))
1306  colors.append(float(xl[colorkey]))
1307  except ValueError:
1308  print("CrossLinkDataBase.plot: Value error for "
1309  "cross-link %s" % (xl[self.unique_id_key]))
1310  continue
1311 
1312  cs = []
1313  for color in colors:
1314  cindex = (color-min(colors))/(max(colors)-min(colors))
1315  cs.append(cmap(cindex))
1316 
1317  fig = plt.figure()
1318  ax = fig.add_subplot(111)
1319  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker="o")
1320  ax.set_xlabel(xkey)
1321  ax.set_ylabel(ykey)
1322  plt.savefig(filename)
1323  plt.show()
1324  plt.close()
1325 
1326  if kwargs["type"] == "residue_links":
1327  # plot the number of distinct links to a residue
1328  # in an histogram
1329  # molecule name
1330  molecule = kwargs["molecule"]
1331  if type(molecule) is IMP.pmi.topology.Molecule:
1332  length = len(molecule.sequence)
1333  molecule = molecule.get_name()
1334  else:
1335  # you need a IMP.pmi.topology.Sequences object
1336  sequences_object = kwargs["sequences_object"]
1337  sequence = sequences_object.sequences[molecule]
1338  length = len(sequence)
1339 
1340  histogram = [0]*length
1341  for xl in self:
1342  if xl[self.protein1_key] == molecule:
1343  histogram[xl[self.residue1_key]-1] = \
1344  xl[self.residue1_links_number_key]
1345  if xl[self.protein2_key] == molecule:
1346  histogram[xl[self.residue2_key]-1] = \
1347  xl[self.residue2_links_number_key]
1348  plt.bar(range(1, length+1), histogram)
1349  plt.savefig(filename)
1350  plt.show()
1351  plt.close()
1352 
1353  if kwargs["type"] == "histogram":
1354  valuekey = kwargs["valuekey"]
1355  valuename = valuekey
1356  bins = kwargs["bins"]
1357  values_list = []
1358  for xl in self:
1359  try:
1360  values_list.append(float(xl[valuekey]))
1361  except ValueError:
1362  print("CrossLinkDataBase.plot: Value error for cross-link "
1363  "%s" % (xl[self.unique_id_key]))
1364  continue
1365  IMP.pmi.output.plot_field_histogram(
1366  filename, [values_list], valuename=valuename, bins=bins,
1367  colors=None, format="pdf",
1368  reference_xline=None, yplotrange=None,
1369  xplotrange=None, normalized=True,
1370  leg_names=None)
1371 
1372  def dump(self, json_filename):
1373  import json
1374  with open(json_filename, 'w') as fp:
1375  json.dump(self.data_base, fp, sort_keys=True, indent=2,
1376  default=set_json_default)
1377 
1378  def load(self, json_filename):
1379  import json
1380  with open(json_filename, 'r') as fp:
1381  self.data_base = json.load(fp)
1382  self._update()
1383  # getting rid of unicode
1384  # (can't do this in Python 3, since *everything* is Unicode there)
1385  if sys.version_info[0] < 3:
1386  for xl in self:
1387  for k, v in xl.iteritems():
1388  if type(k) is unicode: # noqa: F821
1389  k = str(k)
1390  if type(v) is unicode: # noqa: F821
1391  v = str(v)
1392  xl[k] = v
1393 
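A round trip through the JSON serialization looks like this sketch; the file name "crosslinks.json" is illustrative.

import IMP.pmi.io.crosslink

xl = {"Protein1": "A", "Protein2": "B", "Residue1": 1, "Residue2": 2}
cldb = IMP.pmi.io.crosslink.CrossLinkDataBase(data_base={"1": [xl]})
cldb.dump("crosslinks.json")    # write the database to JSON

cldb2 = IMP.pmi.io.crosslink.CrossLinkDataBase()
cldb2.load("crosslinks.json")   # read it back
print(len(cldb2))               # -> 1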
1394  def save_csv(self, filename):
1395  import csv
1396 
1397  data = []
1398  sorted_ids = None
1399  sorted_group_ids = sorted(self.data_base.keys())
1400  for group in sorted_group_ids:
1401  group_block = []
1402  for xl in self.data_base[group]:
1403  if not sorted_ids:
1404  sorted_ids = sorted(xl.keys())
1405  values = [xl[k] for k in sorted_ids]
1406  group_block.append(values)
1407  data += group_block
1408 
1409  with open(filename, 'w') as fp:
1410  a = csv.writer(fp, delimiter=',')
1411  a.writerow(sorted_ids)
1412  a.writerows(data)
1413 
1414  def get_number_of_unique_crosslinked_sites(self):
1415  """
1416  Returns the number of non-redundant cross-link sites
1417  """
1418  praset = set()
1419  for xl in self:
1420  pra = _ProteinsResiduesArray(xl)
1421  prai = pra.get_inverted()
1422  praset.add(pra)
1423  praset.add(prai)
1424  return len(list(praset))
1425 
1426 
1427 class JaccardDistanceMatrix(object):
1428  """This class computes and plots the distance between datasets"""
1429 
1430  def __init__(self, cldb_dictionary):
1431  """Input a dictionary where keys are cldb names and values are cldbs"""
1432  import scipy.spatial.distance
1433  self.dbs = cldb_dictionary
1434  self.keylist = list(self.dbs.keys())
1435  self.distances = list()
1436 
1437  for i, key1 in enumerate(self.keylist):
1438  for key2 in self.keylist[i+1:]:
1439  distance = self.get_distance(key1, key2)
1440  self.distances.append(distance)
1441 
1442  self.distances = scipy.spatial.distance.squareform(self.distances)
1443 
1444  def get_distance(self, key1, key2):
1445  return 1.0 - self.jaccard_index(self.dbs[key1], self.dbs[key2])
1446 
1447  def jaccard_index(self, CrossLinkDataBase1, CrossLinkDataBase2):
1448  """Similarity index between two datasets
1449  https://en.wikipedia.org/wiki/Jaccard_index"""
1450 
1451  set1 = set()
1452  set2 = set()
1453  for xl1 in CrossLinkDataBase1:
1454  a1f = _ProteinsResiduesArray(xl1)
1455  a1b = a1f.get_inverted()
1456  set1.add(a1f)
1457  set1.add(a1b)
1458  for xl2 in CrossLinkDataBase2:
1459  a2f = _ProteinsResiduesArray(xl2)
1460  a2b = a2f.get_inverted()
1461  set2.add(a2f)
1462  set2.add(a2b)
1463  return (float(len(set1 & set2)/2)
1464  / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1465 
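As a sketch using the JaccardDistanceMatrix class above, the distance between a two-link database and a one-link database sharing one cross-link is 0.5; tiny_db is a hypothetical helper used only to build the test databases.

import IMP.pmi.io.crosslink

def tiny_db(pairs):
    # hypothetical helper: build a database from (p1, p2, r1, r2) tuples
    data = {str(n): [{"Protein1": p1, "Protein2": p2,
                      "Residue1": r1, "Residue2": r2}]
            for n, (p1, p2, r1, r2) in enumerate(pairs)}
    return IMP.pmi.io.crosslink.CrossLinkDataBase(data_base=data)

db1 = tiny_db([("A", "B", 1, 2), ("A", "B", 3, 4)])
db2 = tiny_db([("A", "B", 1, 2)])
jdm = IMP.pmi.io.crosslink.JaccardDistanceMatrix({"set1": db1, "set2": db2})
print(jdm.get_distance("set1", "set2"))   # -> 0.5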
1466  def plot_matrix(self, figurename="clustermatrix.pdf"):
1467  import matplotlib as mpl
1468  import numpy
1469  mpl.use('Agg')
1470  import matplotlib.pylab as pl
1471  from scipy.cluster import hierarchy as hrc
1472 
1473  raw_distance_matrix = self.distances
1474  labels = self.keylist
1475 
1476  fig = pl.figure()
1477 
1478  ax = fig.add_subplot(1, 1, 1)
1479  dendrogram = hrc.dendrogram(
1480  hrc.linkage(raw_distance_matrix),
1481  color_threshold=7,
1482  no_labels=True)
1483  leaves_order = dendrogram['leaves']
1484  ax.set_xlabel('Dataset')
1485  ax.set_ylabel('Jaccard Distance')
1486  pl.tight_layout()
1487  pl.savefig("dendrogram."+figurename, dpi=300)
1488  pl.close(fig)
1489 
1490  fig = pl.figure()
1491 
1492  ax = fig.add_subplot(1, 1, 1)
1493  cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1494  interpolation='nearest')
1495  cb = fig.colorbar(cax)
1496  cb.set_label('Jaccard Distance')
1497  ax.set_xlabel('Dataset')
1498  ax.set_ylabel('Dataset')
1499  ax.set_xticks(range(len(labels)))
1500  ax.set_xticklabels(numpy.array(labels)[leaves_order],
1501  rotation='vertical')
1502  ax.set_yticks(range(len(labels)))
1503  ax.set_yticklabels(numpy.array(labels)[leaves_order],
1504  rotation='horizontal')
1505  pl.tight_layout()
1506  pl.savefig("matrix." + figurename, dpi=300)
1507  pl.close(fig)
1508 
1509 
1510 class MapCrossLinkDataBaseOnStructure(object):
1511  '''
1512  This class maps a CrossLinkDataBase on a given structure
1513  and saves an RMF file with color-coded cross-links
1514  '''
1515 
1516  def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1517  self.CrossLinkDataBase = CrossLinkDataBase
1518  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1519  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1520  self.prots = rmf_or_stat_handler
1521  self.distances = defaultdict(list)
1522  self.array_to_id = {}
1523  self.id_to_array = {}
1524 
1525  print("computing distances for all cross-links and all structures")
1526  for i in self.prots[::10]:
1527  self.compute_distances()
1528  for key, xl in enumerate(self.CrossLinkDataBase):
1529  array = _ProteinsResiduesArray(xl)
1530  self.array_to_id[array] = key
1531  self.id_to_array[key] = array
1532  if xl["MinAmbiguousDistance"] != 'None':
1533  self.distances[key].append(xl["MinAmbiguousDistance"])
1534 
1535  def compute_distances(self):
1536  data = []
1537  sorted_ids = None
1538  sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1539  for group in sorted_group_ids:
1540  group_dists = []
1541  for xl in self.CrossLinkDataBase.data_base[group]:
1542  if not sorted_ids:
1543  sorted_ids = sorted(xl.keys())
1544  data.append(
1545  sorted_ids
1546  + ["UniqueID", "Distance", "MinAmbiguousDistance"])
1547  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1548  try:
1549  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1550  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1551  except: # noqa: E722
1552  mdist = "None"
1553  state1 = "None"
1554  copy1 = "None"
1555  state2 = "None"
1556  copy2 = "None"
1557  try:
1558  # sometimes keys get "lost" in the database,
1559  # not really sure why
1560  values = [xl[k] for k in sorted_ids]
1561  values += [group, mdist]
1562  except KeyError as e:
1563  print("MapCrossLinkDataBaseOnStructure "
1564  "KeyError: {0} in {1}".format(e, xl))
1565  group_dists.append(mdist)
1566  xl["Distance"] = mdist
1567  xl["State1"] = state1
1568  xl["Copy1"] = copy1
1569  xl["State2"] = state2
1570  xl["Copy2"] = copy2
1571 
1572  for xl in self.CrossLinkDataBase.data_base[group]:
1573  xl["MinAmbiguousDistance"] = min(group_dists)
1574 
1575  def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1576  '''more robust and slower version of above'''
1577  sel = IMP.atom.Selection(self.prots, molecule=c1, residue_index=r1,
1578  resolution=1)
1579  selpart_1 = sel.get_selected_particles()
1580  if len(selpart_1) == 0:
1581  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1582  "selected for first site")
1583  return None
1584  sel = IMP.atom.Selection(self.prots, molecule=c2, residue_index=r2,
1585  resolution=1)
1586  selpart_2 = sel.get_selected_particles()
1587  if len(selpart_2) == 0:
1588  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1589  "selected for second site")
1590  return None
1591  results = []
1592  for p1 in selpart_1:
1593  for p2 in selpart_2:
1594  if p1 == p2 and r1 == r2:
1595  continue
1596  d1 = IMP.core.XYZ(p1)
1597  d2 = IMP.core.XYZ(p2)
1598  # round distance to second decimal
1599  dist = float(int(IMP.core.get_distance(d1, d2)*100.0))/100.0
1600  h1 = IMP.atom.Hierarchy(p1)
1601  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1602  h1 = h1.get_parent()
1603  copy_index1 = IMP.atom.Copy(h1).get_copy_index()
1604  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1605  h1 = h1.get_parent()
1606  state_index1 = IMP.atom.State(h1).get_state_index()
1607  h2 = IMP.atom.Hierarchy(p2)
1608  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1609  h2 = h2.get_parent()
1610  copy_index2 = IMP.atom.Copy(h2).get_copy_index()
1611  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1612  h2 = h2.get_parent()
1613  state_index2 = IMP.atom.State(h2).get_state_index()
1614 
1615  results.append((dist, state_index1, copy_index1,
1616  state_index2, copy_index2, p1, p2))
1617  if len(results) == 0:
1618  return None
1619  results_sorted = sorted(results,
1620  key=operator.itemgetter(0, 1, 2, 3, 4))
1621  return ((results_sorted[0][0],
1622  results_sorted[0][5],
1623  results_sorted[0][6]),
1624  (results_sorted[0][1],
1625  results_sorted[0][2],
1626  results_sorted[0][3],
1627  results_sorted[0][4]))
1628 
1629  def save_rmf_snapshot(self, filename, color_id=None):
1630  sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1631  list_of_pairs = []
1632  color_scores = []
1633  for group in sorted_group_ids:
1634  group_dists_particles = []
1635  for xl in self.CrossLinkDataBase.data_base[group]:
1636  xllabel = \
1637  self.CrossLinkDataBase.get_short_cross_link_string(xl)
1638  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1639  try:
1640  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1641  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1642  except TypeError:
1643  print("TypeError or missing chain/residue ",
1644  r1, c1, r2, c2)
1645  continue
1646  if color_id is not None:
1647  group_dists_particles.append((mdist, p1, p2, xllabel,
1648  float(xl[color_id])))
1649  else:
1650  group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1651  if group_dists_particles:
1652  (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1653  min(group_dists_particles, key=lambda t: t[0])
1654  color_scores.append(mincolor_score)
1655  list_of_pairs.append((minp1, minp2, minxllabel,
1656  mincolor_score))
1657  else:
1658  continue
1659 
1660  linear = IMP.core.Linear(0, 0.0)
1661  linear.set_slope(1.0)
1662  dps2 = IMP.core.DistancePairScore(linear)
1663  rslin = IMP.RestraintSet(self.prots.get_model(),
1664  'linear_dummy_restraints')
1665  sgs = []
1666  offset = min(color_scores)
1667  maxvalue = max(color_scores)
1668  for pair in list_of_pairs:
1669  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2,
1670  (pair[0], pair[1]))
1671  pr.set_name(pair[2])
1672  if offset < maxvalue:
1673  factor = (pair[3]-offset)/(maxvalue-offset)
1674  else:
1675  factor = 0.0
1676  c = IMP.display.get_rgb_color(factor)
1677  seg = IMP.algebra.Segment3D(
1678  IMP.core.XYZ(pair[0]).get_coordinates(),
1679  IMP.core.XYZ(pair[1]).get_coordinates())
1680  rslin.add_restraint(pr)
1681  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
1682 
1683  rh = RMF.create_rmf_file(filename)
1684  IMP.rmf.add_hierarchies(rh, [self.prots])
1685  IMP.rmf.add_restraints(rh, [rslin])
1686  IMP.rmf.add_geometries(rh, sgs)
1687  IMP.rmf.save_frame(rh)
1688  del rh
1689 
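(A minimal usage sketch, not part of crosslink.py: "mapper" is a hypothetical, already-constructed MapCrossLinkDataBaseOnStructure object whose database carries the standard "IDScore" key.)

# Write one RMF frame with the hierarchy, dummy pair restraints and one
# colored segment per cross-link group (the shortest link of each group).
mapper.save_rmf_snapshot("crosslinks.rmf3")                 # uniform color
mapper.save_rmf_snapshot("crosslinks_scored.rmf3",
                         color_id="IDScore")                # color by score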
1690  def boxplot_crosslink_distances(self, filename):
1691  import numpy
1692  keys = [self.id_to_array[k] for k in self.distances.keys()]
1693  medians = [numpy.median(self.distances[self.array_to_id[k]])
1694  for k in keys]
1695  dists = [self.distances[self.array_to_id[k]] for k in keys]
1696  distances_sorted_by_median = \
1697  [x for _, x in sorted(zip(medians, dists))]
1698  keys_sorted_by_median = [x for _, x in sorted(zip(medians, keys))]
1699  IMP.pmi.output.plot_fields_box_plots(
1700  filename, distances_sorted_by_median,
1701  range(len(distances_sorted_by_median)),
1702  xlabels=keys_sorted_by_median,
1703  scale_plot_length=0.2)
1704 
1705  def get_crosslink_violations(self, threshold):
1706  violations = defaultdict(list)
1707  for k in self.distances:
1708  # viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1709  viols = self.distances[k]
1710  violations[self.id_to_array[k]] = viols
1711  return violations
1712 
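(A minimal sketch, not part of crosslink.py: get_crosslink_violations() as written returns the raw per-model distances for each cross-link, so a cutoff can be applied afterwards. The helper name and the example distances below are made up.)

def violation_fractions(violations, threshold=35.0):
    # fraction of stored distances above the threshold, per cross-link
    fractions = {}
    for key, dists in violations.items():
        if dists:
            fractions[key] = \
                sum(1.0 for d in dists if d > threshold) / len(dists)
    return fractions

example = {("ProtA", "ProtB", 10, 120): [28.0, 41.5, 33.2],
           ("ProtA", "ProtA", 55, 97): [12.1, 15.8]}
print(violation_fractions(example))
# {('ProtA', 'ProtB', 10, 120): 0.333..., ('ProtA', 'ProtA', 55, 97): 0.0}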
1713 
1714 class CrossLinkDataBaseFromStructure(object):
1715  '''
1716  This class generates a CrossLinkDataBase from a given structure
1717  '''
1718  def __init__(self, system, residue_types_1=["K"],
1719  residue_types_2=["K"], reactivity_range=[0, 1], kt=1.0):
1720  import numpy.random
1721  import math
1722  cldbkc = CrossLinkDataBaseKeywordsConverter()
1723  cldbkc.set_protein1_key("Protein1")
1724  cldbkc.set_protein2_key("Protein2")
1725  cldbkc.set_residue1_key("Residue1")
1726  cldbkc.set_residue2_key("Residue2")
1727  self.cldb = CrossLinkDataBase(cldbkc)
1728  self.system = system
1729  self.model = self.system.model
1730  self.residue_types_1 = residue_types_1
1731  self.residue_types_2 = residue_types_2
1732  self.kt = kt
1733  self.indexes_dict1 = {}
1734  self.indexes_dict2 = {}
1735  self.protein_residue_dict = {}
1736  self.reactivity_dictionary = {}
1737  self.euclidean_interacting_pairs = None
1738  self.xwalk_interacting_pairs = None
1739 
1740  for state in self.system.get_states():
1741  for moleculename, molecules in state.get_molecules().items():
1742  for molecule in molecules:
1743  # store, for each (molecule, residue number) pair,
1744  # a randomly assigned reactivity of that residue
1745  seq = molecule.sequence
1746  residues = [i for i in range(1, len(seq)+1)
1747  if ((seq[i-1] in self.residue_types_1)
1748  or (seq[i-1] in self.residue_types_2))]
1749 
1750  for r in residues:
1751  # assign a random reactivity: draw from an exponential
1752  # distribution and map the draw through the CDF
1753  # 1-exp(-x), giving a probability in [0, 1)
1754  rexp = numpy.random.exponential(0.00000001)
1755  prob = 1.0-math.exp(-rexp)
1756  self.reactivity_dictionary[(molecule, r)] = prob
1757 
1758  residues1 = [i for i in range(1, len(seq)+1)
1759  if seq[i-1] in self.residue_types_1]
1760  residues2 = [i for i in range(1, len(seq)+1)
1761  if seq[i-1] in self.residue_types_2]
1762  for r in residues1:
1763  s = IMP.atom.Selection(
1764  molecule.hier, residue_index=r, resolution=1)
1765  ps = s.get_selected_particles()
1766  for p in ps:
1767  index = p.get_index()
1768  self.indexes_dict1[index] = (molecule, r)
1769  self.protein_residue_dict[(molecule, r)] = index
1770  for r in residues2:
1771  s = IMP.atom.Selection(
1772  molecule.hier, residue_index=r, resolution=1)
1773  ps = s.get_selected_particles()
1774  for p in ps:
1775  index = p.get_index()
1776  self.indexes_dict2[index] = (molecule, r)
1777  self.protein_residue_dict[(molecule, r)] = index
1778 
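(A self-contained illustration, not part of crosslink.py, of the reactivity assignment used in the constructor above: a draw from an exponential distribution with a very small scale is mapped through the unit-rate exponential CDF 1-exp(-x).)

import math
import numpy

numpy.random.seed(0)                        # reproducible illustration only
rexp = numpy.random.exponential(0.00000001)
prob = 1.0 - math.exp(-rexp)
print(rexp, prob)
# both values are on the order of 1e-8; for x << 1, 1-exp(-x) ~ x,
# so the stored reactivity is essentially the exponential draw itself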
1779  def get_all_possible_pairs(self):
1780  n = float(len(self.protein_residue_dict.keys()))
1781  return n*(n-1.0)/2.0
1782 
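(Quick worked example of the n*(n-1)/2 count above, not part of crosslink.py.)

n = 4.0   # e.g. four reactive residues in protein_residue_dict
print(n*(n-1.0)/2.0)   # 6.0 possible residue pairs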
1783  def get_all_feasible_pairs(self, distance=21):
1784  import itertools
1785  particle_index_pairs = []
1786  nxl = 0
1787  for a, b in itertools.combinations(
1788  self.protein_residue_dict.keys(), 2):
1789  new_xl = {}
1790  index1 = self.protein_residue_dict[a]
1791  index2 = self.protein_residue_dict[b]
1792  particle_distance = IMP.core.get_distance(
1793  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1794  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1795  if particle_distance <= distance:
1796  particle_index_pairs.append((index1, index2))
1797  new_xl[self.cldb.protein1_key] = a[0].get_name()
1798  new_xl[self.cldb.protein2_key] = b[0].get_name()
1799  new_xl["molecule_object1"] = a[0]
1800  new_xl["molecule_object2"] = b[0]
1801  new_xl[self.cldb.residue1_key] = a[1]
1802  new_xl[self.cldb.residue2_key] = b[1]
1803  self.cldb.data_base[str(nxl)] = [new_xl]
1804  nxl += 1
1805  self.cldb._update()
1806  return self.cldb
1807 
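(A hypothetical usage sketch, not part of crosslink.py: "system" is assumed to be a fully built IMP.pmi.topology.System whose particles already have coordinates.)

# Collect every lysine-lysine pair within 21 A of each other into a
# CrossLinkDataBase, one entry per pair.
cldbfs = CrossLinkDataBaseFromStructure(system,
                                        residue_types_1=["K"],
                                        residue_types_2=["K"])
feasible_cldb = cldbfs.get_all_feasible_pairs(distance=21)
print(cldbfs.get_all_possible_pairs(), "pairs possible in total")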
1808  def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1809  noise=0.01, distance=21, max_delta_distance=10.0,
1810  xwalk_bin_path=None, confidence_false=0.75,
1811  confidence_true=0.75):
1812  import math
1813  from random import random
1814  import numpy as np
1815  number_of_spectra = 1
1816 
1817  self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1818  self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1819  self.cldb.data_base[str(number_of_spectra)] = []
1820  self.sites_weighted = None
1821 
1822  while number_of_spectra < total_number_of_spectra:
1823  if (random() > ambiguity_probability
1824  and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1825  # new spectrum
1826  number_of_spectra += 1
1827  self.cldb.data_base[str(number_of_spectra)] = []
1828  noisy = False
1829  if random() > noise:
1830  # not noisy cross-link
1831  pra, dist = self.get_random_residue_pair(
1832  distance, xwalk_bin_path, max_delta_distance)
1833  else:
1834  # noisy cross-link
1835  pra, dist = self.get_random_residue_pair(None, None, None)
1836  noisy = True
1837 
1838  new_xl = {}
1839  new_xl[self.cldb.protein1_key] = pra[0].get_name()
1840  new_xl[self.cldb.protein2_key] = pra[1].get_name()
1841  new_xl["molecule_object1"] = pra[0]
1842  new_xl["molecule_object2"] = pra[1]
1843  new_xl[self.cldb.residue1_key] = pra[2]
1844  new_xl[self.cldb.residue2_key] = pra[3]
1845  new_xl["Noisy"] = noisy
1846  # the reactivity is defined as r=1-exp(-k*Delta t)
1847  new_xl["Reactivity_Residue1"] = \
1848  self.reactivity_dictionary[(pra[0], pra[2])]
1849  new_xl["Reactivity_Residue2"] = \
1850  self.reactivity_dictionary[(pra[1], pra[3])]
1851  if noisy:
1852  new_xl["Score"] = np.random.beta(1.0, self.beta_false)
1853  else:
1854  new_xl["Score"] = 1.0-np.random.beta(1.0, self.beta_true)
1855  new_xl["TargetDistance"] = dist
1856  new_xl["NoiseProbability"] = noise
1857  new_xl["AmbiguityProbability"] = ambiguity_probability
1858  # determine whether the cross-link is intra- or inter-rigid-body
1859  (p1, p2) = IMP.get_particles(
1860  self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1861  self.protein_residue_dict[(pra[1], pra[3])]])
1862  if (IMP.core.RigidMember.get_is_setup(p1) and
1863  IMP.core.RigidMember.get_is_setup(p2) and
1864  IMP.core.RigidMember(p1).get_rigid_body() ==
1865  IMP.core.RigidMember(p2).get_rigid_body()):
1866  new_xl["InterRigidBody"] = False
1867  elif (IMP.core.RigidMember.get_is_setup(p1) and
1868  IMP.core.RigidMember.get_is_setup(p2) and
1869  IMP.core.RigidMember(p1).get_rigid_body() !=
1870  IMP.core.RigidMember(p2).get_rigid_body()):
1871  new_xl["InterRigidBody"] = True
1872  else:
1873  new_xl["InterRigidBody"] = None
1874 
1875  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1876  self.cldb._update()
1877  return self.cldb
1878 
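(A self-contained sketch, not part of crosslink.py, of the score model used in get_data_base(): the Beta shape parameter is derived from the requested confidence, with 1.4427 ~ 1/ln(2), so confidence 0.75 gives beta = -log2(0.5*0.25) = 3. True links get scores pushed towards 1, noisy links towards 0.)

import math
import numpy as np

confidence = 0.75
beta = -1.4427*math.log(0.5*(1.0-confidence))   # ~3.0

np.random.seed(0)                               # reproducible illustration only
true_score = 1.0 - np.random.beta(1.0, beta)    # concentrated near 1
noisy_score = np.random.beta(1.0, beta)         # concentrated near 0
print(beta, true_score, noisy_score)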
1879  def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1880  max_delta_distance=None):
1881  import IMP.pmi.tools
1882  import math
1883  from random import choice
1884  if distance is None:
1885  # get a random pair
1886  while True:
1887  protein1, residue1 = choice(list(self.protein_residue_dict.keys()))
1888  protein2, residue2 = choice(list(self.protein_residue_dict.keys()))
1889  index1 = self.protein_residue_dict[(protein1, residue1)]
1890  index2 = self.protein_residue_dict[(protein2, residue2)]
1891  particle_distance = IMP.core.get_distance(
1892  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1893  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1894  if (protein1, residue1) != (protein2, residue2):
1895  break
1896  else:
1897  # get a pair of residues whose distance is below the threshold
1898  if not xwalk_bin_path:
1899  gcpf = IMP.core.GridClosePairsFinder()
1900  gcpf.set_distance(distance+max_delta_distance)
1901 
1902  while True:
1903  # setup the reaction rates lists
1904  if not self.sites_weighted:
1905  self.sites_weighted = []
1906  for key in self.reactivity_dictionary:
1907  r = self.reactivity_dictionary[key]
1908  self.sites_weighted.append((key, r))
1909  # get a random reaction site
1910  first_site = self.weighted_choice(self.sites_weighted)
1911  # get all distances
1912  if not self.euclidean_interacting_pairs:
1913  self.euclidean_interacting_pairs = \
1914  gcpf.get_close_pairs(
1915  self.model,
1916  list(self.indexes_dict1.keys()),
1917  list(self.indexes_dict2.keys()))
1918  # get the partner for the first reacted site
1919  first_site_pairs = \
1920  [pair for pair in self.euclidean_interacting_pairs
1921  if self.indexes_dict1[pair[0]] == first_site or
1922  self.indexes_dict2[pair[1]] == first_site]
1923  if len(first_site_pairs) == 0:
1924  continue
1925  # build the list of second reaction sites
1926  second_sites_weighted = []
1927  for pair in first_site_pairs:
1928  if self.indexes_dict1[pair[0]] == first_site:
1929  second_site = self.indexes_dict2[pair[1]]
1930  if self.indexes_dict2[pair[1]] == first_site:
1931  second_site = self.indexes_dict1[pair[0]]
1932  r = self.reactivity_dictionary[second_site]
1933  second_sites_weighted.append((second_site, r))
1934  second_site = self.weighted_choice(second_sites_weighted)
1935  protein1, residue1 = first_site
1936  protein2, residue2 = second_site
1937  print("CrossLinkDataBaseFromStructure."
1938  "get_random_residue_pair:",
1939  "First site", first_site,
1940  self.reactivity_dictionary[first_site],
1941  "Second site", second_site,
1942  self.reactivity_dictionary[second_site])
1943  particle_pair = IMP.get_particles(
1944  self.model,
1945  [self.protein_residue_dict[first_site],
1946  self.protein_residue_dict[second_site]])
1947  particle_distance = IMP.core.get_distance(
1948  IMP.core.XYZ(particle_pair[0]),
1949  IMP.core.XYZ(particle_pair[1]))
1950 
1951  if particle_distance < distance \
1952  and (protein1, residue1) != (protein2, residue2):
1953  break
1954  elif (particle_distance >= distance
1955  and (protein1, residue1) != (protein2, residue2)
1956  and max_delta_distance):
1957  # allow some flexibility
1958  if particle_distance-distance < max_delta_distance:
1959  break
1960 
1961  else:
1962  if not self.xwalk_interacting_pairs:
1963  self.xwalk_interacting_pairs = \
1964  self.get_xwalk_distances(xwalk_bin_path, distance)
1965  interacting_pairs_weighted = []
1966 
1967  for pair in self.xwalk_interacting_pairs:
1968  protein1 = pair[0]
1969  protein2 = pair[1]
1970  residue1 = pair[2]
1971  residue2 = pair[3]
1972  weight1 = math.exp(
1973  -self.reactivity_dictionary[(protein1, residue1)]
1974  / self.kt)
1975  weight2 = math.exp(
1976  -self.reactivity_dictionary[(protein2, residue2)]
1977  / self.kt)
1978  interacting_pairs_weighted.append((pair, weight1*weight2))
1979 
1980  pair = self.weighted_choice(interacting_pairs_weighted)
1981  protein1 = pair[0]
1982  protein2 = pair[1]
1983  residue1 = pair[2]
1984  residue2 = pair[3]
1985  particle_distance = float(pair[4])
1986 
1987  return (protein1, protein2, residue1, residue2), particle_distance
1988 
1989  def get_xwalk_distances(self, xwalk_bin_path, distance):
1990  import IMP.pmi.output
1991  import os
1992  o = IMP.pmi.output.Output(atomistic=True)
1993  o.init_pdb("xwalk.pdb", self.system.hier)
1994  o.write_pdb("xwalk.pdb")
1995  namechainiddict = o.dictchain["xwalk.pdb"]
1996  chainiddict = {}
1997 
1998  for key in namechainiddict:
1999  chainiddict[namechainiddict[key]] = key
2000  xwalkout = os.popen('java -Xmx256m -cp ' + xwalk_bin_path
2001  + ' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
2002  '-a1 cb -a2 cb -max '
2003  + str(distance) + ' -bb').read()
2004 
2005  output_list_of_distance = []
2006  for line in xwalkout.split("\n")[0:-2]:
2007  tokens = line.split()
2008  first = tokens[2]
2009  second = tokens[3]
2010  distance = float(tokens[6])
2011  fs = first.split("-")
2012  ss = second.split("-")
2013  chainid1 = fs[2]
2014  chainid2 = ss[2]
2015  protein1 = chainiddict[chainid1]
2016  protein2 = chainiddict[chainid2]
2017  residue1 = int(fs[1])
2018  residue2 = int(ss[1])
2019  output_list_of_distance.append(
2020  (protein1, protein2, residue1, residue2, distance))
2021  return output_list_of_distance
2022 
2023  def weighted_choice(self, choices):
2024  import random
2025  # http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
2026  total = sum(w for c, w in choices)
2027  r = random.uniform(0, total)
2028  upto = 0
2029  for c, w in choices:
2030  if upto + w > r:
2031  return c
2032  upto += w
2033  assert False, "Shouldn't get here"
2034 
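(A standalone check of the weighted-choice technique above, not part of crosslink.py; it reproduces the same cumulative-weight scan on a toy example.)

import random

def weighted_choice(choices):
    # pick an item with probability proportional to its weight
    total = sum(w for c, w in choices)
    r = random.uniform(0, total)
    upto = 0
    for c, w in choices:
        if upto + w > r:
            return c
        upto += w
    assert False, "Shouldn't get here"

random.seed(0)
counts = {"a": 0, "b": 0, "c": 0}
for _ in range(10000):
    counts[weighted_choice([("a", 1.0), ("b", 2.0), ("c", 7.0)])] += 1
print(counts)   # roughly 10%, 20% and 70% of the draws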
2035  def save_rmf_snapshot(self, filename, color_id=None):
2036  import IMP.rmf
2037  import RMF
2038  if color_id is None:
2039  color_id = "Reactivity"
2040  sorted_group_ids = sorted(self.cldb.data_base.keys())
2041  list_of_pairs = []
2042  color_scores = []
2043  for group in sorted_group_ids:
2044  group_dists_particles = []
2045  for xl in self.cldb.data_base[group]:
2046  xllabel = self.cldb.get_short_cross_link_string(xl)
2047  (c1, c2, r1, r2) = (xl["molecule_object1"],
2048  xl["molecule_object2"],
2049  xl[self.cldb.residue1_key],
2050  xl[self.cldb.residue2_key])
2051  try:
2052  index1 = self.protein_residue_dict[(c1, r1)]
2053  index2 = self.protein_residue_dict[(c2, r2)]
2054  p1, p2 = (IMP.get_particles(self.model, [index1])[0],
2055  IMP.get_particles(self.model, [index2])[0])
2056  mdist = xl["TargetDistance"]
2057  except TypeError:
2058  print("TypeError or missing chain/residue ",
2059  r1, c1, r2, c2)
2060  continue
2061  group_dists_particles.append((mdist, p1, p2, xllabel,
2062  float(xl[color_id])))
2063  if group_dists_particles:
2064  (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2065  = min(group_dists_particles, key=lambda t: t[0])
2066  color_scores.append(mincolor_score)
2067  list_of_pairs.append((minp1, minp2, minxllabel,
2068  mincolor_score))
2069  else:
2070  continue
2071 
2072  m = self.model
2073  linear = IMP.core.Linear(0, 0.0)
2074  linear.set_slope(1.0)
2075  dps2 = IMP.core.DistancePairScore(linear)
2076  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
2077  sgs = []
2078  offset = min(color_scores)
2079  maxvalue = max(color_scores)
2080  for pair in list_of_pairs:
2081  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
2082  pr.set_name(pair[2])
2083  factor = (pair[3]-offset)/(maxvalue-offset) if maxvalue > offset else 0.0
2084  c = IMP.display.get_rgb_color(factor)
2085  seg = IMP.algebra.Segment3D(
2086  IMP.core.XYZ(pair[0]).get_coordinates(),
2087  IMP.core.XYZ(pair[1]).get_coordinates())
2088  rslin.add_restraint(pr)
2089  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
2090 
2091  rh = RMF.create_rmf_file(filename)
2092  IMP.rmf.add_hierarchies(rh, [self.system.hier])
2093  IMP.rmf.add_restraints(rh, [rslin])
2094  IMP.rmf.add_geometries(rh, sgs)
2095  IMP.rmf.save_frame(rh)
2096  del rh