IMP logo
IMP Reference Guide  2.15.0
The Integrative Modeling Platform
io/crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 
8 from __future__ import print_function
9 import IMP
10 import IMP.pmi
11 import IMP.pmi.output
12 import IMP.pmi.alphabets
13 import IMP.atom
14 import IMP.core
15 import IMP.algebra
16 import IMP.rmf
17 import RMF
18 import IMP.display
19 import operator
20 import math
21 import sys
22 import ihm.location
23 import ihm.dataset
24 from collections import defaultdict
25 import numpy
26 
27 
28 # json default serializations
29 def set_json_default(obj):
30  if isinstance(obj, set):
31  return list(obj)
32  if isinstance(obj, IMP.pmi.topology.Molecule):
33  return str(obj)
34  raise TypeError
35 
36 
37 # Handle and return data that must be a string
38 if sys.version_info[0] >= 3:
39  def _handle_string_input(inp):
40  if not isinstance(inp, str):
41  raise TypeError("expecting a string")
42  return inp
43 else:
44  def _handle_string_input(inp):
45  if not isinstance(inp, (str, unicode)): # noqa: F821
46  raise TypeError("expecting a string or unicode")
47  # Coerce to non-unicode representation (str)
48  if isinstance(inp, unicode): # noqa: F821
49  return str(inp)
50  else:
51  return inp
52 
53 
54 class _CrossLinkDataBaseStandardKeys(object):
55  '''
56  This class setup all the standard keys needed to
57  identify the cross-link features from the data sets
58  '''
59  def __init__(self):
60  self.type = {}
61  self.protein1_key = "Protein1"
62  self.type[self.protein1_key] = str
63  self.protein2_key = "Protein2"
64  self.type[self.protein2_key] = str
65  self.residue1_key = "Residue1"
66  self.type[self.residue1_key] = int
67  self.residue2_key = "Residue2"
68  self.type[self.residue2_key] = int
69  self.residue1_amino_acid_key = "Residue1AminoAcid"
70  self.type[self.residue1_amino_acid_key] = str
71  self.residue2_amino_acid_key = "Residue2AminoAcid"
72  self.type[self.residue2_amino_acid_key] = str
73  self.residue1_moiety_key = "Residue1Moiety"
74  self.type[self.residue1_moiety_key] = str
75  self.residue2_moiety_key = "Residue2Moiety"
76  self.type[self.residue2_moiety_key] = str
77  self.site_pairs_key = "SitePairs"
78  self.type[self.site_pairs_key] = str
79  self.unique_id_key = "XLUniqueID"
80  self.type[self.unique_id_key] = str
81  self.unique_sub_index_key = "XLUniqueSubIndex"
82  self.type[self.unique_sub_index_key] = int
83  self.unique_sub_id_key = "XLUniqueSubID"
84  self.type[self.unique_sub_id_key] = str
85  self.data_set_name_key = "DataSetName"
86  self.type[self.data_set_name_key] = str
87  self.cross_linker_chemical_key = "CrossLinkerChemical"
88  self.type[self.cross_linker_chemical_key] = str
89  self.id_score_key = "IDScore"
90  self.type[self.id_score_key] = float
91  self.fdr_key = "FDR"
92  self.type[self.fdr_key] = float
93  self.quantitation_key = "Quantitation"
94  self.type[self.quantitation_key] = float
95  self.redundancy_key = "Redundancy"
96  self.type[self.redundancy_key] = int
97  self.redundancy_list_key = "RedundancyList"
98  self.type[self.redundancy_key] = list
99  self.ambiguity_key = "Ambiguity"
100  self.type[self.ambiguity_key] = int
101  self.residue1_links_number_key = "Residue1LinksNumber"
102  self.type[self.residue1_links_number_key] = int
103  self.residue2_links_number_key = "Residue2LinksNumber"
104  self.type[self.residue2_links_number_key] = int
105  self.type[self.ambiguity_key] = int
106  self.state_key = "State"
107  self.type[self.state_key] = int
108  self.sigma1_key = "Sigma1"
109  self.type[self.sigma1_key] = str
110  self.sigma2_key = "Sigma2"
111  self.type[self.sigma2_key] = str
112  self.psi_key = "Psi"
113  self.type[self.psi_key] = str
114  self.distance_key = "Distance"
115  self.type[self.distance_key] = float
116  self.min_ambiguous_distance_key = "MinAmbiguousDistance"
117  self.type[self.distance_key] = float
118  # link types are either Monolink, Intralink or Interlink
119  self.link_type_key = "LinkType"
120  self.type[self.link_type_key] = str
121 
122  self.ordered_key_list = [
123  self.data_set_name_key, self.unique_id_key,
124  self.unique_sub_index_key, self.unique_sub_id_key,
125  self.protein1_key, self.protein2_key, self.residue1_key,
126  self.residue2_key, self.residue1_amino_acid_key,
127  self.residue2_amino_acid_key, self.residue1_moiety_key,
128  self.residue2_moiety_key, self.cross_linker_chemical_key,
129  self.id_score_key, self.fdr_key, self.quantitation_key,
130  self.redundancy_key, self.redundancy_list_key, self.state_key,
131  self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
132  self.min_ambiguous_distance_key, self.link_type_key]
133 
134 
135 class _ProteinsResiduesArray(tuple):
136  '''
137  This class is inherits from tuple, and it is a shorthand for a cross-link
138  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1
139  and protein2, r1 and r2 are residue1 and residue2.
140  '''
141 
142  def __new__(self, input_data):
143  '''
144  @input_data can be a dict or a tuple
145  '''
146  self.cldbsk = _CrossLinkDataBaseStandardKeys()
147  if type(input_data) is dict:
148  monolink = False
149  p1 = input_data[self.cldbsk.protein1_key]
150  try:
151  p2 = input_data[self.cldbsk.protein2_key]
152  except KeyError:
153  monolink = True
154  r1 = input_data[self.cldbsk.residue1_key]
155  try:
156  r2 = input_data[self.cldbsk.residue2_key]
157  except KeyError:
158  monolink = True
159  if not monolink:
160  t = (p1, p2, r1, r2)
161  else:
162  t = (p1, "", r1, None)
163  elif type(input_data) is tuple:
164  if len(input_data) > 4 or len(input_data) == 3 \
165  or len(input_data) == 1:
166  raise TypeError(
167  "_ProteinsResiduesArray: must have only 4 elements")
168  if len(input_data) == 4:
169  p1 = _handle_string_input(input_data[0])
170  p2 = _handle_string_input(input_data[1])
171  r1 = input_data[2]
172  r2 = input_data[3]
173  if (type(r1) is not int) and (r1 is not None):
174  raise TypeError(
175  "_ProteinsResiduesArray: residue1 must be a integer")
176  if (type(r2) is not int) and (r1 is not None):
177  raise TypeError(
178  "_ProteinsResiduesArray: residue2 must be a integer")
179  t = (p1, p2, r1, r2)
180  if len(input_data) == 2:
181  p1 = _handle_string_input(input_data[0])
182  r1 = input_data[1]
183  if type(r1) is not int:
184  raise TypeError(
185  "_ProteinsResiduesArray: residue1 must be a integer")
186  t = (p1, "", r1, None)
187  else:
188  raise TypeError(
189  "_ProteinsResiduesArray: input must be a dict or tuple")
190  return tuple.__new__(_ProteinsResiduesArray, t)
191 
192  def get_inverted(self):
193  '''
194  Returns a _ProteinsResiduesArray instance with protein1 and
195  protein2 inverted
196  '''
197  return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
198 
199  def __repr__(self):
200  outstr = self.cldbsk.protein1_key + " " + str(self[0])
201  outstr += " " + self.cldbsk.protein2_key + " " + str(self[1])
202  outstr += " " + self.cldbsk.residue1_key + " " + str(self[2])
203  outstr += " " + self.cldbsk.residue2_key + " " + str(self[3])
204  return outstr
205 
206  def __str__(self):
207  outstr = str(self[0]) + "." + str(self[2]) + "-" + str(self[1]) \
208  + "." + str(self[3])
209  return outstr
210 
211 
212 class FilterOperator(object):
213  '''
214  This class allows to create filter functions that can be passed to
215  the CrossLinkDataBase in this way:
216 
217  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
218 
219  where cldb is CrossLinkDataBase instance and it is only needed to get
220  the standard keywords
221 
222  A filter operator can be evaluate on a CrossLinkDataBase item xl and
223  returns a boolean
224 
225  fo.evaluate(xl)
226 
227  and it is used to filter the database
228  '''
229 
230  def __init__(self, argument1, operator, argument2):
231  '''
232  (argument1,operator,argument2) can be either a
233  (keyword,operator.eq|lt|gt...,value)
234  or (FilterOperator1,operator.or|and...,FilterOperator2)
235  '''
236  if isinstance(argument1, FilterOperator):
237  self.operations = [argument1, operator, argument2]
238  else:
239  self.operations = []
240  self.values = (argument1, operator, argument2)
241 
242  def __or__(self, FilterOperator2):
243  return FilterOperator(self, operator.or_, FilterOperator2)
244 
245  def __and__(self, FilterOperator2):
246  return FilterOperator(self, operator.and_, FilterOperator2)
247 
248  def __invert__(self):
249  return FilterOperator(self, operator.not_, None)
250 
251  def evaluate(self, xl_item):
252 
253  if len(self.operations) == 0:
254  keyword, operator, value = self.values
255  return operator(xl_item[keyword], value)
256  FilterOperator1, op, FilterOperator2 = self.operations
257 
258  if FilterOperator2 is None:
259  return op(FilterOperator1.evaluate(xl_item))
260  else:
261  return op(FilterOperator1.evaluate(xl_item),
262  FilterOperator2.evaluate(xl_item))
263 
264 
265 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
266  '''
267  This class is needed to convert the keywords from a generic database
268  to the standard ones
269  '''
270 
271  def __init__(self, list_parser=None):
272  '''
273  @param list_parser an instance of ResiduePairListParser, if any
274  is needed
275  '''
276  self.converter = {}
277  self.backward_converter = {}
278  _CrossLinkDataBaseStandardKeys.__init__(self)
279  self.rplp = list_parser
280  if self.rplp is None:
281  # either you have protein1, protein2, residue1, residue2
282  self.compulsory_keys = set([self.protein1_key,
283  self.protein2_key,
284  self.residue1_key,
285  self.residue2_key])
286  else:
287  self.compulsory_keys = self.rplp.get_compulsory_keys()
288  self.setup_keys = set()
289 
290  def check_keys(self):
291  '''
292  Is a function that check whether necessary keys are setup
293  '''
294  setup_keys = set(self.get_setup_keys())
295  if self.compulsory_keys & setup_keys != self.compulsory_keys:
296  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup "
297  "all necessary keys")
298 
299  def get_setup_keys(self):
300  '''
301  Returns the keys that have been setup so far
302  '''
303  return self.backward_converter.keys()
304 
305  def set_standard_keys(self):
306  """
307  This sets up the standard compulsory keys as defined by
308  _CrossLinkDataBaseStandardKeys
309  """
310  for ck in self.compulsory_keys:
311  self.converter[ck] = ck
312  self.backward_converter[ck] = ck
313 
314  def set_unique_id_key(self, origin_key):
315  self.converter[origin_key] = self.unique_id_key
316  self.backward_converter[self.unique_id_key] = origin_key
317 
318  def set_protein1_key(self, origin_key):
319  self.converter[origin_key] = self.protein1_key
320  self.backward_converter[self.protein1_key] = origin_key
321 
322  def set_protein2_key(self, origin_key):
323  self.converter[origin_key] = self.protein2_key
324  self.backward_converter[self.protein2_key] = origin_key
325 
326  def set_residue1_key(self, origin_key):
327  self.converter[origin_key] = self.residue1_key
328  self.backward_converter[self.residue1_key] = origin_key
329 
330  def set_residue2_key(self, origin_key):
331  self.converter[origin_key] = self.residue2_key
332  self.backward_converter[self.residue2_key] = origin_key
333 
334  def set_residue1_amino_acid_key(self, origin_key):
335  self.converter[origin_key] = self.residue1_amino_acid_key
336  self.backward_converter[self.residue1_amino_acid_key] = origin_key
337 
338  def set_residue2_amino_acid_key(self, origin_key):
339  self.converter[origin_key] = self.residue2_amino_acid_key
340  self.backward_converter[self.residue2_amino_acid_key] = origin_key
341 
342  def set_residue1_moiety_key(self, origin_key):
343  self.converter[origin_key] = self.residue1_moiety_key
344  self.backward_converter[self.residue1_moiety_key] = origin_key
345 
346  def set_residue2_moiety_key(self, origin_key):
347  self.converter[origin_key] = self.residue2_moiety_key
348  self.backward_converter[self.residue2_moiety_key] = origin_key
349 
350  def set_site_pairs_key(self, origin_key):
351  self.converter[origin_key] = self.site_pairs_key
352  self.backward_converter[self.site_pairs_key] = origin_key
353 
354  def set_id_score_key(self, origin_key):
355  self.converter[origin_key] = self.id_score_key
356  self.backward_converter[self.id_score_key] = origin_key
357 
358  def set_fdr_key(self, origin_key):
359  self.converter[origin_key] = self.fdr_key
360  self.backward_converter[self.fdr_key] = origin_key
361 
362  def set_quantitation_key(self, origin_key):
363  self.converter[origin_key] = self.quantitation_key
364  self.backward_converter[self.quantitation_key] = origin_key
365 
366  def set_psi_key(self, origin_key):
367  self.converter[origin_key] = self.psi_key
368  self.backward_converter[self.psi_key] = origin_key
369 
370  def set_link_type_key(self, link_type_key):
371  self.converter[link_type_key] = self.link_type_key
372  self.backward_converter[self.link_type_key] = link_type_key
373 
374  def get_converter(self):
375  '''
376  Returns the dictionary that convert the old keywords to the new ones
377  '''
378  self.check_keys()
379  return self.converter
380 
382  '''
383  Returns the dictionary that convert the new keywords to the old ones
384  '''
385  self.check_keys()
386  return self.backward_converter
387 
388 
389 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
390  '''
391  A class to handle different styles of site pairs parsers.
392  Implemented styles:
393  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
394  [Y3-;K4-] for dead-ends
395  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
396  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
397  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
398  ''' # noqa: E501
399 
400  import re
401 
402  def __init__(self, style):
403  _CrossLinkDataBaseStandardKeys.__init__(self)
404  if style == "MSSTUDIO":
405  self.style = style
406  self.compulsory_keys = set([self.protein1_key,
407  self.protein2_key,
408  self.site_pairs_key])
409  elif style == "XTRACT" or style == "QUANTITATION":
410  self.style = style
411  self.compulsory_keys = set([self.site_pairs_key])
412  elif style == "LAN_HUANG":
413  self.style = style
414  self.compulsory_keys = set([self.site_pairs_key])
415  else:
416  raise Exception("ResiduePairListParser: unknown list parser style")
417 
418  def get_compulsory_keys(self):
419  return self.compulsory_keys
420 
421  def get_list(self, input_string):
422  '''
423  This function returns a list of cross-linked residues and the
424  corresponding list of cross-linked chains. The latter list can be
425  empty, if the style doesn't have the corresponding information.
426  '''
427  if self.style == "MSSTUDIO":
428  input_string = input_string.replace("[", "")
429  input_string = input_string.replace("]", "")
430  input_string_pairs = input_string.split(";")
431  residue_pair_indexes = []
432  chain_pair_indexes = []
433  for s in input_string_pairs:
434  m1 = self.re.search(
435  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
436  r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
437  s)
438  m2 = self.re.search(
439  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
440  if m1:
441  # cross-link
442  (residue_type_1, residue_index_1, residue_type_2,
443  residue_index_2) = m1.group(1, 2, 3, 4)
444  residue_pair_indexes.append((residue_index_1,
445  residue_index_2))
446  elif m2:
447  # dead end
448  residue_type_1, residue_index_1 = m2.group(1, 2)
449  # at this stage chain_pair_indexes is empty
450  return residue_pair_indexes, chain_pair_indexes
451  if self.style == "XTRACT" or self.style == "QUANTITATION":
452  if ":x:" in input_string:
453  # if it is a cross-link....
454  input_strings = input_string.split(":x:")
455  first_peptides = input_strings[0].split(":|:")
456  second_peptides = input_strings[1].split(":|:")
457  first_peptides_identifiers = [
458  (f.split(":")[0],
459  f.split(":")[1]) for f in first_peptides]
460  second_peptides_identifiers = [
461  (f.split(":")[0],
462  f.split(":")[1]) for f in second_peptides]
463  residue_pair_indexes = []
464  chain_pair_indexes = []
465  for fpi in first_peptides_identifiers:
466  for spi in second_peptides_identifiers:
467  chain1 = fpi[0]
468  chain2 = spi[0]
469  residue1 = fpi[1]
470  residue2 = spi[1]
471  residue_pair_indexes.append((residue1, residue2))
472  chain_pair_indexes.append((chain1, chain2))
473  return residue_pair_indexes, chain_pair_indexes
474  else:
475  # if it is a monolink....
476  first_peptides = input_string.split(":|:")
477  first_peptides_identifiers = [
478  (f.split(":")[0], f.split(":")[1]) for f in first_peptides]
479  residue_indexes = []
480  chain_indexes = []
481  for fpi in first_peptides_identifiers:
482  chain1 = fpi[0]
483  residue1 = fpi[1]
484  residue_indexes.append((residue1,))
485  chain_indexes.append((chain1,))
486  return residue_indexes, chain_indexes
487  if self.style == "LAN_HUANG":
488  input_strings = input_string.split("-")
489  chain1, first_series = input_strings[0].split(":")
490  chain2, second_series = input_strings[1].split(":")
491 
492  first_residues = first_series.replace(";", "|").split("|")
493  second_residues = second_series.replace(";", "|").split("|")
494  residue_pair_indexes = []
495  chain_pair_indexes = []
496  for fpi in first_residues:
497  for spi in second_residues:
498  residue1 = self.re.sub("[^0-9]", "", fpi)
499  residue2 = self.re.sub("[^0-9]", "", spi)
500  residue_pair_indexes.append((residue1, residue2))
501  chain_pair_indexes.append((chain1, chain2))
502  return residue_pair_indexes, chain_pair_indexes
503 
504 
505 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
506  '''
507  A class to handle different XL format with fixed format
508  currently support ProXL
509  '''
510  def __init__(self, format):
511  _CrossLinkDataBaseStandardKeys.__init__(self)
512  if format == "PROXL":
513  self.format = format
514  else:
515  raise Exception("FixedFormatParser: unknown list format name")
516 
517  def get_data(self, input_string):
518  if self.format == "PROXL":
519  tokens = input_string.split("\t")
520  xl = {}
521  if tokens[0] == "SEARCH ID(S)":
522  return None
523  else:
524  xl[self.protein1_key] = tokens[2]
525  xl[self.protein2_key] = tokens[4]
526  xl[self.residue1_key] = int(tokens[3])
527  xl[self.residue2_key] = int(tokens[5])
528  return xl
529 
530 
531 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
532  '''
533  this class handles a cross-link dataset and do filtering
534  operations, adding cross-links, merge datasets...
535  '''
536 
537  def __init__(self, converter=None, data_base=None, fasta_seq=None,
538  linkable_aa=('K',)):
539  '''
540  Constructor.
541  @param converter an instance of CrossLinkDataBaseKeywordsConverter
542  @param data_base an instance of CrossLinkDataBase to build the
543  new database on
544  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
545  protein fasta sequences to check cross-link consistency.
546  If not given consistency will not be checked
547  @param linkable_aa a tuple containing one-letter amino acids which
548  are linkable by the cross-linker; only used if the database
549  DOES NOT provide a value for a certain residueX_amino_acid_key
550  and if a fasta_seq is given
551  '''
552 
553  if data_base is None:
554  self.data_base = {}
555  else:
556  self.data_base = data_base
557 
558  _CrossLinkDataBaseStandardKeys.__init__(self)
559  if converter is not None:
560  self.cldbkc = converter # type: CrossLinkDataBaseKeywordsConverter
561  self.list_parser = self.cldbkc.rplp
562  self.converter = converter.get_converter()
563 
564  else:
565  self.cldbkc = None # type: CrossLinkDataBaseKeywordsConverter
566  self.list_parser = None
567  self.converter = None
568 
569  # default amino acids considered to be 'linkable' if none are given
570  self.def_aa_tuple = linkable_aa
571  self.fasta_seq = fasta_seq # type: IMP.pmi.topology.Sequences
572  self.dataset = None
573  self.name = None
574  self._update()
575 
576  def _update(self):
577  '''
578  Update the whole dataset after changes
579  '''
580  self.update_cross_link_unique_sub_index()
581  self.update_cross_link_redundancy()
582  self.update_residues_links_number()
584 
585  def __iter__(self):
586  sorted_ids = sorted(self.data_base.keys())
587  for k in sorted_ids:
588  for xl in self.data_base[k]:
589  yield xl
590 
591  def xlid_iterator(self):
592  sorted_ids = sorted(self.data_base.keys())
593  for xlid in sorted_ids:
594  yield xlid
595 
596  def __getitem__(self, xlid):
597  return self.data_base[xlid]
598 
599  def __len__(self):
600  return len([xl for xl in self])
601 
602  def get_name(self):
603  return self.name
604 
605  def set_name(self, name):
606  new_data_base = {}
607  for k in self.data_base:
608  new_data_base[k + "." + name] = self.data_base[k]
609  self.data_base = new_data_base
610  self.name = name
611  self._update()
612 
613  def get_number_of_xlid(self):
614  return len(self.data_base)
615 
616  def create_set_from_file(self, file_name, converter=None,
617  FixedFormatParser=None):
618  '''
619  if FixedFormatParser is not specified, the file is
620  comma-separated-values
621 
622  @param file_name a txt file to be parsed
623  @param converter an instance of CrossLinkDataBaseKeywordsConverter
624  @param FixedFormatParser a parser for a fixed format
625  '''
626  if not FixedFormatParser:
627  xl_list = IMP.pmi.tools.get_db_from_csv(file_name)
628 
629  if converter is not None:
630  self.cldbkc = converter
631  self.list_parser = self.cldbkc.rplp
632  self.converter = converter.get_converter()
633 
634  if not self.list_parser:
635  # normal procedure without a list_parser
636  # each line is a cross-link
637  new_xl_dict = {}
638  for nxl, xl in enumerate(xl_list):
639  new_xl = {}
640  for k in xl:
641  if k in self.converter:
642  new_xl[self.converter[k]] = \
643  self.type[self.converter[k]](xl[k])
644  else:
645  new_xl[k] = xl[k]
646  if self.unique_id_key in self.cldbkc.get_setup_keys():
647  if new_xl[self.unique_id_key] not in new_xl_dict:
648  new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
649  else:
650  new_xl_dict[new_xl[self.unique_id_key]].append(
651  new_xl)
652  else:
653  if str(nxl) not in new_xl_dict:
654  new_xl_dict[str(nxl)] = [new_xl]
655  else:
656  new_xl_dict[str(nxl)].append(new_xl)
657 
658  else:
659  # with a list_parser, a line can be a list of ambiguous
660  # cross-links
661  new_xl_dict = {}
662  for nxl, entry in enumerate(xl_list):
663 
664  # first get the translated keywords
665  new_dict = {}
666  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
667  raise Exception(
668  "CrossLinkDataBase: expecting a site_pairs_key "
669  "for the site pair list parser")
670  for k in entry:
671  if k in self.converter:
672  new_dict[self.converter[k]] = \
673  self.type[self.converter[k]](entry[k])
674  else:
675  new_dict[k] = entry[k]
676 
677  (residue_pair_list,
678  chain_pair_list) = self.list_parser.get_list(
679  new_dict[self.site_pairs_key])
680 
681  # then create the cross-links
682  for n, p in enumerate(residue_pair_list):
683  is_monolink = False
684  if len(p) == 1:
685  is_monolink = True
686 
687  new_xl = {}
688  for k in new_dict:
689  new_xl[k] = new_dict[k]
690  new_xl[self.residue1_key] = \
691  self.type[self.residue1_key](p[0])
692  if not is_monolink:
693  new_xl[self.residue2_key] = \
694  self.type[self.residue2_key](p[1])
695 
696  if len(chain_pair_list) == len(residue_pair_list):
697  new_xl[self.protein1_key] = \
698  self.type[self.protein1_key](
699  chain_pair_list[n][0])
700  if not is_monolink:
701  new_xl[self.protein2_key] = \
702  self.type[self.protein2_key](
703  chain_pair_list[n][1])
704 
705  if not is_monolink:
706  new_xl[self.link_type_key] = "CROSSLINK"
707  else:
708  new_xl[self.link_type_key] = "MONOLINK"
709 
710  if self.unique_id_key in self.cldbkc.get_setup_keys():
711  if new_xl[self.unique_id_key] not in new_xl_dict:
712  new_xl_dict[new_xl[self.unique_id_key]] = \
713  [new_xl]
714  else:
715  new_xl_dict[new_xl[self.unique_id_key]].append(
716  new_xl)
717  else:
718  if str(nxl) not in new_xl_dict:
719  new_xl[self.unique_id_key] = str(nxl+1)
720  new_xl_dict[str(nxl)] = [new_xl]
721  else:
722  new_xl[self.unique_id_key] = str(nxl+1)
723  new_xl_dict[str(nxl)].append(new_xl)
724 
725  else:
726  '''
727  if FixedFormatParser is defined
728  '''
729 
730  new_xl_dict = {}
731  f = open(file_name, "r")
732  nxl = 0
733  for line in f:
734  xl = FixedFormatParser.get_data(line)
735  if xl:
736  xl[self.unique_id_key] = str(nxl+1)
737  new_xl_dict[str(nxl)] = [xl]
738  nxl += 1
739 
740  self.data_base = new_xl_dict
741  self.name = file_name
742  loc = ihm.location.InputFileLocation(file_name, details='Crosslinks')
743  self.dataset = ihm.dataset.CXMSDataset(loc)
744  self._update()
745 
746  def update_cross_link_unique_sub_index(self):
747  for k in self.data_base:
748  for n, xl in enumerate(self.data_base[k]):
749  xl[self.ambiguity_key] = len(self.data_base[k])
750  xl[self.unique_sub_index_key] = n+1
751  xl[self.unique_sub_id_key] = k + "." + str(n+1)
752 
753  def update_cross_link_redundancy(self):
754  redundancy_data_base = {}
755  for xl in self:
756  pra = _ProteinsResiduesArray(xl)
757  if pra not in redundancy_data_base:
758  redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
759  redundancy_data_base[pra.get_inverted()] = \
760  [xl[self.unique_sub_id_key]]
761  else:
762  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
763  redundancy_data_base[pra.get_inverted()].append(
764  xl[self.unique_sub_id_key])
765  for xl in self:
766  pra = _ProteinsResiduesArray(xl)
767  xl[self.redundancy_key] = len(redundancy_data_base[pra])
768  xl[self.redundancy_list_key] = redundancy_data_base[pra]
769 
770  def update_residues_links_number(self):
771  residue_links = {}
772  for xl in self:
773  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
774  if (p1, r1) not in residue_links:
775  residue_links[(p1, r1)] = set([(p2, r2)])
776  else:
777  residue_links[(p1, r1)].add((p2, r2))
778  if (p2, r2) not in residue_links:
779  residue_links[(p2, r2)] = set([(p1, r1)])
780  else:
781  residue_links[(p2, r2)].add((p1, r1))
782 
783  for xl in self:
784  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
785  xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
786  xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
787 
789  """This function checks the consistency of the dataset with the
790  amino acid sequence"""
791  if self.cldbkc and self.fasta_seq:
792  cnt_matched, cnt_matched_file = 0, 0
793  matched = {}
794  non_matched = {}
795  for xl in self:
796  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
797  b_matched_file = False
798  if self.residue1_amino_acid_key in xl:
799  # either you know the residue type and aa_tuple is
800  # a single entry
801  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
802  b_matched = self._match_xlinks(p1, r1, aa_from_file)
803  b_matched_file = b_matched
804  else:
805  # or pass the possible list of types that can be
806  # cross-linked
807  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
808 
809  matched, non_matched = self._update_matched_xlinks(
810  b_matched, p1, r1, matched, non_matched)
811 
812  if self.residue2_amino_acid_key in xl:
813  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
814  b_matched = self._match_xlinks(p2, r2, aa_from_file)
815  b_matched_file = b_matched
816  else:
817  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
818 
819  matched, non_matched = self._update_matched_xlinks(
820  b_matched, p2, r2, matched, non_matched)
821  if b_matched:
822  cnt_matched += 1
823  if b_matched_file:
824  cnt_matched_file += 1
825  if len(self) > 0:
826  percentage_matched = round(100*cnt_matched/len(self), 1)
827  percentage_matched_file = round(
828  100 * cnt_matched_file / len(self), 1)
829  if matched or non_matched:
830  print(
831  "check_cross_link_consistency: Out of %d cross-links "
832  "%d were matched to the fasta sequence (%f %%).\n "
833  "%d were matched by using the cross-link file (%f %%)."
834  % (len(self), cnt_matched, percentage_matched,
835  cnt_matched_file, percentage_matched_file))
836  if non_matched:
837  print("check_cross_link_consistency: Warning: Non "
838  "matched xlinks:",
839  [(prot_name, sorted(list(non_matched[prot_name])))
840  for prot_name in non_matched])
841  return matched, non_matched
842 
843  def _match_xlinks(self, prot_name, res_index, aa_tuple):
844  # returns Boolean whether given aa matches a position in the fasta file
845  # cross-link files usually start counting at 1 and not 0; therefore
846  # subtract -1 to compare with fasta
847  amino_dict = IMP.pmi.alphabets.amino_acid
848  res_index -= 1
849  for amino_acid in aa_tuple:
850  if len(amino_acid) == 3:
851  amino_acid = amino_dict.get_one_letter_code_from_residue_type(
852  amino_acid.upper())
853  if prot_name in self.fasta_seq.sequences:
854  seq = self.fasta_seq.sequences[prot_name]
855  # if we are looking at the first amino acid in a given
856  # sequence always return a match
857  # the first aa should always be the n-terminal aa
858  # which may form a cross-link in any case (for BS3-like
859  # cross-linkers)
860  # for some data sets the first aa is at position 1;
861  # todo: check why this is the case
862  if res_index == 0 or res_index == 1:
863  return True
864  if res_index < len(seq):
865  if amino_acid == seq[res_index]:
866  return True
867  return False
868 
869  def _update_matched_xlinks(self, b_matched, prot, res, matched,
870  non_matched):
871  if b_matched:
872  if prot in matched:
873  matched[prot].add(res)
874  else:
875  matched[prot] = set([res])
876  else:
877  if prot in non_matched:
878  non_matched[prot].add(res)
879  else:
880  non_matched[prot] = set([res])
881  return matched, non_matched
882 
883  def get_cross_link_string(self, xl):
884  string = '|'
885  for k in self.ordered_key_list:
886  try:
887  string += str(k) + ":" + str(xl[k]) + "|"
888  except KeyError:
889  continue
890 
891  for k in xl:
892  if k not in self.ordered_key_list:
893  string += str(k) + ":" + str(xl[k]) + "|"
894 
895  return string
896 
897  def get_short_cross_link_string(self, xl):
898  string = '|'
899  list_of_keys = [self.data_set_name_key,
900  self.unique_sub_id_key,
901  self.protein1_key,
902  self.residue1_key,
903  self.protein2_key,
904  self.residue2_key,
905  self.state_key,
906  self.psi_key]
907 
908  for k in list_of_keys:
909  try:
910  string += str(xl[k]) + "|"
911  except KeyError:
912  continue
913 
914  return string
915 
916  def filter(self, FilterOperator):
917  new_xl_dict = {}
918  for id in self.data_base.keys():
919  for xl in self.data_base[id]:
920  if FilterOperator.evaluate(xl):
921  if id not in new_xl_dict:
922  new_xl_dict[id] = [xl]
923  else:
924  new_xl_dict[id].append(xl)
925  self._update()
926  cdb = CrossLinkDataBase(self.cldbkc, new_xl_dict)
927  cdb.dataset = self.dataset
928  return cdb
929 
930  def filter_score(self, score):
931  '''Get all cross-links with score greater than an input value'''
932  FilterOperator = IMP.pmi.io.crosslink.FilterOperator(
933  self.id_score_key, operator.gt, score)
934  return self.filter(FilterOperator)
935 
936  def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
937  '''
938  This function merges two cross-link datasets so that if two
939  conflicting cross-links have the same cross-link UniqueIDS,
940  the cross-links will be appended under the same UniqueID slots
941  with different SubIDs
942  '''
943  raise NotImplementedError()
944 
945  def append_database(self, db):
946  """Append cross-link dataset to this one."""
947  new_data_base = {}
948  for k in self.data_base:
949  new_data_base[k] = self.data_base[k]
950  for k in db.data_base:
951  new_data_base[k] = db.data_base[k]
952  self.data_base = new_data_base
953  self._update()
954 
955  def __iadd__(self, db):
956  self.append_database(db)
957  return self
958 
959  def set_value(self, key, new_value, filter_operator=None):
960  '''
961  This function changes the value for a given key in the database
962  For instance one can change the name of a protein
963  @param key: the key in the database that must be changed
964  @param new_value: the new value of the key
965  @param filter_operator: optional FilterOperator to change the value to
966  a subset of the database
967 
968  example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
969  FO(cldb.protein1_key, operator.eq, "AAA"))`
970  '''
971 
972  for xl in self:
973  if filter_operator is not None:
974  if filter_operator.evaluate(xl):
975  xl[key] = new_value
976  else:
977  xl[key] = new_value
978  self._update()
979 
980  def get_values(self, key):
981  '''
982  this function returns the list of values for a given key in the
983  database alphanumerically sorted
984  '''
985  values = set()
986  for xl in self:
987  values.add(xl[key])
988  return sorted(list(values))
989 
990  def offset_residue_index(self, protein_name, offset):
991  '''
992  This function offset the residue indexes of a given protein by
993  a specified value
994  @param protein_name: the protein name that need to be changed
995  @param offset: the offset value
996  '''
997 
998  for xl in self:
999  if xl[self.protein1_key] == protein_name:
1000  xl[self.residue1_key] = xl[self.residue1_key] + offset
1001  if xl[self.protein2_key] == protein_name:
1002  xl[self.residue2_key] = xl[self.residue2_key] + offset
1003  self._update()
1004 
1005  def create_new_keyword(self, keyword, values_from_keyword=None):
1006  '''
1007  This function creates a new keyword for the whole database and
1008  set the values from and existing keyword (optional), otherwise the
1009  values are set to None
1010  @param keyword the new keyword name:
1011  @param values_from_keyword the keyword from which we are copying
1012  the values:
1013  '''
1014  for xl in self:
1015  if values_from_keyword is not None:
1016  xl[keyword] = xl[values_from_keyword]
1017  else:
1018  xl[keyword] = None
1019  self._update()
1020 
1021  def rename_proteins(self, old_to_new_names_dictionary,
1022  protein_to_rename="both"):
1023  '''
1024  This function renames all proteins contained in the input dictionary
1025  from the old names (keys) to the new name (values)
1026  @param old_to_new_names_dictionary dictionary for converting old
1027  to new names
1028  @param protein_to_rename specify whether to rename both or protein1
1029  or protein2 only
1030  '''
1031 
1032  for old_name in old_to_new_names_dictionary:
1033  new_name = old_to_new_names_dictionary[old_name]
1034  if protein_to_rename == "both" or protein_to_rename == "protein1":
1035  fo2 = FilterOperator(self.protein1_key, operator.eq, old_name)
1036  self.set_value(self.protein1_key, new_name, fo2)
1037  if protein_to_rename == "both" or protein_to_rename == "protein2":
1038  fo2 = FilterOperator(self.protein2_key, operator.eq, old_name)
1039  self.set_value(self.protein2_key, new_name, fo2)
1040 
1041  def align_sequence(self, alignfile):
1042  """
1043  This function parses an alignment file obtained previously, to map
1044  crosslink residues onto the sequence of a homolog proteins. It is
1045  useful if you want to map the crosslinks onto a known, homolog
1046  structure without building a comparative model.
1047  The txt file of the alignment should be structured with repeating
1048  blocks, as follows:
1049 
1050  >NAME_CROSSLINKED_PROTEIN
1051  >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1052  one-letter code sequence for cross-linked protein (one line)
1053  one-letter code sequence for homolog protein with known structure (one line)
1054  """ # noqa: E501
1055 
1056  f = open(alignfile, 'r')
1057 
1058  def _assign_residues(seq1, seq2):
1059  d = {}
1060  resinds1 = []
1061  nres = 0
1062  contiguousgap = 0
1063  for r1 in seq1:
1064  if r1 != "-":
1065  nres += 1
1066  resinds1.append(nres)
1067  else:
1068  resinds1.append(None)
1069 
1070  resinds2 = [None for i in resinds1]
1071  nres = 0
1072  lastresind = 1
1073  for pos, r2 in enumerate(seq2):
1074  if r2 != "-":
1075  nres += 1
1076  resinds2[pos] = nres
1077  lastresind = nres
1078  if contiguousgap > 1:
1079  for i in range(pos-int(contiguousgap/2), pos):
1080  resinds2[i] = nres
1081  contiguousgap = 0
1082  else:
1083  contiguousgap += 1
1084  resinds2[pos] = lastresind
1085 
1086  for n, r in enumerate(resinds1):
1087  if r is not None:
1088  d[r] = resinds2[n]
1089  return d
1090 
1091  aa = True
1092  sequences1 = {}
1093  sequences2 = {}
1094  with open(alignfile, 'r') as f:
1095  while aa:
1096  aa = f.readline()
1097  bb = f.readline()
1098  cc = f.readline()
1099  dd = f.readline()
1100  protname = aa.replace(">", "").replace("\n", "")
1101  seq1 = cc.replace(",", "").replace("\n", "")
1102  seq2 = dd.replace(",", "").replace("\n", "")
1103 
1104  n1 = 0
1105  n2 = 0
1106  tmpl1 = []
1107  tmpl2 = []
1108  for i in range(len(seq1)):
1109  if seq1[i] != "-":
1110  n1 += 1
1111  c1 = n1
1112  else:
1113  c1 = None
1114  if seq2[i] != "-":
1115  n2 += 1
1116  c2 = n2
1117  else:
1118  c2 = None
1119 
1120  tmpl1.append([i, c1, seq1[i]])
1121  tmpl2.append([i, c2, seq2[i]])
1122  sequences1[protname] = tmpl1
1123  sequences2[protname] = tmpl2
1124 
1125  d = _assign_residues(seq1, seq2)
1126 
1127  fo11 = FilterOperator(self.protein1_key, operator.eq, protname)
1128  fo12 = FilterOperator(self.protein2_key, operator.eq, protname)
1129 
1130  for xl in self:
1131  if fo11.evaluate(xl):
1132  t = [item for item in sequences1[protname]
1133  if item[1] == xl[self.residue1_key]][0]
1134  sequences1[protname][t[0]] = [
1135  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1136  t = [item for item in sequences2[protname]
1137  if item[1] == d[xl[self.residue1_key]]][0]
1138  sequences2[protname][t[0]] = [
1139  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1140 
1141  xl[self.residue1_key] = d[xl[self.residue1_key]]
1142 
1143  if fo12.evaluate(xl):
1144 
1145  t = [item for item in sequences1[protname]
1146  if item[1] == xl[self.residue2_key]][0]
1147  sequences1[protname][t[0]] = [
1148  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1149  t = [item for item in sequences2[protname]
1150  if item[1] == d[xl[self.residue2_key]]][0]
1151  sequences2[protname][t[0]] = [
1152  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1153 
1154  xl[self.residue2_key] = d[xl[self.residue2_key]]
1155 
1156  print("")
1157  print(aa.replace("\n", ""))
1158  print("".join([seq[2] for seq in sequences1[protname]]))
1159  print(bb.replace("\n", ""))
1160  print("".join([seq[2] for seq in sequences2[protname]]))
1161 
1162  def classify_crosslinks_by_score(self, number_of_classes):
1163  '''
1164  This function creates as many classes as in the input
1165  (number_of_classes: integer) and partition cross-links according
1166  to their identification scores. Classes are defined in the psi key.
1167  '''
1168 
1169  if self.id_score_key is not None:
1170  scores = self.get_values(self.id_score_key)
1171  else:
1172  raise ValueError(
1173  'The cross-link database does not contain score values')
1174  minscore = min(scores)
1175  maxscore = max(scores)
1176  scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1177  if self.psi_key is None:
1178  self.create_new_keyword(self.psi_key, values_from_keyword=None)
1179  for xl in self:
1180  score = xl[self.id_score_key]
1181  for n, classmin in enumerate(scoreclasses[0:-1]):
1182  if score >= classmin and score <= scoreclasses[n+1]:
1183  xl[self.psi_key] = str("CLASS_"+str(n))
1184  self._update()
1185 
1186  def clone_protein(self, protein_name, new_protein_name):
1187  for id in self.data_base.keys():
1188  new_data_base = []
1189  for xl in self.data_base[id]:
1190  new_data_base.append(xl)
1191  if xl[self.protein1_key] == protein_name \
1192  and xl[self.protein2_key] != protein_name:
1193  new_xl = dict(xl)
1194  new_xl[self.protein1_key] = new_protein_name
1195  new_data_base.append(new_xl)
1196  elif xl[self.protein1_key] != protein_name \
1197  and xl[self.protein2_key] == protein_name:
1198  new_xl = dict(xl)
1199  new_xl[self.protein2_key] = new_protein_name
1200  new_data_base.append(new_xl)
1201  elif xl[self.protein1_key] == protein_name \
1202  and xl[self.protein2_key] == protein_name:
1203  new_xl = dict(xl)
1204  new_xl[self.protein1_key] = new_protein_name
1205  new_data_base.append(new_xl)
1206  new_xl = dict(xl)
1207  new_xl[self.protein2_key] = new_protein_name
1208  new_data_base.append(new_xl)
1209  new_xl = dict(xl)
1210  new_xl[self.protein1_key] = new_protein_name
1211  new_xl[self.protein2_key] = new_protein_name
1212  new_data_base.append(new_xl)
1213  self.data_base[id] = new_data_base
1214  self._update()
1215 
1217  '''
1218  This function remove cross-links applied to the same residue
1219  (ie, same chain name and residue number)
1220  '''
1221  for id in self.data_base.keys():
1222  new_data_base = []
1223  for xl in self.data_base[id]:
1224  if xl[self.protein1_key] == xl[self.protein2_key] \
1225  and xl[self.residue1_key] == xl[self.residue2_key]:
1226  continue
1227  else:
1228  new_data_base.append(xl)
1229  self.data_base[id] = new_data_base
1230  self._update()
1231 
1232  def jackknife(self, percentage):
1233  '''
1234  this method returns a CrossLinkDataBase class containing
1235  a random subsample of the original cross-link database.
1236  @param percentage float between 0 and 1, is the percentage of
1237  of spectra taken from the original list
1238  '''
1239  import random
1240  if percentage > 1.0 or percentage < 0.0:
1241  raise ValueError('the percentage of random cross-link spectra '
1242  'should be between 0 and 1')
1243  nspectra = self.get_number_of_xlid()
1244  nrandom_spectra = int(nspectra*percentage)
1245  random_keys = random.sample(self.data_base.keys(), nrandom_spectra)
1246  new_data_base = {}
1247  for k in random_keys:
1248  new_data_base[k] = self.data_base[k]
1249  return CrossLinkDataBase(self.cldbkc, new_data_base)
1250 
1251  def __str__(self):
1252  outstr = ''
1253  sorted_ids = sorted(self.data_base.keys())
1254 
1255  for id in sorted_ids:
1256  outstr += id + "\n"
1257  for xl in self.data_base[id]:
1258  for k in self.ordered_key_list:
1259  try:
1260  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1261  except KeyError:
1262  continue
1263 
1264  for k in xl:
1265  if k not in self.ordered_key_list:
1266  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1267  outstr += "-------------\n"
1268  return outstr
1269 
1270  def plot(self, filename, **kwargs):
1271  import matplotlib
1272  matplotlib.use('Agg')
1273  import matplotlib.pyplot as plt
1274  import matplotlib.colors
1275 
1276  if kwargs["type"] == "scatter":
1277  cmap = plt.get_cmap("rainbow")
1278  _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1279  xkey = kwargs["xkey"]
1280  ykey = kwargs["ykey"]
1281  if "colorkey" in kwargs:
1282  colorkey = kwargs["colorkey"]
1283  if "logyscale" in kwargs:
1284  logyscale = kwargs["logyscale"]
1285  else:
1286  logyscale = False
1287  xs = []
1288  ys = []
1289  colors = []
1290  for xl in self:
1291  try:
1292  xs.append(float(xl[xkey]))
1293  if logyscale:
1294  ys.append(math.log10(float(xl[ykey])))
1295  else:
1296  ys.append(float(xl[ykey]))
1297  colors.append(float(xl[colorkey]))
1298  except ValueError:
1299  print("CrossLinkDataBase.plot: Value error for "
1300  "cross-link %s" % (xl[self.unique_id_key]))
1301  continue
1302 
1303  cs = []
1304  for color in colors:
1305  cindex = (color-min(colors))/(max(colors)-min(colors))
1306  cs.append(cmap(cindex))
1307 
1308  fig = plt.figure()
1309  ax = fig.add_subplot(111)
1310  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker="o")
1311  ax.set_xlabel(xkey)
1312  ax.set_ylabel(ykey)
1313  plt.savefig(filename)
1314  plt.show()
1315  plt.close()
1316 
1317  if kwargs["type"] == "residue_links":
1318  # plot the number of distinct links to a residue
1319  # in an histogram
1320  # molecule name
1321  molecule = kwargs["molecule"]
1322  if type(molecule) is IMP.pmi.topology.Molecule:
1323  length = len(molecule.sequence)
1324  molecule = molecule.get_name()
1325  else:
1326  # you need a IMP.pmi.topology.Sequences object
1327  sequences_object = kwargs["sequences_object"]
1328  sequence = sequences_object.sequences[molecule]
1329  length = len(sequence)
1330 
1331  histogram = [0]*length
1332  for xl in self:
1333  if xl[self.protein1_key] == molecule:
1334  histogram[xl[self.residue1_key]-1] = \
1335  xl[self.residue1_links_number_key]
1336  if xl[self.protein2_key] == molecule:
1337  histogram[xl[self.residue2_key]-1] = \
1338  xl[self.residue2_links_number_key]
1339  plt.bar(range(1, length+1), histogram)
1340  plt.savefig(filename)
1341  plt.show()
1342  plt.close()
1343 
1344  if kwargs["type"] == "histogram":
1345  valuekey = kwargs["valuekey"]
1346  valuename = valuekey
1347  bins = kwargs["bins"]
1348  values_list = []
1349  for xl in self:
1350  try:
1351  values_list.append(float(xl[valuekey]))
1352  except ValueError:
1353  print("CrossLinkDataBase.plot: Value error for cross-link "
1354  "%s" % (xl[self.unique_id_key]))
1355  continue
1357  filename, [values_list], valuename=valuename, bins=bins,
1358  colors=None, format="pdf",
1359  reference_xline=None, yplotrange=None,
1360  xplotrange=None, normalized=True,
1361  leg_names=None)
1362 
1363  def dump(self, json_filename):
1364  import json
1365  with open(json_filename, 'w') as fp:
1366  json.dump(self.data_base, fp, sort_keys=True, indent=2,
1367  default=set_json_default)
1368 
1369  def load(self, json_filename):
1370  import json
1371  with open(json_filename, 'r') as fp:
1372  self.data_base = json.load(fp)
1373  self._update()
1374  # getting rid of unicode
1375  # (can't do this in Python 3, since *everything* is Unicode there)
1376  if sys.version_info[0] < 3:
1377  for xl in self:
1378  for k, v in xl.iteritems():
1379  if type(k) is unicode: # noqa: F821
1380  k = str(k)
1381  if type(v) is unicode: # noqa: F821
1382  v = str(v)
1383  xl[k] = v
1384 
1385  def save_csv(self, filename):
1386  import csv
1387 
1388  data = []
1389  sorted_ids = None
1390  sorted_group_ids = sorted(self.data_base.keys())
1391  for group in sorted_group_ids:
1392  group_block = []
1393  for xl in self.data_base[group]:
1394  if not sorted_ids:
1395  sorted_ids = sorted(xl.keys())
1396  values = [xl[k] for k in sorted_ids]
1397  group_block.append(values)
1398  data += group_block
1399 
1400  with open(filename, 'w') as fp:
1401  a = csv.writer(fp, delimiter=',')
1402  a.writerow(sorted_ids)
1403  a.writerows(data)
1404 
1406  """
1407  Returns the number of non redundant cross-link sites
1408  """
1409  praset = set()
1410  for xl in self:
1411  pra = _ProteinsResiduesArray(xl)
1412  prai = pra.get_inverted()
1413  praset.add(pra)
1414  praset.add(prai)
1415  return len(list(praset))
1416 
1417 
1419  """This class allows to compute and plot the distance between datasets"""
1420 
1421  def __init__(self, cldb_dictionary):
1422  """Input a dictionary where keys are cldb names and values are cldbs"""
1423  import scipy.spatial.distance
1424  self.dbs = cldb_dictionary
1425  self.keylist = list(self.dbs.keys())
1426  self.distances = list()
1427 
1428  for i, key1 in enumerate(self.keylist):
1429  for key2 in self.keylist[i+1:]:
1430  distance = self.get_distance(key1, key2)
1431  self.distances.append(distance)
1432 
1433  self.distances = scipy.spatial.distance.squareform(self.distances)
1434 
1435  def get_distance(self, key1, key2):
1436  return 1.0 - self.jaccard_index(self.dbs[key1], self.dbs[key2])
1437 
1438  def jaccard_index(self, CrossLinkDataBase1, CrossLinkDataBase2):
1439  """Similarity index between two datasets
1440  https://en.wikipedia.org/wiki/Jaccard_index"""
1441 
1442  set1 = set()
1443  set2 = set()
1444  for xl1 in CrossLinkDataBase1:
1445  a1f = _ProteinsResiduesArray(xl1)
1446  a1b = a1f.get_inverted()
1447  set1.add(a1f)
1448  set1.add(a1b)
1449  for xl2 in CrossLinkDataBase2:
1450  a2f = _ProteinsResiduesArray(xl2)
1451  a2b = a2f.get_inverted()
1452  set2.add(a2f)
1453  set2.add(a2b)
1454  return (float(len(set1 & set2)/2)
1455  / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1456 
1457  def plot_matrix(self, figurename="clustermatrix.pdf"):
1458  import matplotlib as mpl
1459  import numpy
1460  mpl.use('Agg')
1461  import matplotlib.pylab as pl
1462  from scipy.cluster import hierarchy as hrc
1463 
1464  raw_distance_matrix = self.distances
1465  labels = self.keylist
1466 
1467  fig = pl.figure()
1468 
1469  ax = fig.add_subplot(1, 1, 1)
1470  dendrogram = hrc.dendrogram(
1471  hrc.linkage(raw_distance_matrix),
1472  color_threshold=7,
1473  no_labels=True)
1474  leaves_order = dendrogram['leaves']
1475  ax.set_xlabel('Dataset')
1476  ax.set_ylabel('Jaccard Distance')
1477  pl.tight_layout()
1478  pl.savefig("dendrogram."+figurename, dpi=300)
1479  pl.close(fig)
1480 
1481  fig = pl.figure()
1482 
1483  ax = fig.add_subplot(1, 1, 1)
1484  cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1485  interpolation='nearest')
1486  cb = fig.colorbar(cax)
1487  cb.set_label('Jaccard Distance')
1488  ax.set_xlabel('Dataset')
1489  ax.set_ylabel('Dataset')
1490  ax.set_xticks(range(len(labels)))
1491  ax.set_xticklabels(numpy.array(labels)[leaves_order],
1492  rotation='vertical')
1493  ax.set_yticks(range(len(labels)))
1494  ax.set_yticklabels(numpy.array(labels)[leaves_order],
1495  rotation='horizontal')
1496  pl.tight_layout()
1497  pl.savefig("matrix." + figurename, dpi=300)
1498  pl.close(fig)
1499 
1500 
1502  '''
1503  This class maps a CrossLinkDataBase on a given structure
1504  and save an rmf file with color-coded cross-links
1505  '''
1506 
1507  def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1508  self.CrossLinkDataBase = CrossLinkDataBase
1509  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1510  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1511  self.prots = rmf_or_stat_handler
1512  self.distances = defaultdict(list)
1513  self.array_to_id = {}
1514  self.id_to_array = {}
1515 
1516  print("computing distances fro all cross-links and all structures")
1517  for i in self.prots[::10]:
1518  self.compute_distances()
1519  for key, xl in enumerate(self.CrossLinkDataBase):
1520  array = _ProteinsResiduesArray(xl)
1521  self.array_to_id[array] = key
1522  self.id_to_array[key] = array
1523  if xl["MinAmbiguousDistance"] != 'None':
1524  self.distances[key].append(xl["MinAmbiguousDistance"])
1525 
1526  def compute_distances(self):
1527  data = []
1528  sorted_ids = None
1529  sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1530  for group in sorted_group_ids:
1531  group_dists = []
1532  for xl in self.CrossLinkDataBase.data_base[group]:
1533  if not sorted_ids:
1534  sorted_ids = sorted(xl.keys())
1535  data.append(
1536  sorted_ids
1537  + ["UniqueID", "Distance", "MinAmbiguousDistance"])
1538  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1539  try:
1540  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1541  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1542  except: # noqa: E722
1543  mdist = "None"
1544  state1 = "None"
1545  copy1 = "None"
1546  state2 = "None"
1547  copy2 = "None"
1548  try:
1549  # sometimes keys get "lost" in the database,
1550  # not really sure why
1551  values = [xl[k] for k in sorted_ids]
1552  values += [group, mdist]
1553  except KeyError as e:
1554  print("MapCrossLinkDataBaseOnStructure "
1555  "KeyError: {0} in {1}".format(e, xl))
1556  group_dists.append(mdist)
1557  xl["Distance"] = mdist
1558  xl["State1"] = state1
1559  xl["Copy1"] = copy1
1560  xl["State2"] = state2
1561  xl["Copy2"] = copy2
1562 
1563  for xl in self.CrossLinkDataBase.data_base[group]:
1564  xl["MinAmbiguousDistance"] = min(group_dists)
1565 
1566  def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1567  '''more robust and slower version of above'''
1568  sel = IMP.atom.Selection(self.prots, molecule=c1, residue_index=r1,
1569  resolution=1)
1570  selpart_1 = sel.get_selected_particles()
1571  if len(selpart_1) == 0:
1572  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1573  "selected for first site")
1574  return None
1575  sel = IMP.atom.Selection(self.prots, molecule=c2, residue_index=r2,
1576  resolution=1)
1577  selpart_2 = sel.get_selected_particles()
1578  if len(selpart_2) == 0:
1579  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1580  "selected for second site")
1581  return None
1582  results = []
1583  for p1 in selpart_1:
1584  for p2 in selpart_2:
1585  if p1 == p2 and r1 == r2:
1586  continue
1587  d1 = IMP.core.XYZ(p1)
1588  d2 = IMP.core.XYZ(p2)
1589  # round distance to second decimal
1590  dist = float(int(IMP.core.get_distance(d1, d2)*100.0))/100.0
1591  h1 = IMP.atom.Hierarchy(p1)
1592  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1593  h1 = h1.get_parent()
1594  copy_index1 = IMP.atom.Copy(h1).get_copy_index()
1595  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1596  h1 = h1.get_parent()
1597  state_index1 = IMP.atom.State(h1).get_state_index()
1598  h2 = IMP.atom.Hierarchy(p2)
1599  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1600  h2 = h2.get_parent()
1601  copy_index2 = IMP.atom.Copy(h2).get_copy_index()
1602  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1603  h2 = h2.get_parent()
1604  state_index2 = IMP.atom.State(h2).get_state_index()
1605 
1606  results.append((dist, state_index1, copy_index1,
1607  state_index2, copy_index2, p1, p2))
1608  if len(results) == 0:
1609  return None
1610  results_sorted = sorted(results,
1611  key=operator.itemgetter(0, 1, 2, 3, 4))
1612  return ((results_sorted[0][0],
1613  results_sorted[0][5],
1614  results_sorted[0][6]),
1615  (results_sorted[0][1],
1616  results_sorted[0][2],
1617  results_sorted[0][3],
1618  results_sorted[0][4]))
1619 
1620  def save_rmf_snapshot(self, filename, color_id=None):
1621  sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1622  list_of_pairs = []
1623  color_scores = []
1624  for group in sorted_group_ids:
1625  group_dists_particles = []
1626  for xl in self.CrossLinkDataBase.data_base[group]:
1627  xllabel = \
1628  self.CrossLinkDataBase.get_short_cross_link_string(xl)
1629  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1630  try:
1631  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1632  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1633  except TypeError:
1634  print("TypeError or missing chain/residue ",
1635  r1, c1, r2, c2)
1636  continue
1637  if color_id is not None:
1638  group_dists_particles.append((mdist, p1, p2, xllabel,
1639  float(xl[color_id])))
1640  else:
1641  group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1642  if group_dists_particles:
1643  (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1644  min(group_dists_particles, key=lambda t: t[0])
1645  color_scores.append(mincolor_score)
1646  list_of_pairs.append((minp1, minp2, minxllabel,
1647  mincolor_score))
1648  else:
1649  continue
1650 
1651  linear = IMP.core.Linear(0, 0.0)
1652  linear.set_slope(1.0)
1653  dps2 = IMP.core.DistancePairScore(linear)
1654  rslin = IMP.RestraintSet(self.prots.get_model(),
1655  'linear_dummy_restraints')
1656  sgs = []
1657  offset = min(color_scores)
1658  maxvalue = max(color_scores)
1659  for pair in list_of_pairs:
1660  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2,
1661  (pair[0], pair[1]))
1662  pr.set_name(pair[2])
1663  if offset < maxvalue:
1664  factor = (pair[3]-offset)/(maxvalue-offset)
1665  else:
1666  factor = 0.0
1667  c = IMP.display.get_rgb_color(factor)
1668  seg = IMP.algebra.Segment3D(
1669  IMP.core.XYZ(pair[0]).get_coordinates(),
1670  IMP.core.XYZ(pair[1]).get_coordinates())
1671  rslin.add_restraint(pr)
1672  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
1673 
1674  rh = RMF.create_rmf_file(filename)
1675  IMP.rmf.add_hierarchies(rh, [self.prots])
1676  IMP.rmf.add_restraints(rh, [rslin])
1677  IMP.rmf.add_geometries(rh, sgs)
1678  IMP.rmf.save_frame(rh)
1679  del rh
1680 
1681  def boxplot_crosslink_distances(self, filename):
1682  import numpy
1683  keys = [self.id_to_array[k] for k in self.distances.keys()]
1684  medians = [numpy.median(self.distances[self.array_to_id[k]])
1685  for k in keys]
1686  dists = [self.distances[self.array_to_id[k]] for k in keys]
1687  distances_sorted_by_median = \
1688  [x for _, x in sorted(zip(medians, dists))]
1689  keys_sorted_by_median = [x for _, x in sorted(zip(medians, keys))]
1691  filename, distances_sorted_by_median,
1692  range(len(distances_sorted_by_median)),
1693  xlabels=keys_sorted_by_median,
1694  scale_plot_length=0.2)
1695 
1696  def get_crosslink_violations(self, threshold):
1697  violations = defaultdict(list)
1698  for k in self.distances:
1699  # viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1700  viols = self.distances[k]
1701  violations[self.id_to_array[k]] = viols
1702  return violations
1703 
1704 
1706  '''
1707  This class generates a CrossLinkDataBase from a given structure
1708  '''
1709  def __init__(self, system, residue_types_1=["K"],
1710  residue_types_2=["K"], reactivity_range=[0, 1], kt=1.0):
1711  import numpy.random
1712  import math
1714  cldbkc.set_protein1_key("Protein1")
1715  cldbkc.set_protein2_key("Protein2")
1716  cldbkc.set_residue1_key("Residue1")
1717  cldbkc.set_residue2_key("Residue2")
1718  self.cldb = CrossLinkDataBase(cldbkc)
1719  self.system = system
1720  self.model = self.system.model
1721  self.residue_types_1 = residue_types_1
1722  self.residue_types_2 = residue_types_2
1723  self.kt = kt
1724  self.indexes_dict1 = {}
1725  self.indexes_dict2 = {}
1726  self.protein_residue_dict = {}
1727  self.reactivity_dictionary = {}
1728  self.euclidean_interacting_pairs = None
1729  self.xwalk_interacting_pairs = None
1730 
1731  for state in self.system.get_states():
1732  for moleculename, molecules in state.get_molecules().items():
1733  for molecule in molecules:
1734  # we are saving a dictionary with protein name,
1735  # residue number and random reactivity of the residue
1736  seq = molecule.sequence
1737  residues = [i for i in range(1, len(seq)+1)
1738  if ((seq[i-1] in self.residue_types_1)
1739  or (seq[i-1] in self.residue_types_2))]
1740 
1741  for r in residues:
1742  # uniform random reactivities
1743  # getting reactivities from the CDF of an
1744  # exponential distribution
1745  rexp = numpy.random.exponential(0.00000001)
1746  prob = 1.0-math.exp(-rexp)
1747  self.reactivity_dictionary[(molecule, r)] = prob
1748 
1749  residues1 = [i for i in range(1, len(seq)+1)
1750  if seq[i-1] in self.residue_types_1]
1751  residues2 = [i for i in range(1, len(seq)+1)
1752  if seq[i-1] in self.residue_types_2]
1753  for r in residues1:
1754  s = IMP.atom.Selection(
1755  molecule.hier, residue_index=r, resolution=1)
1756  ps = s.get_selected_particles()
1757  for p in ps:
1758  index = p.get_index()
1759  self.indexes_dict1[index] = (molecule, r)
1760  self.protein_residue_dict[(molecule, r)] = index
1761  for r in residues2:
1762  s = IMP.atom.Selection(
1763  molecule.hier, residue_index=r, resolution=1)
1764  ps = s.get_selected_particles()
1765  for p in ps:
1766  index = p.get_index()
1767  self.indexes_dict2[index] = (molecule, r)
1768  self.protein_residue_dict[(molecule, r)] = index
1769 
1770  def get_all_possible_pairs(self):
1771  n = float(len(self.protein_residue_dict.keys()))
1772  return n*(n-1.0)/2.0
1773 
1774  def get_all_feasible_pairs(self, distance=21):
1775  import itertools
1776  particle_index_pairs = []
1777  nxl = 0
1778  for a, b in itertools.combinations(
1779  self.protein_residue_dict.keys(), 2):
1780  new_xl = {}
1781  index1 = self.protein_residue_dict[a]
1782  index2 = self.protein_residue_dict[b]
1783  particle_distance = IMP.core.get_distance(
1784  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1785  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1786  if particle_distance <= distance:
1787  particle_index_pairs.append((index1, index2))
1788  new_xl[self.cldb.protein1_key] = a[0].get_name()
1789  new_xl[self.cldb.protein2_key] = b[0].get_name()
1790  new_xl["molecule_object1"] = a[0]
1791  new_xl["molecule_object2"] = b[0]
1792  new_xl[self.cldb.residue1_key] = a[1]
1793  new_xl[self.cldb.residue2_key] = b[1]
1794  self.cldb.data_base[str(nxl)] = [new_xl]
1795  nxl += 1
1796  self.cldb._update()
1797  return self.cldb
1798 
1799  def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1800  noise=0.01, distance=21, max_delta_distance=10.0,
1801  xwalk_bin_path=None, confidence_false=0.75,
1802  confidence_true=0.75):
1803  import math
1804  from random import random
1805  import numpy as np
1806  number_of_spectra = 1
1807 
1808  self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1809  self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1810  self.cldb.data_base[str(number_of_spectra)] = []
1811  self.sites_weighted = None
1812 
1813  while number_of_spectra < total_number_of_spectra:
1814  if (random() > ambiguity_probability
1815  and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1816  # new spectrum
1817  number_of_spectra += 1
1818  self.cldb.data_base[str(number_of_spectra)] = []
1819  noisy = False
1820  if random() > noise:
1821  # not noisy cross-link
1822  pra, dist = self.get_random_residue_pair(
1823  distance, xwalk_bin_path, max_delta_distance)
1824  else:
1825  # noisy cross-link
1826  pra, dist = self.get_random_residue_pair(None, None, None)
1827  noisy = True
1828 
1829  new_xl = {}
1830  new_xl[self.cldb.protein1_key] = pra[0].get_name()
1831  new_xl[self.cldb.protein2_key] = pra[1].get_name()
1832  new_xl["molecule_object1"] = pra[0]
1833  new_xl["molecule_object2"] = pra[1]
1834  new_xl[self.cldb.residue1_key] = pra[2]
1835  new_xl[self.cldb.residue2_key] = pra[3]
1836  new_xl["Noisy"] = noisy
1837  # the reactivity is defined as r=1-exp(-k*Delta t)
1838  new_xl["Reactivity_Residue1"] = \
1839  self.reactivity_dictionary[(pra[0], pra[2])]
1840  new_xl["Reactivity_Residue2"] = \
1841  self.reactivity_dictionary[(pra[1], pra[3])]
1842  if noisy:
1843  new_xl["Score"] = np.random.beta(1.0, self.beta_false)
1844  else:
1845  new_xl["Score"] = 1.0-np.random.beta(1.0, self.beta_true)
1846  new_xl["TargetDistance"] = dist
1847  new_xl["NoiseProbability"] = noise
1848  new_xl["AmbiguityProbability"] = ambiguity_probability
1849  # getting if it is intra or inter rigid body
1850  (p1, p2) = IMP.get_particles(
1851  self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1852  self.protein_residue_dict[(pra[1], pra[3])]])
1855  IMP.core.RigidMember(p1).get_rigid_body() ==
1856  IMP.core.RigidMember(p2).get_rigid_body()):
1857  new_xl["InterRigidBody"] = False
1858  elif (IMP.core.RigidMember.get_is_setup(p1) and
1860  IMP.core.RigidMember(p1).get_rigid_body() !=
1861  IMP.core.RigidMember(p2).get_rigid_body()):
1862  new_xl["InterRigidBody"] = True
1863  else:
1864  new_xl["InterRigidBody"] = None
1865 
1866  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1867  self.cldb._update()
1868  return self.cldb
1869 
1870  def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1871  max_delta_distance=None):
1872  import IMP.pmi.tools
1873  import math
1874  from random import choice
1875  if distance is None:
1876  # get a random pair
1877  while True:
1878  protein1, residue1 = choice(self.protein_residue_dict.keys())
1879  protein2, residue2 = choice(self.protein_residue_dict.keys())
1880  index1 = self.protein_residue_dict[(protein1, residue1)]
1881  index2 = self.protein_residue_dict[(protein2, residue2)]
1882  particle_distance = IMP.core.get_distance(
1883  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1884  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1885  if (protein1, residue1) != (protein2, residue2):
1886  break
1887  else:
1888  # get a pair of residues whose distance is below the threshold
1889  if not xwalk_bin_path:
1891  gcpf.set_distance(distance+max_delta_distance)
1892 
1893  while True:
1894  # setup the reaction rates lists
1895  if not self.sites_weighted:
1896  self.sites_weighted = []
1897  for key in self.reactivity_dictionary:
1898  r = self.reactivity_dictionary[key]
1899  self.sites_weighted.append((key, r))
1900  # get a random reaction site
1901  first_site = self.weighted_choice(self.sites_weighted)
1902  # get all distances
1903  if not self.euclidean_interacting_pairs:
1904  self.euclidean_interacting_pairs = \
1905  gcpf.get_close_pairs(
1906  self.model,
1907  self.indexes_dict1.keys(),
1908  self.indexes_dict2.keys())
1909  # get the partner for the first reacted site
1910  first_site_pairs = \
1911  [pair for pair in self.euclidean_interacting_pairs
1912  if self.indexes_dict1[pair[0]] == first_site or
1913  self.indexes_dict2[pair[1]] == first_site]
1914  if len(first_site_pairs) == 0:
1915  continue
1916  # build the list of second reaction sites
1917  second_sites_weighted = []
1918  for pair in first_site_pairs:
1919  if self.indexes_dict1[pair[0]] == first_site:
1920  second_site = self.indexes_dict2[pair[1]]
1921  if self.indexes_dict2[pair[1]] == first_site:
1922  second_site = self.indexes_dict1[pair[0]]
1923  r = self.reactivity_dictionary[second_site]
1924  second_sites_weighted.append((second_site, r))
1925  second_site = self.weighted_choice(second_sites_weighted)
1926  protein1, residue1 = first_site
1927  protein2, residue2 = second_site
1928  print("CrossLinkDataBaseFromStructure."
1929  "get_random_residue_pair:",
1930  "First site", first_site,
1931  self.reactivity_dictionary[first_site],
1932  "Second site", second_site,
1933  self.reactivity_dictionary[second_site])
1934  particle_pair = IMP.get_particles(
1935  self.model,
1936  [self.protein_residue_dict[first_site],
1937  self.protein_residue_dict[second_site]])
1938  particle_distance = IMP.core.get_distance(
1939  IMP.core.XYZ(particle_pair[0]),
1940  IMP.core.XYZ(particle_pair[1]))
1941 
1942  if particle_distance < distance \
1943  and (protein1, residue1) != (protein2, residue2):
1944  break
1945  elif (particle_distance >= distance
1946  and (protein1, residue1) != (protein2, residue2)
1947  and max_delta_distance):
1948  # allow some flexibility
1949  if particle_distance-distance < max_delta_distance:
1950  break
1951 
1952  else:
1953  if not self.xwalk_interacting_pairs:
1954  self.xwalk_interacting_pairs = \
1955  self.get_xwalk_distances(xwalk_bin_path, distance)
1956  interacting_pairs_weighted = []
1957 
1958  for pair in self.xwalk_interacting_pairs:
1959  protein1 = pair[0]
1960  protein2 = pair[1]
1961  residue1 = pair[2]
1962  residue2 = pair[3]
1963  weight1 = math.exp(
1964  -self.reactivity_dictionary[(protein1, residue1)]
1965  / self.kt)
1966  weight2 = math.exp(
1967  -self.reactivity_dictionary[(protein2, residue2)]
1968  / self.kt)
1969  interacting_pairs_weighted.append((pair, weight1*weight2))
1970 
1971  pair = self.weighted_choice(interacting_pairs_weighted)
1972  protein1 = pair[0]
1973  protein2 = pair[1]
1974  residue1 = pair[2]
1975  residue2 = pair[3]
1976  particle_distance = float(pair[4])
1977 
1978  return ((protein1, protein2, residue1, residue2)), particle_distance
1979 
1980  def get_xwalk_distances(self, xwalk_bin_path, distance):
1981  import IMP.pmi.output
1982  import os
1983  o = IMP.pmi.output.Output(atomistic=True)
1984  o.init_pdb("xwalk.pdb", self.representation.prot)
1985  o.write_pdb("xwalk.pdb")
1986  namechainiddict = o.dictchain["xwalk.pdb"]
1987  chainiddict = {}
1988 
1989  for key in namechainiddict:
1990  chainiddict[namechainiddict[key]] = key
1991  xwalkout = os.popen('java -Xmx256m -cp ' + xwalk_bin_path
1992  + ' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
1993  '-a1 cb -a2 cb -max '
1994  + str(distance) + ' -bb').read()
1995 
1996  output_list_of_distance = []
1997  for line in xwalkout.split("\n")[0:-2]:
1998  tokens = line.split()
1999  first = tokens[2]
2000  second = tokens[3]
2001  distance = float(tokens[6])
2002  fs = first.split("-")
2003  ss = second.split("-")
2004  chainid1 = fs[2]
2005  chainid2 = ss[2]
2006  protein1 = chainiddict[chainid1]
2007  protein2 = chainiddict[chainid2]
2008  residue1 = int(fs[1])
2009  residue2 = int(ss[1])
2010  output_list_of_distance.append(
2011  (protein1, protein2, residue1, residue2, distance))
2012  return output_list_of_distance
2013 
2014  def weighted_choice(self, choices):
2015  import random
2016  # http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
2017  total = sum(w for c, w in choices)
2018  r = random.uniform(0, total)
2019  upto = 0
2020  for c, w in choices:
2021  if upto + w > r:
2022  return c
2023  upto += w
2024  assert False, "Shouldn't get here"
2025 
2026  def save_rmf_snapshot(self, filename, color_id=None):
2027  import IMP.rmf
2028  import RMF
2029  if color_id is None:
2030  color_id = "Reactivity"
2031  sorted_group_ids = sorted(self.cldb.data_base.keys())
2032  list_of_pairs = []
2033  color_scores = []
2034  for group in sorted_group_ids:
2035  group_dists_particles = []
2036  for xl in self.cldb.data_base[group]:
2037  xllabel = self.cldb.get_short_cross_link_string(xl)
2038  (c1, c2, r1, r2) = (xl["molecule_object1"],
2039  xl["molecule_object2"],
2040  xl[self.cldb.residue1_key],
2041  xl[self.cldb.residue2_key])
2042  try:
2043  index1 = self.protein_residue_dict[(c1, r1)]
2044  index2 = self.protein_residue_dict[(c2, r2)]
2045  p1, p2 = (IMP.get_particles(self.model, [index1])[0],
2046  IMP.get_particles(self.model, [index2])[0])
2047  mdist = xl["TargetDistance"]
2048  except TypeError:
2049  print("TypeError or missing chain/residue ",
2050  r1, c1, r2, c2)
2051  continue
2052  group_dists_particles.append((mdist, p1, p2, xllabel,
2053  float(xl[color_id])))
2054  if group_dists_particles:
2055  (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2056  = min(group_dists_particles, key=lambda t: t[0])
2057  color_scores.append(mincolor_score)
2058  list_of_pairs.append((minp1, minp2, minxllabel,
2059  mincolor_score))
2060  else:
2061  continue
2062 
2063  m = self.model
2064  linear = IMP.core.Linear(0, 0.0)
2065  linear.set_slope(1.0)
2066  dps2 = IMP.core.DistancePairScore(linear)
2067  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
2068  sgs = []
2069  offset = min(color_scores)
2070  maxvalue = max(color_scores)
2071  for pair in list_of_pairs:
2072  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
2073  pr.set_name(pair[2])
2074  factor = (pair[3]-offset)/(maxvalue-offset)
2075  c = IMP.display.get_rgb_color(factor)
2076  seg = IMP.algebra.Segment3D(
2077  IMP.core.XYZ(pair[0]).get_coordinates(),
2078  IMP.core.XYZ(pair[1]).get_coordinates())
2079  rslin.add_restraint(pr)
2080  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
2081 
2082  rh = RMF.create_rmf_file(filename)
2083  IMP.rmf.add_hierarchies(rh, [self.system.hier])
2084  IMP.rmf.add_restraints(rh, [rslin])
2085  IMP.rmf.add_geometries(rh, sgs)
2086  IMP.rmf.save_frame(rh)
2087  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1749
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1809
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
Get the particles from a list of indexes.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:731
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:37
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:200
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:19
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:24
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Definition: exception.h:49
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
class to link stat files to several rmf files
Definition: output.py:1248
class to allow more advanced handling of RMF files.
Definition: output.py:1124
Mapping between FASTA one-letter codes and residue types.
Definition: alphabets.py:1
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:29
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:66
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.