IMP logo
IMP Reference Guide  2.18.0
The Integrative Modeling Platform
io/crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 
8 from __future__ import print_function
9 import IMP
10 import IMP.pmi
11 import IMP.pmi.output
12 import IMP.pmi.alphabets
13 import IMP.atom
14 import IMP.core
15 import IMP.algebra
16 import IMP.rmf
17 import RMF
18 import IMP.display
19 import operator
20 import math
21 import sys
22 import ihm.location
23 import ihm.dataset
24 from collections import defaultdict
25 import numpy
26 
27 
28 # json default serializations
29 def set_json_default(obj):
30  if isinstance(obj, set):
31  return list(obj)
32  if isinstance(obj, IMP.pmi.topology.Molecule):
33  return str(obj)
34  raise TypeError
35 
36 
37 # Handle and return data that must be a string
38 if sys.version_info[0] >= 3:
39  def _handle_string_input(inp):
40  if not isinstance(inp, str):
41  raise TypeError("expecting a string")
42  return inp
43 else:
44  def _handle_string_input(inp):
45  if not isinstance(inp, (str, unicode)): # noqa: F821
46  raise TypeError("expecting a string or unicode")
47  # Coerce to non-unicode representation (str)
48  if isinstance(inp, unicode): # noqa: F821
49  return str(inp)
50  else:
51  return inp
52 
53 
54 class _CrossLinkDataBaseStandardKeys(object):
55  '''
56  This class setup all the standard keys needed to
57  identify the cross-link features from the data sets
58  '''
59  def __init__(self):
60  self.type = {}
61  self.protein1_key = "Protein1"
62  self.type[self.protein1_key] = str
63  self.protein2_key = "Protein2"
64  self.type[self.protein2_key] = str
65  self.residue1_key = "Residue1"
66  self.type[self.residue1_key] = int
67  self.residue2_key = "Residue2"
68  self.type[self.residue2_key] = int
69  self.residue1_amino_acid_key = "Residue1AminoAcid"
70  self.type[self.residue1_amino_acid_key] = str
71  self.residue2_amino_acid_key = "Residue2AminoAcid"
72  self.type[self.residue2_amino_acid_key] = str
73  self.residue1_moiety_key = "Residue1Moiety"
74  self.type[self.residue1_moiety_key] = str
75  self.residue2_moiety_key = "Residue2Moiety"
76  self.type[self.residue2_moiety_key] = str
77  self.site_pairs_key = "SitePairs"
78  self.type[self.site_pairs_key] = str
79  self.unique_id_key = "XLUniqueID"
80  self.type[self.unique_id_key] = str
81  self.unique_sub_index_key = "XLUniqueSubIndex"
82  self.type[self.unique_sub_index_key] = int
83  self.unique_sub_id_key = "XLUniqueSubID"
84  self.type[self.unique_sub_id_key] = str
85  self.data_set_name_key = "DataSetName"
86  self.type[self.data_set_name_key] = str
87  self.cross_linker_chemical_key = "CrossLinkerChemical"
88  self.type[self.cross_linker_chemical_key] = str
89  self.id_score_key = "IDScore"
90  self.type[self.id_score_key] = float
91  self.fdr_key = "FDR"
92  self.type[self.fdr_key] = float
93  self.quantitation_key = "Quantitation"
94  self.type[self.quantitation_key] = float
95  self.redundancy_key = "Redundancy"
96  self.type[self.redundancy_key] = int
97  self.redundancy_list_key = "RedundancyList"
98  self.type[self.redundancy_key] = list
99  self.ambiguity_key = "Ambiguity"
100  self.type[self.ambiguity_key] = int
101  self.residue1_links_number_key = "Residue1LinksNumber"
102  self.type[self.residue1_links_number_key] = int
103  self.residue2_links_number_key = "Residue2LinksNumber"
104  self.type[self.residue2_links_number_key] = int
105  self.type[self.ambiguity_key] = int
106  self.state_key = "State"
107  self.type[self.state_key] = int
108  self.sigma1_key = "Sigma1"
109  self.type[self.sigma1_key] = str
110  self.sigma2_key = "Sigma2"
111  self.type[self.sigma2_key] = str
112  self.psi_key = "Psi"
113  self.type[self.psi_key] = str
114  self.distance_key = "Distance"
115  self.type[self.distance_key] = float
116  self.min_ambiguous_distance_key = "MinAmbiguousDistance"
117  self.type[self.distance_key] = float
118  # link types are either Monolink, Intralink or Interlink
119  self.link_type_key = "LinkType"
120  self.type[self.link_type_key] = str
121 
122  self.ordered_key_list = [
123  self.data_set_name_key, self.unique_id_key,
124  self.unique_sub_index_key, self.unique_sub_id_key,
125  self.protein1_key, self.protein2_key, self.residue1_key,
126  self.residue2_key, self.residue1_amino_acid_key,
127  self.residue2_amino_acid_key, self.residue1_moiety_key,
128  self.residue2_moiety_key, self.cross_linker_chemical_key,
129  self.id_score_key, self.fdr_key, self.quantitation_key,
130  self.redundancy_key, self.redundancy_list_key, self.state_key,
131  self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
132  self.min_ambiguous_distance_key, self.link_type_key]
133 
134 
135 class _ProteinsResiduesArray(tuple):
136  '''
137  This class is inherits from tuple, and it is a shorthand for a cross-link
138  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1
139  and protein2, r1 and r2 are residue1 and residue2.
140  '''
141 
142  def __new__(self, input_data):
143  '''
144  @input_data can be a dict or a tuple
145  '''
146  self.cldbsk = _CrossLinkDataBaseStandardKeys()
147  if type(input_data) is dict:
148  monolink = False
149  p1 = input_data[self.cldbsk.protein1_key]
150  try:
151  p2 = input_data[self.cldbsk.protein2_key]
152  except KeyError:
153  monolink = True
154  r1 = input_data[self.cldbsk.residue1_key]
155  try:
156  r2 = input_data[self.cldbsk.residue2_key]
157  except KeyError:
158  monolink = True
159  if not monolink:
160  t = (p1, p2, r1, r2)
161  else:
162  t = (p1, "", r1, None)
163  elif type(input_data) is tuple:
164  if len(input_data) > 4 or len(input_data) == 3 \
165  or len(input_data) == 1:
166  raise TypeError(
167  "_ProteinsResiduesArray: must have only 4 elements")
168  if len(input_data) == 4:
169  p1 = _handle_string_input(input_data[0])
170  p2 = _handle_string_input(input_data[1])
171  r1 = input_data[2]
172  r2 = input_data[3]
173  if (type(r1) is not int) and (r1 is not None):
174  raise TypeError(
175  "_ProteinsResiduesArray: residue1 must be a integer")
176  if (type(r2) is not int) and (r1 is not None):
177  raise TypeError(
178  "_ProteinsResiduesArray: residue2 must be a integer")
179  t = (p1, p2, r1, r2)
180  if len(input_data) == 2:
181  p1 = _handle_string_input(input_data[0])
182  r1 = input_data[1]
183  if type(r1) is not int:
184  raise TypeError(
185  "_ProteinsResiduesArray: residue1 must be a integer")
186  t = (p1, "", r1, None)
187  else:
188  raise TypeError(
189  "_ProteinsResiduesArray: input must be a dict or tuple")
190  return tuple.__new__(_ProteinsResiduesArray, t)
191 
192  def get_inverted(self):
193  '''
194  Returns a _ProteinsResiduesArray instance with protein1 and
195  protein2 inverted
196  '''
197  return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
198 
199  def __repr__(self):
200  outstr = self.cldbsk.protein1_key + " " + str(self[0])
201  outstr += " " + self.cldbsk.protein2_key + " " + str(self[1])
202  outstr += " " + self.cldbsk.residue1_key + " " + str(self[2])
203  outstr += " " + self.cldbsk.residue2_key + " " + str(self[3])
204  return outstr
205 
206  def __str__(self):
207  outstr = str(self[0]) + "." + str(self[2]) + "-" + str(self[1]) \
208  + "." + str(self[3])
209  return outstr
210 
211 
212 class FilterOperator(object):
213  '''
214  This class allows to create filter functions that can be passed to
215  the CrossLinkDataBase in this way:
216 
217  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
218 
219  where cldb is CrossLinkDataBase instance and it is only needed to get
220  the standard keywords
221 
222  A filter operator can be evaluate on a CrossLinkDataBase item xl and
223  returns a boolean
224 
225  fo.evaluate(xl)
226 
227  and it is used to filter the database
228  '''
229 
230  def __init__(self, argument1, operator, argument2):
231  '''
232  (argument1,operator,argument2) can be either a
233  (keyword,operator.eq|lt|gt...,value)
234  or (FilterOperator1,operator.or|and...,FilterOperator2)
235  '''
236  if isinstance(argument1, FilterOperator):
237  self.operations = [argument1, operator, argument2]
238  else:
239  self.operations = []
240  self.values = (argument1, operator, argument2)
241 
242  def __or__(self, FilterOperator2):
243  return FilterOperator(self, operator.or_, FilterOperator2)
244 
245  def __and__(self, FilterOperator2):
246  return FilterOperator(self, operator.and_, FilterOperator2)
247 
248  def __invert__(self):
249  return FilterOperator(self, operator.not_, None)
250 
251  def evaluate(self, xl_item):
252 
253  if len(self.operations) == 0:
254  keyword, operator, value = self.values
255  return operator(xl_item[keyword], value)
256  FilterOperator1, op, FilterOperator2 = self.operations
257 
258  if FilterOperator2 is None:
259  return op(FilterOperator1.evaluate(xl_item))
260  else:
261  return op(FilterOperator1.evaluate(xl_item),
262  FilterOperator2.evaluate(xl_item))
263 
264 
265 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
266  '''
267  This class is needed to convert the keywords from a generic database
268  to the standard ones
269  '''
270 
271  def __init__(self, list_parser=None):
272  '''
273  @param list_parser an instance of ResiduePairListParser, if any
274  is needed
275  '''
276  self.converter = {}
277  self.backward_converter = {}
278  _CrossLinkDataBaseStandardKeys.__init__(self)
279  self.rplp = list_parser
280  if self.rplp is None:
281  # either you have protein1, protein2, residue1, residue2
282  self.compulsory_keys = set([self.protein1_key,
283  self.protein2_key,
284  self.residue1_key,
285  self.residue2_key])
286  else:
287  self.compulsory_keys = self.rplp.get_compulsory_keys()
288  self.setup_keys = set()
289 
290  def check_keys(self):
291  '''
292  Is a function that check whether necessary keys are setup
293  '''
294  setup_keys = set(self.get_setup_keys())
295  if self.compulsory_keys & setup_keys != self.compulsory_keys:
296  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup "
297  "all necessary keys")
298 
299  def get_setup_keys(self):
300  '''
301  Returns the keys that have been setup so far
302  '''
303  return self.backward_converter.keys()
304 
305  def set_standard_keys(self):
306  """
307  This sets up the standard compulsory keys as defined by
308  _CrossLinkDataBaseStandardKeys
309  """
310  for ck in self.compulsory_keys:
311  self.converter[ck] = ck
312  self.backward_converter[ck] = ck
313 
314  def set_unique_id_key(self, origin_key):
315  self.converter[origin_key] = self.unique_id_key
316  self.backward_converter[self.unique_id_key] = origin_key
317 
318  def set_protein1_key(self, origin_key):
319  self.converter[origin_key] = self.protein1_key
320  self.backward_converter[self.protein1_key] = origin_key
321 
322  def set_protein2_key(self, origin_key):
323  self.converter[origin_key] = self.protein2_key
324  self.backward_converter[self.protein2_key] = origin_key
325 
326  def set_residue1_key(self, origin_key):
327  self.converter[origin_key] = self.residue1_key
328  self.backward_converter[self.residue1_key] = origin_key
329 
330  def set_residue2_key(self, origin_key):
331  self.converter[origin_key] = self.residue2_key
332  self.backward_converter[self.residue2_key] = origin_key
333 
334  def set_residue1_amino_acid_key(self, origin_key):
335  self.converter[origin_key] = self.residue1_amino_acid_key
336  self.backward_converter[self.residue1_amino_acid_key] = origin_key
337 
338  def set_residue2_amino_acid_key(self, origin_key):
339  self.converter[origin_key] = self.residue2_amino_acid_key
340  self.backward_converter[self.residue2_amino_acid_key] = origin_key
341 
342  def set_residue1_moiety_key(self, origin_key):
343  self.converter[origin_key] = self.residue1_moiety_key
344  self.backward_converter[self.residue1_moiety_key] = origin_key
345 
346  def set_residue2_moiety_key(self, origin_key):
347  self.converter[origin_key] = self.residue2_moiety_key
348  self.backward_converter[self.residue2_moiety_key] = origin_key
349 
350  def set_site_pairs_key(self, origin_key):
351  self.converter[origin_key] = self.site_pairs_key
352  self.backward_converter[self.site_pairs_key] = origin_key
353 
354  def set_id_score_key(self, origin_key):
355  self.converter[origin_key] = self.id_score_key
356  self.backward_converter[self.id_score_key] = origin_key
357 
358  def set_fdr_key(self, origin_key):
359  self.converter[origin_key] = self.fdr_key
360  self.backward_converter[self.fdr_key] = origin_key
361 
362  def set_quantitation_key(self, origin_key):
363  self.converter[origin_key] = self.quantitation_key
364  self.backward_converter[self.quantitation_key] = origin_key
365 
366  def set_psi_key(self, origin_key):
367  self.converter[origin_key] = self.psi_key
368  self.backward_converter[self.psi_key] = origin_key
369 
370  def set_link_type_key(self, link_type_key):
371  self.converter[link_type_key] = self.link_type_key
372  self.backward_converter[self.link_type_key] = link_type_key
373 
374  def get_converter(self):
375  '''
376  Returns the dictionary that convert the old keywords to the new ones
377  '''
378  self.check_keys()
379  return self.converter
380 
382  '''
383  Returns the dictionary that convert the new keywords to the old ones
384  '''
385  self.check_keys()
386  return self.backward_converter
387 
388 
389 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
390  '''
391  A class to handle different styles of site pairs parsers.
392  Implemented styles:
393  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
394  [Y3-;K4-] for dead-ends
395  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
396  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
397  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
398  ''' # noqa: E501
399 
400  import re
401 
402  def __init__(self, style):
403  _CrossLinkDataBaseStandardKeys.__init__(self)
404  if style == "MSSTUDIO":
405  self.style = style
406  self.compulsory_keys = set([self.protein1_key,
407  self.protein2_key,
408  self.site_pairs_key])
409  elif style == "XTRACT" or style == "QUANTITATION":
410  self.style = style
411  self.compulsory_keys = set([self.site_pairs_key])
412  elif style == "LAN_HUANG":
413  self.style = style
414  self.compulsory_keys = set([self.site_pairs_key])
415  else:
416  raise Exception("ResiduePairListParser: unknown list parser style")
417 
418  def get_compulsory_keys(self):
419  return self.compulsory_keys
420 
421  def get_list(self, input_string):
422  '''
423  This function returns a list of cross-linked residues and the
424  corresponding list of cross-linked chains. The latter list can be
425  empty, if the style doesn't have the corresponding information.
426  '''
427  if self.style == "MSSTUDIO":
428  input_string = input_string.replace("[", "")
429  input_string = input_string.replace("]", "")
430  input_string_pairs = input_string.split(";")
431  residue_pair_indexes = []
432  chain_pair_indexes = []
433  for s in input_string_pairs:
434  m1 = self.re.search(
435  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
436  r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
437  s)
438  m2 = self.re.search(
439  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
440  if m1:
441  # cross-link
442  (residue_type_1, residue_index_1, residue_type_2,
443  residue_index_2) = m1.group(1, 2, 3, 4)
444  residue_pair_indexes.append((residue_index_1,
445  residue_index_2))
446  elif m2:
447  # dead end
448  residue_type_1, residue_index_1 = m2.group(1, 2)
449  # at this stage chain_pair_indexes is empty
450  return residue_pair_indexes, chain_pair_indexes
451  if self.style == "XTRACT" or self.style == "QUANTITATION":
452  if ":x:" in input_string:
453  # if it is a cross-link....
454  input_strings = input_string.split(":x:")
455  first_peptides = input_strings[0].split(":|:")
456  second_peptides = input_strings[1].split(":|:")
457  first_peptides_identifiers = [
458  (f.split(":")[0],
459  f.split(":")[1]) for f in first_peptides]
460  second_peptides_identifiers = [
461  (f.split(":")[0],
462  f.split(":")[1]) for f in second_peptides]
463  residue_pair_indexes = []
464  chain_pair_indexes = []
465  for fpi in first_peptides_identifiers:
466  for spi in second_peptides_identifiers:
467  chain1 = fpi[0]
468  chain2 = spi[0]
469  residue1 = fpi[1]
470  residue2 = spi[1]
471  residue_pair_indexes.append((residue1, residue2))
472  chain_pair_indexes.append((chain1, chain2))
473  return residue_pair_indexes, chain_pair_indexes
474  else:
475  # if it is a monolink....
476  first_peptides = input_string.split(":|:")
477  first_peptides_identifiers = [
478  (f.split(":")[0], f.split(":")[1]) for f in first_peptides]
479  residue_indexes = []
480  chain_indexes = []
481  for fpi in first_peptides_identifiers:
482  chain1 = fpi[0]
483  residue1 = fpi[1]
484  residue_indexes.append((residue1,))
485  chain_indexes.append((chain1,))
486  return residue_indexes, chain_indexes
487  if self.style == "LAN_HUANG":
488  input_strings = input_string.split("-")
489  chain1, first_series = input_strings[0].split(":")
490  chain2, second_series = input_strings[1].split(":")
491 
492  first_residues = first_series.replace(";", "|").split("|")
493  second_residues = second_series.replace(";", "|").split("|")
494  residue_pair_indexes = []
495  chain_pair_indexes = []
496  for fpi in first_residues:
497  for spi in second_residues:
498  residue1 = self.re.sub("[^0-9]", "", fpi)
499  residue2 = self.re.sub("[^0-9]", "", spi)
500  residue_pair_indexes.append((residue1, residue2))
501  chain_pair_indexes.append((chain1, chain2))
502  return residue_pair_indexes, chain_pair_indexes
503 
504 
505 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
506  '''
507  A class to handle different XL format with fixed format
508  currently support ProXL
509  '''
510  def __init__(self, format):
511  _CrossLinkDataBaseStandardKeys.__init__(self)
512  if format == "PROXL":
513  self.format = format
514  else:
515  raise Exception("FixedFormatParser: unknown list format name")
516 
517  def get_data(self, input_string):
518  if self.format == "PROXL":
519  tokens = input_string.split("\t")
520  xl = {}
521  if tokens[0] == "SEARCH ID(S)":
522  return None
523  else:
524  xl[self.protein1_key] = tokens[2]
525  xl[self.protein2_key] = tokens[4]
526  xl[self.residue1_key] = int(tokens[3])
527  xl[self.residue2_key] = int(tokens[5])
528  return xl
529 
530 
531 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
532  '''
533  this class handles a cross-link dataset and do filtering
534  operations, adding cross-links, merge datasets...
535  '''
536 
537  def __init__(self, converter=None, data_base=None, fasta_seq=None,
538  linkable_aa=('K',)):
539  '''
540  Constructor.
541  @param converter an instance of CrossLinkDataBaseKeywordsConverter
542  @param data_base an instance of CrossLinkDataBase to build the
543  new database on
544  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
545  protein fasta sequences to check cross-link consistency.
546  If not given consistency will not be checked
547  @param linkable_aa a tuple containing one-letter amino acids which
548  are linkable by the cross-linker; only used if the database
549  DOES NOT provide a value for a certain residueX_amino_acid_key
550  and if a fasta_seq is given
551  '''
552 
553  if data_base is None:
554  self.data_base = {}
555  else:
556  self.data_base = data_base
557 
558  _CrossLinkDataBaseStandardKeys.__init__(self)
559  if converter is not None:
560  self.cldbkc = converter # type: CrossLinkDataBaseKeywordsConverter
561  self.list_parser = self.cldbkc.rplp
562  self.converter = converter.get_converter()
563 
564  else:
565  self.cldbkc = None # type: CrossLinkDataBaseKeywordsConverter
566  self.list_parser = None
567  self.converter = None
568 
569  # default amino acids considered to be 'linkable' if none are given
570  self.def_aa_tuple = linkable_aa
571  self.fasta_seq = fasta_seq # type: IMP.pmi.topology.Sequences
572  self.dataset = None
573  self.name = None
574  self._update()
575 
576  def _update(self):
577  '''
578  Update the whole dataset after changes
579  '''
580  self.update_cross_link_unique_sub_index()
581  self.update_cross_link_redundancy()
582  self.update_residues_links_number()
584 
585  def __iter__(self):
586  sorted_ids = sorted(self.data_base.keys())
587  for k in sorted_ids:
588  for xl in self.data_base[k]:
589  yield xl
590 
591  def xlid_iterator(self):
592  sorted_ids = sorted(self.data_base.keys())
593  for xlid in sorted_ids:
594  yield xlid
595 
596  def __getitem__(self, xlid):
597  return self.data_base[xlid]
598 
599  def __len__(self):
600  return len([xl for xl in self])
601 
602  def get_name(self):
603  return self.name
604 
605  def set_name(self, name):
606  new_data_base = {}
607  for k in self.data_base:
608  new_data_base[k + "." + name] = self.data_base[k]
609  self.data_base = new_data_base
610  self.name = name
611  self._update()
612 
613  def get_number_of_xlid(self):
614  return len(self.data_base)
615 
616  def create_set_from_file(self, file_name, converter=None,
617  FixedFormatParser=None):
618  '''
619  if FixedFormatParser is not specified, the file is
620  comma-separated-values
621 
622  @param file_name a txt file to be parsed
623  @param converter an instance of CrossLinkDataBaseKeywordsConverter
624  @param FixedFormatParser a parser for a fixed format
625  '''
626  if not FixedFormatParser:
627  xl_list = IMP.pmi.tools.get_db_from_csv(file_name)
628 
629  if converter is not None:
630  self.cldbkc = converter
631  self.list_parser = self.cldbkc.rplp
632  self.converter = converter.get_converter()
633 
634  if not self.list_parser:
635  # normal procedure without a list_parser
636  # each line is a cross-link
637  new_xl_dict = {}
638  for nxl, xl in enumerate(xl_list):
639  new_xl = {}
640  for k in xl:
641  if k in self.converter:
642  new_xl[self.converter[k]] = \
643  self.type[self.converter[k]](xl[k])
644  else:
645  new_xl[k] = xl[k]
646  if self.unique_id_key in self.cldbkc.get_setup_keys():
647  if new_xl[self.unique_id_key] not in new_xl_dict:
648  new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
649  else:
650  new_xl_dict[new_xl[self.unique_id_key]].append(
651  new_xl)
652  else:
653  if str(nxl) not in new_xl_dict:
654  new_xl_dict[str(nxl)] = [new_xl]
655  else:
656  new_xl_dict[str(nxl)].append(new_xl)
657 
658  else:
659  # with a list_parser, a line can be a list of ambiguous
660  # cross-links
661  new_xl_dict = {}
662  for nxl, entry in enumerate(xl_list):
663 
664  # first get the translated keywords
665  new_dict = {}
666  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
667  raise Exception(
668  "CrossLinkDataBase: expecting a site_pairs_key "
669  "for the site pair list parser")
670  for k in entry:
671  if k in self.converter:
672  new_dict[self.converter[k]] = \
673  self.type[self.converter[k]](entry[k])
674  else:
675  new_dict[k] = entry[k]
676 
677  (residue_pair_list,
678  chain_pair_list) = self.list_parser.get_list(
679  new_dict[self.site_pairs_key])
680 
681  # then create the cross-links
682  for n, p in enumerate(residue_pair_list):
683  is_monolink = False
684  if len(p) == 1:
685  is_monolink = True
686 
687  new_xl = {}
688  for k in new_dict:
689  new_xl[k] = new_dict[k]
690  new_xl[self.residue1_key] = \
691  self.type[self.residue1_key](p[0])
692  if not is_monolink:
693  new_xl[self.residue2_key] = \
694  self.type[self.residue2_key](p[1])
695 
696  if len(chain_pair_list) == len(residue_pair_list):
697  new_xl[self.protein1_key] = \
698  self.type[self.protein1_key](
699  chain_pair_list[n][0])
700  if not is_monolink:
701  new_xl[self.protein2_key] = \
702  self.type[self.protein2_key](
703  chain_pair_list[n][1])
704 
705  if not is_monolink:
706  new_xl[self.link_type_key] = "CROSSLINK"
707  else:
708  new_xl[self.link_type_key] = "MONOLINK"
709 
710  if self.unique_id_key in self.cldbkc.get_setup_keys():
711  if new_xl[self.unique_id_key] not in new_xl_dict:
712  new_xl_dict[new_xl[self.unique_id_key]] = \
713  [new_xl]
714  else:
715  new_xl_dict[new_xl[self.unique_id_key]].append(
716  new_xl)
717  else:
718  if str(nxl) not in new_xl_dict:
719  new_xl[self.unique_id_key] = str(nxl+1)
720  new_xl_dict[str(nxl)] = [new_xl]
721  else:
722  new_xl[self.unique_id_key] = str(nxl+1)
723  new_xl_dict[str(nxl)].append(new_xl)
724 
725  else:
726  '''
727  if FixedFormatParser is defined
728  '''
729 
730  new_xl_dict = {}
731  nxl = 0
732  with open(file_name, "r") as f:
733  for line in f:
734  xl = FixedFormatParser.get_data(line)
735  if xl:
736  xl[self.unique_id_key] = str(nxl+1)
737  new_xl_dict[str(nxl)] = [xl]
738  nxl += 1
739 
740  self.data_base = new_xl_dict
741  self.name = file_name
742  loc = ihm.location.InputFileLocation(file_name, details='Crosslinks')
743  self.dataset = ihm.dataset.CXMSDataset(loc)
744  self._update()
745 
746  def update_cross_link_unique_sub_index(self):
747  for k in self.data_base:
748  for n, xl in enumerate(self.data_base[k]):
749  xl[self.ambiguity_key] = len(self.data_base[k])
750  xl[self.unique_sub_index_key] = n+1
751  xl[self.unique_sub_id_key] = k + "." + str(n+1)
752 
753  def update_cross_link_redundancy(self):
754  redundancy_data_base = {}
755  for xl in self:
756  pra = _ProteinsResiduesArray(xl)
757  if pra not in redundancy_data_base:
758  redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
759  redundancy_data_base[pra.get_inverted()] = \
760  [xl[self.unique_sub_id_key]]
761  else:
762  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
763  redundancy_data_base[pra.get_inverted()].append(
764  xl[self.unique_sub_id_key])
765  for xl in self:
766  pra = _ProteinsResiduesArray(xl)
767  xl[self.redundancy_key] = len(redundancy_data_base[pra])
768  xl[self.redundancy_list_key] = redundancy_data_base[pra]
769 
770  def update_residues_links_number(self):
771  residue_links = {}
772  for xl in self:
773  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
774  if (p1, r1) not in residue_links:
775  residue_links[(p1, r1)] = set([(p2, r2)])
776  else:
777  residue_links[(p1, r1)].add((p2, r2))
778  if (p2, r2) not in residue_links:
779  residue_links[(p2, r2)] = set([(p1, r1)])
780  else:
781  residue_links[(p2, r2)].add((p1, r1))
782 
783  for xl in self:
784  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
785  xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
786  xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
787 
789  """This function checks the consistency of the dataset with the
790  amino acid sequence"""
791  if self.cldbkc and self.fasta_seq:
792  cnt_matched, cnt_matched_file = 0, 0
793  matched = {}
794  non_matched = {}
795  for xl in self:
796  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
797  b_matched_file = False
798  if self.residue1_amino_acid_key in xl:
799  # either you know the residue type and aa_tuple is
800  # a single entry
801  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
802  b_matched = self._match_xlinks(p1, r1, aa_from_file)
803  b_matched_file = b_matched
804  else:
805  # or pass the possible list of types that can be
806  # cross-linked
807  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
808 
809  matched, non_matched = self._update_matched_xlinks(
810  b_matched, p1, r1, matched, non_matched)
811 
812  if self.residue2_amino_acid_key in xl:
813  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
814  b_matched = self._match_xlinks(p2, r2, aa_from_file)
815  b_matched_file = b_matched
816  else:
817  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
818 
819  matched, non_matched = self._update_matched_xlinks(
820  b_matched, p2, r2, matched, non_matched)
821  if b_matched:
822  cnt_matched += 1
823  if b_matched_file:
824  cnt_matched_file += 1
825  if len(self) > 0:
826  percentage_matched = round(100*cnt_matched/len(self), 1)
827  percentage_matched_file = round(
828  100 * cnt_matched_file / len(self), 1)
829  if matched or non_matched:
830  print(
831  "check_cross_link_consistency: Out of %d cross-links "
832  "%d were matched to the fasta sequence (%f %%).\n "
833  "%d were matched by using the cross-link file (%f %%)."
834  % (len(self), cnt_matched, percentage_matched,
835  cnt_matched_file, percentage_matched_file))
836  if non_matched:
837  print("check_cross_link_consistency: Warning: Non "
838  "matched xlinks:",
839  [(prot_name, sorted(list(non_matched[prot_name])))
840  for prot_name in non_matched])
841  return matched, non_matched
842 
843  def _match_xlinks(self, prot_name, res_index, aa_tuple):
844  # returns Boolean whether given aa matches a position in the fasta file
845  # cross-link files usually start counting at 1 and not 0; therefore
846  # subtract -1 to compare with fasta
847  amino_dict = IMP.pmi.alphabets.amino_acid
848  res_index -= 1
849  for amino_acid in aa_tuple:
850  if len(amino_acid) == 3:
851  amino_acid = amino_dict.get_one_letter_code_from_residue_type(
852  amino_acid.upper())
853  if prot_name in self.fasta_seq.sequences:
854  seq = self.fasta_seq.sequences[prot_name]
855  # if we are looking at the first amino acid in a given
856  # sequence always return a match
857  # the first aa should always be the n-terminal aa
858  # which may form a cross-link in any case (for BS3-like
859  # cross-linkers)
860  # for some data sets the first aa is at position 1;
861  # todo: check why this is the case
862  if res_index == 0 or res_index == 1:
863  return True
864  if res_index < len(seq):
865  if amino_acid == seq[res_index]:
866  return True
867  return False
868 
869  def _update_matched_xlinks(self, b_matched, prot, res, matched,
870  non_matched):
871  if b_matched:
872  if prot in matched:
873  matched[prot].add(res)
874  else:
875  matched[prot] = set([res])
876  else:
877  if prot in non_matched:
878  non_matched[prot].add(res)
879  else:
880  non_matched[prot] = set([res])
881  return matched, non_matched
882 
883  def get_cross_link_string(self, xl):
884  string = '|'
885  for k in self.ordered_key_list:
886  try:
887  string += str(k) + ":" + str(xl[k]) + "|"
888  except KeyError:
889  continue
890 
891  for k in xl:
892  if k not in self.ordered_key_list:
893  string += str(k) + ":" + str(xl[k]) + "|"
894 
895  return string
896 
897  def get_short_cross_link_string(self, xl):
898  string = '|'
899  list_of_keys = [self.data_set_name_key,
900  self.unique_sub_id_key,
901  self.protein1_key,
902  self.residue1_key,
903  self.protein2_key,
904  self.residue2_key,
905  self.state_key,
906  self.psi_key]
907 
908  for k in list_of_keys:
909  try:
910  string += str(xl[k]) + "|"
911  except KeyError:
912  continue
913 
914  return string
915 
916  def filter(self, FilterOperator):
917  new_xl_dict = {}
918  for id in self.data_base.keys():
919  for xl in self.data_base[id]:
920  if FilterOperator.evaluate(xl):
921  if id not in new_xl_dict:
922  new_xl_dict[id] = [xl]
923  else:
924  new_xl_dict[id].append(xl)
925  self._update()
926  cdb = CrossLinkDataBase(self.cldbkc, new_xl_dict)
927  cdb.dataset = self.dataset
928  return cdb
929 
930  def filter_score(self, score):
931  '''Get all cross-links with score greater than an input value'''
932  FilterOperator = IMP.pmi.io.crosslink.FilterOperator(
933  self.id_score_key, operator.gt, score)
934  return self.filter(FilterOperator)
935 
936  def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
937  '''
938  This function merges two cross-link datasets so that if two
939  conflicting cross-links have the same cross-link UniqueIDS,
940  the cross-links will be appended under the same UniqueID slots
941  with different SubIDs
942  '''
943  raise NotImplementedError()
944 
945  def append_database(self, db):
946  """Append cross-link dataset to this one."""
947  new_data_base = {}
948  for k in self.data_base:
949  new_data_base[k] = self.data_base[k]
950  for k in db.data_base:
951  new_data_base[k] = db.data_base[k]
952  self.data_base = new_data_base
953  self._update()
954 
955  def __iadd__(self, db):
956  self.append_database(db)
957  return self
958 
959  def set_value(self, key, new_value, filter_operator=None):
960  '''
961  This function changes the value for a given key in the database
962  For instance one can change the name of a protein
963  @param key: the key in the database that must be changed
964  @param new_value: the new value of the key
965  @param filter_operator: optional FilterOperator to change the value to
966  a subset of the database
967 
968  example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
969  FO(cldb.protein1_key, operator.eq, "AAA"))`
970  '''
971 
972  for xl in self:
973  if filter_operator is not None:
974  if filter_operator.evaluate(xl):
975  xl[key] = new_value
976  else:
977  xl[key] = new_value
978  self._update()
979 
980  def get_values(self, key):
981  '''
982  this function returns the list of values for a given key in the
983  database alphanumerically sorted
984  '''
985  values = set()
986  for xl in self:
987  values.add(xl[key])
988  return sorted(list(values))
989 
990  def offset_residue_index(self, protein_name, offset):
991  '''
992  This function offset the residue indexes of a given protein by
993  a specified value
994  @param protein_name: the protein name that need to be changed
995  @param offset: the offset value
996  '''
997 
998  for xl in self:
999  if xl[self.protein1_key] == protein_name:
1000  xl[self.residue1_key] = xl[self.residue1_key] + offset
1001  if xl[self.protein2_key] == protein_name:
1002  xl[self.residue2_key] = xl[self.residue2_key] + offset
1003  self._update()
1004 
1005  def create_new_keyword(self, keyword, values_from_keyword=None):
1006  '''
1007  This function creates a new keyword for the whole database and
1008  set the values from and existing keyword (optional), otherwise the
1009  values are set to None
1010  @param keyword the new keyword name:
1011  @param values_from_keyword the keyword from which we are copying
1012  the values:
1013  '''
1014  for xl in self:
1015  if values_from_keyword is not None:
1016  xl[keyword] = xl[values_from_keyword]
1017  else:
1018  xl[keyword] = None
1019  self._update()
1020 
1021  def rename_proteins(self, old_to_new_names_dictionary,
1022  protein_to_rename="both"):
1023  '''
1024  This function renames all proteins contained in the input dictionary
1025  from the old names (keys) to the new name (values)
1026  @param old_to_new_names_dictionary dictionary for converting old
1027  to new names
1028  @param protein_to_rename specify whether to rename both or protein1
1029  or protein2 only
1030  '''
1031 
1032  for old_name in old_to_new_names_dictionary:
1033  new_name = old_to_new_names_dictionary[old_name]
1034  if protein_to_rename == "both" or protein_to_rename == "protein1":
1035  fo2 = FilterOperator(self.protein1_key, operator.eq, old_name)
1036  self.set_value(self.protein1_key, new_name, fo2)
1037  if protein_to_rename == "both" or protein_to_rename == "protein2":
1038  fo2 = FilterOperator(self.protein2_key, operator.eq, old_name)
1039  self.set_value(self.protein2_key, new_name, fo2)
1040 
1041  def align_sequence(self, alignfile):
1042  """
1043  This function parses an alignment file obtained previously, to map
1044  crosslink residues onto the sequence of a homolog proteins. It is
1045  useful if you want to map the crosslinks onto a known, homolog
1046  structure without building a comparative model.
1047  The txt file of the alignment should be structured with repeating
1048  blocks, as follows:
1049 
1050  >NAME_CROSSLINKED_PROTEIN
1051  >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1052  one-letter code sequence for cross-linked protein (one line)
1053  one-letter code sequence for homolog protein with known structure (one line)
1054  """ # noqa: E501
1055 
1056  f = open(alignfile, 'r')
1057 
1058  def _assign_residues(seq1, seq2):
1059  d = {}
1060  resinds1 = []
1061  nres = 0
1062  contiguousgap = 0
1063  for r1 in seq1:
1064  if r1 != "-":
1065  nres += 1
1066  resinds1.append(nres)
1067  else:
1068  resinds1.append(None)
1069 
1070  resinds2 = [None for i in resinds1]
1071  nres = 0
1072  lastresind = 1
1073  for pos, r2 in enumerate(seq2):
1074  if r2 != "-":
1075  nres += 1
1076  resinds2[pos] = nres
1077  lastresind = nres
1078  if contiguousgap > 1:
1079  for i in range(pos-int(contiguousgap/2), pos):
1080  resinds2[i] = nres
1081  contiguousgap = 0
1082  else:
1083  contiguousgap += 1
1084  resinds2[pos] = lastresind
1085 
1086  for n, r in enumerate(resinds1):
1087  if r is not None:
1088  d[r] = resinds2[n]
1089  return d
1090 
1091  aa = True
1092  sequences1 = {}
1093  sequences2 = {}
1094  with open(alignfile, 'r') as f:
1095  while aa:
1096  aa = f.readline()
1097  bb = f.readline()
1098  cc = f.readline()
1099  dd = f.readline()
1100  protname = aa.replace(">", "").replace("\n", "")
1101  seq1 = cc.replace(",", "").replace("\n", "")
1102  seq2 = dd.replace(",", "").replace("\n", "")
1103 
1104  n1 = 0
1105  n2 = 0
1106  tmpl1 = []
1107  tmpl2 = []
1108  for i in range(len(seq1)):
1109  if seq1[i] != "-":
1110  n1 += 1
1111  c1 = n1
1112  else:
1113  c1 = None
1114  if seq2[i] != "-":
1115  n2 += 1
1116  c2 = n2
1117  else:
1118  c2 = None
1119 
1120  tmpl1.append([i, c1, seq1[i]])
1121  tmpl2.append([i, c2, seq2[i]])
1122  sequences1[protname] = tmpl1
1123  sequences2[protname] = tmpl2
1124 
1125  d = _assign_residues(seq1, seq2)
1126 
1127  fo11 = FilterOperator(self.protein1_key, operator.eq, protname)
1128  fo12 = FilterOperator(self.protein2_key, operator.eq, protname)
1129 
1130  for xl in self:
1131  if fo11.evaluate(xl):
1132  t = [item for item in sequences1[protname]
1133  if item[1] == xl[self.residue1_key]][0]
1134  sequences1[protname][t[0]] = [
1135  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1136  t = [item for item in sequences2[protname]
1137  if item[1] == d[xl[self.residue1_key]]][0]
1138  sequences2[protname][t[0]] = [
1139  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1140 
1141  xl[self.residue1_key] = d[xl[self.residue1_key]]
1142 
1143  if fo12.evaluate(xl):
1144 
1145  t = [item for item in sequences1[protname]
1146  if item[1] == xl[self.residue2_key]][0]
1147  sequences1[protname][t[0]] = [
1148  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1149  t = [item for item in sequences2[protname]
1150  if item[1] == d[xl[self.residue2_key]]][0]
1151  sequences2[protname][t[0]] = [
1152  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1153 
1154  xl[self.residue2_key] = d[xl[self.residue2_key]]
1155 
1156  print("")
1157  print(aa.replace("\n", ""))
1158  print("".join([seq[2] for seq in sequences1[protname]]))
1159  print(bb.replace("\n", ""))
1160  print("".join([seq[2] for seq in sequences2[protname]]))
1161 
1162  def classify_crosslinks_by_score(self, number_of_classes):
1163  '''
1164  This function creates as many classes as in the input
1165  (number_of_classes: integer) and partition cross-links according
1166  to their identification scores. Classes are defined in the psi key.
1167  '''
1168 
1169  if self.id_score_key is not None:
1170  scores = self.get_values(self.id_score_key)
1171  else:
1172  raise ValueError(
1173  'The cross-link database does not contain score values')
1174  minscore = min(scores)
1175  maxscore = max(scores)
1176  scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1177  if self.psi_key is None:
1178  self.create_new_keyword(self.psi_key, values_from_keyword=None)
1179  for xl in self:
1180  score = xl[self.id_score_key]
1181  for n, classmin in enumerate(scoreclasses[0:-1]):
1182  if score >= classmin and score <= scoreclasses[n+1]:
1183  xl[self.psi_key] = str("CLASS_"+str(n))
1184  self._update()
1185 
1186  def clone_protein(self, protein_name, new_protein_name):
1187  for id in self.data_base.keys():
1188  new_data_base = []
1189  for xl in self.data_base[id]:
1190  new_data_base.append(xl)
1191  if xl[self.protein1_key] == protein_name \
1192  and xl[self.protein2_key] != protein_name:
1193  new_xl = dict(xl)
1194  new_xl[self.protein1_key] = new_protein_name
1195  new_data_base.append(new_xl)
1196  elif xl[self.protein1_key] != protein_name \
1197  and xl[self.protein2_key] == protein_name:
1198  new_xl = dict(xl)
1199  new_xl[self.protein2_key] = new_protein_name
1200  new_data_base.append(new_xl)
1201  elif xl[self.protein1_key] == protein_name \
1202  and xl[self.protein2_key] == protein_name:
1203  new_xl = dict(xl)
1204  new_xl[self.protein1_key] = new_protein_name
1205  new_data_base.append(new_xl)
1206  new_xl = dict(xl)
1207  new_xl[self.protein2_key] = new_protein_name
1208  new_data_base.append(new_xl)
1209  new_xl = dict(xl)
1210  new_xl[self.protein1_key] = new_protein_name
1211  new_xl[self.protein2_key] = new_protein_name
1212  new_data_base.append(new_xl)
1213  self.data_base[id] = new_data_base
1214  self._update()
1215 
1217  '''
1218  This function remove cross-links applied to the same residue
1219  (ie, same chain name and residue number)
1220  '''
1221  for id in self.data_base.keys():
1222  new_data_base = []
1223  for xl in self.data_base[id]:
1224  if xl[self.protein1_key] == xl[self.protein2_key] \
1225  and xl[self.residue1_key] == xl[self.residue2_key]:
1226  continue
1227  else:
1228  new_data_base.append(xl)
1229  self.data_base[id] = new_data_base
1230  self._update()
1231 
1232  def jackknife(self, percentage):
1233  '''
1234  this method returns a CrossLinkDataBase class containing
1235  a random subsample of the original cross-link database.
1236  @param percentage float between 0 and 1, is the percentage of
1237  of spectra taken from the original list
1238  '''
1239  import random
1240  if percentage > 1.0 or percentage < 0.0:
1241  raise ValueError('the percentage of random cross-link spectra '
1242  'should be between 0 and 1')
1243  nspectra = self.get_number_of_xlid()
1244  nrandom_spectra = int(nspectra*percentage)
1245  random_keys = random.sample(sorted(self.data_base.keys()),
1246  nrandom_spectra)
1247  new_data_base = {}
1248  for k in random_keys:
1249  new_data_base[k] = self.data_base[k]
1250  return CrossLinkDataBase(self.cldbkc, new_data_base)
1251 
1252  def __str__(self):
1253  outstr = ''
1254  sorted_ids = sorted(self.data_base.keys())
1255 
1256  for id in sorted_ids:
1257  outstr += id + "\n"
1258  for xl in self.data_base[id]:
1259  for k in self.ordered_key_list:
1260  try:
1261  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1262  except KeyError:
1263  continue
1264 
1265  for k in xl:
1266  if k not in self.ordered_key_list:
1267  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1268  outstr += "-------------\n"
1269  return outstr
1270 
1271  def plot(self, filename, **kwargs):
1272  import matplotlib
1273  matplotlib.use('Agg')
1274  import matplotlib.pyplot as plt
1275  import matplotlib.colors
1276 
1277  if kwargs["type"] == "scatter":
1278  cmap = plt.get_cmap("rainbow")
1279  _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1280  xkey = kwargs["xkey"]
1281  ykey = kwargs["ykey"]
1282  if "colorkey" in kwargs:
1283  colorkey = kwargs["colorkey"]
1284  if "logyscale" in kwargs:
1285  logyscale = kwargs["logyscale"]
1286  else:
1287  logyscale = False
1288  xs = []
1289  ys = []
1290  colors = []
1291  for xl in self:
1292  try:
1293  xs.append(float(xl[xkey]))
1294  if logyscale:
1295  ys.append(math.log10(float(xl[ykey])))
1296  else:
1297  ys.append(float(xl[ykey]))
1298  colors.append(float(xl[colorkey]))
1299  except ValueError:
1300  print("CrossLinkDataBase.plot: Value error for "
1301  "cross-link %s" % (xl[self.unique_id_key]))
1302  continue
1303 
1304  cs = []
1305  for color in colors:
1306  cindex = (color-min(colors))/(max(colors)-min(colors))
1307  cs.append(cmap(cindex))
1308 
1309  fig = plt.figure()
1310  ax = fig.add_subplot(111)
1311  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker="o")
1312  ax.set_xlabel(xkey)
1313  ax.set_ylabel(ykey)
1314  plt.savefig(filename)
1315  plt.show()
1316  plt.close()
1317 
1318  if kwargs["type"] == "residue_links":
1319  # plot the number of distinct links to a residue
1320  # in an histogram
1321  # molecule name
1322  molecule = kwargs["molecule"]
1323  if type(molecule) is IMP.pmi.topology.Molecule:
1324  length = len(molecule.sequence)
1325  molecule = molecule.get_name()
1326  else:
1327  # you need a IMP.pmi.topology.Sequences object
1328  sequences_object = kwargs["sequences_object"]
1329  sequence = sequences_object.sequences[molecule]
1330  length = len(sequence)
1331 
1332  histogram = [0]*length
1333  for xl in self:
1334  if xl[self.protein1_key] == molecule:
1335  histogram[xl[self.residue1_key]-1] = \
1336  xl[self.residue1_links_number_key]
1337  if xl[self.protein2_key] == molecule:
1338  histogram[xl[self.residue2_key]-1] = \
1339  xl[self.residue2_links_number_key]
1340  plt.bar(range(1, length+1), histogram)
1341  plt.savefig(filename)
1342  plt.show()
1343  plt.close()
1344 
1345  if kwargs["type"] == "histogram":
1346  valuekey = kwargs["valuekey"]
1347  valuename = valuekey
1348  bins = kwargs["bins"]
1349  values_list = []
1350  for xl in self:
1351  try:
1352  values_list.append(float(xl[valuekey]))
1353  except ValueError:
1354  print("CrossLinkDataBase.plot: Value error for cross-link "
1355  "%s" % (xl[self.unique_id_key]))
1356  continue
1358  filename, [values_list], valuename=valuename, bins=bins,
1359  colors=None, format="pdf",
1360  reference_xline=None, yplotrange=None,
1361  xplotrange=None, normalized=True,
1362  leg_names=None)
1363 
1364  def dump(self, json_filename):
1365  import json
1366  with open(json_filename, 'w') as fp:
1367  json.dump(self.data_base, fp, sort_keys=True, indent=2,
1368  default=set_json_default)
1369 
1370  def load(self, json_filename):
1371  import json
1372  with open(json_filename, 'r') as fp:
1373  self.data_base = json.load(fp)
1374  self._update()
1375  # getting rid of unicode
1376  # (can't do this in Python 3, since *everything* is Unicode there)
1377  if sys.version_info[0] < 3:
1378  for xl in self:
1379  for k, v in xl.iteritems():
1380  if type(k) is unicode: # noqa: F821
1381  k = str(k)
1382  if type(v) is unicode: # noqa: F821
1383  v = str(v)
1384  xl[k] = v
1385 
1386  def save_csv(self, filename):
1387  import csv
1388 
1389  data = []
1390  sorted_ids = None
1391  sorted_group_ids = sorted(self.data_base.keys())
1392  for group in sorted_group_ids:
1393  group_block = []
1394  for xl in self.data_base[group]:
1395  if not sorted_ids:
1396  sorted_ids = sorted(xl.keys())
1397  values = [xl[k] for k in sorted_ids]
1398  group_block.append(values)
1399  data += group_block
1400 
1401  with open(filename, 'w') as fp:
1402  a = csv.writer(fp, delimiter=',')
1403  a.writerow(sorted_ids)
1404  a.writerows(data)
1405 
1407  """
1408  Returns the number of non redundant cross-link sites
1409  """
1410  praset = set()
1411  for xl in self:
1412  pra = _ProteinsResiduesArray(xl)
1413  prai = pra.get_inverted()
1414  praset.add(pra)
1415  praset.add(prai)
1416  return len(list(praset))
1417 
1418 
1420  """This class allows to compute and plot the distance between datasets"""
1421 
1422  def __init__(self, cldb_dictionary):
1423  """Input a dictionary where keys are cldb names and values are cldbs"""
1424  import scipy.spatial.distance
1425  self.dbs = cldb_dictionary
1426  self.keylist = list(self.dbs.keys())
1427  self.distances = list()
1428 
1429  for i, key1 in enumerate(self.keylist):
1430  for key2 in self.keylist[i+1:]:
1431  distance = self.get_distance(key1, key2)
1432  self.distances.append(distance)
1433 
1434  self.distances = scipy.spatial.distance.squareform(self.distances)
1435 
1436  def get_distance(self, key1, key2):
1437  return 1.0 - self.jaccard_index(self.dbs[key1], self.dbs[key2])
1438 
1439  def jaccard_index(self, CrossLinkDataBase1, CrossLinkDataBase2):
1440  """Similarity index between two datasets
1441  https://en.wikipedia.org/wiki/Jaccard_index"""
1442 
1443  set1 = set()
1444  set2 = set()
1445  for xl1 in CrossLinkDataBase1:
1446  a1f = _ProteinsResiduesArray(xl1)
1447  a1b = a1f.get_inverted()
1448  set1.add(a1f)
1449  set1.add(a1b)
1450  for xl2 in CrossLinkDataBase2:
1451  a2f = _ProteinsResiduesArray(xl2)
1452  a2b = a2f.get_inverted()
1453  set2.add(a2f)
1454  set2.add(a2b)
1455  return (float(len(set1 & set2)/2)
1456  / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1457 
1458  def plot_matrix(self, figurename="clustermatrix.pdf"):
1459  import matplotlib as mpl
1460  import numpy
1461  mpl.use('Agg')
1462  import matplotlib.pylab as pl
1463  from scipy.cluster import hierarchy as hrc
1464 
1465  raw_distance_matrix = self.distances
1466  labels = self.keylist
1467 
1468  fig = pl.figure()
1469 
1470  ax = fig.add_subplot(1, 1, 1)
1471  dendrogram = hrc.dendrogram(
1472  hrc.linkage(raw_distance_matrix),
1473  color_threshold=7,
1474  no_labels=True)
1475  leaves_order = dendrogram['leaves']
1476  ax.set_xlabel('Dataset')
1477  ax.set_ylabel('Jaccard Distance')
1478  pl.tight_layout()
1479  pl.savefig("dendrogram."+figurename, dpi=300)
1480  pl.close(fig)
1481 
1482  fig = pl.figure()
1483 
1484  ax = fig.add_subplot(1, 1, 1)
1485  cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1486  interpolation='nearest')
1487  cb = fig.colorbar(cax)
1488  cb.set_label('Jaccard Distance')
1489  ax.set_xlabel('Dataset')
1490  ax.set_ylabel('Dataset')
1491  ax.set_xticks(range(len(labels)))
1492  ax.set_xticklabels(numpy.array(labels)[leaves_order],
1493  rotation='vertical')
1494  ax.set_yticks(range(len(labels)))
1495  ax.set_yticklabels(numpy.array(labels)[leaves_order],
1496  rotation='horizontal')
1497  pl.tight_layout()
1498  pl.savefig("matrix." + figurename, dpi=300)
1499  pl.close(fig)
1500 
1501 
1503  '''
1504  This class maps a CrossLinkDataBase on a given structure
1505  and save an rmf file with color-coded cross-links
1506  '''
1507 
1508  def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1509  self.CrossLinkDataBase = CrossLinkDataBase
1510  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1511  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1512  self.prots = rmf_or_stat_handler
1513  self.distances = defaultdict(list)
1514  self.array_to_id = {}
1515  self.id_to_array = {}
1516 
1517  print("computing distances fro all cross-links and all structures")
1518  for i in self.prots[::10]:
1519  self.compute_distances()
1520  for key, xl in enumerate(self.CrossLinkDataBase):
1521  array = _ProteinsResiduesArray(xl)
1522  self.array_to_id[array] = key
1523  self.id_to_array[key] = array
1524  if xl["MinAmbiguousDistance"] != 'None':
1525  self.distances[key].append(xl["MinAmbiguousDistance"])
1526 
1527  def compute_distances(self):
1528  data = []
1529  sorted_ids = None
1530  sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1531  for group in sorted_group_ids:
1532  group_dists = []
1533  for xl in self.CrossLinkDataBase.data_base[group]:
1534  if not sorted_ids:
1535  sorted_ids = sorted(xl.keys())
1536  data.append(
1537  sorted_ids
1538  + ["UniqueID", "Distance", "MinAmbiguousDistance"])
1539  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1540  try:
1541  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1542  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1543  except: # noqa: E722
1544  mdist = "None"
1545  state1 = "None"
1546  copy1 = "None"
1547  state2 = "None"
1548  copy2 = "None"
1549  try:
1550  # sometimes keys get "lost" in the database,
1551  # not really sure why
1552  values = [xl[k] for k in sorted_ids]
1553  values += [group, mdist]
1554  except KeyError as e:
1555  print("MapCrossLinkDataBaseOnStructure "
1556  "KeyError: {0} in {1}".format(e, xl))
1557  group_dists.append(mdist)
1558  xl["Distance"] = mdist
1559  xl["State1"] = state1
1560  xl["Copy1"] = copy1
1561  xl["State2"] = state2
1562  xl["Copy2"] = copy2
1563 
1564  for xl in self.CrossLinkDataBase.data_base[group]:
1565  xl["MinAmbiguousDistance"] = min(group_dists)
1566 
1567  def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1568  '''more robust and slower version of above'''
1569  sel = IMP.atom.Selection(self.prots, molecule=c1, residue_index=r1,
1570  resolution=1)
1571  selpart_1 = sel.get_selected_particles()
1572  if len(selpart_1) == 0:
1573  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1574  "selected for first site")
1575  return None
1576  sel = IMP.atom.Selection(self.prots, molecule=c2, residue_index=r2,
1577  resolution=1)
1578  selpart_2 = sel.get_selected_particles()
1579  if len(selpart_2) == 0:
1580  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1581  "selected for second site")
1582  return None
1583  results = []
1584  for p1 in selpart_1:
1585  for p2 in selpart_2:
1586  if p1 == p2 and r1 == r2:
1587  continue
1588  d1 = IMP.core.XYZ(p1)
1589  d2 = IMP.core.XYZ(p2)
1590  # round distance to second decimal
1591  dist = float(int(IMP.core.get_distance(d1, d2)*100.0))/100.0
1592  h1 = IMP.atom.Hierarchy(p1)
1593  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1594  h1 = h1.get_parent()
1595  copy_index1 = IMP.atom.Copy(h1).get_copy_index()
1596  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1597  h1 = h1.get_parent()
1598  state_index1 = IMP.atom.State(h1).get_state_index()
1599  h2 = IMP.atom.Hierarchy(p2)
1600  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1601  h2 = h2.get_parent()
1602  copy_index2 = IMP.atom.Copy(h2).get_copy_index()
1603  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1604  h2 = h2.get_parent()
1605  state_index2 = IMP.atom.State(h2).get_state_index()
1606 
1607  results.append((dist, state_index1, copy_index1,
1608  state_index2, copy_index2, p1, p2))
1609  if len(results) == 0:
1610  return None
1611  results_sorted = sorted(results,
1612  key=operator.itemgetter(0, 1, 2, 3, 4))
1613  return ((results_sorted[0][0],
1614  results_sorted[0][5],
1615  results_sorted[0][6]),
1616  (results_sorted[0][1],
1617  results_sorted[0][2],
1618  results_sorted[0][3],
1619  results_sorted[0][4]))
1620 
1621  def save_rmf_snapshot(self, filename, color_id=None):
1622  sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1623  list_of_pairs = []
1624  color_scores = []
1625  for group in sorted_group_ids:
1626  group_dists_particles = []
1627  for xl in self.CrossLinkDataBase.data_base[group]:
1628  xllabel = \
1629  self.CrossLinkDataBase.get_short_cross_link_string(xl)
1630  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1631  try:
1632  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1633  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1634  except TypeError:
1635  print("TypeError or missing chain/residue ",
1636  r1, c1, r2, c2)
1637  continue
1638  if color_id is not None:
1639  group_dists_particles.append((mdist, p1, p2, xllabel,
1640  float(xl[color_id])))
1641  else:
1642  group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1643  if group_dists_particles:
1644  (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1645  min(group_dists_particles, key=lambda t: t[0])
1646  color_scores.append(mincolor_score)
1647  list_of_pairs.append((minp1, minp2, minxllabel,
1648  mincolor_score))
1649  else:
1650  continue
1651 
1652  linear = IMP.core.Linear(0, 0.0)
1653  linear.set_slope(1.0)
1654  dps2 = IMP.core.DistancePairScore(linear)
1655  rslin = IMP.RestraintSet(self.prots.get_model(),
1656  'linear_dummy_restraints')
1657  sgs = []
1658  offset = min(color_scores)
1659  maxvalue = max(color_scores)
1660  for pair in list_of_pairs:
1661  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2,
1662  (pair[0], pair[1]))
1663  pr.set_name(pair[2])
1664  if offset < maxvalue:
1665  factor = (pair[3]-offset)/(maxvalue-offset)
1666  else:
1667  factor = 0.0
1668  c = IMP.display.get_rgb_color(factor)
1669  seg = IMP.algebra.Segment3D(
1670  IMP.core.XYZ(pair[0]).get_coordinates(),
1671  IMP.core.XYZ(pair[1]).get_coordinates())
1672  rslin.add_restraint(pr)
1673  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
1674 
1675  rh = RMF.create_rmf_file(filename)
1676  IMP.rmf.add_hierarchies(rh, [self.prots])
1677  IMP.rmf.add_restraints(rh, [rslin])
1678  IMP.rmf.add_geometries(rh, sgs)
1679  IMP.rmf.save_frame(rh)
1680  del rh
1681 
1682  def boxplot_crosslink_distances(self, filename):
1683  import numpy
1684  keys = [self.id_to_array[k] for k in self.distances.keys()]
1685  medians = [numpy.median(self.distances[self.array_to_id[k]])
1686  for k in keys]
1687  dists = [self.distances[self.array_to_id[k]] for k in keys]
1688  distances_sorted_by_median = \
1689  [x for _, x in sorted(zip(medians, dists))]
1690  keys_sorted_by_median = [x for _, x in sorted(zip(medians, keys))]
1692  filename, distances_sorted_by_median,
1693  range(len(distances_sorted_by_median)),
1694  xlabels=keys_sorted_by_median,
1695  scale_plot_length=0.2)
1696 
1697  def get_crosslink_violations(self, threshold):
1698  violations = defaultdict(list)
1699  for k in self.distances:
1700  # viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1701  viols = self.distances[k]
1702  violations[self.id_to_array[k]] = viols
1703  return violations
1704 
1705 
1707  '''
1708  This class generates a CrossLinkDataBase from a given structure
1709  '''
1710  def __init__(self, system, residue_types_1=["K"],
1711  residue_types_2=["K"], reactivity_range=[0, 1], kt=1.0):
1712  import numpy.random
1713  import math
1715  cldbkc.set_protein1_key("Protein1")
1716  cldbkc.set_protein2_key("Protein2")
1717  cldbkc.set_residue1_key("Residue1")
1718  cldbkc.set_residue2_key("Residue2")
1719  self.cldb = CrossLinkDataBase(cldbkc)
1720  self.system = system
1721  self.model = self.system.model
1722  self.residue_types_1 = residue_types_1
1723  self.residue_types_2 = residue_types_2
1724  self.kt = kt
1725  self.indexes_dict1 = {}
1726  self.indexes_dict2 = {}
1727  self.protein_residue_dict = {}
1728  self.reactivity_dictionary = {}
1729  self.euclidean_interacting_pairs = None
1730  self.xwalk_interacting_pairs = None
1731 
1732  for state in self.system.get_states():
1733  for moleculename, molecules in state.get_molecules().items():
1734  for molecule in molecules:
1735  # we are saving a dictionary with protein name,
1736  # residue number and random reactivity of the residue
1737  seq = molecule.sequence
1738  residues = [i for i in range(1, len(seq)+1)
1739  if ((seq[i-1] in self.residue_types_1)
1740  or (seq[i-1] in self.residue_types_2))]
1741 
1742  for r in residues:
1743  # uniform random reactivities
1744  # getting reactivities from the CDF of an
1745  # exponential distribution
1746  rexp = numpy.random.exponential(0.00000001)
1747  prob = 1.0-math.exp(-rexp)
1748  self.reactivity_dictionary[(molecule, r)] = prob
1749 
1750  residues1 = [i for i in range(1, len(seq)+1)
1751  if seq[i-1] in self.residue_types_1]
1752  residues2 = [i for i in range(1, len(seq)+1)
1753  if seq[i-1] in self.residue_types_2]
1754  for r in residues1:
1755  s = IMP.atom.Selection(
1756  molecule.hier, residue_index=r, resolution=1)
1757  ps = s.get_selected_particles()
1758  for p in ps:
1759  index = p.get_index()
1760  self.indexes_dict1[index] = (molecule, r)
1761  self.protein_residue_dict[(molecule, r)] = index
1762  for r in residues2:
1763  s = IMP.atom.Selection(
1764  molecule.hier, residue_index=r, resolution=1)
1765  ps = s.get_selected_particles()
1766  for p in ps:
1767  index = p.get_index()
1768  self.indexes_dict2[index] = (molecule, r)
1769  self.protein_residue_dict[(molecule, r)] = index
1770 
1771  def get_all_possible_pairs(self):
1772  n = float(len(self.protein_residue_dict.keys()))
1773  return n*(n-1.0)/2.0
1774 
1775  def get_all_feasible_pairs(self, distance=21):
1776  import itertools
1777  particle_index_pairs = []
1778  nxl = 0
1779  for a, b in itertools.combinations(
1780  self.protein_residue_dict.keys(), 2):
1781  new_xl = {}
1782  index1 = self.protein_residue_dict[a]
1783  index2 = self.protein_residue_dict[b]
1784  particle_distance = IMP.core.get_distance(
1785  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1786  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1787  if particle_distance <= distance:
1788  particle_index_pairs.append((index1, index2))
1789  new_xl[self.cldb.protein1_key] = a[0].get_name()
1790  new_xl[self.cldb.protein2_key] = b[0].get_name()
1791  new_xl["molecule_object1"] = a[0]
1792  new_xl["molecule_object2"] = b[0]
1793  new_xl[self.cldb.residue1_key] = a[1]
1794  new_xl[self.cldb.residue2_key] = b[1]
1795  self.cldb.data_base[str(nxl)] = [new_xl]
1796  nxl += 1
1797  self.cldb._update()
1798  return self.cldb
1799 
1800  def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1801  noise=0.01, distance=21, max_delta_distance=10.0,
1802  xwalk_bin_path=None, confidence_false=0.75,
1803  confidence_true=0.75):
1804  import math
1805  from random import random
1806  import numpy as np
1807  number_of_spectra = 1
1808 
1809  self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1810  self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1811  self.cldb.data_base[str(number_of_spectra)] = []
1812  self.sites_weighted = None
1813 
1814  while number_of_spectra < total_number_of_spectra:
1815  if (random() > ambiguity_probability
1816  and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1817  # new spectrum
1818  number_of_spectra += 1
1819  self.cldb.data_base[str(number_of_spectra)] = []
1820  noisy = False
1821  if random() > noise:
1822  # not noisy cross-link
1823  pra, dist = self.get_random_residue_pair(
1824  distance, xwalk_bin_path, max_delta_distance)
1825  else:
1826  # noisy cross-link
1827  pra, dist = self.get_random_residue_pair(None, None, None)
1828  noisy = True
1829 
1830  new_xl = {}
1831  new_xl[self.cldb.protein1_key] = pra[0].get_name()
1832  new_xl[self.cldb.protein2_key] = pra[1].get_name()
1833  new_xl["molecule_object1"] = pra[0]
1834  new_xl["molecule_object2"] = pra[1]
1835  new_xl[self.cldb.residue1_key] = pra[2]
1836  new_xl[self.cldb.residue2_key] = pra[3]
1837  new_xl["Noisy"] = noisy
1838  # the reactivity is defined as r=1-exp(-k*Delta t)
1839  new_xl["Reactivity_Residue1"] = \
1840  self.reactivity_dictionary[(pra[0], pra[2])]
1841  new_xl["Reactivity_Residue2"] = \
1842  self.reactivity_dictionary[(pra[1], pra[3])]
1843  if noisy:
1844  new_xl["Score"] = np.random.beta(1.0, self.beta_false)
1845  else:
1846  new_xl["Score"] = 1.0-np.random.beta(1.0, self.beta_true)
1847  new_xl["TargetDistance"] = dist
1848  new_xl["NoiseProbability"] = noise
1849  new_xl["AmbiguityProbability"] = ambiguity_probability
1850  # getting if it is intra or inter rigid body
1851  (p1, p2) = IMP.get_particles(
1852  self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1853  self.protein_residue_dict[(pra[1], pra[3])]])
1856  IMP.core.RigidMember(p1).get_rigid_body() ==
1857  IMP.core.RigidMember(p2).get_rigid_body()):
1858  new_xl["InterRigidBody"] = False
1859  elif (IMP.core.RigidMember.get_is_setup(p1) and
1861  IMP.core.RigidMember(p1).get_rigid_body() !=
1862  IMP.core.RigidMember(p2).get_rigid_body()):
1863  new_xl["InterRigidBody"] = True
1864  else:
1865  new_xl["InterRigidBody"] = None
1866 
1867  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1868  self.cldb._update()
1869  return self.cldb
1870 
1871  def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1872  max_delta_distance=None):
1873  import IMP.pmi.tools
1874  import math
1875  from random import choice
1876  if distance is None:
1877  # get a random pair
1878  while True:
1879  protein1, residue1 = choice(self.protein_residue_dict.keys())
1880  protein2, residue2 = choice(self.protein_residue_dict.keys())
1881  index1 = self.protein_residue_dict[(protein1, residue1)]
1882  index2 = self.protein_residue_dict[(protein2, residue2)]
1883  particle_distance = IMP.core.get_distance(
1884  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1885  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1886  if (protein1, residue1) != (protein2, residue2):
1887  break
1888  else:
1889  # get a pair of residues whose distance is below the threshold
1890  if not xwalk_bin_path:
1892  gcpf.set_distance(distance+max_delta_distance)
1893 
1894  while True:
1895  # setup the reaction rates lists
1896  if not self.sites_weighted:
1897  self.sites_weighted = []
1898  for key in self.reactivity_dictionary:
1899  r = self.reactivity_dictionary[key]
1900  self.sites_weighted.append((key, r))
1901  # get a random reaction site
1902  first_site = self.weighted_choice(self.sites_weighted)
1903  # get all distances
1904  if not self.euclidean_interacting_pairs:
1905  self.euclidean_interacting_pairs = \
1906  gcpf.get_close_pairs(
1907  self.model,
1908  self.indexes_dict1.keys(),
1909  self.indexes_dict2.keys())
1910  # get the partner for the first reacted site
1911  first_site_pairs = \
1912  [pair for pair in self.euclidean_interacting_pairs
1913  if self.indexes_dict1[pair[0]] == first_site or
1914  self.indexes_dict2[pair[1]] == first_site]
1915  if len(first_site_pairs) == 0:
1916  continue
1917  # build the list of second reaction sites
1918  second_sites_weighted = []
1919  for pair in first_site_pairs:
1920  if self.indexes_dict1[pair[0]] == first_site:
1921  second_site = self.indexes_dict2[pair[1]]
1922  if self.indexes_dict2[pair[1]] == first_site:
1923  second_site = self.indexes_dict1[pair[0]]
1924  r = self.reactivity_dictionary[second_site]
1925  second_sites_weighted.append((second_site, r))
1926  second_site = self.weighted_choice(second_sites_weighted)
1927  protein1, residue1 = first_site
1928  protein2, residue2 = second_site
1929  print("CrossLinkDataBaseFromStructure."
1930  "get_random_residue_pair:",
1931  "First site", first_site,
1932  self.reactivity_dictionary[first_site],
1933  "Second site", second_site,
1934  self.reactivity_dictionary[second_site])
1935  particle_pair = IMP.get_particles(
1936  self.model,
1937  [self.protein_residue_dict[first_site],
1938  self.protein_residue_dict[second_site]])
1939  particle_distance = IMP.core.get_distance(
1940  IMP.core.XYZ(particle_pair[0]),
1941  IMP.core.XYZ(particle_pair[1]))
1942 
1943  if particle_distance < distance \
1944  and (protein1, residue1) != (protein2, residue2):
1945  break
1946  elif (particle_distance >= distance
1947  and (protein1, residue1) != (protein2, residue2)
1948  and max_delta_distance):
1949  # allow some flexibility
1950  if particle_distance-distance < max_delta_distance:
1951  break
1952 
1953  else:
1954  if not self.xwalk_interacting_pairs:
1955  self.xwalk_interacting_pairs = \
1956  self.get_xwalk_distances(xwalk_bin_path, distance)
1957  interacting_pairs_weighted = []
1958 
1959  for pair in self.xwalk_interacting_pairs:
1960  protein1 = pair[0]
1961  protein2 = pair[1]
1962  residue1 = pair[2]
1963  residue2 = pair[3]
1964  weight1 = math.exp(
1965  -self.reactivity_dictionary[(protein1, residue1)]
1966  / self.kt)
1967  weight2 = math.exp(
1968  -self.reactivity_dictionary[(protein2, residue2)]
1969  / self.kt)
1970  interacting_pairs_weighted.append((pair, weight1*weight2))
1971 
1972  pair = self.weighted_choice(interacting_pairs_weighted)
1973  protein1 = pair[0]
1974  protein2 = pair[1]
1975  residue1 = pair[2]
1976  residue2 = pair[3]
1977  particle_distance = float(pair[4])
1978 
1979  return ((protein1, protein2, residue1, residue2)), particle_distance
1980 
1981  def get_xwalk_distances(self, xwalk_bin_path, distance):
1982  import IMP.pmi.output
1983  import os
1984  o = IMP.pmi.output.Output(atomistic=True)
1985  o.init_pdb("xwalk.pdb", self.representation.prot)
1986  o.write_pdb("xwalk.pdb")
1987  namechainiddict = o.dictchain["xwalk.pdb"]
1988  chainiddict = {}
1989 
1990  for key in namechainiddict:
1991  chainiddict[namechainiddict[key]] = key
1992  xwalkout = os.popen('java -Xmx256m -cp ' + xwalk_bin_path
1993  + ' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
1994  '-a1 cb -a2 cb -max '
1995  + str(distance) + ' -bb').read()
1996 
1997  output_list_of_distance = []
1998  for line in xwalkout.split("\n")[0:-2]:
1999  tokens = line.split()
2000  first = tokens[2]
2001  second = tokens[3]
2002  distance = float(tokens[6])
2003  fs = first.split("-")
2004  ss = second.split("-")
2005  chainid1 = fs[2]
2006  chainid2 = ss[2]
2007  protein1 = chainiddict[chainid1]
2008  protein2 = chainiddict[chainid2]
2009  residue1 = int(fs[1])
2010  residue2 = int(ss[1])
2011  output_list_of_distance.append(
2012  (protein1, protein2, residue1, residue2, distance))
2013  return output_list_of_distance
2014 
2015  def weighted_choice(self, choices):
2016  import random
2017  # http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
2018  total = sum(w for c, w in choices)
2019  r = random.uniform(0, total)
2020  upto = 0
2021  for c, w in choices:
2022  if upto + w > r:
2023  return c
2024  upto += w
2025  assert False, "Shouldn't get here"
2026 
2027  def save_rmf_snapshot(self, filename, color_id=None):
2028  import IMP.rmf
2029  import RMF
2030  if color_id is None:
2031  color_id = "Reactivity"
2032  sorted_group_ids = sorted(self.cldb.data_base.keys())
2033  list_of_pairs = []
2034  color_scores = []
2035  for group in sorted_group_ids:
2036  group_dists_particles = []
2037  for xl in self.cldb.data_base[group]:
2038  xllabel = self.cldb.get_short_cross_link_string(xl)
2039  (c1, c2, r1, r2) = (xl["molecule_object1"],
2040  xl["molecule_object2"],
2041  xl[self.cldb.residue1_key],
2042  xl[self.cldb.residue2_key])
2043  try:
2044  index1 = self.protein_residue_dict[(c1, r1)]
2045  index2 = self.protein_residue_dict[(c2, r2)]
2046  p1, p2 = (IMP.get_particles(self.model, [index1])[0],
2047  IMP.get_particles(self.model, [index2])[0])
2048  mdist = xl["TargetDistance"]
2049  except TypeError:
2050  print("TypeError or missing chain/residue ",
2051  r1, c1, r2, c2)
2052  continue
2053  group_dists_particles.append((mdist, p1, p2, xllabel,
2054  float(xl[color_id])))
2055  if group_dists_particles:
2056  (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2057  = min(group_dists_particles, key=lambda t: t[0])
2058  color_scores.append(mincolor_score)
2059  list_of_pairs.append((minp1, minp2, minxllabel,
2060  mincolor_score))
2061  else:
2062  continue
2063 
2064  m = self.model
2065  linear = IMP.core.Linear(0, 0.0)
2066  linear.set_slope(1.0)
2067  dps2 = IMP.core.DistancePairScore(linear)
2068  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
2069  sgs = []
2070  offset = min(color_scores)
2071  maxvalue = max(color_scores)
2072  for pair in list_of_pairs:
2073  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
2074  pr.set_name(pair[2])
2075  factor = (pair[3]-offset)/(maxvalue-offset)
2076  c = IMP.display.get_rgb_color(factor)
2077  seg = IMP.algebra.Segment3D(
2078  IMP.core.XYZ(pair[0]).get_coordinates(),
2079  IMP.core.XYZ(pair[1]).get_coordinates())
2080  rslin.add_restraint(pr)
2081  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
2082 
2083  rh = RMF.create_rmf_file(filename)
2084  IMP.rmf.add_hierarchies(rh, [self.system.hier])
2085  IMP.rmf.add_restraints(rh, [rslin])
2086  IMP.rmf.add_geometries(rh, sgs)
2087  IMP.rmf.save_frame(rh)
2088  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1770
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1835
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
Get the particles from a list of indexes.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:731
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:37
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:203
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:19
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:25
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Definition: exception.h:48
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
class to link stat files to several rmf files
Definition: output.py:1269
class to allow more advanced handling of RMF files.
Definition: output.py:1145
Mapping between FASTA one-letter codes and residue types.
Definition: alphabets.py:1
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:29
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.