IMP logo
IMP Reference Guide  develop.5651aa123e,2024/07/26
The Integrative Modeling Platform
io/crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 
8 import IMP
9 import IMP.pmi
10 import IMP.pmi.output
11 import IMP.pmi.alphabets
12 import IMP.atom
13 import IMP.core
14 import IMP.algebra
15 import IMP.rmf
16 import RMF
17 import IMP.display
18 import operator
19 import math
20 import ihm.location
21 import ihm.dataset
22 from collections import defaultdict
23 import numpy
24 
25 
26 # json default serializations
27 def set_json_default(obj):
28  if isinstance(obj, set):
29  return list(obj)
30  if isinstance(obj, IMP.pmi.topology.Molecule):
31  return str(obj)
32  raise TypeError
33 
34 
35 # Handle and return data that must be a string
36 def _handle_string_input(inp):
37  if not isinstance(inp, str):
38  raise TypeError("expecting a string")
39  return inp
40 
41 
42 class _CrossLinkDataBaseStandardKeys:
43  '''
44  This class setup all the standard keys needed to
45  identify the cross-link features from the data sets
46  '''
47  def __init__(self):
48  self.type = {}
49  self.protein1_key = "Protein1"
50  self.type[self.protein1_key] = str
51  self.protein2_key = "Protein2"
52  self.type[self.protein2_key] = str
53  self.residue1_key = "Residue1"
54  self.type[self.residue1_key] = int
55  self.residue2_key = "Residue2"
56  self.type[self.residue2_key] = int
57  self.residue1_amino_acid_key = "Residue1AminoAcid"
58  self.type[self.residue1_amino_acid_key] = str
59  self.residue2_amino_acid_key = "Residue2AminoAcid"
60  self.type[self.residue2_amino_acid_key] = str
61  self.residue1_moiety_key = "Residue1Moiety"
62  self.type[self.residue1_moiety_key] = str
63  self.residue2_moiety_key = "Residue2Moiety"
64  self.type[self.residue2_moiety_key] = str
65  self.site_pairs_key = "SitePairs"
66  self.type[self.site_pairs_key] = str
67  self.unique_id_key = "XLUniqueID"
68  self.type[self.unique_id_key] = str
69  self.unique_sub_index_key = "XLUniqueSubIndex"
70  self.type[self.unique_sub_index_key] = int
71  self.unique_sub_id_key = "XLUniqueSubID"
72  self.type[self.unique_sub_id_key] = str
73  self.data_set_name_key = "DataSetName"
74  self.type[self.data_set_name_key] = str
75  self.cross_linker_chemical_key = "CrossLinkerChemical"
76  self.type[self.cross_linker_chemical_key] = str
77  self.id_score_key = "IDScore"
78  self.type[self.id_score_key] = float
79  self.fdr_key = "FDR"
80  self.type[self.fdr_key] = float
81  self.quantitation_key = "Quantitation"
82  self.type[self.quantitation_key] = float
83  self.redundancy_key = "Redundancy"
84  self.type[self.redundancy_key] = int
85  self.redundancy_list_key = "RedundancyList"
86  self.type[self.redundancy_key] = list
87  self.ambiguity_key = "Ambiguity"
88  self.type[self.ambiguity_key] = int
89  self.residue1_links_number_key = "Residue1LinksNumber"
90  self.type[self.residue1_links_number_key] = int
91  self.residue2_links_number_key = "Residue2LinksNumber"
92  self.type[self.residue2_links_number_key] = int
93  self.type[self.ambiguity_key] = int
94  self.state_key = "State"
95  self.type[self.state_key] = int
96  self.sigma1_key = "Sigma1"
97  self.type[self.sigma1_key] = str
98  self.sigma2_key = "Sigma2"
99  self.type[self.sigma2_key] = str
100  self.psi_key = "Psi"
101  self.type[self.psi_key] = str
102  self.distance_key = "Distance"
103  self.type[self.distance_key] = float
104  self.min_ambiguous_distance_key = "MinAmbiguousDistance"
105  self.type[self.distance_key] = float
106  # link types are either Monolink, Intralink or Interlink
107  self.link_type_key = "LinkType"
108  self.type[self.link_type_key] = str
109 
110  self.ordered_key_list = [
111  self.data_set_name_key, self.unique_id_key,
112  self.unique_sub_index_key, self.unique_sub_id_key,
113  self.protein1_key, self.protein2_key, self.residue1_key,
114  self.residue2_key, self.residue1_amino_acid_key,
115  self.residue2_amino_acid_key, self.residue1_moiety_key,
116  self.residue2_moiety_key, self.cross_linker_chemical_key,
117  self.id_score_key, self.fdr_key, self.quantitation_key,
118  self.redundancy_key, self.redundancy_list_key, self.state_key,
119  self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
120  self.min_ambiguous_distance_key, self.link_type_key]
121 
122 
123 class _ProteinsResiduesArray(tuple):
124  '''
125  This class is inherits from tuple, and it is a shorthand for a cross-link
126  (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1
127  and protein2, r1 and r2 are residue1 and residue2.
128  '''
129 
130  def __new__(self, input_data):
131  '''
132  @input_data can be a dict or a tuple
133  '''
134  self.cldbsk = _CrossLinkDataBaseStandardKeys()
135  if type(input_data) is dict:
136  monolink = False
137  p1 = input_data[self.cldbsk.protein1_key]
138  try:
139  p2 = input_data[self.cldbsk.protein2_key]
140  except KeyError:
141  monolink = True
142  r1 = input_data[self.cldbsk.residue1_key]
143  try:
144  r2 = input_data[self.cldbsk.residue2_key]
145  except KeyError:
146  monolink = True
147  if not monolink:
148  t = (p1, p2, r1, r2)
149  else:
150  t = (p1, "", r1, None)
151  elif type(input_data) is tuple:
152  if len(input_data) > 4 or len(input_data) == 3 \
153  or len(input_data) == 1:
154  raise TypeError(
155  "_ProteinsResiduesArray: must have only 4 elements")
156  if len(input_data) == 4:
157  p1 = _handle_string_input(input_data[0])
158  p2 = _handle_string_input(input_data[1])
159  r1 = input_data[2]
160  r2 = input_data[3]
161  if (type(r1) is not int) and (r1 is not None):
162  raise TypeError(
163  "_ProteinsResiduesArray: residue1 must be a integer")
164  if (type(r2) is not int) and (r1 is not None):
165  raise TypeError(
166  "_ProteinsResiduesArray: residue2 must be a integer")
167  t = (p1, p2, r1, r2)
168  if len(input_data) == 2:
169  p1 = _handle_string_input(input_data[0])
170  r1 = input_data[1]
171  if type(r1) is not int:
172  raise TypeError(
173  "_ProteinsResiduesArray: residue1 must be a integer")
174  t = (p1, "", r1, None)
175  else:
176  raise TypeError(
177  "_ProteinsResiduesArray: input must be a dict or tuple")
178  return tuple.__new__(_ProteinsResiduesArray, t)
179 
180  def get_inverted(self):
181  '''
182  Returns a _ProteinsResiduesArray instance with protein1 and
183  protein2 inverted
184  '''
185  return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
186 
187  def __repr__(self):
188  outstr = self.cldbsk.protein1_key + " " + str(self[0])
189  outstr += " " + self.cldbsk.protein2_key + " " + str(self[1])
190  outstr += " " + self.cldbsk.residue1_key + " " + str(self[2])
191  outstr += " " + self.cldbsk.residue2_key + " " + str(self[3])
192  return outstr
193 
194  def __str__(self):
195  outstr = str(self[0]) + "." + str(self[2]) + "-" + str(self[1]) \
196  + "." + str(self[3])
197  return outstr
198 
199 
201  '''
202  This class allows to create filter functions that can be passed to
203  the CrossLinkDataBase in this way:
204 
205  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
206 
207  where cldb is CrossLinkDataBase instance and it is only needed to get
208  the standard keywords
209 
210  A filter operator can be evaluate on a CrossLinkDataBase item xl and
211  returns a boolean
212 
213  fo.evaluate(xl)
214 
215  and it is used to filter the database
216  '''
217 
218  def __init__(self, argument1, operator, argument2):
219  '''
220  (argument1,operator,argument2) can be either a
221  (keyword,operator.eq|lt|gt...,value)
222  or (FilterOperator1,operator.or|and...,FilterOperator2)
223  '''
224  if isinstance(argument1, FilterOperator):
225  self.operations = [argument1, operator, argument2]
226  else:
227  self.operations = []
228  self.values = (argument1, operator, argument2)
229 
230  def __or__(self, FilterOperator2):
231  return FilterOperator(self, operator.or_, FilterOperator2)
232 
233  def __and__(self, FilterOperator2):
234  return FilterOperator(self, operator.and_, FilterOperator2)
235 
236  def __invert__(self):
237  return FilterOperator(self, operator.not_, None)
238 
239  def evaluate(self, xl_item):
240 
241  if len(self.operations) == 0:
242  keyword, operator, value = self.values
243  return operator(xl_item[keyword], value)
244  FilterOperator1, op, FilterOperator2 = self.operations
245 
246  if FilterOperator2 is None:
247  return op(FilterOperator1.evaluate(xl_item))
248  else:
249  return op(FilterOperator1.evaluate(xl_item),
250  FilterOperator2.evaluate(xl_item))
251 
252 
253 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
254  '''
255  This class is needed to convert the keywords from a generic database
256  to the standard ones
257  '''
258 
259  def __init__(self, list_parser=None):
260  '''
261  @param list_parser an instance of ResiduePairListParser, if any
262  is needed
263  '''
264  self.converter = {}
265  self.backward_converter = {}
266  _CrossLinkDataBaseStandardKeys.__init__(self)
267  self.rplp = list_parser
268  if self.rplp is None:
269  # either you have protein1, protein2, residue1, residue2
270  self.compulsory_keys = set([self.protein1_key,
271  self.protein2_key,
272  self.residue1_key,
273  self.residue2_key])
274  else:
275  self.compulsory_keys = self.rplp.get_compulsory_keys()
276  self.setup_keys = set()
277 
278  def check_keys(self):
279  '''
280  Is a function that check whether necessary keys are setup
281  '''
282  setup_keys = set(self.get_setup_keys())
283  if self.compulsory_keys & setup_keys != self.compulsory_keys:
284  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup "
285  "all necessary keys")
286 
287  def get_setup_keys(self):
288  '''
289  Returns the keys that have been setup so far
290  '''
291  return self.backward_converter.keys()
292 
293  def set_standard_keys(self):
294  """
295  This sets up the standard compulsory keys as defined by
296  _CrossLinkDataBaseStandardKeys
297  """
298  for ck in self.compulsory_keys:
299  self.converter[ck] = ck
300  self.backward_converter[ck] = ck
301 
302  def set_unique_id_key(self, origin_key):
303  self.converter[origin_key] = self.unique_id_key
304  self.backward_converter[self.unique_id_key] = origin_key
305 
306  def set_protein1_key(self, origin_key):
307  self.converter[origin_key] = self.protein1_key
308  self.backward_converter[self.protein1_key] = origin_key
309 
310  def set_protein2_key(self, origin_key):
311  self.converter[origin_key] = self.protein2_key
312  self.backward_converter[self.protein2_key] = origin_key
313 
314  def set_residue1_key(self, origin_key):
315  self.converter[origin_key] = self.residue1_key
316  self.backward_converter[self.residue1_key] = origin_key
317 
318  def set_residue2_key(self, origin_key):
319  self.converter[origin_key] = self.residue2_key
320  self.backward_converter[self.residue2_key] = origin_key
321 
322  def set_residue1_amino_acid_key(self, origin_key):
323  self.converter[origin_key] = self.residue1_amino_acid_key
324  self.backward_converter[self.residue1_amino_acid_key] = origin_key
325 
326  def set_residue2_amino_acid_key(self, origin_key):
327  self.converter[origin_key] = self.residue2_amino_acid_key
328  self.backward_converter[self.residue2_amino_acid_key] = origin_key
329 
330  def set_residue1_moiety_key(self, origin_key):
331  self.converter[origin_key] = self.residue1_moiety_key
332  self.backward_converter[self.residue1_moiety_key] = origin_key
333 
334  def set_residue2_moiety_key(self, origin_key):
335  self.converter[origin_key] = self.residue2_moiety_key
336  self.backward_converter[self.residue2_moiety_key] = origin_key
337 
338  def set_site_pairs_key(self, origin_key):
339  self.converter[origin_key] = self.site_pairs_key
340  self.backward_converter[self.site_pairs_key] = origin_key
341 
342  def set_id_score_key(self, origin_key):
343  self.converter[origin_key] = self.id_score_key
344  self.backward_converter[self.id_score_key] = origin_key
345 
346  def set_fdr_key(self, origin_key):
347  self.converter[origin_key] = self.fdr_key
348  self.backward_converter[self.fdr_key] = origin_key
349 
350  def set_quantitation_key(self, origin_key):
351  self.converter[origin_key] = self.quantitation_key
352  self.backward_converter[self.quantitation_key] = origin_key
353 
354  def set_psi_key(self, origin_key):
355  self.converter[origin_key] = self.psi_key
356  self.backward_converter[self.psi_key] = origin_key
357 
358  def set_link_type_key(self, link_type_key):
359  self.converter[link_type_key] = self.link_type_key
360  self.backward_converter[self.link_type_key] = link_type_key
361 
362  def get_converter(self):
363  '''
364  Returns the dictionary that convert the old keywords to the new ones
365  '''
366  self.check_keys()
367  return self.converter
368 
370  '''
371  Returns the dictionary that convert the new keywords to the old ones
372  '''
373  self.check_keys()
374  return self.backward_converter
375 
376 
377 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
378  '''
379  A class to handle different styles of site pairs parsers.
380  Implemented styles:
381  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
382  [Y3-;K4-] for dead-ends
383  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
384  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
385  LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
386  ''' # noqa: E501
387 
388  import re
389 
390  def __init__(self, style):
391  _CrossLinkDataBaseStandardKeys.__init__(self)
392  if style == "MSSTUDIO":
393  self.style = style
394  self.compulsory_keys = set([self.protein1_key,
395  self.protein2_key,
396  self.site_pairs_key])
397  elif style == "XTRACT" or style == "QUANTITATION":
398  self.style = style
399  self.compulsory_keys = set([self.site_pairs_key])
400  elif style == "LAN_HUANG":
401  self.style = style
402  self.compulsory_keys = set([self.site_pairs_key])
403  else:
404  raise Exception("ResiduePairListParser: unknown list parser style")
405 
406  def get_compulsory_keys(self):
407  return self.compulsory_keys
408 
409  def get_list(self, input_string):
410  '''
411  This function returns a list of cross-linked residues and the
412  corresponding list of cross-linked chains. The latter list can be
413  empty, if the style doesn't have the corresponding information.
414  '''
415  if self.style == "MSSTUDIO":
416  input_string = input_string.replace("[", "")
417  input_string = input_string.replace("]", "")
418  input_string_pairs = input_string.split(";")
419  residue_pair_indexes = []
420  chain_pair_indexes = []
421  for s in input_string_pairs:
422  m1 = self.re.search(
423  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
424  r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
425  s)
426  m2 = self.re.search(
427  r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
428  if m1:
429  # cross-link
430  (residue_type_1, residue_index_1, residue_type_2,
431  residue_index_2) = m1.group(1, 2, 3, 4)
432  residue_pair_indexes.append((residue_index_1,
433  residue_index_2))
434  elif m2:
435  # dead end
436  residue_type_1, residue_index_1 = m2.group(1, 2)
437  # at this stage chain_pair_indexes is empty
438  return residue_pair_indexes, chain_pair_indexes
439  if self.style == "XTRACT" or self.style == "QUANTITATION":
440  if ":x:" in input_string:
441  # if it is a cross-link....
442  input_strings = input_string.split(":x:")
443  first_peptides = input_strings[0].split(":|:")
444  second_peptides = input_strings[1].split(":|:")
445  first_peptides_identifiers = [
446  (f.split(":")[0],
447  f.split(":")[1]) for f in first_peptides]
448  second_peptides_identifiers = [
449  (f.split(":")[0],
450  f.split(":")[1]) for f in second_peptides]
451  residue_pair_indexes = []
452  chain_pair_indexes = []
453  for fpi in first_peptides_identifiers:
454  for spi in second_peptides_identifiers:
455  chain1 = fpi[0]
456  chain2 = spi[0]
457  residue1 = fpi[1]
458  residue2 = spi[1]
459  residue_pair_indexes.append((residue1, residue2))
460  chain_pair_indexes.append((chain1, chain2))
461  return residue_pair_indexes, chain_pair_indexes
462  else:
463  # if it is a monolink....
464  first_peptides = input_string.split(":|:")
465  first_peptides_identifiers = [
466  (f.split(":")[0], f.split(":")[1]) for f in first_peptides]
467  residue_indexes = []
468  chain_indexes = []
469  for fpi in first_peptides_identifiers:
470  chain1 = fpi[0]
471  residue1 = fpi[1]
472  residue_indexes.append((residue1,))
473  chain_indexes.append((chain1,))
474  return residue_indexes, chain_indexes
475  if self.style == "LAN_HUANG":
476  input_strings = input_string.split("-")
477  chain1, first_series = input_strings[0].split(":")
478  chain2, second_series = input_strings[1].split(":")
479 
480  first_residues = first_series.replace(";", "|").split("|")
481  second_residues = second_series.replace(";", "|").split("|")
482  residue_pair_indexes = []
483  chain_pair_indexes = []
484  for fpi in first_residues:
485  for spi in second_residues:
486  residue1 = self.re.sub("[^0-9]", "", fpi)
487  residue2 = self.re.sub("[^0-9]", "", spi)
488  residue_pair_indexes.append((residue1, residue2))
489  chain_pair_indexes.append((chain1, chain2))
490  return residue_pair_indexes, chain_pair_indexes
491 
492 
493 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
494  '''
495  A class to handle different XL format with fixed format
496  currently support ProXL
497  '''
498  def __init__(self, format):
499  _CrossLinkDataBaseStandardKeys.__init__(self)
500  if format == "PROXL":
501  self.format = format
502  else:
503  raise Exception("FixedFormatParser: unknown list format name")
504 
505  def get_data(self, input_string):
506  if self.format == "PROXL":
507  tokens = input_string.split("\t")
508  xl = {}
509  if tokens[0] == "SEARCH ID(S)":
510  return None
511  else:
512  xl[self.protein1_key] = tokens[2]
513  xl[self.protein2_key] = tokens[4]
514  xl[self.residue1_key] = int(tokens[3])
515  xl[self.residue2_key] = int(tokens[5])
516  return xl
517 
518 
519 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
520  '''
521  this class handles a cross-link dataset and do filtering
522  operations, adding cross-links, merge datasets...
523  '''
524 
525  def __init__(self, converter=None, data_base=None, fasta_seq=None,
526  linkable_aa=('K',)):
527  '''
528  Constructor.
529  @param converter an instance of CrossLinkDataBaseKeywordsConverter
530  @param data_base an instance of CrossLinkDataBase to build the
531  new database on
532  @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
533  protein fasta sequences to check cross-link consistency.
534  If not given consistency will not be checked
535  @param linkable_aa a tuple containing one-letter amino acids which
536  are linkable by the cross-linker; only used if the database
537  DOES NOT provide a value for a certain residueX_amino_acid_key
538  and if a fasta_seq is given
539  '''
540 
541  if data_base is None:
542  self.data_base = {}
543  else:
544  self.data_base = data_base
545 
546  _CrossLinkDataBaseStandardKeys.__init__(self)
547  if converter is not None:
548  self.cldbkc = converter # type: CrossLinkDataBaseKeywordsConverter
549  self.list_parser = self.cldbkc.rplp
550  self.converter = converter.get_converter()
551 
552  else:
553  self.cldbkc = None # type: CrossLinkDataBaseKeywordsConverter
554  self.list_parser = None
555  self.converter = None
556 
557  # default amino acids considered to be 'linkable' if none are given
558  self.def_aa_tuple = linkable_aa
559  self.fasta_seq = fasta_seq # type: IMP.pmi.topology.Sequences
560  self.dataset = None
561  self.name = None
562  self._update()
563 
564  def _update(self):
565  '''
566  Update the whole dataset after changes
567  '''
568  self.update_cross_link_unique_sub_index()
569  self.update_cross_link_redundancy()
570  self.update_residues_links_number()
572 
573  def __iter__(self):
574  sorted_ids = sorted(self.data_base.keys())
575  for k in sorted_ids:
576  for xl in self.data_base[k]:
577  yield xl
578 
579  def xlid_iterator(self):
580  sorted_ids = sorted(self.data_base.keys())
581  for xlid in sorted_ids:
582  yield xlid
583 
584  def __getitem__(self, xlid):
585  return self.data_base[xlid]
586 
587  def __len__(self):
588  return len([xl for xl in self])
589 
590  def get_name(self):
591  return self.name
592 
593  def set_name(self, name):
594  new_data_base = {}
595  for k in self.data_base:
596  new_data_base[k + "." + name] = self.data_base[k]
597  self.data_base = new_data_base
598  self.name = name
599  self._update()
600 
601  def get_number_of_xlid(self):
602  return len(self.data_base)
603 
604  def create_set_from_file(self, file_name, converter=None,
605  FixedFormatParser=None, encoding=None):
606  '''
607  if FixedFormatParser is not specified, the file is
608  comma-separated-values
609 
610  @param file_name a txt file to be parsed
611  @param converter an instance of CrossLinkDataBaseKeywordsConverter
612  @param FixedFormatParser a parser for a fixed format
613  @param encoding the encoding of the file, if not the locale
614  default (usually UTF-8)
615  '''
616  if not FixedFormatParser:
617  xl_list = IMP.pmi.tools.get_db_from_csv(file_name, encoding)
618 
619  if converter is not None:
620  self.cldbkc = converter
621  self.list_parser = self.cldbkc.rplp
622  self.converter = converter.get_converter()
623 
624  if not self.list_parser:
625  # normal procedure without a list_parser
626  # each line is a cross-link
627  new_xl_dict = {}
628  for nxl, xl in enumerate(xl_list):
629  new_xl = {}
630  for k in xl:
631  if k in self.converter:
632  new_xl[self.converter[k]] = \
633  self.type[self.converter[k]](xl[k])
634  else:
635  new_xl[k] = xl[k]
636  if self.unique_id_key in self.cldbkc.get_setup_keys():
637  if new_xl[self.unique_id_key] not in new_xl_dict:
638  new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
639  else:
640  new_xl_dict[new_xl[self.unique_id_key]].append(
641  new_xl)
642  else:
643  if str(nxl) not in new_xl_dict:
644  new_xl_dict[str(nxl)] = [new_xl]
645  else:
646  new_xl_dict[str(nxl)].append(new_xl)
647 
648  else:
649  # with a list_parser, a line can be a list of ambiguous
650  # cross-links
651  new_xl_dict = {}
652  for nxl, entry in enumerate(xl_list):
653 
654  # first get the translated keywords
655  new_dict = {}
656  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
657  raise Exception(
658  "CrossLinkDataBase: expecting a site_pairs_key "
659  "for the site pair list parser")
660  for k in entry:
661  if k in self.converter:
662  new_dict[self.converter[k]] = \
663  self.type[self.converter[k]](entry[k])
664  else:
665  new_dict[k] = entry[k]
666 
667  (residue_pair_list,
668  chain_pair_list) = self.list_parser.get_list(
669  new_dict[self.site_pairs_key])
670 
671  # then create the cross-links
672  for n, p in enumerate(residue_pair_list):
673  is_monolink = False
674  if len(p) == 1:
675  is_monolink = True
676 
677  new_xl = {}
678  for k in new_dict:
679  new_xl[k] = new_dict[k]
680  new_xl[self.residue1_key] = \
681  self.type[self.residue1_key](p[0])
682  if not is_monolink:
683  new_xl[self.residue2_key] = \
684  self.type[self.residue2_key](p[1])
685 
686  if len(chain_pair_list) == len(residue_pair_list):
687  new_xl[self.protein1_key] = \
688  self.type[self.protein1_key](
689  chain_pair_list[n][0])
690  if not is_monolink:
691  new_xl[self.protein2_key] = \
692  self.type[self.protein2_key](
693  chain_pair_list[n][1])
694 
695  if not is_monolink:
696  new_xl[self.link_type_key] = "CROSSLINK"
697  else:
698  new_xl[self.link_type_key] = "MONOLINK"
699 
700  if self.unique_id_key in self.cldbkc.get_setup_keys():
701  if new_xl[self.unique_id_key] not in new_xl_dict:
702  new_xl_dict[new_xl[self.unique_id_key]] = \
703  [new_xl]
704  else:
705  new_xl_dict[new_xl[self.unique_id_key]].append(
706  new_xl)
707  else:
708  if str(nxl) not in new_xl_dict:
709  new_xl[self.unique_id_key] = str(nxl+1)
710  new_xl_dict[str(nxl)] = [new_xl]
711  else:
712  new_xl[self.unique_id_key] = str(nxl+1)
713  new_xl_dict[str(nxl)].append(new_xl)
714 
715  else:
716  '''
717  if FixedFormatParser is defined
718  '''
719 
720  new_xl_dict = {}
721  nxl = 0
722  with open(file_name, "r", encoding=encoding) as f:
723  for line in f:
724  xl = FixedFormatParser.get_data(line)
725  if xl:
726  xl[self.unique_id_key] = str(nxl+1)
727  new_xl_dict[str(nxl)] = [xl]
728  nxl += 1
729 
730  self.data_base = new_xl_dict
731  self.name = file_name
732  loc = ihm.location.InputFileLocation(file_name, details='Crosslinks')
733  self.dataset = ihm.dataset.CXMSDataset(loc)
734  self._update()
735 
736  def update_cross_link_unique_sub_index(self):
737  for k in self.data_base:
738  for n, xl in enumerate(self.data_base[k]):
739  xl[self.ambiguity_key] = len(self.data_base[k])
740  xl[self.unique_sub_index_key] = n+1
741  xl[self.unique_sub_id_key] = k + "." + str(n+1)
742 
743  def update_cross_link_redundancy(self):
744  redundancy_data_base = {}
745  for xl in self:
746  pra = _ProteinsResiduesArray(xl)
747  if pra not in redundancy_data_base:
748  redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
749  redundancy_data_base[pra.get_inverted()] = \
750  [xl[self.unique_sub_id_key]]
751  else:
752  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
753  redundancy_data_base[pra.get_inverted()].append(
754  xl[self.unique_sub_id_key])
755  for xl in self:
756  pra = _ProteinsResiduesArray(xl)
757  xl[self.redundancy_key] = len(redundancy_data_base[pra])
758  xl[self.redundancy_list_key] = redundancy_data_base[pra]
759 
760  def update_residues_links_number(self):
761  residue_links = {}
762  for xl in self:
763  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
764  if (p1, r1) not in residue_links:
765  residue_links[(p1, r1)] = set([(p2, r2)])
766  else:
767  residue_links[(p1, r1)].add((p2, r2))
768  if (p2, r2) not in residue_links:
769  residue_links[(p2, r2)] = set([(p1, r1)])
770  else:
771  residue_links[(p2, r2)].add((p1, r1))
772 
773  for xl in self:
774  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
775  xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
776  xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
777 
779  """This function checks the consistency of the dataset with the
780  amino acid sequence"""
781  if self.cldbkc and self.fasta_seq:
782  cnt_matched, cnt_matched_file = 0, 0
783  matched = {}
784  non_matched = {}
785  for xl in self:
786  (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
787  b_matched_file = False
788  if self.residue1_amino_acid_key in xl:
789  # either you know the residue type and aa_tuple is
790  # a single entry
791  aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
792  b_matched = self._match_xlinks(p1, r1, aa_from_file)
793  b_matched_file = b_matched
794  else:
795  # or pass the possible list of types that can be
796  # cross-linked
797  b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
798 
799  matched, non_matched = self._update_matched_xlinks(
800  b_matched, p1, r1, matched, non_matched)
801 
802  if self.residue2_amino_acid_key in xl:
803  aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
804  b_matched = self._match_xlinks(p2, r2, aa_from_file)
805  b_matched_file = b_matched
806  else:
807  b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
808 
809  matched, non_matched = self._update_matched_xlinks(
810  b_matched, p2, r2, matched, non_matched)
811  if b_matched:
812  cnt_matched += 1
813  if b_matched_file:
814  cnt_matched_file += 1
815  if len(self) > 0:
816  percentage_matched = round(100*cnt_matched/len(self), 1)
817  percentage_matched_file = round(
818  100 * cnt_matched_file / len(self), 1)
819  if matched or non_matched:
820  print(
821  "check_cross_link_consistency: Out of %d cross-links "
822  "%d were matched to the fasta sequence (%f %%).\n "
823  "%d were matched by using the cross-link file (%f %%)."
824  % (len(self), cnt_matched, percentage_matched,
825  cnt_matched_file, percentage_matched_file))
826  if non_matched:
827  print("check_cross_link_consistency: Warning: Non "
828  "matched xlinks:",
829  [(prot_name, sorted(list(non_matched[prot_name])))
830  for prot_name in non_matched])
831  return matched, non_matched
832 
833  def _match_xlinks(self, prot_name, res_index, aa_tuple):
834  # returns Boolean whether given aa matches a position in the fasta file
835  # cross-link files usually start counting at 1 and not 0; therefore
836  # subtract -1 to compare with fasta
837  amino_dict = IMP.pmi.alphabets.amino_acid
838  res_index -= 1
839  for amino_acid in aa_tuple:
840  if len(amino_acid) == 3:
841  amino_acid = amino_dict.get_one_letter_code_from_residue_type(
842  amino_acid.upper())
843  if prot_name in self.fasta_seq.sequences:
844  seq = self.fasta_seq.sequences[prot_name]
845  # if we are looking at the first amino acid in a given
846  # sequence always return a match
847  # the first aa should always be the n-terminal aa
848  # which may form a cross-link in any case (for BS3-like
849  # cross-linkers)
850  # for some data sets the first aa is at position 1;
851  # todo: check why this is the case
852  if res_index == 0 or res_index == 1:
853  return True
854  if res_index < len(seq):
855  if amino_acid == seq[res_index]:
856  return True
857  return False
858 
859  def _update_matched_xlinks(self, b_matched, prot, res, matched,
860  non_matched):
861  if b_matched:
862  if prot in matched:
863  matched[prot].add(res)
864  else:
865  matched[prot] = set([res])
866  else:
867  if prot in non_matched:
868  non_matched[prot].add(res)
869  else:
870  non_matched[prot] = set([res])
871  return matched, non_matched
872 
873  def get_cross_link_string(self, xl):
874  string = '|'
875  for k in self.ordered_key_list:
876  try:
877  string += str(k) + ":" + str(xl[k]) + "|"
878  except KeyError:
879  continue
880 
881  for k in xl:
882  if k not in self.ordered_key_list:
883  string += str(k) + ":" + str(xl[k]) + "|"
884 
885  return string
886 
887  def get_short_cross_link_string(self, xl):
888  string = '|'
889  list_of_keys = [self.data_set_name_key,
890  self.unique_sub_id_key,
891  self.protein1_key,
892  self.residue1_key,
893  self.protein2_key,
894  self.residue2_key,
895  self.state_key,
896  self.psi_key]
897 
898  for k in list_of_keys:
899  try:
900  string += str(xl[k]) + "|"
901  except KeyError:
902  continue
903 
904  return string
905 
906  def filter(self, FilterOperator):
907  new_xl_dict = {}
908  for id in self.data_base.keys():
909  for xl in self.data_base[id]:
910  if FilterOperator.evaluate(xl):
911  if id not in new_xl_dict:
912  new_xl_dict[id] = [xl]
913  else:
914  new_xl_dict[id].append(xl)
915  self._update()
916  cdb = CrossLinkDataBase(self.cldbkc, new_xl_dict)
917  cdb.dataset = self.dataset
918  cdb.name = self.name
919  return cdb
920 
921  def filter_score(self, score):
922  '''Get all cross-links with score greater than an input value'''
923  FilterOperator = IMP.pmi.io.crosslink.FilterOperator(
924  self.id_score_key, operator.gt, score)
925  return self.filter(FilterOperator)
926 
927  def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
928  '''
929  This function merges two cross-link datasets so that if two
930  conflicting cross-links have the same cross-link UniqueIDS,
931  the cross-links will be appended under the same UniqueID slots
932  with different SubIDs
933  '''
934  raise NotImplementedError()
935 
936  def append_database(self, db):
937  """Append cross-link dataset to this one."""
938  new_data_base = {}
939  for k in self.data_base:
940  new_data_base[k] = self.data_base[k]
941  for k in db.data_base:
942  new_data_base[k] = db.data_base[k]
943  self.data_base = new_data_base
944  self._update()
945 
946  def __iadd__(self, db):
947  self.append_database(db)
948  return self
949 
950  def set_value(self, key, new_value, filter_operator=None):
951  '''
952  This function changes the value for a given key in the database
953  For instance one can change the name of a protein
954  @param key: the key in the database that must be changed
955  @param new_value: the new value of the key
956  @param filter_operator: optional FilterOperator to change the value to
957  a subset of the database
958 
959  example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
960  FO(cldb.protein1_key, operator.eq, "AAA"))`
961  '''
962 
963  for xl in self:
964  if filter_operator is not None:
965  if filter_operator.evaluate(xl):
966  xl[key] = new_value
967  else:
968  xl[key] = new_value
969  self._update()
970 
971  def get_values(self, key):
972  '''
973  this function returns the list of values for a given key in the
974  database alphanumerically sorted
975  '''
976  values = set()
977  for xl in self:
978  values.add(xl[key])
979  return sorted(list(values))
980 
981  def offset_residue_index(self, protein_name, offset):
982  '''
983  This function offset the residue indexes of a given protein by
984  a specified value
985  @param protein_name: the protein name that need to be changed
986  @param offset: the offset value
987  '''
988 
989  for xl in self:
990  if xl[self.protein1_key] == protein_name:
991  xl[self.residue1_key] = xl[self.residue1_key] + offset
992  if xl[self.protein2_key] == protein_name:
993  xl[self.residue2_key] = xl[self.residue2_key] + offset
994  self._update()
995 
996  def create_new_keyword(self, keyword, values_from_keyword=None):
997  '''
998  This function creates a new keyword for the whole database and
999  set the values from and existing keyword (optional), otherwise the
1000  values are set to None
1001  @param keyword the new keyword name:
1002  @param values_from_keyword the keyword from which we are copying
1003  the values:
1004  '''
1005  for xl in self:
1006  if values_from_keyword is not None:
1007  xl[keyword] = xl[values_from_keyword]
1008  else:
1009  xl[keyword] = None
1010  self._update()
1011 
1012  def rename_proteins(self, old_to_new_names_dictionary,
1013  protein_to_rename="both"):
1014  '''
1015  This function renames all proteins contained in the input dictionary
1016  from the old names (keys) to the new name (values)
1017  @param old_to_new_names_dictionary dictionary for converting old
1018  to new names
1019  @param protein_to_rename specify whether to rename both or protein1
1020  or protein2 only
1021  '''
1022 
1023  for old_name in old_to_new_names_dictionary:
1024  new_name = old_to_new_names_dictionary[old_name]
1025  if protein_to_rename == "both" or protein_to_rename == "protein1":
1026  fo2 = FilterOperator(self.protein1_key, operator.eq, old_name)
1027  self.set_value(self.protein1_key, new_name, fo2)
1028  if protein_to_rename == "both" or protein_to_rename == "protein2":
1029  fo2 = FilterOperator(self.protein2_key, operator.eq, old_name)
1030  self.set_value(self.protein2_key, new_name, fo2)
1031 
1032  def align_sequence(self, alignfile):
1033  """
1034  This function parses an alignment file obtained previously, to map
1035  crosslink residues onto the sequence of a homolog proteins. It is
1036  useful if you want to map the crosslinks onto a known, homolog
1037  structure without building a comparative model.
1038  The txt file of the alignment should be structured with repeating
1039  blocks, as follows:
1040 
1041  >NAME_CROSSLINKED_PROTEIN
1042  >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1043  one-letter code sequence for cross-linked protein (one line)
1044  one-letter code sequence for homolog protein with known structure (one line)
1045  """ # noqa: E501
1046 
1047  f = open(alignfile, 'r')
1048 
1049  def _assign_residues(seq1, seq2):
1050  d = {}
1051  resinds1 = []
1052  nres = 0
1053  contiguousgap = 0
1054  for r1 in seq1:
1055  if r1 != "-":
1056  nres += 1
1057  resinds1.append(nres)
1058  else:
1059  resinds1.append(None)
1060 
1061  resinds2 = [None for i in resinds1]
1062  nres = 0
1063  lastresind = 1
1064  for pos, r2 in enumerate(seq2):
1065  if r2 != "-":
1066  nres += 1
1067  resinds2[pos] = nres
1068  lastresind = nres
1069  if contiguousgap > 1:
1070  for i in range(pos-int(contiguousgap/2), pos):
1071  resinds2[i] = nres
1072  contiguousgap = 0
1073  else:
1074  contiguousgap += 1
1075  resinds2[pos] = lastresind
1076 
1077  for n, r in enumerate(resinds1):
1078  if r is not None:
1079  d[r] = resinds2[n]
1080  return d
1081 
1082  aa = True
1083  sequences1 = {}
1084  sequences2 = {}
1085  with open(alignfile, 'r') as f:
1086  while aa:
1087  aa = f.readline()
1088  bb = f.readline()
1089  cc = f.readline()
1090  dd = f.readline()
1091  protname = aa.replace(">", "").replace("\n", "")
1092  seq1 = cc.replace(",", "").replace("\n", "")
1093  seq2 = dd.replace(",", "").replace("\n", "")
1094 
1095  n1 = 0
1096  n2 = 0
1097  tmpl1 = []
1098  tmpl2 = []
1099  for i in range(len(seq1)):
1100  if seq1[i] != "-":
1101  n1 += 1
1102  c1 = n1
1103  else:
1104  c1 = None
1105  if seq2[i] != "-":
1106  n2 += 1
1107  c2 = n2
1108  else:
1109  c2 = None
1110 
1111  tmpl1.append([i, c1, seq1[i]])
1112  tmpl2.append([i, c2, seq2[i]])
1113  sequences1[protname] = tmpl1
1114  sequences2[protname] = tmpl2
1115 
1116  d = _assign_residues(seq1, seq2)
1117 
1118  fo11 = FilterOperator(self.protein1_key, operator.eq, protname)
1119  fo12 = FilterOperator(self.protein2_key, operator.eq, protname)
1120 
1121  for xl in self:
1122  if fo11.evaluate(xl):
1123  t = [item for item in sequences1[protname]
1124  if item[1] == xl[self.residue1_key]][0]
1125  sequences1[protname][t[0]] = [
1126  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1127  t = [item for item in sequences2[protname]
1128  if item[1] == d[xl[self.residue1_key]]][0]
1129  sequences2[protname][t[0]] = [
1130  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1131 
1132  xl[self.residue1_key] = d[xl[self.residue1_key]]
1133 
1134  if fo12.evaluate(xl):
1135 
1136  t = [item for item in sequences1[protname]
1137  if item[1] == xl[self.residue2_key]][0]
1138  sequences1[protname][t[0]] = [
1139  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1140  t = [item for item in sequences2[protname]
1141  if item[1] == d[xl[self.residue2_key]]][0]
1142  sequences2[protname][t[0]] = [
1143  t[0], t[1], '\033[91m' + t[2] + '\033[0m']
1144 
1145  xl[self.residue2_key] = d[xl[self.residue2_key]]
1146 
1147  print("")
1148  print(aa.replace("\n", ""))
1149  print("".join([seq[2] for seq in sequences1[protname]]))
1150  print(bb.replace("\n", ""))
1151  print("".join([seq[2] for seq in sequences2[protname]]))
1152 
1153  def classify_crosslinks_by_score(self, number_of_classes):
1154  '''
1155  This function creates as many classes as in the input
1156  (number_of_classes: integer) and partition cross-links according
1157  to their identification scores. Classes are defined in the psi key.
1158  '''
1159 
1160  if self.id_score_key is not None:
1161  scores = self.get_values(self.id_score_key)
1162  else:
1163  raise ValueError(
1164  'The cross-link database does not contain score values')
1165  minscore = min(scores)
1166  maxscore = max(scores)
1167  scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1168  if self.psi_key is None:
1169  self.create_new_keyword(self.psi_key, values_from_keyword=None)
1170  for xl in self:
1171  score = xl[self.id_score_key]
1172  for n, classmin in enumerate(scoreclasses[0:-1]):
1173  if score >= classmin and score <= scoreclasses[n+1]:
1174  xl[self.psi_key] = str("CLASS_"+str(n))
1175  self._update()
1176 
1177  def clone_protein(self, protein_name, new_protein_name):
1178  for id in self.data_base.keys():
1179  new_data_base = []
1180  for xl in self.data_base[id]:
1181  new_data_base.append(xl)
1182  if xl[self.protein1_key] == protein_name \
1183  and xl[self.protein2_key] != protein_name:
1184  new_xl = dict(xl)
1185  new_xl[self.protein1_key] = new_protein_name
1186  new_data_base.append(new_xl)
1187  elif xl[self.protein1_key] != protein_name \
1188  and xl[self.protein2_key] == protein_name:
1189  new_xl = dict(xl)
1190  new_xl[self.protein2_key] = new_protein_name
1191  new_data_base.append(new_xl)
1192  elif xl[self.protein1_key] == protein_name \
1193  and xl[self.protein2_key] == protein_name:
1194  new_xl = dict(xl)
1195  new_xl[self.protein1_key] = new_protein_name
1196  new_data_base.append(new_xl)
1197  new_xl = dict(xl)
1198  new_xl[self.protein2_key] = new_protein_name
1199  new_data_base.append(new_xl)
1200  new_xl = dict(xl)
1201  new_xl[self.protein1_key] = new_protein_name
1202  new_xl[self.protein2_key] = new_protein_name
1203  new_data_base.append(new_xl)
1204  self.data_base[id] = new_data_base
1205  self._update()
1206 
1208  '''
1209  This function remove cross-links applied to the same residue
1210  (ie, same chain name and residue number)
1211  '''
1212  for id in self.data_base.keys():
1213  new_data_base = []
1214  for xl in self.data_base[id]:
1215  if xl[self.protein1_key] == xl[self.protein2_key] \
1216  and xl[self.residue1_key] == xl[self.residue2_key]:
1217  continue
1218  else:
1219  new_data_base.append(xl)
1220  self.data_base[id] = new_data_base
1221  self._update()
1222 
1223  def jackknife(self, percentage):
1224  '''
1225  this method returns a CrossLinkDataBase class containing
1226  a random subsample of the original cross-link database.
1227  @param percentage float between 0 and 1, is the percentage of
1228  of spectra taken from the original list
1229  '''
1230  import random
1231  if percentage > 1.0 or percentage < 0.0:
1232  raise ValueError('the percentage of random cross-link spectra '
1233  'should be between 0 and 1')
1234  nspectra = self.get_number_of_xlid()
1235  nrandom_spectra = int(nspectra*percentage)
1236  random_keys = random.sample(sorted(self.data_base.keys()),
1237  nrandom_spectra)
1238  new_data_base = {}
1239  for k in random_keys:
1240  new_data_base[k] = self.data_base[k]
1241  return CrossLinkDataBase(self.cldbkc, new_data_base)
1242 
1243  def __str__(self):
1244  outstr = ''
1245  sorted_ids = sorted(self.data_base.keys())
1246 
1247  for id in sorted_ids:
1248  outstr += id + "\n"
1249  for xl in self.data_base[id]:
1250  for k in self.ordered_key_list:
1251  try:
1252  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1253  except KeyError:
1254  continue
1255 
1256  for k in xl:
1257  if k not in self.ordered_key_list:
1258  outstr += "--- " + str(k) + " " + str(xl[k]) + "\n"
1259  outstr += "-------------\n"
1260  return outstr
1261 
1262  def plot(self, filename, **kwargs):
1263  import matplotlib
1264  matplotlib.use('Agg')
1265  import matplotlib.pyplot as plt
1266  import matplotlib.colors
1267 
1268  if kwargs["type"] == "scatter":
1269  cmap = plt.get_cmap("rainbow")
1270  _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1271  xkey = kwargs["xkey"]
1272  ykey = kwargs["ykey"]
1273  if "colorkey" in kwargs:
1274  colorkey = kwargs["colorkey"]
1275  if "logyscale" in kwargs:
1276  logyscale = kwargs["logyscale"]
1277  else:
1278  logyscale = False
1279  xs = []
1280  ys = []
1281  colors = []
1282  for xl in self:
1283  try:
1284  xs.append(float(xl[xkey]))
1285  if logyscale:
1286  ys.append(math.log10(float(xl[ykey])))
1287  else:
1288  ys.append(float(xl[ykey]))
1289  colors.append(float(xl[colorkey]))
1290  except ValueError:
1291  print("CrossLinkDataBase.plot: Value error for "
1292  "cross-link %s" % (xl[self.unique_id_key]))
1293  continue
1294 
1295  cs = []
1296  for color in colors:
1297  cindex = (color-min(colors))/(max(colors)-min(colors))
1298  cs.append(cmap(cindex))
1299 
1300  fig = plt.figure()
1301  ax = fig.add_subplot(111)
1302  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker="o")
1303  ax.set_xlabel(xkey)
1304  ax.set_ylabel(ykey)
1305  plt.savefig(filename)
1306  plt.show()
1307  plt.close()
1308 
1309  if kwargs["type"] == "residue_links":
1310  # plot the number of distinct links to a residue
1311  # in an histogram
1312  # molecule name
1313  molecule = kwargs["molecule"]
1314  if type(molecule) is IMP.pmi.topology.Molecule:
1315  length = len(molecule.sequence)
1316  molecule = molecule.get_name()
1317  else:
1318  # you need a IMP.pmi.topology.Sequences object
1319  sequences_object = kwargs["sequences_object"]
1320  sequence = sequences_object.sequences[molecule]
1321  length = len(sequence)
1322 
1323  histogram = [0]*length
1324  for xl in self:
1325  if xl[self.protein1_key] == molecule:
1326  histogram[xl[self.residue1_key]-1] = \
1327  xl[self.residue1_links_number_key]
1328  if xl[self.protein2_key] == molecule:
1329  histogram[xl[self.residue2_key]-1] = \
1330  xl[self.residue2_links_number_key]
1331  plt.bar(range(1, length+1), histogram)
1332  plt.savefig(filename)
1333  plt.show()
1334  plt.close()
1335 
1336  if kwargs["type"] == "histogram":
1337  valuekey = kwargs["valuekey"]
1338  valuename = valuekey
1339  bins = kwargs["bins"]
1340  values_list = []
1341  for xl in self:
1342  try:
1343  values_list.append(float(xl[valuekey]))
1344  except ValueError:
1345  print("CrossLinkDataBase.plot: Value error for cross-link "
1346  "%s" % (xl[self.unique_id_key]))
1347  continue
1349  filename, [values_list], valuename=valuename, bins=bins,
1350  colors=None, format="pdf",
1351  reference_xline=None, yplotrange=None,
1352  xplotrange=None, normalized=True,
1353  leg_names=None)
1354 
1355  def dump(self, json_filename):
1356  import json
1357  with open(json_filename, 'w') as fp:
1358  json.dump(self.data_base, fp, sort_keys=True, indent=2,
1359  default=set_json_default)
1360 
1361  def load(self, json_filename):
1362  import json
1363  with open(json_filename, 'r') as fp:
1364  self.data_base = json.load(fp)
1365  self._update()
1366 
1367  def save_csv(self, filename):
1368  import csv
1369 
1370  data = []
1371  sorted_ids = None
1372  sorted_group_ids = sorted(self.data_base.keys())
1373  for group in sorted_group_ids:
1374  group_block = []
1375  for xl in self.data_base[group]:
1376  if not sorted_ids:
1377  sorted_ids = sorted(xl.keys())
1378  values = [xl[k] for k in sorted_ids]
1379  group_block.append(values)
1380  data += group_block
1381 
1382  with open(filename, 'w') as fp:
1383  a = csv.writer(fp, delimiter=',')
1384  a.writerow(sorted_ids)
1385  a.writerows(data)
1386 
1388  """
1389  Returns the number of non redundant cross-link sites
1390  """
1391  praset = set()
1392  for xl in self:
1393  pra = _ProteinsResiduesArray(xl)
1394  prai = pra.get_inverted()
1395  praset.add(pra)
1396  praset.add(prai)
1397  return len(list(praset))
1398 
1399 
1401  """This class allows to compute and plot the distance between datasets"""
1402 
1403  def __init__(self, cldb_dictionary):
1404  """Input a dictionary where keys are cldb names and values are cldbs"""
1405  import scipy.spatial.distance
1406  self.dbs = cldb_dictionary
1407  self.keylist = list(self.dbs.keys())
1408  self.distances = list()
1409 
1410  for i, key1 in enumerate(self.keylist):
1411  for key2 in self.keylist[i+1:]:
1412  distance = self.get_distance(key1, key2)
1413  self.distances.append(distance)
1414 
1415  self.distances = scipy.spatial.distance.squareform(self.distances)
1416 
1417  def get_distance(self, key1, key2):
1418  return 1.0 - self.jaccard_index(self.dbs[key1], self.dbs[key2])
1419 
1420  def jaccard_index(self, CrossLinkDataBase1, CrossLinkDataBase2):
1421  """Similarity index between two datasets
1422  https://en.wikipedia.org/wiki/Jaccard_index"""
1423 
1424  set1 = set()
1425  set2 = set()
1426  for xl1 in CrossLinkDataBase1:
1427  a1f = _ProteinsResiduesArray(xl1)
1428  a1b = a1f.get_inverted()
1429  set1.add(a1f)
1430  set1.add(a1b)
1431  for xl2 in CrossLinkDataBase2:
1432  a2f = _ProteinsResiduesArray(xl2)
1433  a2b = a2f.get_inverted()
1434  set2.add(a2f)
1435  set2.add(a2b)
1436  return (float(len(set1 & set2)/2)
1437  / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1438 
1439  def plot_matrix(self, figurename="clustermatrix.pdf"):
1440  import matplotlib as mpl
1441  import numpy
1442  mpl.use('Agg')
1443  import matplotlib.pylab as pl
1444  from scipy.cluster import hierarchy as hrc
1445 
1446  raw_distance_matrix = self.distances
1447  labels = self.keylist
1448 
1449  fig = pl.figure()
1450 
1451  ax = fig.add_subplot(1, 1, 1)
1452  dendrogram = hrc.dendrogram(
1453  hrc.linkage(raw_distance_matrix),
1454  color_threshold=7,
1455  no_labels=True)
1456  leaves_order = dendrogram['leaves']
1457  ax.set_xlabel('Dataset')
1458  ax.set_ylabel('Jaccard Distance')
1459  pl.tight_layout()
1460  pl.savefig("dendrogram."+figurename, dpi=300)
1461  pl.close(fig)
1462 
1463  fig = pl.figure()
1464 
1465  ax = fig.add_subplot(1, 1, 1)
1466  cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1467  interpolation='nearest')
1468  cb = fig.colorbar(cax)
1469  cb.set_label('Jaccard Distance')
1470  ax.set_xlabel('Dataset')
1471  ax.set_ylabel('Dataset')
1472  ax.set_xticks(range(len(labels)))
1473  ax.set_xticklabels(numpy.array(labels)[leaves_order],
1474  rotation='vertical')
1475  ax.set_yticks(range(len(labels)))
1476  ax.set_yticklabels(numpy.array(labels)[leaves_order],
1477  rotation='horizontal')
1478  pl.tight_layout()
1479  pl.savefig("matrix." + figurename, dpi=300)
1480  pl.close(fig)
1481 
1482 
1484  '''
1485  This class maps a CrossLinkDataBase on a given structure
1486  and save an rmf file with color-coded cross-links
1487  '''
1488 
1489  def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1490  self.CrossLinkDataBase = CrossLinkDataBase
1491  if type(rmf_or_stat_handler) is IMP.pmi.output.RMFHierarchyHandler or \
1492  type(rmf_or_stat_handler) is IMP.pmi.output.StatHierarchyHandler:
1493  self.prots = rmf_or_stat_handler
1494  self.distances = defaultdict(list)
1495  self.array_to_id = {}
1496  self.id_to_array = {}
1497 
1498  print("computing distances for all cross-links and all structures")
1499  for i in self.prots[::10]:
1500  self.compute_distances()
1501  for key, xl in enumerate(self.CrossLinkDataBase):
1502  array = _ProteinsResiduesArray(xl)
1503  self.array_to_id[array] = key
1504  self.id_to_array[key] = array
1505  if xl["MinAmbiguousDistance"] != 'None':
1506  self.distances[key].append(xl["MinAmbiguousDistance"])
1507 
1508  def compute_distances(self):
1509  data = []
1510  sorted_ids = None
1511  sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1512  for group in sorted_group_ids:
1513  group_dists = []
1514  for xl in self.CrossLinkDataBase.data_base[group]:
1515  if not sorted_ids:
1516  sorted_ids = sorted(xl.keys())
1517  data.append(
1518  sorted_ids
1519  + ["UniqueID", "Distance", "MinAmbiguousDistance"])
1520  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1521  try:
1522  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1523  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1524  except: # noqa: E722
1525  mdist = "None"
1526  state1 = "None"
1527  copy1 = "None"
1528  state2 = "None"
1529  copy2 = "None"
1530  try:
1531  # sometimes keys get "lost" in the database,
1532  # not really sure why
1533  values = [xl[k] for k in sorted_ids]
1534  values += [group, mdist]
1535  except KeyError as e:
1536  print("MapCrossLinkDataBaseOnStructure "
1537  "KeyError: {0} in {1}".format(e, xl))
1538  group_dists.append(mdist)
1539  xl["Distance"] = mdist
1540  xl["State1"] = state1
1541  xl["Copy1"] = copy1
1542  xl["State2"] = state2
1543  xl["Copy2"] = copy2
1544 
1545  for xl in self.CrossLinkDataBase.data_base[group]:
1546  xl["MinAmbiguousDistance"] = min(group_dists)
1547 
1548  def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1549  '''more robust and slower version of above'''
1550  sel = IMP.atom.Selection(self.prots, molecule=c1, residue_index=r1,
1551  resolution=1)
1552  selpart_1 = sel.get_selected_particles()
1553  if len(selpart_1) == 0:
1554  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1555  "selected for first site")
1556  return None
1557  sel = IMP.atom.Selection(self.prots, molecule=c2, residue_index=r2,
1558  resolution=1)
1559  selpart_2 = sel.get_selected_particles()
1560  if len(selpart_2) == 0:
1561  print("MapCrossLinkDataBaseOnStructure: Warning: no particle "
1562  "selected for second site")
1563  return None
1564  results = []
1565  for p1 in selpart_1:
1566  for p2 in selpart_2:
1567  if p1 == p2 and r1 == r2:
1568  continue
1569  d1 = IMP.core.XYZ(p1)
1570  d2 = IMP.core.XYZ(p2)
1571  # round distance to second decimal
1572  dist = float(int(IMP.core.get_distance(d1, d2)*100.0))/100.0
1573  h1 = IMP.atom.Hierarchy(p1)
1574  while not IMP.atom.Molecule.get_is_setup(h1.get_particle()):
1575  h1 = h1.get_parent()
1576  copy_index1 = IMP.atom.Copy(h1).get_copy_index()
1577  while not IMP.atom.State.get_is_setup(h1.get_particle()):
1578  h1 = h1.get_parent()
1579  state_index1 = IMP.atom.State(h1).get_state_index()
1580  h2 = IMP.atom.Hierarchy(p2)
1581  while not IMP.atom.Molecule.get_is_setup(h2.get_particle()):
1582  h2 = h2.get_parent()
1583  copy_index2 = IMP.atom.Copy(h2).get_copy_index()
1584  while not IMP.atom.State.get_is_setup(h2.get_particle()):
1585  h2 = h2.get_parent()
1586  state_index2 = IMP.atom.State(h2).get_state_index()
1587 
1588  results.append((dist, state_index1, copy_index1,
1589  state_index2, copy_index2, p1, p2))
1590  if len(results) == 0:
1591  return None
1592  results_sorted = sorted(results,
1593  key=operator.itemgetter(0, 1, 2, 3, 4))
1594  return ((results_sorted[0][0],
1595  results_sorted[0][5],
1596  results_sorted[0][6]),
1597  (results_sorted[0][1],
1598  results_sorted[0][2],
1599  results_sorted[0][3],
1600  results_sorted[0][4]))
1601 
1602  def save_rmf_snapshot(self, filename, color_id=None):
1603  sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1604  list_of_pairs = []
1605  color_scores = []
1606  for group in sorted_group_ids:
1607  group_dists_particles = []
1608  for xl in self.CrossLinkDataBase.data_base[group]:
1609  xllabel = \
1610  self.CrossLinkDataBase.get_short_cross_link_string(xl)
1611  c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1612  try:
1613  (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1614  self._get_distance_and_particle_pair(r1, c1, r2, c2)
1615  except TypeError:
1616  print("TypeError or missing chain/residue ",
1617  r1, c1, r2, c2)
1618  continue
1619  if color_id is not None:
1620  group_dists_particles.append((mdist, p1, p2, xllabel,
1621  float(xl[color_id])))
1622  else:
1623  group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1624  if group_dists_particles:
1625  (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1626  min(group_dists_particles, key=lambda t: t[0])
1627  color_scores.append(mincolor_score)
1628  list_of_pairs.append((minp1, minp2, minxllabel,
1629  mincolor_score))
1630  else:
1631  continue
1632 
1633  linear = IMP.core.Linear(0, 0.0)
1634  linear.set_slope(1.0)
1635  dps2 = IMP.core.DistancePairScore(linear)
1636  rslin = IMP.RestraintSet(self.prots.get_model(),
1637  'linear_dummy_restraints')
1638  sgs = []
1639  offset = min(color_scores)
1640  maxvalue = max(color_scores)
1641  for pair in list_of_pairs:
1642  pr = IMP.core.PairRestraint(self.prots.get_model(), dps2,
1643  (pair[0], pair[1]))
1644  pr.set_name(pair[2])
1645  if offset < maxvalue:
1646  factor = (pair[3]-offset)/(maxvalue-offset)
1647  else:
1648  factor = 0.0
1649  c = IMP.display.get_rgb_color(factor)
1650  seg = IMP.algebra.Segment3D(
1651  IMP.core.XYZ(pair[0]).get_coordinates(),
1652  IMP.core.XYZ(pair[1]).get_coordinates())
1653  rslin.add_restraint(pr)
1654  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
1655 
1656  rh = RMF.create_rmf_file(filename)
1657  IMP.rmf.add_hierarchies(rh, [self.prots])
1658  IMP.rmf.add_restraints(rh, [rslin])
1659  IMP.rmf.add_geometries(rh, sgs)
1660  IMP.rmf.save_frame(rh)
1661  del rh
1662 
1663  def boxplot_crosslink_distances(self, filename):
1664  import numpy
1665  keys = [self.id_to_array[k] for k in self.distances.keys()]
1666  medians = [numpy.median(self.distances[self.array_to_id[k]])
1667  for k in keys]
1668  dists = [self.distances[self.array_to_id[k]] for k in keys]
1669  distances_sorted_by_median = \
1670  [x for _, x in sorted(zip(medians, dists))]
1671  keys_sorted_by_median = [x for _, x in sorted(zip(medians, keys))]
1673  filename, distances_sorted_by_median,
1674  range(len(distances_sorted_by_median)),
1675  xlabels=keys_sorted_by_median,
1676  scale_plot_length=0.2)
1677 
1678  def get_crosslink_violations(self, threshold):
1679  violations = defaultdict(list)
1680  for k in self.distances:
1681  # viols=map(lambda x:1.0*(x<= threshold), self.distances[k])
1682  viols = self.distances[k]
1683  violations[self.id_to_array[k]] = viols
1684  return violations
1685 
1686 
1688  '''
1689  This class generates a CrossLinkDataBase from a given structure
1690  '''
1691  def __init__(self, system, residue_types_1=["K"],
1692  residue_types_2=["K"], reactivity_range=[0, 1], kt=1.0):
1693  import numpy.random
1694  import math
1696  cldbkc.set_protein1_key("Protein1")
1697  cldbkc.set_protein2_key("Protein2")
1698  cldbkc.set_residue1_key("Residue1")
1699  cldbkc.set_residue2_key("Residue2")
1700  self.cldb = CrossLinkDataBase(cldbkc)
1701  self.system = system
1702  self.model = self.system.model
1703  self.residue_types_1 = residue_types_1
1704  self.residue_types_2 = residue_types_2
1705  self.kt = kt
1706  self.indexes_dict1 = {}
1707  self.indexes_dict2 = {}
1708  self.protein_residue_dict = {}
1709  self.reactivity_dictionary = {}
1710  self.euclidean_interacting_pairs = None
1711  self.xwalk_interacting_pairs = None
1712 
1713  for state in self.system.get_states():
1714  for moleculename, molecules in state.get_molecules().items():
1715  for molecule in molecules:
1716  # we are saving a dictionary with protein name,
1717  # residue number and random reactivity of the residue
1718  seq = molecule.sequence
1719  residues = [i for i in range(1, len(seq)+1)
1720  if ((seq[i-1] in self.residue_types_1)
1721  or (seq[i-1] in self.residue_types_2))]
1722 
1723  for r in residues:
1724  # uniform random reactivities
1725  # getting reactivities from the CDF of an
1726  # exponential distribution
1727  rexp = numpy.random.exponential(0.00000001)
1728  prob = 1.0-math.exp(-rexp)
1729  self.reactivity_dictionary[(molecule, r)] = prob
1730 
1731  residues1 = [i for i in range(1, len(seq)+1)
1732  if seq[i-1] in self.residue_types_1]
1733  residues2 = [i for i in range(1, len(seq)+1)
1734  if seq[i-1] in self.residue_types_2]
1735  for r in residues1:
1736  s = IMP.atom.Selection(
1737  molecule.hier, residue_index=r, resolution=1)
1738  ps = s.get_selected_particles()
1739  for p in ps:
1740  index = p.get_index()
1741  self.indexes_dict1[index] = (molecule, r)
1742  self.protein_residue_dict[(molecule, r)] = index
1743  for r in residues2:
1744  s = IMP.atom.Selection(
1745  molecule.hier, residue_index=r, resolution=1)
1746  ps = s.get_selected_particles()
1747  for p in ps:
1748  index = p.get_index()
1749  self.indexes_dict2[index] = (molecule, r)
1750  self.protein_residue_dict[(molecule, r)] = index
1751 
1752  def get_all_possible_pairs(self):
1753  n = float(len(self.protein_residue_dict.keys()))
1754  return n*(n-1.0)/2.0
1755 
1756  def get_all_feasible_pairs(self, distance=21):
1757  import itertools
1758  particle_index_pairs = []
1759  nxl = 0
1760  for a, b in itertools.combinations(
1761  self.protein_residue_dict.keys(), 2):
1762  new_xl = {}
1763  index1 = self.protein_residue_dict[a]
1764  index2 = self.protein_residue_dict[b]
1765  particle_distance = IMP.core.get_distance(
1766  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1767  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1768  if particle_distance <= distance:
1769  particle_index_pairs.append((index1, index2))
1770  new_xl[self.cldb.protein1_key] = a[0].get_name()
1771  new_xl[self.cldb.protein2_key] = b[0].get_name()
1772  new_xl["molecule_object1"] = a[0]
1773  new_xl["molecule_object2"] = b[0]
1774  new_xl[self.cldb.residue1_key] = a[1]
1775  new_xl[self.cldb.residue2_key] = b[1]
1776  self.cldb.data_base[str(nxl)] = [new_xl]
1777  nxl += 1
1778  self.cldb._update()
1779  return self.cldb
1780 
1781  def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1782  noise=0.01, distance=21, max_delta_distance=10.0,
1783  xwalk_bin_path=None, confidence_false=0.75,
1784  confidence_true=0.75):
1785  import math
1786  from random import random
1787  import numpy as np
1788  number_of_spectra = 1
1789 
1790  self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1791  self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1792  self.cldb.data_base[str(number_of_spectra)] = []
1793  self.sites_weighted = None
1794 
1795  while number_of_spectra < total_number_of_spectra:
1796  if (random() > ambiguity_probability
1797  and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1798  # new spectrum
1799  number_of_spectra += 1
1800  self.cldb.data_base[str(number_of_spectra)] = []
1801  noisy = False
1802  if random() > noise:
1803  # not noisy cross-link
1804  pra, dist = self.get_random_residue_pair(
1805  distance, xwalk_bin_path, max_delta_distance)
1806  else:
1807  # noisy cross-link
1808  pra, dist = self.get_random_residue_pair(None, None, None)
1809  noisy = True
1810 
1811  new_xl = {}
1812  new_xl[self.cldb.protein1_key] = pra[0].get_name()
1813  new_xl[self.cldb.protein2_key] = pra[1].get_name()
1814  new_xl["molecule_object1"] = pra[0]
1815  new_xl["molecule_object2"] = pra[1]
1816  new_xl[self.cldb.residue1_key] = pra[2]
1817  new_xl[self.cldb.residue2_key] = pra[3]
1818  new_xl["Noisy"] = noisy
1819  # the reactivity is defined as r=1-exp(-k*Delta t)
1820  new_xl["Reactivity_Residue1"] = \
1821  self.reactivity_dictionary[(pra[0], pra[2])]
1822  new_xl["Reactivity_Residue2"] = \
1823  self.reactivity_dictionary[(pra[1], pra[3])]
1824  if noisy:
1825  new_xl["Score"] = np.random.beta(1.0, self.beta_false)
1826  else:
1827  new_xl["Score"] = 1.0-np.random.beta(1.0, self.beta_true)
1828  new_xl["TargetDistance"] = dist
1829  new_xl["NoiseProbability"] = noise
1830  new_xl["AmbiguityProbability"] = ambiguity_probability
1831  # getting if it is intra or inter rigid body
1832  (p1, p2) = IMP.get_particles(
1833  self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1834  self.protein_residue_dict[(pra[1], pra[3])]])
1837  IMP.core.RigidMember(p1).get_rigid_body() ==
1838  IMP.core.RigidMember(p2).get_rigid_body()):
1839  new_xl["InterRigidBody"] = False
1840  elif (IMP.core.RigidMember.get_is_setup(p1) and
1842  IMP.core.RigidMember(p1).get_rigid_body() !=
1843  IMP.core.RigidMember(p2).get_rigid_body()):
1844  new_xl["InterRigidBody"] = True
1845  else:
1846  new_xl["InterRigidBody"] = None
1847 
1848  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1849  self.cldb._update()
1850  return self.cldb
1851 
1852  def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1853  max_delta_distance=None):
1854  import IMP.pmi.tools
1855  import math
1856  from random import choice
1857  if distance is None:
1858  # get a random pair
1859  while True:
1860  protein1, residue1 = choice(self.protein_residue_dict.keys())
1861  protein2, residue2 = choice(self.protein_residue_dict.keys())
1862  index1 = self.protein_residue_dict[(protein1, residue1)]
1863  index2 = self.protein_residue_dict[(protein2, residue2)]
1864  particle_distance = IMP.core.get_distance(
1865  IMP.core.XYZ(IMP.get_particles(self.model, [index1])[0]),
1866  IMP.core.XYZ(IMP.get_particles(self.model, [index2])[0]))
1867  if (protein1, residue1) != (protein2, residue2):
1868  break
1869  else:
1870  # get a pair of residues whose distance is below the threshold
1871  if not xwalk_bin_path:
1873  gcpf.set_distance(distance+max_delta_distance)
1874 
1875  while True:
1876  # setup the reaction rates lists
1877  if not self.sites_weighted:
1878  self.sites_weighted = []
1879  for key in self.reactivity_dictionary:
1880  r = self.reactivity_dictionary[key]
1881  self.sites_weighted.append((key, r))
1882  # get a random reaction site
1883  first_site = self.weighted_choice(self.sites_weighted)
1884  # get all distances
1885  if not self.euclidean_interacting_pairs:
1886  self.euclidean_interacting_pairs = \
1887  gcpf.get_close_pairs(
1888  self.model,
1889  self.indexes_dict1.keys(),
1890  self.indexes_dict2.keys())
1891  # get the partner for the first reacted site
1892  first_site_pairs = \
1893  [pair for pair in self.euclidean_interacting_pairs
1894  if self.indexes_dict1[pair[0]] == first_site or
1895  self.indexes_dict2[pair[1]] == first_site]
1896  if len(first_site_pairs) == 0:
1897  continue
1898  # build the list of second reaction sites
1899  second_sites_weighted = []
1900  for pair in first_site_pairs:
1901  if self.indexes_dict1[pair[0]] == first_site:
1902  second_site = self.indexes_dict2[pair[1]]
1903  if self.indexes_dict2[pair[1]] == first_site:
1904  second_site = self.indexes_dict1[pair[0]]
1905  r = self.reactivity_dictionary[second_site]
1906  second_sites_weighted.append((second_site, r))
1907  second_site = self.weighted_choice(second_sites_weighted)
1908  protein1, residue1 = first_site
1909  protein2, residue2 = second_site
1910  print("CrossLinkDataBaseFromStructure."
1911  "get_random_residue_pair:",
1912  "First site", first_site,
1913  self.reactivity_dictionary[first_site],
1914  "Second site", second_site,
1915  self.reactivity_dictionary[second_site])
1916  particle_pair = IMP.get_particles(
1917  self.model,
1918  [self.protein_residue_dict[first_site],
1919  self.protein_residue_dict[second_site]])
1920  particle_distance = IMP.core.get_distance(
1921  IMP.core.XYZ(particle_pair[0]),
1922  IMP.core.XYZ(particle_pair[1]))
1923 
1924  if particle_distance < distance \
1925  and (protein1, residue1) != (protein2, residue2):
1926  break
1927  elif (particle_distance >= distance
1928  and (protein1, residue1) != (protein2, residue2)
1929  and max_delta_distance):
1930  # allow some flexibility
1931  if particle_distance-distance < max_delta_distance:
1932  break
1933 
1934  else:
1935  if not self.xwalk_interacting_pairs:
1936  self.xwalk_interacting_pairs = \
1937  self.get_xwalk_distances(xwalk_bin_path, distance)
1938  interacting_pairs_weighted = []
1939 
1940  for pair in self.xwalk_interacting_pairs:
1941  protein1 = pair[0]
1942  protein2 = pair[1]
1943  residue1 = pair[2]
1944  residue2 = pair[3]
1945  weight1 = math.exp(
1946  -self.reactivity_dictionary[(protein1, residue1)]
1947  / self.kt)
1948  weight2 = math.exp(
1949  -self.reactivity_dictionary[(protein2, residue2)]
1950  / self.kt)
1951  interacting_pairs_weighted.append((pair, weight1*weight2))
1952 
1953  pair = self.weighted_choice(interacting_pairs_weighted)
1954  protein1 = pair[0]
1955  protein2 = pair[1]
1956  residue1 = pair[2]
1957  residue2 = pair[3]
1958  particle_distance = float(pair[4])
1959 
1960  return ((protein1, protein2, residue1, residue2)), particle_distance
1961 
1962  def get_xwalk_distances(self, xwalk_bin_path, distance):
1963  import IMP.pmi.output
1964  import os
1965  o = IMP.pmi.output.Output(atomistic=True)
1966  o.init_pdb("xwalk.pdb", self.representation.prot)
1967  o.write_pdb("xwalk.pdb")
1968  namechainiddict = o.dictchain["xwalk.pdb"]
1969  chainiddict = {}
1970 
1971  for key in namechainiddict:
1972  chainiddict[namechainiddict[key]] = key
1973  xwalkout = os.popen('java -Xmx256m -cp ' + xwalk_bin_path
1974  + ' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
1975  '-a1 cb -a2 cb -max '
1976  + str(distance) + ' -bb').read()
1977 
1978  output_list_of_distance = []
1979  for line in xwalkout.split("\n")[0:-2]:
1980  tokens = line.split()
1981  first = tokens[2]
1982  second = tokens[3]
1983  distance = float(tokens[6])
1984  fs = first.split("-")
1985  ss = second.split("-")
1986  chainid1 = fs[2]
1987  chainid2 = ss[2]
1988  protein1 = chainiddict[chainid1]
1989  protein2 = chainiddict[chainid2]
1990  residue1 = int(fs[1])
1991  residue2 = int(ss[1])
1992  output_list_of_distance.append(
1993  (protein1, protein2, residue1, residue2, distance))
1994  return output_list_of_distance
1995 
1996  def weighted_choice(self, choices):
1997  import random
1998  # http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
1999  total = sum(w for c, w in choices)
2000  r = random.uniform(0, total)
2001  upto = 0
2002  for c, w in choices:
2003  if upto + w > r:
2004  return c
2005  upto += w
2006  assert False, "Shouldn't get here"
2007 
2008  def save_rmf_snapshot(self, filename, color_id=None):
2009  import IMP.rmf
2010  import RMF
2011  if color_id is None:
2012  color_id = "Reactivity"
2013  sorted_group_ids = sorted(self.cldb.data_base.keys())
2014  list_of_pairs = []
2015  color_scores = []
2016  for group in sorted_group_ids:
2017  group_dists_particles = []
2018  for xl in self.cldb.data_base[group]:
2019  xllabel = self.cldb.get_short_cross_link_string(xl)
2020  (c1, c2, r1, r2) = (xl["molecule_object1"],
2021  xl["molecule_object2"],
2022  xl[self.cldb.residue1_key],
2023  xl[self.cldb.residue2_key])
2024  try:
2025  index1 = self.protein_residue_dict[(c1, r1)]
2026  index2 = self.protein_residue_dict[(c2, r2)]
2027  p1, p2 = (IMP.get_particles(self.model, [index1])[0],
2028  IMP.get_particles(self.model, [index2])[0])
2029  mdist = xl["TargetDistance"]
2030  except TypeError:
2031  print("TypeError or missing chain/residue ",
2032  r1, c1, r2, c2)
2033  continue
2034  group_dists_particles.append((mdist, p1, p2, xllabel,
2035  float(xl[color_id])))
2036  if group_dists_particles:
2037  (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2038  = min(group_dists_particles, key=lambda t: t[0])
2039  color_scores.append(mincolor_score)
2040  list_of_pairs.append((minp1, minp2, minxllabel,
2041  mincolor_score))
2042  else:
2043  continue
2044 
2045  m = self.model
2046  linear = IMP.core.Linear(0, 0.0)
2047  linear.set_slope(1.0)
2048  dps2 = IMP.core.DistancePairScore(linear)
2049  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
2050  sgs = []
2051  offset = min(color_scores)
2052  maxvalue = max(color_scores)
2053  for pair in list_of_pairs:
2054  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
2055  pr.set_name(pair[2])
2056  factor = (pair[3]-offset)/(maxvalue-offset)
2057  c = IMP.display.get_rgb_color(factor)
2058  seg = IMP.algebra.Segment3D(
2059  IMP.core.XYZ(pair[0]).get_coordinates(),
2060  IMP.core.XYZ(pair[1]).get_coordinates())
2061  rslin.add_restraint(pr)
2062  sgs.append(IMP.display.SegmentGeometry(seg, c, pair[2]))
2063 
2064  rh = RMF.create_rmf_file(filename)
2065  IMP.rmf.add_hierarchies(rh, [self.system.hier])
2066  IMP.rmf.add_restraints(rh, [rslin])
2067  IMP.rmf.add_geometries(rh, sgs)
2068  IMP.rmf.save_frame(rh)
2069  del rh
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: Molecule.h:35
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: State.h:41
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Definition: output.py:1754
def plot_fields_box_plots
Plot time series as boxplots.
Definition: output.py:1819
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
Miscellaneous utilities.
Definition: tools.py:1
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
Get the particles from a list of indexes.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:749
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:89
Object used to hold a set of restraints.
Definition: RestraintSet.h:41
Stores a named protein chain.
A decorator for keeping track of copies of a molecule.
Definition: Copy.h:28
The standard decorator for manipulating molecular structures.
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:199
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
Linear function
Definition: Linear.h:22
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Score a pair of particles based on the distance between their centers.
Classes for writing output files and processing them.
Definition: output.py:1
Simple implementation of segments in 3D.
Definition: Segment3D.h:25
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Definition: exception.h:48
Associate an integer "state" index with a hierarchy node.
Definition: State.h:27
class to link stat files to several rmf files
Definition: output.py:1253
class to allow more advanced handling of RMF files.
Definition: output.py:1129
Mapping between FASTA one-letter codes and residue types.
Definition: alphabets.py:1
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Definition: PairRestraint.h:31
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Definition: Selection.h:70
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.