crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 
8 import IMP
9 import IMP.pmi
10 import operator
11 import sys
12 
13 # json default serializations
14 def set_json_default(obj):
15  if isinstance(obj, set):
16  return list(obj)
17  if isinstance(obj, IMP.pmi.topology.Molecule):
18  return str(obj)
19  raise TypeError
20 
21 # Handle and return data that must be a string
22 if sys.version_info[0] >= 3:
23  def _handle_string_input(inp):
24  if not isinstance(inp, str):
25  raise TypeError("expecting a string")
26  return inp
27 else:
28  def _handle_string_input(inp):
29  if not isinstance(inp, (str, unicode)):
30  raise TypeError("expecting a string or unicode")
31  # Coerce to non-unicode representation (str)
32  if isinstance(inp, unicode):
33  return str(inp)
34  else:
35  return inp
36 
37 class _CrossLinkDataBaseStandardKeys(object):
38  '''
39  This class sets up all the standard keys needed to
40  identify the cross-link features in the data sets
41  '''
42  def __init__(self):
43  self.type={}
44  self.protein1_key="Protein1"
45  self.type[self.protein1_key]=str
46  self.protein2_key="Protein2"
47  self.type[self.protein2_key]=str
48  self.residue1_key="Residue1"
49  self.type[self.residue1_key]=int
50  self.residue2_key="Residue2"
51  self.type[self.residue2_key]=int
52  self.site_pairs_key="SitePairs"
53  self.type[self.site_pairs_key]=str
54  self.unique_id_key="XLUniqueID"
55  self.type[self.unique_id_key]=str
56  self.unique_sub_index_key="XLUniqueSubIndex"
57  self.type[self.unique_sub_index_key]=int
58  self.unique_sub_id_key="XLUniqueSubID"
59  self.type[self.unique_sub_id_key]=str
60  self.data_set_name_key="DataSetName"
61  self.type[self.data_set_name_key]=str
62  self.cross_linker_chemical_key="CrossLinkerChemical"
63  self.type[self.cross_linker_chemical_key]=str
64  self.id_score_key="IDScore"
65  self.type[self.id_score_key]=float
66  self.fdr_key="FDR"
67  self.type[self.fdr_key]=float
68  self.quantitation_key="Quantitation"
69  self.type[self.quantitation_key]=float
70  self.redundancy_key="Redundancy"
71  self.type[self.redundancy_key]=int
72  self.redundancy_list_key="RedundancyList"
73  self.type[self.redundancy_list_key]=list
74  self.ambiguity_key="Ambiguity"
75  self.type[self.ambiguity_key]=int
76  self.state_key="State"
77  self.type[self.state_key]=int
78  self.sigma1_key="Sigma1"
79  self.type[self.sigma1_key]=str
80  self.sigma2_key="Sigma2"
81  self.type[self.sigma2_key]=str
82  self.psi_key="Psi"
83  self.type[self.psi_key]=str
84  self.distance_key="Distance"
85  self.type[self.distance_key]=float
86  self.min_ambiguous_distance_key="MinAmbiguousDistance"
87  self.type[self.min_ambiguous_distance_key]=float
88 
89  self.ordered_key_list =[self.data_set_name_key,
90  self.unique_id_key,
91  self.unique_sub_index_key,
92  self.unique_sub_id_key,
93  self.protein1_key,
94  self.protein2_key,
95  self.residue1_key,
96  self.residue2_key,
97  self.cross_linker_chemical_key,
98  self.id_score_key,
99  self.fdr_key,
100  self.quantitation_key,
101  self.redundancy_key,
102  self.redundancy_list_key,
103  self.state_key,
104  self.sigma1_key,
105  self.sigma2_key,
106  self.psi_key,
107  self.distance_key,
108  self.min_ambiguous_distance_key]
109 
110 
111 class _ProteinsResiduesArray(tuple):
112  '''
113  This class inherits from tuple and is a shorthand for a cross-link
114  (p1,p2,r1,r2) where p1 and p2 are protein1 and protein2, r1 and r2 are
115  residue1 and residue2.
116  '''
117 
118  def __new__(self,input_data):
119  '''
120  @input_data can be a dict or a tuple
121  '''
122  self.cldbsk=_CrossLinkDataBaseStandardKeys()
123  if type(input_data) is dict:
124  p1=input_data[self.cldbsk.protein1_key]
125  p2=input_data[self.cldbsk.protein2_key]
126  r1=input_data[self.cldbsk.residue1_key]
127  r2=input_data[self.cldbsk.residue2_key]
128  t=(p1,p2,r1,r2)
129  elif type(input_data) is tuple:
130  if len(input_data)!=4:
131  raise TypeError("_ProteinsResiduesArray: must have exactly 4 elements")
132  p1 = _handle_string_input(input_data[0])
133  p2 = _handle_string_input(input_data[1])
134  r1=input_data[2]
135  r2=input_data[3]
136  if type(r1) is not int:
137  raise TypeError("_ProteinsResiduesArray: residue1 must be an integer")
138  if type(r2) is not int:
139  raise TypeError("_ProteinsResiduesArray: residue2 must be an integer")
140  t=(p1,p2,r1,r2)
141  else:
142  raise TypeError("_ProteinsResiduesArray: input must be a dict or tuple")
143  return tuple.__new__(_ProteinsResiduesArray, t)
144 
145  def get_inverted(self):
146  '''
147  Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
148  '''
149  return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
150 
151  def __str__(self):
152  outstr=self.cldbsk.protein1_key+" "+str(self[0])
153  outstr+=" "+self.cldbsk.protein2_key+" "+str(self[1])
154  outstr+=" "+self.cldbsk.residue1_key+" "+str(self[2])
155  outstr+=" "+self.cldbsk.residue2_key+" "+str(self[3])
156  return outstr
157 
158 class FilterOperator(object):
159  '''
160  This class allows the creation of filter functions that can be passed to the CrossLinkDataBase
161  in this way:
162 
163  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
164 
165  where cldb is a CrossLinkDataBase instance; it is only needed to get the standard keywords
166 
167  A filter operator can be evaluated on a CrossLinkDataBase item xl and returns a boolean
168 
169  fo.evaluate(xl)
170 
171  and it is used to filter the database
172  '''
173 
174  def __init__(self, argument1, operator, argument2):
175  '''
176  (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
177  or (FilterOperator1,operator.or|and...,FilterOperator2)
178 
179  TODO: a NOT operation still needs to be implemented
180  '''
181  if isinstance(argument1, FilterOperator):
182  self.operations = [argument1, operator, argument2]
183  else:
184  self.operations = []
185  self.values = (argument1, operator, argument2)
186 
187  def __or__(self, FilterOperator2):
188  return FilterOperator(self, operator.or_, FilterOperator2)
189 
190  def __and__(self, FilterOperator2):
191  return FilterOperator(self, operator.and_, FilterOperator2)
192 
193  def evaluate(self, xl_item):
194 
195  if len(self.operations) == 0:
196  keyword, operator, value = self.values
197  return operator(xl_item[keyword], value)
198  FilterOperator1, op, FilterOperator2 = self.operations
199 
200  return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
201 
202 '''
203 def filter_factory(xl_):
204 
205  class FilterOperator(object):
206  import operator
207  xl = xl_
208 
209  def __new__(self,key,value,oper=operator.eq):
210  return oper(self.xl[key],value)
211 
212  return FilterOperator
213 '''
214 
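A minimal usage sketch (the protein name and score threshold below are made up): two FilterOperator objects are combined with & and applied through CrossLinkDataBase.filter(), defined further down; cldb is assumed to be an already populated CrossLinkDataBase.

import operator
# keep cross-links where Protein1 is "ProtA" and the identification score exceeds 10
fo = FilterOperator(cldb.protein1_key, operator.eq, "ProtA") \
     & FilterOperator(cldb.id_score_key, operator.gt, 10.0)
filtered_cldb = cldb.filter(fo)
for xl in filtered_cldb:
    print(xl[cldb.protein1_key], xl[cldb.residue1_key],
          xl[cldb.protein2_key], xl[cldb.residue2_key])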
215 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
216  '''
217  This class is needed to convert the keywords from a generic database
218  to the standard ones
219  '''
220 
221  def __init__(self, list_parser=None):
222  '''
223  @param list_parser an instance of ResiduePairListParser, if any is needed
224  '''
225  self.converter={}
226  self.backward_converter={}
227  _CrossLinkDataBaseStandardKeys.__init__(self)
228  self.rplp = list_parser
229  if self.rplp is None:
230  # either you have protein1, protein2, residue1, residue2
231  self.compulsory_keys=set([self.protein1_key,
232  self.protein2_key,
233  self.residue1_key,
234  self.residue2_key])
235  else:
236  self.compulsory_keys=self.rplp.get_compulsory_keys()
237  self.setup_keys=set()
238 
239  def check_keys(self):
240  '''
241  Check whether the necessary keys have been set up
242  '''
243  setup_keys=set(self.get_setup_keys())
244  if self.compulsory_keys & setup_keys != self.compulsory_keys:
245  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
246 
247  def get_setup_keys(self):
248  '''
249  Returns the keys that have been set up so far
250  '''
251  return self.backward_converter.keys()
252 
253  def set_standard_keys(self):
254  """
255  This sets up the standard compulsory keys as defined by
256  _CrossLinkDataBaseStandardKeys
257  """
258  for ck in self.compulsory_keys:
259  self.converter[ck]=ck
260  self.backward_converter[ck]=ck
261 
262  def set_unique_id_key(self,origin_key):
263  self.converter[origin_key]=self.unique_id_key
264  self.backward_converter[self.unique_id_key]=origin_key
265 
266  def set_protein1_key(self,origin_key):
267  self.converter[origin_key]=self.protein1_key
268  self.backward_converter[self.protein1_key]=origin_key
269 
270  def set_protein2_key(self,origin_key):
271  self.converter[origin_key]=self.protein2_key
272  self.backward_converter[self.protein2_key]=origin_key
273 
274  def set_residue1_key(self,origin_key):
275  self.converter[origin_key]=self.residue1_key
276  self.backward_converter[self.residue1_key]=origin_key
277 
278  def set_residue2_key(self,origin_key):
279  self.converter[origin_key]=self.residue2_key
280  self.backward_converter[self.residue2_key]=origin_key
281 
282  def set_site_pairs_key(self,origin_key):
283  self.converter[origin_key]=self.site_pairs_key
284  self.backward_converter[self.site_pairs_key]=origin_key
285 
286  def set_id_score_key(self,origin_key):
287  self.converter[origin_key]=self.id_score_key
288  self.backward_converter[self.id_score_key]=origin_key
289 
290  def set_fdr_key(self,origin_key):
291  self.converter[origin_key]=self.fdr_key
292  self.backward_converter[self.fdr_key]=origin_key
293 
294  def set_quantitation_key(self,origin_key):
295  self.converter[origin_key]=self.quantitation_key
296  self.backward_converter[self.quantitation_key]=origin_key
297 
298  def set_psi_key(self,origin_key):
299  self.converter[origin_key]=self.psi_key
300  self.backward_converter[self.psi_key]=origin_key
301 
302  def get_converter(self):
303  '''
304  Returns the dictionary that converts the old keywords to the new ones
305  '''
306  self.check_keys()
307  return self.converter
308 
309  def get_backward_converter(self):
310  '''
311  Returns the dictionary that converts the new keywords to the old ones
312  '''
313  self.check_keys()
314  return self.backward_converter
315 
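A minimal sketch, assuming an input CSV whose columns are named 'prot1', 'prot2', 'res1', 'res2' and 'score' (hypothetical names): the converter maps them onto the standard keys, and get_converter() raises a KeyError if any compulsory key is still missing.

cldbkc = CrossLinkDataBaseKeywordsConverter()
cldbkc.set_protein1_key("prot1")
cldbkc.set_protein2_key("prot2")
cldbkc.set_residue1_key("res1")
cldbkc.set_residue2_key("res2")
cldbkc.set_id_score_key("score")    # optional, maps the score column
converter = cldbkc.get_converter()  # {'prot1': 'Protein1', 'res1': 'Residue1', ...}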
316 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
317  '''
318  A class to handle different styles of site-pair list parsing.
319  Implemented styles:
320  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
321  [Y3-;K4-] for dead-ends
322  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
323  QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
324  '''
325 
326  import re
327 
328  def __init__(self,style):
329 
330  _CrossLinkDataBaseStandardKeys.__init__(self)
331  if style == "MSSTUDIO":
332  self.style=style
333  self.compulsory_keys= set([self.protein1_key,
334  self.protein2_key,
335  self.site_pairs_key])
336  elif style == "QUANTITATION":
337  self.style=style
338  self.compulsory_keys= set([self.site_pairs_key])
339  else:
340  raise ValueError("ResiduePairListParser: unknown list parser style")
341 
342  def get_compulsory_keys(self):
343  return self.compulsory_keys
344 
345  def get_list(self,input_string):
346  '''
347  This function returns a list of cross-linked residues and the corresponding list of
348  cross-linked chains. The latter list can be empty, if the style doesn't have the
349  corresponding information.
350  '''
351  if self.style == "MSSTUDIO":
352  input_string=input_string.replace("[","")
353  input_string=input_string.replace("]","")
354  input_string_pairs=input_string.split(";")
355  residue_pair_indexes=[]
356  chain_pair_indexes=[]
357  for s in input_string_pairs:
358  m1=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
359  m2=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
360  if m1:
361  # cross link
362  residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
363  residue_pair_indexes.append((residue_index_1,residue_index_2))
364  elif m2:
365  # dead end
366  residue_type_1,residue_index_1=m2.group(1,2)
367  # at this stage chain_pair_indexes is empty
368  return residue_pair_indexes,chain_pair_indexes
369  if self.style == "QUANTITATION":
370  input_strings=input_string.split(":x:")
371  first_peptides=input_strings[0].split(":|:")
372  second_peptides=input_strings[1].split(":|:")
373  first_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in first_peptides]
374  second_peptides_indentifiers=[(f.split(":")[0],f.split(":")[1]) for f in second_peptides]
375  residue_pair_indexes=[]
376  chain_pair_indexes=[]
377  for fpi in first_peptides_indentifiers:
378  for spi in second_peptides_indentifiers:
379  chain1=fpi[0]
380  chain2=spi[0]
381  residue1=fpi[1]
382  residue2=spi[1]
383  residue_pair_indexes.append((residue1,residue2))
384  chain_pair_indexes.append((chain1,chain2))
385  return residue_pair_indexes,chain_pair_indexes
386 
387 
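A small sketch of the QUANTITATION style, using the example string from the class docstring; the parser returns the residue pairs (as strings) and the corresponding chain/protein pairs.

rplp = ResiduePairListParser("QUANTITATION")
pairs, chains = rplp.get_list("sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337")
# pairs  -> [('280', '337')]
# chains -> [('sp|P33298|PRS6B_YEAST', 'sp|P33298|PRS6B_YEAST')]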
388 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
389  '''
390  A class to handle cross-link files with a fixed format;
391  currently supports ProXL
392  '''
393  def __init__(self,format):
394 
395  _CrossLinkDataBaseStandardKeys.__init__(self)
396  if format == "PROXL":
397  self.format=format
398  else:
399  raise ValueError("FixedFormatParser: unknown list format name")
400 
401 
402  def get_data(self,input_string):
403  if self.format == "PROXL":
404  tockens=input_string.split("\t")
405  xl={}
406  if tockens[0]=="SEARCH ID(S)":
407  return None
408  else:
409  xl[self.protein1_key]=tockens[2]
410  xl[self.protein2_key]=tockens[4]
411  xl[self.residue1_key]=int(tockens[3])
412  xl[self.residue2_key]=int(tockens[5])
413  return xl
414 
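A sketch of parsing a single ProXL line; the tab-separated column layout is inferred from get_data() above and the field values are made up. Header lines starting with "SEARCH ID(S)" return None.

ffp = FixedFormatParser("PROXL")
line = "1\tPEPTIDEK\tProtA\t120\tProtB\t77"
xl = ffp.get_data(line)
# xl -> {'Protein1': 'ProtA', 'Residue1': 120, 'Protein2': 'ProtB', 'Residue2': 77}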
415 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
416  import operator
417  '''
418  This class handles a cross-link dataset and performs filtering
419  operations, adding cross-links, merging datasets, etc.
420  '''
421 
422  def __init__(self, converter=None, data_base=None):
423  '''
424  Constructor.
425  @param converter an instance of CrossLinkDataBaseKeywordsConverter
426  @param data_base an instance of CrossLinkDataBase to build the new database on
427  '''
428  if data_base is None:
429  self.data_base = {}
430  else:
431  self.data_base=data_base
432 
433  _CrossLinkDataBaseStandardKeys.__init__(self)
434  if converter is not None:
435  self.cldbkc = converter
436  self.list_parser=self.cldbkc.rplp
437  self.converter = converter.get_converter()
438 
439  else:
440  self.cldbkc = None
441  self.list_parser=None
442  self.converter = None
443 
444  self._update()
445 
446  def _update(self):
447  '''
448  Update the whole dataset after changes
449  '''
450  self.update_cross_link_unique_sub_index()
451  self.update_cross_link_redundancy()
452 
453  def __iter__(self):
454  sorted_ids=sorted(self.data_base.keys())
455  for k in sorted_ids:
456  for xl in self.data_base[k]:
457  yield xl
458 
459  def xlid_iterator(self):
460  sorted_ids=sorted(self.data_base.keys())
461  for xlid in sorted_ids:
462  yield xlid
463 
464  def __getitem__(self,xlid):
465  return self.data_base[xlid]
466 
467  def __len__(self):
468  return len([xl for xl in self])
469 
470  def get_name(self):
471  return self.name
472 
473  def set_name(self,name):
474  new_data_base={}
475  for k in self.data_base:
476  new_data_base[k+"."+name]=self.data_base[k]
477  self.data_base=new_data_base
478  self.name=name
479  self._update()
480 
481  def get_number_of_xlid(self):
482  return len(self.data_base)
483 
484 
485  def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
486  '''
487  If FixedFormatParser is not specified, the file is assumed to be comma-separated values
488  @param file_name a text file to be parsed
489  @param converter an instance of CrossLinkDataBaseKeywordsConverter
490  @param FixedFormatParser a parser for a fixed format
491  '''
492  if not FixedFormatParser:
493  xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
494 
495  if converter is not None:
496  self.cldbkc = converter
497  self.list_parser=self.cldbkc.rplp
498  self.converter = converter.get_converter()
499 
500 
501  if not self.list_parser:
502  # normal procedure without a list_parser
503  # each line is a cross-link
504  new_xl_dict={}
505  for nxl,xl in enumerate(xl_list):
506  new_xl={}
507  for k in xl:
508  if k in self.converter:
509  new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
510  else:
511  new_xl[k]=xl[k]
512  if self.unique_id_key in self.cldbkc.get_setup_keys():
513  if new_xl[self.unique_id_key] not in new_xl_dict:
514  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
515  else:
516  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
517  else:
518  if str(nxl) not in new_xl_dict:
519  new_xl_dict[str(nxl)]=[new_xl]
520  else:
521  new_xl_dict[str(nxl)].append(new_xl)
522 
523  else:
524  # with a list_parser, a line can be a list of ambiguous crosslinks
525  new_xl_dict={}
526  for nxl,entry in enumerate(xl_list):
527 
528  # first get the translated keywords
529  new_dict={}
530  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
531  raise KeyError("CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
532  for k in entry:
533  if k in self.converter:
534  new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
535  else:
536  new_dict[k]=entry[k]
537 
538  residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
539 
540  # then create the crosslinks
541  for n,p in enumerate(residue_pair_list):
542  new_xl={}
543  for k in new_dict:
544  new_xl[k]=new_dict[k]
545  new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
546  new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
547 
548  if len(chain_pair_list)==len(residue_pair_list):
549  new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
550  new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
551 
552  if self.unique_id_key in self.cldbkc.get_setup_keys():
553  if new_xl[self.unique_id_key] not in new_xl_dict:
554  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
555  else:
556  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
557  else:
558  if str(nxl) not in new_xl_dict:
559  new_xl[self.unique_id_key]=str(nxl+1)
560  new_xl_dict[str(nxl)]=[new_xl]
561  else:
562  new_xl[self.unique_id_key]=str(nxl+1)
563  new_xl_dict[str(nxl)].append(new_xl)
564 
565 
566  else:
567  '''
568  if FixedFormatParser is defined
569  '''
570 
571  new_xl_dict={}
572  f=open(file_name,"r")
573  nxl=0
574  for line in f:
575  xl=FixedFormatParser.get_data(line)
576  if xl:
577  xl[self.unique_id_key]=str(nxl+1)
578  new_xl_dict[str(nxl)]=[xl]
579  nxl+=1
580 
581 
582  self.data_base=new_xl_dict
583  self.name=file_name
584  self._update()
585 
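Putting the pieces together, a hedged sketch that reads a hypothetical comma-separated file 'xlinks.csv' (columns 'prot1', 'prot2', 'res1', 'res2') into a new database and prints its content.

cldbkc = CrossLinkDataBaseKeywordsConverter()
cldbkc.set_protein1_key("prot1")
cldbkc.set_protein2_key("prot2")
cldbkc.set_residue1_key("res1")
cldbkc.set_residue2_key("res2")

cldb = CrossLinkDataBase(cldbkc)
cldb.create_set_from_file("xlinks.csv")
print(cldb.get_number_of_xlid())
for xl in cldb:
    print(cldb.get_cross_link_string(xl))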
586  def update_cross_link_unique_sub_index(self):
587  for k in self.data_base:
588  for n,xl in enumerate(self.data_base[k]):
589  xl[self.ambiguity_key]=len(self.data_base[k])
590  xl[self.unique_sub_index_key]=n+1
591  xl[self.unique_sub_id_key]=k+"."+str(n+1)
592 
593  def update_cross_link_redundancy(self):
594  redundancy_data_base={}
595  for xl in self:
596  pra=_ProteinsResiduesArray(xl)
597  if pra not in redundancy_data_base:
598  redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
599  redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
600  else:
601  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
602  redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
603  for xl in self:
604  pra=_ProteinsResiduesArray(xl)
605  xl[self.redundancy_key]=len(redundancy_data_base[pra])
606  xl[self.redundancy_list_key]=redundancy_data_base[pra]
607 
608  def get_cross_link_string(self,xl):
609  string='|'
610  for k in self.ordered_key_list:
611  try:
612  string+=str(k)+":"+str(xl[k])+"|"
613  except KeyError:
614  continue
615 
616  for k in xl:
617  if k not in self.ordered_key_list:
618  string+=str(k)+":"+str(xl[k])+"|"
619 
620  return string
621 
622  def get_short_cross_link_string(self,xl):
623 
624  string='|'
625  list_of_keys=[self.data_set_name_key,
626  self.unique_sub_id_key,
627  self.protein1_key,
628  self.residue1_key,
629  self.protein2_key,
630  self.residue2_key,
631  self.state_key,
632  self.psi_key]
633 
634  for k in list_of_keys:
635  try:
636  string+=str(xl[k])+"|"
637  except KeyError:
638  continue
639 
640  return string
641 
642  def filter(self,FilterOperator):
643  new_xl_dict={}
644  for id in self.data_base.keys():
645  for xl in self.data_base[id]:
646  if FilterOperator.evaluate(xl):
647  if id not in new_xl_dict:
648  new_xl_dict[id]=[xl]
649  else:
650  new_xl_dict[id].append(xl)
651  return CrossLinkDataBase(self.cldbkc,new_xl_dict)
652 
653 
654  def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
655  '''
656  This function merges two cross-link datasets so that if two conflicting crosslinks have the same
657  cross-link UniqueIDs, the cross-links will be appended under the same UniqueID slots
658  with different SubIDs
659  '''
660  pass
661 
662  def append_database(self,CrossLinkDataBase2):
663  '''
664  This function appends one cross-link dataset to another. Unique IDs will be renamed to avoid
665  conflicts.
666  '''
667  name1=self.get_name()
668  name2=CrossLinkDataBase2.get_name()
669  if name1 == name2:
670  name1=id(self)
671  name2=id(CrossLinkDataBase2)
672 
673  #rename first database:
674  new_data_base={}
675  for k in self.data_base:
676  new_data_base[k]=self.data_base[k]
677  for k in CrossLinkDataBase2.data_base:
678  new_data_base[k]=CrossLinkDataBase2.data_base[k]
679  self.data_base=new_data_base
680  self._update()
681 
682  def set_value(self,key,new_value,FilterOperator=None):
683  '''
684  This function changes the value for a given key in the database
685  For instance one can change the name of a protein
686  @param key: the key in the database that must be changed
687  @param new_value: the new value of the key
688  @param FilterOperator: optional FilterOperator to restrict the change to
689  a subset of the database
690 
691  example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
692  '''
693 
694  for xl in self:
695  if FilterOperator is not None:
696  if FilterOperator.evaluate(xl):
697  xl[key]=new_value
698  else:
699  xl[key]=new_value
700  self._update()
701 
702  def get_values(self,key):
703  '''
704  This function returns the list of values for a given key in the database,
705  alphanumerically sorted
706  '''
707  values=set()
708  for xl in self:
709  values.add(xl[key])
710  return sorted(list(values))
711 
712  def offset_residue_index(self,protein_name,offset):
713  '''
714  This function offsets the residue indexes of a given protein by a specified value
715  @param protein_name: the protein name that needs to be changed
716  @param offset: the offset value
717  '''
718 
719  for xl in self:
720  if xl[self.protein1_key] == protein_name:
721  xl[self.residue1_key]=xl[self.residue1_key]+offset
722  if xl[self.protein2_key] == protein_name:
723  xl[self.residue2_key]=xl[self.residue2_key]+offset
724  self._update()
725 
726  def create_new_keyword(self,keyword,values_from_keyword=None):
727  '''
728  This function creates a new keyword for the whole database and sets the values from
729  an existing keyword (optional); otherwise the values are set to None
730  @param keyword the new keyword name
731  @param values_from_keyword the keyword from which the values are copied
732  '''
733  for xl in self:
734  if values_from_keyword is not None:
735  xl[keyword] = xl[values_from_keyword]
736  else:
737  xl[keyword] = None
738  self._update()
739 
740  def rename_proteins(self,old_to_new_names_dictionary):
741  '''
742  This function renames all proteins contained in the input dictionary
743  from the old names (keys) to the new names (values)
744  '''
745 
746  for old_name in old_to_new_names_dictionary:
747  new_name=old_to_new_names_dictionary[old_name]
748  fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
749  self.set_value(self.protein1_key,new_name,fo2)
750  fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
751  self.set_value(self.protein2_key,new_name,fo2)
752 
753  def clone_protein(self,protein_name,new_protein_name):
754  new_xl_dict={}
755  for id in self.data_base.keys():
756  new_data_base=[]
757  for xl in self.data_base[id]:
758  new_data_base.append(xl)
759  if xl[self.protein1_key]==protein_name and xl[self.protein2_key]!=protein_name:
760  new_xl=dict(xl)
761  new_xl[self.protein1_key]=new_protein_name
762  new_data_base.append(new_xl)
763  elif xl[self.protein1_key]!=protein_name and xl[self.protein2_key]==protein_name:
764  new_xl=dict(xl)
765  new_xl[self.protein2_key]=new_protein_name
766  new_data_base.append(new_xl)
767  elif xl[self.protein1_key]==protein_name and xl[self.protein2_key]==protein_name:
768  new_xl=dict(xl)
769  new_xl[self.protein1_key]=new_protein_name
770  new_data_base.append(new_xl)
771  new_xl=dict(xl)
772  new_xl[self.protein2_key]=new_protein_name
773  new_data_base.append(new_xl)
774  new_xl=dict(xl)
775  new_xl[self.protein1_key]=new_protein_name
776  new_xl[self.protein2_key]=new_protein_name
777  new_data_base.append(new_xl)
778  self.data_base[id]=new_data_base
779  self._update()
780 
781  def filter_out_same_residues(self):
782  '''
783  This function removes cross-links applied to the same residue
784  (i.e., same chain name and residue number)
785  '''
786  new_xl_dict={}
787  for id in self.data_base.keys():
788  new_data_base=[]
789  for xl in self.data_base[id]:
790  if xl[self.protein1_key]==xl[self.protein2_key] and xl[self.residue1_key]==xl[self.residue2_key]:
791  continue
792  else:
793  new_data_base.append(xl)
794  self.data_base[id]=new_data_base
795  self._update()
796 
797 
798  def jackknife(self,percentage):
799  '''
800  This method returns a CrossLinkDataBase instance containing
801  a random subsample of the original cross-link database.
802  @param percentage float between 0 and 1, the fraction
803  of spectra taken from the original list
804  '''
805  import random
806  if percentage > 1.0 or percentage < 0.0:
807  raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
808  nspectra=self.get_number_of_xlid()
809  nrandom_spectra=int(nspectra*percentage)
810  random_keys=random.sample(list(self.data_base.keys()),nrandom_spectra)
811  new_data_base={}
812  for k in random_keys:
813  new_data_base[k]=self.data_base[k]
814  return CrossLinkDataBase(self.cldbkc,new_data_base)
815 
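A short sketch: keep a random 80% of the spectra (unique IDs) of an existing database, e.g. for a hold-out test; cldb is assumed to be a populated CrossLinkDataBase.

sub_cldb = cldb.jackknife(0.8)
print(cldb.get_number_of_xlid(), "->", sub_cldb.get_number_of_xlid())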
816  def __str__(self):
817  outstr=''
818  sorted_ids=sorted(self.data_base.keys())
819 
820  for id in sorted_ids:
821  outstr+=id+"\n"
822  for xl in self.data_base[id]:
823  for k in self.ordered_key_list:
824  try:
825  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
826  except KeyError:
827  continue
828 
829  for k in xl:
830  if k not in self.ordered_key_list:
831  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
832  outstr+="-------------\n"
833  return outstr
834 
835 
836  def plot(self,filename,**kwargs):
837  import matplotlib
838  matplotlib.use('Agg')
839  import matplotlib.pyplot as plt
840  import matplotlib.colors
841 
842 
843 
844  if kwargs["type"] == "scatter":
845  cmap=plt.get_cmap("rainbow")
846  norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
847  xkey=kwargs["xkey"]
848  ykey=kwargs["ykey"]
849  if "colorkey" in kwargs:
850  colorkey=kwargs["colorkey"]
851  if "sizekey" in kwargs:
852  sizekey=kwargs["sizekey"]
853  xs=[]
854  ys=[]
855  colors=[]
856  for xl in self:
857  try:
858  xs.append(float(xl[xkey]))
859  ys.append(float(xl[ykey]))
860  colors.append(float(xl[colorkey]))
861  except ValueError:
862  print("Value error for cross-link %s" % (xl[self.unique_id_key]))
863  continue
864 
865  cs=[]
866  for color in colors:
867  cindex=(color-min(colors))/(max(colors)-min(colors))
868  cs.append(cmap(cindex))
869 
870  fig = plt.figure()
871  ax = fig.add_subplot(111)
872  ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker="o")
873  ax.set_xlabel(xkey)
874  ax.set_ylabel(ykey)
875  plt.savefig(filename)
876  plt.show()
877  plt.close()
878 
879  if kwargs["type"] == "histogram":
880  valuekey=kwargs["valuekey"]
881  reference_xline=kwargs["reference_xline"]
882  valuename=valuekey
883  bins=kwargs["bins"]
884  values_list=[]
885  for xl in self:
886  try:
887  values_list.append(float(xl[valuekey]))
888  except ValueError:
889  print("Value error for cross-link %s" % (xl[self.unique_id_key]))
890  continue
891  IMP.pmi.output.plot_field_histogram(
892  filename, [values_list], valuename=valuename, bins=bins,
893  colors=None, format="pdf",
894  reference_xline=None, yplotrange=None,
895  xplotrange=None,normalized=True,
896  leg_names=None)
897 
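A sketch of the two supported plotting modes; the keys passed as xkey/ykey/colorkey/valuekey must be present (and convertible to float) in every cross-link, so the choices below are illustrative only.

cldb.plot("distance_vs_score.pdf", type="scatter",
          xkey=cldb.distance_key, ykey=cldb.id_score_key,
          colorkey=cldb.id_score_key)
cldb.plot("score_histogram.pdf", type="histogram",
          valuekey=cldb.id_score_key, bins=20, reference_xline=None)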
898  def dump(self,json_filename):
899  import json
900  with open(json_filename, 'w') as fp:
901  json.dump(self.data_base, fp, sort_keys=True, indent=2, default=set_json_default)
902 
903  def load(self,json_filename):
904  import json
905  with open(json_filename, 'r') as fp:
906  self.data_base = json.load(fp)
907  self._update()
908  #getting rid of unicode
909  # (can't do this in Python 3, since *everything* is Unicode there)
910  if sys.version_info[0] < 3:
911  for xl in self:
912  for k,v in xl.iteritems():
913  if type(k) is unicode: k=str(k)
914  if type(v) is unicode: v=str(v)
915  xl[k]=v
916 
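A JSON round-trip sketch: dump() serializes the whole data base (sets and Molecule objects are handled by set_json_default), and load() reads it back into a database created without a converter.

cldb.dump("xlinks.json")

cldb2 = CrossLinkDataBase()   # no converter needed when loading from JSON
cldb2.load("xlinks.json")
print(len(cldb), len(cldb2))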
917  def save_csv(self,filename):
918 
919  import csv
920 
921  data=[]
922  sorted_ids=None
923  sorted_group_ids=sorted(self.data_base.keys())
924  for group in sorted_group_ids:
925  group_block=[]
926  for xl in self.data_base[group]:
927  if not sorted_ids:
928  sorted_ids=sorted(xl.keys())
929  values=[xl[k] for k in sorted_ids]
930  group_block.append(values)
931  data+=group_block
932 
933 
934  with open(filename, 'w') as fp:
935  a = csv.writer(fp, delimiter=',')
936  a.writerow(sorted_ids)
937  a.writerows(data)
938 
939 class CrossLinkDataBaseFromStructure(object):
940  '''
941  This class generates a CrossLinkDataBase from a given structure
942  '''
943  def __init__(self,representation=None,
944  system=None,
945  residue_types_1=["K"],
946  residue_types_2=["K"],
947  reactivity_range=[0,1],
948  kt=1.0):
949 
950  import numpy.random
951  import math
952  cldbkc=CrossLinkDataBaseKeywordsConverter()
953  cldbkc.set_protein1_key("Protein1")
954  cldbkc.set_protein2_key("Protein2")
955  cldbkc.set_residue1_key("Residue1")
956  cldbkc.set_residue2_key("Residue2")
957  self.cldb=CrossLinkDataBase(cldbkc)
958  if representation is not None:
959  #PMI 1.0 mode
960  self.mode="pmi1"
961  self.representation=representation
962  self.model=self.representation.m
963  elif system is not None:
964  #PMI 2.0 mode
965  self.system=system
966  self.model=self.system.mdl
967  self.mode="pmi2"
968  else:
969  print("Argument error: please provide either a representation object or an IMP.Hierarchy")
970  raise
971  self.residue_types_1=residue_types_1
972  self.residue_types_2=residue_types_2
973  self.kt=kt
974  self.indexes_dict1={}
975  self.indexes_dict2={}
976  self.protein_residue_dict={}
977  self.reactivity_dictionary={}
978  self.euclidean_interacting_pairs=None
979  self.xwalk_interacting_pairs=None
980  import random
981 
982  if self.mode=="pmi1":
983  for protein in self.representation.sequence_dict.keys():
984  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
985  seq=self.representation.sequence_dict[protein]
986  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
987 
988  for r in residues:
989  # uniform random reactivities
990  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
991  # getting reactivities from the CDF of an exponential distribution
992  rexp=numpy.random.exponential(0.1)
993  prob=1.0-math.exp(-rexp)
994  self.reactivity_dictionary[(protein,r)]=prob
995 
996 
997  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
998  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
999  for r in residues1:
1000  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1001  p=h.get_particle()
1002  index=p.get_index()
1003  self.indexes_dict1[index]=(protein,r)
1004  self.protein_residue_dict[(protein,r)]=index
1005  for r in residues2:
1006  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1007  p=h.get_particle()
1008  index=p.get_index()
1009  self.indexes_dict2[index]=(protein,r)
1010  self.protein_residue_dict[(protein,r)]=index
1011 
1012  if self.mode=="pmi2":
1013  for state in self.system.get_states():
1014  for moleculename,molecules in state.get_molecules().items():
1015  for molecule in molecules:
1016  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
1017  seq=molecule.sequence
1018  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
1019 
1020  for r in residues:
1021  # uniform random reactivities
1022  #self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
1023  # getting reactivities from the CDF of an exponential distribution
1024  rexp=numpy.random.exponential(0.1)
1025  prob=1.0-math.exp(-rexp)
1026  self.reactivity_dictionary[(molecule,r)]=prob
1027 
1028  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
1029  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
1030  for r in residues1:
1031  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1032  ps=s.get_selected_particles()
1033  for p in ps:
1034  index=p.get_index()
1035  self.indexes_dict1[index]=(molecule,r)
1036  self.protein_residue_dict[(molecule,r)]=index
1037  for r in residues2:
1038  s=IMP.atom.Selection(molecule.hier,residue_index=r,resolution=1)
1039  ps=s.get_selected_particles()
1040  for p in ps:
1041  index=p.get_index()
1042  self.indexes_dict2[index]=(molecule,r)
1043  self.protein_residue_dict[(molecule,r)]=index
1044 
1045 
1046 
1047  def get_data_base(self,total_number_of_spectra,
1048  ambiguity_probability=0.1,
1049  noise=0.01,
1050  distance=21,
1051  max_delta_distance=10.0,
1052  xwalk_bin_path=None,
1053  confidence_false=0.75,
1054  confidence_true=0.75):
1055  import math
1056  from random import random,uniform
1057  import numpy as np
1058  number_of_spectra=1
1059 
1060  self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1061  self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1062  self.cldb.data_base[str(number_of_spectra)]=[]
1063  self.sites_weighted=None
1064 
1065  while number_of_spectra<total_number_of_spectra:
1066  if random() > ambiguity_probability and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1067  # new spectrum
1068  number_of_spectra+=1
1069  self.cldb.data_base[str(number_of_spectra)]=[]
1070  noisy=False
1071  if random() > noise:
1072  # not noisy crosslink
1073  pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1074  else:
1075  # noisy crosslink
1076  pra,dist=self.get_random_residue_pair(None,None,None)
1077  noisy=True
1078 
1079  new_xl={}
1080  if self.mode=="pmi1":
1081  new_xl[self.cldb.protein1_key]=pra[0]
1082  new_xl[self.cldb.protein2_key]=pra[1]
1083  elif self.mode=="pmi2":
1084  new_xl[self.cldb.protein1_key]=pra[0].get_name()
1085  new_xl[self.cldb.protein2_key]=pra[1].get_name()
1086  new_xl["molecule_object1"]=pra[0]
1087  new_xl["molecule_object2"]=pra[1]
1088  new_xl[self.cldb.residue1_key]=pra[2]
1089  new_xl[self.cldb.residue2_key]=pra[3]
1090  new_xl["Noisy"]=noisy
1091  # the reactivity is defined as r=1-exp(-k*Delta t)
1092  new_xl["Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1093  new_xl["Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1094  r1=new_xl["Reactivity_Residue1"]
1095  r2=new_xl["Reactivity_Residue2"]
1096  #combined reactivity 1-exp(-k12*Delta t),
1097  # k12=k1*k2/(k1+k2)
1098  new_xl["Reactivity"]=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1099  if noisy:
1100  #new_xl["Score"]=uniform(-1.0,1.0)
1101  new_xl["Score"]=np.random.beta(1.0,self.beta_false)
1102  else:
1103  #new_xl["Score"]=new_xl["Reactivity"]+uniform(0.0,2.0)
1104  new_xl["Score"]=1.0-np.random.beta(1.0,self.beta_true)
1105  new_xl["TargetDistance"]=dist
1106  new_xl["NoiseProbability"]=noise
1107  new_xl["AmbiguityProbability"]=ambiguity_probability
1108  # getting if it is intra or inter rigid body
1109  (p1,p2)=IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1110  self.protein_residue_dict[(pra[1],pra[3])]])
1111  if (IMP.core.RigidMember.get_is_setup(p1) and
1112  IMP.core.RigidMember.get_is_setup(p2) and
1113  IMP.core.RigidMember(p1).get_rigid_body() ==
1114  IMP.core.RigidMember(p2).get_rigid_body()):
1115  new_xl["InterRigidBody"] = False
1116  elif (IMP.core.RigidMember.get_is_setup(p1) and
1117  IMP.core.RigidMember.get_is_setup(p2) and
1118  IMP.core.RigidMember(p1).get_rigid_body() !=
1119  IMP.core.RigidMember(p2).get_rigid_body()):
1120  new_xl["InterRigidBody"] = True
1121  else:
1122  new_xl["InterRigidBody"] = None
1123 
1124  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1125  self.cldb._update()
1126  return self.cldb
1127 
1128 
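A short check of the combined-reactivity expression used in get_data_base(): with r = 1-exp(-k*Delta t) one gets k*Delta t = log(1/(1-r)); substituting k12 = k1*k2/(k1+k2) gives r12 = 1-exp(-(k1*Delta t)*(k2*Delta t)/((k1*Delta t)+(k2*Delta t))), which is exactly what the in-line formula computes. The numbers below are arbitrary.

import math
r1, r2 = 0.3, 0.6                      # arbitrary per-residue reactivities
k1dt = math.log(1.0/(1.0-r1))          # k1*Delta t, from r1 = 1-exp(-k1*Delta t)
k2dt = math.log(1.0/(1.0-r2))          # k2*Delta t
r12 = 1.0 - math.exp(-k1dt*k2dt/(k1dt+k2dt))
# identical to the in-line expression used in get_data_base():
r12_inline = 1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))
                          /math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
assert abs(r12 - r12_inline) < 1e-12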
1129  def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1130  import IMP.pmi.tools
1131  import math
1132  from random import choice,uniform
1133  if distance is None:
1134  # get a random pair
1135  while True:
1136  if self.mode=="pmi1":
1137  protein1=choice(self.representation.sequence_dict.keys())
1138  protein2=choice(self.representation.sequence_dict.keys())
1139  seq1=self.representation.sequence_dict[protein1]
1140  seq2=self.representation.sequence_dict[protein2]
1141  residue1=choice([i for i in range(1,len(seq1)+1) if seq1[i-1] in self.residue_types_1])
1142  residue2=choice([i for i in range(1,len(seq2)+1) if seq2[i-1] in self.residue_types_2])
1143  h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
1144  h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
1145  particle_distance=IMP.core.get_distance(IMP.core.XYZ(h1.get_particle()),IMP.core.XYZ(h2.get_particle()))
1146  if (protein1,residue1) != (protein2,residue2):
1147  break
1148  elif self.mode=="pmi2":
1149  (protein1,residue1)=choice(list(self.protein_residue_dict.keys()))
1150  (protein2,residue2)=choice(list(self.protein_residue_dict.keys()))
1151  index1=self.protein_residue_dict[(protein1,residue1)]
1152  index2=self.protein_residue_dict[(protein2,residue2)]
1153  particle_distance=IMP.core.get_distance(IMP.core.XYZ(IMP.get_particles(self.model,[index1])[0]),IMP.core.XYZ(IMP.get_particles(self.model,[index2])[0]))
1154  if (protein1,residue1) != (protein2,residue2):
1155  break
1156  else:
1157  # get a pair of residues whose distance is below the threshold
1158  if not xwalk_bin_path:
1159  gcpf=IMP.core.GridClosePairsFinder()
1160  gcpf.set_distance(distance+max_delta_distance)
1161 
1162  while True:
1163  #setup the reaction rates lists
1164  if not self.sites_weighted:
1165  self.sites_weighted=[]
1166  for key in self.reactivity_dictionary:
1167  r=self.reactivity_dictionary[key]
1168  self.sites_weighted.append((key,r))
1169  #get a random reaction site
1170  first_site=self.weighted_choice(self.sites_weighted)
1171  #get all distances
1172  if not self.euclidean_interacting_pairs:
1173  self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1174  self.indexes_dict1.keys(),
1175  self.indexes_dict2.keys())
1176  #get the partner for the first reacted site
1177  first_site_pairs = [pair for pair in self.euclidean_interacting_pairs
1178  if self.indexes_dict1[pair[0]] == first_site or
1179  self.indexes_dict2[pair[1]] == first_site]
1180  if len(first_site_pairs)==0: continue
1181  #build the list of second reaction sites
1182  second_sites_weighted=[]
1183  for pair in first_site_pairs:
1184  if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1185  if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1186  r=self.reactivity_dictionary[second_site]
1187  second_sites_weighted.append((second_site,r))
1188  second_site=self.weighted_choice(second_sites_weighted)
1189  """
1190  interacting_pairs_weighted=[]
1191  for pair in self.euclidean_interacting_pairs:
1192  r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1193  r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1194  #combined reactivity 1-exp(-k12*Delta t),
1195  # k12=k1*k2/(k1+k2)
1196  #print(r1,r2,dist)
1197  r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1198  interacting_pairs_weighted.append((pair,r12))
1199  #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1200  #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1201  #interacting_pairs_weighted.append((pair,weight1*weight2))
1202 
1203  while True:
1204  pair=self.weighted_choice(interacting_pairs_weighted)
1205  protein1,residue1=self.indexes_dict1[pair[0]]
1206  protein2,residue2=self.indexes_dict2[pair[1]]
1207  particle_pair=IMP.get_particles(self.model,pair)
1208  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1209  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1210  break
1211  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1212  #allow some flexibility
1213  prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1214  if uniform(0.0,1.0)<prob: break
1215  """
1216  protein1,residue1=first_site
1217  protein2,residue2=second_site
1218  print("CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1219  "First site",first_site,self.reactivity_dictionary[first_site],
1220  "Second site",second_site,self.reactivity_dictionary[second_site])
1221  particle_pair=IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1222  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1223 
1224  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1225  break
1226  elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1227  #allow some flexibility
1228  prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1229  if uniform(0.0,1.0)<prob: break
1230 
1231 
1232 
1233  else:
1234  if not self.xwalk_interacting_pairs:
1235  self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1236  interacting_pairs_weighted=[]
1237 
1238  for pair in self.xwalk_interacting_pairs:
1239  protein1=pair[0]
1240  protein2=pair[1]
1241  residue1=pair[2]
1242  residue2=pair[3]
1243  weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1244  weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1245  interacting_pairs_weighted.append((pair,weight1*weight2))
1246 
1247  pair=self.weighted_choice(interacting_pairs_weighted)
1248  protein1=pair[0]
1249  protein2=pair[1]
1250  residue1=pair[2]
1251  residue2=pair[3]
1252  particle_distance=float(pair[4])
1253 
1254 
1255 
1256  return ((protein1,protein2,residue1,residue2)),particle_distance
1257 
1258  def get_xwalk_distances(self,xwalk_bin_path,distance):
1259  import IMP.pmi.output
1260  import os
1261  o=IMP.pmi.output.Output(atomistic=True)
1262  o.init_pdb("xwalk.pdb",self.representation.prot)
1263  o.write_pdb("xwalk.pdb")
1264  namechainiddict=o.dictchain["xwalk.pdb"]
1265  chainiddict={}
1266 
1267  for key in namechainiddict: chainiddict[namechainiddict[key]]=key
1268  xwalkout=os.popen('java -Xmx256m -cp ' + xwalk_bin_path +' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+' -bb').read()
1269 
1270  output_list_of_distance=[]
1271  for line in xwalkout.split("\n")[0:-2]:
1272  tockens=line.split()
1273  first=tockens[2]
1274  second=tockens[3]
1275  distance=float(tockens[6])
1276  fs=first.split("-")
1277  ss=second.split("-")
1278  chainid1=fs[2]
1279  chainid2=ss[2]
1280  protein1=chainiddict[chainid1]
1281  protein2=chainiddict[chainid2]
1282  residue1=int(fs[1])
1283  residue2=int(ss[1])
1284  output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1285  return output_list_of_distance
1286 
1287 
1288  def weighted_choice(self,choices):
1289  import random
1290  # from http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
1291  total = sum(w for c, w in choices)
1292  r = random.uniform(0, total)
1293  upto = 0
1294  for c, w in choices:
1295  if upto + w > r:
1296  return c
1297  upto += w
1298  assert False, "Shouldn't get here"
1299 
1300  def save_rmf_snapshot(self,filename,color_id=None):
1301  import IMP.rmf
1302  import RMF
1303  if color_id is None:
1304  color_id="Reactivity"
1305  sorted_ids=None
1306  sorted_group_ids=sorted(self.cldb.data_base.keys())
1307  list_of_pairs=[]
1308  color_scores=[]
1309  for group in sorted_group_ids:
1310  group_xls=[]
1311  group_dists_particles=[]
1312  for xl in self.cldb.data_base[group]:
1313  xllabel=self.cldb.get_short_cross_link_string(xl)
1314  (c1,c2,r1,r2)=(xl["molecule_object1"],xl["molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1315  try:
1316  index1=self.protein_residue_dict[(c1,r1)]
1317  index2=self.protein_residue_dict[(c2,r2)]
1318  p1,p2=IMP.get_particles(self.model,[index1])[0],IMP.get_particles(self.model,[index2])[0]
1319  mdist=xl["TargetDistance"]
1320  except TypeError:
1321  print("TypeError or missing chain/residue ",r1,c1,r2,c2)
1322  continue
1323  group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1324  if group_dists_particles:
1325  (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key = lambda t: t[0])
1326  color_scores.append(mincolor_score)
1327  list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1328  else:
1329  continue
1330 
1331 
1332  m=self.model
1333  linear = IMP.core.Linear(0, 0.0)
1334  linear.set_slope(1.0)
1335  dps2 = IMP.core.DistancePairScore(linear)
1336  rslin = IMP.RestraintSet(m, 'linear_dummy_restraints')
1337  sgs=[]
1338  offset=min(color_scores)
1339  maxvalue=max(color_scores)
1340  for pair in list_of_pairs:
1341  pr = IMP.core.PairRestraint(m, dps2, (pair[0], pair[1]))
1342  pr.set_name(pair[2])
1343  factor=(pair[3]-offset)/(maxvalue-offset)
1344  c=IMP.display.get_rgb_color(factor)
1345  seg=IMP.algebra.Segment3D(IMP.core.XYZ(pair[0]).get_coordinates(),IMP.core.XYZ(pair[1]).get_coordinates())
1346  rslin.add_restraint(pr)
1347  sgs.append(IMP.display.SegmentGeometry(seg,c,pair[2]))
1348 
1349  rh = RMF.create_rmf_file(filename)
1350  IMP.rmf.add_hierarchies(rh, [self.system.hier])
1351  IMP.rmf.add_restraints(rh,[rslin])
1352  IMP.rmf.add_geometries(rh, sgs)
1353  IMP.rmf.save_frame(rh)
1354  del rh