IMP logo
IMP Reference Guide  2.6.0
The Integrative Modeling Platform
crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 
8 import IMP
9 import IMP.pmi
10 import operator
11 
12 class _CrossLinkDataBaseStandardKeys(object):
13  '''
14  This class setup all the standard keys needed to
15  identify the crosslink features from the data sets
16  '''
17  def __init__(self):
18  self.type={}
19  self.protein1_key="Protein1"
20  self.type[self.protein1_key]=str
21  self.protein2_key="Protein2"
22  self.type[self.protein2_key]=str
23  self.residue1_key="Residue1"
24  self.type[self.residue1_key]=int
25  self.residue2_key="Residue2"
26  self.type[self.residue2_key]=int
27  self.site_pairs_key="SitePairs"
28  self.type[self.site_pairs_key]=str
29  self.unique_id_key="XLUniqueID"
30  self.type[self.unique_id_key]=str
31  self.unique_sub_index_key="XLUniqueSubIndex"
32  self.type[self.unique_sub_index_key]=int
33  self.unique_sub_id_key="XLUniqueSubID"
34  self.type[self.unique_sub_id_key]=str
35  self.data_set_name_key="DataSetName"
36  self.type[self.data_set_name_key]=str
37  self.cross_linker_chemical_key="CrossLinkerChemical"
38  self.type[self.cross_linker_chemical_key]=str
39  self.id_score_key="IDScore"
40  self.type[self.id_score_key]=float
41  self.quantitation_key="Quantitation"
42  self.type[self.quantitation_key]=float
43  self.redundancy_key="Redundancy"
44  self.type[self.redundancy_key]=int
45  self.redundancy_list_key="RedundancyList"
46  self.type[self.redundancy_key]=list
47  self.state_key="State"
48  self.type[self.state_key]=int
49  self.sigma1_key="Sigma1"
50  self.type[self.sigma1_key]=str
51  self.sigma2_key="Sigma2"
52  self.type[self.sigma2_key]=str
53  self.psi_key="Psi"
54  self.type[self.psi_key]=str
55 
56  self.ordered_key_list =[self.data_set_name_key,
57  self.unique_id_key,
58  self.unique_sub_index_key,
59  self.unique_sub_id_key,
60  self.protein1_key,
61  self.protein2_key,
62  self.residue1_key,
63  self.residue2_key,
64  self.cross_linker_chemical_key,
65  self.id_score_key,
66  self.quantitation_key,
67  self.redundancy_key,
68  self.redundancy_list_key,
69  self.state_key,
70  self.sigma1_key,
71  self.sigma2_key,
72  self.psi_key]
73 
74 
75 class _ProteinsResiduesArray(tuple):
76  '''
77  This class is inherits from tuple, and it is a shorthand for a cross-link
78  (p1,p2,r1,r2) where p1 and p2 are protein1 and protein2, r1 and r2 are
79  residue1 and residue2.
80  '''
81 
82  def __new__(self,input_data):
83  '''
84  @input_data can be a dict or a tuple
85  '''
86  self.cldbsk=_CrossLinkDataBaseStandardKeys()
87  if type(input_data) is dict:
88  p1=input_data[self.cldbsk.protein1_key]
89  p2=input_data[self.cldbsk.protein2_key]
90  r1=input_data[self.cldbsk.residue1_key]
91  r2=input_data[self.cldbsk.residue2_key]
92  t=(p1,p2,r1,r2)
93  elif type(input_data) is tuple:
94  if len(input_data)>4:
95  raise TypeError("_ProteinsResiduesArray: must have only 4 elements")
96  if type(input_data[0]) is not str:
97  raise TypeError("_ProteinsResiduesArray: input_data[0] must be a string")
98  if type(input_data[1]) is not str:
99  raise TypeError("_ProteinsResiduesArray: input_data[1] must be a string")
100  if type(input_data[2]) is not int:
101  raise TypeError("_ProteinsResiduesArray: input_data[2] must be a integer")
102  if type(input_data[3]) is not int:
103  raise TypeError("_ProteinsResiduesArray: input_data[3] must be a integer")
104  t=input_data
105  else:
106  raise TypeError("_ProteinsResiduesArray: input must be a dict or tuple")
107  return tuple.__new__(_ProteinsResiduesArray, t)
108 
109  def get_inverted(self):
110  '''
111  Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
112  '''
113  return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
114 
115  def __str__(self):
116  outstr=self.cldbsk.protein1_key+" "+str(self[0])
117  outstr+=" "+self.cldbsk.protein2_key+" "+str(self[1])
118  outstr+=" "+self.cldbsk.residue1_key+" "+str(self[2])
119  outstr+=" "+self.cldbsk.residue2_key+" "+str(self[3])
120  return outstr
121 
122 class FilterOperator(object):
123  '''
124  This class allows to create filter functions that can be passed to the CrossLinkDataBase
125  in this way:
126 
127  fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
128 
129  where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
130 
131  A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
132 
133  fo.evaluate(xl)
134 
135  and it is used to filter the database
136  '''
137 
138  def __init__(self, argument1, operator, argument2):
139  '''
140  (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
141  or (FilterOperator1,operator.or|and...,FilterOperator2)
142 
143  we need to implement a NOT
144  '''
145  if isinstance(argument1, FilterOperator):
146  self.operations = [argument1, operator, argument2]
147  else:
148  self.operations = []
149  self.values = (argument1, operator, argument2)
150 
151  def __or__(self, FilterOperator2):
152  return FilterOperator(self, operator.or_, FilterOperator2)
153 
154  def __and__(self, FilterOperator2):
155  return FilterOperator(self, operator.and_, FilterOperator2)
156 
157  def evaluate(self, xl_item):
158 
159  if len(self.operations) == 0:
160  keyword, operator, value = self.values
161  return operator(xl_item[keyword], value)
162  FilterOperator1, op, FilterOperator2 = self.operations
163 
164  return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
165 
166 '''
167 def filter_factory(xl_):
168 
169  class FilterOperator(object):
170  import operator
171  xl = xl_
172 
173  def __new__(self,key,value,oper=operator.eq):
174  return oper(self.xl[key],value)
175 
176  return FilterOperator
177 '''
178 
179 class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
180  '''
181  This class is needed to convert the keywords from a generic database
182  to the standard ones
183  '''
184 
185  def __init__(self, list_parser=None):
186  '''
187  @param list_parser an instance of ResiduePairListParser, if any is needed
188  '''
189  self.converter={}
190  self.backward_converter={}
191  _CrossLinkDataBaseStandardKeys.__init__(self)
192  self.rplp = list_parser
193  if self.rplp is None:
194  # either you have protein1, protein2, residue1, residue2
195  self.compulsory_keys=set([self.protein1_key,
196  self.protein2_key,
197  self.residue1_key,
198  self.residue2_key])
199  else:
200  self.compulsory_keys=self.rplp.get_compulsory_keys()
201  self.setup_keys=set()
202 
203  def check_keys(self):
204  '''
205  Is a function that check whether necessary keys are setup
206  '''
207  setup_keys=set(self.get_setup_keys())
208  if self.compulsory_keys & setup_keys != self.compulsory_keys:
209  raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
210 
211  def get_setup_keys(self):
212  '''
213  Returns the keys that have been setup so far
214  '''
215  return self.backward_converter.keys()
216 
217  def set_unique_id_key(self,origin_key):
218  self.converter[origin_key]=self.unique_id_key
219  self.backward_converter[self.unique_id_key]=origin_key
220 
221  def set_protein1_key(self,origin_key):
222  self.converter[origin_key]=self.protein1_key
223  self.backward_converter[self.protein1_key]=origin_key
224 
225  def set_protein2_key(self,origin_key):
226  self.converter[origin_key]=self.protein2_key
227  self.backward_converter[self.protein2_key]=origin_key
228 
229  def set_residue1_key(self,origin_key):
230  self.converter[origin_key]=self.residue1_key
231  self.backward_converter[self.residue1_key]=origin_key
232 
233  def set_residue2_key(self,origin_key):
234  self.converter[origin_key]=self.residue2_key
235  self.backward_converter[self.residue2_key]=origin_key
236 
237  def set_site_pairs_key(self,origin_key):
238  self.converter[origin_key]=self.site_pairs_key
239  self.backward_converter[self.site_pairs_key]=origin_key
240 
241  def set_id_score_key(self,origin_key):
242  self.converter[origin_key]=self.id_score_key
243  self.backward_converter[self.id_score_key]=origin_key
244 
245  def set_quantitation_key(self,origin_key):
246  self.converter[origin_key]=self.quantitation_key
247  self.backward_converter[self.quantitation_key]=origin_key
248 
249  def set_psi_key(self,origin_key):
250  self.converter[origin_key]=self.psi_key
251  self.backward_converter[self.psi_key]=origin_key
252 
253  def get_converter(self):
254  '''
255  Returns the dictionary that convert the old keywords to the new ones
256  '''
257  self.check_keys()
258  return self.converter
259 
261  '''
262  Returns the dictionary that convert the new keywords to the old ones
263  '''
264  self.check_keys()
265  return self.backward_converter
266 
267 class ResiduePairListParser(_CrossLinkDataBaseStandardKeys):
268  '''
269  A class to handle different styles of site pairs parsers.
270  Implemented styles:
271  MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
272  [Y3-;K4-] for dead-ends
273  QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
274  '''
275 
276  import re
277 
278  def __init__(self,style):
279 
280  _CrossLinkDataBaseStandardKeys.__init__(self)
281  if style is "MSSTUDIO":
282  self.style=style
283  self.compulsory_keys= set([self.protein1_key,
284  self.protein2_key,
285  self.site_pairs_key])
286  elif style is "QUANTITATION":
287  self.style=style
288  self.compulsory_keys= set([self.site_pairs_key])
289  else:
290  raise Error("ResiduePairListParser: unknown list parser style")
291 
292  def get_compulsory_keys(self):
293  return self.compulsory_keys
294 
295  def get_list(self,input_string):
296  '''
297  This function returns a list of cross-linked residues and the corresponding list of
298  cross-linked chains. The latter list can be empty, if the style doesn't have the
299  corresponding information.
300  '''
301  if self.style == "MSSTUDIO":
302  input_string=input_string.replace("[","")
303  input_string=input_string.replace("]","")
304  input_string_pairs=input_string.split(";")
305  residue_pair_indexes=[]
306  chain_pair_indexes=[]
307  for s in input_string_pairs:
308  m1=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
309  m2=self.re.search(r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
310  if m1:
311  # cross link
312  residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
313  residue_pair_indexes.append((residue_index_1,residue_index_2))
314  elif m2:
315  # dead end
316  residue_type_1,residue_index_1=m2.group(1,2)
317  # at this stage chain_pair_indexes is empty
318  return residue_pair_indexes,chain_pair_indexes
319  if self.style == "QUANTITATION":
320  input_string=input_string.replace("x:","")
321  input_string_tockens=input_string.split(":")
322  residue_pair_indexes=[(input_string_tockens[1],input_string_tockens[3])]
323  chain_pair_indexes=[(input_string_tockens[0],input_string_tockens[2])]
324  return residue_pair_indexes,chain_pair_indexes
325 
326 
327 class FixedFormatParser(_CrossLinkDataBaseStandardKeys):
328  '''
329  A class to handle different XL format with fixed format
330  currently support ProXL
331  '''
332  def __init__(self,format):
333 
334  _CrossLinkDataBaseStandardKeys.__init__(self)
335  if format is "PROXL":
336  self.format=format
337  else:
338  raise Error("FixedFormatParser: unknown list format name")
339 
340 
341  def get_data(self,input_string):
342  if self.format is "PROXL":
343  tockens=input_string.split("\t")
344  xl={}
345  if tockens[0]=="SEARCH ID(S)":
346  return None
347  else:
348  xl[self.protein1_key]=tockens[2]
349  xl[self.protein2_key]=tockens[4]
350  xl[self.residue1_key]=int(tockens[3])
351  xl[self.residue2_key]=int(tockens[5])
352  return xl
353 
354 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
355  import operator
356  '''
357  this class handles a cross-link dataset and do filtering
358  operations, adding cross-links, merge datasets...
359  '''
360 
361  def __init__(self, converter=None, data_base=None):
362  '''
363  Constructor.
364  @param converter an instance of CrossLinkDataBaseKeywordsConverter
365  @param data_base an instance of CrossLinkDataBase to build the new database on
366  '''
367  if data_base is None:
368  self.data_base = {}
369  else:
370  self.data_base=data_base
371 
372  _CrossLinkDataBaseStandardKeys.__init__(self)
373  if converter is not None:
374  self.cldbkc = converter
375  self.list_parser=self.cldbkc.rplp
376  self.converter = converter.get_converter()
377 
378  else:
379  self.cldbkc = None
380  self.list_parser=None
381  self.converter = None
382 
383  self.__update()
384 
385  def __update(self):
386  '''
387  Update the whole dataset after changes
388  '''
389  self.update_cross_link_unique_sub_index()
390  self.update_cross_link_redundancy()
391 
392  def __iter__(self):
393  sorted_ids=sorted(self.data_base.keys())
394  for k in sorted_ids:
395  for xl in self.data_base[k]:
396  yield xl
397 
398  def xlid_iterator(self):
399  sorted_ids=sorted(self.data_base.keys())
400  for xlid in sorted_ids:
401  yield xlid
402 
403  def __getitem__(self,xlid):
404  return self.data_base[xlid]
405 
406  def __len__(self):
407  return len([xl for xl in self])
408 
409  def get_name(self):
410  return self.name
411 
412  def set_name(self,name):
413  new_data_base={}
414  for k in self.data_base:
415  new_data_base[k+"."+name]=self.data_base[k]
416  self.data_base=new_data_base
417  self.name=name
418  self.__update()
419 
420  def get_number_of_xlid(self):
421  return len(self.data_base)
422 
423 
424  def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
425  '''
426  if FixedFormatParser is not specified, the file is comma-separated-values
427  @param file_name a txt file to be parsed
428  @param converter an instance of CrossLinkDataBaseKeywordsConverter
429  @param FixedFormatParser a parser for a fixed format
430  '''
431  if not FixedFormatParser:
432  xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
433 
434  if converter is not None:
435  self.cldbkc = converter
436  self.list_parser=self.cldbkc.rplp
437  self.converter = converter.get_converter()
438 
439 
440  if not self.list_parser:
441  # normal procedure without a list_parser
442  # each line is a cross-link
443  new_xl_dict={}
444  for nxl,xl in enumerate(xl_list):
445  new_xl={}
446  for k in xl:
447  if k in self.converter:
448  new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
449  else:
450  new_xl[k]=xl[k]
451  if self.unique_id_key in self.cldbkc.get_setup_keys():
452  if new_xl[self.unique_id_key] not in new_xl_dict:
453  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
454  else:
455  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
456  else:
457  if str(nxl) not in new_xl_dict:
458  new_xl_dict[str(nxl)]=[new_xl]
459  else:
460  new_xl_dict[str(nxl)].append(new_xl)
461 
462  else:
463  # with a list_parser, a line can be a list of ambiguous crosslinks
464  new_xl_dict={}
465  for nxl,entry in enumerate(xl_list):
466 
467  # first get the translated keywords
468  new_dict={}
469  if self.site_pairs_key not in self.cldbkc.get_setup_keys():
470  raise Error("CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
471  for k in entry:
472  if k in self.converter:
473  new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
474  else:
475  new_dict[k]=entry[k]
476 
477  residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
478 
479  # then create the crosslinks
480  for n,p in enumerate(residue_pair_list):
481  new_xl={}
482  for k in new_dict:
483  new_xl[k]=new_dict[k]
484  new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
485  new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
486 
487  if len(chain_pair_list)==len(residue_pair_list):
488  new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
489  new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
490 
491  if self.unique_id_key in self.cldbkc.get_setup_keys():
492  if new_xl[self.unique_id_key] not in new_xl_dict:
493  new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
494  else:
495  new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
496  else:
497  if str(nxl) not in new_xl_dict:
498  new_xl[self.unique_id_key]=str(nxl+1)
499  new_xl_dict[str(nxl)]=[new_xl]
500  else:
501  new_xl[self.unique_id_key]=str(nxl+1)
502  new_xl_dict[str(nxl)].append(new_xl)
503 
504 
505  else:
506  '''
507  if FixedFormatParser is defined
508  '''
509 
510  new_xl_dict={}
511  f=open(file_name,"r")
512  nxl=0
513  for line in f:
514  xl=FixedFormatParser.get_data(line)
515  if xl:
516  xl[self.unique_id_key]=str(nxl+1)
517  new_xl_dict[str(nxl)]=[xl]
518  nxl+=1
519 
520 
521  self.data_base=new_xl_dict
522  self.name=file_name
523  self.__update()
524 
525  def update_cross_link_unique_sub_index(self):
526  for k in self.data_base:
527  for n,xl in enumerate(self.data_base[k]):
528  xl[self.unique_sub_index_key]=n+1
529  xl[self.unique_sub_id_key]=k+"."+str(n+1)
530 
531  def update_cross_link_redundancy(self):
532  redundancy_data_base={}
533  for xl in self:
534  pra=_ProteinsResiduesArray(xl)
535  if pra not in redundancy_data_base:
536  redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
537  redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
538  else:
539  redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
540  redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
541  for xl in self:
542  pra=_ProteinsResiduesArray(xl)
543  xl[self.redundancy_key]=len(redundancy_data_base[pra])
544  xl[self.redundancy_list_key]=redundancy_data_base[pra]
545 
546  def get_cross_link_string(self,xl):
547  string='|'
548  for k in self.ordered_key_list:
549  try:
550  string+=str(k)+":"+str(xl[k])+"|"
551  except KeyError:
552  continue
553 
554  for k in xl:
555  if k not in self.ordered_key_list:
556  string+=str(k)+":"+str(xl[k])+"|"
557 
558  return string
559 
560  def get_short_cross_link_string(self,xl):
561 
562  string='|'
563  list_of_keys=[self.data_set_name_key,
564  self.unique_sub_id_key,
565  self.protein1_key,
566  self.residue1_key,
567  self.protein2_key,
568  self.residue2_key,
569  self.state_key,
570  self.psi_key]
571 
572  for k in list_of_keys:
573  try:
574  string+=str(xl[k])+"|"
575  except KeyError:
576  continue
577 
578  return string
579 
580  def filter(self,FilterOperator):
581  new_xl_dict={}
582  for id in self.data_base.keys():
583  for xl in self.data_base[id]:
584  if FilterOperator.evaluate(xl):
585  if id not in new_xl_dict:
586  new_xl_dict[id]=[xl]
587  else:
588  new_xl_dict[id].append(xl)
589  return CrossLinkDataBase(self.cldbkc,new_xl_dict)
590 
591 
592  def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
593  '''
594  This function merges two cross-link datasets so that if two conflicting crosslinks have the same
595  cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
596  with different SubIDs
597  '''
598  pass
599 
600  def append_database(self,CrossLinkDataBase2):
601  '''
602  This function append one cross-link dataset to another. Unique ids will be renamed to avoid
603  conflicts.
604  '''
605  name1=self.get_name()
606  name2=CrossLinkDataBase2.get_name()
607  if name1 == name2:
608  name1=id(self)
609  name2=id(CrossLinkDataBase2)
610 
611  #rename first database:
612  new_data_base={}
613  for k in self.data_base:
614  new_data_base[k]=self.data_base[k]
615  for k in CrossLinkDataBase2.data_base:
616  new_data_base[k]=CrossLinkDataBase2.data_base[k]
617  self.data_base=new_data_base
618  self.__update()
619 
620  def set_value(self,key,new_value,FilterOperator=None):
621  '''
622  This function changes the value for a given key in the database
623  For instance one can change the name of a protein
624  @param key: the key in the database that must be changed
625  @param new_value: the new value of the key
626  @param FilterOperator: optional FilterOperator to change the value to
627  a subset of the database
628 
629  example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
630  '''
631 
632  for xl in self:
633  if FilterOperator is not None:
634  if FilterOperator.evaluate(xl):
635  xl[key]=new_value
636  else:
637  xl[key]=new_value
638  self.__update()
639 
640  def get_values(self,key):
641  '''
642  this function returns the list of values for a given key in the database
643  alphanumerically sorted
644  '''
645  values=set()
646  for xl in self:
647  values.add(xl[key])
648  return sorted(list(values))
649 
650  def offset_residue_index(self,protein_name,offset):
651  '''
652  This function offset the residue indexes of a given protein by a specified value
653  @param protein_name: the protein name that need to be changed
654  @param offset: the offset value
655  '''
656 
657  for xl in self:
658  if xl[self.protein1_key] == protein_name:
659  xl[self.residue1_key]=xl[self.residue1_key]+offset
660  if xl[self.protein2_key] == protein_name:
661  xl[self.residue2_key]=xl[self.residue2_key]+offset
662  self.__update()
663 
664  def create_new_keyword(self,keyword,values_from_keyword=None):
665  '''
666  This function creates a new keyword for the whole database and set the values from
667  and existing keyword (optional), otherwise the values are set to None
668  @param keyword the new keyword name:
669  @param values_from_keyword the keyword from which we are copying the values:
670  '''
671  for xl in self:
672  if values_from_keyword is not None:
673  xl[keyword] = xl[values_from_keyword]
674  else:
675  xl[keyword] = None
676  self.__update()
677 
678  def clone_protein(self,protein_name,new_protein_name):
679  new_xl_dict={}
680  for id in self.data_base.keys():
681  new_data_base=[]
682  for xl in self.data_base[id]:
683  new_data_base.append(xl)
684  if xl[self.protein1_key]==protein_name and xl[self.protein2_key]!=protein_name:
685  new_xl=dict(xl)
686  new_xl[self.protein1_key]=new_protein_name
687  new_data_base.append(new_xl)
688  elif xl[self.protein1_key]!=protein_name and xl[self.protein2_key]==protein_name:
689  new_xl=dict(xl)
690  new_xl[self.protein2_key]=new_protein_name
691  new_data_base.append(new_xl)
692  elif xl[self.protein1_key]==protein_name and xl[self.protein2_key]==protein_name:
693  new_xl=dict(xl)
694  new_xl[self.protein1_key]=new_protein_name
695  new_data_base.append(new_xl)
696  new_xl=dict(xl)
697  new_xl[self.protein2_key]=new_protein_name
698  new_data_base.append(new_xl)
699  new_xl=dict(xl)
700  new_xl[self.protein1_key]=new_protein_name
701  new_xl[self.protein2_key]=new_protein_name
702  new_data_base.append(new_xl)
703  self.data_base[id]=new_data_base
704  self.__update()
705 
706  def filter_out_same_residues(self):
707  '''
708  This function remove cross-links applied to the same residue
709  (ie, same chain name and residue number)
710  '''
711  new_xl_dict={}
712  for id in self.data_base.keys():
713  new_data_base=[]
714  for xl in self.data_base[id]:
715  if xl[self.protein1_key]==xl[self.protein2_key] and xl[self.residue1_key]==xl[self.residue2_key]:
716  continue
717  else:
718  new_data_base.append(xl)
719  self.data_base[id]=new_data_base
720  self.__update()
721 
722 
723  def jackknife(self,percentage):
724  '''
725  this method returns a CrossLinkDataBase class containing
726  a random subsample of the original cross-link database.
727  @param percentage float between 0 and 1, is the percentage of
728  of spectra taken from the original list
729  '''
730  import random
731  if percentage > 1.0 or percentage < 0.0:
732  raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
733  nspectra=self.get_number_of_xlid()
734  nrandom_spectra=int(nspectra*percentage)
735  random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
736  new_data_base={}
737  for k in random_keys:
738  new_data_base[k]=self.data_base[k]
739  return CrossLinkDataBase(self.cldbkc,new_data_base)
740 
741  def __str__(self):
742  outstr=''
743  sorted_ids=sorted(self.data_base.keys())
744 
745  for id in sorted_ids:
746  outstr+=id+"\n"
747  for xl in self.data_base[id]:
748  for k in self.ordered_key_list:
749  try:
750  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
751  except KeyError:
752  continue
753 
754  for k in xl:
755  if k not in self.ordered_key_list:
756  outstr+="--- "+str(k)+" "+str(xl[k])+"\n"
757  outstr+="-------------\n"
758  return outstr
759 
760  def dump(self,json_filename):
761  import json
762  with open(json_filename, 'w') as fp:
763  json.dump(self.data_base, fp, sort_keys=True, indent=2)
764 
765  def load(self,json_filename):
766  import json
767  with open(json_filename, 'r') as fp:
768  self.data_base = json.load(fp)
769  self.__update()
770 
771  def save_csv(self,filename):
772 
773  import csv
774 
775  data=[]
776  sorted_ids=None
777  sorted_group_ids=sorted(self.data_base.keys())
778  for group in sorted_group_ids:
779  group_block=[]
780  for xl in self.data_base[group]:
781  if not sorted_ids:
782  sorted_ids=sorted(xl.keys())
783  values=[xl[k] for k in sorted_ids]
784  group_block.append(values)
785  data+=group_block
786 
787  with open(filename, 'w') as fp:
788  a = csv.writer(fp, delimiter=',')
789  a.writerows(data)
790 
792  '''
793  This class generates a CrossLinkDataBase from a given structure
794  '''
795  def __init__(self,representation,
796  residue_types_1=["K"],
797  residue_types_2=["K"],
798  reactivity_range=[0,3],
799  kt=1.0):
800 
802  cldbkc.set_protein1_key("Protein1")
803  cldbkc.set_protein2_key("Protein2")
804  cldbkc.set_residue1_key("Residue1")
805  cldbkc.set_residue2_key("Residue2")
806  self.cldb=CrossLinkDataBase(cldbkc)
807  self.representation=representation
808  self.residue_types_1=residue_types_1
809  self.residue_types_2=residue_types_2
810  self.kt=kt
811  self.indexes_dict1={}
812  self.indexes_dict2={}
813  self.protein_residue_dict={}
814  self.reactivity_dictionary={}
815  self.euclidean_interacting_pairs=None
816  self.xwalk_interacting_pairs=None
817  import random
818 
819  for protein in self.representation.sequence_dict.keys():
820  # we are saving a dictionary with protein name, residue number and random reactivity of the residue
821  seq=self.representation.sequence_dict[protein]
822  residues=[i for i in range(1,len(seq)+1) if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2))]
823  for i in range(1,len(seq)+1):
824  if ((seq[i-1] in self.residue_types_1) or (seq[i-1] in self.residue_types_2)):
825  print(i, protein, seq[i-1])
826 
827  for r in residues:
828  self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
829 
830  residues1=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_1]
831  residues2=[i for i in range(1,len(seq)+1) if seq[i-1] in self.residue_types_2]
832  for r in residues1:
833  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
834  p=h.get_particle()
835  index=p.get_index()
836  self.indexes_dict1[index]=(protein,r)
837  self.protein_residue_dict[(protein,r)]=index
838  for r in residues2:
839  h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
840  p=h.get_particle()
841  index=p.get_index()
842  self.indexes_dict2[index]=(protein,r)
843  self.protein_residue_dict[(protein,r)]=index
844 
845 
846  def get_data_base(self,total_number_of_spectra,
847  ambiguity_probability=0.1,
848  noise=0.01,
849  distance=21,
850  xwalk_bin_path=None):
851 
852  from random import random
853  number_of_spectra=1
854 
855  self.cldb.data_base[str(number_of_spectra)]=[]
856  while number_of_spectra<total_number_of_spectra:
857  if random() > ambiguity_probability and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
858  # new spectrum
859  number_of_spectra+=1
860  self.cldb.data_base[str(number_of_spectra)]=[]
861  noisy="F"
862  if random() > noise:
863  # not noisy crosslink
864  pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path)
865 
866  else:
867  # noisy crosslink
868  pra,dist=self.get_random_residue_pair(None,None)
869  noisy="T"
870 
871  new_xl={}
872  new_xl[self.cldb.protein1_key]=pra[0]
873  new_xl[self.cldb.protein2_key]=pra[1]
874  new_xl[self.cldb.residue1_key]=pra[2]
875  new_xl[self.cldb.residue2_key]=pra[3]
876  new_xl["Noisy"]=noisy
877  new_xl["Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
878  new_xl["Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
879  new_xl["TargetDistance"]=dist
880  new_xl["NoiseProbability"]=noise
881  new_xl["AmbiguityProbability"]=ambiguity_probability
882  new_xl["TargetDistance"]=dist
883  # getting if it is intra or inter rigid body
884  (p1,p2)=IMP.kernel.get_particles(self.representation.m,[self.protein_residue_dict[(pra[0],pra[2])],
885  self.protein_residue_dict[(pra[1],pra[3])]])
888  IMP.core.RigidMember(p1).get_rigid_body() ==
889  IMP.core.RigidMember(p2).get_rigid_body()):
890  new_xl["InterRigidBody"] = False
893  IMP.core.RigidMember(p1).get_rigid_body() !=
894  IMP.core.RigidMember(p2).get_rigid_body()):
895  new_xl["InterRigidBody"] = True
896  else:
897  new_xl["InterRigidBody"] = None
898 
899  self.cldb.data_base[str(number_of_spectra)].append(new_xl)
900  self.cldb.__update()
901  return self.cldb
902 
903 
904  def get_random_residue_pair(self,distance=None,xwalk_bin_path=None):
905  import IMP.pmi.tools
906  import math
907  from random import choice
908  if distance is None:
909  # get a random pair
910  while True:
911  protein1=choice(self.representation.sequence_dict.keys())
912  protein2=choice(self.representation.sequence_dict.keys())
913  seq1=self.representation.sequence_dict[protein1]
914  seq2=self.representation.sequence_dict[protein2]
915  residue1=choice([i for i in range(1,len(seq1)+1) if seq1[i-1] in self.residue_types_1])
916  residue2=choice([i for i in range(1,len(seq2)+1) if seq2[i-1] in self.residue_types_2])
917  h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
918  h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
919  particle_distance=IMP.core.get_distance(IMP.core.XYZ(h1.get_particle()),IMP.core.XYZ(h2.get_particle()))
920  if (protein1,residue1) != (protein2,residue2):
921  break
922  else:
923  # get a pair of residues whose distance is below the threshold
924  if not xwalk_bin_path:
926  gcpf.set_distance(distance)
927  if not self.euclidean_interacting_pairs:
928  self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.representation.m,
929  self.indexes_dict1.keys(),
930  self.indexes_dict2.keys())
931  interacting_pairs_weighted=[]
932  for pair in self.euclidean_interacting_pairs:
933  weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
934  weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
935  interacting_pairs_weighted.append((pair,weight1*weight2))
936 
937  while True:
938  pair=self.weighted_choice(interacting_pairs_weighted)
939  protein1,residue1=self.indexes_dict1[pair[0]]
940  protein2,residue2=self.indexes_dict2[pair[1]]
941  particle_pair=IMP.kernel.get_particles(self.representation.m,pair)
942  particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
943  if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
944  break
945 
946  else:
947  if not self.xwalk_interacting_pairs:
948  self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
949  interacting_pairs_weighted=[]
950 
951  print(self.reactivity_dictionary)
952 
953  for pair in self.xwalk_interacting_pairs:
954  protein1=pair[0]
955  protein2=pair[1]
956  residue1=pair[2]
957  residue2=pair[3]
958  weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
959  weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
960  interacting_pairs_weighted.append((pair,weight1*weight2))
961 
962  pair=self.weighted_choice(interacting_pairs_weighted)
963  protein1=pair[0]
964  protein2=pair[1]
965  residue1=pair[2]
966  residue2=pair[3]
967  particle_distance=float(pair[4])
968 
969 
970 
971  return _ProteinsResiduesArray((protein1,protein2,residue1,residue2)),particle_distance
972 
973  def get_xwalk_distances(self,xwalk_bin_path,distance):
974  import IMP.pmi.output
975  import os
976  o=IMP.pmi.output.Output(atomistic=True)
977  o.init_pdb("xwalk.pdb",self.representation.prot)
978  o.write_pdb("xwalk.pdb")
979  namechainiddict=o.dictchain["xwalk.pdb"]
980  chainiddict={}
981 
982  for key in namechainiddict: chainiddict[namechainiddict[key]]=key
983  xwalkout=os.popen('java -Xmx256m -cp ' + xwalk_bin_path +' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+' -bb').read()
984 
985  output_list_of_distance=[]
986  for line in xwalkout.split("\n")[0:-2]:
987  tockens=line.split()
988  first=tockens[2]
989  second=tockens[3]
990  distance=float(tockens[6])
991  fs=first.split("-")
992  ss=second.split("-")
993  chainid1=fs[2]
994  chainid2=ss[2]
995  protein1=chainiddict[chainid1]
996  protein2=chainiddict[chainid2]
997  residue1=int(fs[1])
998  residue2=int(ss[1])
999  print(protein1,protein2,residue1,residue2,distance)
1000  output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1001  return output_list_of_distance
1002 
1003 
1004  def weighted_choice(self,choices):
1005 
1006  import random
1007  # from http://stackoverflow.com/questions/3679694/a-weighted-version-of-random-choice
1008  total = sum(w for c, w in choices)
1009  r = random.uniform(0, total)
1010  upto = 0
1011  for c, w in choices:
1012  if upto + w > r:
1013  return c
1014  upto += w
1015  assert False, "Shouldn't get here"
Miscellaneous utilities.
Definition: tools.py:1
static bool get_is_setup(const IMP::ParticleAdaptor &p)
Definition: rigid_bodies.h:469
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
Definition: XYZR.h:87
A decorator for a particle with x,y,z coordinates.
Definition: XYZ.h:30
Class for easy writing of PDBs, RMFs, and stat files.
Definition: output.py:32
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Definition: output.py:1
Python classes to represent, score, sample and analyze models.