IMP logo
IMP Reference Guide  2.5.0
The Integrative Modeling Platform
crosslink.py
1 """@namespace IMP.pmi.io.crosslink
2  Handles cross-link data sets.
3 
4  Utilities are also provided to help in the analysis of models that
5  contain cross-links.
6 """
7 
8 import IMP
9 import IMP.pmi
10 import operator
11 
class _CrossLinkDataBaseStandardKeys(object):
    '''
    Set up the standard keys used to identify cross-link features
    in the data sets.

    Each key name is stored in a ``*_key`` attribute, and ``self.type``
    maps every standard key name to the Python type used for its value.
    ``self.ordered_key_list`` gives the order in which the standard keys
    are listed.
    '''
    def __init__(self):
        self.type = {}
        self.protein1_key = "Protein1"
        self.type[self.protein1_key] = str
        self.protein2_key = "Protein2"
        self.type[self.protein2_key] = str
        self.residue1_key = "Residue1"
        self.type[self.residue1_key] = int
        self.residue2_key = "Residue2"
        self.type[self.residue2_key] = int
        self.unique_id_key = "XLUniqueID"
        self.type[self.unique_id_key] = str
        self.unique_sub_index_key = "XLUniqueSubIndex"
        self.type[self.unique_sub_index_key] = int
        self.unique_sub_id_key = "XLUniqueSubID"
        self.type[self.unique_sub_id_key] = str
        self.data_set_name_key = "DataSetName"
        self.type[self.data_set_name_key] = str
        self.cross_linker_chemical_key = "CrossLinkerChemical"
        self.type[self.cross_linker_chemical_key] = str
        self.id_score_key = "IDScore"
        self.type[self.id_score_key] = float
        self.quantitation_key = "Quantitation"
        self.type[self.quantitation_key] = float
        self.redundancy_key = "Redundancy"
        self.type[self.redundancy_key] = int
        self.redundancy_list_key = "RedundancyList"
        # The list type belongs to the redundancy *list* key; registering
        # it under redundancy_key would overwrite the int type above.
        self.type[self.redundancy_list_key] = list
        self.state_key = "State"
        self.type[self.state_key] = int
        self.sigma1_key = "Sigma1"
        self.type[self.sigma1_key] = float
        self.sigma2_key = "Sigma2"
        self.type[self.sigma2_key] = float
        self.psi_key = "Psi"
        self.type[self.psi_key] = float

        # Order in which the standard keys are reported.
        self.ordered_key_list = [self.data_set_name_key,
                                 self.unique_id_key,
                                 self.unique_sub_index_key,
                                 self.unique_sub_id_key,
                                 self.protein1_key,
                                 self.protein2_key,
                                 self.residue1_key,
                                 self.residue2_key,
                                 self.cross_linker_chemical_key,
                                 self.id_score_key,
                                 self.quantitation_key,
                                 self.redundancy_key,
                                 self.redundancy_list_key,
                                 self.state_key,
                                 self.sigma1_key,
                                 self.sigma2_key,
                                 self.psi_key]
71 
72 
class _ProteinsResiduesArray(tuple):
    '''
    This class inherits from tuple, and it is a shorthand for a cross-link
    (p1,p2,r1,r2) where p1 and p2 are protein1 and protein2, r1 and r2 are
    residue1 and residue2.
    '''

    # Standard key names, created on first use and shared by all instances.
    # Kept as a class attribute called cldbsk because that is where the
    # dict branch of the constructor has always stored it.
    cldbsk = None

    @classmethod
    def _get_standard_keys(cls):
        '''
        Return the shared _CrossLinkDataBaseStandardKeys instance,
        creating it the first time it is needed.
        '''
        if _ProteinsResiduesArray.cldbsk is None:
            _ProteinsResiduesArray.cldbsk = _CrossLinkDataBaseStandardKeys()
        return _ProteinsResiduesArray.cldbsk

    def __new__(cls, input_data):
        '''
        @param input_data can be a dict or a tuple. A dict must contain the
               standard protein1, protein2, residue1 and residue2 keys;
               a tuple must be exactly (str, str, int, int).
        Raises TypeError if input_data is neither a dict nor a tuple, or if
        a tuple does not have exactly four elements of the expected types.
        '''
        if isinstance(input_data, dict):
            keys = cls._get_standard_keys()
            t = (input_data[keys.protein1_key],
                 input_data[keys.protein2_key],
                 input_data[keys.residue1_key],
                 input_data[keys.residue2_key])
        elif isinstance(input_data, tuple):
            # Check the exact length up front: a shorter tuple would
            # otherwise fail with an IndexError in the type checks below.
            if len(input_data) != 4:
                raise TypeError("_ProteinsResiduesArray: must have only 4 elements")
            if type(input_data[0]) is not str:
                raise TypeError("_ProteinsResiduesArray: input_data[0] must be a string")
            if type(input_data[1]) is not str:
                raise TypeError("_ProteinsResiduesArray: input_data[1] must be a string")
            if type(input_data[2]) is not int:
                raise TypeError("_ProteinsResiduesArray: input_data[2] must be a integer")
            if type(input_data[3]) is not int:
                raise TypeError("_ProteinsResiduesArray: input_data[3] must be a integer")
            t = input_data
        else:
            raise TypeError("_ProteinsResiduesArray: input must be a dict or tuple")
        return tuple.__new__(cls, t)

    def get_inverted(self):
        '''
        Returns a _ProteinsResiduesArray instance with protein1 and protein2
        (and the corresponding residues) inverted
        '''
        return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))

    def __str__(self):
        # Fetch the key names lazily so that this also works for instances
        # that were built from a tuple.
        keys = self._get_standard_keys()
        outstr = keys.protein1_key + " " + str(self[0])
        outstr += " " + keys.protein2_key + " " + str(self[1])
        outstr += " " + keys.residue1_key + " " + str(self[2])
        outstr += " " + keys.residue2_key + " " + str(self[3])
        return outstr
119 
class FilterOperator(object):
    '''
    Build filter functions that can be passed to the CrossLinkDataBase,
    for instance:

    fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")

    where cldb is a CrossLinkDataBase instance, needed only to get the
    standard keywords.

    A filter operator is evaluated on a CrossLinkDataBase item xl and
    returns the result of the comparison (or of the combination of two
    filters):

    fo.evaluate(xl)

    and it is used to filter the database
    '''

    def __init__(self, argument1, operator, argument2):
        '''
        (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
        or (FilterOperator1,operator.or|and...,FilterOperator2)

        A leaf filter stores its triple in self.values and has an empty
        self.operations; a composite filter stores it in self.operations.

        we need to implement a NOT
        '''
        triple = (argument1, operator, argument2)
        if not isinstance(argument1, FilterOperator):
            self.operations = []
            self.values = triple
        else:
            self.operations = list(triple)

    def __or__(self, FilterOperator2):
        # self | other: composite filter combined with operator.or_
        return FilterOperator(self, operator.or_, FilterOperator2)

    def __and__(self, FilterOperator2):
        # self & other: composite filter combined with operator.and_
        return FilterOperator(self, operator.and_, FilterOperator2)

    def evaluate(self, xl_item):
        '''
        Apply the filter to xl_item. A composite filter evaluates both
        operands (left first) and combines the two results; a leaf filter
        compares xl_item[keyword] with the stored value.
        '''
        if self.operations:
            left, combine, right = self.operations
            left_result = left.evaluate(xl_item)
            right_result = right.evaluate(xl_item)
            return combine(left_result, right_result)

        key, compare, reference = self.values
        return compare(xl_item[key], reference)
163 
164 '''
165 def filter_factory(xl_):
166 
167  class FilterOperator(object):
168  import operator
169  xl = xl_
170 
171  def __new__(self,key,value,oper=operator.eq):
172  return oper(self.xl[key],value)
173 
174  return FilterOperator
175 '''
176 
class CrossLinkDataBaseKeywordsConverter(_CrossLinkDataBaseStandardKeys):
    '''
    This class is needed to convert the keywords from a generic database
    to the standard ones.

    Register each column name of the original data with the matching
    set_*_key() method. Protein1, Protein2, Residue1 and Residue2 are
    compulsory: the converter dictionaries cannot be retrieved until all
    four have been set.
    '''

    def __init__(self):
        # original keyword -> standard keyword
        self.converter = {}
        # standard keyword -> original keyword
        self.backward_converter = {}
        _CrossLinkDataBaseStandardKeys.__init__(self)
        self.compulsory_keys = set([self.protein1_key,
                                    self.protein2_key,
                                    self.residue1_key,
                                    self.residue2_key])
        self.setup_keys = set()

    def check_keys(self):
        '''
        Check whether the necessary keys are setup.
        Raises KeyError if any compulsory key has not been set.
        '''
        if not self.compulsory_keys.issubset(self.get_setup_keys()):
            raise KeyError("CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")

    def get_setup_keys(self):
        '''
        Returns the (standard) keys that have been setup so far
        '''
        return self.backward_converter.keys()

    def _set_key(self, origin_key, standard_key):
        '''
        Record the mapping between origin_key and standard_key in both
        the forward and the backward converter.
        '''
        self.converter[origin_key] = standard_key
        self.backward_converter[standard_key] = origin_key

    def set_unique_id_key(self, origin_key):
        self._set_key(origin_key, self.unique_id_key)

    def set_protein1_key(self, origin_key):
        self._set_key(origin_key, self.protein1_key)

    def set_protein2_key(self, origin_key):
        self._set_key(origin_key, self.protein2_key)

    def set_residue1_key(self, origin_key):
        self._set_key(origin_key, self.residue1_key)

    def set_residue2_key(self, origin_key):
        self._set_key(origin_key, self.residue2_key)

    def set_idscore_key(self, origin_key):
        self._set_key(origin_key, self.id_score_key)

    def set_quantification_key(self, origin_key):
        self._set_key(origin_key, self.quantitation_key)

    def get_converter(self):
        '''
        Returns the dictionary that convert the old keywords to the new ones.
        Raises KeyError if a compulsory key has not been set.
        '''
        self.check_keys()
        return self.converter

    def get_backward_converter(self):
        '''
        Returns the dictionary that convert the new keywords to the old ones.
        Raises KeyError if a compulsory key has not been set.
        '''
        self.check_keys()
        return self.backward_converter
248 
class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
    '''
    This class handles a cross-link dataset and does filtering
    operations, adding cross-links, merging datasets...

    The data is stored in self.data_base, a dictionary that maps each
    cross-link unique id to the list of cross-links (dictionaries keyed
    by the standard keywords) that share that id.
    '''
    # Kept from the original class body; it exposes the module as the
    # class attribute CrossLinkDataBase.operator.
    import operator

    def __init__(self, CrossLinkDataBaseKeywordsConverter, data_base=None):
        '''
        To be constructed it needs a CrossLinkDataBaseKeywordsConverter
        instance first.

        @param CrossLinkDataBaseKeywordsConverter the converter instance;
               its compulsory keys must already be set, otherwise
               get_converter() raises KeyError
        @param data_base optional dictionary {unique id: [cross-links]};
               it is used as is (not copied) and its cross-links are
               updated in place
        '''
        if data_base is None:
            self.data_base = {}
        else:
            self.data_base = data_base
        # Always define the name, so that get_name() and append() also work
        # on databases that were not read from a file.
        self.name = None
        self.cldbkc = CrossLinkDataBaseKeywordsConverter
        _CrossLinkDataBaseStandardKeys.__init__(self)
        self.converter = CrossLinkDataBaseKeywordsConverter.get_converter()
        self.__update__()

    def __update__(self):
        '''
        Update the whole dataset after changes
        '''
        self.update_cross_link_unique_sub_index()
        self.update_cross_link_redundancy()

    def __iter__(self):
        '''
        Iterate over all the cross-links, regardless of their unique id
        '''
        for k in self.data_base:
            for xl in self.data_base[k]:
                yield xl

    def xlid_iterator(self):
        '''
        Iterate over the cross-link unique ids
        '''
        for xlid in self.data_base.keys():
            yield xlid

    def __getitem__(self, xlid):
        return self.data_base[xlid]

    def __len__(self):
        '''
        Total number of cross-links (not of unique ids)
        '''
        return sum(len(xls) for xls in self.data_base.values())

    def get_name(self):
        return self.name

    def set_name(self, name):
        self.name = name

    def get_number_of_xlid(self):
        return len(self.data_base)

    def create_set_from_file(self, csv_file_name):
        '''
        Replace the content of the database with the cross-links read from
        a csv file, and use the file name as the database name.

        Columns registered in the keywords converter are renamed to the
        standard keywords and cast to the standard types; other columns
        are stored unchanged. If no unique id column was registered, each
        row gets its row number (as a string) as unique id.
        '''
        # Imported here because the module only imports IMP.pmi at the top.
        import IMP.pmi.tools
        # presumably one dict-like record per csv row; verify against
        # IMP.pmi.tools.get_db_from_csv
        xl_list = IMP.pmi.tools.get_db_from_csv(csv_file_name)
        has_unique_id = self.unique_id_key in self.cldbkc.get_setup_keys()
        new_xl_dict = {}
        for nxl, xl in enumerate(xl_list):
            new_xl = {}
            for k in xl:
                if k in self.converter:
                    standard_key = self.converter[k]
                    new_xl[standard_key] = self.type[standard_key](xl[k])
                else:
                    new_xl[k] = xl[k]
            if has_unique_id:
                xlid = new_xl[self.unique_id_key]
                new_xl_dict.setdefault(xlid, []).append(new_xl)
            else:
                new_xl_dict[str(nxl)] = [new_xl]

        self.data_base = new_xl_dict
        self.name = csv_file_name
        self.__update__()

    def update_cross_link_unique_sub_index(self):
        '''
        Number the cross-links that share a unique id (starting from 1)
        and store "<unique id>.<index>" as their unique sub id
        '''
        for k in self.data_base:
            for n, xl in enumerate(self.data_base[k]):
                xl[self.unique_sub_index_key] = n + 1
                xl[self.unique_sub_id_key] = str(k) + "." + str(n + 1)

    def update_cross_link_redundancy(self):
        '''
        Store in each cross-link the list of unique sub ids of the
        cross-links that connect the same residue pair (in either order),
        and the length of that list
        '''
        redundancy_data_base = {}
        for xl in self:
            pra = _ProteinsResiduesArray(xl)
            sub_ids = redundancy_data_base.get(pra)
            if sub_ids is None:
                # A pair and its inversion share one list, so that a
                # cross-link equal to its own inversion is counted once.
                sub_ids = []
                redundancy_data_base[pra] = sub_ids
                redundancy_data_base[pra.get_inverted()] = sub_ids
            sub_ids.append(xl[self.unique_sub_id_key])
        for xl in self:
            pra = _ProteinsResiduesArray(xl)
            xl[self.redundancy_key] = len(redundancy_data_base[pra])
            xl[self.redundancy_list_key] = redundancy_data_base[pra]

    def get_cross_link_string(self, xl):
        '''
        Return "|key:value|key:value|...|" for the cross-link xl, with the
        standard keys first (in the standard order) and then the others
        '''
        fields = []
        for k in self.ordered_key_list:
            if k in xl:
                fields.append(str(k) + ":" + str(xl[k]))
        for k in xl:
            if k not in self.ordered_key_list:
                fields.append(str(k) + ":" + str(xl[k]))

        return '|' + ''.join(field + "|" for field in fields)

    def get_short_cross_link_string(self, xl):
        '''
        Return "|value|value|...|" for a short selection of the standard
        keys of the cross-link xl; keys missing from xl are skipped
        '''
        list_of_keys = [self.data_set_name_key,
                        self.unique_sub_id_key,
                        self.protein1_key,
                        self.residue1_key,
                        self.protein2_key,
                        self.residue2_key,
                        self.state_key,
                        self.psi_key]

        return '|' + ''.join(str(xl[k]) + "|" for k in list_of_keys if k in xl)

    def filter(self, FilterOperator):
        '''
        Return a new CrossLinkDataBase with the cross-links for which
        FilterOperator.evaluate(xl) is true.

        The cross-link dictionaries are shared with this database, and
        building the new database updates their sub ids and redundancy.
        '''
        new_xl_dict = {}
        for xlid in self.data_base:
            selected = [xl for xl in self.data_base[xlid]
                        if FilterOperator.evaluate(xl)]
            if selected:
                new_xl_dict[xlid] = selected
        return CrossLinkDataBase(self.cldbkc, new_xl_dict)

    def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
        '''
        This function merges two cross-link datasets so that if two conflicting crosslinks have the same
        cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
        with different SubIDs

        Not implemented yet: this method currently does nothing.
        '''
        pass

    def append(self, CrossLinkDataBase2):
        '''
        This function append one cross-link dataset to another. Unique ids will be renamed to avoid
        conflicts: the database name is added as a suffix to every id.
        '''
        name1 = self.get_name()
        name2 = CrossLinkDataBase2.get_name()
        if name1 is None or name2 is None or name1 == name2:
            # Missing or identical names cannot tell the two sets apart;
            # fall back to the object ids.
            name1 = id(self)
            name2 = id(CrossLinkDataBase2)

        # rename the ids of both databases; str() because the fallback
        # names above are integers
        new_data_base = {}
        for k in self.data_base:
            new_data_base[str(k) + "." + str(name1)] = self.data_base[k]
        for k in CrossLinkDataBase2.data_base:
            new_data_base[str(k) + "." + str(name2)] = CrossLinkDataBase2.data_base[k]
        self.data_base = new_data_base
        self.__update__()

    def change_value(self, key, old_value, new_value):
        '''
        Not implemented yet: this method currently does nothing.
        '''
        pass

    def clone_protein(self, protein_name, new_protein_name):
        '''
        For every cross-link that involves protein_name, add copies in
        which protein_name is replaced by new_protein_name. A cross-link
        with protein_name on both sides gets three copies (first side,
        second side, both sides replaced). The original cross-links are
        kept.
        '''
        for xlid in list(self.data_base):
            new_data_base = []
            for xl in self.data_base[xlid]:
                new_data_base.append(xl)
                is_protein1 = xl[self.protein1_key] == protein_name
                is_protein2 = xl[self.protein2_key] == protein_name
                if is_protein1:
                    new_xl = dict(xl)
                    new_xl[self.protein1_key] = new_protein_name
                    new_data_base.append(new_xl)
                if is_protein2:
                    new_xl = dict(xl)
                    new_xl[self.protein2_key] = new_protein_name
                    new_data_base.append(new_xl)
                if is_protein1 and is_protein2:
                    new_xl = dict(xl)
                    new_xl[self.protein1_key] = new_protein_name
                    new_xl[self.protein2_key] = new_protein_name
                    new_data_base.append(new_xl)
            self.data_base[xlid] = new_data_base
        self.__update__()

    def jackknife(self, percentage):
        '''
        this method returns a CrossLinkDataBase class containing
        a random subsample of the original cross-link database.
        @param percentage float between 0 and 1, is the percentage of
               of spectra taken from the original list
        Raises ValueError if percentage is outside [0,1].
        '''
        import random
        if percentage > 1.0 or percentage < 0.0:
            raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
        nspectra = self.get_number_of_xlid()
        nrandom_spectra = int(nspectra * percentage)
        # random.sample needs a sequence; a dict keys view is rejected by
        # recent Python versions
        random_keys = random.sample(list(self.data_base), nrandom_spectra)
        new_data_base = {}
        for k in random_keys:
            new_data_base[k] = self.data_base[k]
        return CrossLinkDataBase(self.cldbkc, new_data_base)

    def __str__(self):
        '''
        List the cross-links grouped by unique id (sorted), one
        "--- key value" line per feature, standard keys first
        '''
        lines = []
        for xlid in sorted(self.data_base.keys()):
            lines.append(str(xlid) + "\n")
            for xl in self.data_base[xlid]:
                for k in self.ordered_key_list:
                    if k in xl:
                        lines.append("--- " + str(k) + " " + str(xl[k]) + "\n")
                for k in xl:
                    if k not in self.ordered_key_list:
                        lines.append("--- " + str(k) + " " + str(xl[k]) + "\n")
                lines.append("-------------\n")
        return ''.join(lines)
Python classes to represent, score, sample and analyze models.