1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
class _CrossLinkDataBaseStandardKeys(object):
    '''
    This class sets up all the standard keys needed to
    identify the crosslink features from the data sets.

    Every standard key is exposed as an attribute (for example
    ``protein1_key``) holding the key string, and ``self.type`` maps each
    key string to the Python type associated with its values.
    ``self.ordered_key_list`` lists the standard keys in a fixed order.
    '''

    def __init__(self):
        # key string -> type associated with the values stored under it
        self.type = {}
        self.protein1_key = "Protein1"
        self.type[self.protein1_key] = str
        self.protein2_key = "Protein2"
        self.type[self.protein2_key] = str
        self.residue1_key = "Residue1"
        self.type[self.residue1_key] = int
        self.residue2_key = "Residue2"
        self.type[self.residue2_key] = int
        self.unique_id_key = "XLUniqueID"
        self.type[self.unique_id_key] = str
        self.unique_sub_index_key = "XLUniqueSubIndex"
        self.type[self.unique_sub_index_key] = int
        self.unique_sub_id_key = "XLUniqueSubID"
        self.type[self.unique_sub_id_key] = str
        self.data_set_name_key = "DataSetName"
        self.type[self.data_set_name_key] = str
        self.cross_linker_chemical_key = "CrossLinkerChemical"
        self.type[self.cross_linker_chemical_key] = str
        self.id_score_key = "IDScore"
        self.type[self.id_score_key] = float
        self.quantitation_key = "Quantitation"
        self.type[self.quantitation_key] = float
        self.redundancy_key = "Redundancy"
        self.type[self.redundancy_key] = int
        self.redundancy_list_key = "RedundancyList"
        # The list type belongs to the redundancy *list* key; registering it
        # under redundancy_key would overwrite the int type set just above.
        self.type[self.redundancy_list_key] = list
        self.state_key = "State"
        self.type[self.state_key] = int
        self.sigma1_key = "Sigma1"
        self.type[self.sigma1_key] = float
        self.sigma2_key = "Sigma2"
        self.type[self.sigma2_key] = float
        # NOTE(review): the psi_key assignment is not visible in the reviewed
        # extract; "Psi" follows the naming pattern of the other keys -- confirm.
        self.psi_key = "Psi"
        self.type[self.psi_key] = float

        # NOTE(review): only some entries of this list are visible in the
        # reviewed extract; the remaining ones are filled in following the
        # order in which the keys are defined above -- confirm.
        self.ordered_key_list = [self.data_set_name_key,
                                 self.unique_id_key,
                                 self.unique_sub_index_key,
                                 self.unique_sub_id_key,
                                 self.protein1_key,
                                 self.protein2_key,
                                 self.residue1_key,
                                 self.residue2_key,
                                 self.cross_linker_chemical_key,
                                 self.id_score_key,
                                 self.quantitation_key,
                                 self.redundancy_key,
                                 self.redundancy_list_key,
                                 self.state_key,
                                 self.sigma1_key,
                                 self.sigma2_key,
                                 self.psi_key]
class _ProteinsResiduesArray(tuple):
    '''
    This class inherits from tuple, and it is a shorthand for a cross-link
    (p1,p2,r1,r2) where p1 and p2 are protein1 and protein2, r1 and r2 are
    residue1 and residue2.
    '''

    # Shared _CrossLinkDataBaseStandardKeys instance, created on first use.
    # It is needed to read dict input and to build the string form, so it
    # must be available no matter how the instances were constructed.
    cldbsk = None

    @classmethod
    def _get_standard_keys(cls):
        '''Return the shared standard-keys object, creating it if needed.'''
        if _ProteinsResiduesArray.cldbsk is None:
            _ProteinsResiduesArray.cldbsk = _CrossLinkDataBaseStandardKeys()
        return _ProteinsResiduesArray.cldbsk

    def __new__(cls, input_data):
        '''
        @param input_data can be a dict or a tuple. A dict is read through
               the standard protein/residue keys; a tuple must hold exactly
               (str, str, int, int).
        @exception TypeError if input_data is neither a dict nor a tuple,
                   or if a tuple has the wrong length or element types
        '''
        if isinstance(input_data, dict):
            keys = cls._get_standard_keys()
            t = (input_data[keys.protein1_key],
                 input_data[keys.protein2_key],
                 input_data[keys.residue1_key],
                 input_data[keys.residue2_key])
        elif isinstance(input_data, tuple):
            if len(input_data) != 4:
                raise TypeError(
                    "_ProteinsResiduesArray: must have only 4 elements")
            expected = ((str, "string"), (str, "string"),
                        (int, "integer"), (int, "integer"))
            for position, (kind, label) in enumerate(expected):
                # Exact type match on purpose: subclasses such as bool are
                # rejected, as in the original checks.
                if type(input_data[position]) is not kind:
                    raise TypeError(
                        "_ProteinsResiduesArray: input_data[%d] must be a %s"
                        % (position, label))
            t = input_data
        else:
            raise TypeError(
                "_ProteinsResiduesArray: input must be a dict or tuple")
        return tuple.__new__(_ProteinsResiduesArray, t)

    def get_inverted(self):
        '''
        Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
        '''
        return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))

    # NOTE(review): the header of this method is not visible in the reviewed
    # extract; it builds and (presumably) returns a string, so __str__ is
    # assumed -- confirm.
    def __str__(self):
        '''Return "<key> <value>" pairs for the two proteins and residues.'''
        keys = self._get_standard_keys()
        labels = (keys.protein1_key, keys.protein2_key,
                  keys.residue1_key, keys.residue2_key)
        return " ".join(label + " " + str(value)
                        for label, value in zip(labels, self))
122 This class allows creating filter functions that can be passed to the CrossLinkDataBase
125 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
127 where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
129 A filter operator can be evaluated on a CrossLinkDataBase item xl and returns a boolean
133 and it is used to filter the database
136 def __init__(self, argument1, operator, argument2):
138 (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
139 or (FilterOperator1,operator.or|and...,FilterOperator2)
141 we need to implement a NOT
143 if isinstance(argument1, FilterOperator):
144 self.operations = [argument1, operator, argument2]
147 self.values = (argument1, operator, argument2)
149 def __or__(self, FilterOperator2):
152 def __and__(self, FilterOperator2):
def evaluate(self, xl_item):
    '''
    Evaluate this filter on one cross-link item.

    A filter without stored operations is a leaf: its (keyword, operator,
    value) triple is applied to xl_item[keyword]. Otherwise both stored
    sub-filters are evaluated on xl_item and their results are combined
    with the stored operator.

    @param xl_item the cross-link item, indexed by keyword
    @return the result of the comparison or of the combination
    '''
    if self.operations:
        left, combine, right = self.operations
        return combine(left.evaluate(xl_item), right.evaluate(xl_item))
    keyword, compare, value = self.values
    return compare(xl_item[keyword], value)
def filter_factory(xl_):
    '''
    Build a FilterOperator class bound to the cross-link item xl_.

    Calling the returned class as FilterOperator(key, value, oper) does not
    create an instance: it directly returns oper(xl_[key], value), with
    oper defaulting to operator.eq.
    '''
    import operator

    class FilterOperator(object):
        # NOTE(review): this binding is not visible in the reviewed extract;
        # it is inferred from the use of self.xl in __new__ -- confirm.
        xl = xl_

        def __new__(cls, key, value, oper=operator.eq):
            return oper(cls.xl[key], value)

    return FilterOperator
179 This class is needed to convert the keywords from a generic database
185 self.backward_converter={}
186 _CrossLinkDataBaseStandardKeys.__init__(self)
187 self.compulsory_keys=set([self.protein1_key,
191 self.setup_keys=set()
195 Checks whether the necessary keys are set up
198 if self.compulsory_keys & setup_keys != self.compulsory_keys:
199 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
203 Returns the keys that have been set up so far
205 return self.backward_converter.keys()
def set_unique_id_key(self,origin_key):
    '''
    Register origin_key as the original keyword for the unique id.

    The pair is stored in both directions: original -> standard keyword in
    self.converter and standard -> original keyword in
    self.backward_converter.
    '''
    standard_key = self.unique_id_key
    self.converter[origin_key] = standard_key
    self.backward_converter[standard_key] = origin_key
def set_protein1_key(self,origin_key):
    '''
    Register origin_key as the original keyword for protein1.

    The pair is stored in both directions: original -> standard keyword in
    self.converter and standard -> original keyword in
    self.backward_converter.
    '''
    standard_key = self.protein1_key
    self.converter[origin_key] = standard_key
    self.backward_converter[standard_key] = origin_key
def set_protein2_key(self,origin_key):
    '''
    Register origin_key as the original keyword for protein2.

    The pair is stored in both directions: original -> standard keyword in
    self.converter and standard -> original keyword in
    self.backward_converter.
    '''
    standard_key = self.protein2_key
    self.converter[origin_key] = standard_key
    self.backward_converter[standard_key] = origin_key
def set_residue1_key(self,origin_key):
    '''
    Register origin_key as the original keyword for residue1.

    The pair is stored in both directions: original -> standard keyword in
    self.converter and standard -> original keyword in
    self.backward_converter.
    '''
    standard_key = self.residue1_key
    self.converter[origin_key] = standard_key
    self.backward_converter[standard_key] = origin_key
def set_residue2_key(self,origin_key):
    '''
    Register origin_key as the original keyword for residue2.

    The pair is stored in both directions: original -> standard keyword in
    self.converter and standard -> original keyword in
    self.backward_converter.
    '''
    standard_key = self.residue2_key
    self.converter[origin_key] = standard_key
    self.backward_converter[standard_key] = origin_key
def set_idscore_key(self,origin_key):
    '''
    Register origin_key as the original keyword for the ID score.

    The pair is stored in both directions: original -> standard keyword in
    self.converter and standard -> original keyword in
    self.backward_converter.
    '''
    standard_key = self.id_score_key
    self.converter[origin_key] = standard_key
    self.backward_converter[standard_key] = origin_key
def set_quantification_key(self,origin_key):
    '''
    Register origin_key as the original keyword for the quantitation.

    The pair is stored in both directions: original -> standard keyword in
    self.converter and standard -> original keyword in
    self.backward_converter.
    '''
    standard_key = self.quantitation_key
    self.converter[origin_key] = standard_key
    self.backward_converter[standard_key] = origin_key
237 Returns the dictionary that converts the old keywords to the new ones
240 return self.converter
244 Returns the dictionary that converts the new keywords to the old ones
247 return self.backward_converter
249 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
252 This class handles a cross-link dataset and performs filtering
253 operations, addition of cross-links, merging of datasets...
256 def __init__(self,CrossLinkDataBaseKeywordsConverter,data_base=None):
258 To be constructed it needs a CrossLinkDataBaseKeywordsConverter instance first
261 if data_base
is None:
264 self.data_base=data_base
265 self.cldbkc=CrossLinkDataBaseKeywordsConverter
266 _CrossLinkDataBaseStandardKeys.__init__(self)
267 self.converter=CrossLinkDataBaseKeywordsConverter.get_converter()
def __update__(self):
    '''
    Update the whole dataset after changes.

    The unique sub-indexes are refreshed first, then the redundancy
    information.
    '''
    # Step 1: sub-index / sub-id of every cross-link.
    self.update_cross_link_unique_sub_index()
    # Step 2: redundancy count and list of every cross-link.
    self.update_cross_link_redundancy()
278 for k
in self.data_base:
279 for xl
in self.data_base[k]:
282 def xlid_iterator(self):
283 for xlid
in self.data_base.keys():
def __getitem__(self,xlid):
    '''Return the entry of self.data_base stored under the unique id xlid.'''
    entry = self.data_base[xlid]
    return entry
290 return len([xl
for xl
in self])
295 def set_name(self,name):
def get_number_of_xlid(self):
    '''Return the number of unique cross-link ids held in self.data_base.'''
    number_of_ids = len(self.data_base)
    return number_of_ids
302 def create_set_from_file(self,csv_file_name):
303 xl_list=IMP.pmi.tools.get_db_from_csv(csv_file_name)
305 for nxl,xl
in enumerate(xl_list):
308 if k
in self.converter:
309 new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
312 if self.unique_id_key
in self.cldbkc.get_setup_keys():
313 if new_xl[self.unique_id_key]
not in new_xl_dict:
314 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
316 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
318 new_xl_dict[str(nxl)]=[new_xl]
320 self.data_base=new_xl_dict
321 self.name=csv_file_name
def update_cross_link_unique_sub_index(self):
    '''
    Number the cross-links stored under each unique id.

    Every cross-link receives a 1-based sub-index and a sub-id of the form
    "<unique id>.<sub-index>", written in place into the cross-link.
    '''
    for xlid, cross_links in self.data_base.items():
        for position, xl in enumerate(cross_links, 1):
            xl[self.unique_sub_index_key] = position
            xl[self.unique_sub_id_key] = xlid + "." + str(position)
def update_cross_link_redundancy(self):
    '''
    Compute the redundancy of every cross-link.

    Cross-links are grouped by their (protein1, protein2, residue1,
    residue2) array; a cross-link and its inverted form (proteins and
    residues swapped) are each listed under both orientations. Every
    cross-link then receives the number of sub-ids in its group under
    self.redundancy_key and the list of those sub-ids under
    self.redundancy_list_key. Cross-links of one group share the same list
    object.
    '''
    redundancy_data_base = {}
    # NOTE(review): the loop headers are not visible in the reviewed extract;
    # iterating over self (one cross-link at a time) is assumed -- confirm.
    for xl in self:
        pra = _ProteinsResiduesArray(xl)
        inverted = pra.get_inverted()
        sub_id = xl[self.unique_sub_id_key]
        redundancy_data_base.setdefault(pra, []).append(sub_id)
        # A symmetric cross-link equals its inverted form: registering it a
        # second time would append the same sub-id twice to the same list
        # and inflate the redundancy count.
        if inverted != pra:
            redundancy_data_base.setdefault(inverted, []).append(sub_id)
    for xl in self:
        pra = _ProteinsResiduesArray(xl)
        xl[self.redundancy_key] = len(redundancy_data_base[pra])
        xl[self.redundancy_list_key] = redundancy_data_base[pra]
345 def get_cross_link_string(self,xl):
347 for k
in self.ordered_key_list:
349 string+=str(k)+
":"+str(xl[k])+
"|"
354 if k
not in self.ordered_key_list:
355 string+=str(k)+
":"+str(xl[k])+
"|"
359 def get_short_cross_link_string(self,xl):
362 list_of_keys=[self.data_set_name_key,
363 self.unique_sub_id_key,
371 for k
in list_of_keys:
373 string+=str(xl[k])+
"|"
def filter(self,FilterOperator):
    '''
    Return a new CrossLinkDataBase holding only the cross-links accepted
    by the filter.

    @param FilterOperator object whose evaluate(xl) result decides whether
           the cross-link xl is kept
    @return a CrossLinkDataBase built with the same keywords converter;
            kept cross-links stay grouped under their unique id, and ids
            without any kept cross-link are left out
    '''
    # NOTE(review): the initialisation of the dictionary and the branch for
    # the first cross-link of an id are not visible in the reviewed extract;
    # they are inferred from the visible append branch -- confirm.
    new_xl_dict = {}
    for xlid, cross_links in self.data_base.items():
        for xl in cross_links:
            if FilterOperator.evaluate(xl):
                new_xl_dict.setdefault(xlid, []).append(xl)
    return CrossLinkDataBase(self.cldbkc, new_xl_dict)
390 def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
392 This function merges two cross-link datasets so that if two conflicting crosslinks have the same
393 cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
394 with different SubIDs
398 def append(self,CrossLinkDataBase2):
400 This function appends one cross-link dataset to another. Unique ids will be renamed to avoid
403 name1=self.get_name()
404 name2=CrossLinkDataBase2.get_name()
407 name2=id(CrossLinkDataBase2)
411 for k
in self.data_base:
412 new_data_base[k+
"."+name1]=self.data_base[k]
413 for k
in CrossLinkDataBase2.data_base:
414 new_data_base[k+
"."+name2]=CrossLinkDataBase2.data_base[k]
415 self.data_base=new_data_base
418 def change_value(self,key,old_value,new_value):
421 def clone_protein(self,protein_name,new_protein_name):
423 for id
in self.data_base.keys():
425 for xl
in self.data_base[id]:
426 new_data_base.append(xl)
427 if xl[self.protein1_key]==protein_name
and xl[self.protein2_key]!=protein_name:
429 new_xl[self.protein1_key]=new_protein_name
430 new_data_base.append(new_xl)
431 elif xl[self.protein1_key]!=protein_name
and xl[self.protein2_key]==protein_name:
433 new_xl[self.protein2_key]=new_protein_name
434 new_data_base.append(new_xl)
435 elif xl[self.protein1_key]==protein_name
and xl[self.protein2_key]==protein_name:
437 new_xl[self.protein1_key]=new_protein_name
438 new_data_base.append(new_xl)
440 new_xl[self.protein2_key]=new_protein_name
441 new_data_base.append(new_xl)
443 new_xl[self.protein1_key]=new_protein_name
444 new_xl[self.protein2_key]=new_protein_name
445 new_data_base.append(new_xl)
446 self.data_base[id]=new_data_base
def jackknife(self,percentage):
    '''
    This method returns a CrossLinkDataBase containing
    a random subsample of the original cross-link database.

    @param percentage float between 0 and 1, the fraction of spectra
           (unique cross-link ids) taken from the original list
    @return a new CrossLinkDataBase built with the same keywords converter
            and int(number of ids * percentage) randomly chosen ids
    @exception ValueError if percentage is not between 0 and 1
    '''
    import random
    if percentage > 1.0 or percentage < 0.0:
        raise ValueError('the percentage of random cross-link spectra should be between 0 and 1')
    nspectra = self.get_number_of_xlid()
    nrandom_spectra = int(nspectra * percentage)
    # random.sample requires a sequence: a dict keys view is rejected with a
    # TypeError on Python >= 3.11, so the ids are copied into a list first.
    random_keys = random.sample(list(self.data_base), nrandom_spectra)
    new_data_base = {k: self.data_base[k] for k in random_keys}
    return CrossLinkDataBase(self.cldbkc, new_data_base)
470 sorted_ids=sorted(self.data_base.keys())
472 for id
in sorted_ids:
474 for xl
in self.data_base[id]:
475 for k
in self.ordered_key_list:
477 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
482 if k
not in self.ordered_key_list:
483 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
484 outstr+=
"-------------\n"
This class is needed to convert the keywords from a generic database to the standard ones...
def get_backward_converter
Returns the dictionary that converts the new keywords to the old ones.
This class allows creating filter functions that can be passed to the CrossLinkDataBase in this way:...
def get_setup_keys
Returns the keys that have been set up so far.
def get_converter
Returns the dictionary that converts the old keywords to the new ones.
def check_keys
Checks whether the necessary keys are set up.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
Python classes to represent, score, sample and analyze models.