1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
12 class _CrossLinkDataBaseStandardKeys(object):
14 This class setup all the standard keys needed to
15 identify the crosslink features from the data sets
19 self.protein1_key=
"Protein1"
20 self.type[self.protein1_key]=str
21 self.protein2_key=
"Protein2"
22 self.type[self.protein2_key]=str
23 self.residue1_key=
"Residue1"
24 self.type[self.residue1_key]=int
25 self.residue2_key=
"Residue2"
26 self.type[self.residue2_key]=int
27 self.site_pairs_key=
"SitePairs"
28 self.type[self.site_pairs_key]=str
29 self.unique_id_key=
"XLUniqueID"
30 self.type[self.unique_id_key]=str
31 self.unique_sub_index_key=
"XLUniqueSubIndex"
32 self.type[self.unique_sub_index_key]=int
33 self.unique_sub_id_key=
"XLUniqueSubID"
34 self.type[self.unique_sub_id_key]=str
35 self.data_set_name_key=
"DataSetName"
36 self.type[self.data_set_name_key]=str
37 self.cross_linker_chemical_key=
"CrossLinkerChemical"
38 self.type[self.cross_linker_chemical_key]=str
39 self.id_score_key=
"IDScore"
40 self.type[self.id_score_key]=float
41 self.quantitation_key=
"Quantitation"
42 self.type[self.quantitation_key]=float
43 self.redundancy_key=
"Redundancy"
44 self.type[self.redundancy_key]=int
45 self.redundancy_list_key=
"RedundancyList"
46 self.type[self.redundancy_key]=list
47 self.state_key=
"State"
48 self.type[self.state_key]=int
49 self.sigma1_key=
"Sigma1"
50 self.type[self.sigma1_key]=str
51 self.sigma2_key=
"Sigma2"
52 self.type[self.sigma2_key]=str
54 self.type[self.psi_key]=str
56 self.ordered_key_list =[self.data_set_name_key,
58 self.unique_sub_index_key,
59 self.unique_sub_id_key,
64 self.cross_linker_chemical_key,
66 self.quantitation_key,
68 self.redundancy_list_key,
75 class _ProteinsResiduesArray(tuple):
77 This class is inherits from tuple, and it is a shorthand for a cross-link
78 (p1,p2,r1,r2) where p1 and p2 are protein1 and protein2, r1 and r2 are
79 residue1 and residue2.
82 def __new__(self,input_data):
84 @input_data can be a dict or a tuple
86 self.cldbsk=_CrossLinkDataBaseStandardKeys()
87 if type(input_data)
is dict:
88 p1=input_data[self.cldbsk.protein1_key]
89 p2=input_data[self.cldbsk.protein2_key]
90 r1=input_data[self.cldbsk.residue1_key]
91 r2=input_data[self.cldbsk.residue2_key]
93 elif type(input_data)
is tuple:
95 raise TypeError(
"_ProteinsResiduesArray: must have only 4 elements")
96 if type(input_data[0])
is not str:
97 raise TypeError(
"_ProteinsResiduesArray: input_data[0] must be a string")
98 if type(input_data[1])
is not str:
99 raise TypeError(
"_ProteinsResiduesArray: input_data[1] must be a string")
100 if type(input_data[2])
is not int:
101 raise TypeError(
"_ProteinsResiduesArray: input_data[2] must be a integer")
102 if type(input_data[3])
is not int:
103 raise TypeError(
"_ProteinsResiduesArray: input_data[3] must be a integer")
106 raise TypeError(
"_ProteinsResiduesArray: input must be a dict or tuple")
107 return tuple.__new__(_ProteinsResiduesArray, t)
109 def get_inverted(self):
111 Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
113 return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
116 outstr=self.cldbsk.protein1_key+
" "+str(self[0])
117 outstr+=
" "+self.cldbsk.protein2_key+
" "+str(self[1])
118 outstr+=
" "+self.cldbsk.residue1_key+
" "+str(self[2])
119 outstr+=
" "+self.cldbsk.residue2_key+
" "+str(self[3])
124 This class allows to create filter functions that can be passed to the CrossLinkDataBase
127 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
129 where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
131 A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
135 and it is used to filter the database
138 def __init__(self, argument1, operator, argument2):
140 (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
141 or (FilterOperator1,operator.or|and...,FilterOperator2)
143 we need to implement a NOT
145 if isinstance(argument1, FilterOperator):
146 self.operations = [argument1, operator, argument2]
149 self.values = (argument1, operator, argument2)
151 def __or__(self, FilterOperator2):
154 def __and__(self, FilterOperator2):
157 def evaluate(self, xl_item):
159 if len(self.operations) == 0:
160 keyword, operator, value = self.values
161 return operator(xl_item[keyword], value)
162 FilterOperator1, op, FilterOperator2 = self.operations
164 return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
167 def filter_factory(xl_):
169 class FilterOperator(object):
173 def __new__(self,key,value,oper=operator.eq):
174 return oper(self.xl[key],value)
176 return FilterOperator
181 This class is needed to convert the keywords from a generic database
187 @param list_parser an instance of ResiduePairListParser, if any is needed
190 self.backward_converter={}
191 _CrossLinkDataBaseStandardKeys.__init__(self)
192 self.rplp = list_parser
193 if self.rplp
is None:
195 self.compulsory_keys=set([self.protein1_key,
200 self.compulsory_keys=self.rplp.get_compulsory_keys()
201 self.setup_keys=set()
205 Is a function that check whether necessary keys are setup
208 if self.compulsory_keys & setup_keys != self.compulsory_keys:
209 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
213 Returns the keys that have been setup so far
215 return self.backward_converter.keys()
217 def set_unique_id_key(self,origin_key):
218 self.converter[origin_key]=self.unique_id_key
219 self.backward_converter[self.unique_id_key]=origin_key
221 def set_protein1_key(self,origin_key):
222 self.converter[origin_key]=self.protein1_key
223 self.backward_converter[self.protein1_key]=origin_key
225 def set_protein2_key(self,origin_key):
226 self.converter[origin_key]=self.protein2_key
227 self.backward_converter[self.protein2_key]=origin_key
229 def set_residue1_key(self,origin_key):
230 self.converter[origin_key]=self.residue1_key
231 self.backward_converter[self.residue1_key]=origin_key
233 def set_residue2_key(self,origin_key):
234 self.converter[origin_key]=self.residue2_key
235 self.backward_converter[self.residue2_key]=origin_key
237 def set_site_pairs_key(self,origin_key):
238 self.converter[origin_key]=self.site_pairs_key
239 self.backward_converter[self.site_pairs_key]=origin_key
241 def set_id_score_key(self,origin_key):
242 self.converter[origin_key]=self.id_score_key
243 self.backward_converter[self.id_score_key]=origin_key
245 def set_quantitation_key(self,origin_key):
246 self.converter[origin_key]=self.quantitation_key
247 self.backward_converter[self.quantitation_key]=origin_key
249 def set_psi_key(self,origin_key):
250 self.converter[origin_key]=self.psi_key
251 self.backward_converter[self.psi_key]=origin_key
255 Returns the dictionary that convert the old keywords to the new ones
258 return self.converter
262 Returns the dictionary that convert the new keywords to the old ones
265 return self.backward_converter
269 A class to handle different styles of site pairs parsers.
271 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
272 [Y3-;K4-] for dead-ends
273 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
278 def __init__(self,style):
280 _CrossLinkDataBaseStandardKeys.__init__(self)
281 if style
is "MSSTUDIO":
283 self.compulsory_keys= set([self.protein1_key,
285 self.site_pairs_key])
286 elif style
is "QUANTITATION":
288 self.compulsory_keys= set([self.site_pairs_key])
290 raise Error(
"ResiduePairListParser: unknown list parser style")
292 def get_compulsory_keys(self):
293 return self.compulsory_keys
297 This function returns a list of cross-linked residues and the corresponding list of
298 cross-linked chains. The latter list can be empty, if the style doesn't have the
299 corresponding information.
301 if self.style ==
"MSSTUDIO":
302 input_string=input_string.replace(
"[",
"")
303 input_string=input_string.replace(
"]",
"")
304 input_string_pairs=input_string.split(
";")
305 residue_pair_indexes=[]
306 chain_pair_indexes=[]
307 for s
in input_string_pairs:
308 m1=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
309 m2=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
312 residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
313 residue_pair_indexes.append((residue_index_1,residue_index_2))
316 residue_type_1,residue_index_1=m2.group(1,2)
318 return residue_pair_indexes,chain_pair_indexes
319 if self.style ==
"QUANTITATION":
320 input_string=input_string.replace(
"x:",
"")
321 input_string_tockens=input_string.split(
":")
322 residue_pair_indexes=[(input_string_tockens[1],input_string_tockens[3])]
323 chain_pair_indexes=[(input_string_tockens[0],input_string_tockens[2])]
324 return residue_pair_indexes,chain_pair_indexes
329 A class to handle different XL format with fixed format
330 currently support ProXL
332 def __init__(self,format):
334 _CrossLinkDataBaseStandardKeys.__init__(self)
335 if format
is "PROXL":
338 raise Error(
"FixedFormatParser: unknown list format name")
341 def get_data(self,input_string):
342 if self.format
is "PROXL":
343 tockens=input_string.split(
"\t")
345 if tockens[0]==
"SEARCH ID(S)":
348 xl[self.protein1_key]=tockens[2]
349 xl[self.protein2_key]=tockens[4]
350 xl[self.residue1_key]=int(tockens[3])
351 xl[self.residue2_key]=int(tockens[5])
354 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
357 this class handles a cross-link dataset and do filtering
358 operations, adding cross-links, merge datasets...
361 def __init__(self, converter=None, data_base=None):
364 @param converter an instance of CrossLinkDataBaseKeywordsConverter
365 @param data_base an instance of CrossLinkDataBase to build the new database on
367 if data_base
is None:
370 self.data_base=data_base
372 _CrossLinkDataBaseStandardKeys.__init__(self)
373 if converter
is not None:
374 self.cldbkc = converter
375 self.list_parser=self.cldbkc.rplp
376 self.converter = converter.get_converter()
380 self.list_parser=
None
381 self.converter =
None
387 Update the whole dataset after changes
389 self.update_cross_link_unique_sub_index()
390 self.update_cross_link_redundancy()
393 sorted_ids=sorted(self.data_base.keys())
395 for xl
in self.data_base[k]:
398 def xlid_iterator(self):
399 sorted_ids=sorted(self.data_base.keys())
400 for xlid
in sorted_ids:
403 def __getitem__(self,xlid):
404 return self.data_base[xlid]
407 return len([xl
for xl
in self])
412 def set_name(self,name):
414 for k
in self.data_base:
415 new_data_base[k+
"."+name]=self.data_base[k]
416 self.data_base=new_data_base
420 def get_number_of_xlid(self):
421 return len(self.data_base)
424 def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
426 if FixedFormatParser is not specified, the file is comma-separated-values
427 @param file_name a txt file to be parsed
428 @param converter an instance of CrossLinkDataBaseKeywordsConverter
429 @param FixedFormatParser a parser for a fixed format
431 if not FixedFormatParser:
432 xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
434 if converter
is not None:
435 self.cldbkc = converter
436 self.list_parser=self.cldbkc.rplp
437 self.converter = converter.get_converter()
440 if not self.list_parser:
444 for nxl,xl
in enumerate(xl_list):
447 if k
in self.converter:
448 new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
451 if self.unique_id_key
in self.cldbkc.get_setup_keys():
452 if new_xl[self.unique_id_key]
not in new_xl_dict:
453 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
455 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
457 if str(nxl)
not in new_xl_dict:
458 new_xl_dict[str(nxl)]=[new_xl]
460 new_xl_dict[str(nxl)].append(new_xl)
465 for nxl,entry
in enumerate(xl_list):
469 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
470 raise Error(
"CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
472 if k
in self.converter:
473 new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
477 residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
480 for n,p
in enumerate(residue_pair_list):
483 new_xl[k]=new_dict[k]
484 new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
485 new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
487 if len(chain_pair_list)==len(residue_pair_list):
488 new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
489 new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
491 if self.unique_id_key
in self.cldbkc.get_setup_keys():
492 if new_xl[self.unique_id_key]
not in new_xl_dict:
493 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
495 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
497 if str(nxl)
not in new_xl_dict:
498 new_xl[self.unique_id_key]=str(nxl+1)
499 new_xl_dict[str(nxl)]=[new_xl]
501 new_xl[self.unique_id_key]=str(nxl+1)
502 new_xl_dict[str(nxl)].append(new_xl)
507 if FixedFormatParser is defined
511 f=open(file_name,
"r")
514 xl=FixedFormatParser.get_data(line)
516 xl[self.unique_id_key]=str(nxl+1)
517 new_xl_dict[str(nxl)]=[xl]
521 self.data_base=new_xl_dict
525 def update_cross_link_unique_sub_index(self):
526 for k
in self.data_base:
527 for n,xl
in enumerate(self.data_base[k]):
528 xl[self.unique_sub_index_key]=n+1
529 xl[self.unique_sub_id_key]=k+
"."+str(n+1)
531 def update_cross_link_redundancy(self):
532 redundancy_data_base={}
534 pra=_ProteinsResiduesArray(xl)
535 if pra
not in redundancy_data_base:
536 redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
537 redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
539 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
540 redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
542 pra=_ProteinsResiduesArray(xl)
543 xl[self.redundancy_key]=len(redundancy_data_base[pra])
544 xl[self.redundancy_list_key]=redundancy_data_base[pra]
546 def get_cross_link_string(self,xl):
548 for k
in self.ordered_key_list:
550 string+=str(k)+
":"+str(xl[k])+
"|"
555 if k
not in self.ordered_key_list:
556 string+=str(k)+
":"+str(xl[k])+
"|"
560 def get_short_cross_link_string(self,xl):
563 list_of_keys=[self.data_set_name_key,
564 self.unique_sub_id_key,
572 for k
in list_of_keys:
574 string+=str(xl[k])+
"|"
580 def filter(self,FilterOperator):
582 for id
in self.data_base.keys():
583 for xl
in self.data_base[id]:
584 if FilterOperator.evaluate(xl):
585 if id
not in new_xl_dict:
588 new_xl_dict[id].append(xl)
589 return CrossLinkDataBase(self.cldbkc,new_xl_dict)
592 def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
594 This function merges two cross-link datasets so that if two conflicting crosslinks have the same
595 cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
596 with different SubIDs
600 def append_database(self,CrossLinkDataBase2):
602 This function append one cross-link dataset to another. Unique ids will be renamed to avoid
605 name1=self.get_name()
606 name2=CrossLinkDataBase2.get_name()
609 name2=id(CrossLinkDataBase2)
613 for k
in self.data_base:
614 new_data_base[k]=self.data_base[k]
615 for k
in CrossLinkDataBase2.data_base:
616 new_data_base[k]=CrossLinkDataBase2.data_base[k]
617 self.data_base=new_data_base
620 def set_value(self,key,new_value,FilterOperator=None):
622 This function changes the value for a given key in the database
623 For instance one can change the name of a protein
624 @param key: the key in the database that must be changed
625 @param new_value: the new value of the key
626 @param FilterOperator: optional FilterOperator to change the value to
627 a subset of the database
629 example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
633 if FilterOperator
is not None:
634 if FilterOperator.evaluate(xl):
640 def get_values(self,key):
642 this function returns the list of values for a given key in the database
643 alphanumerically sorted
648 return sorted(list(values))
650 def offset_residue_index(self,protein_name,offset):
652 This function offset the residue indexes of a given protein by a specified value
653 @param protein_name: the protein name that need to be changed
654 @param offset: the offset value
658 if xl[self.protein1_key] == protein_name:
659 xl[self.residue1_key]=xl[self.residue1_key]+offset
660 if xl[self.protein2_key] == protein_name:
661 xl[self.residue2_key]=xl[self.residue2_key]+offset
664 def create_new_keyword(self,keyword,values_from_keyword=None):
666 This function creates a new keyword for the whole database and set the values from
667 and existing keyword (optional), otherwise the values are set to None
668 @param keyword the new keyword name:
669 @param values_from_keyword the keyword from which we are copying the values:
672 if values_from_keyword
is not None:
673 xl[keyword] = xl[values_from_keyword]
678 def clone_protein(self,protein_name,new_protein_name):
680 for id
in self.data_base.keys():
682 for xl
in self.data_base[id]:
683 new_data_base.append(xl)
684 if xl[self.protein1_key]==protein_name
and xl[self.protein2_key]!=protein_name:
686 new_xl[self.protein1_key]=new_protein_name
687 new_data_base.append(new_xl)
688 elif xl[self.protein1_key]!=protein_name
and xl[self.protein2_key]==protein_name:
690 new_xl[self.protein2_key]=new_protein_name
691 new_data_base.append(new_xl)
692 elif xl[self.protein1_key]==protein_name
and xl[self.protein2_key]==protein_name:
694 new_xl[self.protein1_key]=new_protein_name
695 new_data_base.append(new_xl)
697 new_xl[self.protein2_key]=new_protein_name
698 new_data_base.append(new_xl)
700 new_xl[self.protein1_key]=new_protein_name
701 new_xl[self.protein2_key]=new_protein_name
702 new_data_base.append(new_xl)
703 self.data_base[id]=new_data_base
706 def filter_out_same_residues(self):
708 This function remove cross-links applied to the same residue
709 (ie, same chain name and residue number)
712 for id
in self.data_base.keys():
714 for xl
in self.data_base[id]:
715 if xl[self.protein1_key]==xl[self.protein2_key]
and xl[self.residue1_key]==xl[self.residue2_key]:
718 new_data_base.append(xl)
719 self.data_base[id]=new_data_base
723 def jackknife(self,percentage):
725 this method returns a CrossLinkDataBase class containing
726 a random subsample of the original cross-link database.
727 @param percentage float between 0 and 1, is the percentage of
728 of spectra taken from the original list
731 if percentage > 1.0
or percentage < 0.0:
732 raise ValueError(
'the percentage of random cross-link spectra should be between 0 and 1')
733 nspectra=self.get_number_of_xlid()
734 nrandom_spectra=int(nspectra*percentage)
735 random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
737 for k
in random_keys:
738 new_data_base[k]=self.data_base[k]
739 return CrossLinkDataBase(self.cldbkc,new_data_base)
743 sorted_ids=sorted(self.data_base.keys())
745 for id
in sorted_ids:
747 for xl
in self.data_base[id]:
748 for k
in self.ordered_key_list:
750 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
755 if k
not in self.ordered_key_list:
756 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
757 outstr+=
"-------------\n"
760 def dump(self,json_filename):
762 with open(json_filename,
'w')
as fp:
763 json.dump(self.data_base, fp, sort_keys=
True, indent=2)
765 def load(self,json_filename):
767 with open(json_filename,
'r') as fp:
768 self.data_base = json.load(fp)
771 def save_csv(self,filename):
777 sorted_group_ids=sorted(self.data_base.keys())
778 for group
in sorted_group_ids:
780 for xl
in self.data_base[group]:
782 sorted_ids=sorted(xl.keys())
783 values=[xl[k]
for k
in sorted_ids]
784 group_block.append(values)
787 with open(filename,
'w')
as fp:
788 a = csv.writer(fp, delimiter=
',')
793 This class generates a CrossLinkDataBase from a given structure
795 def __init__(self,representation,
796 residue_types_1=[
"K"],
797 residue_types_2=[
"K"],
798 reactivity_range=[0,3],
802 cldbkc.set_protein1_key(
"Protein1")
803 cldbkc.set_protein2_key(
"Protein2")
804 cldbkc.set_residue1_key(
"Residue1")
805 cldbkc.set_residue2_key(
"Residue2")
806 self.cldb=CrossLinkDataBase(cldbkc)
807 self.representation=representation
808 self.residue_types_1=residue_types_1
809 self.residue_types_2=residue_types_2
811 self.indexes_dict1={}
812 self.indexes_dict2={}
813 self.protein_residue_dict={}
814 self.reactivity_dictionary={}
815 self.euclidean_interacting_pairs=
None
816 self.xwalk_interacting_pairs=
None
819 for protein
in self.representation.sequence_dict.keys():
821 seq=self.representation.sequence_dict[protein]
822 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
823 for i
in range(1,len(seq)+1):
824 if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2)):
825 print(i, protein, seq[i-1])
828 self.reactivity_dictionary[(protein,r)]=random.uniform(reactivity_range[0],reactivity_range[1])
830 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
831 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
833 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
836 self.indexes_dict1[index]=(protein,r)
837 self.protein_residue_dict[(protein,r)]=index
839 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
842 self.indexes_dict2[index]=(protein,r)
843 self.protein_residue_dict[(protein,r)]=index
846 def get_data_base(self,total_number_of_spectra,
847 ambiguity_probability=0.1,
850 xwalk_bin_path=
None):
852 from random
import random
855 self.cldb.data_base[str(number_of_spectra)]=[]
856 while number_of_spectra<total_number_of_spectra:
857 if random() > ambiguity_probability
and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
860 self.cldb.data_base[str(number_of_spectra)]=[]
864 pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path)
868 pra,dist=self.get_random_residue_pair(
None,
None)
872 new_xl[self.cldb.protein1_key]=pra[0]
873 new_xl[self.cldb.protein2_key]=pra[1]
874 new_xl[self.cldb.residue1_key]=pra[2]
875 new_xl[self.cldb.residue2_key]=pra[3]
876 new_xl[
"Noisy"]=noisy
877 new_xl[
"Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
878 new_xl[
"Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
879 new_xl[
"TargetDistance"]=dist
880 new_xl[
"NoiseProbability"]=noise
881 new_xl[
"AmbiguityProbability"]=ambiguity_probability
882 new_xl[
"TargetDistance"]=dist
884 (p1,p2)=IMP.kernel.get_particles(self.representation.m,[self.protein_residue_dict[(pra[0],pra[2])],
885 self.protein_residue_dict[(pra[1],pra[3])]])
890 new_xl[
"InterRigidBody"] =
False
895 new_xl[
"InterRigidBody"] =
True
897 new_xl[
"InterRigidBody"] =
None
899 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
904 def get_random_residue_pair(self,distance=None,xwalk_bin_path=None):
907 from random
import choice
911 protein1=choice(self.representation.sequence_dict.keys())
912 protein2=choice(self.representation.sequence_dict.keys())
913 seq1=self.representation.sequence_dict[protein1]
914 seq2=self.representation.sequence_dict[protein2]
915 residue1=choice([i
for i
in range(1,len(seq1)+1)
if seq1[i-1]
in self.residue_types_1])
916 residue2=choice([i
for i
in range(1,len(seq2)+1)
if seq2[i-1]
in self.residue_types_2])
917 h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
918 h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
920 if (protein1,residue1) != (protein2,residue2):
924 if not xwalk_bin_path:
926 gcpf.set_distance(distance)
927 if not self.euclidean_interacting_pairs:
928 self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.representation.m,
929 self.indexes_dict1.keys(),
930 self.indexes_dict2.keys())
931 interacting_pairs_weighted=[]
932 for pair
in self.euclidean_interacting_pairs:
933 weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
934 weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
935 interacting_pairs_weighted.append((pair,weight1*weight2))
938 pair=self.weighted_choice(interacting_pairs_weighted)
939 protein1,residue1=self.indexes_dict1[pair[0]]
940 protein2,residue2=self.indexes_dict2[pair[1]]
941 particle_pair=IMP.kernel.get_particles(self.representation.m,pair)
943 if particle_distance<distance
and (protein1,residue1) != (protein2,residue2):
947 if not self.xwalk_interacting_pairs:
948 self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
949 interacting_pairs_weighted=[]
951 print(self.reactivity_dictionary)
953 for pair
in self.xwalk_interacting_pairs:
958 weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
959 weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
960 interacting_pairs_weighted.append((pair,weight1*weight2))
962 pair=self.weighted_choice(interacting_pairs_weighted)
967 particle_distance=float(pair[4])
971 return _ProteinsResiduesArray((protein1,protein2,residue1,residue2)),particle_distance
973 def get_xwalk_distances(self,xwalk_bin_path,distance):
977 o.init_pdb(
"xwalk.pdb",self.representation.prot)
978 o.write_pdb(
"xwalk.pdb")
979 namechainiddict=o.dictchain[
"xwalk.pdb"]
982 for key
in namechainiddict: chainiddict[namechainiddict[key]]=key
983 xwalkout=os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+
' -bb').read()
985 output_list_of_distance=[]
986 for line
in xwalkout.split(
"\n")[0:-2]:
990 distance=float(tockens[6])
995 protein1=chainiddict[chainid1]
996 protein2=chainiddict[chainid2]
999 print(protein1,protein2,residue1,residue2,distance)
1000 output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1001 return output_list_of_distance
1004 def weighted_choice(self,choices):
1008 total = sum(w
for c, w
in choices)
1009 r = random.uniform(0, total)
1011 for c, w
in choices:
1015 assert False,
"Shouldn't get here"
This class generates a CrossLinkDataBase from a given structure.
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
This class is needed to convert the keywords from a generic database to the standard ones...
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
A decorator for a particle with x,y,z coordinates.
Class for easy writing of PDBs, RMFs, and stat files.
def check_keys
Is a function that check whether necessary keys are setup.
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
A class to handle different styles of site pairs parsers.
Python classes to represent, score, sample and analyze models.