1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
14 def set_json_default(obj):
15 if isinstance(obj, set):
22 if sys.version_info[0] >= 3:
23 def _handle_string_input(inp):
24 if not isinstance(inp, str):
25 raise TypeError(
"expecting a string")
28 def _handle_string_input(inp):
29 if not isinstance(inp, (str, unicode)):
30 raise TypeError(
"expecting a string or unicode")
32 if isinstance(inp, unicode):
37 class _CrossLinkDataBaseStandardKeys(object):
39 This class setup all the standard keys needed to
40 identify the crosslink features from the data sets
44 self.protein1_key=
"Protein1"
45 self.type[self.protein1_key]=str
46 self.protein2_key=
"Protein2"
47 self.type[self.protein2_key]=str
48 self.residue1_key=
"Residue1"
49 self.type[self.residue1_key]=int
50 self.residue2_key=
"Residue2"
51 self.type[self.residue2_key]=int
52 self.site_pairs_key=
"SitePairs"
53 self.type[self.site_pairs_key]=str
54 self.unique_id_key=
"XLUniqueID"
55 self.type[self.unique_id_key]=str
56 self.unique_sub_index_key=
"XLUniqueSubIndex"
57 self.type[self.unique_sub_index_key]=int
58 self.unique_sub_id_key=
"XLUniqueSubID"
59 self.type[self.unique_sub_id_key]=str
60 self.data_set_name_key=
"DataSetName"
61 self.type[self.data_set_name_key]=str
62 self.cross_linker_chemical_key=
"CrossLinkerChemical"
63 self.type[self.cross_linker_chemical_key]=str
64 self.id_score_key=
"IDScore"
65 self.type[self.id_score_key]=float
67 self.type[self.fdr_key]=float
68 self.quantitation_key=
"Quantitation"
69 self.type[self.quantitation_key]=float
70 self.redundancy_key=
"Redundancy"
71 self.type[self.redundancy_key]=int
72 self.redundancy_list_key=
"RedundancyList"
73 self.type[self.redundancy_key]=list
74 self.ambiguity_key=
"Ambiguity"
75 self.type[self.ambiguity_key]=int
76 self.state_key=
"State"
77 self.type[self.state_key]=int
78 self.sigma1_key=
"Sigma1"
79 self.type[self.sigma1_key]=str
80 self.sigma2_key=
"Sigma2"
81 self.type[self.sigma2_key]=str
83 self.type[self.psi_key]=str
84 self.distance_key=
"Distance"
85 self.type[self.distance_key]=float
86 self.min_ambiguous_distance_key=
"MinAmbiguousDistance"
87 self.type[self.distance_key]=float
89 self.ordered_key_list =[self.data_set_name_key,
91 self.unique_sub_index_key,
92 self.unique_sub_id_key,
97 self.cross_linker_chemical_key,
100 self.quantitation_key,
102 self.redundancy_list_key,
108 self.min_ambiguous_distance_key]
111 class _ProteinsResiduesArray(tuple):
113 This class is inherits from tuple, and it is a shorthand for a cross-link
114 (p1,p2,r1,r2) where p1 and p2 are protein1 and protein2, r1 and r2 are
115 residue1 and residue2.
118 def __new__(self,input_data):
120 @input_data can be a dict or a tuple
122 self.cldbsk=_CrossLinkDataBaseStandardKeys()
123 if type(input_data)
is dict:
124 p1=input_data[self.cldbsk.protein1_key]
125 p2=input_data[self.cldbsk.protein2_key]
126 r1=input_data[self.cldbsk.residue1_key]
127 r2=input_data[self.cldbsk.residue2_key]
129 elif type(input_data)
is tuple:
130 if len(input_data)>4:
131 raise TypeError(
"_ProteinsResiduesArray: must have only 4 elements")
132 p1 = _handle_string_input(input_data[0])
133 p2 = _handle_string_input(input_data[1])
136 if type(r1)
is not int:
137 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
138 if type(r2)
is not int:
139 raise TypeError(
"_ProteinsResiduesArray: residue2 must be a integer")
142 raise TypeError(
"_ProteinsResiduesArray: input must be a dict or tuple")
143 return tuple.__new__(_ProteinsResiduesArray, t)
145 def get_inverted(self):
147 Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
149 return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
152 outstr=self.cldbsk.protein1_key+
" "+str(self[0])
153 outstr+=
" "+self.cldbsk.protein2_key+
" "+str(self[1])
154 outstr+=
" "+self.cldbsk.residue1_key+
" "+str(self[2])
155 outstr+=
" "+self.cldbsk.residue2_key+
" "+str(self[3])
160 This class allows to create filter functions that can be passed to the CrossLinkDataBase
163 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
165 where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
167 A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
171 and it is used to filter the database
174 def __init__(self, argument1, operator, argument2):
176 (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
177 or (FilterOperator1,operator.or|and...,FilterOperator2)
179 we need to implement a NOT
181 if isinstance(argument1, FilterOperator):
182 self.operations = [argument1, operator, argument2]
185 self.values = (argument1, operator, argument2)
187 def __or__(self, FilterOperator2):
190 def __and__(self, FilterOperator2):
193 def evaluate(self, xl_item):
195 if len(self.operations) == 0:
196 keyword, operator, value = self.values
197 return operator(xl_item[keyword], value)
198 FilterOperator1, op, FilterOperator2 = self.operations
200 return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
203 def filter_factory(xl_):
205 class FilterOperator(object):
209 def __new__(self,key,value,oper=operator.eq):
210 return oper(self.xl[key],value)
212 return FilterOperator
217 This class is needed to convert the keywords from a generic database
223 @param list_parser an instance of ResiduePairListParser, if any is needed
226 self.backward_converter={}
227 _CrossLinkDataBaseStandardKeys.__init__(self)
228 self.rplp = list_parser
229 if self.rplp
is None:
231 self.compulsory_keys=set([self.protein1_key,
236 self.compulsory_keys=self.rplp.get_compulsory_keys()
237 self.setup_keys=set()
241 Is a function that check whether necessary keys are setup
244 if self.compulsory_keys & setup_keys != self.compulsory_keys:
245 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
249 Returns the keys that have been setup so far
251 return self.backward_converter.keys()
255 This sets up the standard compulsory keys as defined by
256 _CrossLinkDataBaseStandardKeys
258 for ck
in self.compulsory_keys:
259 self.converter[ck]=ck
260 self.backward_converter[ck]=ck
262 def set_unique_id_key(self,origin_key):
263 self.converter[origin_key]=self.unique_id_key
264 self.backward_converter[self.unique_id_key]=origin_key
266 def set_protein1_key(self,origin_key):
267 self.converter[origin_key]=self.protein1_key
268 self.backward_converter[self.protein1_key]=origin_key
270 def set_protein2_key(self,origin_key):
271 self.converter[origin_key]=self.protein2_key
272 self.backward_converter[self.protein2_key]=origin_key
274 def set_residue1_key(self,origin_key):
275 self.converter[origin_key]=self.residue1_key
276 self.backward_converter[self.residue1_key]=origin_key
278 def set_residue2_key(self,origin_key):
279 self.converter[origin_key]=self.residue2_key
280 self.backward_converter[self.residue2_key]=origin_key
282 def set_site_pairs_key(self,origin_key):
283 self.converter[origin_key]=self.site_pairs_key
284 self.backward_converter[self.site_pairs_key]=origin_key
286 def set_id_score_key(self,origin_key):
287 self.converter[origin_key]=self.id_score_key
288 self.backward_converter[self.id_score_key]=origin_key
290 def set_fdr_key(self,origin_key):
291 self.converter[origin_key]=self.fdr_key
292 self.backward_converter[self.fdr_key]=origin_key
294 def set_quantitation_key(self,origin_key):
295 self.converter[origin_key]=self.quantitation_key
296 self.backward_converter[self.quantitation_key]=origin_key
298 def set_psi_key(self,origin_key):
299 self.converter[origin_key]=self.psi_key
300 self.backward_converter[self.psi_key]=origin_key
304 Returns the dictionary that convert the old keywords to the new ones
307 return self.converter
311 Returns the dictionary that convert the new keywords to the old ones
314 return self.backward_converter
318 A class to handle different styles of site pairs parsers.
320 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
321 [Y3-;K4-] for dead-ends
322 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
323 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
328 def __init__(self,style):
330 _CrossLinkDataBaseStandardKeys.__init__(self)
331 if style
is "MSSTUDIO":
333 self.compulsory_keys= set([self.protein1_key,
335 self.site_pairs_key])
336 elif style
is "QUANTITATION":
338 self.compulsory_keys= set([self.site_pairs_key])
340 raise Error(
"ResiduePairListParser: unknown list parser style")
342 def get_compulsory_keys(self):
343 return self.compulsory_keys
347 This function returns a list of cross-linked residues and the corresponding list of
348 cross-linked chains. The latter list can be empty, if the style doesn't have the
349 corresponding information.
351 if self.style ==
"MSSTUDIO":
352 input_string=input_string.replace(
"[",
"")
353 input_string=input_string.replace(
"]",
"")
354 input_string_pairs=input_string.split(
";")
355 residue_pair_indexes=[]
356 chain_pair_indexes=[]
357 for s
in input_string_pairs:
358 m1=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
359 m2=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
362 residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
363 residue_pair_indexes.append((residue_index_1,residue_index_2))
366 residue_type_1,residue_index_1=m2.group(1,2)
368 return residue_pair_indexes,chain_pair_indexes
369 if self.style ==
"QUANTITATION":
370 input_strings=input_string.split(
":x:")
371 first_peptides=input_strings[0].split(
":|:")
372 second_peptides=input_strings[1].split(
":|:")
373 first_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in first_peptides]
374 second_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in second_peptides]
375 residue_pair_indexes=[]
376 chain_pair_indexes=[]
377 for fpi
in first_peptides_indentifiers:
378 for spi
in second_peptides_indentifiers:
383 residue_pair_indexes.append((residue1,residue2))
384 chain_pair_indexes.append((chain1,chain2))
385 return residue_pair_indexes,chain_pair_indexes
390 A class to handle different XL format with fixed format
391 currently support ProXL
393 def __init__(self,format):
395 _CrossLinkDataBaseStandardKeys.__init__(self)
396 if format
is "PROXL":
399 raise Error(
"FixedFormatParser: unknown list format name")
402 def get_data(self,input_string):
403 if self.format
is "PROXL":
404 tockens=input_string.split(
"\t")
406 if tockens[0]==
"SEARCH ID(S)":
409 xl[self.protein1_key]=tockens[2]
410 xl[self.protein2_key]=tockens[4]
411 xl[self.residue1_key]=int(tockens[3])
412 xl[self.residue2_key]=int(tockens[5])
415 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
418 this class handles a cross-link dataset and do filtering
419 operations, adding cross-links, merge datasets...
422 def __init__(self, converter=None, data_base=None):
425 @param converter an instance of CrossLinkDataBaseKeywordsConverter
426 @param data_base an instance of CrossLinkDataBase to build the new database on
428 if data_base
is None:
431 self.data_base=data_base
433 _CrossLinkDataBaseStandardKeys.__init__(self)
434 if converter
is not None:
435 self.cldbkc = converter
436 self.list_parser=self.cldbkc.rplp
437 self.converter = converter.get_converter()
441 self.list_parser=
None
442 self.converter =
None
448 Update the whole dataset after changes
450 self.update_cross_link_unique_sub_index()
451 self.update_cross_link_redundancy()
454 sorted_ids=sorted(self.data_base.keys())
456 for xl
in self.data_base[k]:
459 def xlid_iterator(self):
460 sorted_ids=sorted(self.data_base.keys())
461 for xlid
in sorted_ids:
464 def __getitem__(self,xlid):
465 return self.data_base[xlid]
468 return len([xl
for xl
in self])
473 def set_name(self,name):
475 for k
in self.data_base:
476 new_data_base[k+
"."+name]=self.data_base[k]
477 self.data_base=new_data_base
481 def get_number_of_xlid(self):
482 return len(self.data_base)
485 def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
487 if FixedFormatParser is not specified, the file is comma-separated-values
488 @param file_name a txt file to be parsed
489 @param converter an instance of CrossLinkDataBaseKeywordsConverter
490 @param FixedFormatParser a parser for a fixed format
492 if not FixedFormatParser:
493 xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
495 if converter
is not None:
496 self.cldbkc = converter
497 self.list_parser=self.cldbkc.rplp
498 self.converter = converter.get_converter()
501 if not self.list_parser:
505 for nxl,xl
in enumerate(xl_list):
508 if k
in self.converter:
509 new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
512 if self.unique_id_key
in self.cldbkc.get_setup_keys():
513 if new_xl[self.unique_id_key]
not in new_xl_dict:
514 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
516 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
518 if str(nxl)
not in new_xl_dict:
519 new_xl_dict[str(nxl)]=[new_xl]
521 new_xl_dict[str(nxl)].append(new_xl)
526 for nxl,entry
in enumerate(xl_list):
530 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
531 raise Error(
"CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
533 if k
in self.converter:
534 new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
538 residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
541 for n,p
in enumerate(residue_pair_list):
544 new_xl[k]=new_dict[k]
545 new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
546 new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
548 if len(chain_pair_list)==len(residue_pair_list):
549 new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
550 new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
552 if self.unique_id_key
in self.cldbkc.get_setup_keys():
553 if new_xl[self.unique_id_key]
not in new_xl_dict:
554 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
556 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
558 if str(nxl)
not in new_xl_dict:
559 new_xl[self.unique_id_key]=str(nxl+1)
560 new_xl_dict[str(nxl)]=[new_xl]
562 new_xl[self.unique_id_key]=str(nxl+1)
563 new_xl_dict[str(nxl)].append(new_xl)
568 if FixedFormatParser is defined
572 f=open(file_name,
"r")
575 xl=FixedFormatParser.get_data(line)
577 xl[self.unique_id_key]=str(nxl+1)
578 new_xl_dict[str(nxl)]=[xl]
582 self.data_base=new_xl_dict
586 def update_cross_link_unique_sub_index(self):
587 for k
in self.data_base:
588 for n,xl
in enumerate(self.data_base[k]):
589 xl[self.ambiguity_key]=len(self.data_base[k])
590 xl[self.unique_sub_index_key]=n+1
591 xl[self.unique_sub_id_key]=k+
"."+str(n+1)
593 def update_cross_link_redundancy(self):
594 redundancy_data_base={}
596 pra=_ProteinsResiduesArray(xl)
597 if pra
not in redundancy_data_base:
598 redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
599 redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
601 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
602 redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
604 pra=_ProteinsResiduesArray(xl)
605 xl[self.redundancy_key]=len(redundancy_data_base[pra])
606 xl[self.redundancy_list_key]=redundancy_data_base[pra]
608 def get_cross_link_string(self,xl):
610 for k
in self.ordered_key_list:
612 string+=str(k)+
":"+str(xl[k])+
"|"
617 if k
not in self.ordered_key_list:
618 string+=str(k)+
":"+str(xl[k])+
"|"
622 def get_short_cross_link_string(self,xl):
625 list_of_keys=[self.data_set_name_key,
626 self.unique_sub_id_key,
634 for k
in list_of_keys:
636 string+=str(xl[k])+
"|"
642 def filter(self,FilterOperator):
644 for id
in self.data_base.keys():
645 for xl
in self.data_base[id]:
646 if FilterOperator.evaluate(xl):
647 if id
not in new_xl_dict:
650 new_xl_dict[id].append(xl)
651 return CrossLinkDataBase(self.cldbkc,new_xl_dict)
654 def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
656 This function merges two cross-link datasets so that if two conflicting crosslinks have the same
657 cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
658 with different SubIDs
662 def append_database(self,CrossLinkDataBase2):
664 This function append one cross-link dataset to another. Unique ids will be renamed to avoid
667 name1=self.get_name()
668 name2=CrossLinkDataBase2.get_name()
671 name2=id(CrossLinkDataBase2)
675 for k
in self.data_base:
676 new_data_base[k]=self.data_base[k]
677 for k
in CrossLinkDataBase2.data_base:
678 new_data_base[k]=CrossLinkDataBase2.data_base[k]
679 self.data_base=new_data_base
682 def set_value(self,key,new_value,FilterOperator=None):
684 This function changes the value for a given key in the database
685 For instance one can change the name of a protein
686 @param key: the key in the database that must be changed
687 @param new_value: the new value of the key
688 @param FilterOperator: optional FilterOperator to change the value to
689 a subset of the database
691 example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
695 if FilterOperator
is not None:
696 if FilterOperator.evaluate(xl):
702 def get_values(self,key):
704 this function returns the list of values for a given key in the database
705 alphanumerically sorted
710 return sorted(list(values))
712 def offset_residue_index(self,protein_name,offset):
714 This function offset the residue indexes of a given protein by a specified value
715 @param protein_name: the protein name that need to be changed
716 @param offset: the offset value
720 if xl[self.protein1_key] == protein_name:
721 xl[self.residue1_key]=xl[self.residue1_key]+offset
722 if xl[self.protein2_key] == protein_name:
723 xl[self.residue2_key]=xl[self.residue2_key]+offset
726 def create_new_keyword(self,keyword,values_from_keyword=None):
728 This function creates a new keyword for the whole database and set the values from
729 and existing keyword (optional), otherwise the values are set to None
730 @param keyword the new keyword name:
731 @param values_from_keyword the keyword from which we are copying the values:
734 if values_from_keyword
is not None:
735 xl[keyword] = xl[values_from_keyword]
740 def rename_proteins(self,old_to_new_names_dictionary):
742 This function renames all proteins contained in the input dictionary
743 from the old names (keys) to the new name (values)
746 for old_name
in old_to_new_names_dictionary:
747 new_name=old_to_new_names_dictionary[old_name]
748 fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
749 self.set_value(self.protein1_key,new_name,fo2)
750 fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
751 self.set_value(self.protein2_key,new_name,fo2)
753 def clone_protein(self,protein_name,new_protein_name):
755 for id
in self.data_base.keys():
757 for xl
in self.data_base[id]:
758 new_data_base.append(xl)
759 if xl[self.protein1_key]==protein_name
and xl[self.protein2_key]!=protein_name:
761 new_xl[self.protein1_key]=new_protein_name
762 new_data_base.append(new_xl)
763 elif xl[self.protein1_key]!=protein_name
and xl[self.protein2_key]==protein_name:
765 new_xl[self.protein2_key]=new_protein_name
766 new_data_base.append(new_xl)
767 elif xl[self.protein1_key]==protein_name
and xl[self.protein2_key]==protein_name:
769 new_xl[self.protein1_key]=new_protein_name
770 new_data_base.append(new_xl)
772 new_xl[self.protein2_key]=new_protein_name
773 new_data_base.append(new_xl)
775 new_xl[self.protein1_key]=new_protein_name
776 new_xl[self.protein2_key]=new_protein_name
777 new_data_base.append(new_xl)
778 self.data_base[id]=new_data_base
781 def filter_out_same_residues(self):
783 This function remove cross-links applied to the same residue
784 (ie, same chain name and residue number)
787 for id
in self.data_base.keys():
789 for xl
in self.data_base[id]:
790 if xl[self.protein1_key]==xl[self.protein2_key]
and xl[self.residue1_key]==xl[self.residue2_key]:
793 new_data_base.append(xl)
794 self.data_base[id]=new_data_base
798 def jackknife(self,percentage):
800 this method returns a CrossLinkDataBase class containing
801 a random subsample of the original cross-link database.
802 @param percentage float between 0 and 1, is the percentage of
803 of spectra taken from the original list
806 if percentage > 1.0
or percentage < 0.0:
807 raise ValueError(
'the percentage of random cross-link spectra should be between 0 and 1')
808 nspectra=self.get_number_of_xlid()
809 nrandom_spectra=int(nspectra*percentage)
810 random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
812 for k
in random_keys:
813 new_data_base[k]=self.data_base[k]
814 return CrossLinkDataBase(self.cldbkc,new_data_base)
818 sorted_ids=sorted(self.data_base.keys())
820 for id
in sorted_ids:
822 for xl
in self.data_base[id]:
823 for k
in self.ordered_key_list:
825 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
830 if k
not in self.ordered_key_list:
831 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
832 outstr+=
"-------------\n"
836 def plot(self,filename,**kwargs):
838 matplotlib.use(
'Agg')
839 import matplotlib.pyplot
as plt
840 import matplotlib.colors
844 if kwargs[
"type"] ==
"scatter":
845 cmap=plt.get_cmap(
"rainbow")
846 norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
849 if "colorkey" in kwargs:
850 colorkey=kwargs[
"colorkey"]
851 if "sizekey" in kwargs:
852 sizekey=kwargs[
"sizekey"]
858 xs.append(float(xl[xkey]))
859 ys.append(float(xl[ykey]))
860 colors.append(float(xl[colorkey]))
862 print(
"Value error for cross-link %s" % (xl[self.unique_id_key]))
867 cindex=(color-min(colors))/(max(colors)-min(colors))
868 cs.append(cmap(cindex))
871 ax = fig.add_subplot(111)
872 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker=
"o")
875 plt.savefig(filename)
879 if kwargs[
"type"] ==
"histogram":
880 valuekey=kwargs[
"valuekey"]
881 reference_xline=kwargs[
"reference_xline"]
887 values_list.append(float(xl[valuekey]))
889 print(
"Value error for cross-link %s" % (xl[self.unique_id_key]))
892 filename, [values_list], valuename=valuename, bins=bins,
893 colors=
None, format=
"pdf",
894 reference_xline=
None, yplotrange=
None,
895 xplotrange=
None,normalized=
True,
898 def dump(self,json_filename):
900 with open(json_filename,
'w')
as fp:
901 json.dump(self.data_base, fp, sort_keys=
True, indent=2, default=set_json_default)
903 def load(self,json_filename):
905 with open(json_filename,
'r') as fp:
906 self.data_base = json.load(fp)
910 if sys.version_info[0] < 3:
912 for k,v
in xl.iteritems():
913 if type(k)
is unicode: k=str(k)
914 if type(v)
is unicode: v=str(v)
917 def save_csv(self,filename):
923 sorted_group_ids=sorted(self.data_base.keys())
924 for group
in sorted_group_ids:
926 for xl
in self.data_base[group]:
928 sorted_ids=sorted(xl.keys())
929 values=[xl[k]
for k
in sorted_ids]
930 group_block.append(values)
934 with open(filename,
'w')
as fp:
935 a = csv.writer(fp, delimiter=
',')
936 a.writerow(sorted_ids)
941 This class generates a CrossLinkDataBase from a given structure
943 def __init__(self,representation=None,
945 residue_types_1=[
"K"],
946 residue_types_2=[
"K"],
947 reactivity_range=[0,1],
953 cldbkc.set_protein1_key(
"Protein1")
954 cldbkc.set_protein2_key(
"Protein2")
955 cldbkc.set_residue1_key(
"Residue1")
956 cldbkc.set_residue2_key(
"Residue2")
957 self.cldb=CrossLinkDataBase(cldbkc)
958 if representation
is not None:
961 self.representation=representation
962 self.model=self.representation.m
963 elif system
is not None:
966 self.model=self.system.mdl
969 print(
"Argument error: please provide either a representation object or a IMP.Hierarchy")
971 self.residue_types_1=residue_types_1
972 self.residue_types_2=residue_types_2
974 self.indexes_dict1={}
975 self.indexes_dict2={}
976 self.protein_residue_dict={}
977 self.reactivity_dictionary={}
978 self.euclidean_interacting_pairs=
None
979 self.xwalk_interacting_pairs=
None
982 if self.mode==
"pmi1":
983 for protein
in self.representation.sequence_dict.keys():
985 seq=self.representation.sequence_dict[protein]
986 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
992 rexp=numpy.random.exponential(0.1)
993 prob=1.0-math.exp(-rexp)
994 self.reactivity_dictionary[(protein,r)]=prob
997 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
998 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1000 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1003 self.indexes_dict1[index]=(protein,r)
1004 self.protein_residue_dict[(protein,r)]=index
1006 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1009 self.indexes_dict2[index]=(protein,r)
1010 self.protein_residue_dict[(protein,r)]=index
1012 if self.mode==
"pmi2":
1013 for state
in self.system.get_states():
1014 for moleculename,molecules
in state.get_molecules().iteritems():
1015 for molecule
in molecules:
1017 seq=molecule.sequence
1018 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1024 rexp=numpy.random.exponential(0.1)
1025 prob=1.0-math.exp(-rexp)
1026 self.reactivity_dictionary[(molecule,r)]=prob
1028 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1029 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1032 ps=s.get_selected_particles()
1035 self.indexes_dict1[index]=(molecule,r)
1036 self.protein_residue_dict[(molecule,r)]=index
1039 ps=s.get_selected_particles()
1042 self.indexes_dict2[index]=(molecule,r)
1043 self.protein_residue_dict[(molecule,r)]=index
1047 def get_data_base(self,total_number_of_spectra,
1048 ambiguity_probability=0.1,
1051 max_delta_distance=10.0,
1052 xwalk_bin_path=
None,
1053 confidence_false=0.75,
1054 confidence_true=0.75):
1056 from random
import random,uniform
1060 self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1061 self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1062 self.cldb.data_base[str(number_of_spectra)]=[]
1063 self.sites_weighted=
None
1065 while number_of_spectra<total_number_of_spectra:
1066 if random() > ambiguity_probability
and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1068 number_of_spectra+=1
1069 self.cldb.data_base[str(number_of_spectra)]=[]
1071 if random() > noise:
1073 pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1076 pra,dist=self.get_random_residue_pair(
None,
None,
None)
1080 if self.mode==
"pmi1":
1081 new_xl[self.cldb.protein1_key]=pra[0]
1082 new_xl[self.cldb.protein2_key]=pra[1]
1083 elif self.mode==
"pmi2":
1084 new_xl[self.cldb.protein1_key]=pra[0].get_name()
1085 new_xl[self.cldb.protein2_key]=pra[1].get_name()
1086 new_xl[
"molecule_object1"]=pra[0]
1087 new_xl[
"molecule_object2"]=pra[1]
1088 new_xl[self.cldb.residue1_key]=pra[2]
1089 new_xl[self.cldb.residue2_key]=pra[3]
1090 new_xl[
"Noisy"]=noisy
1092 new_xl[
"Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1093 new_xl[
"Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1094 r1=new_xl[
"Reactivity_Residue1"]
1095 r2=new_xl[
"Reactivity_Residue2"]
1098 new_xl[
"Reactivity"]=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1101 new_xl[
"Score"]=np.random.beta(1.0,self.beta_false)
1104 new_xl[
"Score"]=1.0-np.random.beta(1.0,self.beta_true)
1105 new_xl[
"TargetDistance"]=dist
1106 new_xl[
"NoiseProbability"]=noise
1107 new_xl[
"AmbiguityProbability"]=ambiguity_probability
1109 (p1,p2)=
IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1110 self.protein_residue_dict[(pra[1],pra[3])]])
1115 new_xl[
"InterRigidBody"] =
False
1120 new_xl[
"InterRigidBody"] =
True
1122 new_xl[
"InterRigidBody"] =
None
1124 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1129 def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1132 from random
import choice,uniform
1133 if distance
is None:
1136 if self.mode==
"pmi1":
1137 protein1=choice(self.representation.sequence_dict.keys())
1138 protein2=choice(self.representation.sequence_dict.keys())
1139 seq1=self.representation.sequence_dict[protein1]
1140 seq2=self.representation.sequence_dict[protein2]
1141 residue1=choice([i
for i
in range(1,len(seq1)+1)
if seq1[i-1]
in self.residue_types_1])
1142 residue2=choice([i
for i
in range(1,len(seq2)+1)
if seq2[i-1]
in self.residue_types_2])
1143 h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
1144 h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
1146 if (protein1,residue1) != (protein2,residue2):
1148 elif self.mode==
"pmi2":
1149 (protein1,residue1)=choice(self.protein_residue_dict.keys())
1150 (protein2,residue2)=choice(self.protein_residue_dict.keys())
1151 index1=self.protein_residue_dict[(protein1,residue1)]
1152 index2=self.protein_residue_dict[(protein2,residue2)]
1154 if (protein1,residue1) != (protein2,residue2):
1158 if not xwalk_bin_path:
1160 gcpf.set_distance(distance+max_delta_distance)
1164 if not self.sites_weighted:
1165 self.sites_weighted=[]
1166 for key
in self.reactivity_dictionary:
1167 r=self.reactivity_dictionary[key]
1168 self.sites_weighted.append((key,r))
1170 first_site=self.weighted_choice(self.sites_weighted)
1172 if not self.euclidean_interacting_pairs:
1173 self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1174 self.indexes_dict1.keys(),
1175 self.indexes_dict2.keys())
1177 first_site_pairs = [pair
for pair
in self.euclidean_interacting_pairs
1178 if self.indexes_dict1[pair[0]] == first_site
or
1179 self.indexes_dict2[pair[1]] == first_site]
1180 if len(first_site_pairs)==0:
continue
1182 second_sites_weighted=[]
1183 for pair
in first_site_pairs:
1184 if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1185 if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1186 r=self.reactivity_dictionary[second_site]
1187 second_sites_weighted.append((second_site,r))
1188 second_site=self.weighted_choice(second_sites_weighted)
1190 interacting_pairs_weighted=[]
1191 for pair in self.euclidean_interacting_pairs:
1192 r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1193 r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1194 #combined reactivity 1-exp(-k12*Delta t),
1197 r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1198 interacting_pairs_weighted.append((pair,r12))
1199 #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1200 #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1201 #interacting_pairs_weighted.append((pair,weight1*weight2))
1204 pair=self.weighted_choice(interacting_pairs_weighted)
1205 protein1,residue1=self.indexes_dict1[pair[0]]
1206 protein2,residue2=self.indexes_dict2[pair[1]]
1207 particle_pair=IMP.get_particles(self.model,pair)
1208 particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1209 if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1211 elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1212 #allow some flexibility
1213 prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1214 if uniform(0.0,1.0)<prob: break
1216 protein1,residue1=first_site
1217 protein2,residue2=second_site
1218 print(
"CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1219 "First site",first_site,self.reactivity_dictionary[first_site],
1220 "Second site",second_site,self.reactivity_dictionary[second_site])
1221 particle_pair=
IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1224 if particle_distance<distance
and (protein1,residue1) != (protein2,residue2):
1226 elif particle_distance>=distance
and (protein1,residue1) != (protein2,residue2)
and max_delta_distance:
1228 prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1229 if uniform(0.0,1.0)<prob:
break
1234 if not self.xwalk_interacting_pairs:
1235 self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1236 interacting_pairs_weighted=[]
1238 for pair
in self.xwalk_interacting_pairs:
1243 weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1244 weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1245 interacting_pairs_weighted.append((pair,weight1*weight2))
1247 pair=self.weighted_choice(interacting_pairs_weighted)
1252 particle_distance=float(pair[4])
1256 return ((protein1,protein2,residue1,residue2)),particle_distance
1258 def get_xwalk_distances(self,xwalk_bin_path,distance):
1262 o.init_pdb(
"xwalk.pdb",self.representation.prot)
1263 o.write_pdb(
"xwalk.pdb")
1264 namechainiddict=o.dictchain[
"xwalk.pdb"]
1267 for key
in namechainiddict: chainiddict[namechainiddict[key]]=key
1268 xwalkout=os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+
' -bb').read()
1270 output_list_of_distance=[]
1271 for line
in xwalkout.split(
"\n")[0:-2]:
1272 tockens=line.split()
1275 distance=float(tockens[6])
1277 ss=second.split(
"-")
1280 protein1=chainiddict[chainid1]
1281 protein2=chainiddict[chainid2]
1284 output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1285 return output_list_of_distance
1288 def weighted_choice(self,choices):
1291 total = sum(w
for c, w
in choices)
1292 r = random.uniform(0, total)
1294 for c, w
in choices:
1298 assert False,
"Shouldn't get here"
1300 def save_rmf_snapshot(self,filename,color_id=None):
1303 if color_id
is None:
1304 color_id=
"Reactivity"
1306 sorted_group_ids=sorted(self.cldb.data_base.keys())
1309 for group
in sorted_group_ids:
1311 group_dists_particles=[]
1312 for xl
in self.cldb.data_base[group]:
1313 xllabel=self.cldb.get_short_cross_link_string(xl)
1314 (c1,c2,r1,r2)=(xl[
"molecule_object1"],xl[
"molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1316 index1=self.protein_residue_dict[(c1,r1)]
1317 index2=self.protein_residue_dict[(c2,r2)]
1319 mdist=xl[
"TargetDistance"]
1321 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1323 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1324 if group_dists_particles:
1325 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1326 color_scores.append(mincolor_score)
1327 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1334 linear.set_slope(1.0)
1338 offset=min(color_scores)
1339 maxvalue=max(color_scores)
1340 for pair
in list_of_pairs:
1342 pr.set_name(pair[2])
1343 factor=(pair[3]-offset)/(maxvalue-offset)
1346 rslin.add_restraint(pr)
1349 rh = RMF.create_rmf_file(filename)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
A class to handle different styles of site pairs parsers.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Select hierarchy particles identified by the biological name.
Support for the RMF file format for storing hierarchical molecular data and markup.