1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
7 from __future__
import print_function
23 from collections
import defaultdict
27 def set_json_default(obj):
28 if isinstance(obj, set):
35 if sys.version_info[0] >= 3:
36 def _handle_string_input(inp):
37 if not isinstance(inp, str):
38 raise TypeError(
"expecting a string")
41 def _handle_string_input(inp):
42 if not isinstance(inp, (str, unicode)):
43 raise TypeError(
"expecting a string or unicode")
45 if isinstance(inp, unicode):
50 class _CrossLinkDataBaseStandardKeys(object):
52 This class setup all the standard keys needed to
53 identify the cross-link features from the data sets
57 self.protein1_key=
"Protein1"
58 self.type[self.protein1_key]=str
59 self.protein2_key=
"Protein2"
60 self.type[self.protein2_key]=str
61 self.residue1_key=
"Residue1"
62 self.type[self.residue1_key]=int
63 self.residue2_key=
"Residue2"
64 self.type[self.residue2_key]=int
65 self.residue1_amino_acid_key=
"Residue1AminoAcid"
66 self.type[self.residue1_amino_acid_key]=str
67 self.residue2_amino_acid_key=
"Residue2AminoAcid"
68 self.type[self.residue2_amino_acid_key]=str
69 self.residue1_moiety_key=
"Residue1Moiety"
70 self.type[self.residue1_moiety_key]=str
71 self.residue2_moiety_key=
"Residue2Moiety"
72 self.type[self.residue2_moiety_key]=str
73 self.site_pairs_key=
"SitePairs"
74 self.type[self.site_pairs_key]=str
75 self.unique_id_key=
"XLUniqueID"
76 self.type[self.unique_id_key]=str
77 self.unique_sub_index_key=
"XLUniqueSubIndex"
78 self.type[self.unique_sub_index_key]=int
79 self.unique_sub_id_key=
"XLUniqueSubID"
80 self.type[self.unique_sub_id_key]=str
81 self.data_set_name_key=
"DataSetName"
82 self.type[self.data_set_name_key]=str
83 self.cross_linker_chemical_key=
"CrossLinkerChemical"
84 self.type[self.cross_linker_chemical_key]=str
85 self.id_score_key=
"IDScore"
86 self.type[self.id_score_key]=float
88 self.type[self.fdr_key]=float
89 self.quantitation_key=
"Quantitation"
90 self.type[self.quantitation_key]=float
91 self.redundancy_key=
"Redundancy"
92 self.type[self.redundancy_key]=int
93 self.redundancy_list_key=
"RedundancyList"
94 self.type[self.redundancy_key]=list
95 self.ambiguity_key=
"Ambiguity"
96 self.type[self.ambiguity_key]=int
97 self.residue1_links_number_key=
"Residue1LinksNumber"
98 self.type[self.residue1_links_number_key]=int
99 self.residue2_links_number_key=
"Residue2LinksNumber"
100 self.type[self.residue2_links_number_key]=int
101 self.type[self.ambiguity_key]=int
102 self.state_key=
"State"
103 self.type[self.state_key]=int
104 self.sigma1_key=
"Sigma1"
105 self.type[self.sigma1_key]=str
106 self.sigma2_key=
"Sigma2"
107 self.type[self.sigma2_key]=str
109 self.type[self.psi_key]=str
110 self.distance_key=
"Distance"
111 self.type[self.distance_key]=float
112 self.min_ambiguous_distance_key=
"MinAmbiguousDistance"
113 self.type[self.distance_key]=float
115 self.link_type_key=
"LinkType"
116 self.type[self.link_type_key]=str
118 self.ordered_key_list =[self.data_set_name_key,
120 self.unique_sub_index_key,
121 self.unique_sub_id_key,
126 self.residue1_amino_acid_key,
127 self.residue2_amino_acid_key,
128 self.residue1_moiety_key,
129 self.residue2_moiety_key,
130 self.cross_linker_chemical_key,
133 self.quantitation_key,
135 self.redundancy_list_key,
141 self.min_ambiguous_distance_key,
145 class _ProteinsResiduesArray(tuple):
147 This class is inherits from tuple, and it is a shorthand for a cross-link
148 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
149 residue1 and residue2.
152 def __new__(self,input_data):
154 @input_data can be a dict or a tuple
156 self.cldbsk=_CrossLinkDataBaseStandardKeys()
157 if type(input_data)
is dict:
159 p1=input_data[self.cldbsk.protein1_key]
161 p2=input_data[self.cldbsk.protein2_key]
164 r1=input_data[self.cldbsk.residue1_key]
166 r2=input_data[self.cldbsk.residue2_key]
173 elif type(input_data)
is tuple:
174 if len(input_data)>4
or len(input_data)==3
or len(input_data)==1:
175 raise TypeError(
"_ProteinsResiduesArray: must have only 4 elements")
176 if len(input_data)==4:
177 p1 = _handle_string_input(input_data[0])
178 p2 = _handle_string_input(input_data[1])
181 if (
not (type(r1)
is int))
and (
not (r1
is None)):
182 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
183 if (
not (type(r2)
is int))
and (
not (r1
is None)):
184 raise TypeError(
"_ProteinsResiduesArray: residue2 must be a integer")
186 if len(input_data) == 2:
187 p1 = _handle_string_input(input_data[0])
189 if type(r1)
is not int:
190 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
193 raise TypeError(
"_ProteinsResiduesArray: input must be a dict or tuple")
194 return tuple.__new__(_ProteinsResiduesArray, t)
196 def get_inverted(self):
198 Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
200 return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
203 outstr=self.cldbsk.protein1_key+
" "+str(self[0])
204 outstr+=
" "+self.cldbsk.protein2_key+
" "+str(self[1])
205 outstr+=
" "+self.cldbsk.residue1_key+
" "+str(self[2])
206 outstr+=
" "+self.cldbsk.residue2_key+
" "+str(self[3])
210 outstr=str(self[0])+
"."+str(self[2])+
"-"+str(self[1])+
"."+str(self[3])
215 This class allows to create filter functions that can be passed to the CrossLinkDataBase
218 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
220 where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
222 A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
226 and it is used to filter the database
229 def __init__(self, argument1, operator, argument2):
231 (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
232 or (FilterOperator1,operator.or|and...,FilterOperator2)
234 if isinstance(argument1, FilterOperator):
235 self.operations = [argument1, operator, argument2]
238 self.values = (argument1, operator, argument2)
240 def __or__(self, FilterOperator2):
243 def __and__(self, FilterOperator2):
246 def __invert__(self):
249 def evaluate(self, xl_item):
251 if len(self.operations) == 0:
252 keyword, operator, value = self.values
253 return operator(xl_item[keyword], value)
254 FilterOperator1, op, FilterOperator2 = self.operations
256 if FilterOperator2
is None:
257 return op(FilterOperator1.evaluate(xl_item))
259 return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
262 def filter_factory(xl_):
264 class FilterOperator(object):
268 def __new__(self,key,value,oper=operator.eq):
269 return oper(self.xl[key],value)
271 return FilterOperator
276 This class is needed to convert the keywords from a generic database
282 @param list_parser an instance of ResiduePairListParser, if any is needed
285 self.backward_converter={}
286 _CrossLinkDataBaseStandardKeys.__init__(self)
287 self.rplp = list_parser
288 if self.rplp
is None:
290 self.compulsory_keys=set([self.protein1_key,
295 self.compulsory_keys=self.rplp.get_compulsory_keys()
296 self.setup_keys=set()
300 Is a function that check whether necessary keys are setup
303 if self.compulsory_keys & setup_keys != self.compulsory_keys:
304 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
308 Returns the keys that have been setup so far
310 return self.backward_converter.keys()
314 This sets up the standard compulsory keys as defined by
315 _CrossLinkDataBaseStandardKeys
317 for ck
in self.compulsory_keys:
318 self.converter[ck]=ck
319 self.backward_converter[ck]=ck
321 def set_unique_id_key(self,origin_key):
322 self.converter[origin_key]=self.unique_id_key
323 self.backward_converter[self.unique_id_key]=origin_key
325 def set_protein1_key(self,origin_key):
326 self.converter[origin_key]=self.protein1_key
327 self.backward_converter[self.protein1_key]=origin_key
329 def set_protein2_key(self,origin_key):
330 self.converter[origin_key]=self.protein2_key
331 self.backward_converter[self.protein2_key]=origin_key
333 def set_residue1_key(self,origin_key):
334 self.converter[origin_key]=self.residue1_key
335 self.backward_converter[self.residue1_key]=origin_key
337 def set_residue2_key(self,origin_key):
338 self.converter[origin_key]=self.residue2_key
339 self.backward_converter[self.residue2_key]=origin_key
341 def set_residue1_amino_acid_key(self, origin_key):
342 self.converter[origin_key] = self.residue1_amino_acid_key
343 self.backward_converter[self.residue1_amino_acid_key] = origin_key
345 def set_residue2_amino_acid_key(self, origin_key):
346 self.converter[origin_key] = self.residue2_amino_acid_key
347 self.backward_converter[self.residue2_amino_acid_key] = origin_key
349 def set_residue1_moiety_key(self, origin_key):
350 self.converter[origin_key] = self.residue1_moiety_key
351 self.backward_converter[self.residue1_moiety_key] = origin_key
353 def set_residue2_moiety_key(self, origin_key):
354 self.converter[origin_key] = self.residue2_moiety_key
355 self.backward_converter[self.residue2_moiety_key] = origin_key
357 def set_site_pairs_key(self,origin_key):
358 self.converter[origin_key]=self.site_pairs_key
359 self.backward_converter[self.site_pairs_key]=origin_key
361 def set_id_score_key(self,origin_key):
362 self.converter[origin_key]=self.id_score_key
363 self.backward_converter[self.id_score_key]=origin_key
365 def set_fdr_key(self,origin_key):
366 self.converter[origin_key]=self.fdr_key
367 self.backward_converter[self.fdr_key]=origin_key
369 def set_quantitation_key(self,origin_key):
370 self.converter[origin_key]=self.quantitation_key
371 self.backward_converter[self.quantitation_key]=origin_key
373 def set_psi_key(self,origin_key):
374 self.converter[origin_key]=self.psi_key
375 self.backward_converter[self.psi_key]=origin_key
377 def set_link_type_key(self,link_type_key):
378 self.converter[link_type_key]=self.link_type_key
379 self.backward_converter[self.link_type_key]=link_type_key
383 Returns the dictionary that convert the old keywords to the new ones
386 return self.converter
390 Returns the dictionary that convert the new keywords to the old ones
393 return self.backward_converter
397 A class to handle different styles of site pairs parsers.
399 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
400 [Y3-;K4-] for dead-ends
401 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
402 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
403 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
408 def __init__(self,style):
410 _CrossLinkDataBaseStandardKeys.__init__(self)
411 if style ==
"MSSTUDIO":
413 self.compulsory_keys= set([self.protein1_key,
415 self.site_pairs_key])
416 elif style ==
"XTRACT" or style ==
"QUANTITATION":
418 self.compulsory_keys= set([self.site_pairs_key])
419 elif style ==
"LAN_HUANG":
421 self.compulsory_keys= set([self.site_pairs_key])
423 raise Error(
"ResiduePairListParser: unknown list parser style")
425 def get_compulsory_keys(self):
426 return self.compulsory_keys
430 This function returns a list of cross-linked residues and the corresponding list of
431 cross-linked chains. The latter list can be empty, if the style doesn't have the
432 corresponding information.
434 if self.style ==
"MSSTUDIO":
435 input_string=input_string.replace(
"[",
"")
436 input_string=input_string.replace(
"]",
"")
437 input_string_pairs=input_string.split(
";")
438 residue_pair_indexes=[]
439 chain_pair_indexes=[]
440 for s
in input_string_pairs:
441 m1=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
442 m2=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
445 residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
446 residue_pair_indexes.append((residue_index_1,residue_index_2))
449 residue_type_1,residue_index_1=m2.group(1,2)
451 return residue_pair_indexes,chain_pair_indexes
452 if self.style ==
"XTRACT" or self.style ==
"QUANTITATION":
453 if ":x:" in input_string:
455 input_strings=input_string.split(
":x:")
456 first_peptides=input_strings[0].split(
":|:")
457 second_peptides=input_strings[1].split(
":|:")
458 first_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in first_peptides]
459 second_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in second_peptides]
460 residue_pair_indexes=[]
461 chain_pair_indexes=[]
462 for fpi
in first_peptides_indentifiers:
463 for spi
in second_peptides_indentifiers:
468 residue_pair_indexes.append((residue1,residue2))
469 chain_pair_indexes.append((chain1,chain2))
470 return residue_pair_indexes, chain_pair_indexes
473 first_peptides = input_string.split(
":|:")
474 first_peptides_indentifiers = [(f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
477 for fpi
in first_peptides_indentifiers:
480 residue_indexes.append((residue1,))
481 chain_indexes.append((chain1,))
482 return residue_indexes, chain_indexes
483 if self.style ==
"LAN_HUANG":
484 input_strings=input_string.split(
"-")
485 chain1,first_series=input_strings[0].split(
":")
486 chain2,second_series=input_strings[1].split(
":")
488 first_residues=first_series.replace(
";",
"|").split(
"|")
489 second_residues=second_series.replace(
";",
"|").split(
"|")
490 residue_pair_indexes=[]
491 chain_pair_indexes=[]
492 for fpi
in first_residues:
493 for spi
in second_residues:
494 residue1=self.re.sub(
"[^0-9]",
"", fpi)
495 residue2=self.re.sub(
"[^0-9]",
"", spi)
496 residue_pair_indexes.append((residue1,residue2))
497 chain_pair_indexes.append((chain1,chain2))
498 return residue_pair_indexes, chain_pair_indexes
504 A class to handle different XL format with fixed format
505 currently support ProXL
507 def __init__(self,format):
509 _CrossLinkDataBaseStandardKeys.__init__(self)
510 if format ==
"PROXL":
513 raise Error(
"FixedFormatParser: unknown list format name")
516 def get_data(self,input_string):
517 if self.format ==
"PROXL":
518 tokens=input_string.split(
"\t")
520 if tokens[0]==
"SEARCH ID(S)":
523 xl[self.protein1_key]=tokens[2]
524 xl[self.protein2_key]=tokens[4]
525 xl[self.residue1_key]=int(tokens[3])
526 xl[self.residue2_key]=int(tokens[5])
531 this class handles a cross-link dataset and do filtering
532 operations, adding cross-links, merge datasets...
535 def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=(
'K',)):
538 @param converter an instance of CrossLinkDataBaseKeywordsConverter
539 @param data_base an instance of CrossLinkDataBase to build the new database on
540 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
541 cross-link consistency. If not given consistency will not be checked
542 @param linkable_aa a tuple containing one-letter amino acids which are linkable by the cross-linker;
543 only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
544 and if a fasta_seq is given
547 if data_base
is None:
550 self.data_base=data_base
552 _CrossLinkDataBaseStandardKeys.__init__(self)
553 if converter
is not None:
554 self.cldbkc = converter
555 self.list_parser=self.cldbkc.rplp
556 self.converter = converter.get_converter()
560 self.list_parser=
None
561 self.converter =
None
564 self.def_aa_tuple = linkable_aa
565 self.fasta_seq = fasta_seq
572 Update the whole dataset after changes
574 self.update_cross_link_unique_sub_index()
575 self.update_cross_link_redundancy()
576 self.update_residues_links_number()
581 sorted_ids=sorted(self.data_base.keys())
583 for xl
in self.data_base[k]:
586 def xlid_iterator(self):
587 sorted_ids=sorted(self.data_base.keys())
588 for xlid
in sorted_ids:
591 def __getitem__(self,xlid):
592 return self.data_base[xlid]
595 return len([xl
for xl
in self])
600 def set_name(self,name):
602 for k
in self.data_base:
603 new_data_base[k+
"."+name]=self.data_base[k]
604 self.data_base=new_data_base
608 def get_number_of_xlid(self):
609 return len(self.data_base)
614 if FixedFormatParser is not specified, the file is comma-separated-values
615 @param file_name a txt file to be parsed
616 @param converter an instance of CrossLinkDataBaseKeywordsConverter
617 @param FixedFormatParser a parser for a fixed format
619 if not FixedFormatParser:
620 xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
622 if converter
is not None:
623 self.cldbkc = converter
624 self.list_parser=self.cldbkc.rplp
625 self.converter = converter.get_converter()
628 if not self.list_parser:
632 for nxl,xl
in enumerate(xl_list):
635 if k
in self.converter:
636 new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
639 if self.unique_id_key
in self.cldbkc.get_setup_keys():
640 if new_xl[self.unique_id_key]
not in new_xl_dict:
641 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
643 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
645 if str(nxl)
not in new_xl_dict:
646 new_xl_dict[str(nxl)]=[new_xl]
648 new_xl_dict[str(nxl)].append(new_xl)
653 for nxl,entry
in enumerate(xl_list):
657 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
658 raise Error(
"CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
660 if k
in self.converter:
661 new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
665 residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
668 for n,p
in enumerate(residue_pair_list):
675 new_xl[k]=new_dict[k]
676 new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
678 new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
680 if len(chain_pair_list)==len(residue_pair_list):
681 new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
683 new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
686 new_xl[self.link_type_key]=
"CROSSLINK"
688 new_xl[self.link_type_key]=
"MONOLINK"
690 if self.unique_id_key
in self.cldbkc.get_setup_keys():
691 if new_xl[self.unique_id_key]
not in new_xl_dict:
692 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
694 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
696 if str(nxl)
not in new_xl_dict:
697 new_xl[self.unique_id_key]=str(nxl+1)
698 new_xl_dict[str(nxl)]=[new_xl]
700 new_xl[self.unique_id_key]=str(nxl+1)
701 new_xl_dict[str(nxl)].append(new_xl)
706 if FixedFormatParser is defined
710 f=open(file_name,
"r")
713 xl=FixedFormatParser.get_data(line)
715 xl[self.unique_id_key]=str(nxl+1)
716 new_xl_dict[str(nxl)]=[xl]
720 self.data_base=new_xl_dict
722 l = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
723 self.dataset = ihm.dataset.CXMSDataset(l)
726 def update_cross_link_unique_sub_index(self):
727 for k
in self.data_base:
728 for n,xl
in enumerate(self.data_base[k]):
729 xl[self.ambiguity_key]=len(self.data_base[k])
730 xl[self.unique_sub_index_key]=n+1
731 xl[self.unique_sub_id_key]=k+
"."+str(n+1)
733 def update_cross_link_redundancy(self):
734 redundancy_data_base={}
736 pra=_ProteinsResiduesArray(xl)
737 if pra
not in redundancy_data_base:
738 redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
739 redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
741 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
742 redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
744 pra=_ProteinsResiduesArray(xl)
745 xl[self.redundancy_key]=len(redundancy_data_base[pra])
746 xl[self.redundancy_list_key]=redundancy_data_base[pra]
748 def update_residues_links_number(self):
751 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
752 if (p1,r1)
not in residue_links:
753 residue_links[(p1,r1)]=set([(p2,r2)])
755 residue_links[(p1,r1)].add((p2,r2))
756 if (p2,r2)
not in residue_links:
757 residue_links[(p2,r2)]=set([(p1,r1)])
759 residue_links[(p2,r2)].add((p1,r1))
762 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
763 xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
764 xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
767 """This function checks the consistency of the dataset with the amino acid sequence"""
768 if self.cldbkc
and self.fasta_seq:
769 cnt_matched, cnt_matched_file = 0, 0
773 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
774 b_matched_file =
False
775 if self.residue1_amino_acid_key
in xl:
777 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
778 b_matched = self._match_xlinks(p1, r1, aa_from_file)
779 b_matched_file = b_matched
782 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
784 matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
786 if self.residue2_amino_acid_key
in xl:
787 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
788 b_matched = self._match_xlinks(p2, r2, aa_from_file)
789 b_matched_file = b_matched
791 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
793 matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
794 if b_matched: cnt_matched += 1
795 if b_matched_file: cnt_matched_file += 1
797 percentage_matched = round(100*cnt_matched/len(self),1)
798 percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
800 if matched
or non_matched: print(
"check_cross_link_consistency: Out of %d cross-links "
801 "%d were matched to the fasta sequence (%f %%).\n "
802 "%d were matched by using the cross-link file (%f %%)."%
803 (len(self),cnt_matched,percentage_matched,cnt_matched_file,
804 percentage_matched_file) )
805 if non_matched: print(
"check_cross_link_consistency: Warning: Non matched xlinks:",
806 [(prot_name, sorted(list(non_matched[prot_name])))
for prot_name
in non_matched])
807 return matched,non_matched
809 def _match_xlinks(self, prot_name, res_index, aa_tuple):
812 amino_dict = IMP.pmi.alphabets.amino_acid
814 for amino_acid
in aa_tuple:
815 if len(amino_acid) == 3:
816 amino_acid = amino_dict.get_one_letter_code_from_residue_type(
818 if prot_name
in self.fasta_seq.sequences:
819 seq = self.fasta_seq.sequences[prot_name]
824 if res_index == 0
or res_index == 1:
826 if res_index < len(seq):
827 if amino_acid == seq[res_index]:
833 def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
836 matched[prot].add(res)
838 matched[prot] = set([res])
840 if prot
in non_matched:
841 non_matched[prot].add(res)
843 non_matched[prot] = set([res])
844 return matched, non_matched
847 def get_cross_link_string(self,xl):
849 for k
in self.ordered_key_list:
851 string+=str(k)+
":"+str(xl[k])+
"|"
856 if k
not in self.ordered_key_list:
857 string+=str(k)+
":"+str(xl[k])+
"|"
861 def get_short_cross_link_string(self,xl):
864 list_of_keys=[self.data_set_name_key,
865 self.unique_sub_id_key,
873 for k
in list_of_keys:
875 string+=str(xl[k])+
"|"
881 def filter(self,FilterOperator):
883 for id
in self.data_base.keys():
884 for xl
in self.data_base[id]:
885 if FilterOperator.evaluate(xl):
886 if id
not in new_xl_dict:
889 new_xl_dict[id].append(xl)
892 cdb.dataset = self.dataset
896 '''Get all cross-links with score greater than an input value'''
898 return self.filter(FilterOperator)
900 def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
902 This function merges two cross-link datasets so that if two conflicting cross-links have the same
903 cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
904 with different SubIDs
906 raise NotImplementedError()
909 """Append cross-link dataset to this one."""
911 for k
in self.data_base:
912 new_data_base[k] = self.data_base[k]
913 for k
in db.data_base:
914 new_data_base[k] = db.data_base[k]
915 self.data_base = new_data_base
918 def __iadd__(self, db):
922 def set_value(self, key, new_value, filter_operator=None):
924 This function changes the value for a given key in the database
925 For instance one can change the name of a protein
926 @param key: the key in the database that must be changed
927 @param new_value: the new value of the key
928 @param filter_operator: optional FilterOperator to change the value to
929 a subset of the database
931 example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
935 if filter_operator
is not None:
936 if filter_operator.evaluate(xl):
944 this function returns the list of values for a given key in the database
945 alphanumerically sorted
950 return sorted(list(values))
954 This function offset the residue indexes of a given protein by a specified value
955 @param protein_name: the protein name that need to be changed
956 @param offset: the offset value
960 if xl[self.protein1_key] == protein_name:
961 xl[self.residue1_key]=xl[self.residue1_key]+offset
962 if xl[self.protein2_key] == protein_name:
963 xl[self.residue2_key]=xl[self.residue2_key]+offset
968 This function creates a new keyword for the whole database and set the values from
969 and existing keyword (optional), otherwise the values are set to None
970 @param keyword the new keyword name:
971 @param values_from_keyword the keyword from which we are copying the values:
974 if values_from_keyword
is not None:
975 xl[keyword] = xl[values_from_keyword]
982 This function renames all proteins contained in the input dictionary
983 from the old names (keys) to the new name (values)
984 @param old_to_new_names_dictionary dictionary for converting old to new names
985 @param protein_to_rename specify whether to rename both or protein1 or protein2 only
988 for old_name
in old_to_new_names_dictionary:
989 new_name=old_to_new_names_dictionary[old_name]
990 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
992 self.
set_value(self.protein1_key,new_name,fo2)
993 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
995 self.
set_value(self.protein2_key,new_name,fo2)
999 This function creates as many classes as in the input (number_of_classes: integer)
1000 and partition cross-links according to their identification scores. Classes are defined in the psi key.
1003 if self.id_score_key
is not None:
1006 raise ValueError(
'The cross-link database does not contain score values')
1007 minscore=min(scores)
1008 maxscore=max(scores)
1009 scoreclasses=numpy.linspace(minscore, maxscore, number_of_classes+1)
1010 if self.psi_key
is None:
1013 score=xl[self.id_score_key]
1014 for n,classmin
in enumerate(scoreclasses[0:-1]):
1015 if score>=classmin
and score<=scoreclasses[n+1]:
1016 xl[self.psi_key]=str(
"CLASS_"+str(n))
1019 def clone_protein(self,protein_name,new_protein_name):
1021 for id
in self.data_base.keys():
1023 for xl
in self.data_base[id]:
1024 new_data_base.append(xl)
1025 if xl[self.protein1_key]==protein_name
and xl[self.protein2_key]!=protein_name:
1027 new_xl[self.protein1_key]=new_protein_name
1028 new_data_base.append(new_xl)
1029 elif xl[self.protein1_key]!=protein_name
and xl[self.protein2_key]==protein_name:
1031 new_xl[self.protein2_key]=new_protein_name
1032 new_data_base.append(new_xl)
1033 elif xl[self.protein1_key]==protein_name
and xl[self.protein2_key]==protein_name:
1035 new_xl[self.protein1_key]=new_protein_name
1036 new_data_base.append(new_xl)
1038 new_xl[self.protein2_key]=new_protein_name
1039 new_data_base.append(new_xl)
1041 new_xl[self.protein1_key]=new_protein_name
1042 new_xl[self.protein2_key]=new_protein_name
1043 new_data_base.append(new_xl)
1044 self.data_base[id]=new_data_base
1049 This function remove cross-links applied to the same residue
1050 (ie, same chain name and residue number)
1053 for id
in self.data_base.keys():
1055 for xl
in self.data_base[id]:
1056 if xl[self.protein1_key]==xl[self.protein2_key]
and xl[self.residue1_key]==xl[self.residue2_key]:
1059 new_data_base.append(xl)
1060 self.data_base[id]=new_data_base
1066 this method returns a CrossLinkDataBase class containing
1067 a random subsample of the original cross-link database.
1068 @param percentage float between 0 and 1, is the percentage of
1069 of spectra taken from the original list
1072 if percentage > 1.0
or percentage < 0.0:
1073 raise ValueError(
'the percentage of random cross-link spectra should be between 0 and 1')
1074 nspectra=self.get_number_of_xlid()
1075 nrandom_spectra=int(nspectra*percentage)
1076 random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1078 for k
in random_keys:
1079 new_data_base[k]=self.data_base[k]
1084 sorted_ids=sorted(self.data_base.keys())
1086 for id
in sorted_ids:
1088 for xl
in self.data_base[id]:
1089 for k
in self.ordered_key_list:
1091 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1096 if k
not in self.ordered_key_list:
1097 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1098 outstr+=
"-------------\n"
1102 def plot(self,filename,**kwargs):
1104 matplotlib.use(
'Agg')
1105 import matplotlib.pyplot
as plt
1106 import matplotlib.colors
1110 if kwargs[
"type"] ==
"scatter":
1111 cmap=plt.get_cmap(
"rainbow")
1112 norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1115 if "colorkey" in kwargs:
1116 colorkey=kwargs[
"colorkey"]
1117 if "sizekey" in kwargs:
1118 sizekey=kwargs[
"sizekey"]
1119 if "logyscale" in kwargs:
1120 logyscale=kwargs[
"logyscale"]
1128 xs.append(float(xl[xkey]))
1130 ys.append(math.log10(float(xl[ykey])))
1132 ys.append(float(xl[ykey]))
1133 colors.append(float(xl[colorkey]))
1135 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1139 for color
in colors:
1140 cindex=(color-min(colors))/(max(colors)-min(colors))
1141 cs.append(cmap(cindex))
1144 ax = fig.add_subplot(111)
1145 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker=
"o")
1148 plt.savefig(filename)
1152 if kwargs[
"type"] ==
"residue_links":
1156 molecule=kwargs[
"molecule"]
1158 length=len(molecule.sequence)
1159 molecule=molecule.get_name()
1162 sequences_object=kwargs[
"sequences_object"]
1163 sequence=sequences_object.sequences[molecule]
1164 length=len(sequence)
1166 histogram=[0]*length
1168 if xl[self.protein1_key]==molecule:
1169 histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1170 if xl[self.protein2_key]==molecule:
1171 histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1172 rects = plt.bar(range(1,length+1), histogram)
1179 plt.savefig(filename)
1185 if kwargs[
"type"] ==
"histogram":
1186 valuekey=kwargs[
"valuekey"]
1187 reference_xline=kwargs[
"reference_xline"]
1193 values_list.append(float(xl[valuekey]))
1195 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1198 filename, [values_list], valuename=valuename, bins=bins,
1199 colors=
None, format=
"pdf",
1200 reference_xline=
None, yplotrange=
None,
1201 xplotrange=
None,normalized=
True,
1204 def dump(self,json_filename):
1206 with open(json_filename,
'w')
as fp:
1207 json.dump(self.data_base, fp, sort_keys=
True, indent=2, default=set_json_default)
1209 def load(self,json_filename):
1211 with open(json_filename,
'r') as fp:
1212 self.data_base = json.load(fp)
1216 if sys.version_info[0] < 3:
1218 for k,v
in xl.iteritems():
1219 if type(k)
is unicode: k=str(k)
1220 if type(v)
is unicode: v=str(v)
1223 def save_csv(self,filename):
1229 sorted_group_ids=sorted(self.data_base.keys())
1230 for group
in sorted_group_ids:
1232 for xl
in self.data_base[group]:
1234 sorted_ids=sorted(xl.keys())
1235 values=[xl[k]
for k
in sorted_ids]
1236 group_block.append(values)
1240 with open(filename,
'w')
as fp:
1241 a = csv.writer(fp, delimiter=
',')
1242 a.writerow(sorted_ids)
1247 Returns the number of non redundant cross-link sites
1251 pra=_ProteinsResiduesArray(xl)
1252 prai=pra.get_inverted()
1255 return len(list(praset))
1258 """This class allows to compute and plot the distance between datasets"""
1261 """Input a dictionary where keys are cldb names and values are cldbs"""
1262 import scipy.spatial.distance
1263 self.dbs=cldb_dictionary
1264 self.keylist=list(self.dbs.keys())
1265 self.distances=list()
1268 for i,key1
in enumerate(self.keylist):
1269 for key2
in self.keylist[i+1:]:
1270 distance=self.get_distance(key1,key2)
1271 self.distances.append(distance)
1273 self.distances=scipy.spatial.distance.squareform(self.distances)
1276 return 1.0-self.
jaccard_index(self.dbs[key1],self.dbs[key2])
1279 """Similarity index between two datasets
1280 https://en.wikipedia.org/wiki/Jaccard_index"""
1284 for xl1
in CrossLinkDataBase1:
1285 a1f=_ProteinsResiduesArray(xl1)
1286 a1b=a1f.get_inverted()
1289 for xl2
in CrossLinkDataBase2:
1290 a2f=_ProteinsResiduesArray(xl2)
1291 a2b=a2f.get_inverted()
1294 return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1296 def plot_matrix(self,figurename="clustermatrix.pdf"):
1297 import matplotlib
as mpl
1300 import matplotlib.pylab
as pl
1301 from scipy.cluster
import hierarchy
as hrc
1303 raw_distance_matrix=self.distances
1309 ax = fig.add_subplot(1,1,1)
1310 dendrogram = hrc.dendrogram(
1311 hrc.linkage(raw_distance_matrix),
1314 leaves_order = dendrogram[
'leaves']
1315 ax.set_xlabel(
'Dataset')
1316 ax.set_ylabel(
'Jaccard Distance')
1318 pl.savefig(
"dendrogram."+figurename, dpi=300)
1324 ax = fig.add_subplot(1,1,1)
1326 raw_distance_matrix[leaves_order,
1329 interpolation=
'nearest')
1330 cb = fig.colorbar(cax)
1331 cb.set_label(
'Jaccard Distance')
1332 ax.set_xlabel(
'Dataset')
1333 ax.set_ylabel(
'Dataset')
1334 ax.set_xticks(range(len(labels)))
1335 ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation=
'vertical')
1336 ax.set_yticks(range(len(labels)))
1337 ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation=
'horizontal')
1339 pl.savefig(
"matrix."+figurename, dpi=300)
1345 This class maps a CrossLinkDataBase on a given structure
1346 and save an rmf file with color-coded cross-links
1350 def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1351 self.CrossLinkDataBase=CrossLinkDataBase
1354 self.prots=rmf_or_stat_handler
1355 self.distances=defaultdict(list)
1359 print(
"computing distances fro all cross-links and all structures")
1360 for i
in self.prots[::10]:
1361 self.compute_distances()
1362 for key,xl
in enumerate(self.CrossLinkDataBase):
1363 array=_ProteinsResiduesArray(xl)
1364 self.array_to_id[array]=key
1365 self.id_to_array[key]=array
1366 if xl[
"MinAmbiguousDistance"] !=
'None':
1367 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1369 def compute_distances(self):
1372 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1373 for group
in sorted_group_ids:
1376 for xl
in self.CrossLinkDataBase.data_base[group]:
1378 sorted_ids=sorted(xl.keys())
1379 data.append(sorted_ids+[
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1380 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1382 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1391 values=[xl[k]
for k
in sorted_ids]
1392 values += [group, mdist]
1393 except KeyError
as e:
1394 print(
"MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1395 group_dists.append(mdist)
1397 xl[
"Distance"]=mdist
1403 for xl
in self.CrossLinkDataBase.data_base[group]:
1404 xl[
"MinAmbiguousDistance"]=min(group_dists)
1406 def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1407 '''more robust and slower version of above'''
1409 selpart_1=sel.get_selected_particles()
1410 if len(selpart_1)==0:
1411 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1414 selpart_2=sel.get_selected_particles()
1415 if len(selpart_2)==0:
1416 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1419 for p1
in selpart_1:
1420 for p2
in selpart_2:
1421 if p1 == p2
and r1 == r2:
continue
1441 results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1442 if len(results)==0:
return None
1443 results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1444 return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1446 def save_rmf_snapshot(self,filename,color_id=None):
1447 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1450 for group
in sorted_group_ids:
1451 group_dists_particles=[]
1452 for xl
in self.CrossLinkDataBase.data_base[group]:
1453 xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1454 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1456 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1458 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1460 if color_id
is not None:
1461 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1463 group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1464 if group_dists_particles:
1465 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1466 color_scores.append(mincolor_score)
1467 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1472 linear.set_slope(1.0)
1474 rslin =
IMP.RestraintSet(self.prots.get_model(),
'linear_dummy_restraints')
1476 offset=min(color_scores)
1477 maxvalue=max(color_scores)
1478 for pair
in list_of_pairs:
1480 pr.set_name(pair[2])
1482 factor=(pair[3]-offset)/(maxvalue-offset)
1487 rslin.add_restraint(pr)
1490 rh = RMF.create_rmf_file(filename)
1497 def boxplot_crosslink_distances(self,filename):
1499 keys=[self.id_to_array[k]
for k
in self.distances.keys()]
1500 medians=[numpy.median(self.distances[self.array_to_id[k]])
for k
in keys]
1501 dists=[self.distances[self.array_to_id[k]]
for k
in keys]
1502 distances_sorted_by_median=[x
for _,x
in sorted(zip(medians,dists))]
1503 keys_sorted_by_median=[x
for _,x
in sorted(zip(medians,keys))]
1505 distances_sorted_by_median,
1506 range(len(distances_sorted_by_median)),
1507 xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1509 def get_crosslink_violations(self,threshold):
1510 violations=defaultdict(list)
1511 for k
in self.distances:
1513 viols=self.distances[k]
1514 violations[self.id_to_array[k]]=viols
1520 This class generates a CrossLinkDataBase from a given structure
1522 def __init__(self, system, residue_types_1=["K"],
1523 residue_types_2=[
"K"], reactivity_range=[0,1], kt=1.0):
1528 cldbkc.set_protein1_key(
"Protein1")
1529 cldbkc.set_protein2_key(
"Protein2")
1530 cldbkc.set_residue1_key(
"Residue1")
1531 cldbkc.set_residue2_key(
"Residue2")
1534 self.model=self.system.model
1535 self.residue_types_1=residue_types_1
1536 self.residue_types_2=residue_types_2
1538 self.indexes_dict1={}
1539 self.indexes_dict2={}
1540 self.protein_residue_dict={}
1541 self.reactivity_dictionary={}
1542 self.euclidean_interacting_pairs=
None
1543 self.xwalk_interacting_pairs=
None
1546 for state
in self.system.get_states():
1547 for moleculename,molecules
in state.get_molecules().items():
1548 for molecule
in molecules:
1550 seq=molecule.sequence
1551 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1557 rexp=numpy.random.exponential(0.00000001)
1558 prob=1.0-math.exp(-rexp)
1559 self.reactivity_dictionary[(molecule,r)]=prob
1561 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1562 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1565 ps=s.get_selected_particles()
1568 self.indexes_dict1[index]=(molecule,r)
1569 self.protein_residue_dict[(molecule,r)]=index
1572 ps=s.get_selected_particles()
1575 self.indexes_dict2[index]=(molecule,r)
1576 self.protein_residue_dict[(molecule,r)]=index
1579 def get_all_possible_pairs(self):
1580 n=float(len(self.protein_residue_dict.keys()))
1581 return n*(n-1.0)/2.0
1583 def get_all_feasible_pairs(self,distance=21):
1585 particle_index_pairs=[]
1587 for a,b
in itertools.combinations(self.protein_residue_dict.keys(),2):
1590 index1=self.protein_residue_dict[a]
1591 index2=self.protein_residue_dict[b]
1593 if particle_distance <= distance:
1594 particle_index_pairs.append((index1,index2))
1595 new_xl[self.cldb.protein1_key]=a[0].get_name()
1596 new_xl[self.cldb.protein2_key]=b[0].get_name()
1597 new_xl[
"molecule_object1"]=a[0]
1598 new_xl[
"molecule_object2"]=b[0]
1599 new_xl[self.cldb.residue1_key]=a[1]
1600 new_xl[self.cldb.residue2_key]=b[1]
1601 self.cldb.data_base[str(nxl)]=[new_xl]
1609 def get_data_base(self,total_number_of_spectra,
1610 ambiguity_probability=0.1,
1613 max_delta_distance=10.0,
1614 xwalk_bin_path=
None,
1615 confidence_false=0.75,
1616 confidence_true=0.75):
1618 from random
import random,uniform
1622 self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1623 self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1624 self.cldb.data_base[str(number_of_spectra)]=[]
1625 self.sites_weighted=
None
1627 while number_of_spectra<total_number_of_spectra:
1628 if random() > ambiguity_probability
and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1630 number_of_spectra+=1
1631 self.cldb.data_base[str(number_of_spectra)]=[]
1633 if random() > noise:
1635 pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1638 pra,dist=self.get_random_residue_pair(
None,
None,
None)
1642 new_xl[self.cldb.protein1_key]=pra[0].get_name()
1643 new_xl[self.cldb.protein2_key]=pra[1].get_name()
1644 new_xl[
"molecule_object1"]=pra[0]
1645 new_xl[
"molecule_object2"]=pra[1]
1646 new_xl[self.cldb.residue1_key]=pra[2]
1647 new_xl[self.cldb.residue2_key]=pra[3]
1648 new_xl[
"Noisy"]=noisy
1650 new_xl[
"Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1651 new_xl[
"Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1652 r1=new_xl[
"Reactivity_Residue1"]
1653 r2=new_xl[
"Reactivity_Residue2"]
1659 new_xl[
"Score"]=np.random.beta(1.0,self.beta_false)
1662 new_xl[
"Score"]=1.0-np.random.beta(1.0,self.beta_true)
1663 new_xl[
"TargetDistance"]=dist
1664 new_xl[
"NoiseProbability"]=noise
1665 new_xl[
"AmbiguityProbability"]=ambiguity_probability
1667 (p1,p2)=
IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1668 self.protein_residue_dict[(pra[1],pra[3])]])
1673 new_xl[
"InterRigidBody"] =
False
1678 new_xl[
"InterRigidBody"] =
True
1680 new_xl[
"InterRigidBody"] =
None
1682 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1687 def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1690 from random
import choice,uniform
1691 if distance
is None:
1694 (protein1,residue1)=choice(self.protein_residue_dict.keys())
1695 (protein2,residue2)=choice(self.protein_residue_dict.keys())
1696 index1=self.protein_residue_dict[(protein1,residue1)]
1697 index2=self.protein_residue_dict[(protein2,residue2)]
1699 if (protein1,residue1) != (protein2,residue2):
1703 if not xwalk_bin_path:
1705 gcpf.set_distance(distance+max_delta_distance)
1709 if not self.sites_weighted:
1710 self.sites_weighted=[]
1711 for key
in self.reactivity_dictionary:
1712 r=self.reactivity_dictionary[key]
1713 self.sites_weighted.append((key,r))
1715 first_site=self.weighted_choice(self.sites_weighted)
1717 if not self.euclidean_interacting_pairs:
1718 self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1719 self.indexes_dict1.keys(),
1720 self.indexes_dict2.keys())
1722 first_site_pairs = [pair
for pair
in self.euclidean_interacting_pairs
1723 if self.indexes_dict1[pair[0]] == first_site
or
1724 self.indexes_dict2[pair[1]] == first_site]
1725 if len(first_site_pairs)==0:
continue
1727 second_sites_weighted=[]
1728 for pair
in first_site_pairs:
1729 if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1730 if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1731 r=self.reactivity_dictionary[second_site]
1732 second_sites_weighted.append((second_site,r))
1733 second_site=self.weighted_choice(second_sites_weighted)
1735 interacting_pairs_weighted=[]
1736 for pair in self.euclidean_interacting_pairs:
1737 r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1738 r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1739 #combined reactivity 1-exp(-k12*Delta t),
1742 r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1743 interacting_pairs_weighted.append((pair,r12))
1744 #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1745 #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1746 #interacting_pairs_weighted.append((pair,weight1*weight2))
1749 pair=self.weighted_choice(interacting_pairs_weighted)
1750 protein1,residue1=self.indexes_dict1[pair[0]]
1751 protein2,residue2=self.indexes_dict2[pair[1]]
1752 particle_pair=IMP.get_particles(self.model,pair)
1753 particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1754 if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1756 elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1757 #allow some flexibility
1758 prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1759 if uniform(0.0,1.0)<prob: break
1761 protein1,residue1=first_site
1762 protein2,residue2=second_site
1763 print(
"CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1764 "First site",first_site,self.reactivity_dictionary[first_site],
1765 "Second site",second_site,self.reactivity_dictionary[second_site])
1766 particle_pair=
IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1769 if particle_distance<distance
and (protein1,residue1) != (protein2,residue2):
1771 elif particle_distance>=distance
and (protein1,residue1) != (protein2,residue2)
and max_delta_distance:
1775 if particle_distance-distance < max_delta_distance:
break
1780 if not self.xwalk_interacting_pairs:
1781 self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1782 interacting_pairs_weighted=[]
1784 for pair
in self.xwalk_interacting_pairs:
1789 weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1790 weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1791 interacting_pairs_weighted.append((pair,weight1*weight2))
1793 pair=self.weighted_choice(interacting_pairs_weighted)
1798 particle_distance=float(pair[4])
1802 return ((protein1,protein2,residue1,residue2)),particle_distance
1804 def get_xwalk_distances(self,xwalk_bin_path,distance):
1808 o.init_pdb(
"xwalk.pdb",self.representation.prot)
1809 o.write_pdb(
"xwalk.pdb")
1810 namechainiddict=o.dictchain[
"xwalk.pdb"]
1813 for key
in namechainiddict: chainiddict[namechainiddict[key]]=key
1814 xwalkout=os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+
' -bb').read()
1816 output_list_of_distance=[]
1817 for line
in xwalkout.split(
"\n")[0:-2]:
1821 distance=float(tokens[6])
1823 ss=second.split(
"-")
1826 protein1=chainiddict[chainid1]
1827 protein2=chainiddict[chainid2]
1830 output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1831 return output_list_of_distance
1834 def weighted_choice(self,choices):
1837 total = sum(w
for c, w
in choices)
1838 r = random.uniform(0, total)
1840 for c, w
in choices:
1844 assert False,
"Shouldn't get here"
1846 def save_rmf_snapshot(self,filename,color_id=None):
1849 if color_id
is None:
1850 color_id=
"Reactivity"
1852 sorted_group_ids=sorted(self.cldb.data_base.keys())
1855 for group
in sorted_group_ids:
1857 group_dists_particles=[]
1858 for xl
in self.cldb.data_base[group]:
1859 xllabel=self.cldb.get_short_cross_link_string(xl)
1860 (c1,c2,r1,r2)=(xl[
"molecule_object1"],xl[
"molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1862 index1=self.protein_residue_dict[(c1,r1)]
1863 index2=self.protein_residue_dict[(c2,r2)]
1865 mdist=xl[
"TargetDistance"]
1867 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1869 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1870 if group_dists_particles:
1871 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1872 color_scores.append(mincolor_score)
1873 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1880 linear.set_slope(1.0)
1884 offset=min(color_scores)
1885 maxvalue=max(color_scores)
1886 for pair
in list_of_pairs:
1888 pr.set_name(pair[2])
1889 factor=(pair[3]-offset)/(maxvalue-offset)
1892 rslin.add_restraint(pr)
1895 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
def get_number_of_unique_crosslinked_sites
Returns the number of non redundant cross-link sites.
def filter_score
Get all cross-links with score greater than an input value.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
def create_new_keyword
This function creates a new keyword for the whole database and set the values from and existing keywo...
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
def merge
This function merges two cross-link datasets so that if two conflicting cross-links have the same cro...
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded cross-...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def set_value
This function changes the value for a given key in the database For instance one can change the name ...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
def filter_out_same_residues
This function remove cross-links applied to the same residue (ie, same chain name and residue number)...
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def offset_residue_index
This function offset the residue indexes of a given protein by a specified value. ...
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
def jackknife
this method returns a CrossLinkDataBase class containing a random subsample of the original cross-lin...
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def check_cross_link_consistency
This function checks the consistency of the dataset with the amino acid sequence. ...
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
def get_values
this function returns the list of values for a given key in the database alphanumerically sorted ...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
def append_database
Append cross-link dataset to this one.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
Mapping between FASTA one-letter codes and residue types.
A class to handle different styles of site pairs parsers.
def classify_crosslinks_by_score
This function creates as many classes as in the input (number_of_classes: integer) and partition cros...
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
this class handles a cross-link dataset and do filtering operations, adding cross-links, merge datasets...
Support for the RMF file format for storing hierarchical molecular data and markup.
def rename_proteins
This function renames all proteins contained in the input dictionary from the old names (keys) to the...
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.
def create_set_from_file
if FixedFormatParser is not specified, the file is comma-separated-values