1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
7 from __future__
import print_function
23 from collections
import defaultdict
27 def set_json_default(obj):
28 if isinstance(obj, set):
35 if sys.version_info[0] >= 3:
36 def _handle_string_input(inp):
37 if not isinstance(inp, str):
38 raise TypeError(
"expecting a string")
41 def _handle_string_input(inp):
42 if not isinstance(inp, (str, unicode)):
43 raise TypeError(
"expecting a string or unicode")
45 if isinstance(inp, unicode):
50 class _CrossLinkDataBaseStandardKeys(object):
52 This class setup all the standard keys needed to
53 identify the cross-link features from the data sets
57 self.protein1_key=
"Protein1"
58 self.type[self.protein1_key]=str
59 self.protein2_key=
"Protein2"
60 self.type[self.protein2_key]=str
61 self.residue1_key=
"Residue1"
62 self.type[self.residue1_key]=int
63 self.residue2_key=
"Residue2"
64 self.type[self.residue2_key]=int
65 self.residue1_amino_acid_key=
"Residue1AminoAcid"
66 self.type[self.residue1_amino_acid_key]=str
67 self.residue2_amino_acid_key=
"Residue2AminoAcid"
68 self.type[self.residue2_amino_acid_key]=str
69 self.residue1_moiety_key=
"Residue1Moiety"
70 self.type[self.residue1_moiety_key]=str
71 self.residue2_moiety_key=
"Residue2Moiety"
72 self.type[self.residue2_moiety_key]=str
73 self.site_pairs_key=
"SitePairs"
74 self.type[self.site_pairs_key]=str
75 self.unique_id_key=
"XLUniqueID"
76 self.type[self.unique_id_key]=str
77 self.unique_sub_index_key=
"XLUniqueSubIndex"
78 self.type[self.unique_sub_index_key]=int
79 self.unique_sub_id_key=
"XLUniqueSubID"
80 self.type[self.unique_sub_id_key]=str
81 self.data_set_name_key=
"DataSetName"
82 self.type[self.data_set_name_key]=str
83 self.cross_linker_chemical_key=
"CrossLinkerChemical"
84 self.type[self.cross_linker_chemical_key]=str
85 self.id_score_key=
"IDScore"
86 self.type[self.id_score_key]=float
88 self.type[self.fdr_key]=float
89 self.quantitation_key=
"Quantitation"
90 self.type[self.quantitation_key]=float
91 self.redundancy_key=
"Redundancy"
92 self.type[self.redundancy_key]=int
93 self.redundancy_list_key=
"RedundancyList"
94 self.type[self.redundancy_key]=list
95 self.ambiguity_key=
"Ambiguity"
96 self.type[self.ambiguity_key]=int
97 self.residue1_links_number_key=
"Residue1LinksNumber"
98 self.type[self.residue1_links_number_key]=int
99 self.residue2_links_number_key=
"Residue2LinksNumber"
100 self.type[self.residue2_links_number_key]=int
101 self.type[self.ambiguity_key]=int
102 self.state_key=
"State"
103 self.type[self.state_key]=int
104 self.sigma1_key=
"Sigma1"
105 self.type[self.sigma1_key]=str
106 self.sigma2_key=
"Sigma2"
107 self.type[self.sigma2_key]=str
109 self.type[self.psi_key]=str
110 self.distance_key=
"Distance"
111 self.type[self.distance_key]=float
112 self.min_ambiguous_distance_key=
"MinAmbiguousDistance"
113 self.type[self.distance_key]=float
115 self.link_type_key=
"LinkType"
116 self.type[self.link_type_key]=str
118 self.ordered_key_list =[self.data_set_name_key,
120 self.unique_sub_index_key,
121 self.unique_sub_id_key,
126 self.residue1_amino_acid_key,
127 self.residue2_amino_acid_key,
128 self.residue1_moiety_key,
129 self.residue2_moiety_key,
130 self.cross_linker_chemical_key,
133 self.quantitation_key,
135 self.redundancy_list_key,
141 self.min_ambiguous_distance_key,
145 class _ProteinsResiduesArray(tuple):
147 This class is inherits from tuple, and it is a shorthand for a cross-link
148 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
149 residue1 and residue2.
152 def __new__(self,input_data):
154 @input_data can be a dict or a tuple
156 self.cldbsk=_CrossLinkDataBaseStandardKeys()
157 if type(input_data)
is dict:
159 p1=input_data[self.cldbsk.protein1_key]
161 p2=input_data[self.cldbsk.protein2_key]
164 r1=input_data[self.cldbsk.residue1_key]
166 r2=input_data[self.cldbsk.residue2_key]
173 elif type(input_data)
is tuple:
174 if len(input_data)>4
or len(input_data)==3
or len(input_data)==1:
175 raise TypeError(
"_ProteinsResiduesArray: must have only 4 elements")
176 if len(input_data)==4:
177 p1 = _handle_string_input(input_data[0])
178 p2 = _handle_string_input(input_data[1])
181 if (
not (type(r1)
is int))
and (
not (r1
is None)):
182 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
183 if (
not (type(r2)
is int))
and (
not (r1
is None)):
184 raise TypeError(
"_ProteinsResiduesArray: residue2 must be a integer")
186 if len(input_data) == 2:
187 p1 = _handle_string_input(input_data[0])
189 if type(r1)
is not int:
190 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
193 raise TypeError(
"_ProteinsResiduesArray: input must be a dict or tuple")
194 return tuple.__new__(_ProteinsResiduesArray, t)
196 def get_inverted(self):
198 Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
200 return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
203 outstr=self.cldbsk.protein1_key+
" "+str(self[0])
204 outstr+=
" "+self.cldbsk.protein2_key+
" "+str(self[1])
205 outstr+=
" "+self.cldbsk.residue1_key+
" "+str(self[2])
206 outstr+=
" "+self.cldbsk.residue2_key+
" "+str(self[3])
210 outstr=str(self[0])+
"."+str(self[2])+
"-"+str(self[1])+
"."+str(self[3])
215 This class allows to create filter functions that can be passed to the CrossLinkDataBase
218 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
220 where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
222 A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
226 and it is used to filter the database
229 def __init__(self, argument1, operator, argument2):
231 (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
232 or (FilterOperator1,operator.or|and...,FilterOperator2)
234 if isinstance(argument1, FilterOperator):
235 self.operations = [argument1, operator, argument2]
238 self.values = (argument1, operator, argument2)
240 def __or__(self, FilterOperator2):
243 def __and__(self, FilterOperator2):
246 def __invert__(self):
249 def evaluate(self, xl_item):
251 if len(self.operations) == 0:
252 keyword, operator, value = self.values
253 return operator(xl_item[keyword], value)
254 FilterOperator1, op, FilterOperator2 = self.operations
256 if FilterOperator2
is None:
257 return op(FilterOperator1.evaluate(xl_item))
259 return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
262 def filter_factory(xl_):
264 class FilterOperator(object):
268 def __new__(self,key,value,oper=operator.eq):
269 return oper(self.xl[key],value)
271 return FilterOperator
276 This class is needed to convert the keywords from a generic database
282 @param list_parser an instance of ResiduePairListParser, if any is needed
285 self.backward_converter={}
286 _CrossLinkDataBaseStandardKeys.__init__(self)
287 self.rplp = list_parser
288 if self.rplp
is None:
290 self.compulsory_keys=set([self.protein1_key,
295 self.compulsory_keys=self.rplp.get_compulsory_keys()
296 self.setup_keys=set()
300 Is a function that check whether necessary keys are setup
303 if self.compulsory_keys & setup_keys != self.compulsory_keys:
304 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
308 Returns the keys that have been setup so far
310 return self.backward_converter.keys()
314 This sets up the standard compulsory keys as defined by
315 _CrossLinkDataBaseStandardKeys
317 for ck
in self.compulsory_keys:
318 self.converter[ck]=ck
319 self.backward_converter[ck]=ck
321 def set_unique_id_key(self,origin_key):
322 self.converter[origin_key]=self.unique_id_key
323 self.backward_converter[self.unique_id_key]=origin_key
325 def set_protein1_key(self,origin_key):
326 self.converter[origin_key]=self.protein1_key
327 self.backward_converter[self.protein1_key]=origin_key
329 def set_protein2_key(self,origin_key):
330 self.converter[origin_key]=self.protein2_key
331 self.backward_converter[self.protein2_key]=origin_key
333 def set_residue1_key(self,origin_key):
334 self.converter[origin_key]=self.residue1_key
335 self.backward_converter[self.residue1_key]=origin_key
337 def set_residue2_key(self,origin_key):
338 self.converter[origin_key]=self.residue2_key
339 self.backward_converter[self.residue2_key]=origin_key
341 def set_residue1_amino_acid_key(self, origin_key):
342 self.converter[origin_key] = self.residue1_amino_acid_key
343 self.backward_converter[self.residue1_amino_acid_key] = origin_key
345 def set_residue2_amino_acid_key(self, origin_key):
346 self.converter[origin_key] = self.residue2_amino_acid_key
347 self.backward_converter[self.residue2_amino_acid_key] = origin_key
349 def set_residue1_moiety_key(self, origin_key):
350 self.converter[origin_key] = self.residue1_moiety_key
351 self.backward_converter[self.residue1_moiety_key] = origin_key
353 def set_residue2_moiety_key(self, origin_key):
354 self.converter[origin_key] = self.residue2_moiety_key
355 self.backward_converter[self.residue2_moiety_key] = origin_key
357 def set_site_pairs_key(self,origin_key):
358 self.converter[origin_key]=self.site_pairs_key
359 self.backward_converter[self.site_pairs_key]=origin_key
361 def set_id_score_key(self,origin_key):
362 self.converter[origin_key]=self.id_score_key
363 self.backward_converter[self.id_score_key]=origin_key
365 def set_fdr_key(self,origin_key):
366 self.converter[origin_key]=self.fdr_key
367 self.backward_converter[self.fdr_key]=origin_key
369 def set_quantitation_key(self,origin_key):
370 self.converter[origin_key]=self.quantitation_key
371 self.backward_converter[self.quantitation_key]=origin_key
373 def set_psi_key(self,origin_key):
374 self.converter[origin_key]=self.psi_key
375 self.backward_converter[self.psi_key]=origin_key
377 def set_link_type_key(self,link_type_key):
378 self.converter[link_type_key]=self.link_type_key
379 self.backward_converter[self.link_type_key]=link_type_key
383 Returns the dictionary that convert the old keywords to the new ones
386 return self.converter
390 Returns the dictionary that convert the new keywords to the old ones
393 return self.backward_converter
397 A class to handle different styles of site pairs parsers.
399 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
400 [Y3-;K4-] for dead-ends
401 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
402 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
403 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
408 def __init__(self,style):
410 _CrossLinkDataBaseStandardKeys.__init__(self)
411 if style ==
"MSSTUDIO":
413 self.compulsory_keys= set([self.protein1_key,
415 self.site_pairs_key])
416 elif style ==
"XTRACT" or style ==
"QUANTITATION":
418 self.compulsory_keys= set([self.site_pairs_key])
419 elif style ==
"LAN_HUANG":
421 self.compulsory_keys= set([self.site_pairs_key])
423 raise Error(
"ResiduePairListParser: unknown list parser style")
425 def get_compulsory_keys(self):
426 return self.compulsory_keys
430 This function returns a list of cross-linked residues and the corresponding list of
431 cross-linked chains. The latter list can be empty, if the style doesn't have the
432 corresponding information.
434 if self.style ==
"MSSTUDIO":
435 input_string=input_string.replace(
"[",
"")
436 input_string=input_string.replace(
"]",
"")
437 input_string_pairs=input_string.split(
";")
438 residue_pair_indexes=[]
439 chain_pair_indexes=[]
440 for s
in input_string_pairs:
441 m1=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
442 m2=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
445 residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
446 residue_pair_indexes.append((residue_index_1,residue_index_2))
449 residue_type_1,residue_index_1=m2.group(1,2)
451 return residue_pair_indexes,chain_pair_indexes
452 if self.style ==
"XTRACT" or self.style ==
"QUANTITATION":
453 if ":x:" in input_string:
455 input_strings=input_string.split(
":x:")
456 first_peptides=input_strings[0].split(
":|:")
457 second_peptides=input_strings[1].split(
":|:")
458 first_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in first_peptides]
459 second_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in second_peptides]
460 residue_pair_indexes=[]
461 chain_pair_indexes=[]
462 for fpi
in first_peptides_indentifiers:
463 for spi
in second_peptides_indentifiers:
468 residue_pair_indexes.append((residue1,residue2))
469 chain_pair_indexes.append((chain1,chain2))
470 return residue_pair_indexes, chain_pair_indexes
473 first_peptides = input_string.split(
":|:")
474 first_peptides_indentifiers = [(f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
477 for fpi
in first_peptides_indentifiers:
480 residue_indexes.append((residue1,))
481 chain_indexes.append((chain1,))
482 return residue_indexes, chain_indexes
483 if self.style ==
"LAN_HUANG":
484 input_strings=input_string.split(
"-")
485 chain1,first_series=input_strings[0].split(
":")
486 chain2,second_series=input_strings[1].split(
":")
488 first_residues=first_series.replace(
";",
"|").split(
"|")
489 second_residues=second_series.replace(
";",
"|").split(
"|")
490 residue_pair_indexes=[]
491 chain_pair_indexes=[]
492 for fpi
in first_residues:
493 for spi
in second_residues:
494 residue1=self.re.sub(
"[^0-9]",
"", fpi)
495 residue2=self.re.sub(
"[^0-9]",
"", spi)
496 residue_pair_indexes.append((residue1,residue2))
497 chain_pair_indexes.append((chain1,chain2))
498 return residue_pair_indexes, chain_pair_indexes
504 A class to handle different XL format with fixed format
505 currently support ProXL
507 def __init__(self,format):
509 _CrossLinkDataBaseStandardKeys.__init__(self)
510 if format ==
"PROXL":
513 raise Error(
"FixedFormatParser: unknown list format name")
516 def get_data(self,input_string):
517 if self.format ==
"PROXL":
518 tokens=input_string.split(
"\t")
520 if tokens[0]==
"SEARCH ID(S)":
523 xl[self.protein1_key]=tokens[2]
524 xl[self.protein2_key]=tokens[4]
525 xl[self.residue1_key]=int(tokens[3])
526 xl[self.residue2_key]=int(tokens[5])
531 this class handles a cross-link dataset and do filtering
532 operations, adding cross-links, merge datasets...
535 def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=(
'K',)):
538 @param converter an instance of CrossLinkDataBaseKeywordsConverter
539 @param data_base an instance of CrossLinkDataBase to build the new database on
540 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
541 cross-link consistency. If not given consistency will not be checked
542 @param linkable_aa a tuple containing one-letter amino acids which are linkable by the cross-linker;
543 only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
544 and if a fasta_seq is given
547 if data_base
is None:
550 self.data_base=data_base
552 _CrossLinkDataBaseStandardKeys.__init__(self)
553 if converter
is not None:
554 self.cldbkc = converter
555 self.list_parser=self.cldbkc.rplp
556 self.converter = converter.get_converter()
560 self.list_parser=
None
561 self.converter =
None
564 self.def_aa_tuple = linkable_aa
565 self.fasta_seq = fasta_seq
572 Update the whole dataset after changes
574 self.update_cross_link_unique_sub_index()
575 self.update_cross_link_redundancy()
576 self.update_residues_links_number()
581 sorted_ids=sorted(self.data_base.keys())
583 for xl
in self.data_base[k]:
586 def xlid_iterator(self):
587 sorted_ids=sorted(self.data_base.keys())
588 for xlid
in sorted_ids:
591 def __getitem__(self,xlid):
592 return self.data_base[xlid]
595 return len([xl
for xl
in self])
600 def set_name(self,name):
602 for k
in self.data_base:
603 new_data_base[k+
"."+name]=self.data_base[k]
604 self.data_base=new_data_base
608 def get_number_of_xlid(self):
609 return len(self.data_base)
614 if FixedFormatParser is not specified, the file is comma-separated-values
615 @param file_name a txt file to be parsed
616 @param converter an instance of CrossLinkDataBaseKeywordsConverter
617 @param FixedFormatParser a parser for a fixed format
619 if not FixedFormatParser:
620 xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
622 if converter
is not None:
623 self.cldbkc = converter
624 self.list_parser=self.cldbkc.rplp
625 self.converter = converter.get_converter()
628 if not self.list_parser:
632 for nxl,xl
in enumerate(xl_list):
635 if k
in self.converter:
636 new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
639 if self.unique_id_key
in self.cldbkc.get_setup_keys():
640 if new_xl[self.unique_id_key]
not in new_xl_dict:
641 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
643 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
645 if str(nxl)
not in new_xl_dict:
646 new_xl_dict[str(nxl)]=[new_xl]
648 new_xl_dict[str(nxl)].append(new_xl)
653 for nxl,entry
in enumerate(xl_list):
657 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
658 raise Error(
"CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
660 if k
in self.converter:
661 new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
665 residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
668 for n,p
in enumerate(residue_pair_list):
675 new_xl[k]=new_dict[k]
676 new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
678 new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
680 if len(chain_pair_list)==len(residue_pair_list):
681 new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
683 new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
686 new_xl[self.link_type_key]=
"CROSSLINK"
688 new_xl[self.link_type_key]=
"MONOLINK"
690 if self.unique_id_key
in self.cldbkc.get_setup_keys():
691 if new_xl[self.unique_id_key]
not in new_xl_dict:
692 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
694 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
696 if str(nxl)
not in new_xl_dict:
697 new_xl[self.unique_id_key]=str(nxl+1)
698 new_xl_dict[str(nxl)]=[new_xl]
700 new_xl[self.unique_id_key]=str(nxl+1)
701 new_xl_dict[str(nxl)].append(new_xl)
706 if FixedFormatParser is defined
710 f=open(file_name,
"r")
713 xl=FixedFormatParser.get_data(line)
715 xl[self.unique_id_key]=str(nxl+1)
716 new_xl_dict[str(nxl)]=[xl]
720 self.data_base=new_xl_dict
722 l = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
723 self.dataset = ihm.dataset.CXMSDataset(l)
726 def update_cross_link_unique_sub_index(self):
727 for k
in self.data_base:
728 for n,xl
in enumerate(self.data_base[k]):
729 xl[self.ambiguity_key]=len(self.data_base[k])
730 xl[self.unique_sub_index_key]=n+1
731 xl[self.unique_sub_id_key]=k+
"."+str(n+1)
733 def update_cross_link_redundancy(self):
734 redundancy_data_base={}
736 pra=_ProteinsResiduesArray(xl)
737 if pra
not in redundancy_data_base:
738 redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
739 redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
741 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
742 redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
744 pra=_ProteinsResiduesArray(xl)
745 xl[self.redundancy_key]=len(redundancy_data_base[pra])
746 xl[self.redundancy_list_key]=redundancy_data_base[pra]
748 def update_residues_links_number(self):
751 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
752 if (p1,r1)
not in residue_links:
753 residue_links[(p1,r1)]=set([(p2,r2)])
755 residue_links[(p1,r1)].add((p2,r2))
756 if (p2,r2)
not in residue_links:
757 residue_links[(p2,r2)]=set([(p1,r1)])
759 residue_links[(p2,r2)].add((p1,r1))
762 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
763 xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
764 xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
767 """This function checks the consistency of the dataset with the amino acid sequence"""
768 if self.cldbkc
and self.fasta_seq:
769 cnt_matched, cnt_matched_file = 0, 0
773 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
774 b_matched_file =
False
775 if self.residue1_amino_acid_key
in xl:
777 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
778 b_matched = self._match_xlinks(p1, r1, aa_from_file)
779 b_matched_file = b_matched
782 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
784 matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
786 if self.residue2_amino_acid_key
in xl:
787 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
788 b_matched = self._match_xlinks(p2, r2, aa_from_file)
789 b_matched_file = b_matched
791 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
793 matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
794 if b_matched: cnt_matched += 1
795 if b_matched_file: cnt_matched_file += 1
797 percentage_matched = round(100*cnt_matched/len(self),1)
798 percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
800 if matched
or non_matched: print(
"check_cross_link_consistency: Out of %d cross-links "
801 "%d were matched to the fasta sequence (%f %%).\n "
802 "%d were matched by using the cross-link file (%f %%)."%
803 (len(self),cnt_matched,percentage_matched,cnt_matched_file,
804 percentage_matched_file) )
805 if non_matched: print(
"check_cross_link_consistency: Warning: Non matched xlinks:",
806 [(prot_name, sorted(list(non_matched[prot_name])))
for prot_name
in non_matched])
807 return matched,non_matched
809 def _match_xlinks(self, prot_name, res_index, aa_tuple):
812 amino_dict = IMP.pmi.alphabets.amino_acid
814 for amino_acid
in aa_tuple:
815 if len(amino_acid) == 3:
816 amino_acid = amino_dict.get_one_letter_code_from_residue_type(
818 if prot_name
in self.fasta_seq.sequences:
819 seq = self.fasta_seq.sequences[prot_name]
824 if res_index == 0
or res_index == 1:
826 if res_index < len(seq):
827 if amino_acid == seq[res_index]:
833 def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
836 matched[prot].add(res)
838 matched[prot] = set([res])
840 if prot
in non_matched:
841 non_matched[prot].add(res)
843 non_matched[prot] = set([res])
844 return matched, non_matched
847 def get_cross_link_string(self,xl):
849 for k
in self.ordered_key_list:
851 string+=str(k)+
":"+str(xl[k])+
"|"
856 if k
not in self.ordered_key_list:
857 string+=str(k)+
":"+str(xl[k])+
"|"
861 def get_short_cross_link_string(self,xl):
864 list_of_keys=[self.data_set_name_key,
865 self.unique_sub_id_key,
873 for k
in list_of_keys:
875 string+=str(xl[k])+
"|"
881 def filter(self,FilterOperator):
883 for id
in self.data_base.keys():
884 for xl
in self.data_base[id]:
885 if FilterOperator.evaluate(xl):
886 if id
not in new_xl_dict:
889 new_xl_dict[id].append(xl)
892 cdb.dataset = self.dataset
896 '''Get all cross-links with score greater than an input value'''
898 return self.filter(FilterOperator)
900 def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
902 This function merges two cross-link datasets so that if two conflicting cross-links have the same
903 cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
904 with different SubIDs
910 This function append one cross-link dataset to another. Unique ids will be renamed to avoid
913 name1=self.get_name()
914 name2=CrossLinkDataBase2.get_name()
917 name2=id(CrossLinkDataBase2)
919 CrossLinkDataBase2.set_name(name2)
923 for k
in self.data_base:
924 new_data_base[k]=self.data_base[k]
925 for k
in CrossLinkDataBase2.data_base:
926 new_data_base[k]=CrossLinkDataBase2.data_base[k]
927 self.data_base=new_data_base
932 This function changes the value for a given key in the database
933 For instance one can change the name of a protein
934 @param key: the key in the database that must be changed
935 @param new_value: the new value of the key
936 @param FilterOperator: optional FilterOperator to change the value to
937 a subset of the database
939 example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
943 if FilterOperator
is not None:
944 if FilterOperator.evaluate(xl):
952 this function returns the list of values for a given key in the database
953 alphanumerically sorted
958 return sorted(list(values))
962 This function offset the residue indexes of a given protein by a specified value
963 @param protein_name: the protein name that need to be changed
964 @param offset: the offset value
968 if xl[self.protein1_key] == protein_name:
969 xl[self.residue1_key]=xl[self.residue1_key]+offset
970 if xl[self.protein2_key] == protein_name:
971 xl[self.residue2_key]=xl[self.residue2_key]+offset
976 This function creates a new keyword for the whole database and set the values from
977 and existing keyword (optional), otherwise the values are set to None
978 @param keyword the new keyword name:
979 @param values_from_keyword the keyword from which we are copying the values:
982 if values_from_keyword
is not None:
983 xl[keyword] = xl[values_from_keyword]
990 This function renames all proteins contained in the input dictionary
991 from the old names (keys) to the new name (values)
992 @param old_to_new_names_dictionary dictionary for converting old to new names
993 @param protein_to_rename specify whether to rename both or protein1 or protein2 only
996 for old_name
in old_to_new_names_dictionary:
997 new_name=old_to_new_names_dictionary[old_name]
998 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
1000 self.
set_value(self.protein1_key,new_name,fo2)
1001 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
1003 self.
set_value(self.protein2_key,new_name,fo2)
1007 This function creates as many classes as in the input (number_of_classes: integer)
1008 and partition cross-links according to their identification scores. Classes are defined in the psi key.
1011 if self.id_score_key
is not None:
1014 raise ValueError(
'The cross-link database does not contain score values')
1015 minscore=min(scores)
1016 maxscore=max(scores)
1017 scoreclasses=numpy.linspace(minscore, maxscore, number_of_classes+1)
1018 if self.psi_key
is None:
1021 score=xl[self.id_score_key]
1022 for n,classmin
in enumerate(scoreclasses[0:-1]):
1023 if score>=classmin
and score<=scoreclasses[n+1]:
1024 xl[self.psi_key]=str(
"CLASS_"+str(n))
1027 def clone_protein(self,protein_name,new_protein_name):
1029 for id
in self.data_base.keys():
1031 for xl
in self.data_base[id]:
1032 new_data_base.append(xl)
1033 if xl[self.protein1_key]==protein_name
and xl[self.protein2_key]!=protein_name:
1035 new_xl[self.protein1_key]=new_protein_name
1036 new_data_base.append(new_xl)
1037 elif xl[self.protein1_key]!=protein_name
and xl[self.protein2_key]==protein_name:
1039 new_xl[self.protein2_key]=new_protein_name
1040 new_data_base.append(new_xl)
1041 elif xl[self.protein1_key]==protein_name
and xl[self.protein2_key]==protein_name:
1043 new_xl[self.protein1_key]=new_protein_name
1044 new_data_base.append(new_xl)
1046 new_xl[self.protein2_key]=new_protein_name
1047 new_data_base.append(new_xl)
1049 new_xl[self.protein1_key]=new_protein_name
1050 new_xl[self.protein2_key]=new_protein_name
1051 new_data_base.append(new_xl)
1052 self.data_base[id]=new_data_base
1057 This function remove cross-links applied to the same residue
1058 (ie, same chain name and residue number)
1061 for id
in self.data_base.keys():
1063 for xl
in self.data_base[id]:
1064 if xl[self.protein1_key]==xl[self.protein2_key]
and xl[self.residue1_key]==xl[self.residue2_key]:
1067 new_data_base.append(xl)
1068 self.data_base[id]=new_data_base
1074 this method returns a CrossLinkDataBase class containing
1075 a random subsample of the original cross-link database.
1076 @param percentage float between 0 and 1, is the percentage of
1077 of spectra taken from the original list
1080 if percentage > 1.0
or percentage < 0.0:
1081 raise ValueError(
'the percentage of random cross-link spectra should be between 0 and 1')
1082 nspectra=self.get_number_of_xlid()
1083 nrandom_spectra=int(nspectra*percentage)
1084 random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1086 for k
in random_keys:
1087 new_data_base[k]=self.data_base[k]
1092 sorted_ids=sorted(self.data_base.keys())
1094 for id
in sorted_ids:
1096 for xl
in self.data_base[id]:
1097 for k
in self.ordered_key_list:
1099 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1104 if k
not in self.ordered_key_list:
1105 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1106 outstr+=
"-------------\n"
1110 def plot(self,filename,**kwargs):
1112 matplotlib.use(
'Agg')
1113 import matplotlib.pyplot
as plt
1114 import matplotlib.colors
1118 if kwargs[
"type"] ==
"scatter":
1119 cmap=plt.get_cmap(
"rainbow")
1120 norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1123 if "colorkey" in kwargs:
1124 colorkey=kwargs[
"colorkey"]
1125 if "sizekey" in kwargs:
1126 sizekey=kwargs[
"sizekey"]
1127 if "logyscale" in kwargs:
1128 logyscale=kwargs[
"logyscale"]
1136 xs.append(float(xl[xkey]))
1138 ys.append(math.log10(float(xl[ykey])))
1140 ys.append(float(xl[ykey]))
1141 colors.append(float(xl[colorkey]))
1143 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1147 for color
in colors:
1148 cindex=(color-min(colors))/(max(colors)-min(colors))
1149 cs.append(cmap(cindex))
1152 ax = fig.add_subplot(111)
1153 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker=
"o")
1156 plt.savefig(filename)
1160 if kwargs[
"type"] ==
"residue_links":
1164 molecule=kwargs[
"molecule"]
1166 length=len(molecule.sequence)
1167 molecule=molecule.get_name()
1170 sequences_object=kwargs[
"sequences_object"]
1171 sequence=sequences_object.sequences[molecule]
1172 length=len(sequence)
1174 histogram=[0]*length
1176 if xl[self.protein1_key]==molecule:
1177 histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1178 if xl[self.protein2_key]==molecule:
1179 histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1180 rects = plt.bar(range(1,length+1), histogram)
1187 plt.savefig(filename)
1193 if kwargs[
"type"] ==
"histogram":
1194 valuekey=kwargs[
"valuekey"]
1195 reference_xline=kwargs[
"reference_xline"]
1201 values_list.append(float(xl[valuekey]))
1203 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1206 filename, [values_list], valuename=valuename, bins=bins,
1207 colors=
None, format=
"pdf",
1208 reference_xline=
None, yplotrange=
None,
1209 xplotrange=
None,normalized=
True,
1212 def dump(self,json_filename):
1214 with open(json_filename,
'w')
as fp:
1215 json.dump(self.data_base, fp, sort_keys=
True, indent=2, default=set_json_default)
1217 def load(self,json_filename):
1219 with open(json_filename,
'r') as fp:
1220 self.data_base = json.load(fp)
1224 if sys.version_info[0] < 3:
1226 for k,v
in xl.iteritems():
1227 if type(k)
is unicode: k=str(k)
1228 if type(v)
is unicode: v=str(v)
1231 def save_csv(self,filename):
1237 sorted_group_ids=sorted(self.data_base.keys())
1238 for group
in sorted_group_ids:
1240 for xl
in self.data_base[group]:
1242 sorted_ids=sorted(xl.keys())
1243 values=[xl[k]
for k
in sorted_ids]
1244 group_block.append(values)
1248 with open(filename,
'w')
as fp:
1249 a = csv.writer(fp, delimiter=
',')
1250 a.writerow(sorted_ids)
1255 Returns the number of non redundant cross-link sites
1259 pra=_ProteinsResiduesArray(xl)
1260 prai=pra.get_inverted()
1263 return len(list(praset))
1266 """This class allows to compute and plot the distance between datasets"""
1269 """Input a dictionary where keys are cldb names and values are cldbs"""
1270 import scipy.spatial.distance
1271 self.dbs=cldb_dictionary
1272 self.keylist=list(self.dbs.keys())
1273 self.distances=list()
1276 for i,key1
in enumerate(self.keylist):
1277 for key2
in self.keylist[i+1:]:
1278 distance=self.get_distance(key1,key2)
1279 self.distances.append(distance)
1281 self.distances=scipy.spatial.distance.squareform(self.distances)
1284 return 1.0-self.
jaccard_index(self.dbs[key1],self.dbs[key2])
1287 """Similarity index between two datasets
1288 https://en.wikipedia.org/wiki/Jaccard_index"""
1292 for xl1
in CrossLinkDataBase1:
1293 a1f=_ProteinsResiduesArray(xl1)
1294 a1b=a1f.get_inverted()
1297 for xl2
in CrossLinkDataBase2:
1298 a2f=_ProteinsResiduesArray(xl2)
1299 a2b=a2f.get_inverted()
1302 return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1304 def plot_matrix(self,figurename="clustermatrix.pdf"):
1305 import matplotlib
as mpl
1308 import matplotlib.pylab
as pl
1309 from scipy.cluster
import hierarchy
as hrc
1311 raw_distance_matrix=self.distances
1317 ax = fig.add_subplot(1,1,1)
1318 dendrogram = hrc.dendrogram(
1319 hrc.linkage(raw_distance_matrix),
1322 leaves_order = dendrogram[
'leaves']
1323 ax.set_xlabel(
'Dataset')
1324 ax.set_ylabel(
'Jaccard Distance')
1326 pl.savefig(
"dendrogram."+figurename, dpi=300)
1332 ax = fig.add_subplot(1,1,1)
1334 raw_distance_matrix[leaves_order,
1337 interpolation=
'nearest')
1338 cb = fig.colorbar(cax)
1339 cb.set_label(
'Jaccard Distance')
1340 ax.set_xlabel(
'Dataset')
1341 ax.set_ylabel(
'Dataset')
1342 ax.set_xticks(range(len(labels)))
1343 ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation=
'vertical')
1344 ax.set_yticks(range(len(labels)))
1345 ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation=
'horizontal')
1347 pl.savefig(
"matrix."+figurename, dpi=300)
1353 This class maps a CrossLinkDataBase on a given structure
1354 and save an rmf file with color-coded cross-links
1358 def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1359 self.CrossLinkDataBase=CrossLinkDataBase
1362 self.prots=rmf_or_stat_handler
1363 self.distances=defaultdict(list)
1367 print(
"computing distances fro all cross-links and all structures")
1368 for i
in self.prots[::10]:
1369 self.compute_distances()
1370 for key,xl
in enumerate(self.CrossLinkDataBase):
1371 array=_ProteinsResiduesArray(xl)
1372 self.array_to_id[array]=key
1373 self.id_to_array[key]=array
1374 if xl[
"MinAmbiguousDistance"] !=
'None':
1375 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1377 def compute_distances(self):
1380 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1381 for group
in sorted_group_ids:
1384 for xl
in self.CrossLinkDataBase.data_base[group]:
1386 sorted_ids=sorted(xl.keys())
1387 data.append(sorted_ids+[
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1388 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1390 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1399 values=[xl[k]
for k
in sorted_ids]
1400 values += [group, mdist]
1401 except KeyError
as e:
1402 print(
"MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1403 group_dists.append(mdist)
1405 xl[
"Distance"]=mdist
1411 for xl
in self.CrossLinkDataBase.data_base[group]:
1412 xl[
"MinAmbiguousDistance"]=min(group_dists)
1414 def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1415 '''more robust and slower version of above'''
1417 selpart_1=sel.get_selected_particles()
1418 if len(selpart_1)==0:
1419 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1422 selpart_2=sel.get_selected_particles()
1423 if len(selpart_2)==0:
1424 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1427 for p1
in selpart_1:
1428 for p2
in selpart_2:
1429 if p1 == p2
and r1 == r2:
continue
1449 results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1450 if len(results)==0:
return None
1451 results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1452 return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1454 def save_rmf_snapshot(self,filename,color_id=None):
1455 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1458 for group
in sorted_group_ids:
1459 group_dists_particles=[]
1460 for xl
in self.CrossLinkDataBase.data_base[group]:
1461 xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1462 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1464 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1466 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1468 if color_id
is not None:
1469 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1471 group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1472 if group_dists_particles:
1473 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1474 color_scores.append(mincolor_score)
1475 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1480 linear.set_slope(1.0)
1482 rslin =
IMP.RestraintSet(self.prots.get_model(),
'linear_dummy_restraints')
1484 offset=min(color_scores)
1485 maxvalue=max(color_scores)
1486 for pair
in list_of_pairs:
1488 pr.set_name(pair[2])
1490 factor=(pair[3]-offset)/(maxvalue-offset)
1495 rslin.add_restraint(pr)
1498 rh = RMF.create_rmf_file(filename)
1505 def boxplot_crosslink_distances(self,filename):
1507 keys=[self.id_to_array[k]
for k
in self.distances.keys()]
1508 medians=[numpy.median(self.distances[self.array_to_id[k]])
for k
in keys]
1509 dists=[self.distances[self.array_to_id[k]]
for k
in keys]
1510 distances_sorted_by_median=[x
for _,x
in sorted(zip(medians,dists))]
1511 keys_sorted_by_median=[x
for _,x
in sorted(zip(medians,keys))]
1513 distances_sorted_by_median,
1514 range(len(distances_sorted_by_median)),
1515 xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1517 def get_crosslink_violations(self,threshold):
1518 violations=defaultdict(list)
1519 for k
in self.distances:
1521 viols=self.distances[k]
1522 violations[self.id_to_array[k]]=viols
1528 This class generates a CrossLinkDataBase from a given structure
1530 def __init__(self, system, residue_types_1=["K"],
1531 residue_types_2=[
"K"], reactivity_range=[0,1], kt=1.0):
1536 cldbkc.set_protein1_key(
"Protein1")
1537 cldbkc.set_protein2_key(
"Protein2")
1538 cldbkc.set_residue1_key(
"Residue1")
1539 cldbkc.set_residue2_key(
"Residue2")
1542 self.model=self.system.model
1543 self.residue_types_1=residue_types_1
1544 self.residue_types_2=residue_types_2
1546 self.indexes_dict1={}
1547 self.indexes_dict2={}
1548 self.protein_residue_dict={}
1549 self.reactivity_dictionary={}
1550 self.euclidean_interacting_pairs=
None
1551 self.xwalk_interacting_pairs=
None
1554 for state
in self.system.get_states():
1555 for moleculename,molecules
in state.get_molecules().items():
1556 for molecule
in molecules:
1558 seq=molecule.sequence
1559 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1565 rexp=numpy.random.exponential(0.00000001)
1566 prob=1.0-math.exp(-rexp)
1567 self.reactivity_dictionary[(molecule,r)]=prob
1569 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1570 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1573 ps=s.get_selected_particles()
1576 self.indexes_dict1[index]=(molecule,r)
1577 self.protein_residue_dict[(molecule,r)]=index
1580 ps=s.get_selected_particles()
1583 self.indexes_dict2[index]=(molecule,r)
1584 self.protein_residue_dict[(molecule,r)]=index
1587 def get_all_possible_pairs(self):
1588 n=float(len(self.protein_residue_dict.keys()))
1589 return n*(n-1.0)/2.0
1591 def get_all_feasible_pairs(self,distance=21):
1593 particle_index_pairs=[]
1595 for a,b
in itertools.combinations(self.protein_residue_dict.keys(),2):
1598 index1=self.protein_residue_dict[a]
1599 index2=self.protein_residue_dict[b]
1601 if particle_distance <= distance:
1602 particle_index_pairs.append((index1,index2))
1603 new_xl[self.cldb.protein1_key]=a[0].get_name()
1604 new_xl[self.cldb.protein2_key]=b[0].get_name()
1605 new_xl[
"molecule_object1"]=a[0]
1606 new_xl[
"molecule_object2"]=b[0]
1607 new_xl[self.cldb.residue1_key]=a[1]
1608 new_xl[self.cldb.residue2_key]=b[1]
1609 self.cldb.data_base[str(nxl)]=[new_xl]
1617 def get_data_base(self,total_number_of_spectra,
1618 ambiguity_probability=0.1,
1621 max_delta_distance=10.0,
1622 xwalk_bin_path=
None,
1623 confidence_false=0.75,
1624 confidence_true=0.75):
1626 from random
import random,uniform
1630 self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1631 self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1632 self.cldb.data_base[str(number_of_spectra)]=[]
1633 self.sites_weighted=
None
1635 while number_of_spectra<total_number_of_spectra:
1636 if random() > ambiguity_probability
and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1638 number_of_spectra+=1
1639 self.cldb.data_base[str(number_of_spectra)]=[]
1641 if random() > noise:
1643 pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1646 pra,dist=self.get_random_residue_pair(
None,
None,
None)
1650 new_xl[self.cldb.protein1_key]=pra[0].get_name()
1651 new_xl[self.cldb.protein2_key]=pra[1].get_name()
1652 new_xl[
"molecule_object1"]=pra[0]
1653 new_xl[
"molecule_object2"]=pra[1]
1654 new_xl[self.cldb.residue1_key]=pra[2]
1655 new_xl[self.cldb.residue2_key]=pra[3]
1656 new_xl[
"Noisy"]=noisy
1658 new_xl[
"Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1659 new_xl[
"Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1660 r1=new_xl[
"Reactivity_Residue1"]
1661 r2=new_xl[
"Reactivity_Residue2"]
1667 new_xl[
"Score"]=np.random.beta(1.0,self.beta_false)
1670 new_xl[
"Score"]=1.0-np.random.beta(1.0,self.beta_true)
1671 new_xl[
"TargetDistance"]=dist
1672 new_xl[
"NoiseProbability"]=noise
1673 new_xl[
"AmbiguityProbability"]=ambiguity_probability
1675 (p1,p2)=
IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1676 self.protein_residue_dict[(pra[1],pra[3])]])
1681 new_xl[
"InterRigidBody"] =
False
1686 new_xl[
"InterRigidBody"] =
True
1688 new_xl[
"InterRigidBody"] =
None
1690 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1695 def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1698 from random
import choice,uniform
1699 if distance
is None:
1702 (protein1,residue1)=choice(self.protein_residue_dict.keys())
1703 (protein2,residue2)=choice(self.protein_residue_dict.keys())
1704 index1=self.protein_residue_dict[(protein1,residue1)]
1705 index2=self.protein_residue_dict[(protein2,residue2)]
1707 if (protein1,residue1) != (protein2,residue2):
1711 if not xwalk_bin_path:
1713 gcpf.set_distance(distance+max_delta_distance)
1717 if not self.sites_weighted:
1718 self.sites_weighted=[]
1719 for key
in self.reactivity_dictionary:
1720 r=self.reactivity_dictionary[key]
1721 self.sites_weighted.append((key,r))
1723 first_site=self.weighted_choice(self.sites_weighted)
1725 if not self.euclidean_interacting_pairs:
1726 self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1727 self.indexes_dict1.keys(),
1728 self.indexes_dict2.keys())
1730 first_site_pairs = [pair
for pair
in self.euclidean_interacting_pairs
1731 if self.indexes_dict1[pair[0]] == first_site
or
1732 self.indexes_dict2[pair[1]] == first_site]
1733 if len(first_site_pairs)==0:
continue
1735 second_sites_weighted=[]
1736 for pair
in first_site_pairs:
1737 if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1738 if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1739 r=self.reactivity_dictionary[second_site]
1740 second_sites_weighted.append((second_site,r))
1741 second_site=self.weighted_choice(second_sites_weighted)
1743 interacting_pairs_weighted=[]
1744 for pair in self.euclidean_interacting_pairs:
1745 r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1746 r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1747 #combined reactivity 1-exp(-k12*Delta t),
1750 r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1751 interacting_pairs_weighted.append((pair,r12))
1752 #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1753 #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1754 #interacting_pairs_weighted.append((pair,weight1*weight2))
1757 pair=self.weighted_choice(interacting_pairs_weighted)
1758 protein1,residue1=self.indexes_dict1[pair[0]]
1759 protein2,residue2=self.indexes_dict2[pair[1]]
1760 particle_pair=IMP.get_particles(self.model,pair)
1761 particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1762 if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1764 elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1765 #allow some flexibility
1766 prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1767 if uniform(0.0,1.0)<prob: break
1769 protein1,residue1=first_site
1770 protein2,residue2=second_site
1771 print(
"CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1772 "First site",first_site,self.reactivity_dictionary[first_site],
1773 "Second site",second_site,self.reactivity_dictionary[second_site])
1774 particle_pair=
IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1777 if particle_distance<distance
and (protein1,residue1) != (protein2,residue2):
1779 elif particle_distance>=distance
and (protein1,residue1) != (protein2,residue2)
and max_delta_distance:
1783 if particle_distance-distance < max_delta_distance:
break
1788 if not self.xwalk_interacting_pairs:
1789 self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1790 interacting_pairs_weighted=[]
1792 for pair
in self.xwalk_interacting_pairs:
1797 weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1798 weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1799 interacting_pairs_weighted.append((pair,weight1*weight2))
1801 pair=self.weighted_choice(interacting_pairs_weighted)
1806 particle_distance=float(pair[4])
1810 return ((protein1,protein2,residue1,residue2)),particle_distance
1812 def get_xwalk_distances(self,xwalk_bin_path,distance):
1816 o.init_pdb(
"xwalk.pdb",self.representation.prot)
1817 o.write_pdb(
"xwalk.pdb")
1818 namechainiddict=o.dictchain[
"xwalk.pdb"]
1821 for key
in namechainiddict: chainiddict[namechainiddict[key]]=key
1822 xwalkout=os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+
' -bb').read()
1824 output_list_of_distance=[]
1825 for line
in xwalkout.split(
"\n")[0:-2]:
1829 distance=float(tokens[6])
1831 ss=second.split(
"-")
1834 protein1=chainiddict[chainid1]
1835 protein2=chainiddict[chainid2]
1838 output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1839 return output_list_of_distance
1842 def weighted_choice(self,choices):
1845 total = sum(w
for c, w
in choices)
1846 r = random.uniform(0, total)
1848 for c, w
in choices:
1852 assert False,
"Shouldn't get here"
1854 def save_rmf_snapshot(self,filename,color_id=None):
1857 if color_id
is None:
1858 color_id=
"Reactivity"
1860 sorted_group_ids=sorted(self.cldb.data_base.keys())
1863 for group
in sorted_group_ids:
1865 group_dists_particles=[]
1866 for xl
in self.cldb.data_base[group]:
1867 xllabel=self.cldb.get_short_cross_link_string(xl)
1868 (c1,c2,r1,r2)=(xl[
"molecule_object1"],xl[
"molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1870 index1=self.protein_residue_dict[(c1,r1)]
1871 index2=self.protein_residue_dict[(c2,r2)]
1873 mdist=xl[
"TargetDistance"]
1875 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1877 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1878 if group_dists_particles:
1879 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1880 color_scores.append(mincolor_score)
1881 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1888 linear.set_slope(1.0)
1892 offset=min(color_scores)
1893 maxvalue=max(color_scores)
1894 for pair
in list_of_pairs:
1896 pr.set_name(pair[2])
1897 factor=(pair[3]-offset)/(maxvalue-offset)
1900 rslin.add_restraint(pr)
1903 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
def get_number_of_unique_crosslinked_sites
Returns the number of non redundant cross-link sites.
def filter_score
Get all cross-links with score greater than an input value.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
def create_new_keyword
This function creates a new keyword for the whole database and set the values from and existing keywo...
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
def merge
This function merges two cross-link datasets so that if two conflicting cross-links have the same cro...
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded cross-...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def set_value
This function changes the value for a given key in the database For instance one can change the name ...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
def filter_out_same_residues
This function remove cross-links applied to the same residue (ie, same chain name and residue number)...
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def offset_residue_index
This function offset the residue indexes of a given protein by a specified value. ...
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
def jackknife
this method returns a CrossLinkDataBase class containing a random subsample of the original cross-lin...
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def check_cross_link_consistency
This function checks the consistency of the dataset with the amino acid sequence. ...
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
def get_values
this function returns the list of values for a given key in the database alphanumerically sorted ...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
def append_database
This function append one cross-link dataset to another.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
Mapping between FASTA one-letter codes and residue types.
A class to handle different styles of site pairs parsers.
def classify_crosslinks_by_score
This function creates as many classes as in the input (number_of_classes: integer) and partition cros...
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
this class handles a cross-link dataset and do filtering operations, adding cross-links, merge datasets...
Support for the RMF file format for storing hierarchical molecular data and markup.
def rename_proteins
This function renames all proteins contained in the input dictionary from the old names (keys) to the...
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.
def create_set_from_file
if FixedFormatParser is not specified, the file is comma-separated-values