1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
7 from __future__
import print_function
22 from collections
import defaultdict
26 def set_json_default(obj):
27 if isinstance(obj, set):
34 if sys.version_info[0] >= 3:
35 def _handle_string_input(inp):
36 if not isinstance(inp, str):
37 raise TypeError(
"expecting a string")
40 def _handle_string_input(inp):
41 if not isinstance(inp, (str, unicode)):
42 raise TypeError(
"expecting a string or unicode")
44 if isinstance(inp, unicode):
49 class _CrossLinkDataBaseStandardKeys(object):
51 This class setup all the standard keys needed to
52 identify the crosslink features from the data sets
56 self.protein1_key=
"Protein1"
57 self.type[self.protein1_key]=str
58 self.protein2_key=
"Protein2"
59 self.type[self.protein2_key]=str
60 self.residue1_key=
"Residue1"
61 self.type[self.residue1_key]=int
62 self.residue2_key=
"Residue2"
63 self.type[self.residue2_key]=int
64 self.residue1_amino_acid_key=
"Residue1AminoAcid"
65 self.type[self.residue1_amino_acid_key]=str
66 self.residue2_amino_acid_key=
"Residue2AminoAcid"
67 self.type[self.residue2_amino_acid_key]=str
68 self.residue1_moiety_key=
"Residue1Moiety"
69 self.type[self.residue1_moiety_key]=str
70 self.residue2_moiety_key=
"Residue2Moiety"
71 self.type[self.residue2_moiety_key]=str
72 self.site_pairs_key=
"SitePairs"
73 self.type[self.site_pairs_key]=str
74 self.unique_id_key=
"XLUniqueID"
75 self.type[self.unique_id_key]=str
76 self.unique_sub_index_key=
"XLUniqueSubIndex"
77 self.type[self.unique_sub_index_key]=int
78 self.unique_sub_id_key=
"XLUniqueSubID"
79 self.type[self.unique_sub_id_key]=str
80 self.data_set_name_key=
"DataSetName"
81 self.type[self.data_set_name_key]=str
82 self.cross_linker_chemical_key=
"CrossLinkerChemical"
83 self.type[self.cross_linker_chemical_key]=str
84 self.id_score_key=
"IDScore"
85 self.type[self.id_score_key]=float
87 self.type[self.fdr_key]=float
88 self.quantitation_key=
"Quantitation"
89 self.type[self.quantitation_key]=float
90 self.redundancy_key=
"Redundancy"
91 self.type[self.redundancy_key]=int
92 self.redundancy_list_key=
"RedundancyList"
93 self.type[self.redundancy_key]=list
94 self.ambiguity_key=
"Ambiguity"
95 self.type[self.ambiguity_key]=int
96 self.residue1_links_number_key=
"Residue1LinksNumber"
97 self.type[self.residue1_links_number_key]=int
98 self.residue2_links_number_key=
"Residue2LinksNumber"
99 self.type[self.residue2_links_number_key]=int
100 self.type[self.ambiguity_key]=int
101 self.state_key=
"State"
102 self.type[self.state_key]=int
103 self.sigma1_key=
"Sigma1"
104 self.type[self.sigma1_key]=str
105 self.sigma2_key=
"Sigma2"
106 self.type[self.sigma2_key]=str
108 self.type[self.psi_key]=str
109 self.distance_key=
"Distance"
110 self.type[self.distance_key]=float
111 self.min_ambiguous_distance_key=
"MinAmbiguousDistance"
112 self.type[self.distance_key]=float
114 self.link_type_key=
"LinkType"
115 self.type[self.link_type_key]=str
117 self.ordered_key_list =[self.data_set_name_key,
119 self.unique_sub_index_key,
120 self.unique_sub_id_key,
125 self.residue1_amino_acid_key,
126 self.residue2_amino_acid_key,
127 self.residue1_moiety_key,
128 self.residue2_moiety_key,
129 self.cross_linker_chemical_key,
132 self.quantitation_key,
134 self.redundancy_list_key,
140 self.min_ambiguous_distance_key,
144 class _ProteinsResiduesArray(tuple):
146 This class is inherits from tuple, and it is a shorthand for a cross-link
147 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
148 residue1 and residue2.
151 def __new__(self,input_data):
153 @input_data can be a dict or a tuple
155 self.cldbsk=_CrossLinkDataBaseStandardKeys()
156 if type(input_data)
is dict:
158 p1=input_data[self.cldbsk.protein1_key]
160 p2=input_data[self.cldbsk.protein2_key]
163 r1=input_data[self.cldbsk.residue1_key]
165 r2=input_data[self.cldbsk.residue2_key]
172 elif type(input_data)
is tuple:
173 if len(input_data)>4
or len(input_data)==3
or len(input_data)==1:
174 raise TypeError(
"_ProteinsResiduesArray: must have only 4 elements")
175 if len(input_data)==4:
176 p1 = _handle_string_input(input_data[0])
177 p2 = _handle_string_input(input_data[1])
180 if (
not (type(r1)
is int))
and (
not (r1
is None)):
181 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
182 if (
not (type(r2)
is int))
and (
not (r1
is None)):
183 raise TypeError(
"_ProteinsResiduesArray: residue2 must be a integer")
185 if len(input_data) == 2:
186 p1 = _handle_string_input(input_data[0])
188 if type(r1)
is not int:
189 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
192 raise TypeError(
"_ProteinsResiduesArray: input must be a dict or tuple")
193 return tuple.__new__(_ProteinsResiduesArray, t)
195 def get_inverted(self):
197 Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
199 return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
202 outstr=self.cldbsk.protein1_key+
" "+str(self[0])
203 outstr+=
" "+self.cldbsk.protein2_key+
" "+str(self[1])
204 outstr+=
" "+self.cldbsk.residue1_key+
" "+str(self[2])
205 outstr+=
" "+self.cldbsk.residue2_key+
" "+str(self[3])
209 outstr=str(self[0])+
"."+str(self[2])+
"-"+str(self[1])+
"."+str(self[3])
214 This class allows to create filter functions that can be passed to the CrossLinkDataBase
217 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
219 where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
221 A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
225 and it is used to filter the database
228 def __init__(self, argument1, operator, argument2):
230 (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
231 or (FilterOperator1,operator.or|and...,FilterOperator2)
233 if isinstance(argument1, FilterOperator):
234 self.operations = [argument1, operator, argument2]
237 self.values = (argument1, operator, argument2)
239 def __or__(self, FilterOperator2):
242 def __and__(self, FilterOperator2):
245 def __invert__(self):
248 def evaluate(self, xl_item):
250 if len(self.operations) == 0:
251 keyword, operator, value = self.values
252 return operator(xl_item[keyword], value)
253 FilterOperator1, op, FilterOperator2 = self.operations
255 if FilterOperator2
is None:
256 return op(FilterOperator1.evaluate(xl_item))
258 return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
261 def filter_factory(xl_):
263 class FilterOperator(object):
267 def __new__(self,key,value,oper=operator.eq):
268 return oper(self.xl[key],value)
270 return FilterOperator
275 This class is needed to convert the keywords from a generic database
281 @param list_parser an instance of ResiduePairListParser, if any is needed
284 self.backward_converter={}
285 _CrossLinkDataBaseStandardKeys.__init__(self)
286 self.rplp = list_parser
287 if self.rplp
is None:
289 self.compulsory_keys=set([self.protein1_key,
294 self.compulsory_keys=self.rplp.get_compulsory_keys()
295 self.setup_keys=set()
299 Is a function that check whether necessary keys are setup
302 if self.compulsory_keys & setup_keys != self.compulsory_keys:
303 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
307 Returns the keys that have been setup so far
309 return self.backward_converter.keys()
313 This sets up the standard compulsory keys as defined by
314 _CrossLinkDataBaseStandardKeys
316 for ck
in self.compulsory_keys:
317 self.converter[ck]=ck
318 self.backward_converter[ck]=ck
320 def set_unique_id_key(self,origin_key):
321 self.converter[origin_key]=self.unique_id_key
322 self.backward_converter[self.unique_id_key]=origin_key
324 def set_protein1_key(self,origin_key):
325 self.converter[origin_key]=self.protein1_key
326 self.backward_converter[self.protein1_key]=origin_key
328 def set_protein2_key(self,origin_key):
329 self.converter[origin_key]=self.protein2_key
330 self.backward_converter[self.protein2_key]=origin_key
332 def set_residue1_key(self,origin_key):
333 self.converter[origin_key]=self.residue1_key
334 self.backward_converter[self.residue1_key]=origin_key
336 def set_residue2_key(self,origin_key):
337 self.converter[origin_key]=self.residue2_key
338 self.backward_converter[self.residue2_key]=origin_key
340 def set_residue1_amino_acid_key(self, origin_key):
341 self.converter[origin_key] = self.residue1_amino_acid_key
342 self.backward_converter[self.residue1_amino_acid_key] = origin_key
344 def set_residue2_amino_acid_key(self, origin_key):
345 self.converter[origin_key] = self.residue2_amino_acid_key
346 self.backward_converter[self.residue2_amino_acid_key] = origin_key
348 def set_residue1_moiety_key(self, origin_key):
349 self.converter[origin_key] = self.residue1_moiety_key
350 self.backward_converter[self.residue1_moiety_key] = origin_key
352 def set_residue2_moiety_key(self, origin_key):
353 self.converter[origin_key] = self.residue2_moiety_key
354 self.backward_converter[self.residue2_moiety_key] = origin_key
356 def set_site_pairs_key(self,origin_key):
357 self.converter[origin_key]=self.site_pairs_key
358 self.backward_converter[self.site_pairs_key]=origin_key
360 def set_id_score_key(self,origin_key):
361 self.converter[origin_key]=self.id_score_key
362 self.backward_converter[self.id_score_key]=origin_key
364 def set_fdr_key(self,origin_key):
365 self.converter[origin_key]=self.fdr_key
366 self.backward_converter[self.fdr_key]=origin_key
368 def set_quantitation_key(self,origin_key):
369 self.converter[origin_key]=self.quantitation_key
370 self.backward_converter[self.quantitation_key]=origin_key
372 def set_psi_key(self,origin_key):
373 self.converter[origin_key]=self.psi_key
374 self.backward_converter[self.psi_key]=origin_key
376 def set_link_type_key(self,link_type_key):
377 self.converter[link_type_key]=self.link_type_key
378 self.backward_converter[self.link_type_key]=link_type_key
382 Returns the dictionary that convert the old keywords to the new ones
385 return self.converter
389 Returns the dictionary that convert the new keywords to the old ones
392 return self.backward_converter
396 A class to handle different styles of site pairs parsers.
398 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
399 [Y3-;K4-] for dead-ends
400 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
401 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
402 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
407 def __init__(self,style):
409 _CrossLinkDataBaseStandardKeys.__init__(self)
410 if style
is "MSSTUDIO":
412 self.compulsory_keys= set([self.protein1_key,
414 self.site_pairs_key])
415 elif style
is "XTRACT" or style
is "QUANTITATION":
417 self.compulsory_keys= set([self.site_pairs_key])
418 elif style
is "LAN_HUANG":
420 self.compulsory_keys= set([self.site_pairs_key])
422 raise Error(
"ResiduePairListParser: unknown list parser style")
424 def get_compulsory_keys(self):
425 return self.compulsory_keys
429 This function returns a list of cross-linked residues and the corresponding list of
430 cross-linked chains. The latter list can be empty, if the style doesn't have the
431 corresponding information.
433 if self.style ==
"MSSTUDIO":
434 input_string=input_string.replace(
"[",
"")
435 input_string=input_string.replace(
"]",
"")
436 input_string_pairs=input_string.split(
";")
437 residue_pair_indexes=[]
438 chain_pair_indexes=[]
439 for s
in input_string_pairs:
440 m1=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
441 m2=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
444 residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
445 residue_pair_indexes.append((residue_index_1,residue_index_2))
448 residue_type_1,residue_index_1=m2.group(1,2)
450 return residue_pair_indexes,chain_pair_indexes
451 if self.style
is "XTRACT" or self.style
is "QUANTITATION":
452 if ":x:" in input_string:
454 input_strings=input_string.split(
":x:")
455 first_peptides=input_strings[0].split(
":|:")
456 second_peptides=input_strings[1].split(
":|:")
457 first_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in first_peptides]
458 second_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in second_peptides]
459 residue_pair_indexes=[]
460 chain_pair_indexes=[]
461 for fpi
in first_peptides_indentifiers:
462 for spi
in second_peptides_indentifiers:
467 residue_pair_indexes.append((residue1,residue2))
468 chain_pair_indexes.append((chain1,chain2))
469 return residue_pair_indexes, chain_pair_indexes
472 first_peptides = input_string.split(
":|:")
473 first_peptides_indentifiers = [(f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
476 for fpi
in first_peptides_indentifiers:
479 residue_indexes.append((residue1,))
480 chain_indexes.append((chain1,))
481 return residue_indexes, chain_indexes
482 if self.style
is "LAN_HUANG":
483 input_strings=input_string.split(
"-")
484 chain1,first_series=input_strings[0].split(
":")
485 chain2,second_series=input_strings[1].split(
":")
487 first_residues=first_series.replace(
";",
"|").split(
"|")
488 second_residues=second_series.replace(
";",
"|").split(
"|")
489 residue_pair_indexes=[]
490 chain_pair_indexes=[]
491 for fpi
in first_residues:
492 for spi
in second_residues:
493 residue1=self.re.sub(
"[^0-9]",
"", fpi)
494 residue2=self.re.sub(
"[^0-9]",
"", spi)
495 residue_pair_indexes.append((residue1,residue2))
496 chain_pair_indexes.append((chain1,chain2))
497 return residue_pair_indexes, chain_pair_indexes
503 A class to handle different XL format with fixed format
504 currently support ProXL
506 def __init__(self,format):
508 _CrossLinkDataBaseStandardKeys.__init__(self)
509 if format
is "PROXL":
512 raise Error(
"FixedFormatParser: unknown list format name")
515 def get_data(self,input_string):
516 if self.format
is "PROXL":
517 tokens=input_string.split(
"\t")
519 if tokens[0]==
"SEARCH ID(S)":
522 xl[self.protein1_key]=tokens[2]
523 xl[self.protein2_key]=tokens[4]
524 xl[self.residue1_key]=int(tokens[3])
525 xl[self.residue2_key]=int(tokens[5])
530 this class handles a cross-link dataset and do filtering
531 operations, adding cross-links, merge datasets...
534 def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=(
'K',)):
537 @param converter an instance of CrossLinkDataBaseKeywordsConverter
538 @param data_base an instance of CrossLinkDataBase to build the new database on
539 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
540 crosslink consistency. If not given consistency will not be checked
541 @param linkable_aa a tuple containing one-letter amino acids which are linkable by the crosslinker;
542 only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
543 and if a fasta_seq is given
546 if data_base
is None:
549 self.data_base=data_base
551 _CrossLinkDataBaseStandardKeys.__init__(self)
552 if converter
is not None:
553 self.cldbkc = converter
554 self.list_parser=self.cldbkc.rplp
555 self.converter = converter.get_converter()
559 self.list_parser=
None
560 self.converter =
None
563 self.def_aa_tuple = linkable_aa
564 self.fasta_seq = fasta_seq
570 Update the whole dataset after changes
572 self.update_cross_link_unique_sub_index()
573 self.update_cross_link_redundancy()
574 self.update_residues_links_number()
579 sorted_ids=sorted(self.data_base.keys())
581 for xl
in self.data_base[k]:
584 def xlid_iterator(self):
585 sorted_ids=sorted(self.data_base.keys())
586 for xlid
in sorted_ids:
589 def __getitem__(self,xlid):
590 return self.data_base[xlid]
593 return len([xl
for xl
in self])
598 def set_name(self,name):
600 for k
in self.data_base:
601 new_data_base[k+
"."+name]=self.data_base[k]
602 self.data_base=new_data_base
606 def get_number_of_xlid(self):
607 return len(self.data_base)
612 if FixedFormatParser is not specified, the file is comma-separated-values
613 @param file_name a txt file to be parsed
614 @param converter an instance of CrossLinkDataBaseKeywordsConverter
615 @param FixedFormatParser a parser for a fixed format
617 if not FixedFormatParser:
618 xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
620 if converter
is not None:
621 self.cldbkc = converter
622 self.list_parser=self.cldbkc.rplp
623 self.converter = converter.get_converter()
626 if not self.list_parser:
630 for nxl,xl
in enumerate(xl_list):
633 if k
in self.converter:
634 new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
637 if self.unique_id_key
in self.cldbkc.get_setup_keys():
638 if new_xl[self.unique_id_key]
not in new_xl_dict:
639 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
641 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
643 if str(nxl)
not in new_xl_dict:
644 new_xl_dict[str(nxl)]=[new_xl]
646 new_xl_dict[str(nxl)].append(new_xl)
651 for nxl,entry
in enumerate(xl_list):
655 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
656 raise Error(
"CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
658 if k
in self.converter:
659 new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
663 residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
666 for n,p
in enumerate(residue_pair_list):
673 new_xl[k]=new_dict[k]
674 new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
676 new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
678 if len(chain_pair_list)==len(residue_pair_list):
679 new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
681 new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
684 new_xl[self.link_type_key]=
"CROSSLINK"
686 new_xl[self.link_type_key]=
"MONOLINK"
688 if self.unique_id_key
in self.cldbkc.get_setup_keys():
689 if new_xl[self.unique_id_key]
not in new_xl_dict:
690 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
692 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
694 if str(nxl)
not in new_xl_dict:
695 new_xl[self.unique_id_key]=str(nxl+1)
696 new_xl_dict[str(nxl)]=[new_xl]
698 new_xl[self.unique_id_key]=str(nxl+1)
699 new_xl_dict[str(nxl)].append(new_xl)
704 if FixedFormatParser is defined
708 f=open(file_name,
"r")
711 xl=FixedFormatParser.get_data(line)
713 xl[self.unique_id_key]=str(nxl+1)
714 new_xl_dict[str(nxl)]=[xl]
718 self.data_base=new_xl_dict
720 l = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
721 self.dataset = ihm.dataset.CXMSDataset(l)
724 def update_cross_link_unique_sub_index(self):
725 for k
in self.data_base:
726 for n,xl
in enumerate(self.data_base[k]):
727 xl[self.ambiguity_key]=len(self.data_base[k])
728 xl[self.unique_sub_index_key]=n+1
729 xl[self.unique_sub_id_key]=k+
"."+str(n+1)
731 def update_cross_link_redundancy(self):
732 redundancy_data_base={}
734 pra=_ProteinsResiduesArray(xl)
735 if pra
not in redundancy_data_base:
736 redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
737 redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
739 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
740 redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
742 pra=_ProteinsResiduesArray(xl)
743 xl[self.redundancy_key]=len(redundancy_data_base[pra])
744 xl[self.redundancy_list_key]=redundancy_data_base[pra]
746 def update_residues_links_number(self):
749 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
750 if (p1,r1)
not in residue_links:
751 residue_links[(p1,r1)]=set([(p2,r2)])
753 residue_links[(p1,r1)].add((p2,r2))
754 if (p2,r2)
not in residue_links:
755 residue_links[(p2,r2)]=set([(p1,r1)])
757 residue_links[(p2,r2)].add((p1,r1))
760 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
761 xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
762 xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
765 """This function checks the consistency of the dataset with the amino acid sequence"""
766 if self.cldbkc
and self.fasta_seq:
767 cnt_matched, cnt_matched_file = 0, 0
771 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
772 b_matched_file =
False
773 if self.residue1_amino_acid_key
in xl:
775 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
776 b_matched = self._match_xlinks(p1, r1, aa_from_file)
777 b_matched_file = b_matched
780 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
782 matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
784 if self.residue2_amino_acid_key
in xl:
785 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
786 b_matched = self._match_xlinks(p2, r2, aa_from_file)
787 b_matched_file = b_matched
789 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
791 matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
792 if b_matched: cnt_matched += 1
793 if b_matched_file: cnt_matched_file += 1
795 percentage_matched = round(100*cnt_matched/len(self),1)
796 percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
798 if matched
or non_matched: print(
"check_cross_link_consistency: Out of %d crosslinks "
799 "%d were matched to the fasta sequence (%f %%).\n "
800 "%d were matched by using the crosslink file (%f %%)."%
801 (len(self),cnt_matched,percentage_matched,cnt_matched_file,
802 percentage_matched_file) )
803 if non_matched: print(
"check_cross_link_consistency: Warning: Non matched xlinks:",
804 [(prot_name, sorted(list(non_matched[prot_name])))
for prot_name
in non_matched])
805 return matched,non_matched
807 def _match_xlinks(self, prot_name, res_index, aa_tuple):
812 for amino_acid
in aa_tuple:
813 if len(amino_acid) == 3:
814 amino_acid = amino_dict[amino_acid.upper()]
815 if prot_name
in self.fasta_seq.sequences:
816 seq = self.fasta_seq.sequences[prot_name]
821 if res_index == 0
or res_index == 1:
823 if res_index < len(seq):
824 if amino_acid == seq[res_index]:
830 def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
833 matched[prot].add(res)
835 matched[prot] = set([res])
837 if prot
in non_matched:
838 non_matched[prot].add(res)
840 non_matched[prot] = set([res])
841 return matched, non_matched
844 def get_cross_link_string(self,xl):
846 for k
in self.ordered_key_list:
848 string+=str(k)+
":"+str(xl[k])+
"|"
853 if k
not in self.ordered_key_list:
854 string+=str(k)+
":"+str(xl[k])+
"|"
858 def get_short_cross_link_string(self,xl):
861 list_of_keys=[self.data_set_name_key,
862 self.unique_sub_id_key,
870 for k
in list_of_keys:
872 string+=str(xl[k])+
"|"
878 def filter(self,FilterOperator):
880 for id
in self.data_base.keys():
881 for xl
in self.data_base[id]:
882 if FilterOperator.evaluate(xl):
883 if id
not in new_xl_dict:
886 new_xl_dict[id].append(xl)
889 cdb.dataset = self.dataset
893 '''Get all crosslinks with score greater than an input value'''
895 return self.filter(FilterOperator)
897 def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
899 This function merges two cross-link datasets so that if two conflicting crosslinks have the same
900 cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
901 with different SubIDs
907 This function append one cross-link dataset to another. Unique ids will be renamed to avoid
910 name1=self.get_name()
911 name2=CrossLinkDataBase2.get_name()
914 name2=id(CrossLinkDataBase2)
916 CrossLinkDataBase2.set_name(name2)
920 for k
in self.data_base:
921 new_data_base[k]=self.data_base[k]
922 for k
in CrossLinkDataBase2.data_base:
923 new_data_base[k]=CrossLinkDataBase2.data_base[k]
924 self.data_base=new_data_base
929 This function changes the value for a given key in the database
930 For instance one can change the name of a protein
931 @param key: the key in the database that must be changed
932 @param new_value: the new value of the key
933 @param FilterOperator: optional FilterOperator to change the value to
934 a subset of the database
936 example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
940 if FilterOperator
is not None:
941 if FilterOperator.evaluate(xl):
949 this function returns the list of values for a given key in the database
950 alphanumerically sorted
955 return sorted(list(values))
959 This function offset the residue indexes of a given protein by a specified value
960 @param protein_name: the protein name that need to be changed
961 @param offset: the offset value
965 if xl[self.protein1_key] == protein_name:
966 xl[self.residue1_key]=xl[self.residue1_key]+offset
967 if xl[self.protein2_key] == protein_name:
968 xl[self.residue2_key]=xl[self.residue2_key]+offset
973 This function creates a new keyword for the whole database and set the values from
974 and existing keyword (optional), otherwise the values are set to None
975 @param keyword the new keyword name:
976 @param values_from_keyword the keyword from which we are copying the values:
979 if values_from_keyword
is not None:
980 xl[keyword] = xl[values_from_keyword]
987 This function renames all proteins contained in the input dictionary
988 from the old names (keys) to the new name (values)
989 @param old_to_new_names_dictionary dictionary for converting old to new names
990 @param protein_to_rename specify whether to rename both or protein1 or protein2 only
993 for old_name
in old_to_new_names_dictionary:
994 new_name=old_to_new_names_dictionary[old_name]
995 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
997 self.
set_value(self.protein1_key,new_name,fo2)
998 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
1000 self.
set_value(self.protein2_key,new_name,fo2)
1004 This function creates as many classes as in the input (number_of_classes: integer)
1005 and partition crosslinks according to their identification scores. Classes are defined in the psi key.
1008 if self.id_score_key
is not None:
1011 raise ValueError(
'The crosslink database does not contain score values')
1012 minscore=min(scores)
1013 maxscore=max(scores)
1014 scoreclasses=numpy.linspace(minscore, maxscore, number_of_classes+1)
1015 if self.psi_key
is None:
1018 score=xl[self.id_score_key]
1019 for n,classmin
in enumerate(scoreclasses[0:-1]):
1020 if score>=classmin
and score<=scoreclasses[n+1]:
1021 xl[self.psi_key]=str(
"CLASS_"+str(n))
1024 def clone_protein(self,protein_name,new_protein_name):
1026 for id
in self.data_base.keys():
1028 for xl
in self.data_base[id]:
1029 new_data_base.append(xl)
1030 if xl[self.protein1_key]==protein_name
and xl[self.protein2_key]!=protein_name:
1032 new_xl[self.protein1_key]=new_protein_name
1033 new_data_base.append(new_xl)
1034 elif xl[self.protein1_key]!=protein_name
and xl[self.protein2_key]==protein_name:
1036 new_xl[self.protein2_key]=new_protein_name
1037 new_data_base.append(new_xl)
1038 elif xl[self.protein1_key]==protein_name
and xl[self.protein2_key]==protein_name:
1040 new_xl[self.protein1_key]=new_protein_name
1041 new_data_base.append(new_xl)
1043 new_xl[self.protein2_key]=new_protein_name
1044 new_data_base.append(new_xl)
1046 new_xl[self.protein1_key]=new_protein_name
1047 new_xl[self.protein2_key]=new_protein_name
1048 new_data_base.append(new_xl)
1049 self.data_base[id]=new_data_base
1054 This function remove cross-links applied to the same residue
1055 (ie, same chain name and residue number)
1058 for id
in self.data_base.keys():
1060 for xl
in self.data_base[id]:
1061 if xl[self.protein1_key]==xl[self.protein2_key]
and xl[self.residue1_key]==xl[self.residue2_key]:
1064 new_data_base.append(xl)
1065 self.data_base[id]=new_data_base
1071 this method returns a CrossLinkDataBase class containing
1072 a random subsample of the original cross-link database.
1073 @param percentage float between 0 and 1, is the percentage of
1074 of spectra taken from the original list
1077 if percentage > 1.0
or percentage < 0.0:
1078 raise ValueError(
'the percentage of random cross-link spectra should be between 0 and 1')
1079 nspectra=self.get_number_of_xlid()
1080 nrandom_spectra=int(nspectra*percentage)
1081 random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1083 for k
in random_keys:
1084 new_data_base[k]=self.data_base[k]
1089 sorted_ids=sorted(self.data_base.keys())
1091 for id
in sorted_ids:
1093 for xl
in self.data_base[id]:
1094 for k
in self.ordered_key_list:
1096 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1101 if k
not in self.ordered_key_list:
1102 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1103 outstr+=
"-------------\n"
1107 def plot(self,filename,**kwargs):
1109 matplotlib.use(
'Agg')
1110 import matplotlib.pyplot
as plt
1111 import matplotlib.colors
1115 if kwargs[
"type"] ==
"scatter":
1116 cmap=plt.get_cmap(
"rainbow")
1117 norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1120 if "colorkey" in kwargs:
1121 colorkey=kwargs[
"colorkey"]
1122 if "sizekey" in kwargs:
1123 sizekey=kwargs[
"sizekey"]
1124 if "logyscale" in kwargs:
1125 logyscale=kwargs[
"logyscale"]
1133 xs.append(float(xl[xkey]))
1135 ys.append(math.log10(float(xl[ykey])))
1137 ys.append(float(xl[ykey]))
1138 colors.append(float(xl[colorkey]))
1140 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1144 for color
in colors:
1145 cindex=(color-min(colors))/(max(colors)-min(colors))
1146 cs.append(cmap(cindex))
1149 ax = fig.add_subplot(111)
1150 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker=
"o")
1153 plt.savefig(filename)
1157 if kwargs[
"type"] ==
"residue_links":
1161 molecule=kwargs[
"molecule"]
1163 length=len(molecule.sequence)
1164 molecule=molecule.get_name()
1167 sequences_object=kwargs[
"sequences_object"]
1168 sequence=sequences_object.sequences[molecule]
1169 length=len(sequence)
1171 histogram=[0]*length
1173 if xl[self.protein1_key]==molecule:
1174 histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1175 if xl[self.protein2_key]==molecule:
1176 histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1177 rects = plt.bar(range(1,length+1), histogram)
1184 plt.savefig(filename)
1190 if kwargs[
"type"] ==
"histogram":
1191 valuekey=kwargs[
"valuekey"]
1192 reference_xline=kwargs[
"reference_xline"]
1198 values_list.append(float(xl[valuekey]))
1200 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1203 filename, [values_list], valuename=valuename, bins=bins,
1204 colors=
None, format=
"pdf",
1205 reference_xline=
None, yplotrange=
None,
1206 xplotrange=
None,normalized=
True,
1209 def dump(self,json_filename):
1211 with open(json_filename,
'w')
as fp:
1212 json.dump(self.data_base, fp, sort_keys=
True, indent=2, default=set_json_default)
1214 def load(self,json_filename):
1216 with open(json_filename,
'r') as fp:
1217 self.data_base = json.load(fp)
1221 if sys.version_info[0] < 3:
1223 for k,v
in xl.iteritems():
1224 if type(k)
is unicode: k=str(k)
1225 if type(v)
is unicode: v=str(v)
1228 def save_csv(self,filename):
1234 sorted_group_ids=sorted(self.data_base.keys())
1235 for group
in sorted_group_ids:
1237 for xl
in self.data_base[group]:
1239 sorted_ids=sorted(xl.keys())
1240 values=[xl[k]
for k
in sorted_ids]
1241 group_block.append(values)
1245 with open(filename,
'w')
as fp:
1246 a = csv.writer(fp, delimiter=
',')
1247 a.writerow(sorted_ids)
1252 Returns the number of non redundant crosslink sites
1256 pra=_ProteinsResiduesArray(xl)
1257 prai=pra.get_inverted()
1260 return len(list(praset))
1263 """This class allows to compute and plot the distance between datasets"""
1266 """Input a dictionary where keys are cldb names and values are cldbs"""
1267 import scipy.spatial.distance
1268 self.dbs=cldb_dictionary
1269 self.keylist=list(self.dbs.keys())
1270 self.distances=list()
1273 for i,key1
in enumerate(self.keylist):
1274 for key2
in self.keylist[i+1:]:
1275 distance=self.get_distance(key1,key2)
1276 self.distances.append(distance)
1278 self.distances=scipy.spatial.distance.squareform(self.distances)
1281 return 1.0-self.
jaccard_index(self.dbs[key1],self.dbs[key2])
1284 """Similarity index between two datasets
1285 https://en.wikipedia.org/wiki/Jaccard_index"""
1289 for xl1
in CrossLinkDataBase1:
1290 a1f=_ProteinsResiduesArray(xl1)
1291 a1b=a1f.get_inverted()
1294 for xl2
in CrossLinkDataBase2:
1295 a2f=_ProteinsResiduesArray(xl2)
1296 a2b=a2f.get_inverted()
1299 return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1301 def plot_matrix(self,figurename="clustermatrix.pdf"):
1302 import matplotlib
as mpl
1305 import matplotlib.pylab
as pl
1306 from scipy.cluster
import hierarchy
as hrc
1308 raw_distance_matrix=self.distances
1314 ax = fig.add_subplot(1,1,1)
1315 dendrogram = hrc.dendrogram(
1316 hrc.linkage(raw_distance_matrix),
1319 leaves_order = dendrogram[
'leaves']
1320 ax.set_xlabel(
'Dataset')
1321 ax.set_ylabel(
'Jaccard Distance')
1323 pl.savefig(
"dendrogram."+figurename, dpi=300)
1329 ax = fig.add_subplot(1,1,1)
1331 raw_distance_matrix[leaves_order,
1334 interpolation=
'nearest')
1335 cb = fig.colorbar(cax)
1336 cb.set_label(
'Jaccard Distance')
1337 ax.set_xlabel(
'Dataset')
1338 ax.set_ylabel(
'Dataset')
1339 ax.set_xticks(range(len(labels)))
1340 ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation=
'vertical')
1341 ax.set_yticks(range(len(labels)))
1342 ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation=
'horizontal')
1344 pl.savefig(
"matrix."+figurename, dpi=300)
1350 This class maps a CrossLinkDataBase on a given structure
1351 and save an rmf file with color-coded crosslinks
1355 def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1356 self.CrossLinkDataBase=CrossLinkDataBase
1359 self.prots=rmf_or_stat_handler
1360 self.distances=defaultdict(list)
1364 print(
"computing distances fro all crosslinks and all structures")
1365 for i
in self.prots[::10]:
1366 self.compute_distances()
1367 for key,xl
in enumerate(self.CrossLinkDataBase):
1368 array=_ProteinsResiduesArray(xl)
1369 self.array_to_id[array]=key
1370 self.id_to_array[key]=array
1371 if xl[
"MinAmbiguousDistance"]
is not 'None':
1372 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1374 def compute_distances(self):
1377 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1378 for group
in sorted_group_ids:
1381 for xl
in self.CrossLinkDataBase.data_base[group]:
1383 sorted_ids=sorted(xl.keys())
1384 data.append(sorted_ids+[
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1385 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1387 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1396 values=[xl[k]
for k
in sorted_ids]
1397 values += [group, mdist]
1398 except KeyError
as e:
1399 print(
"MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1400 group_dists.append(mdist)
1402 xl[
"Distance"]=mdist
1408 for xl
in self.CrossLinkDataBase.data_base[group]:
1409 xl[
"MinAmbiguousDistance"]=min(group_dists)
1411 def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1412 '''more robust and slower version of above'''
1414 selpart_1=sel.get_selected_particles()
1415 if len(selpart_1)==0:
1416 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1419 selpart_2=sel.get_selected_particles()
1420 if len(selpart_2)==0:
1421 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1424 for p1
in selpart_1:
1425 for p2
in selpart_2:
1426 if p1 == p2
and r1 == r2:
continue
1446 results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1447 if len(results)==0:
return None
1448 results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1449 return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1451 def save_rmf_snapshot(self,filename,color_id=None):
1452 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1455 for group
in sorted_group_ids:
1456 group_dists_particles=[]
1457 for xl
in self.CrossLinkDataBase.data_base[group]:
1458 xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1459 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1461 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1463 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1465 if color_id
is not None:
1466 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1468 group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1469 if group_dists_particles:
1470 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1471 color_scores.append(mincolor_score)
1472 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1477 linear.set_slope(1.0)
1479 rslin =
IMP.RestraintSet(self.prots.get_model(),
'linear_dummy_restraints')
1481 offset=min(color_scores)
1482 maxvalue=max(color_scores)
1483 for pair
in list_of_pairs:
1485 pr.set_name(pair[2])
1487 factor=(pair[3]-offset)/(maxvalue-offset)
1492 rslin.add_restraint(pr)
1495 rh = RMF.create_rmf_file(filename)
1502 def boxplot_crosslink_distances(self,filename):
1504 keys=[self.id_to_array[k]
for k
in self.distances.keys()]
1505 medians=[numpy.median(self.distances[self.array_to_id[k]])
for k
in keys]
1506 dists=[self.distances[self.array_to_id[k]]
for k
in keys]
1507 distances_sorted_by_median=[x
for _,x
in sorted(zip(medians,dists))]
1508 keys_sorted_by_median=[x
for _,x
in sorted(zip(medians,keys))]
1510 distances_sorted_by_median,
1511 range(len(distances_sorted_by_median)),
1512 xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1514 def get_crosslink_violations(self,threshold):
1515 violations=defaultdict(list)
1516 for k
in self.distances:
1518 viols=self.distances[k]
1519 violations[self.id_to_array[k]]=viols
1525 This class generates a CrossLinkDataBase from a given structure
1527 def __init__(self, system, residue_types_1=["K"],
1528 residue_types_2=[
"K"], reactivity_range=[0,1], kt=1.0):
1533 cldbkc.set_protein1_key(
"Protein1")
1534 cldbkc.set_protein2_key(
"Protein2")
1535 cldbkc.set_residue1_key(
"Residue1")
1536 cldbkc.set_residue2_key(
"Residue2")
1539 self.model=self.system.model
1540 self.residue_types_1=residue_types_1
1541 self.residue_types_2=residue_types_2
1543 self.indexes_dict1={}
1544 self.indexes_dict2={}
1545 self.protein_residue_dict={}
1546 self.reactivity_dictionary={}
1547 self.euclidean_interacting_pairs=
None
1548 self.xwalk_interacting_pairs=
None
1551 for state
in self.system.get_states():
1552 for moleculename,molecules
in state.get_molecules().items():
1553 for molecule
in molecules:
1555 seq=molecule.sequence
1556 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1562 rexp=numpy.random.exponential(0.00000001)
1563 prob=1.0-math.exp(-rexp)
1564 self.reactivity_dictionary[(molecule,r)]=prob
1566 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1567 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1570 ps=s.get_selected_particles()
1573 self.indexes_dict1[index]=(molecule,r)
1574 self.protein_residue_dict[(molecule,r)]=index
1577 ps=s.get_selected_particles()
1580 self.indexes_dict2[index]=(molecule,r)
1581 self.protein_residue_dict[(molecule,r)]=index
1584 def get_all_possible_pairs(self):
1585 n=float(len(self.protein_residue_dict.keys()))
1586 return n*(n-1.0)/2.0
1588 def get_all_feasible_pairs(self,distance=21):
1590 particle_index_pairs=[]
1592 for a,b
in itertools.combinations(self.protein_residue_dict.keys(),2):
1595 index1=self.protein_residue_dict[a]
1596 index2=self.protein_residue_dict[b]
1598 if particle_distance <= distance:
1599 particle_index_pairs.append((index1,index2))
1600 new_xl[self.cldb.protein1_key]=a[0].get_name()
1601 new_xl[self.cldb.protein2_key]=b[0].get_name()
1602 new_xl[
"molecule_object1"]=a[0]
1603 new_xl[
"molecule_object2"]=b[0]
1604 new_xl[self.cldb.residue1_key]=a[1]
1605 new_xl[self.cldb.residue2_key]=b[1]
1606 self.cldb.data_base[str(nxl)]=[new_xl]
1614 def get_data_base(self,total_number_of_spectra,
1615 ambiguity_probability=0.1,
1618 max_delta_distance=10.0,
1619 xwalk_bin_path=
None,
1620 confidence_false=0.75,
1621 confidence_true=0.75):
1623 from random
import random,uniform
1627 self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1628 self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1629 self.cldb.data_base[str(number_of_spectra)]=[]
1630 self.sites_weighted=
None
1632 while number_of_spectra<total_number_of_spectra:
1633 if random() > ambiguity_probability
and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1635 number_of_spectra+=1
1636 self.cldb.data_base[str(number_of_spectra)]=[]
1638 if random() > noise:
1640 pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1643 pra,dist=self.get_random_residue_pair(
None,
None,
None)
1647 new_xl[self.cldb.protein1_key]=pra[0].get_name()
1648 new_xl[self.cldb.protein2_key]=pra[1].get_name()
1649 new_xl[
"molecule_object1"]=pra[0]
1650 new_xl[
"molecule_object2"]=pra[1]
1651 new_xl[self.cldb.residue1_key]=pra[2]
1652 new_xl[self.cldb.residue2_key]=pra[3]
1653 new_xl[
"Noisy"]=noisy
1655 new_xl[
"Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1656 new_xl[
"Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1657 r1=new_xl[
"Reactivity_Residue1"]
1658 r2=new_xl[
"Reactivity_Residue2"]
1664 new_xl[
"Score"]=np.random.beta(1.0,self.beta_false)
1667 new_xl[
"Score"]=1.0-np.random.beta(1.0,self.beta_true)
1668 new_xl[
"TargetDistance"]=dist
1669 new_xl[
"NoiseProbability"]=noise
1670 new_xl[
"AmbiguityProbability"]=ambiguity_probability
1672 (p1,p2)=
IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1673 self.protein_residue_dict[(pra[1],pra[3])]])
1678 new_xl[
"InterRigidBody"] =
False
1683 new_xl[
"InterRigidBody"] =
True
1685 new_xl[
"InterRigidBody"] =
None
1687 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1692 def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1695 from random
import choice,uniform
1696 if distance
is None:
1699 (protein1,residue1)=choice(self.protein_residue_dict.keys())
1700 (protein2,residue2)=choice(self.protein_residue_dict.keys())
1701 index1=self.protein_residue_dict[(protein1,residue1)]
1702 index2=self.protein_residue_dict[(protein2,residue2)]
1704 if (protein1,residue1) != (protein2,residue2):
1708 if not xwalk_bin_path:
1710 gcpf.set_distance(distance+max_delta_distance)
1714 if not self.sites_weighted:
1715 self.sites_weighted=[]
1716 for key
in self.reactivity_dictionary:
1717 r=self.reactivity_dictionary[key]
1718 self.sites_weighted.append((key,r))
1720 first_site=self.weighted_choice(self.sites_weighted)
1722 if not self.euclidean_interacting_pairs:
1723 self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1724 self.indexes_dict1.keys(),
1725 self.indexes_dict2.keys())
1727 first_site_pairs = [pair
for pair
in self.euclidean_interacting_pairs
1728 if self.indexes_dict1[pair[0]] == first_site
or
1729 self.indexes_dict2[pair[1]] == first_site]
1730 if len(first_site_pairs)==0:
continue
1732 second_sites_weighted=[]
1733 for pair
in first_site_pairs:
1734 if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1735 if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1736 r=self.reactivity_dictionary[second_site]
1737 second_sites_weighted.append((second_site,r))
1738 second_site=self.weighted_choice(second_sites_weighted)
1740 interacting_pairs_weighted=[]
1741 for pair in self.euclidean_interacting_pairs:
1742 r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1743 r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1744 #combined reactivity 1-exp(-k12*Delta t),
1747 r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1748 interacting_pairs_weighted.append((pair,r12))
1749 #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1750 #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1751 #interacting_pairs_weighted.append((pair,weight1*weight2))
1754 pair=self.weighted_choice(interacting_pairs_weighted)
1755 protein1,residue1=self.indexes_dict1[pair[0]]
1756 protein2,residue2=self.indexes_dict2[pair[1]]
1757 particle_pair=IMP.get_particles(self.model,pair)
1758 particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1759 if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1761 elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1762 #allow some flexibility
1763 prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1764 if uniform(0.0,1.0)<prob: break
1766 protein1,residue1=first_site
1767 protein2,residue2=second_site
1768 print(
"CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1769 "First site",first_site,self.reactivity_dictionary[first_site],
1770 "Second site",second_site,self.reactivity_dictionary[second_site])
1771 particle_pair=
IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1774 if particle_distance<distance
and (protein1,residue1) != (protein2,residue2):
1776 elif particle_distance>=distance
and (protein1,residue1) != (protein2,residue2)
and max_delta_distance:
1780 if particle_distance-distance < max_delta_distance:
break
1785 if not self.xwalk_interacting_pairs:
1786 self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1787 interacting_pairs_weighted=[]
1789 for pair
in self.xwalk_interacting_pairs:
1794 weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1795 weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1796 interacting_pairs_weighted.append((pair,weight1*weight2))
1798 pair=self.weighted_choice(interacting_pairs_weighted)
1803 particle_distance=float(pair[4])
1807 return ((protein1,protein2,residue1,residue2)),particle_distance
1809 def get_xwalk_distances(self,xwalk_bin_path,distance):
1813 o.init_pdb(
"xwalk.pdb",self.representation.prot)
1814 o.write_pdb(
"xwalk.pdb")
1815 namechainiddict=o.dictchain[
"xwalk.pdb"]
1818 for key
in namechainiddict: chainiddict[namechainiddict[key]]=key
1819 xwalkout=os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+
' -bb').read()
1821 output_list_of_distance=[]
1822 for line
in xwalkout.split(
"\n")[0:-2]:
1826 distance=float(tokens[6])
1828 ss=second.split(
"-")
1831 protein1=chainiddict[chainid1]
1832 protein2=chainiddict[chainid2]
1835 output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1836 return output_list_of_distance
1839 def weighted_choice(self,choices):
1842 total = sum(w
for c, w
in choices)
1843 r = random.uniform(0, total)
1845 for c, w
in choices:
1849 assert False,
"Shouldn't get here"
1851 def save_rmf_snapshot(self,filename,color_id=None):
1854 if color_id
is None:
1855 color_id=
"Reactivity"
1857 sorted_group_ids=sorted(self.cldb.data_base.keys())
1860 for group
in sorted_group_ids:
1862 group_dists_particles=[]
1863 for xl
in self.cldb.data_base[group]:
1864 xllabel=self.cldb.get_short_cross_link_string(xl)
1865 (c1,c2,r1,r2)=(xl[
"molecule_object1"],xl[
"molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1867 index1=self.protein_residue_dict[(c1,r1)]
1868 index2=self.protein_residue_dict[(c2,r2)]
1870 mdist=xl[
"TargetDistance"]
1872 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1874 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1875 if group_dists_particles:
1876 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1877 color_scores.append(mincolor_score)
1878 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1885 linear.set_slope(1.0)
1889 offset=min(color_scores)
1890 maxvalue=max(color_scores)
1891 for pair
in list_of_pairs:
1893 pr.set_name(pair[2])
1894 factor=(pair[3]-offset)/(maxvalue-offset)
1897 rslin.add_restraint(pr)
1900 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
def get_number_of_unique_crosslinked_sites
Returns the number of non redundant crosslink sites.
def filter_score
Get all crosslinks with score greater than an input value.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
def create_new_keyword
This function creates a new keyword for the whole database and set the values from and existing keywo...
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
def merge
This function merges two cross-link datasets so that if two conflicting crosslinks have the same cros...
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded crossl...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def set_value
This function changes the value for a given key in the database For instance one can change the name ...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
def filter_out_same_residues
This function remove cross-links applied to the same residue (ie, same chain name and residue number)...
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def offset_residue_index
This function offset the residue indexes of a given protein by a specified value. ...
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
def jackknife
this method returns a CrossLinkDataBase class containing a random subsample of the original cross-lin...
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def check_cross_link_consistency
This function checks the consistency of the dataset with the amino acid sequence. ...
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
def get_values
this function returns the list of values for a given key in the database alphanumerically sorted ...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
def append_database
This function append one cross-link dataset to another.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
A class to handle different styles of site pairs parsers.
def classify_crosslinks_by_score
This function creates as many classes as in the input (number_of_classes: integer) and partition cros...
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
this class handles a cross-link dataset and do filtering operations, adding cross-links, merge datasets...
Support for the RMF file format for storing hierarchical molecular data and markup.
def rename_proteins
This function renames all proteins contained in the input dictionary from the old names (keys) to the...
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.
def create_set_from_file
if FixedFormatParser is not specified, the file is comma-separated-values