1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
7 from __future__
import print_function
22 from collections
import defaultdict
25 def set_json_default(obj):
26 if isinstance(obj, set):
33 if sys.version_info[0] >= 3:
34 def _handle_string_input(inp):
35 if not isinstance(inp, str):
36 raise TypeError(
"expecting a string")
39 def _handle_string_input(inp):
40 if not isinstance(inp, (str, unicode)):
41 raise TypeError(
"expecting a string or unicode")
43 if isinstance(inp, unicode):
48 class _CrossLinkDataBaseStandardKeys(object):
50 This class setup all the standard keys needed to
51 identify the crosslink features from the data sets
55 self.protein1_key=
"Protein1"
56 self.type[self.protein1_key]=str
57 self.protein2_key=
"Protein2"
58 self.type[self.protein2_key]=str
59 self.residue1_key=
"Residue1"
60 self.type[self.residue1_key]=int
61 self.residue2_key=
"Residue2"
62 self.type[self.residue2_key]=int
63 self.residue1_amino_acid_key=
"Residue1AminoAcid"
64 self.type[self.residue1_amino_acid_key]=str
65 self.residue2_amino_acid_key=
"Residue2AminoAcid"
66 self.type[self.residue2_amino_acid_key]=str
67 self.residue1_moiety_key=
"Residue1Moiety"
68 self.type[self.residue1_moiety_key]=str
69 self.residue2_moiety_key=
"Residue2Moiety"
70 self.type[self.residue2_moiety_key]=str
71 self.site_pairs_key=
"SitePairs"
72 self.type[self.site_pairs_key]=str
73 self.unique_id_key=
"XLUniqueID"
74 self.type[self.unique_id_key]=str
75 self.unique_sub_index_key=
"XLUniqueSubIndex"
76 self.type[self.unique_sub_index_key]=int
77 self.unique_sub_id_key=
"XLUniqueSubID"
78 self.type[self.unique_sub_id_key]=str
79 self.data_set_name_key=
"DataSetName"
80 self.type[self.data_set_name_key]=str
81 self.cross_linker_chemical_key=
"CrossLinkerChemical"
82 self.type[self.cross_linker_chemical_key]=str
83 self.id_score_key=
"IDScore"
84 self.type[self.id_score_key]=float
86 self.type[self.fdr_key]=float
87 self.quantitation_key=
"Quantitation"
88 self.type[self.quantitation_key]=float
89 self.redundancy_key=
"Redundancy"
90 self.type[self.redundancy_key]=int
91 self.redundancy_list_key=
"RedundancyList"
92 self.type[self.redundancy_key]=list
93 self.ambiguity_key=
"Ambiguity"
94 self.type[self.ambiguity_key]=int
95 self.residue1_links_number_key=
"Residue1LinksNumber"
96 self.type[self.residue1_links_number_key]=int
97 self.residue2_links_number_key=
"Residue2LinksNumber"
98 self.type[self.residue2_links_number_key]=int
99 self.type[self.ambiguity_key]=int
100 self.state_key=
"State"
101 self.type[self.state_key]=int
102 self.sigma1_key=
"Sigma1"
103 self.type[self.sigma1_key]=str
104 self.sigma2_key=
"Sigma2"
105 self.type[self.sigma2_key]=str
107 self.type[self.psi_key]=str
108 self.distance_key=
"Distance"
109 self.type[self.distance_key]=float
110 self.min_ambiguous_distance_key=
"MinAmbiguousDistance"
111 self.type[self.distance_key]=float
113 self.link_type_key=
"LinkType"
114 self.type[self.link_type_key]=str
116 self.ordered_key_list =[self.data_set_name_key,
118 self.unique_sub_index_key,
119 self.unique_sub_id_key,
124 self.residue1_amino_acid_key,
125 self.residue2_amino_acid_key,
126 self.residue1_moiety_key,
127 self.residue2_moiety_key,
128 self.cross_linker_chemical_key,
131 self.quantitation_key,
133 self.redundancy_list_key,
139 self.min_ambiguous_distance_key,
143 class _ProteinsResiduesArray(tuple):
145 This class is inherits from tuple, and it is a shorthand for a cross-link
146 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
147 residue1 and residue2.
150 def __new__(self,input_data):
152 @input_data can be a dict or a tuple
154 self.cldbsk=_CrossLinkDataBaseStandardKeys()
155 if type(input_data)
is dict:
157 p1=input_data[self.cldbsk.protein1_key]
159 p2=input_data[self.cldbsk.protein2_key]
162 r1=input_data[self.cldbsk.residue1_key]
164 r2=input_data[self.cldbsk.residue2_key]
171 elif type(input_data)
is tuple:
172 if len(input_data)>4
or len(input_data)==3
or len(input_data)==1:
173 raise TypeError(
"_ProteinsResiduesArray: must have only 4 elements")
174 if len(input_data)==4:
175 p1 = _handle_string_input(input_data[0])
176 p2 = _handle_string_input(input_data[1])
179 if (
not (type(r1)
is int))
and (
not (r1
is None)):
180 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
181 if (
not (type(r2)
is int))
and (
not (r1
is None)):
182 raise TypeError(
"_ProteinsResiduesArray: residue2 must be a integer")
184 if len(input_data) == 2:
185 p1 = _handle_string_input(input_data[0])
187 if type(r1)
is not int:
188 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
191 raise TypeError(
"_ProteinsResiduesArray: input must be a dict or tuple")
192 return tuple.__new__(_ProteinsResiduesArray, t)
194 def get_inverted(self):
196 Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
198 return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
201 outstr=self.cldbsk.protein1_key+
" "+str(self[0])
202 outstr+=
" "+self.cldbsk.protein2_key+
" "+str(self[1])
203 outstr+=
" "+self.cldbsk.residue1_key+
" "+str(self[2])
204 outstr+=
" "+self.cldbsk.residue2_key+
" "+str(self[3])
208 outstr=str(self[0])+
"."+str(self[2])+
"-"+str(self[1])+
"."+str(self[3])
213 This class allows to create filter functions that can be passed to the CrossLinkDataBase
216 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
218 where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
220 A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
224 and it is used to filter the database
227 def __init__(self, argument1, operator, argument2):
229 (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
230 or (FilterOperator1,operator.or|and...,FilterOperator2)
232 if isinstance(argument1, FilterOperator):
233 self.operations = [argument1, operator, argument2]
236 self.values = (argument1, operator, argument2)
238 def __or__(self, FilterOperator2):
241 def __and__(self, FilterOperator2):
244 def __invert__(self):
247 def evaluate(self, xl_item):
249 if len(self.operations) == 0:
250 keyword, operator, value = self.values
251 return operator(xl_item[keyword], value)
252 FilterOperator1, op, FilterOperator2 = self.operations
254 if FilterOperator2
is None:
255 return op(FilterOperator1.evaluate(xl_item))
257 return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
260 def filter_factory(xl_):
262 class FilterOperator(object):
266 def __new__(self,key,value,oper=operator.eq):
267 return oper(self.xl[key],value)
269 return FilterOperator
274 This class is needed to convert the keywords from a generic database
280 @param list_parser an instance of ResiduePairListParser, if any is needed
283 self.backward_converter={}
284 _CrossLinkDataBaseStandardKeys.__init__(self)
285 self.rplp = list_parser
286 if self.rplp
is None:
288 self.compulsory_keys=set([self.protein1_key,
293 self.compulsory_keys=self.rplp.get_compulsory_keys()
294 self.setup_keys=set()
298 Is a function that check whether necessary keys are setup
301 if self.compulsory_keys & setup_keys != self.compulsory_keys:
302 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
306 Returns the keys that have been setup so far
308 return self.backward_converter.keys()
312 This sets up the standard compulsory keys as defined by
313 _CrossLinkDataBaseStandardKeys
315 for ck
in self.compulsory_keys:
316 self.converter[ck]=ck
317 self.backward_converter[ck]=ck
319 def set_unique_id_key(self,origin_key):
320 self.converter[origin_key]=self.unique_id_key
321 self.backward_converter[self.unique_id_key]=origin_key
323 def set_protein1_key(self,origin_key):
324 self.converter[origin_key]=self.protein1_key
325 self.backward_converter[self.protein1_key]=origin_key
327 def set_protein2_key(self,origin_key):
328 self.converter[origin_key]=self.protein2_key
329 self.backward_converter[self.protein2_key]=origin_key
331 def set_residue1_key(self,origin_key):
332 self.converter[origin_key]=self.residue1_key
333 self.backward_converter[self.residue1_key]=origin_key
335 def set_residue2_key(self,origin_key):
336 self.converter[origin_key]=self.residue2_key
337 self.backward_converter[self.residue2_key]=origin_key
339 def set_residue1_amino_acid_key(self, origin_key):
340 self.converter[origin_key] = self.residue1_amino_acid_key
341 self.backward_converter[self.residue1_amino_acid_key] = origin_key
343 def set_residue2_amino_acid_key(self, origin_key):
344 self.converter[origin_key] = self.residue2_amino_acid_key
345 self.backward_converter[self.residue2_amino_acid_key] = origin_key
347 def set_residue1_moiety_key(self, origin_key):
348 self.converter[origin_key] = self.residue1_moiety_key
349 self.backward_converter[self.residue1_moiety_key] = origin_key
351 def set_residue2_moiety_key(self, origin_key):
352 self.converter[origin_key] = self.residue2_moiety_key
353 self.backward_converter[self.residue2_moiety_key] = origin_key
355 def set_site_pairs_key(self,origin_key):
356 self.converter[origin_key]=self.site_pairs_key
357 self.backward_converter[self.site_pairs_key]=origin_key
359 def set_id_score_key(self,origin_key):
360 self.converter[origin_key]=self.id_score_key
361 self.backward_converter[self.id_score_key]=origin_key
363 def set_fdr_key(self,origin_key):
364 self.converter[origin_key]=self.fdr_key
365 self.backward_converter[self.fdr_key]=origin_key
367 def set_quantitation_key(self,origin_key):
368 self.converter[origin_key]=self.quantitation_key
369 self.backward_converter[self.quantitation_key]=origin_key
371 def set_psi_key(self,origin_key):
372 self.converter[origin_key]=self.psi_key
373 self.backward_converter[self.psi_key]=origin_key
375 def set_link_type_key(self,link_type_key):
376 self.converter[link_type_key]=self.link_type_key
377 self.backward_converter[self.link_type_key]=link_type_key
381 Returns the dictionary that convert the old keywords to the new ones
384 return self.converter
388 Returns the dictionary that convert the new keywords to the old ones
391 return self.backward_converter
395 A class to handle different styles of site pairs parsers.
397 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
398 [Y3-;K4-] for dead-ends
399 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
400 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
401 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
406 def __init__(self,style):
408 _CrossLinkDataBaseStandardKeys.__init__(self)
409 if style
is "MSSTUDIO":
411 self.compulsory_keys= set([self.protein1_key,
413 self.site_pairs_key])
414 elif style
is "XTRACT" or style
is "QUANTITATION":
416 self.compulsory_keys= set([self.site_pairs_key])
417 elif style
is "LAN_HUANG":
419 self.compulsory_keys= set([self.site_pairs_key])
421 raise Error(
"ResiduePairListParser: unknown list parser style")
423 def get_compulsory_keys(self):
424 return self.compulsory_keys
428 This function returns a list of cross-linked residues and the corresponding list of
429 cross-linked chains. The latter list can be empty, if the style doesn't have the
430 corresponding information.
432 if self.style ==
"MSSTUDIO":
433 input_string=input_string.replace(
"[",
"")
434 input_string=input_string.replace(
"]",
"")
435 input_string_pairs=input_string.split(
";")
436 residue_pair_indexes=[]
437 chain_pair_indexes=[]
438 for s
in input_string_pairs:
439 m1=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
440 m2=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
443 residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
444 residue_pair_indexes.append((residue_index_1,residue_index_2))
447 residue_type_1,residue_index_1=m2.group(1,2)
449 return residue_pair_indexes,chain_pair_indexes
450 if self.style
is "XTRACT" or self.style
is "QUANTITATION":
451 if ":x:" in input_string:
453 input_strings=input_string.split(
":x:")
454 first_peptides=input_strings[0].split(
":|:")
455 second_peptides=input_strings[1].split(
":|:")
456 first_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in first_peptides]
457 second_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in second_peptides]
458 residue_pair_indexes=[]
459 chain_pair_indexes=[]
460 for fpi
in first_peptides_indentifiers:
461 for spi
in second_peptides_indentifiers:
466 residue_pair_indexes.append((residue1,residue2))
467 chain_pair_indexes.append((chain1,chain2))
468 return residue_pair_indexes, chain_pair_indexes
471 first_peptides = input_string.split(
":|:")
472 first_peptides_indentifiers = [(f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
475 for fpi
in first_peptides_indentifiers:
478 residue_indexes.append((residue1,))
479 chain_indexes.append((chain1,))
480 return residue_indexes, chain_indexes
481 if self.style
is "LAN_HUANG":
482 input_strings=input_string.split(
"-")
483 chain1,first_series=input_strings[0].split(
":")
484 chain2,second_series=input_strings[1].split(
":")
486 first_residues=first_series.replace(
";",
"|").split(
"|")
487 second_residues=second_series.replace(
";",
"|").split(
"|")
488 residue_pair_indexes=[]
489 chain_pair_indexes=[]
490 for fpi
in first_residues:
491 for spi
in second_residues:
492 residue1=self.re.sub(
"[^0-9]",
"", fpi)
493 residue2=self.re.sub(
"[^0-9]",
"", spi)
494 residue_pair_indexes.append((residue1,residue2))
495 chain_pair_indexes.append((chain1,chain2))
496 return residue_pair_indexes, chain_pair_indexes
502 A class to handle different XL format with fixed format
503 currently support ProXL
505 def __init__(self,format):
507 _CrossLinkDataBaseStandardKeys.__init__(self)
508 if format
is "PROXL":
511 raise Error(
"FixedFormatParser: unknown list format name")
514 def get_data(self,input_string):
515 if self.format
is "PROXL":
516 tockens=input_string.split(
"\t")
518 if tockens[0]==
"SEARCH ID(S)":
521 xl[self.protein1_key]=tockens[2]
522 xl[self.protein2_key]=tockens[4]
523 xl[self.residue1_key]=int(tockens[3])
524 xl[self.residue2_key]=int(tockens[5])
527 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
530 this class handles a cross-link dataset and do filtering
531 operations, adding cross-links, merge datasets...
534 def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=(
'K',)):
537 @param converter an instance of CrossLinkDataBaseKeywordsConverter
538 @param data_base an instance of CrossLinkDataBase to build the new database on
539 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
540 crosslink consistency. If not given consistency will not be checked
541 @param linkable_aa a tuple containing one-letter amino acids which are linkable by the crosslinker;
542 only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
543 and if a fasta_seq is given
546 if data_base
is None:
549 self.data_base=data_base
551 _CrossLinkDataBaseStandardKeys.__init__(self)
552 if converter
is not None:
553 self.cldbkc = converter
554 self.list_parser=self.cldbkc.rplp
555 self.converter = converter.get_converter()
559 self.list_parser=
None
560 self.converter =
None
563 self.def_aa_tuple = linkable_aa
564 self.fasta_seq = fasta_seq
570 Update the whole dataset after changes
572 self.update_cross_link_unique_sub_index()
573 self.update_cross_link_redundancy()
574 self.update_residues_links_number()
575 self.check_cross_link_consistency()
579 sorted_ids=sorted(self.data_base.keys())
581 for xl
in self.data_base[k]:
584 def xlid_iterator(self):
585 sorted_ids=sorted(self.data_base.keys())
586 for xlid
in sorted_ids:
589 def __getitem__(self,xlid):
590 return self.data_base[xlid]
593 return len([xl
for xl
in self])
598 def set_name(self,name):
600 for k
in self.data_base:
601 new_data_base[k+
"."+name]=self.data_base[k]
602 self.data_base=new_data_base
606 def get_number_of_xlid(self):
607 return len(self.data_base)
610 def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
612 if FixedFormatParser is not specified, the file is comma-separated-values
613 @param file_name a txt file to be parsed
614 @param converter an instance of CrossLinkDataBaseKeywordsConverter
615 @param FixedFormatParser a parser for a fixed format
617 if not FixedFormatParser:
618 xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
620 if converter
is not None:
621 self.cldbkc = converter
622 self.list_parser=self.cldbkc.rplp
623 self.converter = converter.get_converter()
626 if not self.list_parser:
630 for nxl,xl
in enumerate(xl_list):
633 if k
in self.converter:
634 new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
637 if self.unique_id_key
in self.cldbkc.get_setup_keys():
638 if new_xl[self.unique_id_key]
not in new_xl_dict:
639 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
641 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
643 if str(nxl)
not in new_xl_dict:
644 new_xl_dict[str(nxl)]=[new_xl]
646 new_xl_dict[str(nxl)].append(new_xl)
651 for nxl,entry
in enumerate(xl_list):
655 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
656 raise Error(
"CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
658 if k
in self.converter:
659 new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
663 residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
666 for n,p
in enumerate(residue_pair_list):
673 new_xl[k]=new_dict[k]
674 new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
676 new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
678 if len(chain_pair_list)==len(residue_pair_list):
679 new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
681 new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
684 new_xl[self.link_type_key]=
"CROSSLINK"
686 new_xl[self.link_type_key]=
"MONOLINK"
688 if self.unique_id_key
in self.cldbkc.get_setup_keys():
689 if new_xl[self.unique_id_key]
not in new_xl_dict:
690 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
692 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
694 if str(nxl)
not in new_xl_dict:
695 new_xl[self.unique_id_key]=str(nxl+1)
696 new_xl_dict[str(nxl)]=[new_xl]
698 new_xl[self.unique_id_key]=str(nxl+1)
699 new_xl_dict[str(nxl)].append(new_xl)
704 if FixedFormatParser is defined
708 f=open(file_name,
"r")
711 xl=FixedFormatParser.get_data(line)
713 xl[self.unique_id_key]=str(nxl+1)
714 new_xl_dict[str(nxl)]=[xl]
718 self.data_base=new_xl_dict
720 l = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
721 self.dataset = ihm.dataset.CXMSDataset(l)
724 def update_cross_link_unique_sub_index(self):
725 for k
in self.data_base:
726 for n,xl
in enumerate(self.data_base[k]):
727 xl[self.ambiguity_key]=len(self.data_base[k])
728 xl[self.unique_sub_index_key]=n+1
729 xl[self.unique_sub_id_key]=k+
"."+str(n+1)
731 def update_cross_link_redundancy(self):
732 redundancy_data_base={}
734 pra=_ProteinsResiduesArray(xl)
735 if pra
not in redundancy_data_base:
736 redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
737 redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
739 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
740 redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
742 pra=_ProteinsResiduesArray(xl)
743 xl[self.redundancy_key]=len(redundancy_data_base[pra])
744 xl[self.redundancy_list_key]=redundancy_data_base[pra]
746 def update_residues_links_number(self):
749 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
750 if (p1,r1)
not in residue_links:
751 residue_links[(p1,r1)]=set([(p2,r2)])
753 residue_links[(p1,r1)].add((p2,r2))
754 if (p2,r2)
not in residue_links:
755 residue_links[(p2,r2)]=set([(p1,r1)])
757 residue_links[(p2,r2)].add((p1,r1))
760 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
761 xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
762 xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
764 def check_cross_link_consistency(self):
765 """This function checks the consistency of the dataset with the amino acid sequence"""
766 if self.cldbkc
and self.fasta_seq:
767 cnt_matched, cnt_matched_file = 0, 0
771 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
772 b_matched_file =
False
773 if self.residue1_amino_acid_key
in xl:
775 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
776 b_matched = self._match_xlinks(p1, r1, aa_from_file)
777 b_matched_file = b_matched
780 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
782 matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
784 if self.residue2_amino_acid_key
in xl:
785 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
786 b_matched = self._match_xlinks(p2, r2, aa_from_file)
787 b_matched_file = b_matched
789 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
791 matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
792 if b_matched: cnt_matched += 1
793 if b_matched_file: cnt_matched_file += 1
795 percentage_matched = round(100*cnt_matched/len(self),1)
796 percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
798 if matched
or non_matched: print(
"check_cross_link_consistency: Out of %d crosslinks "
799 "%d were matched to the fasta sequence (%f %%).\n "
800 "%d were matched by using the crosslink file (%f %%)."%
801 (len(self),cnt_matched,percentage_matched,cnt_matched_file,
802 percentage_matched_file) )
803 if non_matched: print(
"check_cross_link_consistency: Warning: Non matched xlinks:",
804 [(prot_name, sorted(list(non_matched[prot_name])))
for prot_name
in non_matched])
805 return matched,non_matched
807 def _match_xlinks(self, prot_name, res_index, aa_tuple):
812 for amino_acid
in aa_tuple:
813 if len(amino_acid) == 3:
814 amino_acid = amino_dict[amino_acid.upper()]
815 if prot_name
in self.fasta_seq.sequences:
816 seq = self.fasta_seq.sequences[prot_name]
821 if res_index == 0
or res_index == 1:
823 if res_index < len(seq):
824 if amino_acid == seq[res_index]:
830 def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
833 matched[prot].add(res)
835 matched[prot] = set([res])
837 if prot
in non_matched:
838 non_matched[prot].add(res)
840 non_matched[prot] = set([res])
841 return matched, non_matched
844 def get_cross_link_string(self,xl):
846 for k
in self.ordered_key_list:
848 string+=str(k)+
":"+str(xl[k])+
"|"
853 if k
not in self.ordered_key_list:
854 string+=str(k)+
":"+str(xl[k])+
"|"
858 def get_short_cross_link_string(self,xl):
861 list_of_keys=[self.data_set_name_key,
862 self.unique_sub_id_key,
870 for k
in list_of_keys:
872 string+=str(xl[k])+
"|"
878 def filter(self,FilterOperator):
880 for id
in self.data_base.keys():
881 for xl
in self.data_base[id]:
882 if FilterOperator.evaluate(xl):
883 if id
not in new_xl_dict:
886 new_xl_dict[id].append(xl)
888 cdb = CrossLinkDataBase(self.cldbkc,new_xl_dict)
889 cdb.dataset = self.dataset
892 def filter_score(self,score):
893 '''Get all crosslinks with score greater than an input value'''
895 return self.filter(FilterOperator)
897 def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
899 This function merges two cross-link datasets so that if two conflicting crosslinks have the same
900 cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
901 with different SubIDs
905 def append_database(self,CrossLinkDataBase2):
907 This function append one cross-link dataset to another. Unique ids will be renamed to avoid
910 name1=self.get_name()
911 name2=CrossLinkDataBase2.get_name()
914 name2=id(CrossLinkDataBase2)
918 for k
in self.data_base:
919 new_data_base[k]=self.data_base[k]
920 for k
in CrossLinkDataBase2.data_base:
921 new_data_base[k]=CrossLinkDataBase2.data_base[k]
922 self.data_base=new_data_base
925 def set_value(self,key,new_value,FilterOperator=None):
927 This function changes the value for a given key in the database
928 For instance one can change the name of a protein
929 @param key: the key in the database that must be changed
930 @param new_value: the new value of the key
931 @param FilterOperator: optional FilterOperator to change the value to
932 a subset of the database
934 example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
938 if FilterOperator
is not None:
939 if FilterOperator.evaluate(xl):
945 def get_values(self,key):
947 this function returns the list of values for a given key in the database
948 alphanumerically sorted
953 return sorted(list(values))
955 def offset_residue_index(self,protein_name,offset):
957 This function offset the residue indexes of a given protein by a specified value
958 @param protein_name: the protein name that need to be changed
959 @param offset: the offset value
963 if xl[self.protein1_key] == protein_name:
964 xl[self.residue1_key]=xl[self.residue1_key]+offset
965 if xl[self.protein2_key] == protein_name:
966 xl[self.residue2_key]=xl[self.residue2_key]+offset
969 def create_new_keyword(self,keyword,values_from_keyword=None):
971 This function creates a new keyword for the whole database and set the values from
972 and existing keyword (optional), otherwise the values are set to None
973 @param keyword the new keyword name:
974 @param values_from_keyword the keyword from which we are copying the values:
977 if values_from_keyword
is not None:
978 xl[keyword] = xl[values_from_keyword]
983 def rename_proteins(self,old_to_new_names_dictionary, protein_to_rename="both"):
985 This function renames all proteins contained in the input dictionary
986 from the old names (keys) to the new name (values)
987 @param old_to_new_names_dictionary dictionary for converting old to new names
988 @param protein_to_rename specify whether to rename both or protein1 or protein2 only
991 for old_name
in old_to_new_names_dictionary:
992 new_name=old_to_new_names_dictionary[old_name]
993 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
994 fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
995 self.set_value(self.protein1_key,new_name,fo2)
996 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
997 fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
998 self.set_value(self.protein2_key,new_name,fo2)
1000 def clone_protein(self,protein_name,new_protein_name):
1002 for id
in self.data_base.keys():
1004 for xl
in self.data_base[id]:
1005 new_data_base.append(xl)
1006 if xl[self.protein1_key]==protein_name
and xl[self.protein2_key]!=protein_name:
1008 new_xl[self.protein1_key]=new_protein_name
1009 new_data_base.append(new_xl)
1010 elif xl[self.protein1_key]!=protein_name
and xl[self.protein2_key]==protein_name:
1012 new_xl[self.protein2_key]=new_protein_name
1013 new_data_base.append(new_xl)
1014 elif xl[self.protein1_key]==protein_name
and xl[self.protein2_key]==protein_name:
1016 new_xl[self.protein1_key]=new_protein_name
1017 new_data_base.append(new_xl)
1019 new_xl[self.protein2_key]=new_protein_name
1020 new_data_base.append(new_xl)
1022 new_xl[self.protein1_key]=new_protein_name
1023 new_xl[self.protein2_key]=new_protein_name
1024 new_data_base.append(new_xl)
1025 self.data_base[id]=new_data_base
1028 def filter_out_same_residues(self):
1030 This function remove cross-links applied to the same residue
1031 (ie, same chain name and residue number)
1034 for id
in self.data_base.keys():
1036 for xl
in self.data_base[id]:
1037 if xl[self.protein1_key]==xl[self.protein2_key]
and xl[self.residue1_key]==xl[self.residue2_key]:
1040 new_data_base.append(xl)
1041 self.data_base[id]=new_data_base
1045 def jackknife(self,percentage):
1047 this method returns a CrossLinkDataBase class containing
1048 a random subsample of the original cross-link database.
1049 @param percentage float between 0 and 1, is the percentage of
1050 of spectra taken from the original list
1053 if percentage > 1.0
or percentage < 0.0:
1054 raise ValueError(
'the percentage of random cross-link spectra should be between 0 and 1')
1055 nspectra=self.get_number_of_xlid()
1056 nrandom_spectra=int(nspectra*percentage)
1057 random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1059 for k
in random_keys:
1060 new_data_base[k]=self.data_base[k]
1061 return CrossLinkDataBase(self.cldbkc,new_data_base)
1065 sorted_ids=sorted(self.data_base.keys())
1067 for id
in sorted_ids:
1069 for xl
in self.data_base[id]:
1070 for k
in self.ordered_key_list:
1072 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1077 if k
not in self.ordered_key_list:
1078 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1079 outstr+=
"-------------\n"
1083 def plot(self,filename,**kwargs):
1085 matplotlib.use(
'Agg')
1086 import matplotlib.pyplot
as plt
1087 import matplotlib.colors
1091 if kwargs[
"type"] ==
"scatter":
1092 cmap=plt.get_cmap(
"rainbow")
1093 norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1096 if "colorkey" in kwargs:
1097 colorkey=kwargs[
"colorkey"]
1098 if "sizekey" in kwargs:
1099 sizekey=kwargs[
"sizekey"]
1100 if "logyscale" in kwargs:
1101 logyscale=kwargs[
"logyscale"]
1109 xs.append(float(xl[xkey]))
1111 ys.append(math.log10(float(xl[ykey])))
1113 ys.append(float(xl[ykey]))
1114 colors.append(float(xl[colorkey]))
1116 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1120 for color
in colors:
1121 cindex=(color-min(colors))/(max(colors)-min(colors))
1122 cs.append(cmap(cindex))
1125 ax = fig.add_subplot(111)
1126 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker=
"o")
1129 plt.savefig(filename)
1133 if kwargs[
"type"] ==
"residue_links":
1137 molecule=kwargs[
"molecule"]
1139 length=len(molecule.sequence)
1140 molecule=molecule.get_name()
1143 sequences_object=kwargs[
"sequences_object"]
1144 sequence=sequences_object.sequences[molecule]
1145 length=len(sequence)
1147 histogram=[0]*length
1149 if xl[self.protein1_key]==molecule:
1150 histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1151 if xl[self.protein2_key]==molecule:
1152 histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1153 rects = plt.bar(range(1,length+1), histogram)
1160 plt.savefig(filename)
1166 if kwargs[
"type"] ==
"histogram":
1167 valuekey=kwargs[
"valuekey"]
1168 reference_xline=kwargs[
"reference_xline"]
1174 values_list.append(float(xl[valuekey]))
1176 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1179 filename, [values_list], valuename=valuename, bins=bins,
1180 colors=
None, format=
"pdf",
1181 reference_xline=
None, yplotrange=
None,
1182 xplotrange=
None,normalized=
True,
1185 def dump(self,json_filename):
1187 with open(json_filename,
'w')
as fp:
1188 json.dump(self.data_base, fp, sort_keys=
True, indent=2, default=set_json_default)
1190 def load(self,json_filename):
1192 with open(json_filename,
'r') as fp:
1193 self.data_base = json.load(fp)
1197 if sys.version_info[0] < 3:
1199 for k,v
in xl.iteritems():
1200 if type(k)
is unicode: k=str(k)
1201 if type(v)
is unicode: v=str(v)
1204 def save_csv(self,filename):
1210 sorted_group_ids=sorted(self.data_base.keys())
1211 for group
in sorted_group_ids:
1213 for xl
in self.data_base[group]:
1215 sorted_ids=sorted(xl.keys())
1216 values=[xl[k]
for k
in sorted_ids]
1217 group_block.append(values)
1221 with open(filename,
'w')
as fp:
1222 a = csv.writer(fp, delimiter=
',')
1223 a.writerow(sorted_ids)
1226 def get_number_of_unique_crosslinked_sites(self):
1228 Returns the number of non redundant crosslink sites
1232 pra=_ProteinsResiduesArray(xl)
1233 prai=pra.get_inverted()
1236 return len(list(praset))
1239 """This class allows to compute and plot the distance between datasets"""
1242 """Input a dictionary where keys are cldb names and values are cldbs"""
1243 import scipy.spatial.distance
1244 self.dbs=cldb_dictionary
1245 self.keylist=list(self.dbs.keys())
1246 self.distances=list()
1249 for i,key1
in enumerate(self.keylist):
1250 for key2
in self.keylist[i+1:]:
1251 distance=self.get_distance(key1,key2)
1252 self.distances.append(distance)
1254 self.distances=scipy.spatial.distance.squareform(self.distances)
1257 return 1.0-self.
jaccard_index(self.dbs[key1],self.dbs[key2])
1260 """Similarity index between two datasets
1261 https://en.wikipedia.org/wiki/Jaccard_index"""
1265 for xl1
in CrossLinkDataBase1:
1266 a1f=_ProteinsResiduesArray(xl1)
1267 a1b=a1f.get_inverted()
1270 for xl2
in CrossLinkDataBase2:
1271 a2f=_ProteinsResiduesArray(xl2)
1272 a2b=a2f.get_inverted()
1275 return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1277 def plot_matrix(self,figurename="clustermatrix.pdf"):
1278 import matplotlib
as mpl
1281 import matplotlib.pylab
as pl
1282 from scipy.cluster
import hierarchy
as hrc
1284 raw_distance_matrix=self.distances
1290 ax = fig.add_subplot(1,1,1)
1291 dendrogram = hrc.dendrogram(
1292 hrc.linkage(raw_distance_matrix),
1295 leaves_order = dendrogram[
'leaves']
1296 ax.set_xlabel(
'Dataset')
1297 ax.set_ylabel(
'Jaccard Distance')
1299 pl.savefig(
"dendrogram."+figurename, dpi=300)
1305 ax = fig.add_subplot(1,1,1)
1307 raw_distance_matrix[leaves_order,
1310 interpolation=
'nearest')
1311 cb = fig.colorbar(cax)
1312 cb.set_label(
'Jaccard Distance')
1313 ax.set_xlabel(
'Dataset')
1314 ax.set_ylabel(
'Dataset')
1315 ax.set_xticks(range(len(labels)))
1316 ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation=
'vertical')
1317 ax.set_yticks(range(len(labels)))
1318 ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation=
'horizontal')
1320 pl.savefig(
"matrix."+figurename, dpi=300)
1326 This class maps a CrossLinkDataBase on a given structure
1327 and save an rmf file with color-coded crosslinks
1331 def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1332 self.CrossLinkDataBase=CrossLinkDataBase
1335 self.prots=rmf_or_stat_handler
1336 self.distances=defaultdict(list)
1340 print(
"computing distances fro all crosslinks and all structures")
1341 for i
in self.prots[::10]:
1342 self.compute_distances()
1343 for key,xl
in enumerate(self.CrossLinkDataBase):
1344 array=_ProteinsResiduesArray(xl)
1345 self.array_to_id[array]=key
1346 self.id_to_array[key]=array
1347 if xl[
"MinAmbiguousDistance"]
is not 'None':
1348 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1350 def compute_distances(self):
1353 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1354 for group
in sorted_group_ids:
1357 for xl
in self.CrossLinkDataBase.data_base[group]:
1359 sorted_ids=sorted(xl.keys())
1360 data.append(sorted_ids+[
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1361 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1363 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1372 values=[xl[k]
for k
in sorted_ids]
1373 values += [group, mdist]
1374 except KeyError
as e:
1375 print(
"MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1376 group_dists.append(mdist)
1378 xl[
"Distance"]=mdist
1384 for xl
in self.CrossLinkDataBase.data_base[group]:
1385 xl[
"MinAmbiguousDistance"]=min(group_dists)
1387 def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1388 '''more robust and slower version of above'''
1390 selpart_1=sel.get_selected_particles()
1391 if len(selpart_1)==0:
1392 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1395 selpart_2=sel.get_selected_particles()
1396 if len(selpart_2)==0:
1397 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1400 for p1
in selpart_1:
1401 for p2
in selpart_2:
1402 if p1 == p2
and r1 == r2:
continue
1422 results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1423 if len(results)==0:
return None
1424 results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1425 return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1427 def save_rmf_snapshot(self,filename,color_id=None):
1428 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1431 for group
in sorted_group_ids:
1432 group_dists_particles=[]
1433 for xl
in self.CrossLinkDataBase.data_base[group]:
1434 xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1435 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1437 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1439 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1441 if color_id
is not None:
1442 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1444 group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1445 if group_dists_particles:
1446 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1447 color_scores.append(mincolor_score)
1448 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1453 linear.set_slope(1.0)
1455 rslin =
IMP.RestraintSet(self.prots.get_model(),
'linear_dummy_restraints')
1457 offset=min(color_scores)
1458 maxvalue=max(color_scores)
1459 for pair
in list_of_pairs:
1461 pr.set_name(pair[2])
1463 factor=(pair[3]-offset)/(maxvalue-offset)
1468 rslin.add_restraint(pr)
1471 rh = RMF.create_rmf_file(filename)
1478 def boxplot_crosslink_distances(self,filename):
1480 keys=[self.id_to_array[k]
for k
in self.distances.keys()]
1481 medians=[numpy.median(self.distances[self.array_to_id[k]])
for k
in keys]
1482 dists=[self.distances[self.array_to_id[k]]
for k
in keys]
1483 distances_sorted_by_median=[x
for _,x
in sorted(zip(medians,dists))]
1484 keys_sorted_by_median=[x
for _,x
in sorted(zip(medians,keys))]
1486 distances_sorted_by_median,
1487 range(len(distances_sorted_by_median)),
1488 xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1490 def get_crosslink_violations(self,threshold):
1491 violations=defaultdict(list)
1492 for k
in self.distances:
1494 viols=self.distances[k]
1495 violations[self.id_to_array[k]]=viols
1501 This class generates a CrossLinkDataBase from a given structure
1503 def __init__(self,representation=None,
1505 residue_types_1=[
"K"],
1506 residue_types_2=[
"K"],
1507 reactivity_range=[0,1],
1513 cldbkc.set_protein1_key(
"Protein1")
1514 cldbkc.set_protein2_key(
"Protein2")
1515 cldbkc.set_residue1_key(
"Residue1")
1516 cldbkc.set_residue2_key(
"Residue2")
1517 self.cldb=CrossLinkDataBase(cldbkc)
1518 if representation
is not None:
1521 self.representation=representation
1522 self.model=self.representation.m
1523 elif system
is not None:
1526 self.model=self.system.mdl
1529 print(
"Argument error: please provide either a representation object or a IMP.Hierarchy")
1531 self.residue_types_1=residue_types_1
1532 self.residue_types_2=residue_types_2
1534 self.indexes_dict1={}
1535 self.indexes_dict2={}
1536 self.protein_residue_dict={}
1537 self.reactivity_dictionary={}
1538 self.euclidean_interacting_pairs=
None
1539 self.xwalk_interacting_pairs=
None
1542 if self.mode==
"pmi1":
1543 for protein
in self.representation.sequence_dict.keys():
1545 seq=self.representation.sequence_dict[protein]
1546 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1552 rexp=numpy.random.exponential(0.1)
1553 prob=1.0-math.exp(-rexp)
1554 self.reactivity_dictionary[(protein,r)]=prob
1557 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1558 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1560 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1563 self.indexes_dict1[index]=(protein,r)
1564 self.protein_residue_dict[(protein,r)]=index
1566 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1569 self.indexes_dict2[index]=(protein,r)
1570 self.protein_residue_dict[(protein,r)]=index
1572 if self.mode==
"pmi2":
1573 for state
in self.system.get_states():
1574 for moleculename,molecules
in state.get_molecules().items():
1575 for molecule
in molecules:
1577 seq=molecule.sequence
1578 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1584 rexp=numpy.random.exponential(0.00000001)
1585 prob=1.0-math.exp(-rexp)
1586 self.reactivity_dictionary[(molecule,r)]=prob
1588 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1589 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1592 ps=s.get_selected_particles()
1595 self.indexes_dict1[index]=(molecule,r)
1596 self.protein_residue_dict[(molecule,r)]=index
1599 ps=s.get_selected_particles()
1602 self.indexes_dict2[index]=(molecule,r)
1603 self.protein_residue_dict[(molecule,r)]=index
1606 def get_all_possible_pairs(self):
1607 n=float(len(self.protein_residue_dict.keys()))
1608 return n*(n-1.0)/2.0
1610 def get_all_feasible_pairs(self,distance=21):
1612 particle_index_pairs=[]
1614 for a,b
in itertools.combinations(self.protein_residue_dict.keys(),2):
1617 index1=self.protein_residue_dict[a]
1618 index2=self.protein_residue_dict[b]
1620 if particle_distance <= distance:
1621 particle_index_pairs.append((index1,index2))
1622 if self.mode==
"pmi1":
1623 new_xl[self.cldb.protein1_key]=a[0]
1624 new_xl[self.cldb.protein2_key]=b[0]
1625 elif self.mode==
"pmi2":
1626 new_xl[self.cldb.protein1_key]=a[0].get_name()
1627 new_xl[self.cldb.protein2_key]=b[0].get_name()
1628 new_xl[
"molecule_object1"]=a[0]
1629 new_xl[
"molecule_object2"]=b[0]
1630 new_xl[self.cldb.residue1_key]=a[1]
1631 new_xl[self.cldb.residue2_key]=b[1]
1632 self.cldb.data_base[str(nxl)]=[new_xl]
1640 def get_data_base(self,total_number_of_spectra,
1641 ambiguity_probability=0.1,
1644 max_delta_distance=10.0,
1645 xwalk_bin_path=
None,
1646 confidence_false=0.75,
1647 confidence_true=0.75):
1649 from random
import random,uniform
1653 self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1654 self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1655 self.cldb.data_base[str(number_of_spectra)]=[]
1656 self.sites_weighted=
None
1658 while number_of_spectra<total_number_of_spectra:
1659 if random() > ambiguity_probability
and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1661 number_of_spectra+=1
1662 self.cldb.data_base[str(number_of_spectra)]=[]
1664 if random() > noise:
1666 pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1669 pra,dist=self.get_random_residue_pair(
None,
None,
None)
1673 if self.mode==
"pmi1":
1674 new_xl[self.cldb.protein1_key]=pra[0]
1675 new_xl[self.cldb.protein2_key]=pra[1]
1676 elif self.mode==
"pmi2":
1677 new_xl[self.cldb.protein1_key]=pra[0].get_name()
1678 new_xl[self.cldb.protein2_key]=pra[1].get_name()
1679 new_xl[
"molecule_object1"]=pra[0]
1680 new_xl[
"molecule_object2"]=pra[1]
1681 new_xl[self.cldb.residue1_key]=pra[2]
1682 new_xl[self.cldb.residue2_key]=pra[3]
1683 new_xl[
"Noisy"]=noisy
1685 new_xl[
"Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1686 new_xl[
"Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1687 r1=new_xl[
"Reactivity_Residue1"]
1688 r2=new_xl[
"Reactivity_Residue2"]
1694 new_xl[
"Score"]=np.random.beta(1.0,self.beta_false)
1697 new_xl[
"Score"]=1.0-np.random.beta(1.0,self.beta_true)
1698 new_xl[
"TargetDistance"]=dist
1699 new_xl[
"NoiseProbability"]=noise
1700 new_xl[
"AmbiguityProbability"]=ambiguity_probability
1702 (p1,p2)=
IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1703 self.protein_residue_dict[(pra[1],pra[3])]])
1708 new_xl[
"InterRigidBody"] =
False
1713 new_xl[
"InterRigidBody"] =
True
1715 new_xl[
"InterRigidBody"] =
None
1717 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1722 def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1725 from random
import choice,uniform
1726 if distance
is None:
1729 if self.mode==
"pmi1":
1730 protein1=choice(self.representation.sequence_dict.keys())
1731 protein2=choice(self.representation.sequence_dict.keys())
1732 seq1=self.representation.sequence_dict[protein1]
1733 seq2=self.representation.sequence_dict[protein2]
1734 residue1=choice([i
for i
in range(1,len(seq1)+1)
if seq1[i-1]
in self.residue_types_1])
1735 residue2=choice([i
for i
in range(1,len(seq2)+1)
if seq2[i-1]
in self.residue_types_2])
1736 h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
1737 h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
1739 if (protein1,residue1) != (protein2,residue2):
1741 elif self.mode==
"pmi2":
1742 (protein1,residue1)=choice(self.protein_residue_dict.keys())
1743 (protein2,residue2)=choice(self.protein_residue_dict.keys())
1744 index1=self.protein_residue_dict[(protein1,residue1)]
1745 index2=self.protein_residue_dict[(protein2,residue2)]
1747 if (protein1,residue1) != (protein2,residue2):
1751 if not xwalk_bin_path:
1753 gcpf.set_distance(distance+max_delta_distance)
1757 if not self.sites_weighted:
1758 self.sites_weighted=[]
1759 for key
in self.reactivity_dictionary:
1760 r=self.reactivity_dictionary[key]
1761 self.sites_weighted.append((key,r))
1763 first_site=self.weighted_choice(self.sites_weighted)
1765 if not self.euclidean_interacting_pairs:
1766 self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1767 self.indexes_dict1.keys(),
1768 self.indexes_dict2.keys())
1770 first_site_pairs = [pair
for pair
in self.euclidean_interacting_pairs
1771 if self.indexes_dict1[pair[0]] == first_site
or
1772 self.indexes_dict2[pair[1]] == first_site]
1773 if len(first_site_pairs)==0:
continue
1775 second_sites_weighted=[]
1776 for pair
in first_site_pairs:
1777 if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1778 if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1779 r=self.reactivity_dictionary[second_site]
1780 second_sites_weighted.append((second_site,r))
1781 second_site=self.weighted_choice(second_sites_weighted)
1783 interacting_pairs_weighted=[]
1784 for pair in self.euclidean_interacting_pairs:
1785 r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1786 r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1787 #combined reactivity 1-exp(-k12*Delta t),
1790 r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1791 interacting_pairs_weighted.append((pair,r12))
1792 #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1793 #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1794 #interacting_pairs_weighted.append((pair,weight1*weight2))
1797 pair=self.weighted_choice(interacting_pairs_weighted)
1798 protein1,residue1=self.indexes_dict1[pair[0]]
1799 protein2,residue2=self.indexes_dict2[pair[1]]
1800 particle_pair=IMP.get_particles(self.model,pair)
1801 particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1802 if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1804 elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1805 #allow some flexibility
1806 prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1807 if uniform(0.0,1.0)<prob: break
1809 protein1,residue1=first_site
1810 protein2,residue2=second_site
1811 print(
"CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1812 "First site",first_site,self.reactivity_dictionary[first_site],
1813 "Second site",second_site,self.reactivity_dictionary[second_site])
1814 particle_pair=
IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1817 if particle_distance<distance
and (protein1,residue1) != (protein2,residue2):
1819 elif particle_distance>=distance
and (protein1,residue1) != (protein2,residue2)
and max_delta_distance:
1823 if particle_distance-distance < max_delta_distance:
break
1828 if not self.xwalk_interacting_pairs:
1829 self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1830 interacting_pairs_weighted=[]
1832 for pair
in self.xwalk_interacting_pairs:
1837 weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1838 weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1839 interacting_pairs_weighted.append((pair,weight1*weight2))
1841 pair=self.weighted_choice(interacting_pairs_weighted)
1846 particle_distance=float(pair[4])
1850 return ((protein1,protein2,residue1,residue2)),particle_distance
1852 def get_xwalk_distances(self,xwalk_bin_path,distance):
1856 o.init_pdb(
"xwalk.pdb",self.representation.prot)
1857 o.write_pdb(
"xwalk.pdb")
1858 namechainiddict=o.dictchain[
"xwalk.pdb"]
1861 for key
in namechainiddict: chainiddict[namechainiddict[key]]=key
1862 xwalkout=os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+
' -bb').read()
1864 output_list_of_distance=[]
1865 for line
in xwalkout.split(
"\n")[0:-2]:
1866 tockens=line.split()
1869 distance=float(tockens[6])
1871 ss=second.split(
"-")
1874 protein1=chainiddict[chainid1]
1875 protein2=chainiddict[chainid2]
1878 output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1879 return output_list_of_distance
1882 def weighted_choice(self,choices):
1885 total = sum(w
for c, w
in choices)
1886 r = random.uniform(0, total)
1888 for c, w
in choices:
1892 assert False,
"Shouldn't get here"
1894 def save_rmf_snapshot(self,filename,color_id=None):
1897 if color_id
is None:
1898 color_id=
"Reactivity"
1900 sorted_group_ids=sorted(self.cldb.data_base.keys())
1903 for group
in sorted_group_ids:
1905 group_dists_particles=[]
1906 for xl
in self.cldb.data_base[group]:
1907 xllabel=self.cldb.get_short_cross_link_string(xl)
1908 (c1,c2,r1,r2)=(xl[
"molecule_object1"],xl[
"molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1910 index1=self.protein_residue_dict[(c1,r1)]
1911 index2=self.protein_residue_dict[(c2,r2)]
1913 mdist=xl[
"TargetDistance"]
1915 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1917 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1918 if group_dists_particles:
1919 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1920 color_scores.append(mincolor_score)
1921 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1928 linear.set_slope(1.0)
1932 offset=min(color_scores)
1933 maxvalue=max(color_scores)
1934 for pair
in list_of_pairs:
1936 pr.set_name(pair[2])
1937 factor=(pair[3]-offset)/(maxvalue-offset)
1940 rslin.add_restraint(pr)
1943 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded crossl...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
A class to handle different styles of site pairs parsers.
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.