1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
7 from __future__
import print_function
22 from collections
import defaultdict
25 def set_json_default(obj):
26 if isinstance(obj, set):
33 if sys.version_info[0] >= 3:
34 def _handle_string_input(inp):
35 if not isinstance(inp, str):
36 raise TypeError(
"expecting a string")
39 def _handle_string_input(inp):
40 if not isinstance(inp, (str, unicode)):
41 raise TypeError(
"expecting a string or unicode")
43 if isinstance(inp, unicode):
48 class _CrossLinkDataBaseStandardKeys(object):
50 This class setup all the standard keys needed to
51 identify the crosslink features from the data sets
55 self.protein1_key=
"Protein1"
56 self.type[self.protein1_key]=str
57 self.protein2_key=
"Protein2"
58 self.type[self.protein2_key]=str
59 self.residue1_key=
"Residue1"
60 self.type[self.residue1_key]=int
61 self.residue2_key=
"Residue2"
62 self.type[self.residue2_key]=int
63 self.residue1_amino_acid_key=
"Residue1AminoAcid"
64 self.type[self.residue1_amino_acid_key]=str
65 self.residue2_amino_acid_key=
"Residue2AminoAcid"
66 self.type[self.residue2_amino_acid_key]=str
67 self.residue1_moiety_key=
"Residue1Moiety"
68 self.type[self.residue1_moiety_key]=str
69 self.residue2_moiety_key=
"Residue2Moiety"
70 self.type[self.residue2_moiety_key]=str
71 self.site_pairs_key=
"SitePairs"
72 self.type[self.site_pairs_key]=str
73 self.unique_id_key=
"XLUniqueID"
74 self.type[self.unique_id_key]=str
75 self.unique_sub_index_key=
"XLUniqueSubIndex"
76 self.type[self.unique_sub_index_key]=int
77 self.unique_sub_id_key=
"XLUniqueSubID"
78 self.type[self.unique_sub_id_key]=str
79 self.data_set_name_key=
"DataSetName"
80 self.type[self.data_set_name_key]=str
81 self.cross_linker_chemical_key=
"CrossLinkerChemical"
82 self.type[self.cross_linker_chemical_key]=str
83 self.id_score_key=
"IDScore"
84 self.type[self.id_score_key]=float
86 self.type[self.fdr_key]=float
87 self.quantitation_key=
"Quantitation"
88 self.type[self.quantitation_key]=float
89 self.redundancy_key=
"Redundancy"
90 self.type[self.redundancy_key]=int
91 self.redundancy_list_key=
"RedundancyList"
92 self.type[self.redundancy_key]=list
93 self.ambiguity_key=
"Ambiguity"
94 self.type[self.ambiguity_key]=int
95 self.residue1_links_number_key=
"Residue1LinksNumber"
96 self.type[self.residue1_links_number_key]=int
97 self.residue2_links_number_key=
"Residue2LinksNumber"
98 self.type[self.residue2_links_number_key]=int
99 self.type[self.ambiguity_key]=int
100 self.state_key=
"State"
101 self.type[self.state_key]=int
102 self.sigma1_key=
"Sigma1"
103 self.type[self.sigma1_key]=str
104 self.sigma2_key=
"Sigma2"
105 self.type[self.sigma2_key]=str
107 self.type[self.psi_key]=str
108 self.distance_key=
"Distance"
109 self.type[self.distance_key]=float
110 self.min_ambiguous_distance_key=
"MinAmbiguousDistance"
111 self.type[self.distance_key]=float
113 self.link_type_key=
"LinkType"
114 self.type[self.link_type_key]=str
116 self.ordered_key_list =[self.data_set_name_key,
118 self.unique_sub_index_key,
119 self.unique_sub_id_key,
124 self.residue1_amino_acid_key,
125 self.residue2_amino_acid_key,
126 self.residue1_moiety_key,
127 self.residue2_moiety_key,
128 self.cross_linker_chemical_key,
131 self.quantitation_key,
133 self.redundancy_list_key,
139 self.min_ambiguous_distance_key,
143 class _ProteinsResiduesArray(tuple):
145 This class is inherits from tuple, and it is a shorthand for a cross-link
146 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
147 residue1 and residue2.
150 def __new__(self,input_data):
152 @input_data can be a dict or a tuple
154 self.cldbsk=_CrossLinkDataBaseStandardKeys()
155 if type(input_data)
is dict:
157 p1=input_data[self.cldbsk.protein1_key]
159 p2=input_data[self.cldbsk.protein2_key]
162 r1=input_data[self.cldbsk.residue1_key]
164 r2=input_data[self.cldbsk.residue2_key]
171 elif type(input_data)
is tuple:
172 if len(input_data)>4
or len(input_data)==3
or len(input_data)==1:
173 raise TypeError(
"_ProteinsResiduesArray: must have only 4 elements")
174 if len(input_data)==4:
175 p1 = _handle_string_input(input_data[0])
176 p2 = _handle_string_input(input_data[1])
179 if (
not (type(r1)
is int))
and (
not (r1
is None)):
180 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
181 if (
not (type(r2)
is int))
and (
not (r1
is None)):
182 raise TypeError(
"_ProteinsResiduesArray: residue2 must be a integer")
184 if len(input_data) == 2:
185 p1 = _handle_string_input(input_data[0])
187 if type(r1)
is not int:
188 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
191 raise TypeError(
"_ProteinsResiduesArray: input must be a dict or tuple")
192 return tuple.__new__(_ProteinsResiduesArray, t)
194 def get_inverted(self):
196 Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
198 return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
201 outstr=self.cldbsk.protein1_key+
" "+str(self[0])
202 outstr+=
" "+self.cldbsk.protein2_key+
" "+str(self[1])
203 outstr+=
" "+self.cldbsk.residue1_key+
" "+str(self[2])
204 outstr+=
" "+self.cldbsk.residue2_key+
" "+str(self[3])
208 outstr=str(self[0])+
"."+str(self[2])+
"-"+str(self[1])+
"."+str(self[3])
213 This class allows to create filter functions that can be passed to the CrossLinkDataBase
216 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
218 where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
220 A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
224 and it is used to filter the database
227 def __init__(self, argument1, operator, argument2):
229 (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
230 or (FilterOperator1,operator.or|and...,FilterOperator2)
232 if isinstance(argument1, FilterOperator):
233 self.operations = [argument1, operator, argument2]
236 self.values = (argument1, operator, argument2)
238 def __or__(self, FilterOperator2):
241 def __and__(self, FilterOperator2):
244 def __invert__(self):
247 def evaluate(self, xl_item):
249 if len(self.operations) == 0:
250 keyword, operator, value = self.values
251 return operator(xl_item[keyword], value)
252 FilterOperator1, op, FilterOperator2 = self.operations
254 if FilterOperator2
is None:
255 return op(FilterOperator1.evaluate(xl_item))
257 return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
260 def filter_factory(xl_):
262 class FilterOperator(object):
266 def __new__(self,key,value,oper=operator.eq):
267 return oper(self.xl[key],value)
269 return FilterOperator
274 This class is needed to convert the keywords from a generic database
280 @param list_parser an instance of ResiduePairListParser, if any is needed
283 self.backward_converter={}
284 _CrossLinkDataBaseStandardKeys.__init__(self)
285 self.rplp = list_parser
286 if self.rplp
is None:
288 self.compulsory_keys=set([self.protein1_key,
293 self.compulsory_keys=self.rplp.get_compulsory_keys()
294 self.setup_keys=set()
298 Is a function that check whether necessary keys are setup
301 if self.compulsory_keys & setup_keys != self.compulsory_keys:
302 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
306 Returns the keys that have been setup so far
308 return self.backward_converter.keys()
312 This sets up the standard compulsory keys as defined by
313 _CrossLinkDataBaseStandardKeys
315 for ck
in self.compulsory_keys:
316 self.converter[ck]=ck
317 self.backward_converter[ck]=ck
319 def set_unique_id_key(self,origin_key):
320 self.converter[origin_key]=self.unique_id_key
321 self.backward_converter[self.unique_id_key]=origin_key
323 def set_protein1_key(self,origin_key):
324 self.converter[origin_key]=self.protein1_key
325 self.backward_converter[self.protein1_key]=origin_key
327 def set_protein2_key(self,origin_key):
328 self.converter[origin_key]=self.protein2_key
329 self.backward_converter[self.protein2_key]=origin_key
331 def set_residue1_key(self,origin_key):
332 self.converter[origin_key]=self.residue1_key
333 self.backward_converter[self.residue1_key]=origin_key
335 def set_residue2_key(self,origin_key):
336 self.converter[origin_key]=self.residue2_key
337 self.backward_converter[self.residue2_key]=origin_key
339 def set_residue1_amino_acid_key(self, origin_key):
340 self.converter[origin_key] = self.residue1_amino_acid_key
341 self.backward_converter[self.residue1_amino_acid_key] = origin_key
343 def set_residue2_amino_acid_key(self, origin_key):
344 self.converter[origin_key] = self.residue2_amino_acid_key
345 self.backward_converter[self.residue2_amino_acid_key] = origin_key
347 def set_residue1_moiety_key(self, origin_key):
348 self.converter[origin_key] = self.residue1_moiety_key
349 self.backward_converter[self.residue1_moiety_key] = origin_key
351 def set_residue2_moiety_key(self, origin_key):
352 self.converter[origin_key] = self.residue2_moiety_key
353 self.backward_converter[self.residue2_moiety_key] = origin_key
355 def set_site_pairs_key(self,origin_key):
356 self.converter[origin_key]=self.site_pairs_key
357 self.backward_converter[self.site_pairs_key]=origin_key
359 def set_id_score_key(self,origin_key):
360 self.converter[origin_key]=self.id_score_key
361 self.backward_converter[self.id_score_key]=origin_key
363 def set_fdr_key(self,origin_key):
364 self.converter[origin_key]=self.fdr_key
365 self.backward_converter[self.fdr_key]=origin_key
367 def set_quantitation_key(self,origin_key):
368 self.converter[origin_key]=self.quantitation_key
369 self.backward_converter[self.quantitation_key]=origin_key
371 def set_psi_key(self,origin_key):
372 self.converter[origin_key]=self.psi_key
373 self.backward_converter[self.psi_key]=origin_key
375 def set_link_type_key(self,link_type_key):
376 self.converter[link_type_key]=self.link_type_key
377 self.backward_converter[self.link_type_key]=link_type_key
381 Returns the dictionary that convert the old keywords to the new ones
384 return self.converter
388 Returns the dictionary that convert the new keywords to the old ones
391 return self.backward_converter
395 A class to handle different styles of site pairs parsers.
397 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
398 [Y3-;K4-] for dead-ends
399 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
400 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
401 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
406 def __init__(self,style):
408 _CrossLinkDataBaseStandardKeys.__init__(self)
409 if style
is "MSSTUDIO":
411 self.compulsory_keys= set([self.protein1_key,
413 self.site_pairs_key])
414 elif style
is "XTRACT" or style
is "QUANTITATION":
416 self.compulsory_keys= set([self.site_pairs_key])
417 elif style
is "LAN_HUANG":
419 self.compulsory_keys= set([self.site_pairs_key])
421 raise Error(
"ResiduePairListParser: unknown list parser style")
423 def get_compulsory_keys(self):
424 return self.compulsory_keys
428 This function returns a list of cross-linked residues and the corresponding list of
429 cross-linked chains. The latter list can be empty, if the style doesn't have the
430 corresponding information.
432 if self.style ==
"MSSTUDIO":
433 input_string=input_string.replace(
"[",
"")
434 input_string=input_string.replace(
"]",
"")
435 input_string_pairs=input_string.split(
";")
436 residue_pair_indexes=[]
437 chain_pair_indexes=[]
438 for s
in input_string_pairs:
439 m1=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
440 m2=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
443 residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
444 residue_pair_indexes.append((residue_index_1,residue_index_2))
447 residue_type_1,residue_index_1=m2.group(1,2)
449 return residue_pair_indexes,chain_pair_indexes
450 if self.style
is "XTRACT" or self.style
is "QUANTITATION":
451 if ":x:" in input_string:
453 input_strings=input_string.split(
":x:")
454 first_peptides=input_strings[0].split(
":|:")
455 second_peptides=input_strings[1].split(
":|:")
456 first_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in first_peptides]
457 second_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in second_peptides]
458 residue_pair_indexes=[]
459 chain_pair_indexes=[]
460 for fpi
in first_peptides_indentifiers:
461 for spi
in second_peptides_indentifiers:
466 residue_pair_indexes.append((residue1,residue2))
467 chain_pair_indexes.append((chain1,chain2))
468 return residue_pair_indexes, chain_pair_indexes
471 first_peptides = input_string.split(
":|:")
472 first_peptides_indentifiers = [(f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
475 for fpi
in first_peptides_indentifiers:
478 residue_indexes.append((residue1,))
479 chain_indexes.append((chain1,))
480 return residue_indexes, chain_indexes
481 if self.style
is "LAN_HUANG":
482 input_strings=input_string.split(
"-")
483 chain1,first_series=input_strings[0].split(
":")
484 chain2,second_series=input_strings[1].split(
":")
486 first_residues=first_series.replace(
";",
"|").split(
"|")
487 second_residues=second_series.replace(
";",
"|").split(
"|")
488 residue_pair_indexes=[]
489 chain_pair_indexes=[]
490 for fpi
in first_residues:
491 for spi
in second_residues:
492 residue1=self.re.sub(
"[^0-9]",
"", fpi)
493 residue2=self.re.sub(
"[^0-9]",
"", spi)
494 residue_pair_indexes.append((residue1,residue2))
495 chain_pair_indexes.append((chain1,chain2))
496 return residue_pair_indexes, chain_pair_indexes
502 A class to handle different XL format with fixed format
503 currently support ProXL
505 def __init__(self,format):
507 _CrossLinkDataBaseStandardKeys.__init__(self)
508 if format
is "PROXL":
511 raise Error(
"FixedFormatParser: unknown list format name")
514 def get_data(self,input_string):
515 if self.format
is "PROXL":
516 tockens=input_string.split(
"\t")
518 if tockens[0]==
"SEARCH ID(S)":
521 xl[self.protein1_key]=tockens[2]
522 xl[self.protein2_key]=tockens[4]
523 xl[self.residue1_key]=int(tockens[3])
524 xl[self.residue2_key]=int(tockens[5])
529 this class handles a cross-link dataset and do filtering
530 operations, adding cross-links, merge datasets...
533 def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=(
'K',)):
536 @param converter an instance of CrossLinkDataBaseKeywordsConverter
537 @param data_base an instance of CrossLinkDataBase to build the new database on
538 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
539 crosslink consistency. If not given consistency will not be checked
540 @param linkable_aa a tuple containing one-letter amino acids which are linkable by the crosslinker;
541 only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
542 and if a fasta_seq is given
545 if data_base
is None:
548 self.data_base=data_base
550 _CrossLinkDataBaseStandardKeys.__init__(self)
551 if converter
is not None:
552 self.cldbkc = converter
553 self.list_parser=self.cldbkc.rplp
554 self.converter = converter.get_converter()
558 self.list_parser=
None
559 self.converter =
None
562 self.def_aa_tuple = linkable_aa
563 self.fasta_seq = fasta_seq
569 Update the whole dataset after changes
571 self.update_cross_link_unique_sub_index()
572 self.update_cross_link_redundancy()
573 self.update_residues_links_number()
578 sorted_ids=sorted(self.data_base.keys())
580 for xl
in self.data_base[k]:
583 def xlid_iterator(self):
584 sorted_ids=sorted(self.data_base.keys())
585 for xlid
in sorted_ids:
588 def __getitem__(self,xlid):
589 return self.data_base[xlid]
592 return len([xl
for xl
in self])
597 def set_name(self,name):
599 for k
in self.data_base:
600 new_data_base[k+
"."+name]=self.data_base[k]
601 self.data_base=new_data_base
605 def get_number_of_xlid(self):
606 return len(self.data_base)
611 if FixedFormatParser is not specified, the file is comma-separated-values
612 @param file_name a txt file to be parsed
613 @param converter an instance of CrossLinkDataBaseKeywordsConverter
614 @param FixedFormatParser a parser for a fixed format
616 if not FixedFormatParser:
617 xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
619 if converter
is not None:
620 self.cldbkc = converter
621 self.list_parser=self.cldbkc.rplp
622 self.converter = converter.get_converter()
625 if not self.list_parser:
629 for nxl,xl
in enumerate(xl_list):
632 if k
in self.converter:
633 new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
636 if self.unique_id_key
in self.cldbkc.get_setup_keys():
637 if new_xl[self.unique_id_key]
not in new_xl_dict:
638 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
640 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
642 if str(nxl)
not in new_xl_dict:
643 new_xl_dict[str(nxl)]=[new_xl]
645 new_xl_dict[str(nxl)].append(new_xl)
650 for nxl,entry
in enumerate(xl_list):
654 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
655 raise Error(
"CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
657 if k
in self.converter:
658 new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
662 residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
665 for n,p
in enumerate(residue_pair_list):
672 new_xl[k]=new_dict[k]
673 new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
675 new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
677 if len(chain_pair_list)==len(residue_pair_list):
678 new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
680 new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
683 new_xl[self.link_type_key]=
"CROSSLINK"
685 new_xl[self.link_type_key]=
"MONOLINK"
687 if self.unique_id_key
in self.cldbkc.get_setup_keys():
688 if new_xl[self.unique_id_key]
not in new_xl_dict:
689 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
691 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
693 if str(nxl)
not in new_xl_dict:
694 new_xl[self.unique_id_key]=str(nxl+1)
695 new_xl_dict[str(nxl)]=[new_xl]
697 new_xl[self.unique_id_key]=str(nxl+1)
698 new_xl_dict[str(nxl)].append(new_xl)
703 if FixedFormatParser is defined
707 f=open(file_name,
"r")
710 xl=FixedFormatParser.get_data(line)
712 xl[self.unique_id_key]=str(nxl+1)
713 new_xl_dict[str(nxl)]=[xl]
717 self.data_base=new_xl_dict
719 l = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
720 self.dataset = ihm.dataset.CXMSDataset(l)
723 def update_cross_link_unique_sub_index(self):
724 for k
in self.data_base:
725 for n,xl
in enumerate(self.data_base[k]):
726 xl[self.ambiguity_key]=len(self.data_base[k])
727 xl[self.unique_sub_index_key]=n+1
728 xl[self.unique_sub_id_key]=k+
"."+str(n+1)
730 def update_cross_link_redundancy(self):
731 redundancy_data_base={}
733 pra=_ProteinsResiduesArray(xl)
734 if pra
not in redundancy_data_base:
735 redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
736 redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
738 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
739 redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
741 pra=_ProteinsResiduesArray(xl)
742 xl[self.redundancy_key]=len(redundancy_data_base[pra])
743 xl[self.redundancy_list_key]=redundancy_data_base[pra]
745 def update_residues_links_number(self):
748 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
749 if (p1,r1)
not in residue_links:
750 residue_links[(p1,r1)]=set([(p2,r2)])
752 residue_links[(p1,r1)].add((p2,r2))
753 if (p2,r2)
not in residue_links:
754 residue_links[(p2,r2)]=set([(p1,r1)])
756 residue_links[(p2,r2)].add((p1,r1))
759 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
760 xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
761 xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
764 """This function checks the consistency of the dataset with the amino acid sequence"""
765 if self.cldbkc
and self.fasta_seq:
766 cnt_matched, cnt_matched_file = 0, 0
770 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
771 b_matched_file =
False
772 if self.residue1_amino_acid_key
in xl:
774 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
775 b_matched = self._match_xlinks(p1, r1, aa_from_file)
776 b_matched_file = b_matched
779 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
781 matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
783 if self.residue2_amino_acid_key
in xl:
784 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
785 b_matched = self._match_xlinks(p2, r2, aa_from_file)
786 b_matched_file = b_matched
788 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
790 matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
791 if b_matched: cnt_matched += 1
792 if b_matched_file: cnt_matched_file += 1
794 percentage_matched = round(100*cnt_matched/len(self),1)
795 percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
797 if matched
or non_matched: print(
"check_cross_link_consistency: Out of %d crosslinks "
798 "%d were matched to the fasta sequence (%f %%).\n "
799 "%d were matched by using the crosslink file (%f %%)."%
800 (len(self),cnt_matched,percentage_matched,cnt_matched_file,
801 percentage_matched_file) )
802 if non_matched: print(
"check_cross_link_consistency: Warning: Non matched xlinks:",
803 [(prot_name, sorted(list(non_matched[prot_name])))
for prot_name
in non_matched])
804 return matched,non_matched
806 def _match_xlinks(self, prot_name, res_index, aa_tuple):
811 for amino_acid
in aa_tuple:
812 if len(amino_acid) == 3:
813 amino_acid = amino_dict[amino_acid.upper()]
814 if prot_name
in self.fasta_seq.sequences:
815 seq = self.fasta_seq.sequences[prot_name]
820 if res_index == 0
or res_index == 1:
822 if res_index < len(seq):
823 if amino_acid == seq[res_index]:
829 def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
832 matched[prot].add(res)
834 matched[prot] = set([res])
836 if prot
in non_matched:
837 non_matched[prot].add(res)
839 non_matched[prot] = set([res])
840 return matched, non_matched
843 def get_cross_link_string(self,xl):
845 for k
in self.ordered_key_list:
847 string+=str(k)+
":"+str(xl[k])+
"|"
852 if k
not in self.ordered_key_list:
853 string+=str(k)+
":"+str(xl[k])+
"|"
857 def get_short_cross_link_string(self,xl):
860 list_of_keys=[self.data_set_name_key,
861 self.unique_sub_id_key,
869 for k
in list_of_keys:
871 string+=str(xl[k])+
"|"
877 def filter(self,FilterOperator):
879 for id
in self.data_base.keys():
880 for xl
in self.data_base[id]:
881 if FilterOperator.evaluate(xl):
882 if id
not in new_xl_dict:
885 new_xl_dict[id].append(xl)
888 cdb.dataset = self.dataset
892 '''Get all crosslinks with score greater than an input value'''
894 return self.filter(FilterOperator)
896 def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
898 This function merges two cross-link datasets so that if two conflicting crosslinks have the same
899 cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
900 with different SubIDs
906 This function append one cross-link dataset to another. Unique ids will be renamed to avoid
909 name1=self.get_name()
910 name2=CrossLinkDataBase2.get_name()
913 name2=id(CrossLinkDataBase2)
917 for k
in self.data_base:
918 new_data_base[k]=self.data_base[k]
919 for k
in CrossLinkDataBase2.data_base:
920 new_data_base[k]=CrossLinkDataBase2.data_base[k]
921 self.data_base=new_data_base
926 This function changes the value for a given key in the database
927 For instance one can change the name of a protein
928 @param key: the key in the database that must be changed
929 @param new_value: the new value of the key
930 @param FilterOperator: optional FilterOperator to change the value to
931 a subset of the database
933 example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
937 if FilterOperator
is not None:
938 if FilterOperator.evaluate(xl):
946 this function returns the list of values for a given key in the database
947 alphanumerically sorted
952 return sorted(list(values))
956 This function offset the residue indexes of a given protein by a specified value
957 @param protein_name: the protein name that need to be changed
958 @param offset: the offset value
962 if xl[self.protein1_key] == protein_name:
963 xl[self.residue1_key]=xl[self.residue1_key]+offset
964 if xl[self.protein2_key] == protein_name:
965 xl[self.residue2_key]=xl[self.residue2_key]+offset
970 This function creates a new keyword for the whole database and set the values from
971 and existing keyword (optional), otherwise the values are set to None
972 @param keyword the new keyword name:
973 @param values_from_keyword the keyword from which we are copying the values:
976 if values_from_keyword
is not None:
977 xl[keyword] = xl[values_from_keyword]
984 This function renames all proteins contained in the input dictionary
985 from the old names (keys) to the new name (values)
986 @param old_to_new_names_dictionary dictionary for converting old to new names
987 @param protein_to_rename specify whether to rename both or protein1 or protein2 only
990 for old_name
in old_to_new_names_dictionary:
991 new_name=old_to_new_names_dictionary[old_name]
992 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
994 self.
set_value(self.protein1_key,new_name,fo2)
995 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
997 self.
set_value(self.protein2_key,new_name,fo2)
999 def clone_protein(self,protein_name,new_protein_name):
1001 for id
in self.data_base.keys():
1003 for xl
in self.data_base[id]:
1004 new_data_base.append(xl)
1005 if xl[self.protein1_key]==protein_name
and xl[self.protein2_key]!=protein_name:
1007 new_xl[self.protein1_key]=new_protein_name
1008 new_data_base.append(new_xl)
1009 elif xl[self.protein1_key]!=protein_name
and xl[self.protein2_key]==protein_name:
1011 new_xl[self.protein2_key]=new_protein_name
1012 new_data_base.append(new_xl)
1013 elif xl[self.protein1_key]==protein_name
and xl[self.protein2_key]==protein_name:
1015 new_xl[self.protein1_key]=new_protein_name
1016 new_data_base.append(new_xl)
1018 new_xl[self.protein2_key]=new_protein_name
1019 new_data_base.append(new_xl)
1021 new_xl[self.protein1_key]=new_protein_name
1022 new_xl[self.protein2_key]=new_protein_name
1023 new_data_base.append(new_xl)
1024 self.data_base[id]=new_data_base
1029 This function remove cross-links applied to the same residue
1030 (ie, same chain name and residue number)
1033 for id
in self.data_base.keys():
1035 for xl
in self.data_base[id]:
1036 if xl[self.protein1_key]==xl[self.protein2_key]
and xl[self.residue1_key]==xl[self.residue2_key]:
1039 new_data_base.append(xl)
1040 self.data_base[id]=new_data_base
1046 this method returns a CrossLinkDataBase class containing
1047 a random subsample of the original cross-link database.
1048 @param percentage float between 0 and 1, is the percentage of
1049 of spectra taken from the original list
1052 if percentage > 1.0
or percentage < 0.0:
1053 raise ValueError(
'the percentage of random cross-link spectra should be between 0 and 1')
1054 nspectra=self.get_number_of_xlid()
1055 nrandom_spectra=int(nspectra*percentage)
1056 random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1058 for k
in random_keys:
1059 new_data_base[k]=self.data_base[k]
1064 sorted_ids=sorted(self.data_base.keys())
1066 for id
in sorted_ids:
1068 for xl
in self.data_base[id]:
1069 for k
in self.ordered_key_list:
1071 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1076 if k
not in self.ordered_key_list:
1077 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1078 outstr+=
"-------------\n"
1082 def plot(self,filename,**kwargs):
1084 matplotlib.use(
'Agg')
1085 import matplotlib.pyplot
as plt
1086 import matplotlib.colors
1090 if kwargs[
"type"] ==
"scatter":
1091 cmap=plt.get_cmap(
"rainbow")
1092 norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1095 if "colorkey" in kwargs:
1096 colorkey=kwargs[
"colorkey"]
1097 if "sizekey" in kwargs:
1098 sizekey=kwargs[
"sizekey"]
1099 if "logyscale" in kwargs:
1100 logyscale=kwargs[
"logyscale"]
1108 xs.append(float(xl[xkey]))
1110 ys.append(math.log10(float(xl[ykey])))
1112 ys.append(float(xl[ykey]))
1113 colors.append(float(xl[colorkey]))
1115 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1119 for color
in colors:
1120 cindex=(color-min(colors))/(max(colors)-min(colors))
1121 cs.append(cmap(cindex))
1124 ax = fig.add_subplot(111)
1125 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker=
"o")
1128 plt.savefig(filename)
1132 if kwargs[
"type"] ==
"residue_links":
1136 molecule=kwargs[
"molecule"]
1138 length=len(molecule.sequence)
1139 molecule=molecule.get_name()
1142 sequences_object=kwargs[
"sequences_object"]
1143 sequence=sequences_object.sequences[molecule]
1144 length=len(sequence)
1146 histogram=[0]*length
1148 if xl[self.protein1_key]==molecule:
1149 histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1150 if xl[self.protein2_key]==molecule:
1151 histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1152 rects = plt.bar(range(1,length+1), histogram)
1159 plt.savefig(filename)
1165 if kwargs[
"type"] ==
"histogram":
1166 valuekey=kwargs[
"valuekey"]
1167 reference_xline=kwargs[
"reference_xline"]
1173 values_list.append(float(xl[valuekey]))
1175 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1178 filename, [values_list], valuename=valuename, bins=bins,
1179 colors=
None, format=
"pdf",
1180 reference_xline=
None, yplotrange=
None,
1181 xplotrange=
None,normalized=
True,
1184 def dump(self,json_filename):
1186 with open(json_filename,
'w')
as fp:
1187 json.dump(self.data_base, fp, sort_keys=
True, indent=2, default=set_json_default)
1189 def load(self,json_filename):
1191 with open(json_filename,
'r') as fp:
1192 self.data_base = json.load(fp)
1196 if sys.version_info[0] < 3:
1198 for k,v
in xl.iteritems():
1199 if type(k)
is unicode: k=str(k)
1200 if type(v)
is unicode: v=str(v)
1203 def save_csv(self,filename):
1209 sorted_group_ids=sorted(self.data_base.keys())
1210 for group
in sorted_group_ids:
1212 for xl
in self.data_base[group]:
1214 sorted_ids=sorted(xl.keys())
1215 values=[xl[k]
for k
in sorted_ids]
1216 group_block.append(values)
1220 with open(filename,
'w')
as fp:
1221 a = csv.writer(fp, delimiter=
',')
1222 a.writerow(sorted_ids)
1227 Returns the number of non redundant crosslink sites
1231 pra=_ProteinsResiduesArray(xl)
1232 prai=pra.get_inverted()
1235 return len(list(praset))
1238 """This class allows to compute and plot the distance between datasets"""
1241 """Input a dictionary where keys are cldb names and values are cldbs"""
1242 import scipy.spatial.distance
1243 self.dbs=cldb_dictionary
1244 self.keylist=list(self.dbs.keys())
1245 self.distances=list()
1248 for i,key1
in enumerate(self.keylist):
1249 for key2
in self.keylist[i+1:]:
1250 distance=self.get_distance(key1,key2)
1251 self.distances.append(distance)
1253 self.distances=scipy.spatial.distance.squareform(self.distances)
1256 return 1.0-self.
jaccard_index(self.dbs[key1],self.dbs[key2])
1259 """Similarity index between two datasets
1260 https://en.wikipedia.org/wiki/Jaccard_index"""
1264 for xl1
in CrossLinkDataBase1:
1265 a1f=_ProteinsResiduesArray(xl1)
1266 a1b=a1f.get_inverted()
1269 for xl2
in CrossLinkDataBase2:
1270 a2f=_ProteinsResiduesArray(xl2)
1271 a2b=a2f.get_inverted()
1274 return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1276 def plot_matrix(self,figurename="clustermatrix.pdf"):
1277 import matplotlib
as mpl
1280 import matplotlib.pylab
as pl
1281 from scipy.cluster
import hierarchy
as hrc
1283 raw_distance_matrix=self.distances
1289 ax = fig.add_subplot(1,1,1)
1290 dendrogram = hrc.dendrogram(
1291 hrc.linkage(raw_distance_matrix),
1294 leaves_order = dendrogram[
'leaves']
1295 ax.set_xlabel(
'Dataset')
1296 ax.set_ylabel(
'Jaccard Distance')
1298 pl.savefig(
"dendrogram."+figurename, dpi=300)
1304 ax = fig.add_subplot(1,1,1)
1306 raw_distance_matrix[leaves_order,
1309 interpolation=
'nearest')
1310 cb = fig.colorbar(cax)
1311 cb.set_label(
'Jaccard Distance')
1312 ax.set_xlabel(
'Dataset')
1313 ax.set_ylabel(
'Dataset')
1314 ax.set_xticks(range(len(labels)))
1315 ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation=
'vertical')
1316 ax.set_yticks(range(len(labels)))
1317 ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation=
'horizontal')
1319 pl.savefig(
"matrix."+figurename, dpi=300)
1325 This class maps a CrossLinkDataBase on a given structure
1326 and save an rmf file with color-coded crosslinks
1330 def __init__(self,CrossLinkDataBase,rmf_or_stat_handler):
1331 self.CrossLinkDataBase=CrossLinkDataBase
1334 self.prots=rmf_or_stat_handler
1335 self.distances=defaultdict(list)
1339 print(
"computing distances fro all crosslinks and all structures")
1340 for i
in self.prots[::10]:
1341 self.compute_distances()
1342 for key,xl
in enumerate(self.CrossLinkDataBase):
1343 array=_ProteinsResiduesArray(xl)
1344 self.array_to_id[array]=key
1345 self.id_to_array[key]=array
1346 if xl[
"MinAmbiguousDistance"]
is not 'None':
1347 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1349 def compute_distances(self):
1352 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1353 for group
in sorted_group_ids:
1356 for xl
in self.CrossLinkDataBase.data_base[group]:
1358 sorted_ids=sorted(xl.keys())
1359 data.append(sorted_ids+[
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1360 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1362 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1371 values=[xl[k]
for k
in sorted_ids]
1372 values += [group, mdist]
1373 except KeyError
as e:
1374 print(
"MapCrossLinkDataBaseOnStructure KeyError: {0} in {1}".format(e, xl))
1375 group_dists.append(mdist)
1377 xl[
"Distance"]=mdist
1383 for xl
in self.CrossLinkDataBase.data_base[group]:
1384 xl[
"MinAmbiguousDistance"]=min(group_dists)
1386 def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1387 '''more robust and slower version of above'''
1389 selpart_1=sel.get_selected_particles()
1390 if len(selpart_1)==0:
1391 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1394 selpart_2=sel.get_selected_particles()
1395 if len(selpart_2)==0:
1396 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1399 for p1
in selpart_1:
1400 for p2
in selpart_2:
1401 if p1 == p2
and r1 == r2:
continue
1421 results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1422 if len(results)==0:
return None
1423 results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1424 return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1426 def save_rmf_snapshot(self,filename,color_id=None):
1427 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1430 for group
in sorted_group_ids:
1431 group_dists_particles=[]
1432 for xl
in self.CrossLinkDataBase.data_base[group]:
1433 xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1434 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1436 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1438 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1440 if color_id
is not None:
1441 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1443 group_dists_particles.append((mdist,p1,p2,xllabel,0.0))
1444 if group_dists_particles:
1445 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1446 color_scores.append(mincolor_score)
1447 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1452 linear.set_slope(1.0)
1454 rslin =
IMP.RestraintSet(self.prots.get_model(),
'linear_dummy_restraints')
1456 offset=min(color_scores)
1457 maxvalue=max(color_scores)
1458 for pair
in list_of_pairs:
1460 pr.set_name(pair[2])
1462 factor=(pair[3]-offset)/(maxvalue-offset)
1467 rslin.add_restraint(pr)
1470 rh = RMF.create_rmf_file(filename)
1477 def boxplot_crosslink_distances(self,filename):
1479 keys=[self.id_to_array[k]
for k
in self.distances.keys()]
1480 medians=[numpy.median(self.distances[self.array_to_id[k]])
for k
in keys]
1481 dists=[self.distances[self.array_to_id[k]]
for k
in keys]
1482 distances_sorted_by_median=[x
for _,x
in sorted(zip(medians,dists))]
1483 keys_sorted_by_median=[x
for _,x
in sorted(zip(medians,keys))]
1485 distances_sorted_by_median,
1486 range(len(distances_sorted_by_median)),
1487 xlabels=keys_sorted_by_median,scale_plot_length=0.2)
1489 def get_crosslink_violations(self,threshold):
1490 violations=defaultdict(list)
1491 for k
in self.distances:
1493 viols=self.distances[k]
1494 violations[self.id_to_array[k]]=viols
1500 This class generates a CrossLinkDataBase from a given structure
1502 def __init__(self,representation=None,
1504 residue_types_1=[
"K"],
1505 residue_types_2=[
"K"],
1506 reactivity_range=[0,1],
1512 cldbkc.set_protein1_key(
"Protein1")
1513 cldbkc.set_protein2_key(
"Protein2")
1514 cldbkc.set_residue1_key(
"Residue1")
1515 cldbkc.set_residue2_key(
"Residue2")
1517 if representation
is not None:
1520 self.representation=representation
1521 self.model=self.representation.model
1522 elif system
is not None:
1525 self.model=self.system.model
1528 print(
"Argument error: please provide either a representation object or a IMP.Hierarchy")
1530 self.residue_types_1=residue_types_1
1531 self.residue_types_2=residue_types_2
1533 self.indexes_dict1={}
1534 self.indexes_dict2={}
1535 self.protein_residue_dict={}
1536 self.reactivity_dictionary={}
1537 self.euclidean_interacting_pairs=
None
1538 self.xwalk_interacting_pairs=
None
1541 if self.mode==
"pmi1":
1542 for protein
in self.representation.sequence_dict.keys():
1544 seq=self.representation.sequence_dict[protein]
1545 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1551 rexp=numpy.random.exponential(0.1)
1552 prob=1.0-math.exp(-rexp)
1553 self.reactivity_dictionary[(protein,r)]=prob
1556 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1557 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1559 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1562 self.indexes_dict1[index]=(protein,r)
1563 self.protein_residue_dict[(protein,r)]=index
1565 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1568 self.indexes_dict2[index]=(protein,r)
1569 self.protein_residue_dict[(protein,r)]=index
1571 if self.mode==
"pmi2":
1572 for state
in self.system.get_states():
1573 for moleculename,molecules
in state.get_molecules().items():
1574 for molecule
in molecules:
1576 seq=molecule.sequence
1577 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1583 rexp=numpy.random.exponential(0.00000001)
1584 prob=1.0-math.exp(-rexp)
1585 self.reactivity_dictionary[(molecule,r)]=prob
1587 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1588 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1591 ps=s.get_selected_particles()
1594 self.indexes_dict1[index]=(molecule,r)
1595 self.protein_residue_dict[(molecule,r)]=index
1598 ps=s.get_selected_particles()
1601 self.indexes_dict2[index]=(molecule,r)
1602 self.protein_residue_dict[(molecule,r)]=index
1605 def get_all_possible_pairs(self):
1606 n=float(len(self.protein_residue_dict.keys()))
1607 return n*(n-1.0)/2.0
1609 def get_all_feasible_pairs(self,distance=21):
1611 particle_index_pairs=[]
1613 for a,b
in itertools.combinations(self.protein_residue_dict.keys(),2):
1616 index1=self.protein_residue_dict[a]
1617 index2=self.protein_residue_dict[b]
1619 if particle_distance <= distance:
1620 particle_index_pairs.append((index1,index2))
1621 if self.mode==
"pmi1":
1622 new_xl[self.cldb.protein1_key]=a[0]
1623 new_xl[self.cldb.protein2_key]=b[0]
1624 elif self.mode==
"pmi2":
1625 new_xl[self.cldb.protein1_key]=a[0].get_name()
1626 new_xl[self.cldb.protein2_key]=b[0].get_name()
1627 new_xl[
"molecule_object1"]=a[0]
1628 new_xl[
"molecule_object2"]=b[0]
1629 new_xl[self.cldb.residue1_key]=a[1]
1630 new_xl[self.cldb.residue2_key]=b[1]
1631 self.cldb.data_base[str(nxl)]=[new_xl]
1639 def get_data_base(self,total_number_of_spectra,
1640 ambiguity_probability=0.1,
1643 max_delta_distance=10.0,
1644 xwalk_bin_path=
None,
1645 confidence_false=0.75,
1646 confidence_true=0.75):
1648 from random
import random,uniform
1652 self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1653 self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1654 self.cldb.data_base[str(number_of_spectra)]=[]
1655 self.sites_weighted=
None
1657 while number_of_spectra<total_number_of_spectra:
1658 if random() > ambiguity_probability
and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1660 number_of_spectra+=1
1661 self.cldb.data_base[str(number_of_spectra)]=[]
1663 if random() > noise:
1665 pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1668 pra,dist=self.get_random_residue_pair(
None,
None,
None)
1672 if self.mode==
"pmi1":
1673 new_xl[self.cldb.protein1_key]=pra[0]
1674 new_xl[self.cldb.protein2_key]=pra[1]
1675 elif self.mode==
"pmi2":
1676 new_xl[self.cldb.protein1_key]=pra[0].get_name()
1677 new_xl[self.cldb.protein2_key]=pra[1].get_name()
1678 new_xl[
"molecule_object1"]=pra[0]
1679 new_xl[
"molecule_object2"]=pra[1]
1680 new_xl[self.cldb.residue1_key]=pra[2]
1681 new_xl[self.cldb.residue2_key]=pra[3]
1682 new_xl[
"Noisy"]=noisy
1684 new_xl[
"Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1685 new_xl[
"Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1686 r1=new_xl[
"Reactivity_Residue1"]
1687 r2=new_xl[
"Reactivity_Residue2"]
1693 new_xl[
"Score"]=np.random.beta(1.0,self.beta_false)
1696 new_xl[
"Score"]=1.0-np.random.beta(1.0,self.beta_true)
1697 new_xl[
"TargetDistance"]=dist
1698 new_xl[
"NoiseProbability"]=noise
1699 new_xl[
"AmbiguityProbability"]=ambiguity_probability
1701 (p1,p2)=
IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1702 self.protein_residue_dict[(pra[1],pra[3])]])
1707 new_xl[
"InterRigidBody"] =
False
1712 new_xl[
"InterRigidBody"] =
True
1714 new_xl[
"InterRigidBody"] =
None
1716 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1721 def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1724 from random
import choice,uniform
1725 if distance
is None:
1728 if self.mode==
"pmi1":
1729 protein1=choice(self.representation.sequence_dict.keys())
1730 protein2=choice(self.representation.sequence_dict.keys())
1731 seq1=self.representation.sequence_dict[protein1]
1732 seq2=self.representation.sequence_dict[protein2]
1733 residue1=choice([i
for i
in range(1,len(seq1)+1)
if seq1[i-1]
in self.residue_types_1])
1734 residue2=choice([i
for i
in range(1,len(seq2)+1)
if seq2[i-1]
in self.residue_types_2])
1735 h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
1736 h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
1738 if (protein1,residue1) != (protein2,residue2):
1740 elif self.mode==
"pmi2":
1741 (protein1,residue1)=choice(self.protein_residue_dict.keys())
1742 (protein2,residue2)=choice(self.protein_residue_dict.keys())
1743 index1=self.protein_residue_dict[(protein1,residue1)]
1744 index2=self.protein_residue_dict[(protein2,residue2)]
1746 if (protein1,residue1) != (protein2,residue2):
1750 if not xwalk_bin_path:
1752 gcpf.set_distance(distance+max_delta_distance)
1756 if not self.sites_weighted:
1757 self.sites_weighted=[]
1758 for key
in self.reactivity_dictionary:
1759 r=self.reactivity_dictionary[key]
1760 self.sites_weighted.append((key,r))
1762 first_site=self.weighted_choice(self.sites_weighted)
1764 if not self.euclidean_interacting_pairs:
1765 self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1766 self.indexes_dict1.keys(),
1767 self.indexes_dict2.keys())
1769 first_site_pairs = [pair
for pair
in self.euclidean_interacting_pairs
1770 if self.indexes_dict1[pair[0]] == first_site
or
1771 self.indexes_dict2[pair[1]] == first_site]
1772 if len(first_site_pairs)==0:
continue
1774 second_sites_weighted=[]
1775 for pair
in first_site_pairs:
1776 if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1777 if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1778 r=self.reactivity_dictionary[second_site]
1779 second_sites_weighted.append((second_site,r))
1780 second_site=self.weighted_choice(second_sites_weighted)
1782 interacting_pairs_weighted=[]
1783 for pair in self.euclidean_interacting_pairs:
1784 r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1785 r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1786 #combined reactivity 1-exp(-k12*Delta t),
1789 r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1790 interacting_pairs_weighted.append((pair,r12))
1791 #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1792 #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1793 #interacting_pairs_weighted.append((pair,weight1*weight2))
1796 pair=self.weighted_choice(interacting_pairs_weighted)
1797 protein1,residue1=self.indexes_dict1[pair[0]]
1798 protein2,residue2=self.indexes_dict2[pair[1]]
1799 particle_pair=IMP.get_particles(self.model,pair)
1800 particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1801 if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1803 elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1804 #allow some flexibility
1805 prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1806 if uniform(0.0,1.0)<prob: break
1808 protein1,residue1=first_site
1809 protein2,residue2=second_site
1810 print(
"CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1811 "First site",first_site,self.reactivity_dictionary[first_site],
1812 "Second site",second_site,self.reactivity_dictionary[second_site])
1813 particle_pair=
IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1816 if particle_distance<distance
and (protein1,residue1) != (protein2,residue2):
1818 elif particle_distance>=distance
and (protein1,residue1) != (protein2,residue2)
and max_delta_distance:
1822 if particle_distance-distance < max_delta_distance:
break
1827 if not self.xwalk_interacting_pairs:
1828 self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1829 interacting_pairs_weighted=[]
1831 for pair
in self.xwalk_interacting_pairs:
1836 weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1837 weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1838 interacting_pairs_weighted.append((pair,weight1*weight2))
1840 pair=self.weighted_choice(interacting_pairs_weighted)
1845 particle_distance=float(pair[4])
1849 return ((protein1,protein2,residue1,residue2)),particle_distance
1851 def get_xwalk_distances(self,xwalk_bin_path,distance):
1855 o.init_pdb(
"xwalk.pdb",self.representation.prot)
1856 o.write_pdb(
"xwalk.pdb")
1857 namechainiddict=o.dictchain[
"xwalk.pdb"]
1860 for key
in namechainiddict: chainiddict[namechainiddict[key]]=key
1861 xwalkout=os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+
' -bb').read()
1863 output_list_of_distance=[]
1864 for line
in xwalkout.split(
"\n")[0:-2]:
1865 tockens=line.split()
1868 distance=float(tockens[6])
1870 ss=second.split(
"-")
1873 protein1=chainiddict[chainid1]
1874 protein2=chainiddict[chainid2]
1877 output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1878 return output_list_of_distance
1881 def weighted_choice(self,choices):
1884 total = sum(w
for c, w
in choices)
1885 r = random.uniform(0, total)
1887 for c, w
in choices:
1891 assert False,
"Shouldn't get here"
1893 def save_rmf_snapshot(self,filename,color_id=None):
1896 if color_id
is None:
1897 color_id=
"Reactivity"
1899 sorted_group_ids=sorted(self.cldb.data_base.keys())
1902 for group
in sorted_group_ids:
1904 group_dists_particles=[]
1905 for xl
in self.cldb.data_base[group]:
1906 xllabel=self.cldb.get_short_cross_link_string(xl)
1907 (c1,c2,r1,r2)=(xl[
"molecule_object1"],xl[
"molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1909 index1=self.protein_residue_dict[(c1,r1)]
1910 index2=self.protein_residue_dict[(c2,r2)]
1912 mdist=xl[
"TargetDistance"]
1914 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1916 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1917 if group_dists_particles:
1918 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1919 color_scores.append(mincolor_score)
1920 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1927 linear.set_slope(1.0)
1931 offset=min(color_scores)
1932 maxvalue=max(color_scores)
1933 for pair
in list_of_pairs:
1935 pr.set_name(pair[2])
1936 factor=(pair[3]-offset)/(maxvalue-offset)
1939 rslin.add_restraint(pr)
1942 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
def get_number_of_unique_crosslinked_sites
Returns the number of non redundant crosslink sites.
def filter_score
Get all crosslinks with score greater than an input value.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
def create_new_keyword
This function creates a new keyword for the whole database and set the values from and existing keywo...
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
def merge
This function merges two cross-link datasets so that if two conflicting crosslinks have the same cros...
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded crossl...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def set_value
This function changes the value for a given key in the database For instance one can change the name ...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
def filter_out_same_residues
This function remove cross-links applied to the same residue (ie, same chain name and residue number)...
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def offset_residue_index
This function offset the residue indexes of a given protein by a specified value. ...
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
def jackknife
this method returns a CrossLinkDataBase class containing a random subsample of the original cross-lin...
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def check_cross_link_consistency
This function checks the consistency of the dataset with the amino acid sequence. ...
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
def get_values
this function returns the list of values for a given key in the database alphanumerically sorted ...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
def append_database
This function append one cross-link dataset to another.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
A class to handle different styles of site pairs parsers.
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
this class handles a cross-link dataset and do filtering operations, adding cross-links, merge datasets...
Support for the RMF file format for storing hierarchical molecular data and markup.
def rename_proteins
This function renames all proteins contained in the input dictionary from the old names (keys) to the...
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.
def create_set_from_file
if FixedFormatParser is not specified, the file is comma-separated-values