1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
7 from __future__
import print_function
21 def set_json_default(obj):
22 if isinstance(obj, set):
29 if sys.version_info[0] >= 3:
30 def _handle_string_input(inp):
31 if not isinstance(inp, str):
32 raise TypeError(
"expecting a string")
35 def _handle_string_input(inp):
36 if not isinstance(inp, (str, unicode)):
37 raise TypeError(
"expecting a string or unicode")
39 if isinstance(inp, unicode):
44 class _CrossLinkDataBaseStandardKeys(object):
46 This class setup all the standard keys needed to
47 identify the crosslink features from the data sets
51 self.protein1_key=
"Protein1"
52 self.type[self.protein1_key]=str
53 self.protein2_key=
"Protein2"
54 self.type[self.protein2_key]=str
55 self.residue1_key=
"Residue1"
56 self.type[self.residue1_key]=int
57 self.residue2_key=
"Residue2"
58 self.type[self.residue2_key]=int
59 self.residue1_amino_acid_key=
"Residue1AminoAcid"
60 self.type[self.residue1_amino_acid_key]=str
61 self.residue2_amino_acid_key=
"Residue2AminoAcid"
62 self.type[self.residue2_amino_acid_key]=str
63 self.residue1_moiety_key=
"Residue1Moiety"
64 self.type[self.residue1_moiety_key]=str
65 self.residue2_moiety_key=
"Residue2Moiety"
66 self.type[self.residue2_moiety_key]=str
67 self.site_pairs_key=
"SitePairs"
68 self.type[self.site_pairs_key]=str
69 self.unique_id_key=
"XLUniqueID"
70 self.type[self.unique_id_key]=str
71 self.unique_sub_index_key=
"XLUniqueSubIndex"
72 self.type[self.unique_sub_index_key]=int
73 self.unique_sub_id_key=
"XLUniqueSubID"
74 self.type[self.unique_sub_id_key]=str
75 self.data_set_name_key=
"DataSetName"
76 self.type[self.data_set_name_key]=str
77 self.cross_linker_chemical_key=
"CrossLinkerChemical"
78 self.type[self.cross_linker_chemical_key]=str
79 self.id_score_key=
"IDScore"
80 self.type[self.id_score_key]=float
82 self.type[self.fdr_key]=float
83 self.quantitation_key=
"Quantitation"
84 self.type[self.quantitation_key]=float
85 self.redundancy_key=
"Redundancy"
86 self.type[self.redundancy_key]=int
87 self.redundancy_list_key=
"RedundancyList"
88 self.type[self.redundancy_key]=list
89 self.ambiguity_key=
"Ambiguity"
90 self.type[self.ambiguity_key]=int
91 self.residue1_links_number_key=
"Residue1LinksNumber"
92 self.type[self.residue1_links_number_key]=int
93 self.residue2_links_number_key=
"Residue2LinksNumber"
94 self.type[self.residue2_links_number_key]=int
95 self.type[self.ambiguity_key]=int
96 self.state_key=
"State"
97 self.type[self.state_key]=int
98 self.sigma1_key=
"Sigma1"
99 self.type[self.sigma1_key]=str
100 self.sigma2_key=
"Sigma2"
101 self.type[self.sigma2_key]=str
103 self.type[self.psi_key]=str
104 self.distance_key=
"Distance"
105 self.type[self.distance_key]=float
106 self.min_ambiguous_distance_key=
"MinAmbiguousDistance"
107 self.type[self.distance_key]=float
109 self.link_type_key=
"LinkType"
110 self.type[self.link_type_key]=str
112 self.ordered_key_list =[self.data_set_name_key,
114 self.unique_sub_index_key,
115 self.unique_sub_id_key,
120 self.residue1_amino_acid_key,
121 self.residue2_amino_acid_key,
122 self.residue1_moiety_key,
123 self.residue2_moiety_key,
124 self.cross_linker_chemical_key,
127 self.quantitation_key,
129 self.redundancy_list_key,
135 self.min_ambiguous_distance_key,
139 class _ProteinsResiduesArray(tuple):
141 This class is inherits from tuple, and it is a shorthand for a cross-link
142 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1 and protein2, r1 and r2 are
143 residue1 and residue2.
146 def __new__(self,input_data):
148 @input_data can be a dict or a tuple
150 self.cldbsk=_CrossLinkDataBaseStandardKeys()
151 if type(input_data)
is dict:
153 p1=input_data[self.cldbsk.protein1_key]
155 p2=input_data[self.cldbsk.protein2_key]
158 r1=input_data[self.cldbsk.residue1_key]
160 r2=input_data[self.cldbsk.residue2_key]
167 elif type(input_data)
is tuple:
168 if len(input_data)>4
or len(input_data)==3
or len(input_data)==1:
169 raise TypeError(
"_ProteinsResiduesArray: must have only 4 elements")
170 if len(input_data)==4:
171 p1 = _handle_string_input(input_data[0])
172 p2 = _handle_string_input(input_data[1])
175 if (
not (type(r1)
is int))
and (
not (r1
is None)):
176 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
177 if (
not (type(r2)
is int))
and (
not (r1
is None)):
178 raise TypeError(
"_ProteinsResiduesArray: residue2 must be a integer")
180 if len(input_data) == 2:
181 p1 = _handle_string_input(input_data[0])
183 if type(r1)
is not int:
184 raise TypeError(
"_ProteinsResiduesArray: residue1 must be a integer")
187 raise TypeError(
"_ProteinsResiduesArray: input must be a dict or tuple")
188 return tuple.__new__(_ProteinsResiduesArray, t)
190 def get_inverted(self):
192 Returns a _ProteinsResiduesArray instance with protein1 and protein2 inverted
194 return _ProteinsResiduesArray((self[1],self[0],self[3],self[2]))
197 outstr=self.cldbsk.protein1_key+
" "+str(self[0])
198 outstr+=
" "+self.cldbsk.protein2_key+
" "+str(self[1])
199 outstr+=
" "+self.cldbsk.residue1_key+
" "+str(self[2])
200 outstr+=
" "+self.cldbsk.residue2_key+
" "+str(self[3])
205 This class allows to create filter functions that can be passed to the CrossLinkDataBase
208 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
210 where cldb is CrossLinkDataBase instance and it is only needed to get the standard keywords
212 A filter operator can be evaluate on a CrossLinkDataBase item xl and returns a boolean
216 and it is used to filter the database
219 def __init__(self, argument1, operator, argument2):
221 (argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value)
222 or (FilterOperator1,operator.or|and...,FilterOperator2)
224 if isinstance(argument1, FilterOperator):
225 self.operations = [argument1, operator, argument2]
228 self.values = (argument1, operator, argument2)
230 def __or__(self, FilterOperator2):
233 def __and__(self, FilterOperator2):
236 def __invert__(self):
239 def evaluate(self, xl_item):
241 if len(self.operations) == 0:
242 keyword, operator, value = self.values
243 return operator(xl_item[keyword], value)
244 FilterOperator1, op, FilterOperator2 = self.operations
246 if FilterOperator2
is None:
247 return op(FilterOperator1.evaluate(xl_item))
249 return op(FilterOperator1.evaluate(xl_item), FilterOperator2.evaluate(xl_item))
252 def filter_factory(xl_):
254 class FilterOperator(object):
258 def __new__(self,key,value,oper=operator.eq):
259 return oper(self.xl[key],value)
261 return FilterOperator
266 This class is needed to convert the keywords from a generic database
272 @param list_parser an instance of ResiduePairListParser, if any is needed
275 self.backward_converter={}
276 _CrossLinkDataBaseStandardKeys.__init__(self)
277 self.rplp = list_parser
278 if self.rplp
is None:
280 self.compulsory_keys=set([self.protein1_key,
285 self.compulsory_keys=self.rplp.get_compulsory_keys()
286 self.setup_keys=set()
290 Is a function that check whether necessary keys are setup
293 if self.compulsory_keys & setup_keys != self.compulsory_keys:
294 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup all necessary keys")
298 Returns the keys that have been setup so far
300 return self.backward_converter.keys()
304 This sets up the standard compulsory keys as defined by
305 _CrossLinkDataBaseStandardKeys
307 for ck
in self.compulsory_keys:
308 self.converter[ck]=ck
309 self.backward_converter[ck]=ck
311 def set_unique_id_key(self,origin_key):
312 self.converter[origin_key]=self.unique_id_key
313 self.backward_converter[self.unique_id_key]=origin_key
315 def set_protein1_key(self,origin_key):
316 self.converter[origin_key]=self.protein1_key
317 self.backward_converter[self.protein1_key]=origin_key
319 def set_protein2_key(self,origin_key):
320 self.converter[origin_key]=self.protein2_key
321 self.backward_converter[self.protein2_key]=origin_key
323 def set_residue1_key(self,origin_key):
324 self.converter[origin_key]=self.residue1_key
325 self.backward_converter[self.residue1_key]=origin_key
327 def set_residue2_key(self,origin_key):
328 self.converter[origin_key]=self.residue2_key
329 self.backward_converter[self.residue2_key]=origin_key
331 def set_residue1_amino_acid_key(self, origin_key):
332 self.converter[origin_key] = self.residue1_amino_acid_key
333 self.backward_converter[self.residue1_amino_acid_key] = origin_key
335 def set_residue2_amino_acid_key(self, origin_key):
336 self.converter[origin_key] = self.residue2_amino_acid_key
337 self.backward_converter[self.residue2_amino_acid_key] = origin_key
339 def set_residue1_moiety_key(self, origin_key):
340 self.converter[origin_key] = self.residue1_moiety_key
341 self.backward_converter[self.residue1_moiety_key] = origin_key
343 def set_residue2_moiety_key(self, origin_key):
344 self.converter[origin_key] = self.residue2_moiety_key
345 self.backward_converter[self.residue2_moiety_key] = origin_key
347 def set_site_pairs_key(self,origin_key):
348 self.converter[origin_key]=self.site_pairs_key
349 self.backward_converter[self.site_pairs_key]=origin_key
351 def set_id_score_key(self,origin_key):
352 self.converter[origin_key]=self.id_score_key
353 self.backward_converter[self.id_score_key]=origin_key
355 def set_fdr_key(self,origin_key):
356 self.converter[origin_key]=self.fdr_key
357 self.backward_converter[self.fdr_key]=origin_key
359 def set_quantitation_key(self,origin_key):
360 self.converter[origin_key]=self.quantitation_key
361 self.backward_converter[self.quantitation_key]=origin_key
363 def set_psi_key(self,origin_key):
364 self.converter[origin_key]=self.psi_key
365 self.backward_converter[self.psi_key]=origin_key
367 def set_link_type_key(self,link_type_key):
368 self.converter[link_type_key]=self.link_type_key
369 self.backward_converter[self.link_type_key]=link_type_key
373 Returns the dictionary that convert the old keywords to the new ones
376 return self.converter
380 Returns the dictionary that convert the new keywords to the old ones
383 return self.backward_converter
387 A class to handle different styles of site pairs parsers.
389 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for crosslinks
390 [Y3-;K4-] for dead-ends
391 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
392 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
397 def __init__(self,style):
399 _CrossLinkDataBaseStandardKeys.__init__(self)
400 if style
is "MSSTUDIO":
402 self.compulsory_keys= set([self.protein1_key,
404 self.site_pairs_key])
405 elif style
is "XTRACT" or style
is "QUANTITATION":
407 self.compulsory_keys= set([self.site_pairs_key])
409 raise Error(
"ResiduePairListParser: unknown list parser style")
411 def get_compulsory_keys(self):
412 return self.compulsory_keys
416 This function returns a list of cross-linked residues and the corresponding list of
417 cross-linked chains. The latter list can be empty, if the style doesn't have the
418 corresponding information.
420 if self.style ==
"MSSTUDIO":
421 input_string=input_string.replace(
"[",
"")
422 input_string=input_string.replace(
"]",
"")
423 input_string_pairs=input_string.split(
";")
424 residue_pair_indexes=[]
425 chain_pair_indexes=[]
426 for s
in input_string_pairs:
427 m1=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',s)
428 m2=self.re.search(
r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$',s)
431 residue_type_1,residue_index_1,residue_type_2,residue_index_2=m1.group(1,2,3,4)
432 residue_pair_indexes.append((residue_index_1,residue_index_2))
435 residue_type_1,residue_index_1=m2.group(1,2)
437 return residue_pair_indexes,chain_pair_indexes
438 if self.style
is "XTRACT" or self.style
is "QUANTITATION":
439 if ":x:" in input_string:
441 input_strings=input_string.split(
":x:")
442 first_peptides=input_strings[0].split(
":|:")
443 second_peptides=input_strings[1].split(
":|:")
444 first_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in first_peptides]
445 second_peptides_indentifiers=[(f.split(
":")[0],f.split(
":")[1])
for f
in second_peptides]
446 residue_pair_indexes=[]
447 chain_pair_indexes=[]
448 for fpi
in first_peptides_indentifiers:
449 for spi
in second_peptides_indentifiers:
454 residue_pair_indexes.append((residue1,residue2))
455 chain_pair_indexes.append((chain1,chain2))
456 return residue_pair_indexes, chain_pair_indexes
459 first_peptides = input_string.split(
":|:")
460 first_peptides_indentifiers = [(f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
463 for fpi
in first_peptides_indentifiers:
466 residue_indexes.append((residue1,))
467 chain_indexes.append((chain1,))
468 return residue_indexes, chain_indexes
473 A class to handle different XL format with fixed format
474 currently support ProXL
476 def __init__(self,format):
478 _CrossLinkDataBaseStandardKeys.__init__(self)
479 if format
is "PROXL":
482 raise Error(
"FixedFormatParser: unknown list format name")
485 def get_data(self,input_string):
486 if self.format
is "PROXL":
487 tockens=input_string.split(
"\t")
489 if tockens[0]==
"SEARCH ID(S)":
492 xl[self.protein1_key]=tockens[2]
493 xl[self.protein2_key]=tockens[4]
494 xl[self.residue1_key]=int(tockens[3])
495 xl[self.residue2_key]=int(tockens[5])
498 class CrossLinkDataBase(_CrossLinkDataBaseStandardKeys):
501 this class handles a cross-link dataset and do filtering
502 operations, adding cross-links, merge datasets...
505 def __init__(self, converter=None, data_base=None, fasta_seq=None, linkable_aa=(
'K',)):
508 @param converter an instance of CrossLinkDataBaseKeywordsConverter
509 @param data_base an instance of CrossLinkDataBase to build the new database on
510 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing protein fasta sequences to check
511 crosslink consistency. If not given consistency will not be checked
512 @param linkable_aa a tuple containing one-letter amino acids which are linkable by the crosslinker;
513 only used if the database DOES NOT provide a value for a certain residueX_amino_acid_key
514 and if a fasta_seq is given
517 if data_base
is None:
520 self.data_base=data_base
522 _CrossLinkDataBaseStandardKeys.__init__(self)
523 if converter
is not None:
524 self.cldbkc = converter
525 self.list_parser=self.cldbkc.rplp
526 self.converter = converter.get_converter()
530 self.list_parser=
None
531 self.converter =
None
534 self.def_aa_tuple = linkable_aa
535 self.fasta_seq = fasta_seq
540 Update the whole dataset after changes
542 self.update_cross_link_unique_sub_index()
543 self.update_cross_link_redundancy()
544 self.update_residues_links_number()
545 self.check_cross_link_consistency()
549 sorted_ids=sorted(self.data_base.keys())
551 for xl
in self.data_base[k]:
554 def xlid_iterator(self):
555 sorted_ids=sorted(self.data_base.keys())
556 for xlid
in sorted_ids:
559 def __getitem__(self,xlid):
560 return self.data_base[xlid]
563 return len([xl
for xl
in self])
568 def set_name(self,name):
570 for k
in self.data_base:
571 new_data_base[k+
"."+name]=self.data_base[k]
572 self.data_base=new_data_base
576 def get_number_of_xlid(self):
577 return len(self.data_base)
580 def create_set_from_file(self,file_name,converter=None,FixedFormatParser=None):
582 if FixedFormatParser is not specified, the file is comma-separated-values
583 @param file_name a txt file to be parsed
584 @param converter an instance of CrossLinkDataBaseKeywordsConverter
585 @param FixedFormatParser a parser for a fixed format
587 if not FixedFormatParser:
588 xl_list=IMP.pmi.tools.get_db_from_csv(file_name)
590 if converter
is not None:
591 self.cldbkc = converter
592 self.list_parser=self.cldbkc.rplp
593 self.converter = converter.get_converter()
596 if not self.list_parser:
600 for nxl,xl
in enumerate(xl_list):
603 if k
in self.converter:
604 new_xl[self.converter[k]]=self.type[self.converter[k]](xl[k])
607 if self.unique_id_key
in self.cldbkc.get_setup_keys():
608 if new_xl[self.unique_id_key]
not in new_xl_dict:
609 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
611 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
613 if str(nxl)
not in new_xl_dict:
614 new_xl_dict[str(nxl)]=[new_xl]
616 new_xl_dict[str(nxl)].append(new_xl)
621 for nxl,entry
in enumerate(xl_list):
625 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
626 raise Error(
"CrossLinkDataBase: expecting a site_pairs_key for the site pair list parser")
628 if k
in self.converter:
629 new_dict[self.converter[k]]=self.type[self.converter[k]](entry[k])
633 residue_pair_list,chain_pair_list=self.list_parser.get_list(new_dict[self.site_pairs_key])
636 for n,p
in enumerate(residue_pair_list):
643 new_xl[k]=new_dict[k]
644 new_xl[self.residue1_key]=self.type[self.residue1_key](p[0])
646 new_xl[self.residue2_key]=self.type[self.residue2_key](p[1])
648 if len(chain_pair_list)==len(residue_pair_list):
649 new_xl[self.protein1_key]=self.type[self.protein1_key](chain_pair_list[n][0])
651 new_xl[self.protein2_key]=self.type[self.protein2_key](chain_pair_list[n][1])
654 new_xl[self.link_type_key]=
"CROSSLINK"
656 new_xl[self.link_type_key]=
"MONOLINK"
658 if self.unique_id_key
in self.cldbkc.get_setup_keys():
659 if new_xl[self.unique_id_key]
not in new_xl_dict:
660 new_xl_dict[new_xl[self.unique_id_key]]=[new_xl]
662 new_xl_dict[new_xl[self.unique_id_key]].append(new_xl)
664 if str(nxl)
not in new_xl_dict:
665 new_xl[self.unique_id_key]=str(nxl+1)
666 new_xl_dict[str(nxl)]=[new_xl]
668 new_xl[self.unique_id_key]=str(nxl+1)
669 new_xl_dict[str(nxl)].append(new_xl)
674 if FixedFormatParser is defined
678 f=open(file_name,
"r")
681 xl=FixedFormatParser.get_data(line)
683 xl[self.unique_id_key]=str(nxl+1)
684 new_xl_dict[str(nxl)]=[xl]
688 self.data_base=new_xl_dict
692 def update_cross_link_unique_sub_index(self):
693 for k
in self.data_base:
694 for n,xl
in enumerate(self.data_base[k]):
695 xl[self.ambiguity_key]=len(self.data_base[k])
696 xl[self.unique_sub_index_key]=n+1
697 xl[self.unique_sub_id_key]=k+
"."+str(n+1)
699 def update_cross_link_redundancy(self):
700 redundancy_data_base={}
702 pra=_ProteinsResiduesArray(xl)
703 if pra
not in redundancy_data_base:
704 redundancy_data_base[pra]=[xl[self.unique_sub_id_key]]
705 redundancy_data_base[pra.get_inverted()]=[xl[self.unique_sub_id_key]]
707 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
708 redundancy_data_base[pra.get_inverted()].append(xl[self.unique_sub_id_key])
710 pra=_ProteinsResiduesArray(xl)
711 xl[self.redundancy_key]=len(redundancy_data_base[pra])
712 xl[self.redundancy_list_key]=redundancy_data_base[pra]
714 def update_residues_links_number(self):
717 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
718 if (p1,r1)
not in residue_links:
719 residue_links[(p1,r1)]=set([(p2,r2)])
721 residue_links[(p1,r1)].add((p2,r2))
722 if (p2,r2)
not in residue_links:
723 residue_links[(p2,r2)]=set([(p1,r1)])
725 residue_links[(p2,r2)].add((p1,r1))
728 (p1,p2,r1,r2)=_ProteinsResiduesArray(xl)
729 xl[self.residue1_links_number_key]=len(residue_links[(p1,r1)])
730 xl[self.residue2_links_number_key]=len(residue_links[(p2,r2)])
732 def check_cross_link_consistency(self):
733 if self.cldbkc
and self.fasta_seq:
734 cnt_matched, cnt_matched_file = 0, 0
738 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
739 b_matched_file =
False
740 if self.residue1_amino_acid_key
in xl:
742 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
743 b_matched = self._match_xlinks(p1, r1, aa_from_file)
744 b_matched_file = b_matched
747 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
749 matched, non_matched = self._update_matched_xlinks(b_matched, p1, r1, matched, non_matched)
751 if self.residue2_amino_acid_key
in xl:
752 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
753 b_matched = self._match_xlinks(p2, r2, aa_from_file)
754 b_matched_file = b_matched
756 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
758 matched, non_matched = self._update_matched_xlinks(b_matched, p2, r2, matched, non_matched)
759 if b_matched: cnt_matched += 1
760 if b_matched_file: cnt_matched_file += 1
762 percentage_matched = round(100*cnt_matched/len(self),1)
763 percentage_matched_file = round(100 * cnt_matched_file / len(self), 1)
765 if matched
or non_matched: print(
"check_cross_link_consistency: Out of %d crosslinks "
766 "%d were matched to the fasta sequence (%f %%).\n "
767 "%d were matched by using the crosslink file (%f %%)."%
768 (len(self),cnt_matched,percentage_matched,cnt_matched_file,
769 percentage_matched_file) )
770 if non_matched: print(
"check_cross_link_consistency: Warning: Non matched xlinks:",
771 [(prot_name, sorted(list(non_matched[prot_name])))
for prot_name
in non_matched])
772 return matched,non_matched
774 def _match_xlinks(self, prot_name, res_index, aa_tuple):
779 for amino_acid
in aa_tuple:
780 if len(amino_acid) == 3:
781 amino_acid = amino_dict[amino_acid.upper()]
782 if prot_name
in self.fasta_seq.sequences:
783 seq = self.fasta_seq.sequences[prot_name]
788 if res_index == 0
or res_index == 1:
790 if res_index < len(seq):
791 if amino_acid == seq[res_index]:
797 def _update_matched_xlinks(self, b_matched, prot, res, matched, non_matched):
800 matched[prot].add(res)
802 matched[prot] = set([res])
804 if prot
in non_matched:
805 non_matched[prot].add(res)
807 non_matched[prot] = set([res])
808 return matched, non_matched
811 def get_cross_link_string(self,xl):
813 for k
in self.ordered_key_list:
815 string+=str(k)+
":"+str(xl[k])+
"|"
820 if k
not in self.ordered_key_list:
821 string+=str(k)+
":"+str(xl[k])+
"|"
825 def get_short_cross_link_string(self,xl):
828 list_of_keys=[self.data_set_name_key,
829 self.unique_sub_id_key,
837 for k
in list_of_keys:
839 string+=str(xl[k])+
"|"
845 def filter(self,FilterOperator):
847 for id
in self.data_base.keys():
848 for xl
in self.data_base[id]:
849 if FilterOperator.evaluate(xl):
850 if id
not in new_xl_dict:
853 new_xl_dict[id].append(xl)
855 return CrossLinkDataBase(self.cldbkc,new_xl_dict)
858 def merge(self,CrossLinkDataBase1,CrossLinkDataBase2):
860 This function merges two cross-link datasets so that if two conflicting crosslinks have the same
861 cross-link UniqueIDS, the cross-links will be appended under the same UniqueID slots
862 with different SubIDs
866 def append_database(self,CrossLinkDataBase2):
868 This function append one cross-link dataset to another. Unique ids will be renamed to avoid
871 name1=self.get_name()
872 name2=CrossLinkDataBase2.get_name()
875 name2=id(CrossLinkDataBase2)
879 for k
in self.data_base:
880 new_data_base[k]=self.data_base[k]
881 for k
in CrossLinkDataBase2.data_base:
882 new_data_base[k]=CrossLinkDataBase2.data_base[k]
883 self.data_base=new_data_base
886 def set_value(self,key,new_value,FilterOperator=None):
888 This function changes the value for a given key in the database
889 For instance one can change the name of a protein
890 @param key: the key in the database that must be changed
891 @param new_value: the new value of the key
892 @param FilterOperator: optional FilterOperator to change the value to
893 a subset of the database
895 example: `cldb1.set_value(cldb1.protein1_key,'FFF',FO(cldb.protein1_key,operator.eq,"AAA"))`
899 if FilterOperator
is not None:
900 if FilterOperator.evaluate(xl):
906 def get_values(self,key):
908 this function returns the list of values for a given key in the database
909 alphanumerically sorted
914 return sorted(list(values))
916 def offset_residue_index(self,protein_name,offset):
918 This function offset the residue indexes of a given protein by a specified value
919 @param protein_name: the protein name that need to be changed
920 @param offset: the offset value
924 if xl[self.protein1_key] == protein_name:
925 xl[self.residue1_key]=xl[self.residue1_key]+offset
926 if xl[self.protein2_key] == protein_name:
927 xl[self.residue2_key]=xl[self.residue2_key]+offset
930 def create_new_keyword(self,keyword,values_from_keyword=None):
932 This function creates a new keyword for the whole database and set the values from
933 and existing keyword (optional), otherwise the values are set to None
934 @param keyword the new keyword name:
935 @param values_from_keyword the keyword from which we are copying the values:
938 if values_from_keyword
is not None:
939 xl[keyword] = xl[values_from_keyword]
944 def rename_proteins(self,old_to_new_names_dictionary):
946 This function renames all proteins contained in the input dictionary
947 from the old names (keys) to the new name (values)
950 for old_name
in old_to_new_names_dictionary:
951 new_name=old_to_new_names_dictionary[old_name]
952 fo2=FilterOperator(self.protein1_key,operator.eq,old_name)
953 self.set_value(self.protein1_key,new_name,fo2)
954 fo2=FilterOperator(self.protein2_key,operator.eq,old_name)
955 self.set_value(self.protein2_key,new_name,fo2)
957 def clone_protein(self,protein_name,new_protein_name):
959 for id
in self.data_base.keys():
961 for xl
in self.data_base[id]:
962 new_data_base.append(xl)
963 if xl[self.protein1_key]==protein_name
and xl[self.protein2_key]!=protein_name:
965 new_xl[self.protein1_key]=new_protein_name
966 new_data_base.append(new_xl)
967 elif xl[self.protein1_key]!=protein_name
and xl[self.protein2_key]==protein_name:
969 new_xl[self.protein2_key]=new_protein_name
970 new_data_base.append(new_xl)
971 elif xl[self.protein1_key]==protein_name
and xl[self.protein2_key]==protein_name:
973 new_xl[self.protein1_key]=new_protein_name
974 new_data_base.append(new_xl)
976 new_xl[self.protein2_key]=new_protein_name
977 new_data_base.append(new_xl)
979 new_xl[self.protein1_key]=new_protein_name
980 new_xl[self.protein2_key]=new_protein_name
981 new_data_base.append(new_xl)
982 self.data_base[id]=new_data_base
985 def filter_out_same_residues(self):
987 This function remove cross-links applied to the same residue
988 (ie, same chain name and residue number)
991 for id
in self.data_base.keys():
993 for xl
in self.data_base[id]:
994 if xl[self.protein1_key]==xl[self.protein2_key]
and xl[self.residue1_key]==xl[self.residue2_key]:
997 new_data_base.append(xl)
998 self.data_base[id]=new_data_base
1002 def jackknife(self,percentage):
1004 this method returns a CrossLinkDataBase class containing
1005 a random subsample of the original cross-link database.
1006 @param percentage float between 0 and 1, is the percentage of
1007 of spectra taken from the original list
1010 if percentage > 1.0
or percentage < 0.0:
1011 raise ValueError(
'the percentage of random cross-link spectra should be between 0 and 1')
1012 nspectra=self.get_number_of_xlid()
1013 nrandom_spectra=int(nspectra*percentage)
1014 random_keys=random.sample(self.data_base.keys(),nrandom_spectra)
1016 for k
in random_keys:
1017 new_data_base[k]=self.data_base[k]
1018 return CrossLinkDataBase(self.cldbkc,new_data_base)
1022 sorted_ids=sorted(self.data_base.keys())
1024 for id
in sorted_ids:
1026 for xl
in self.data_base[id]:
1027 for k
in self.ordered_key_list:
1029 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1034 if k
not in self.ordered_key_list:
1035 outstr+=
"--- "+str(k)+
" "+str(xl[k])+
"\n"
1036 outstr+=
"-------------\n"
1040 def plot(self,filename,**kwargs):
1042 matplotlib.use(
'Agg')
1043 import matplotlib.pyplot
as plt
1044 import matplotlib.colors
1048 if kwargs[
"type"] ==
"scatter":
1049 cmap=plt.get_cmap(
"rainbow")
1050 norm=matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1053 if "colorkey" in kwargs:
1054 colorkey=kwargs[
"colorkey"]
1055 if "sizekey" in kwargs:
1056 sizekey=kwargs[
"sizekey"]
1057 if "logyscale" in kwargs:
1058 logyscale=kwargs[
"logyscale"]
1066 xs.append(float(xl[xkey]))
1068 ys.append(math.log10(float(xl[ykey])))
1070 ys.append(float(xl[ykey]))
1071 colors.append(float(xl[colorkey]))
1073 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1077 for color
in colors:
1078 cindex=(color-min(colors))/(max(colors)-min(colors))
1079 cs.append(cmap(cindex))
1082 ax = fig.add_subplot(111)
1083 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8,marker=
"o")
1086 plt.savefig(filename)
1090 if kwargs[
"type"] ==
"residue_links":
1094 molecule=kwargs[
"molecule"]
1096 length=len(molecule.sequence)
1097 molecule=molecule.get_name()
1100 sequences_object=kwargs[
"sequences_object"]
1101 sequence=sequences_object.sequences[molecule]
1102 length=len(sequence)
1104 histogram=[0]*length
1106 if xl[self.protein1_key]==molecule:
1107 histogram[xl[self.residue1_key]-1]=xl[self.residue1_links_number_key]
1108 if xl[self.protein2_key]==molecule:
1109 histogram[xl[self.residue2_key]-1]=xl[self.residue2_links_number_key]
1110 rects = plt.bar(range(1,length+1), histogram)
1117 plt.savefig(filename)
1123 if kwargs[
"type"] ==
"histogram":
1124 valuekey=kwargs[
"valuekey"]
1125 reference_xline=kwargs[
"reference_xline"]
1131 values_list.append(float(xl[valuekey]))
1133 print(
"CrossLinkDataBase.plot: Value error for cross-link %s" % (xl[self.unique_id_key]))
1136 filename, [values_list], valuename=valuename, bins=bins,
1137 colors=
None, format=
"pdf",
1138 reference_xline=
None, yplotrange=
None,
1139 xplotrange=
None,normalized=
True,
1142 def dump(self,json_filename):
1144 with open(json_filename,
'w')
as fp:
1145 json.dump(self.data_base, fp, sort_keys=
True, indent=2, default=set_json_default)
1147 def load(self,json_filename):
1149 with open(json_filename,
'r') as fp:
1150 self.data_base = json.load(fp)
1154 if sys.version_info[0] < 3:
1156 for k,v
in xl.iteritems():
1157 if type(k)
is unicode: k=str(k)
1158 if type(v)
is unicode: v=str(v)
1161 def save_csv(self,filename):
1167 sorted_group_ids=sorted(self.data_base.keys())
1168 for group
in sorted_group_ids:
1170 for xl
in self.data_base[group]:
1172 sorted_ids=sorted(xl.keys())
1173 values=[xl[k]
for k
in sorted_ids]
1174 group_block.append(values)
1178 with open(filename,
'w')
as fp:
1179 a = csv.writer(fp, delimiter=
',')
1180 a.writerow(sorted_ids)
1185 """This class allows to compute and plot the distance between datasets"""
1188 """Input a dictionary where keys are cldb names and values are cldbs"""
1189 import scipy.spatial.distance
1190 self.dbs=cldb_dictionary
1191 self.keylist=list(self.dbs.keys())
1192 self.distances=list()
1195 for i,key1
in enumerate(self.keylist):
1196 for key2
in self.keylist[i+1:]:
1197 distance=self.get_distance(key1,key2)
1198 self.distances.append(distance)
1200 self.distances=scipy.spatial.distance.squareform(self.distances)
1203 return 1.0-self.
jaccard_index(self.dbs[key1],self.dbs[key2])
1206 """Similarity index between two datasets
1207 https://en.wikipedia.org/wiki/Jaccard_index"""
1211 for xl1
in CrossLinkDataBase1:
1212 a1f=_ProteinsResiduesArray(xl1)
1213 a1b=a1f.get_inverted()
1216 for xl2
in CrossLinkDataBase2:
1217 a2f=_ProteinsResiduesArray(xl2)
1218 a2b=a2f.get_inverted()
1221 return float(len(set1&set2)/2)/(len(set1)/2+len(set2)/2-len(set1&set2)/2)
1223 def plot_matrix(self,figurename="clustermatrix.pdf"):
1224 import matplotlib
as mpl
1227 import matplotlib.pylab
as pl
1228 from scipy.cluster
import hierarchy
as hrc
1230 raw_distance_matrix=self.distances
1236 ax = fig.add_subplot(1,1,1)
1237 dendrogram = hrc.dendrogram(
1238 hrc.linkage(raw_distance_matrix),
1241 leaves_order = dendrogram[
'leaves']
1242 ax.set_xlabel(
'Dataset')
1243 ax.set_ylabel(
'Jaccard Distance')
1245 pl.savefig(
"dendrogram."+figurename, dpi=300)
1251 ax = fig.add_subplot(1,1,1)
1253 raw_distance_matrix[leaves_order,
1256 interpolation=
'nearest')
1257 cb = fig.colorbar(cax)
1258 cb.set_label(
'Jaccard Distance')
1259 ax.set_xlabel(
'Dataset')
1260 ax.set_ylabel(
'Dataset')
1261 ax.set_xticks(range(len(labels)))
1262 ax.set_xticklabels(numpy.array(labels)[leaves_order], rotation=
'vertical')
1263 ax.set_yticks(range(len(labels)))
1264 ax.set_yticklabels(numpy.array(labels)[leaves_order], rotation=
'horizontal')
1266 pl.savefig(
"matrix."+figurename, dpi=300)
1278 This class maps a CrossLinkDataBase on a given structure
1279 and save an rmf file with the color coded crosslinks
1283 def __init__(self,model,CrossLinkDataBase,rmf_name,frame_index):
1284 self.CrossLinkDataBase=CrossLinkDataBase
1285 rh = RMF.open_rmf_file_read_only(rmf_name)
1289 self.compute_distances()
1292 def compute_distances(self):
1295 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1296 for group
in sorted_group_ids:
1299 for xl
in self.CrossLinkDataBase.data_base[group]:
1301 sorted_ids=sorted(xl.keys())
1302 data.append(sorted_ids+[
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1303 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1305 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1312 values=[xl[k]
for k
in sorted_ids]
1313 values+=[group,mdist]
1314 group_dists.append(mdist)
1316 xl[
"Distance"]=mdist
1322 for xl
in self.CrossLinkDataBase.data_base[group]:
1323 xl[
"MinAmbiguousDistance"]=min(group_dists)
1325 def _get_distance_and_particle_pair(self,r1,c1,r2,c2):
1326 '''more robust and slower version of above'''
1328 selpart_1=sel.get_selected_particles()
1329 if len(selpart_1)==0:
1330 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for first site")
1333 selpart_2=sel.get_selected_particles()
1334 if len(selpart_2)==0:
1335 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle selected for second site")
1338 for p1
in selpart_1:
1339 for p2
in selpart_2:
1340 if p1 == p2
and r1 == r2:
continue
1360 results.append((dist,state_index1,copy_index1,state_index2,copy_index2,p1,p2))
1361 if len(results)==0:
return None
1362 results_sorted = sorted(results, key=operator.itemgetter(0,1,2,3,4))
1363 return (results_sorted[0][0],results_sorted[0][5],results_sorted[0][6]),(results_sorted[0][1],results_sorted[0][2],results_sorted[0][3],results_sorted[0][4])
1365 def save_rmf_snapshot(self,filename,color_id=None):
1368 if color_id
is None:
1369 color_id=self.CrossLinkDataBase.id_score_key
1370 sorted_group_ids=sorted(self.CrossLinkDataBase.data_base.keys())
1373 for group
in sorted_group_ids:
1374 group_dists_particles=[]
1375 for xl
in self.CrossLinkDataBase.data_base[group]:
1376 xllabel=self.CrossLinkDataBase.get_short_cross_link_string(xl)
1377 (c1,c2,r1,r2)=_ProteinsResiduesArray(xl)
1380 (mdist,p1,p2),(state1,copy1,state2,copy2)=self._get_distance_and_particle_pair(r1,c1,r2,c2)
1382 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1385 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1386 if group_dists_particles:
1387 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1388 color_scores.append(mincolor_score)
1389 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1394 linear.set_slope(1.0)
1398 offset=min(color_scores)
1399 maxvalue=max(color_scores)
1400 for pair
in list_of_pairs:
1402 pr.set_name(pair[2])
1403 factor=(pair[3]-offset)/(maxvalue-offset)
1407 rslin.add_restraint(pr)
1410 rh = RMF.create_rmf_file(filename)
1420 This class generates a CrossLinkDataBase from a given structure
1422 def __init__(self,representation=None,
1424 residue_types_1=[
"K"],
1425 residue_types_2=[
"K"],
1426 reactivity_range=[0,1],
1432 cldbkc.set_protein1_key(
"Protein1")
1433 cldbkc.set_protein2_key(
"Protein2")
1434 cldbkc.set_residue1_key(
"Residue1")
1435 cldbkc.set_residue2_key(
"Residue2")
1436 self.cldb=CrossLinkDataBase(cldbkc)
1437 if representation
is not None:
1440 self.representation=representation
1441 self.model=self.representation.m
1442 elif system
is not None:
1445 self.model=self.system.mdl
1448 print(
"Argument error: please provide either a representation object or a IMP.Hierarchy")
1450 self.residue_types_1=residue_types_1
1451 self.residue_types_2=residue_types_2
1453 self.indexes_dict1={}
1454 self.indexes_dict2={}
1455 self.protein_residue_dict={}
1456 self.reactivity_dictionary={}
1457 self.euclidean_interacting_pairs=
None
1458 self.xwalk_interacting_pairs=
None
1461 if self.mode==
"pmi1":
1462 for protein
in self.representation.sequence_dict.keys():
1464 seq=self.representation.sequence_dict[protein]
1465 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1471 rexp=numpy.random.exponential(0.1)
1472 prob=1.0-math.exp(-rexp)
1473 self.reactivity_dictionary[(protein,r)]=prob
1476 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1477 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1479 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1482 self.indexes_dict1[index]=(protein,r)
1483 self.protein_residue_dict[(protein,r)]=index
1485 h=IMP.pmi.tools.select_by_tuple(self.representation,(r,r,protein),resolution=1)[0]
1488 self.indexes_dict2[index]=(protein,r)
1489 self.protein_residue_dict[(protein,r)]=index
1491 if self.mode==
"pmi2":
1492 for state
in self.system.get_states():
1493 for moleculename,molecules
in state.get_molecules().iteritems():
1494 for molecule
in molecules:
1496 seq=molecule.sequence
1497 residues=[i
for i
in range(1,len(seq)+1)
if ((seq[i-1]
in self.residue_types_1)
or (seq[i-1]
in self.residue_types_2))]
1503 rexp=numpy.random.exponential(0.00000001)
1504 prob=1.0-math.exp(-rexp)
1505 self.reactivity_dictionary[(molecule,r)]=prob
1507 residues1=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_1]
1508 residues2=[i
for i
in range(1,len(seq)+1)
if seq[i-1]
in self.residue_types_2]
1511 ps=s.get_selected_particles()
1514 self.indexes_dict1[index]=(molecule,r)
1515 self.protein_residue_dict[(molecule,r)]=index
1518 ps=s.get_selected_particles()
1521 self.indexes_dict2[index]=(molecule,r)
1522 self.protein_residue_dict[(molecule,r)]=index
1525 def get_all_possible_pairs(self):
1526 n=float(len(self.protein_residue_dict.keys()))
1527 return n*(n-1.0)/2.0
1529 def get_all_feasible_pairs(self,distance=21):
1531 particle_index_pairs=[]
1533 for a,b
in itertools.combinations(self.protein_residue_dict.keys(),2):
1536 index1=self.protein_residue_dict[a]
1537 index2=self.protein_residue_dict[b]
1539 if particle_distance <= distance:
1540 particle_index_pairs.append((index1,index2))
1541 if self.mode==
"pmi1":
1542 new_xl[self.cldb.protein1_key]=a[0]
1543 new_xl[self.cldb.protein2_key]=b[0]
1544 elif self.mode==
"pmi2":
1545 new_xl[self.cldb.protein1_key]=a[0].get_name()
1546 new_xl[self.cldb.protein2_key]=b[0].get_name()
1547 new_xl[
"molecule_object1"]=a[0]
1548 new_xl[
"molecule_object2"]=b[0]
1549 new_xl[self.cldb.residue1_key]=a[1]
1550 new_xl[self.cldb.residue2_key]=b[1]
1551 self.cldb.data_base[str(nxl)]=[new_xl]
1559 def get_data_base(self,total_number_of_spectra,
1560 ambiguity_probability=0.1,
1563 max_delta_distance=10.0,
1564 xwalk_bin_path=
None,
1565 confidence_false=0.75,
1566 confidence_true=0.75):
1568 from random
import random,uniform
1572 self.beta_true=-1.4427*math.log(0.5*(1.0-confidence_true))
1573 self.beta_false=-1.4427*math.log(0.5*(1.0-confidence_false))
1574 self.cldb.data_base[str(number_of_spectra)]=[]
1575 self.sites_weighted=
None
1577 while number_of_spectra<total_number_of_spectra:
1578 if random() > ambiguity_probability
and len(self.cldb.data_base[str(number_of_spectra)]) != 0:
1580 number_of_spectra+=1
1581 self.cldb.data_base[str(number_of_spectra)]=[]
1583 if random() > noise:
1585 pra,dist=self.get_random_residue_pair(distance,xwalk_bin_path,max_delta_distance)
1588 pra,dist=self.get_random_residue_pair(
None,
None,
None)
1592 if self.mode==
"pmi1":
1593 new_xl[self.cldb.protein1_key]=pra[0]
1594 new_xl[self.cldb.protein2_key]=pra[1]
1595 elif self.mode==
"pmi2":
1596 new_xl[self.cldb.protein1_key]=pra[0].get_name()
1597 new_xl[self.cldb.protein2_key]=pra[1].get_name()
1598 new_xl[
"molecule_object1"]=pra[0]
1599 new_xl[
"molecule_object2"]=pra[1]
1600 new_xl[self.cldb.residue1_key]=pra[2]
1601 new_xl[self.cldb.residue2_key]=pra[3]
1602 new_xl[
"Noisy"]=noisy
1604 new_xl[
"Reactivity_Residue1"]=self.reactivity_dictionary[(pra[0],pra[2])]
1605 new_xl[
"Reactivity_Residue2"]=self.reactivity_dictionary[(pra[1],pra[3])]
1606 r1=new_xl[
"Reactivity_Residue1"]
1607 r2=new_xl[
"Reactivity_Residue2"]
1613 new_xl[
"Score"]=np.random.beta(1.0,self.beta_false)
1616 new_xl[
"Score"]=1.0-np.random.beta(1.0,self.beta_true)
1617 new_xl[
"TargetDistance"]=dist
1618 new_xl[
"NoiseProbability"]=noise
1619 new_xl[
"AmbiguityProbability"]=ambiguity_probability
1621 (p1,p2)=
IMP.get_particles(self.model,[self.protein_residue_dict[(pra[0],pra[2])],
1622 self.protein_residue_dict[(pra[1],pra[3])]])
1627 new_xl[
"InterRigidBody"] =
False
1632 new_xl[
"InterRigidBody"] =
True
1634 new_xl[
"InterRigidBody"] =
None
1636 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1641 def get_random_residue_pair(self,distance=None,xwalk_bin_path=None,max_delta_distance=None):
1644 from random
import choice,uniform
1645 if distance
is None:
1648 if self.mode==
"pmi1":
1649 protein1=choice(self.representation.sequence_dict.keys())
1650 protein2=choice(self.representation.sequence_dict.keys())
1651 seq1=self.representation.sequence_dict[protein1]
1652 seq2=self.representation.sequence_dict[protein2]
1653 residue1=choice([i
for i
in range(1,len(seq1)+1)
if seq1[i-1]
in self.residue_types_1])
1654 residue2=choice([i
for i
in range(1,len(seq2)+1)
if seq2[i-1]
in self.residue_types_2])
1655 h1=IMP.pmi.tools.select_by_tuple(self.representation,(residue1,residue1,protein1),resolution=1)[0]
1656 h2=IMP.pmi.tools.select_by_tuple(self.representation,(residue2,residue2,protein2),resolution=1)[0]
1658 if (protein1,residue1) != (protein2,residue2):
1660 elif self.mode==
"pmi2":
1661 (protein1,residue1)=choice(self.protein_residue_dict.keys())
1662 (protein2,residue2)=choice(self.protein_residue_dict.keys())
1663 index1=self.protein_residue_dict[(protein1,residue1)]
1664 index2=self.protein_residue_dict[(protein2,residue2)]
1666 if (protein1,residue1) != (protein2,residue2):
1670 if not xwalk_bin_path:
1672 gcpf.set_distance(distance+max_delta_distance)
1676 if not self.sites_weighted:
1677 self.sites_weighted=[]
1678 for key
in self.reactivity_dictionary:
1679 r=self.reactivity_dictionary[key]
1680 self.sites_weighted.append((key,r))
1682 first_site=self.weighted_choice(self.sites_weighted)
1684 if not self.euclidean_interacting_pairs:
1685 self.euclidean_interacting_pairs=gcpf.get_close_pairs(self.model,
1686 self.indexes_dict1.keys(),
1687 self.indexes_dict2.keys())
1689 first_site_pairs = [pair
for pair
in self.euclidean_interacting_pairs
1690 if self.indexes_dict1[pair[0]] == first_site
or
1691 self.indexes_dict2[pair[1]] == first_site]
1692 if len(first_site_pairs)==0:
continue
1694 second_sites_weighted=[]
1695 for pair
in first_site_pairs:
1696 if self.indexes_dict1[pair[0]] == first_site: second_site = self.indexes_dict2[pair[1]]
1697 if self.indexes_dict2[pair[1]] == first_site: second_site = self.indexes_dict1[pair[0]]
1698 r=self.reactivity_dictionary[second_site]
1699 second_sites_weighted.append((second_site,r))
1700 second_site=self.weighted_choice(second_sites_weighted)
1702 interacting_pairs_weighted=[]
1703 for pair in self.euclidean_interacting_pairs:
1704 r1=self.reactivity_dictionary[self.indexes_dict1[pair[0]]]
1705 r2=self.reactivity_dictionary[self.indexes_dict2[pair[1]]]
1706 #combined reactivity 1-exp(-k12*Delta t),
1709 r12=1.0-math.exp(-math.log(1.0/(1.0-r1))*math.log(1.0/(1.0-r2))/math.log(1.0/(1.0-r1)*1.0/(1.0-r2)))
1710 interacting_pairs_weighted.append((pair,r12))
1711 #weight1=math.exp(-self.reactivity_dictionary[self.indexes_dict1[pair[0]]]/self.kt)
1712 #weight2=math.exp(-self.reactivity_dictionary[self.indexes_dict2[pair[1]]]/self.kt)
1713 #interacting_pairs_weighted.append((pair,weight1*weight2))
1716 pair=self.weighted_choice(interacting_pairs_weighted)
1717 protein1,residue1=self.indexes_dict1[pair[0]]
1718 protein2,residue2=self.indexes_dict2[pair[1]]
1719 particle_pair=IMP.get_particles(self.model,pair)
1720 particle_distance=IMP.core.get_distance(IMP.core.XYZ(particle_pair[0]),IMP.core.XYZ(particle_pair[1]))
1721 if particle_distance<distance and (protein1,residue1) != (protein2,residue2):
1723 elif particle_distance>=distance and (protein1,residue1) != (protein2,residue2) and max_delta_distance:
1724 #allow some flexibility
1725 prob=1.0-((particle_distance-distance)/max_delta_distance)**(0.3)
1726 if uniform(0.0,1.0)<prob: break
1728 protein1,residue1=first_site
1729 protein2,residue2=second_site
1730 print(
"CrossLinkDataBaseFromStructure.get_random_residue_pair:",
1731 "First site",first_site,self.reactivity_dictionary[first_site],
1732 "Second site",second_site,self.reactivity_dictionary[second_site])
1733 particle_pair=
IMP.get_particles(self.model,[self.protein_residue_dict[first_site],self.protein_residue_dict[second_site]])
1736 if particle_distance<distance
and (protein1,residue1) != (protein2,residue2):
1738 elif particle_distance>=distance
and (protein1,residue1) != (protein2,residue2)
and max_delta_distance:
1742 if particle_distance-distance < max_delta_distance:
break
1747 if not self.xwalk_interacting_pairs:
1748 self.xwalk_interacting_pairs=self.get_xwalk_distances(xwalk_bin_path,distance)
1749 interacting_pairs_weighted=[]
1751 for pair
in self.xwalk_interacting_pairs:
1756 weight1=math.exp(-self.reactivity_dictionary[(protein1,residue1)]/self.kt)
1757 weight2=math.exp(-self.reactivity_dictionary[(protein2,residue2)]/self.kt)
1758 interacting_pairs_weighted.append((pair,weight1*weight2))
1760 pair=self.weighted_choice(interacting_pairs_weighted)
1765 particle_distance=float(pair[4])
1769 return ((protein1,protein2,residue1,residue2)),particle_distance
1771 def get_xwalk_distances(self,xwalk_bin_path,distance):
1775 o.init_pdb(
"xwalk.pdb",self.representation.prot)
1776 o.write_pdb(
"xwalk.pdb")
1777 namechainiddict=o.dictchain[
"xwalk.pdb"]
1780 for key
in namechainiddict: chainiddict[namechainiddict[key]]=key
1781 xwalkout=os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys -a1 cb -a2 cb -max '+str(distance)+
' -bb').read()
1783 output_list_of_distance=[]
1784 for line
in xwalkout.split(
"\n")[0:-2]:
1785 tockens=line.split()
1788 distance=float(tockens[6])
1790 ss=second.split(
"-")
1793 protein1=chainiddict[chainid1]
1794 protein2=chainiddict[chainid2]
1797 output_list_of_distance.append((protein1,protein2,residue1,residue2,distance))
1798 return output_list_of_distance
1801 def weighted_choice(self,choices):
1804 total = sum(w
for c, w
in choices)
1805 r = random.uniform(0, total)
1807 for c, w
in choices:
1811 assert False,
"Shouldn't get here"
1813 def save_rmf_snapshot(self,filename,color_id=None):
1816 if color_id
is None:
1817 color_id=
"Reactivity"
1819 sorted_group_ids=sorted(self.cldb.data_base.keys())
1822 for group
in sorted_group_ids:
1824 group_dists_particles=[]
1825 for xl
in self.cldb.data_base[group]:
1826 xllabel=self.cldb.get_short_cross_link_string(xl)
1827 (c1,c2,r1,r2)=(xl[
"molecule_object1"],xl[
"molecule_object2"],xl[self.cldb.residue1_key],xl[self.cldb.residue2_key])
1829 index1=self.protein_residue_dict[(c1,r1)]
1830 index2=self.protein_residue_dict[(c2,r2)]
1832 mdist=xl[
"TargetDistance"]
1834 print(
"TypeError or missing chain/residue ",r1,c1,r2,c2)
1836 group_dists_particles.append((mdist,p1,p2,xllabel,float(xl[color_id])))
1837 if group_dists_particles:
1838 (minmdist,minp1,minp2,minxllabel,mincolor_score)=min(group_dists_particles, key =
lambda t: t[0])
1839 color_scores.append(mincolor_score)
1840 list_of_pairs.append((minp1,minp2,minxllabel,mincolor_score))
1847 linear.set_slope(1.0)
1851 offset=min(color_scores)
1852 maxvalue=max(color_scores)
1853 for pair
in list_of_pairs:
1855 pr.set_name(pair[2])
1856 factor=(pair[3]-offset)/(maxvalue-offset)
1859 rslin.add_restraint(pr)
1862 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
atom::Hierarchies create_hierarchies(RMF::FileConstHandle fh, Model *m)
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with the color coded cr...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
void load_frame(RMF::FileConstHandle file, RMF::FrameID frame)
Load the given RMF frame into the state of the linked objects.
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
Associate an integer "state" index with a hierarchy node.
A class to handle different styles of site pairs parsers.
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
Support for the RMF file format for storing hierarchical molecular data and markup.
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.