1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
8 from __future__
import print_function
24 from collections
import defaultdict
29 def set_json_default(obj):
30 if isinstance(obj, set):
38 if sys.version_info[0] >= 3:
39 def _handle_string_input(inp):
40 if not isinstance(inp, str):
41 raise TypeError(
"expecting a string")
44 def _handle_string_input(inp):
45 if not isinstance(inp, (str, unicode)):
46 raise TypeError(
"expecting a string or unicode")
48 if isinstance(inp, unicode):
54 class _CrossLinkDataBaseStandardKeys(object):
56 This class setup all the standard keys needed to
57 identify the cross-link features from the data sets
61 self.protein1_key =
"Protein1"
62 self.type[self.protein1_key] = str
63 self.protein2_key =
"Protein2"
64 self.type[self.protein2_key] = str
65 self.residue1_key =
"Residue1"
66 self.type[self.residue1_key] = int
67 self.residue2_key =
"Residue2"
68 self.type[self.residue2_key] = int
69 self.residue1_amino_acid_key =
"Residue1AminoAcid"
70 self.type[self.residue1_amino_acid_key] = str
71 self.residue2_amino_acid_key =
"Residue2AminoAcid"
72 self.type[self.residue2_amino_acid_key] = str
73 self.residue1_moiety_key =
"Residue1Moiety"
74 self.type[self.residue1_moiety_key] = str
75 self.residue2_moiety_key =
"Residue2Moiety"
76 self.type[self.residue2_moiety_key] = str
77 self.site_pairs_key =
"SitePairs"
78 self.type[self.site_pairs_key] = str
79 self.unique_id_key =
"XLUniqueID"
80 self.type[self.unique_id_key] = str
81 self.unique_sub_index_key =
"XLUniqueSubIndex"
82 self.type[self.unique_sub_index_key] = int
83 self.unique_sub_id_key =
"XLUniqueSubID"
84 self.type[self.unique_sub_id_key] = str
85 self.data_set_name_key =
"DataSetName"
86 self.type[self.data_set_name_key] = str
87 self.cross_linker_chemical_key =
"CrossLinkerChemical"
88 self.type[self.cross_linker_chemical_key] = str
89 self.id_score_key =
"IDScore"
90 self.type[self.id_score_key] = float
92 self.type[self.fdr_key] = float
93 self.quantitation_key =
"Quantitation"
94 self.type[self.quantitation_key] = float
95 self.redundancy_key =
"Redundancy"
96 self.type[self.redundancy_key] = int
97 self.redundancy_list_key =
"RedundancyList"
98 self.type[self.redundancy_key] = list
99 self.ambiguity_key =
"Ambiguity"
100 self.type[self.ambiguity_key] = int
101 self.residue1_links_number_key =
"Residue1LinksNumber"
102 self.type[self.residue1_links_number_key] = int
103 self.residue2_links_number_key =
"Residue2LinksNumber"
104 self.type[self.residue2_links_number_key] = int
105 self.type[self.ambiguity_key] = int
106 self.state_key =
"State"
107 self.type[self.state_key] = int
108 self.sigma1_key =
"Sigma1"
109 self.type[self.sigma1_key] = str
110 self.sigma2_key =
"Sigma2"
111 self.type[self.sigma2_key] = str
113 self.type[self.psi_key] = str
114 self.distance_key =
"Distance"
115 self.type[self.distance_key] = float
116 self.min_ambiguous_distance_key =
"MinAmbiguousDistance"
117 self.type[self.distance_key] = float
119 self.link_type_key =
"LinkType"
120 self.type[self.link_type_key] = str
122 self.ordered_key_list = [
123 self.data_set_name_key, self.unique_id_key,
124 self.unique_sub_index_key, self.unique_sub_id_key,
125 self.protein1_key, self.protein2_key, self.residue1_key,
126 self.residue2_key, self.residue1_amino_acid_key,
127 self.residue2_amino_acid_key, self.residue1_moiety_key,
128 self.residue2_moiety_key, self.cross_linker_chemical_key,
129 self.id_score_key, self.fdr_key, self.quantitation_key,
130 self.redundancy_key, self.redundancy_list_key, self.state_key,
131 self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
132 self.min_ambiguous_distance_key, self.link_type_key]
135 class _ProteinsResiduesArray(tuple):
137 This class is inherits from tuple, and it is a shorthand for a cross-link
138 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1
139 and protein2, r1 and r2 are residue1 and residue2.
142 def __new__(self, input_data):
144 @input_data can be a dict or a tuple
146 self.cldbsk = _CrossLinkDataBaseStandardKeys()
147 if type(input_data)
is dict:
149 p1 = input_data[self.cldbsk.protein1_key]
151 p2 = input_data[self.cldbsk.protein2_key]
154 r1 = input_data[self.cldbsk.residue1_key]
156 r2 = input_data[self.cldbsk.residue2_key]
162 t = (p1,
"", r1,
None)
163 elif type(input_data)
is tuple:
164 if len(input_data) > 4
or len(input_data) == 3 \
165 or len(input_data) == 1:
167 "_ProteinsResiduesArray: must have only 4 elements")
168 if len(input_data) == 4:
169 p1 = _handle_string_input(input_data[0])
170 p2 = _handle_string_input(input_data[1])
173 if (type(r1)
is not int)
and (r1
is not None):
175 "_ProteinsResiduesArray: residue1 must be a integer")
176 if (type(r2)
is not int)
and (r1
is not None):
178 "_ProteinsResiduesArray: residue2 must be a integer")
180 if len(input_data) == 2:
181 p1 = _handle_string_input(input_data[0])
183 if type(r1)
is not int:
185 "_ProteinsResiduesArray: residue1 must be a integer")
186 t = (p1,
"", r1,
None)
189 "_ProteinsResiduesArray: input must be a dict or tuple")
190 return tuple.__new__(_ProteinsResiduesArray, t)
192 def get_inverted(self):
194 Returns a _ProteinsResiduesArray instance with protein1 and
197 return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
200 outstr = self.cldbsk.protein1_key +
" " + str(self[0])
201 outstr +=
" " + self.cldbsk.protein2_key +
" " + str(self[1])
202 outstr +=
" " + self.cldbsk.residue1_key +
" " + str(self[2])
203 outstr +=
" " + self.cldbsk.residue2_key +
" " + str(self[3])
207 outstr = str(self[0]) +
"." + str(self[2]) +
"-" + str(self[1]) \
214 This class allows to create filter functions that can be passed to
215 the CrossLinkDataBase in this way:
217 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
219 where cldb is CrossLinkDataBase instance and it is only needed to get
220 the standard keywords
222 A filter operator can be evaluate on a CrossLinkDataBase item xl and
227 and it is used to filter the database
230 def __init__(self, argument1, operator, argument2):
232 (argument1,operator,argument2) can be either a
233 (keyword,operator.eq|lt|gt...,value)
234 or (FilterOperator1,operator.or|and...,FilterOperator2)
236 if isinstance(argument1, FilterOperator):
237 self.operations = [argument1, operator, argument2]
240 self.values = (argument1, operator, argument2)
242 def __or__(self, FilterOperator2):
245 def __and__(self, FilterOperator2):
248 def __invert__(self):
251 def evaluate(self, xl_item):
253 if len(self.operations) == 0:
254 keyword, operator, value = self.values
255 return operator(xl_item[keyword], value)
256 FilterOperator1, op, FilterOperator2 = self.operations
258 if FilterOperator2
is None:
259 return op(FilterOperator1.evaluate(xl_item))
261 return op(FilterOperator1.evaluate(xl_item),
262 FilterOperator2.evaluate(xl_item))
267 This class is needed to convert the keywords from a generic database
273 @param list_parser an instance of ResiduePairListParser, if any
277 self.backward_converter = {}
278 _CrossLinkDataBaseStandardKeys.__init__(self)
279 self.rplp = list_parser
280 if self.rplp
is None:
282 self.compulsory_keys = set([self.protein1_key,
287 self.compulsory_keys = self.rplp.get_compulsory_keys()
288 self.setup_keys = set()
292 Is a function that check whether necessary keys are setup
295 if self.compulsory_keys & setup_keys != self.compulsory_keys:
296 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup "
297 "all necessary keys")
301 Returns the keys that have been setup so far
303 return self.backward_converter.keys()
307 This sets up the standard compulsory keys as defined by
308 _CrossLinkDataBaseStandardKeys
310 for ck
in self.compulsory_keys:
311 self.converter[ck] = ck
312 self.backward_converter[ck] = ck
314 def set_unique_id_key(self, origin_key):
315 self.converter[origin_key] = self.unique_id_key
316 self.backward_converter[self.unique_id_key] = origin_key
318 def set_protein1_key(self, origin_key):
319 self.converter[origin_key] = self.protein1_key
320 self.backward_converter[self.protein1_key] = origin_key
322 def set_protein2_key(self, origin_key):
323 self.converter[origin_key] = self.protein2_key
324 self.backward_converter[self.protein2_key] = origin_key
326 def set_residue1_key(self, origin_key):
327 self.converter[origin_key] = self.residue1_key
328 self.backward_converter[self.residue1_key] = origin_key
330 def set_residue2_key(self, origin_key):
331 self.converter[origin_key] = self.residue2_key
332 self.backward_converter[self.residue2_key] = origin_key
334 def set_residue1_amino_acid_key(self, origin_key):
335 self.converter[origin_key] = self.residue1_amino_acid_key
336 self.backward_converter[self.residue1_amino_acid_key] = origin_key
338 def set_residue2_amino_acid_key(self, origin_key):
339 self.converter[origin_key] = self.residue2_amino_acid_key
340 self.backward_converter[self.residue2_amino_acid_key] = origin_key
342 def set_residue1_moiety_key(self, origin_key):
343 self.converter[origin_key] = self.residue1_moiety_key
344 self.backward_converter[self.residue1_moiety_key] = origin_key
346 def set_residue2_moiety_key(self, origin_key):
347 self.converter[origin_key] = self.residue2_moiety_key
348 self.backward_converter[self.residue2_moiety_key] = origin_key
350 def set_site_pairs_key(self, origin_key):
351 self.converter[origin_key] = self.site_pairs_key
352 self.backward_converter[self.site_pairs_key] = origin_key
354 def set_id_score_key(self, origin_key):
355 self.converter[origin_key] = self.id_score_key
356 self.backward_converter[self.id_score_key] = origin_key
358 def set_fdr_key(self, origin_key):
359 self.converter[origin_key] = self.fdr_key
360 self.backward_converter[self.fdr_key] = origin_key
362 def set_quantitation_key(self, origin_key):
363 self.converter[origin_key] = self.quantitation_key
364 self.backward_converter[self.quantitation_key] = origin_key
366 def set_psi_key(self, origin_key):
367 self.converter[origin_key] = self.psi_key
368 self.backward_converter[self.psi_key] = origin_key
370 def set_link_type_key(self, link_type_key):
371 self.converter[link_type_key] = self.link_type_key
372 self.backward_converter[self.link_type_key] = link_type_key
376 Returns the dictionary that convert the old keywords to the new ones
379 return self.converter
383 Returns the dictionary that convert the new keywords to the old ones
386 return self.backward_converter
391 A class to handle different styles of site pairs parsers.
393 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
394 [Y3-;K4-] for dead-ends
395 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
396 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
397 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
402 def __init__(self, style):
403 _CrossLinkDataBaseStandardKeys.__init__(self)
404 if style ==
"MSSTUDIO":
406 self.compulsory_keys = set([self.protein1_key,
408 self.site_pairs_key])
409 elif style ==
"XTRACT" or style ==
"QUANTITATION":
411 self.compulsory_keys = set([self.site_pairs_key])
412 elif style ==
"LAN_HUANG":
414 self.compulsory_keys = set([self.site_pairs_key])
416 raise Exception(
"ResiduePairListParser: unknown list parser style")
418 def get_compulsory_keys(self):
419 return self.compulsory_keys
423 This function returns a list of cross-linked residues and the
424 corresponding list of cross-linked chains. The latter list can be
425 empty, if the style doesn't have the corresponding information.
427 if self.style ==
"MSSTUDIO":
428 input_string = input_string.replace(
"[",
"")
429 input_string = input_string.replace(
"]",
"")
430 input_string_pairs = input_string.split(
";")
431 residue_pair_indexes = []
432 chain_pair_indexes = []
433 for s
in input_string_pairs:
435 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
436 r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
439 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
442 (residue_type_1, residue_index_1, residue_type_2,
443 residue_index_2) = m1.group(1, 2, 3, 4)
444 residue_pair_indexes.append((residue_index_1,
448 residue_type_1, residue_index_1 = m2.group(1, 2)
450 return residue_pair_indexes, chain_pair_indexes
451 if self.style ==
"XTRACT" or self.style ==
"QUANTITATION":
452 if ":x:" in input_string:
454 input_strings = input_string.split(
":x:")
455 first_peptides = input_strings[0].split(
":|:")
456 second_peptides = input_strings[1].split(
":|:")
457 first_peptides_identifiers = [
459 f.split(
":")[1])
for f
in first_peptides]
460 second_peptides_identifiers = [
462 f.split(
":")[1])
for f
in second_peptides]
463 residue_pair_indexes = []
464 chain_pair_indexes = []
465 for fpi
in first_peptides_identifiers:
466 for spi
in second_peptides_identifiers:
471 residue_pair_indexes.append((residue1, residue2))
472 chain_pair_indexes.append((chain1, chain2))
473 return residue_pair_indexes, chain_pair_indexes
476 first_peptides = input_string.split(
":|:")
477 first_peptides_identifiers = [
478 (f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
481 for fpi
in first_peptides_identifiers:
484 residue_indexes.append((residue1,))
485 chain_indexes.append((chain1,))
486 return residue_indexes, chain_indexes
487 if self.style ==
"LAN_HUANG":
488 input_strings = input_string.split(
"-")
489 chain1, first_series = input_strings[0].split(
":")
490 chain2, second_series = input_strings[1].split(
":")
492 first_residues = first_series.replace(
";",
"|").split(
"|")
493 second_residues = second_series.replace(
";",
"|").split(
"|")
494 residue_pair_indexes = []
495 chain_pair_indexes = []
496 for fpi
in first_residues:
497 for spi
in second_residues:
498 residue1 = self.re.sub(
"[^0-9]",
"", fpi)
499 residue2 = self.re.sub(
"[^0-9]",
"", spi)
500 residue_pair_indexes.append((residue1, residue2))
501 chain_pair_indexes.append((chain1, chain2))
502 return residue_pair_indexes, chain_pair_indexes
507 A class to handle different XL format with fixed format
508 currently support ProXL
510 def __init__(self, format):
511 _CrossLinkDataBaseStandardKeys.__init__(self)
512 if format ==
"PROXL":
515 raise Exception(
"FixedFormatParser: unknown list format name")
517 def get_data(self, input_string):
518 if self.format ==
"PROXL":
519 tokens = input_string.split(
"\t")
521 if tokens[0] ==
"SEARCH ID(S)":
524 xl[self.protein1_key] = tokens[2]
525 xl[self.protein2_key] = tokens[4]
526 xl[self.residue1_key] = int(tokens[3])
527 xl[self.residue2_key] = int(tokens[5])
533 this class handles a cross-link dataset and do filtering
534 operations, adding cross-links, merge datasets...
537 def __init__(self, converter=None, data_base=None, fasta_seq=None,
541 @param converter an instance of CrossLinkDataBaseKeywordsConverter
542 @param data_base an instance of CrossLinkDataBase to build the
544 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
545 protein fasta sequences to check cross-link consistency.
546 If not given consistency will not be checked
547 @param linkable_aa a tuple containing one-letter amino acids which
548 are linkable by the cross-linker; only used if the database
549 DOES NOT provide a value for a certain residueX_amino_acid_key
550 and if a fasta_seq is given
553 if data_base
is None:
556 self.data_base = data_base
558 _CrossLinkDataBaseStandardKeys.__init__(self)
559 if converter
is not None:
560 self.cldbkc = converter
561 self.list_parser = self.cldbkc.rplp
562 self.converter = converter.get_converter()
566 self.list_parser =
None
567 self.converter =
None
570 self.def_aa_tuple = linkable_aa
571 self.fasta_seq = fasta_seq
578 Update the whole dataset after changes
580 self.update_cross_link_unique_sub_index()
581 self.update_cross_link_redundancy()
582 self.update_residues_links_number()
586 sorted_ids = sorted(self.data_base.keys())
588 for xl
in self.data_base[k]:
591 def xlid_iterator(self):
592 sorted_ids = sorted(self.data_base.keys())
593 for xlid
in sorted_ids:
596 def __getitem__(self, xlid):
597 return self.data_base[xlid]
600 return len([xl
for xl
in self])
605 def set_name(self, name):
607 for k
in self.data_base:
608 new_data_base[k +
"." + name] = self.data_base[k]
609 self.data_base = new_data_base
613 def get_number_of_xlid(self):
614 return len(self.data_base)
617 FixedFormatParser=
None):
619 if FixedFormatParser is not specified, the file is
620 comma-separated-values
622 @param file_name a txt file to be parsed
623 @param converter an instance of CrossLinkDataBaseKeywordsConverter
624 @param FixedFormatParser a parser for a fixed format
626 if not FixedFormatParser:
627 xl_list = IMP.pmi.tools.get_db_from_csv(file_name)
629 if converter
is not None:
630 self.cldbkc = converter
631 self.list_parser = self.cldbkc.rplp
632 self.converter = converter.get_converter()
634 if not self.list_parser:
638 for nxl, xl
in enumerate(xl_list):
641 if k
in self.converter:
642 new_xl[self.converter[k]] = \
643 self.type[self.converter[k]](xl[k])
646 if self.unique_id_key
in self.cldbkc.get_setup_keys():
647 if new_xl[self.unique_id_key]
not in new_xl_dict:
648 new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
650 new_xl_dict[new_xl[self.unique_id_key]].append(
653 if str(nxl)
not in new_xl_dict:
654 new_xl_dict[str(nxl)] = [new_xl]
656 new_xl_dict[str(nxl)].append(new_xl)
662 for nxl, entry
in enumerate(xl_list):
666 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
668 "CrossLinkDataBase: expecting a site_pairs_key "
669 "for the site pair list parser")
671 if k
in self.converter:
672 new_dict[self.converter[k]] = \
673 self.type[self.converter[k]](entry[k])
675 new_dict[k] = entry[k]
678 chain_pair_list) = self.list_parser.get_list(
679 new_dict[self.site_pairs_key])
682 for n, p
in enumerate(residue_pair_list):
689 new_xl[k] = new_dict[k]
690 new_xl[self.residue1_key] = \
691 self.type[self.residue1_key](p[0])
693 new_xl[self.residue2_key] = \
694 self.type[self.residue2_key](p[1])
696 if len(chain_pair_list) == len(residue_pair_list):
697 new_xl[self.protein1_key] = \
698 self.type[self.protein1_key](
699 chain_pair_list[n][0])
701 new_xl[self.protein2_key] = \
702 self.type[self.protein2_key](
703 chain_pair_list[n][1])
706 new_xl[self.link_type_key] =
"CROSSLINK"
708 new_xl[self.link_type_key] =
"MONOLINK"
710 if self.unique_id_key
in self.cldbkc.get_setup_keys():
711 if new_xl[self.unique_id_key]
not in new_xl_dict:
712 new_xl_dict[new_xl[self.unique_id_key]] = \
715 new_xl_dict[new_xl[self.unique_id_key]].append(
718 if str(nxl)
not in new_xl_dict:
719 new_xl[self.unique_id_key] = str(nxl+1)
720 new_xl_dict[str(nxl)] = [new_xl]
722 new_xl[self.unique_id_key] = str(nxl+1)
723 new_xl_dict[str(nxl)].append(new_xl)
727 if FixedFormatParser is defined
731 f = open(file_name,
"r")
734 xl = FixedFormatParser.get_data(line)
736 xl[self.unique_id_key] = str(nxl+1)
737 new_xl_dict[str(nxl)] = [xl]
740 self.data_base = new_xl_dict
741 self.name = file_name
742 loc = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
743 self.dataset = ihm.dataset.CXMSDataset(loc)
746 def update_cross_link_unique_sub_index(self):
747 for k
in self.data_base:
748 for n, xl
in enumerate(self.data_base[k]):
749 xl[self.ambiguity_key] = len(self.data_base[k])
750 xl[self.unique_sub_index_key] = n+1
751 xl[self.unique_sub_id_key] = k +
"." + str(n+1)
753 def update_cross_link_redundancy(self):
754 redundancy_data_base = {}
756 pra = _ProteinsResiduesArray(xl)
757 if pra
not in redundancy_data_base:
758 redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
759 redundancy_data_base[pra.get_inverted()] = \
760 [xl[self.unique_sub_id_key]]
762 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
763 redundancy_data_base[pra.get_inverted()].append(
764 xl[self.unique_sub_id_key])
766 pra = _ProteinsResiduesArray(xl)
767 xl[self.redundancy_key] = len(redundancy_data_base[pra])
768 xl[self.redundancy_list_key] = redundancy_data_base[pra]
770 def update_residues_links_number(self):
773 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
774 if (p1, r1)
not in residue_links:
775 residue_links[(p1, r1)] = set([(p2, r2)])
777 residue_links[(p1, r1)].add((p2, r2))
778 if (p2, r2)
not in residue_links:
779 residue_links[(p2, r2)] = set([(p1, r1)])
781 residue_links[(p2, r2)].add((p1, r1))
784 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
785 xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
786 xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
789 """This function checks the consistency of the dataset with the
790 amino acid sequence"""
791 if self.cldbkc
and self.fasta_seq:
792 cnt_matched, cnt_matched_file = 0, 0
796 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
797 b_matched_file =
False
798 if self.residue1_amino_acid_key
in xl:
801 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
802 b_matched = self._match_xlinks(p1, r1, aa_from_file)
803 b_matched_file = b_matched
807 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
809 matched, non_matched = self._update_matched_xlinks(
810 b_matched, p1, r1, matched, non_matched)
812 if self.residue2_amino_acid_key
in xl:
813 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
814 b_matched = self._match_xlinks(p2, r2, aa_from_file)
815 b_matched_file = b_matched
817 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
819 matched, non_matched = self._update_matched_xlinks(
820 b_matched, p2, r2, matched, non_matched)
824 cnt_matched_file += 1
826 percentage_matched = round(100*cnt_matched/len(self), 1)
827 percentage_matched_file = round(
828 100 * cnt_matched_file / len(self), 1)
829 if matched
or non_matched:
831 "check_cross_link_consistency: Out of %d cross-links "
832 "%d were matched to the fasta sequence (%f %%).\n "
833 "%d were matched by using the cross-link file (%f %%)."
834 % (len(self), cnt_matched, percentage_matched,
835 cnt_matched_file, percentage_matched_file))
837 print(
"check_cross_link_consistency: Warning: Non "
839 [(prot_name, sorted(list(non_matched[prot_name])))
840 for prot_name
in non_matched])
841 return matched, non_matched
843 def _match_xlinks(self, prot_name, res_index, aa_tuple):
847 amino_dict = IMP.pmi.alphabets.amino_acid
849 for amino_acid
in aa_tuple:
850 if len(amino_acid) == 3:
851 amino_acid = amino_dict.get_one_letter_code_from_residue_type(
853 if prot_name
in self.fasta_seq.sequences:
854 seq = self.fasta_seq.sequences[prot_name]
862 if res_index == 0
or res_index == 1:
864 if res_index < len(seq):
865 if amino_acid == seq[res_index]:
869 def _update_matched_xlinks(self, b_matched, prot, res, matched,
873 matched[prot].add(res)
875 matched[prot] = set([res])
877 if prot
in non_matched:
878 non_matched[prot].add(res)
880 non_matched[prot] = set([res])
881 return matched, non_matched
883 def get_cross_link_string(self, xl):
885 for k
in self.ordered_key_list:
887 string += str(k) +
":" + str(xl[k]) +
"|"
892 if k
not in self.ordered_key_list:
893 string += str(k) +
":" + str(xl[k]) +
"|"
897 def get_short_cross_link_string(self, xl):
899 list_of_keys = [self.data_set_name_key,
900 self.unique_sub_id_key,
908 for k
in list_of_keys:
910 string += str(xl[k]) +
"|"
916 def filter(self, FilterOperator):
918 for id
in self.data_base.keys():
919 for xl
in self.data_base[id]:
920 if FilterOperator.evaluate(xl):
921 if id
not in new_xl_dict:
922 new_xl_dict[id] = [xl]
924 new_xl_dict[id].append(xl)
927 cdb.dataset = self.dataset
931 '''Get all cross-links with score greater than an input value'''
933 self.id_score_key, operator.gt, score)
934 return self.filter(FilterOperator)
936 def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
938 This function merges two cross-link datasets so that if two
939 conflicting cross-links have the same cross-link UniqueIDS,
940 the cross-links will be appended under the same UniqueID slots
941 with different SubIDs
943 raise NotImplementedError()
946 """Append cross-link dataset to this one."""
948 for k
in self.data_base:
949 new_data_base[k] = self.data_base[k]
950 for k
in db.data_base:
951 new_data_base[k] = db.data_base[k]
952 self.data_base = new_data_base
955 def __iadd__(self, db):
959 def set_value(self, key, new_value, filter_operator=None):
961 This function changes the value for a given key in the database
962 For instance one can change the name of a protein
963 @param key: the key in the database that must be changed
964 @param new_value: the new value of the key
965 @param filter_operator: optional FilterOperator to change the value to
966 a subset of the database
968 example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
969 FO(cldb.protein1_key, operator.eq, "AAA"))`
973 if filter_operator
is not None:
974 if filter_operator.evaluate(xl):
982 this function returns the list of values for a given key in the
983 database alphanumerically sorted
988 return sorted(list(values))
992 This function offset the residue indexes of a given protein by
994 @param protein_name: the protein name that need to be changed
995 @param offset: the offset value
999 if xl[self.protein1_key] == protein_name:
1000 xl[self.residue1_key] = xl[self.residue1_key] + offset
1001 if xl[self.protein2_key] == protein_name:
1002 xl[self.residue2_key] = xl[self.residue2_key] + offset
1007 This function creates a new keyword for the whole database and
1008 set the values from and existing keyword (optional), otherwise the
1009 values are set to None
1010 @param keyword the new keyword name:
1011 @param values_from_keyword the keyword from which we are copying
1015 if values_from_keyword
is not None:
1016 xl[keyword] = xl[values_from_keyword]
1022 protein_to_rename=
"both"):
1024 This function renames all proteins contained in the input dictionary
1025 from the old names (keys) to the new name (values)
1026 @param old_to_new_names_dictionary dictionary for converting old
1028 @param protein_to_rename specify whether to rename both or protein1
1032 for old_name
in old_to_new_names_dictionary:
1033 new_name = old_to_new_names_dictionary[old_name]
1034 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
1036 self.
set_value(self.protein1_key, new_name, fo2)
1037 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
1039 self.
set_value(self.protein2_key, new_name, fo2)
1043 This function parses an alignment file obtained previously, to map
1044 crosslink residues onto the sequence of a homolog proteins. It is
1045 useful if you want to map the crosslinks onto a known, homolog
1046 structure without building a comparative model.
1047 The txt file of the alignment should be structured with repeating
1050 >NAME_CROSSLINKED_PROTEIN
1051 >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1052 one-letter code sequence for cross-linked protein (one line)
1053 one-letter code sequence for homolog protein with known structure (one line)
1056 f = open(alignfile,
'r')
1058 def _assign_residues(seq1, seq2):
1066 resinds1.append(nres)
1068 resinds1.append(
None)
1070 resinds2 = [
None for i
in resinds1]
1073 for pos, r2
in enumerate(seq2):
1076 resinds2[pos] = nres
1078 if contiguousgap > 1:
1079 for i
in range(pos-int(contiguousgap/2), pos):
1084 resinds2[pos] = lastresind
1086 for n, r
in enumerate(resinds1):
1094 with open(alignfile,
'r') as f:
1100 protname = aa.replace(
">",
"").replace(
"\n",
"")
1101 seq1 = cc.replace(
",",
"").replace(
"\n",
"")
1102 seq2 = dd.replace(
",",
"").replace(
"\n",
"")
1108 for i
in range(len(seq1)):
1120 tmpl1.append([i, c1, seq1[i]])
1121 tmpl2.append([i, c2, seq2[i]])
1122 sequences1[protname] = tmpl1
1123 sequences2[protname] = tmpl2
1125 d = _assign_residues(seq1, seq2)
1131 if fo11.evaluate(xl):
1132 t = [item
for item
in sequences1[protname]
1133 if item[1] == xl[self.residue1_key]][0]
1134 sequences1[protname][t[0]] = [
1135 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1136 t = [item
for item
in sequences2[protname]
1137 if item[1] == d[xl[self.residue1_key]]][0]
1138 sequences2[protname][t[0]] = [
1139 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1141 xl[self.residue1_key] = d[xl[self.residue1_key]]
1143 if fo12.evaluate(xl):
1145 t = [item
for item
in sequences1[protname]
1146 if item[1] == xl[self.residue2_key]][0]
1147 sequences1[protname][t[0]] = [
1148 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1149 t = [item
for item
in sequences2[protname]
1150 if item[1] == d[xl[self.residue2_key]]][0]
1151 sequences2[protname][t[0]] = [
1152 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1154 xl[self.residue2_key] = d[xl[self.residue2_key]]
1157 print(aa.replace(
"\n",
""))
1158 print(
"".join([seq[2]
for seq
in sequences1[protname]]))
1159 print(bb.replace(
"\n",
""))
1160 print(
"".join([seq[2]
for seq
in sequences2[protname]]))
1164 This function creates as many classes as in the input
1165 (number_of_classes: integer) and partition cross-links according
1166 to their identification scores. Classes are defined in the psi key.
1169 if self.id_score_key
is not None:
1173 'The cross-link database does not contain score values')
1174 minscore = min(scores)
1175 maxscore = max(scores)
1176 scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1177 if self.psi_key
is None:
1180 score = xl[self.id_score_key]
1181 for n, classmin
in enumerate(scoreclasses[0:-1]):
1182 if score >= classmin
and score <= scoreclasses[n+1]:
1183 xl[self.psi_key] = str(
"CLASS_"+str(n))
1186 def clone_protein(self, protein_name, new_protein_name):
1187 for id
in self.data_base.keys():
1189 for xl
in self.data_base[id]:
1190 new_data_base.append(xl)
1191 if xl[self.protein1_key] == protein_name \
1192 and xl[self.protein2_key] != protein_name:
1194 new_xl[self.protein1_key] = new_protein_name
1195 new_data_base.append(new_xl)
1196 elif xl[self.protein1_key] != protein_name \
1197 and xl[self.protein2_key] == protein_name:
1199 new_xl[self.protein2_key] = new_protein_name
1200 new_data_base.append(new_xl)
1201 elif xl[self.protein1_key] == protein_name \
1202 and xl[self.protein2_key] == protein_name:
1204 new_xl[self.protein1_key] = new_protein_name
1205 new_data_base.append(new_xl)
1207 new_xl[self.protein2_key] = new_protein_name
1208 new_data_base.append(new_xl)
1210 new_xl[self.protein1_key] = new_protein_name
1211 new_xl[self.protein2_key] = new_protein_name
1212 new_data_base.append(new_xl)
1213 self.data_base[id] = new_data_base
1218 This function remove cross-links applied to the same residue
1219 (ie, same chain name and residue number)
1221 for id
in self.data_base.keys():
1223 for xl
in self.data_base[id]:
1224 if xl[self.protein1_key] == xl[self.protein2_key] \
1225 and xl[self.residue1_key] == xl[self.residue2_key]:
1228 new_data_base.append(xl)
1229 self.data_base[id] = new_data_base
1234 this method returns a CrossLinkDataBase class containing
1235 a random subsample of the original cross-link database.
1236 @param percentage float between 0 and 1, is the percentage of
1237 of spectra taken from the original list
1240 if percentage > 1.0
or percentage < 0.0:
1241 raise ValueError(
'the percentage of random cross-link spectra '
1242 'should be between 0 and 1')
1243 nspectra = self.get_number_of_xlid()
1244 nrandom_spectra = int(nspectra*percentage)
1245 random_keys = random.sample(self.data_base.keys(), nrandom_spectra)
1247 for k
in random_keys:
1248 new_data_base[k] = self.data_base[k]
1253 sorted_ids = sorted(self.data_base.keys())
1255 for id
in sorted_ids:
1257 for xl
in self.data_base[id]:
1258 for k
in self.ordered_key_list:
1260 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1265 if k
not in self.ordered_key_list:
1266 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1267 outstr +=
"-------------\n"
1270 def plot(self, filename, **kwargs):
1272 matplotlib.use(
'Agg')
1273 import matplotlib.pyplot
as plt
1274 import matplotlib.colors
1276 if kwargs[
"type"] ==
"scatter":
1277 cmap = plt.get_cmap(
"rainbow")
1278 _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1279 xkey = kwargs[
"xkey"]
1280 ykey = kwargs[
"ykey"]
1281 if "colorkey" in kwargs:
1282 colorkey = kwargs[
"colorkey"]
1283 if "logyscale" in kwargs:
1284 logyscale = kwargs[
"logyscale"]
1292 xs.append(float(xl[xkey]))
1294 ys.append(math.log10(float(xl[ykey])))
1296 ys.append(float(xl[ykey]))
1297 colors.append(float(xl[colorkey]))
1299 print(
"CrossLinkDataBase.plot: Value error for "
1300 "cross-link %s" % (xl[self.unique_id_key]))
1304 for color
in colors:
1305 cindex = (color-min(colors))/(max(colors)-min(colors))
1306 cs.append(cmap(cindex))
1309 ax = fig.add_subplot(111)
1310 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker=
"o")
1313 plt.savefig(filename)
1317 if kwargs[
"type"] ==
"residue_links":
1321 molecule = kwargs[
"molecule"]
1323 length = len(molecule.sequence)
1324 molecule = molecule.get_name()
1327 sequences_object = kwargs[
"sequences_object"]
1328 sequence = sequences_object.sequences[molecule]
1329 length = len(sequence)
1331 histogram = [0]*length
1333 if xl[self.protein1_key] == molecule:
1334 histogram[xl[self.residue1_key]-1] = \
1335 xl[self.residue1_links_number_key]
1336 if xl[self.protein2_key] == molecule:
1337 histogram[xl[self.residue2_key]-1] = \
1338 xl[self.residue2_links_number_key]
1339 plt.bar(range(1, length+1), histogram)
1340 plt.savefig(filename)
1344 if kwargs[
"type"] ==
"histogram":
1345 valuekey = kwargs[
"valuekey"]
1346 valuename = valuekey
1347 bins = kwargs[
"bins"]
1351 values_list.append(float(xl[valuekey]))
1353 print(
"CrossLinkDataBase.plot: Value error for cross-link "
1354 "%s" % (xl[self.unique_id_key]))
1357 filename, [values_list], valuename=valuename, bins=bins,
1358 colors=
None, format=
"pdf",
1359 reference_xline=
None, yplotrange=
None,
1360 xplotrange=
None, normalized=
True,
1363 def dump(self, json_filename):
1365 with open(json_filename,
'w')
as fp:
1366 json.dump(self.data_base, fp, sort_keys=
True, indent=2,
1367 default=set_json_default)
1369 def load(self, json_filename):
1371 with open(json_filename,
'r') as fp:
1372 self.data_base = json.load(fp)
1376 if sys.version_info[0] < 3:
1378 for k, v
in xl.iteritems():
1379 if type(k)
is unicode:
1381 if type(v)
is unicode:
1385 def save_csv(self, filename):
1390 sorted_group_ids = sorted(self.data_base.keys())
1391 for group
in sorted_group_ids:
1393 for xl
in self.data_base[group]:
1395 sorted_ids = sorted(xl.keys())
1396 values = [xl[k]
for k
in sorted_ids]
1397 group_block.append(values)
1400 with open(filename,
'w')
as fp:
1401 a = csv.writer(fp, delimiter=
',')
1402 a.writerow(sorted_ids)
1407 Returns the number of non redundant cross-link sites
1411 pra = _ProteinsResiduesArray(xl)
1412 prai = pra.get_inverted()
1415 return len(list(praset))
1419 """This class allows to compute and plot the distance between datasets"""
1422 """Input a dictionary where keys are cldb names and values are cldbs"""
1423 import scipy.spatial.distance
1424 self.dbs = cldb_dictionary
1425 self.keylist = list(self.dbs.keys())
1426 self.distances = list()
1428 for i, key1
in enumerate(self.keylist):
1429 for key2
in self.keylist[i+1:]:
1430 distance = self.get_distance(key1, key2)
1431 self.distances.append(distance)
1433 self.distances = scipy.spatial.distance.squareform(self.distances)
1436 return 1.0 - self.
jaccard_index(self.dbs[key1], self.dbs[key2])
1439 """Similarity index between two datasets
1440 https://en.wikipedia.org/wiki/Jaccard_index"""
1444 for xl1
in CrossLinkDataBase1:
1445 a1f = _ProteinsResiduesArray(xl1)
1446 a1b = a1f.get_inverted()
1449 for xl2
in CrossLinkDataBase2:
1450 a2f = _ProteinsResiduesArray(xl2)
1451 a2b = a2f.get_inverted()
1454 return (float(len(set1 & set2)/2)
1455 / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1457 def plot_matrix(self, figurename="clustermatrix.pdf"):
1458 import matplotlib
as mpl
1461 import matplotlib.pylab
as pl
1462 from scipy.cluster
import hierarchy
as hrc
1464 raw_distance_matrix = self.distances
1465 labels = self.keylist
1469 ax = fig.add_subplot(1, 1, 1)
1470 dendrogram = hrc.dendrogram(
1471 hrc.linkage(raw_distance_matrix),
1474 leaves_order = dendrogram[
'leaves']
1475 ax.set_xlabel(
'Dataset')
1476 ax.set_ylabel(
'Jaccard Distance')
1478 pl.savefig(
"dendrogram."+figurename, dpi=300)
1483 ax = fig.add_subplot(1, 1, 1)
1484 cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1485 interpolation=
'nearest')
1486 cb = fig.colorbar(cax)
1487 cb.set_label(
'Jaccard Distance')
1488 ax.set_xlabel(
'Dataset')
1489 ax.set_ylabel(
'Dataset')
1490 ax.set_xticks(range(len(labels)))
1491 ax.set_xticklabels(numpy.array(labels)[leaves_order],
1492 rotation=
'vertical')
1493 ax.set_yticks(range(len(labels)))
1494 ax.set_yticklabels(numpy.array(labels)[leaves_order],
1495 rotation=
'horizontal')
1497 pl.savefig(
"matrix." + figurename, dpi=300)
1503 This class maps a CrossLinkDataBase on a given structure
1504 and save an rmf file with color-coded cross-links
1507 def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1508 self.CrossLinkDataBase = CrossLinkDataBase
1511 self.prots = rmf_or_stat_handler
1512 self.distances = defaultdict(list)
1513 self.array_to_id = {}
1514 self.id_to_array = {}
1516 print(
"computing distances fro all cross-links and all structures")
1517 for i
in self.prots[::10]:
1518 self.compute_distances()
1519 for key, xl
in enumerate(self.CrossLinkDataBase):
1520 array = _ProteinsResiduesArray(xl)
1521 self.array_to_id[array] = key
1522 self.id_to_array[key] = array
1523 if xl[
"MinAmbiguousDistance"] !=
'None':
1524 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1526 def compute_distances(self):
1529 sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1530 for group
in sorted_group_ids:
1532 for xl
in self.CrossLinkDataBase.data_base[group]:
1534 sorted_ids = sorted(xl.keys())
1537 + [
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1538 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1540 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1541 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1551 values = [xl[k]
for k
in sorted_ids]
1552 values += [group, mdist]
1553 except KeyError
as e:
1554 print(
"MapCrossLinkDataBaseOnStructure "
1555 "KeyError: {0} in {1}".format(e, xl))
1556 group_dists.append(mdist)
1557 xl[
"Distance"] = mdist
1558 xl[
"State1"] = state1
1560 xl[
"State2"] = state2
1563 for xl
in self.CrossLinkDataBase.data_base[group]:
1564 xl[
"MinAmbiguousDistance"] = min(group_dists)
1566 def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1567 '''more robust and slower version of above'''
1570 selpart_1 = sel.get_selected_particles()
1571 if len(selpart_1) == 0:
1572 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1573 "selected for first site")
1577 selpart_2 = sel.get_selected_particles()
1578 if len(selpart_2) == 0:
1579 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1580 "selected for second site")
1583 for p1
in selpart_1:
1584 for p2
in selpart_2:
1585 if p1 == p2
and r1 == r2:
1593 h1 = h1.get_parent()
1596 h1 = h1.get_parent()
1600 h2 = h2.get_parent()
1603 h2 = h2.get_parent()
1606 results.append((dist, state_index1, copy_index1,
1607 state_index2, copy_index2, p1, p2))
1608 if len(results) == 0:
1610 results_sorted = sorted(results,
1611 key=operator.itemgetter(0, 1, 2, 3, 4))
1612 return ((results_sorted[0][0],
1613 results_sorted[0][5],
1614 results_sorted[0][6]),
1615 (results_sorted[0][1],
1616 results_sorted[0][2],
1617 results_sorted[0][3],
1618 results_sorted[0][4]))
1620 def save_rmf_snapshot(self, filename, color_id=None):
1621 sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1624 for group
in sorted_group_ids:
1625 group_dists_particles = []
1626 for xl
in self.CrossLinkDataBase.data_base[group]:
1628 self.CrossLinkDataBase.get_short_cross_link_string(xl)
1629 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1631 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1632 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1634 print(
"TypeError or missing chain/residue ",
1637 if color_id
is not None:
1638 group_dists_particles.append((mdist, p1, p2, xllabel,
1639 float(xl[color_id])))
1641 group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1642 if group_dists_particles:
1643 (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1644 min(group_dists_particles, key=
lambda t: t[0])
1645 color_scores.append(mincolor_score)
1646 list_of_pairs.append((minp1, minp2, minxllabel,
1652 linear.set_slope(1.0)
1655 'linear_dummy_restraints')
1657 offset = min(color_scores)
1658 maxvalue = max(color_scores)
1659 for pair
in list_of_pairs:
1662 pr.set_name(pair[2])
1663 if offset < maxvalue:
1664 factor = (pair[3]-offset)/(maxvalue-offset)
1671 rslin.add_restraint(pr)
1674 rh = RMF.create_rmf_file(filename)
1681 def boxplot_crosslink_distances(self, filename):
1683 keys = [self.id_to_array[k]
for k
in self.distances.keys()]
1684 medians = [numpy.median(self.distances[self.array_to_id[k]])
1686 dists = [self.distances[self.array_to_id[k]]
for k
in keys]
1687 distances_sorted_by_median = \
1688 [x
for _, x
in sorted(zip(medians, dists))]
1689 keys_sorted_by_median = [x
for _, x
in sorted(zip(medians, keys))]
1691 filename, distances_sorted_by_median,
1692 range(len(distances_sorted_by_median)),
1693 xlabels=keys_sorted_by_median,
1694 scale_plot_length=0.2)
1696 def get_crosslink_violations(self, threshold):
1697 violations = defaultdict(list)
1698 for k
in self.distances:
1700 viols = self.distances[k]
1701 violations[self.id_to_array[k]] = viols
1707 This class generates a CrossLinkDataBase from a given structure
1709 def __init__(self, system, residue_types_1=["K"],
1710 residue_types_2=[
"K"], reactivity_range=[0, 1], kt=1.0):
1714 cldbkc.set_protein1_key(
"Protein1")
1715 cldbkc.set_protein2_key(
"Protein2")
1716 cldbkc.set_residue1_key(
"Residue1")
1717 cldbkc.set_residue2_key(
"Residue2")
1719 self.system = system
1720 self.model = self.system.model
1721 self.residue_types_1 = residue_types_1
1722 self.residue_types_2 = residue_types_2
1724 self.indexes_dict1 = {}
1725 self.indexes_dict2 = {}
1726 self.protein_residue_dict = {}
1727 self.reactivity_dictionary = {}
1728 self.euclidean_interacting_pairs =
None
1729 self.xwalk_interacting_pairs =
None
1731 for state
in self.system.get_states():
1732 for moleculename, molecules
in state.get_molecules().items():
1733 for molecule
in molecules:
1736 seq = molecule.sequence
1737 residues = [i
for i
in range(1, len(seq)+1)
1738 if ((seq[i-1]
in self.residue_types_1)
1739 or (seq[i-1]
in self.residue_types_2))]
1745 rexp = numpy.random.exponential(0.00000001)
1746 prob = 1.0-math.exp(-rexp)
1747 self.reactivity_dictionary[(molecule, r)] = prob
1749 residues1 = [i
for i
in range(1, len(seq)+1)
1750 if seq[i-1]
in self.residue_types_1]
1751 residues2 = [i
for i
in range(1, len(seq)+1)
1752 if seq[i-1]
in self.residue_types_2]
1755 molecule.hier, residue_index=r, resolution=1)
1756 ps = s.get_selected_particles()
1758 index = p.get_index()
1759 self.indexes_dict1[index] = (molecule, r)
1760 self.protein_residue_dict[(molecule, r)] = index
1763 molecule.hier, residue_index=r, resolution=1)
1764 ps = s.get_selected_particles()
1766 index = p.get_index()
1767 self.indexes_dict2[index] = (molecule, r)
1768 self.protein_residue_dict[(molecule, r)] = index
1770 def get_all_possible_pairs(self):
1771 n = float(len(self.protein_residue_dict.keys()))
1772 return n*(n-1.0)/2.0
1774 def get_all_feasible_pairs(self, distance=21):
1776 particle_index_pairs = []
1778 for a, b
in itertools.combinations(
1779 self.protein_residue_dict.keys(), 2):
1781 index1 = self.protein_residue_dict[a]
1782 index2 = self.protein_residue_dict[b]
1786 if particle_distance <= distance:
1787 particle_index_pairs.append((index1, index2))
1788 new_xl[self.cldb.protein1_key] = a[0].get_name()
1789 new_xl[self.cldb.protein2_key] = b[0].get_name()
1790 new_xl[
"molecule_object1"] = a[0]
1791 new_xl[
"molecule_object2"] = b[0]
1792 new_xl[self.cldb.residue1_key] = a[1]
1793 new_xl[self.cldb.residue2_key] = b[1]
1794 self.cldb.data_base[str(nxl)] = [new_xl]
1799 def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1800 noise=0.01, distance=21, max_delta_distance=10.0,
1801 xwalk_bin_path=
None, confidence_false=0.75,
1802 confidence_true=0.75):
1804 from random
import random
1806 number_of_spectra = 1
1808 self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1809 self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1810 self.cldb.data_base[str(number_of_spectra)] = []
1811 self.sites_weighted =
None
1813 while number_of_spectra < total_number_of_spectra:
1814 if (random() > ambiguity_probability
1815 and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1817 number_of_spectra += 1
1818 self.cldb.data_base[str(number_of_spectra)] = []
1820 if random() > noise:
1822 pra, dist = self.get_random_residue_pair(
1823 distance, xwalk_bin_path, max_delta_distance)
1826 pra, dist = self.get_random_residue_pair(
None,
None,
None)
1830 new_xl[self.cldb.protein1_key] = pra[0].get_name()
1831 new_xl[self.cldb.protein2_key] = pra[1].get_name()
1832 new_xl[
"molecule_object1"] = pra[0]
1833 new_xl[
"molecule_object2"] = pra[1]
1834 new_xl[self.cldb.residue1_key] = pra[2]
1835 new_xl[self.cldb.residue2_key] = pra[3]
1836 new_xl[
"Noisy"] = noisy
1838 new_xl[
"Reactivity_Residue1"] = \
1839 self.reactivity_dictionary[(pra[0], pra[2])]
1840 new_xl[
"Reactivity_Residue2"] = \
1841 self.reactivity_dictionary[(pra[1], pra[3])]
1843 new_xl[
"Score"] = np.random.beta(1.0, self.beta_false)
1845 new_xl[
"Score"] = 1.0-np.random.beta(1.0, self.beta_true)
1846 new_xl[
"TargetDistance"] = dist
1847 new_xl[
"NoiseProbability"] = noise
1848 new_xl[
"AmbiguityProbability"] = ambiguity_probability
1851 self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1852 self.protein_residue_dict[(pra[1], pra[3])]])
1857 new_xl[
"InterRigidBody"] =
False
1862 new_xl[
"InterRigidBody"] =
True
1864 new_xl[
"InterRigidBody"] =
None
1866 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1870 def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1871 max_delta_distance=
None):
1874 from random
import choice
1875 if distance
is None:
1878 protein1, residue1 = choice(self.protein_residue_dict.keys())
1879 protein2, residue2 = choice(self.protein_residue_dict.keys())
1880 index1 = self.protein_residue_dict[(protein1, residue1)]
1881 index2 = self.protein_residue_dict[(protein2, residue2)]
1885 if (protein1, residue1) != (protein2, residue2):
1889 if not xwalk_bin_path:
1891 gcpf.set_distance(distance+max_delta_distance)
1895 if not self.sites_weighted:
1896 self.sites_weighted = []
1897 for key
in self.reactivity_dictionary:
1898 r = self.reactivity_dictionary[key]
1899 self.sites_weighted.append((key, r))
1901 first_site = self.weighted_choice(self.sites_weighted)
1903 if not self.euclidean_interacting_pairs:
1904 self.euclidean_interacting_pairs = \
1905 gcpf.get_close_pairs(
1907 self.indexes_dict1.keys(),
1908 self.indexes_dict2.keys())
1910 first_site_pairs = \
1911 [pair
for pair
in self.euclidean_interacting_pairs
1912 if self.indexes_dict1[pair[0]] == first_site
or
1913 self.indexes_dict2[pair[1]] == first_site]
1914 if len(first_site_pairs) == 0:
1917 second_sites_weighted = []
1918 for pair
in first_site_pairs:
1919 if self.indexes_dict1[pair[0]] == first_site:
1920 second_site = self.indexes_dict2[pair[1]]
1921 if self.indexes_dict2[pair[1]] == first_site:
1922 second_site = self.indexes_dict1[pair[0]]
1923 r = self.reactivity_dictionary[second_site]
1924 second_sites_weighted.append((second_site, r))
1925 second_site = self.weighted_choice(second_sites_weighted)
1926 protein1, residue1 = first_site
1927 protein2, residue2 = second_site
1928 print(
"CrossLinkDataBaseFromStructure."
1929 "get_random_residue_pair:",
1930 "First site", first_site,
1931 self.reactivity_dictionary[first_site],
1932 "Second site", second_site,
1933 self.reactivity_dictionary[second_site])
1936 [self.protein_residue_dict[first_site],
1937 self.protein_residue_dict[second_site]])
1942 if particle_distance < distance \
1943 and (protein1, residue1) != (protein2, residue2):
1945 elif (particle_distance >= distance
1946 and (protein1, residue1) != (protein2, residue2)
1947 and max_delta_distance):
1949 if particle_distance-distance < max_delta_distance:
1953 if not self.xwalk_interacting_pairs:
1954 self.xwalk_interacting_pairs = \
1955 self.get_xwalk_distances(xwalk_bin_path, distance)
1956 interacting_pairs_weighted = []
1958 for pair
in self.xwalk_interacting_pairs:
1964 -self.reactivity_dictionary[(protein1, residue1)]
1967 -self.reactivity_dictionary[(protein2, residue2)]
1969 interacting_pairs_weighted.append((pair, weight1*weight2))
1971 pair = self.weighted_choice(interacting_pairs_weighted)
1976 particle_distance = float(pair[4])
1978 return ((protein1, protein2, residue1, residue2)), particle_distance
1980 def get_xwalk_distances(self, xwalk_bin_path, distance):
1984 o.init_pdb(
"xwalk.pdb", self.representation.prot)
1985 o.write_pdb(
"xwalk.pdb")
1986 namechainiddict = o.dictchain[
"xwalk.pdb"]
1989 for key
in namechainiddict:
1990 chainiddict[namechainiddict[key]] = key
1991 xwalkout = os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path
1992 +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
1993 '-a1 cb -a2 cb -max '
1994 + str(distance) +
' -bb').read()
1996 output_list_of_distance = []
1997 for line
in xwalkout.split(
"\n")[0:-2]:
1998 tokens = line.split()
2001 distance = float(tokens[6])
2002 fs = first.split(
"-")
2003 ss = second.split(
"-")
2006 protein1 = chainiddict[chainid1]
2007 protein2 = chainiddict[chainid2]
2008 residue1 = int(fs[1])
2009 residue2 = int(ss[1])
2010 output_list_of_distance.append(
2011 (protein1, protein2, residue1, residue2, distance))
2012 return output_list_of_distance
2014 def weighted_choice(self, choices):
2017 total = sum(w
for c, w
in choices)
2018 r = random.uniform(0, total)
2020 for c, w
in choices:
2024 assert False,
"Shouldn't get here"
2026 def save_rmf_snapshot(self, filename, color_id=None):
2029 if color_id
is None:
2030 color_id =
"Reactivity"
2031 sorted_group_ids = sorted(self.cldb.data_base.keys())
2034 for group
in sorted_group_ids:
2035 group_dists_particles = []
2036 for xl
in self.cldb.data_base[group]:
2037 xllabel = self.cldb.get_short_cross_link_string(xl)
2038 (c1, c2, r1, r2) = (xl[
"molecule_object1"],
2039 xl[
"molecule_object2"],
2040 xl[self.cldb.residue1_key],
2041 xl[self.cldb.residue2_key])
2043 index1 = self.protein_residue_dict[(c1, r1)]
2044 index2 = self.protein_residue_dict[(c2, r2)]
2047 mdist = xl[
"TargetDistance"]
2049 print(
"TypeError or missing chain/residue ",
2052 group_dists_particles.append((mdist, p1, p2, xllabel,
2053 float(xl[color_id])))
2054 if group_dists_particles:
2055 (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2056 = min(group_dists_particles, key=
lambda t: t[0])
2057 color_scores.append(mincolor_score)
2058 list_of_pairs.append((minp1, minp2, minxllabel,
2065 linear.set_slope(1.0)
2069 offset = min(color_scores)
2070 maxvalue = max(color_scores)
2071 for pair
in list_of_pairs:
2073 pr.set_name(pair[2])
2074 factor = (pair[3]-offset)/(maxvalue-offset)
2079 rslin.add_restraint(pr)
2082 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
def get_number_of_unique_crosslinked_sites
Returns the number of non redundant cross-link sites.
def filter_score
Get all cross-links with score greater than an input value.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
Get the particles from a list of indexes.
def create_new_keyword
This function creates a new keyword for the whole database and set the values from and existing keywo...
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
def merge
This function merges two cross-link datasets so that if two conflicting cross-links have the same cro...
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded cross-...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def set_value
This function changes the value for a given key in the database For instance one can change the name ...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
def filter_out_same_residues
This function remove cross-links applied to the same residue (ie, same chain name and residue number)...
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def offset_residue_index
This function offset the residue indexes of a given protein by a specified value. ...
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
def jackknife
this method returns a CrossLinkDataBase class containing a random subsample of the original cross-lin...
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def check_cross_link_consistency
This function checks the consistency of the dataset with the amino acid sequence. ...
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
def get_values
this function returns the list of values for a given key in the database alphanumerically sorted ...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Associate an integer "state" index with a hierarchy node.
def append_database
Append cross-link dataset to this one.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
Mapping between FASTA one-letter codes and residue types.
A class to handle different styles of site pairs parsers.
def classify_crosslinks_by_score
This function creates as many classes as in the input (number_of_classes: integer) and partition cros...
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
this class handles a cross-link dataset and do filtering operations, adding cross-links, merge datasets...
Support for the RMF file format for storing hierarchical molecular data and markup.
def rename_proteins
This function renames all proteins contained in the input dictionary from the old names (keys) to the...
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.
def align_sequence
This function parses an alignment file obtained previously, to map crosslink residues onto the sequen...
def create_set_from_file
if FixedFormatParser is not specified, the file is comma-separated-values