1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
22 from collections
import defaultdict
27 def set_json_default(obj):
28 if isinstance(obj, set):
36 def _handle_string_input(inp):
37 if not isinstance(inp, str):
38 raise TypeError(
"expecting a string")
42 class _CrossLinkDataBaseStandardKeys:
44 This class setup all the standard keys needed to
45 identify the cross-link features from the data sets
49 self.protein1_key =
"Protein1"
50 self.type[self.protein1_key] = str
51 self.protein2_key =
"Protein2"
52 self.type[self.protein2_key] = str
53 self.residue1_key =
"Residue1"
54 self.type[self.residue1_key] = int
55 self.residue2_key =
"Residue2"
56 self.type[self.residue2_key] = int
57 self.residue1_amino_acid_key =
"Residue1AminoAcid"
58 self.type[self.residue1_amino_acid_key] = str
59 self.residue2_amino_acid_key =
"Residue2AminoAcid"
60 self.type[self.residue2_amino_acid_key] = str
61 self.residue1_moiety_key =
"Residue1Moiety"
62 self.type[self.residue1_moiety_key] = str
63 self.residue2_moiety_key =
"Residue2Moiety"
64 self.type[self.residue2_moiety_key] = str
65 self.site_pairs_key =
"SitePairs"
66 self.type[self.site_pairs_key] = str
67 self.unique_id_key =
"XLUniqueID"
68 self.type[self.unique_id_key] = str
69 self.unique_sub_index_key =
"XLUniqueSubIndex"
70 self.type[self.unique_sub_index_key] = int
71 self.unique_sub_id_key =
"XLUniqueSubID"
72 self.type[self.unique_sub_id_key] = str
73 self.data_set_name_key =
"DataSetName"
74 self.type[self.data_set_name_key] = str
75 self.cross_linker_chemical_key =
"CrossLinkerChemical"
76 self.type[self.cross_linker_chemical_key] = str
77 self.id_score_key =
"IDScore"
78 self.type[self.id_score_key] = float
80 self.type[self.fdr_key] = float
81 self.quantitation_key =
"Quantitation"
82 self.type[self.quantitation_key] = float
83 self.redundancy_key =
"Redundancy"
84 self.type[self.redundancy_key] = int
85 self.redundancy_list_key =
"RedundancyList"
86 self.type[self.redundancy_key] = list
87 self.ambiguity_key =
"Ambiguity"
88 self.type[self.ambiguity_key] = int
89 self.residue1_links_number_key =
"Residue1LinksNumber"
90 self.type[self.residue1_links_number_key] = int
91 self.residue2_links_number_key =
"Residue2LinksNumber"
92 self.type[self.residue2_links_number_key] = int
93 self.type[self.ambiguity_key] = int
94 self.state_key =
"State"
95 self.type[self.state_key] = int
96 self.sigma1_key =
"Sigma1"
97 self.type[self.sigma1_key] = str
98 self.sigma2_key =
"Sigma2"
99 self.type[self.sigma2_key] = str
101 self.type[self.psi_key] = str
102 self.alpha_key =
"Alpha"
103 self.type[self.alpha_key] = str
104 self.beta_key =
"Beta"
105 self.type[self.beta_key] = str
106 self.distance_key =
"Distance"
107 self.type[self.distance_key] = float
108 self.min_ambiguous_distance_key =
"MinAmbiguousDistance"
109 self.type[self.distance_key] = float
111 self.link_type_key =
"LinkType"
112 self.type[self.link_type_key] = str
114 self.ordered_key_list = [
115 self.data_set_name_key, self.unique_id_key,
116 self.unique_sub_index_key, self.unique_sub_id_key,
117 self.protein1_key, self.protein2_key, self.residue1_key,
118 self.residue2_key, self.residue1_amino_acid_key,
119 self.residue2_amino_acid_key, self.residue1_moiety_key,
120 self.residue2_moiety_key, self.cross_linker_chemical_key,
121 self.id_score_key, self.fdr_key, self.quantitation_key,
122 self.redundancy_key, self.redundancy_list_key, self.state_key,
123 self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
124 self.min_ambiguous_distance_key, self.link_type_key]
127 class _ProteinsResiduesArray(tuple):
129 This class is inherits from tuple, and it is a shorthand for a cross-link
130 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1
131 and protein2, r1 and r2 are residue1 and residue2.
134 def __new__(self, input_data):
136 @input_data can be a dict or a tuple
138 self.cldbsk = _CrossLinkDataBaseStandardKeys()
139 if type(input_data)
is dict:
141 p1 = input_data[self.cldbsk.protein1_key]
143 p2 = input_data[self.cldbsk.protein2_key]
146 r1 = input_data[self.cldbsk.residue1_key]
148 r2 = input_data[self.cldbsk.residue2_key]
154 t = (p1,
"", r1,
None)
155 elif type(input_data)
is tuple:
156 if len(input_data) > 4
or len(input_data) == 3 \
157 or len(input_data) == 1:
159 "_ProteinsResiduesArray: must have only 4 elements")
160 if len(input_data) == 4:
161 p1 = _handle_string_input(input_data[0])
162 p2 = _handle_string_input(input_data[1])
165 if (type(r1)
is not int)
and (r1
is not None):
167 "_ProteinsResiduesArray: residue1 must be a integer")
168 if (type(r2)
is not int)
and (r1
is not None):
170 "_ProteinsResiduesArray: residue2 must be a integer")
172 if len(input_data) == 2:
173 p1 = _handle_string_input(input_data[0])
175 if type(r1)
is not int:
177 "_ProteinsResiduesArray: residue1 must be a integer")
178 t = (p1,
"", r1,
None)
181 "_ProteinsResiduesArray: input must be a dict or tuple")
182 return tuple.__new__(_ProteinsResiduesArray, t)
184 def get_inverted(self):
186 Returns a _ProteinsResiduesArray instance with protein1 and
189 return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
192 outstr = self.cldbsk.protein1_key +
" " + str(self[0])
193 outstr +=
" " + self.cldbsk.protein2_key +
" " + str(self[1])
194 outstr +=
" " + self.cldbsk.residue1_key +
" " + str(self[2])
195 outstr +=
" " + self.cldbsk.residue2_key +
" " + str(self[3])
199 outstr = str(self[0]) +
"." + str(self[2]) +
"-" + str(self[1]) \
206 This class allows to create filter functions that can be passed to
207 the CrossLinkDataBase in this way:
209 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
211 where cldb is CrossLinkDataBase instance and it is only needed to get
212 the standard keywords
214 A filter operator can be evaluate on a CrossLinkDataBase item xl and
219 and it is used to filter the database
222 def __init__(self, argument1, operator, argument2):
224 (argument1,operator,argument2) can be either a
225 (keyword,operator.eq|lt|gt...,value)
226 or (FilterOperator1,operator.or|and...,FilterOperator2)
228 if isinstance(argument1, FilterOperator):
229 self.operations = [argument1, operator, argument2]
232 self.values = (argument1, operator, argument2)
234 def __or__(self, FilterOperator2):
237 def __and__(self, FilterOperator2):
240 def __invert__(self):
243 def evaluate(self, xl_item):
245 if len(self.operations) == 0:
246 keyword, operator, value = self.values
247 return operator(xl_item[keyword], value)
248 FilterOperator1, op, FilterOperator2 = self.operations
250 if FilterOperator2
is None:
251 return op(FilterOperator1.evaluate(xl_item))
253 return op(FilterOperator1.evaluate(xl_item),
254 FilterOperator2.evaluate(xl_item))
259 This class is needed to convert the keywords from a generic database
265 @param list_parser an instance of ResiduePairListParser, if any
269 self.backward_converter = {}
270 _CrossLinkDataBaseStandardKeys.__init__(self)
271 self.rplp = list_parser
272 if self.rplp
is None:
274 self.compulsory_keys = set([self.protein1_key,
279 self.compulsory_keys = self.rplp.get_compulsory_keys()
280 self.setup_keys = set()
284 Is a function that check whether necessary keys are setup
287 if self.compulsory_keys & setup_keys != self.compulsory_keys:
288 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup "
289 "all necessary keys")
293 Returns the keys that have been setup so far
295 return list(self.backward_converter.keys())
299 This sets up the standard compulsory keys as defined by
300 _CrossLinkDataBaseStandardKeys
302 for ck
in self.compulsory_keys:
303 self.converter[ck] = ck
304 self.backward_converter[ck] = ck
306 def set_unique_id_key(self, origin_key):
307 self.converter[origin_key] = self.unique_id_key
308 self.backward_converter[self.unique_id_key] = origin_key
310 def set_protein1_key(self, origin_key):
311 self.converter[origin_key] = self.protein1_key
312 self.backward_converter[self.protein1_key] = origin_key
314 def set_protein2_key(self, origin_key):
315 self.converter[origin_key] = self.protein2_key
316 self.backward_converter[self.protein2_key] = origin_key
318 def set_residue1_key(self, origin_key):
319 self.converter[origin_key] = self.residue1_key
320 self.backward_converter[self.residue1_key] = origin_key
322 def set_residue2_key(self, origin_key):
323 self.converter[origin_key] = self.residue2_key
324 self.backward_converter[self.residue2_key] = origin_key
326 def set_residue1_amino_acid_key(self, origin_key):
327 self.converter[origin_key] = self.residue1_amino_acid_key
328 self.backward_converter[self.residue1_amino_acid_key] = origin_key
330 def set_residue2_amino_acid_key(self, origin_key):
331 self.converter[origin_key] = self.residue2_amino_acid_key
332 self.backward_converter[self.residue2_amino_acid_key] = origin_key
334 def set_residue1_moiety_key(self, origin_key):
335 self.converter[origin_key] = self.residue1_moiety_key
336 self.backward_converter[self.residue1_moiety_key] = origin_key
338 def set_residue2_moiety_key(self, origin_key):
339 self.converter[origin_key] = self.residue2_moiety_key
340 self.backward_converter[self.residue2_moiety_key] = origin_key
342 def set_site_pairs_key(self, origin_key):
343 self.converter[origin_key] = self.site_pairs_key
344 self.backward_converter[self.site_pairs_key] = origin_key
346 def set_id_score_key(self, origin_key):
347 self.converter[origin_key] = self.id_score_key
348 self.backward_converter[self.id_score_key] = origin_key
350 def set_fdr_key(self, origin_key):
351 self.converter[origin_key] = self.fdr_key
352 self.backward_converter[self.fdr_key] = origin_key
354 def set_quantitation_key(self, origin_key):
355 self.converter[origin_key] = self.quantitation_key
356 self.backward_converter[self.quantitation_key] = origin_key
358 def set_psi_key(self, origin_key):
359 self.converter[origin_key] = self.psi_key
360 self.backward_converter[self.psi_key] = origin_key
362 def set_link_type_key(self, link_type_key):
363 self.converter[link_type_key] = self.link_type_key
364 self.backward_converter[self.link_type_key] = link_type_key
368 Returns the dictionary that convert the old keywords to the new ones
371 return self.converter
375 Returns the dictionary that convert the new keywords to the old ones
378 return self.backward_converter
383 A class to handle different styles of site pairs parsers.
385 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
386 [Y3-;K4-] for dead-ends
387 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
388 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
389 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
394 def __init__(self, style):
395 _CrossLinkDataBaseStandardKeys.__init__(self)
396 if style ==
"MSSTUDIO":
398 self.compulsory_keys = set([self.protein1_key,
400 self.site_pairs_key])
401 elif style ==
"XTRACT" or style ==
"QUANTITATION":
403 self.compulsory_keys = set([self.site_pairs_key])
404 elif style ==
"LAN_HUANG":
406 self.compulsory_keys = set([self.site_pairs_key])
408 raise Exception(
"ResiduePairListParser: unknown list parser style")
410 def get_compulsory_keys(self):
411 return self.compulsory_keys
415 This function returns a list of cross-linked residues and the
416 corresponding list of cross-linked chains. The latter list can be
417 empty, if the style doesn't have the corresponding information.
419 if self.style ==
"MSSTUDIO":
420 input_string = input_string.replace(
"[",
"")
421 input_string = input_string.replace(
"]",
"")
422 input_string_pairs = input_string.split(
";")
423 residue_pair_indexes = []
424 chain_pair_indexes = []
425 for s
in input_string_pairs:
427 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
428 r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
431 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
434 (residue_type_1, residue_index_1, residue_type_2,
435 residue_index_2) = m1.group(1, 2, 3, 4)
436 residue_pair_indexes.append((residue_index_1,
440 residue_type_1, residue_index_1 = m2.group(1, 2)
442 return residue_pair_indexes, chain_pair_indexes
443 if self.style ==
"XTRACT" or self.style ==
"QUANTITATION":
444 if ":x:" in input_string:
446 input_strings = input_string.split(
":x:")
447 first_peptides = input_strings[0].split(
":|:")
448 second_peptides = input_strings[1].split(
":|:")
449 first_peptides_identifiers = [
451 f.split(
":")[1])
for f
in first_peptides]
452 second_peptides_identifiers = [
454 f.split(
":")[1])
for f
in second_peptides]
455 residue_pair_indexes = []
456 chain_pair_indexes = []
457 for fpi
in first_peptides_identifiers:
458 for spi
in second_peptides_identifiers:
463 residue_pair_indexes.append((residue1, residue2))
464 chain_pair_indexes.append((chain1, chain2))
465 return residue_pair_indexes, chain_pair_indexes
468 first_peptides = input_string.split(
":|:")
469 first_peptides_identifiers = [
470 (f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
473 for fpi
in first_peptides_identifiers:
476 residue_indexes.append((residue1,))
477 chain_indexes.append((chain1,))
478 return residue_indexes, chain_indexes
479 if self.style ==
"LAN_HUANG":
480 input_strings = input_string.split(
"-")
481 chain1, first_series = input_strings[0].split(
":")
482 chain2, second_series = input_strings[1].split(
":")
484 first_residues = first_series.replace(
";",
"|").split(
"|")
485 second_residues = second_series.replace(
";",
"|").split(
"|")
486 residue_pair_indexes = []
487 chain_pair_indexes = []
488 for fpi
in first_residues:
489 for spi
in second_residues:
490 residue1 = self.re.sub(
"[^0-9]",
"", fpi)
491 residue2 = self.re.sub(
"[^0-9]",
"", spi)
492 residue_pair_indexes.append((residue1, residue2))
493 chain_pair_indexes.append((chain1, chain2))
494 return residue_pair_indexes, chain_pair_indexes
499 A class to handle different XL format with fixed format
500 currently support ProXL
502 def __init__(self, format):
503 _CrossLinkDataBaseStandardKeys.__init__(self)
504 if format ==
"PROXL":
507 raise Exception(
"FixedFormatParser: unknown list format name")
509 def get_data(self, input_string):
510 if self.format ==
"PROXL":
511 tokens = input_string.split(
"\t")
513 if tokens[0] ==
"SEARCH ID(S)":
516 xl[self.protein1_key] = tokens[2]
517 xl[self.protein2_key] = tokens[4]
518 xl[self.residue1_key] = int(tokens[3])
519 xl[self.residue2_key] = int(tokens[5])
525 this class handles a cross-link dataset and do filtering
526 operations, adding cross-links, merge datasets...
529 def __init__(self, converter=None, data_base=None, fasta_seq=None,
533 @param converter an instance of CrossLinkDataBaseKeywordsConverter
534 @param data_base an instance of CrossLinkDataBase to build the
536 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
537 protein fasta sequences to check cross-link consistency.
538 If not given consistency will not be checked
539 @param linkable_aa a tuple containing one-letter amino acids which
540 are linkable by the cross-linker; only used if the database
541 DOES NOT provide a value for a certain residueX_amino_acid_key
542 and if a fasta_seq is given
545 if data_base
is None:
548 self.data_base = data_base
550 _CrossLinkDataBaseStandardKeys.__init__(self)
551 if converter
is not None:
552 self.cldbkc = converter
553 self.list_parser = self.cldbkc.rplp
554 self.converter = converter.get_converter()
558 self.list_parser =
None
559 self.converter =
None
562 self.def_aa_tuple = linkable_aa
563 self.fasta_seq = fasta_seq
570 Update the whole dataset after changes
572 self.update_cross_link_unique_sub_index()
573 self.update_cross_link_redundancy()
574 self.update_residues_links_number()
578 sorted_ids = sorted(list(self.data_base.keys()))
580 for xl
in self.data_base[k]:
583 def xlid_iterator(self):
584 sorted_ids = sorted(list(self.data_base.keys()))
585 for xlid
in sorted_ids:
588 def __getitem__(self, xlid):
589 return self.data_base[xlid]
592 return len([xl
for xl
in self])
597 def set_name(self, name):
599 for k
in self.data_base:
600 new_data_base[k +
"." + name] = self.data_base[k]
601 self.data_base = new_data_base
605 def get_number_of_xlid(self):
606 return len(self.data_base)
609 FixedFormatParser=
None, encoding=
None):
611 if FixedFormatParser is not specified, the file is
612 comma-separated-values
614 @param file_name a txt file to be parsed
615 @param converter an instance of CrossLinkDataBaseKeywordsConverter
616 @param FixedFormatParser a parser for a fixed format
617 @param encoding the encoding of the file, if not the locale
618 default (usually UTF-8)
620 if not FixedFormatParser:
621 xl_list = IMP.pmi.tools.get_db_from_csv(file_name, encoding)
623 if converter
is not None:
624 self.cldbkc = converter
625 self.list_parser = self.cldbkc.rplp
626 self.converter = converter.get_converter()
628 if not self.list_parser:
632 for nxl, xl
in enumerate(xl_list):
635 if k
in self.converter:
636 new_xl[self.converter[k]] = \
637 self.type[self.converter[k]](xl[k])
640 if self.unique_id_key
in self.cldbkc.get_setup_keys():
641 if new_xl[self.unique_id_key]
not in new_xl_dict:
642 new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
644 new_xl_dict[new_xl[self.unique_id_key]].append(
647 if str(nxl)
not in new_xl_dict:
648 new_xl_dict[str(nxl)] = [new_xl]
650 new_xl_dict[str(nxl)].append(new_xl)
656 for nxl, entry
in enumerate(xl_list):
660 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
662 "CrossLinkDataBase: expecting a site_pairs_key "
663 "for the site pair list parser")
665 if k
in self.converter:
666 new_dict[self.converter[k]] = \
667 self.type[self.converter[k]](entry[k])
669 new_dict[k] = entry[k]
672 chain_pair_list) = self.list_parser.get_list(
673 new_dict[self.site_pairs_key])
676 for n, p
in enumerate(residue_pair_list):
683 new_xl[k] = new_dict[k]
684 new_xl[self.residue1_key] = \
685 self.type[self.residue1_key](p[0])
687 new_xl[self.residue2_key] = \
688 self.type[self.residue2_key](p[1])
690 if len(chain_pair_list) == len(residue_pair_list):
691 new_xl[self.protein1_key] = \
692 self.type[self.protein1_key](
693 chain_pair_list[n][0])
695 new_xl[self.protein2_key] = \
696 self.type[self.protein2_key](
697 chain_pair_list[n][1])
700 new_xl[self.link_type_key] =
"CROSSLINK"
702 new_xl[self.link_type_key] =
"MONOLINK"
704 if self.unique_id_key
in self.cldbkc.get_setup_keys():
705 if new_xl[self.unique_id_key]
not in new_xl_dict:
706 new_xl_dict[new_xl[self.unique_id_key]] = \
709 new_xl_dict[new_xl[self.unique_id_key]].append(
712 if str(nxl)
not in new_xl_dict:
713 new_xl[self.unique_id_key] = str(nxl+1)
714 new_xl_dict[str(nxl)] = [new_xl]
716 new_xl[self.unique_id_key] = str(nxl+1)
717 new_xl_dict[str(nxl)].append(new_xl)
721 if FixedFormatParser is defined
726 with open(file_name,
"r", encoding=encoding) as f:
728 xl = FixedFormatParser.get_data(line)
730 xl[self.unique_id_key] = str(nxl+1)
731 new_xl_dict[str(nxl)] = [xl]
734 self.data_base = new_xl_dict
735 self.name = file_name
736 loc = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
737 self.dataset = ihm.dataset.CXMSDataset(loc)
740 def update_cross_link_unique_sub_index(self):
741 for k
in self.data_base:
742 for n, xl
in enumerate(self.data_base[k]):
743 xl[self.ambiguity_key] = len(self.data_base[k])
744 xl[self.unique_sub_index_key] = n+1
745 xl[self.unique_sub_id_key] = k +
"." + str(n+1)
747 def update_cross_link_redundancy(self):
748 redundancy_data_base = {}
750 pra = _ProteinsResiduesArray(xl)
751 if pra
not in redundancy_data_base:
752 redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
753 redundancy_data_base[pra.get_inverted()] = \
754 [xl[self.unique_sub_id_key]]
756 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
757 redundancy_data_base[pra.get_inverted()].append(
758 xl[self.unique_sub_id_key])
760 pra = _ProteinsResiduesArray(xl)
761 xl[self.redundancy_key] = len(redundancy_data_base[pra])
762 xl[self.redundancy_list_key] = redundancy_data_base[pra]
764 def update_residues_links_number(self):
767 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
768 if (p1, r1)
not in residue_links:
769 residue_links[(p1, r1)] = set([(p2, r2)])
771 residue_links[(p1, r1)].add((p2, r2))
772 if (p2, r2)
not in residue_links:
773 residue_links[(p2, r2)] = set([(p1, r1)])
775 residue_links[(p2, r2)].add((p1, r1))
778 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
779 xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
780 xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
783 """This function checks the consistency of the dataset with the
784 amino acid sequence"""
785 if self.cldbkc
and self.fasta_seq:
786 cnt_matched, cnt_matched_file = 0, 0
790 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
791 b_matched_file =
False
792 if self.residue1_amino_acid_key
in xl:
795 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
796 b_matched = self._match_xlinks(p1, r1, aa_from_file)
797 b_matched_file = b_matched
801 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
803 matched, non_matched = self._update_matched_xlinks(
804 b_matched, p1, r1, matched, non_matched)
806 if self.residue2_amino_acid_key
in xl:
807 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
808 b_matched = self._match_xlinks(p2, r2, aa_from_file)
809 b_matched_file = b_matched
811 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
813 matched, non_matched = self._update_matched_xlinks(
814 b_matched, p2, r2, matched, non_matched)
818 cnt_matched_file += 1
820 percentage_matched = round(100*cnt_matched/len(self), 1)
821 percentage_matched_file = round(
822 100 * cnt_matched_file / len(self), 1)
823 if matched
or non_matched:
825 "check_cross_link_consistency: Out of %d cross-links "
826 "%d were matched to the fasta sequence (%f %%).\n "
827 "%d were matched by using the cross-link file (%f %%)."
828 % (len(self), cnt_matched, percentage_matched,
829 cnt_matched_file, percentage_matched_file))
831 print(
"check_cross_link_consistency: Warning: Non "
833 [(prot_name, sorted(list(non_matched[prot_name])))
834 for prot_name
in non_matched])
835 return matched, non_matched
837 def _match_xlinks(self, prot_name, res_index, aa_tuple):
841 amino_dict = IMP.pmi.alphabets.amino_acid
843 for amino_acid
in aa_tuple:
844 if len(amino_acid) == 3:
845 amino_acid = amino_dict.get_one_letter_code_from_residue_type(
847 if prot_name
in self.fasta_seq.sequences:
848 seq = self.fasta_seq.sequences[prot_name]
856 if res_index == 0
or res_index == 1:
858 if res_index < len(seq):
859 if amino_acid == seq[res_index]:
863 def _update_matched_xlinks(self, b_matched, prot, res, matched,
867 matched[prot].add(res)
869 matched[prot] = set([res])
871 if prot
in non_matched:
872 non_matched[prot].add(res)
874 non_matched[prot] = set([res])
875 return matched, non_matched
877 def get_cross_link_string(self, xl):
879 for k
in self.ordered_key_list:
881 string += str(k) +
":" + str(xl[k]) +
"|"
886 if k
not in self.ordered_key_list:
887 string += str(k) +
":" + str(xl[k]) +
"|"
891 def get_short_cross_link_string(self, xl):
893 list_of_keys = [self.data_set_name_key,
894 self.unique_sub_id_key,
902 for k
in list_of_keys:
904 string += str(xl[k]) +
"|"
910 def filter(self, FilterOperator):
912 for id
in list(self.data_base.keys()):
913 for xl
in self.data_base[id]:
914 if FilterOperator.evaluate(xl):
915 if id
not in new_xl_dict:
916 new_xl_dict[id] = [xl]
918 new_xl_dict[id].append(xl)
921 cdb.dataset = self.dataset
926 '''Get all cross-links with score greater than an input value'''
928 self.id_score_key, operator.gt, score)
929 return self.filter(FilterOperator)
931 def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
933 This function merges two cross-link datasets so that if two
934 conflicting cross-links have the same cross-link UniqueIDS,
935 the cross-links will be appended under the same UniqueID slots
936 with different SubIDs
938 raise NotImplementedError()
941 """Append cross-link dataset to this one."""
943 for k
in self.data_base:
944 new_data_base[k] = self.data_base[k]
945 for k
in db.data_base:
946 new_data_base[k] = db.data_base[k]
947 self.data_base = new_data_base
950 def __iadd__(self, db):
954 def set_value(self, key, new_value, filter_operator=None):
956 This function changes the value for a given key in the database
957 For instance one can change the name of a protein
958 @param key: the key in the database that must be changed
959 @param new_value: the new value of the key
960 @param filter_operator: optional FilterOperator to change the value to
961 a subset of the database
963 example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
964 FO(cldb.protein1_key, operator.eq, "AAA"))`
968 if filter_operator
is not None:
969 if filter_operator.evaluate(xl):
977 this function returns the list of values for a given key in the
978 database alphanumerically sorted
983 return sorted(list(values))
987 This function offset the residue indexes of a given protein by
989 @param protein_name: the protein name that need to be changed
990 @param offset: the offset value
994 if xl[self.protein1_key] == protein_name:
995 xl[self.residue1_key] = xl[self.residue1_key] + offset
996 if xl[self.protein2_key] == protein_name:
997 xl[self.residue2_key] = xl[self.residue2_key] + offset
1002 This function creates a new keyword for the whole database and
1003 set the values from and existing keyword (optional), otherwise the
1004 values are set to None
1005 @param keyword the new keyword name:
1006 @param values_from_keyword the keyword from which we are copying
1010 if values_from_keyword
is not None:
1011 xl[keyword] = xl[values_from_keyword]
1017 protein_to_rename=
"both"):
1019 This function renames all proteins contained in the input dictionary
1020 from the old names (keys) to the new name (values)
1021 @param old_to_new_names_dictionary dictionary for converting old
1023 @param protein_to_rename specify whether to rename both or protein1
1027 for old_name
in old_to_new_names_dictionary:
1028 new_name = old_to_new_names_dictionary[old_name]
1029 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
1031 self.
set_value(self.protein1_key, new_name, fo2)
1032 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
1034 self.
set_value(self.protein2_key, new_name, fo2)
1038 This function parses an alignment file obtained previously, to map
1039 crosslink residues onto the sequence of a homolog proteins. It is
1040 useful if you want to map the crosslinks onto a known, homolog
1041 structure without building a comparative model.
1042 The txt file of the alignment should be structured with repeating
1045 >NAME_CROSSLINKED_PROTEIN
1046 >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1047 one-letter code sequence for cross-linked protein (one line)
1048 one-letter code sequence for homolog protein with known structure (one line)
1051 f = open(alignfile,
'r')
1053 def _assign_residues(seq1, seq2):
1061 resinds1.append(nres)
1063 resinds1.append(
None)
1065 resinds2 = [
None for i
in resinds1]
1068 for pos, r2
in enumerate(seq2):
1071 resinds2[pos] = nres
1073 if contiguousgap > 1:
1074 for i
in range(pos-int(contiguousgap/2), pos):
1079 resinds2[pos] = lastresind
1081 for n, r
in enumerate(resinds1):
1089 with open(alignfile,
'r') as f:
1095 protname = aa.replace(
">",
"").replace(
"\n",
"")
1096 seq1 = cc.replace(
",",
"").replace(
"\n",
"")
1097 seq2 = dd.replace(
",",
"").replace(
"\n",
"")
1103 for i
in range(len(seq1)):
1115 tmpl1.append([i, c1, seq1[i]])
1116 tmpl2.append([i, c2, seq2[i]])
1117 sequences1[protname] = tmpl1
1118 sequences2[protname] = tmpl2
1120 d = _assign_residues(seq1, seq2)
1126 if fo11.evaluate(xl):
1127 t = [item
for item
in sequences1[protname]
1128 if item[1] == xl[self.residue1_key]][0]
1129 sequences1[protname][t[0]] = [
1130 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1131 t = [item
for item
in sequences2[protname]
1132 if item[1] == d[xl[self.residue1_key]]][0]
1133 sequences2[protname][t[0]] = [
1134 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1136 xl[self.residue1_key] = d[xl[self.residue1_key]]
1138 if fo12.evaluate(xl):
1140 t = [item
for item
in sequences1[protname]
1141 if item[1] == xl[self.residue2_key]][0]
1142 sequences1[protname][t[0]] = [
1143 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1144 t = [item
for item
in sequences2[protname]
1145 if item[1] == d[xl[self.residue2_key]]][0]
1146 sequences2[protname][t[0]] = [
1147 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1149 xl[self.residue2_key] = d[xl[self.residue2_key]]
1152 print(aa.replace(
"\n",
""))
1153 print(
"".join([seq[2]
for seq
in sequences1[protname]]))
1154 print(bb.replace(
"\n",
""))
1155 print(
"".join([seq[2]
for seq
in sequences2[protname]]))
1159 This function creates as many classes as in the input
1160 (number_of_classes: integer) and partition cross-links according
1161 to their identification scores. Classes are defined in the psi key.
1164 if self.id_score_key
is not None:
1168 'The cross-link database does not contain score values')
1169 minscore = min(scores)
1170 maxscore = max(scores)
1171 scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1172 if self.psi_key
is None:
1175 score = xl[self.id_score_key]
1176 for n, classmin
in enumerate(scoreclasses[0:-1]):
1177 if score >= classmin
and score <= scoreclasses[n+1]:
1178 xl[self.psi_key] = str(
"CLASS_"+str(n))
1181 def clone_protein(self, protein_name, new_protein_name):
1182 for id
in list(self.data_base.keys()):
1184 for xl
in self.data_base[id]:
1185 new_data_base.append(xl)
1186 if xl[self.protein1_key] == protein_name \
1187 and xl[self.protein2_key] != protein_name:
1189 new_xl[self.protein1_key] = new_protein_name
1190 new_data_base.append(new_xl)
1191 elif xl[self.protein1_key] != protein_name \
1192 and xl[self.protein2_key] == protein_name:
1194 new_xl[self.protein2_key] = new_protein_name
1195 new_data_base.append(new_xl)
1196 elif xl[self.protein1_key] == protein_name \
1197 and xl[self.protein2_key] == protein_name:
1199 new_xl[self.protein1_key] = new_protein_name
1200 new_data_base.append(new_xl)
1202 new_xl[self.protein2_key] = new_protein_name
1203 new_data_base.append(new_xl)
1205 new_xl[self.protein1_key] = new_protein_name
1206 new_xl[self.protein2_key] = new_protein_name
1207 new_data_base.append(new_xl)
1208 self.data_base[id] = new_data_base
1213 This function remove cross-links applied to the same residue
1214 (ie, same chain name and residue number)
1216 for id
in list(self.data_base.keys()):
1218 for xl
in self.data_base[id]:
1219 if xl[self.protein1_key] == xl[self.protein2_key] \
1220 and xl[self.residue1_key] == xl[self.residue2_key]:
1223 new_data_base.append(xl)
1224 self.data_base[id] = new_data_base
1229 this method returns a CrossLinkDataBase class containing
1230 a random subsample of the original cross-link database.
1231 @param percentage float between 0 and 1, is the percentage of
1232 of spectra taken from the original list
1235 if percentage > 1.0
or percentage < 0.0:
1236 raise ValueError(
'the percentage of random cross-link spectra '
1237 'should be between 0 and 1')
1238 nspectra = self.get_number_of_xlid()
1239 nrandom_spectra = int(nspectra*percentage)
1240 random_keys = random.sample(sorted(list(self.data_base.keys())),
1243 for k
in random_keys:
1244 new_data_base[k] = self.data_base[k]
1249 sorted_ids = sorted(list(self.data_base.keys()))
1251 for id
in sorted_ids:
1253 for xl
in self.data_base[id]:
1254 for k
in self.ordered_key_list:
1256 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1261 if k
not in self.ordered_key_list:
1262 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1263 outstr +=
"-------------\n"
1266 def plot(self, filename, **kwargs):
1268 matplotlib.use(
'Agg')
1269 import matplotlib.pyplot
as plt
1270 import matplotlib.colors
1272 if kwargs[
"type"] ==
"scatter":
1273 cmap = plt.get_cmap(
"rainbow")
1274 _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1275 xkey = kwargs[
"xkey"]
1276 ykey = kwargs[
"ykey"]
1277 if "colorkey" in kwargs:
1278 colorkey = kwargs[
"colorkey"]
1279 if "logyscale" in kwargs:
1280 logyscale = kwargs[
"logyscale"]
1288 xs.append(float(xl[xkey]))
1290 ys.append(math.log10(float(xl[ykey])))
1292 ys.append(float(xl[ykey]))
1293 colors.append(float(xl[colorkey]))
1295 print(
"CrossLinkDataBase.plot: Value error for "
1296 "cross-link %s" % (xl[self.unique_id_key]))
1300 for color
in colors:
1301 cindex = (color-min(colors))/(max(colors)-min(colors))
1302 cs.append(cmap(cindex))
1305 ax = fig.add_subplot(111)
1306 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker=
"o")
1309 plt.savefig(filename)
1313 if kwargs[
"type"] ==
"residue_links":
1317 molecule = kwargs[
"molecule"]
1319 length = len(molecule.sequence)
1320 molecule = molecule.get_name()
1323 sequences_object = kwargs[
"sequences_object"]
1324 sequence = sequences_object.sequences[molecule]
1325 length = len(sequence)
1327 histogram = [0]*length
1329 if xl[self.protein1_key] == molecule:
1330 histogram[xl[self.residue1_key]-1] = \
1331 xl[self.residue1_links_number_key]
1332 if xl[self.protein2_key] == molecule:
1333 histogram[xl[self.residue2_key]-1] = \
1334 xl[self.residue2_links_number_key]
1335 plt.bar(range(1, length+1), histogram)
1336 plt.savefig(filename)
1340 if kwargs[
"type"] ==
"histogram":
1341 valuekey = kwargs[
"valuekey"]
1342 valuename = valuekey
1343 bins = kwargs[
"bins"]
1347 values_list.append(float(xl[valuekey]))
1349 print(
"CrossLinkDataBase.plot: Value error for cross-link "
1350 "%s" % (xl[self.unique_id_key]))
1353 filename, [values_list], valuename=valuename, bins=bins,
1354 colors=
None, format=
"pdf",
1355 reference_xline=
None, yplotrange=
None,
1356 xplotrange=
None, normalized=
True,
1359 def dump(self, json_filename):
1361 with open(json_filename,
'w')
as fp:
1362 json.dump(self.data_base, fp, sort_keys=
True, indent=2,
1363 default=set_json_default)
1365 def load(self, json_filename):
1367 with open(json_filename,
'r') as fp:
1368 self.data_base = json.load(fp)
1371 def save_csv(self, filename):
1376 sorted_group_ids = sorted(list(self.data_base.keys()))
1377 for group
in sorted_group_ids:
1379 for xl
in self.data_base[group]:
1381 sorted_ids = sorted(list(xl.keys()))
1382 values = [xl[k]
for k
in sorted_ids]
1383 group_block.append(values)
1386 with open(filename,
'w')
as fp:
1387 a = csv.writer(fp, delimiter=
',')
1388 a.writerow(sorted_ids)
1393 Returns the number of non redundant cross-link sites
1397 pra = _ProteinsResiduesArray(xl)
1398 prai = pra.get_inverted()
1401 return len(list(praset))
1405 """This class allows to compute and plot the distance between datasets"""
1408 """Input a dictionary where keys are cldb names and values are cldbs"""
1409 import scipy.spatial.distance
1410 self.dbs = cldb_dictionary
1411 self.keylist = list(self.dbs.keys())
1412 self.distances = list()
1414 for i, key1
in enumerate(self.keylist):
1415 for key2
in self.keylist[i+1:]:
1416 distance = self.get_distance(key1, key2)
1417 self.distances.append(distance)
1419 self.distances = scipy.spatial.distance.squareform(self.distances)
1422 return 1.0 - self.
jaccard_index(self.dbs[key1], self.dbs[key2])
1425 """Similarity index between two datasets
1426 https://en.wikipedia.org/wiki/Jaccard_index"""
1430 for xl1
in CrossLinkDataBase1:
1431 a1f = _ProteinsResiduesArray(xl1)
1432 a1b = a1f.get_inverted()
1435 for xl2
in CrossLinkDataBase2:
1436 a2f = _ProteinsResiduesArray(xl2)
1437 a2b = a2f.get_inverted()
1440 return (float(len(set1 & set2)/2)
1441 / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1443 def plot_matrix(self, figurename="clustermatrix.pdf"):
1444 import matplotlib
as mpl
1447 import matplotlib.pylab
as pl
1448 from scipy.cluster
import hierarchy
as hrc
1450 raw_distance_matrix = self.distances
1451 labels = self.keylist
1455 ax = fig.add_subplot(1, 1, 1)
1456 dendrogram = hrc.dendrogram(
1457 hrc.linkage(raw_distance_matrix),
1460 leaves_order = dendrogram[
'leaves']
1461 ax.set_xlabel(
'Dataset')
1462 ax.set_ylabel(
'Jaccard Distance')
1464 pl.savefig(
"dendrogram."+figurename, dpi=300)
1469 ax = fig.add_subplot(1, 1, 1)
1470 cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1471 interpolation=
'nearest')
1472 cb = fig.colorbar(cax)
1473 cb.set_label(
'Jaccard Distance')
1474 ax.set_xlabel(
'Dataset')
1475 ax.set_ylabel(
'Dataset')
1476 ax.set_xticks(range(len(labels)))
1477 ax.set_xticklabels(numpy.array(labels)[leaves_order],
1478 rotation=
'vertical')
1479 ax.set_yticks(range(len(labels)))
1480 ax.set_yticklabels(numpy.array(labels)[leaves_order],
1481 rotation=
'horizontal')
1483 pl.savefig(
"matrix." + figurename, dpi=300)
1489 This class maps a CrossLinkDataBase on a given structure
1490 and save an rmf file with color-coded cross-links
1493 def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1494 self.CrossLinkDataBase = CrossLinkDataBase
1497 self.prots = rmf_or_stat_handler
1498 self.distances = defaultdict(list)
1499 self.array_to_id = {}
1500 self.id_to_array = {}
1502 print(
"computing distances for all cross-links and all structures")
1503 for i
in self.prots[::10]:
1504 self.compute_distances()
1505 for key, xl
in enumerate(self.CrossLinkDataBase):
1506 array = _ProteinsResiduesArray(xl)
1507 self.array_to_id[array] = key
1508 self.id_to_array[key] = array
1509 if xl[
"MinAmbiguousDistance"] !=
'None':
1510 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1512 def compute_distances(self):
1515 sorted_group_ids = \
1516 sorted(list(self.CrossLinkDataBase.data_base.keys()))
1517 for group
in sorted_group_ids:
1519 for xl
in self.CrossLinkDataBase.data_base[group]:
1521 sorted_ids = sorted(list(xl.keys()))
1524 + [
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1525 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1527 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1528 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1538 values = [xl[k]
for k
in sorted_ids]
1539 values += [group, mdist]
1540 except KeyError
as e:
1541 print(
"MapCrossLinkDataBaseOnStructure "
1542 "KeyError: {0} in {1}".format(e, xl))
1543 group_dists.append(mdist)
1544 xl[
"Distance"] = mdist
1545 xl[
"State1"] = state1
1547 xl[
"State2"] = state2
1550 for xl
in self.CrossLinkDataBase.data_base[group]:
1551 xl[
"MinAmbiguousDistance"] = min(group_dists)
1553 def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1554 '''more robust and slower version of above'''
1557 selpart_1 = sel.get_selected_particles()
1558 if len(selpart_1) == 0:
1559 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1560 "selected for first site")
1564 selpart_2 = sel.get_selected_particles()
1565 if len(selpart_2) == 0:
1566 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1567 "selected for second site")
1570 for p1
in selpart_1:
1571 for p2
in selpart_2:
1572 if p1 == p2
and r1 == r2:
1580 h1 = h1.get_parent()
1583 h1 = h1.get_parent()
1587 h2 = h2.get_parent()
1590 h2 = h2.get_parent()
1593 results.append((dist, state_index1, copy_index1,
1594 state_index2, copy_index2, p1, p2))
1595 if len(results) == 0:
1597 results_sorted = sorted(results,
1598 key=operator.itemgetter(0, 1, 2, 3, 4))
1599 return ((results_sorted[0][0],
1600 results_sorted[0][5],
1601 results_sorted[0][6]),
1602 (results_sorted[0][1],
1603 results_sorted[0][2],
1604 results_sorted[0][3],
1605 results_sorted[0][4]))
1607 def save_rmf_snapshot(self, filename, color_id=None):
1608 sorted_group_ids = \
1609 sorted(list(self.CrossLinkDataBase.data_base.keys()))
1612 for group
in sorted_group_ids:
1613 group_dists_particles = []
1614 for xl
in self.CrossLinkDataBase.data_base[group]:
1616 self.CrossLinkDataBase.get_short_cross_link_string(xl)
1617 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1619 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1620 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1622 print(
"TypeError or missing chain/residue ",
1625 if color_id
is not None:
1626 group_dists_particles.append((mdist, p1, p2, xllabel,
1627 float(xl[color_id])))
1629 group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1630 if group_dists_particles:
1631 (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1632 min(group_dists_particles, key=
lambda t: t[0])
1633 color_scores.append(mincolor_score)
1634 list_of_pairs.append((minp1, minp2, minxllabel,
1640 linear.set_slope(1.0)
1643 'linear_dummy_restraints')
1645 offset = min(color_scores)
1646 maxvalue = max(color_scores)
1647 for pair
in list_of_pairs:
1650 pr.set_name(pair[2])
1651 if offset < maxvalue:
1652 factor = (pair[3]-offset)/(maxvalue-offset)
1659 rslin.add_restraint(pr)
1662 rh = RMF.create_rmf_file(filename)
1669 def boxplot_crosslink_distances(self, filename):
1671 keys = [self.id_to_array[k]
for k
in list(self.distances.keys())]
1672 medians = [numpy.median(self.distances[self.array_to_id[k]])
1674 dists = [self.distances[self.array_to_id[k]]
for k
in keys]
1675 distances_sorted_by_median = \
1676 [x
for _, x
in sorted(zip(medians, dists))]
1677 keys_sorted_by_median = [x
for _, x
in sorted(zip(medians, keys))]
1679 filename, distances_sorted_by_median,
1680 range(len(distances_sorted_by_median)),
1681 xlabels=keys_sorted_by_median,
1682 scale_plot_length=0.2)
1684 def get_crosslink_violations(self, threshold):
1685 violations = defaultdict(list)
1686 for k
in self.distances:
1688 viols = self.distances[k]
1689 violations[self.id_to_array[k]] = viols
1695 This class generates a CrossLinkDataBase from a given structure
1697 def __init__(self, system, residue_types_1=["K"],
1698 residue_types_2=[
"K"], reactivity_range=[0, 1], kt=1.0):
1702 cldbkc.set_protein1_key(
"Protein1")
1703 cldbkc.set_protein2_key(
"Protein2")
1704 cldbkc.set_residue1_key(
"Residue1")
1705 cldbkc.set_residue2_key(
"Residue2")
1707 self.system = system
1708 self.model = self.system.model
1709 self.residue_types_1 = residue_types_1
1710 self.residue_types_2 = residue_types_2
1712 self.indexes_dict1 = {}
1713 self.indexes_dict2 = {}
1714 self.protein_residue_dict = {}
1715 self.reactivity_dictionary = {}
1716 self.euclidean_interacting_pairs =
None
1717 self.xwalk_interacting_pairs =
None
1719 for state
in self.system.get_states():
1720 for moleculename, molecules
in state.get_molecules().items():
1721 for molecule
in molecules:
1724 seq = molecule.sequence
1725 residues = [i
for i
in range(1, len(seq)+1)
1726 if ((seq[i-1]
in self.residue_types_1)
1727 or (seq[i-1]
in self.residue_types_2))]
1733 rexp = numpy.random.exponential(0.00000001)
1734 prob = 1.0-math.exp(-rexp)
1735 self.reactivity_dictionary[(molecule, r)] = prob
1737 residues1 = [i
for i
in range(1, len(seq)+1)
1738 if seq[i-1]
in self.residue_types_1]
1739 residues2 = [i
for i
in range(1, len(seq)+1)
1740 if seq[i-1]
in self.residue_types_2]
1743 molecule.hier, residue_index=r, resolution=1)
1744 ps = s.get_selected_particles()
1746 index = p.get_index()
1747 self.indexes_dict1[index] = (molecule, r)
1748 self.protein_residue_dict[(molecule, r)] = index
1751 molecule.hier, residue_index=r, resolution=1)
1752 ps = s.get_selected_particles()
1754 index = p.get_index()
1755 self.indexes_dict2[index] = (molecule, r)
1756 self.protein_residue_dict[(molecule, r)] = index
1758 def get_all_possible_pairs(self):
1759 n = float(len(list(self.protein_residue_dict.keys())))
1760 return n*(n-1.0)/2.0
1762 def get_all_feasible_pairs(self, distance=21):
1764 particle_index_pairs = []
1766 for a, b
in itertools.combinations(
1767 list(self.protein_residue_dict.keys()), 2):
1769 index1 = self.protein_residue_dict[a]
1770 index2 = self.protein_residue_dict[b]
1774 if particle_distance <= distance:
1775 particle_index_pairs.append((index1, index2))
1776 new_xl[self.cldb.protein1_key] = a[0].get_name()
1777 new_xl[self.cldb.protein2_key] = b[0].get_name()
1778 new_xl[
"molecule_object1"] = a[0]
1779 new_xl[
"molecule_object2"] = b[0]
1780 new_xl[self.cldb.residue1_key] = a[1]
1781 new_xl[self.cldb.residue2_key] = b[1]
1782 self.cldb.data_base[str(nxl)] = [new_xl]
1787 def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1788 noise=0.01, distance=21, max_delta_distance=10.0,
1789 xwalk_bin_path=
None, confidence_false=0.75,
1790 confidence_true=0.75):
1792 from random
import random
1794 number_of_spectra = 1
1796 self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1797 self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1798 self.cldb.data_base[str(number_of_spectra)] = []
1799 self.sites_weighted =
None
1801 while number_of_spectra < total_number_of_spectra:
1802 if (random() > ambiguity_probability
1803 and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1805 number_of_spectra += 1
1806 self.cldb.data_base[str(number_of_spectra)] = []
1808 if random() > noise:
1810 pra, dist = self.get_random_residue_pair(
1811 distance, xwalk_bin_path, max_delta_distance)
1814 pra, dist = self.get_random_residue_pair(
None,
None,
None)
1818 new_xl[self.cldb.protein1_key] = pra[0].get_name()
1819 new_xl[self.cldb.protein2_key] = pra[1].get_name()
1820 new_xl[
"molecule_object1"] = pra[0]
1821 new_xl[
"molecule_object2"] = pra[1]
1822 new_xl[self.cldb.residue1_key] = pra[2]
1823 new_xl[self.cldb.residue2_key] = pra[3]
1824 new_xl[
"Noisy"] = noisy
1826 new_xl[
"Reactivity_Residue1"] = \
1827 self.reactivity_dictionary[(pra[0], pra[2])]
1828 new_xl[
"Reactivity_Residue2"] = \
1829 self.reactivity_dictionary[(pra[1], pra[3])]
1830 new_xl[
"Reactivity"] = \
1831 new_xl[
"Reactivity_Residue1"]+new_xl[
"Reactivity_Residue2"]
1833 new_xl[
"IDScore"] = np.random.beta(1.0, self.beta_false)
1835 new_xl[
"IDScore"] = 1.0-np.random.beta(1.0, self.beta_true)
1836 new_xl[
"TargetDistance"] = dist
1837 new_xl[
"NoiseProbability"] = noise
1838 new_xl[
"AmbiguityProbability"] = ambiguity_probability
1841 self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1842 self.protein_residue_dict[(pra[1], pra[3])]])
1847 new_xl[
"InterRigidBody"] =
False
1852 new_xl[
"InterRigidBody"] =
True
1854 new_xl[
"InterRigidBody"] =
None
1856 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1860 def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1861 max_delta_distance=
None):
1864 from random
import choice
1865 if distance
is None:
1868 protein1, residue1 = \
1869 choice(list(self.protein_residue_dict.keys()))
1870 protein2, residue2 = \
1871 choice(list(self.protein_residue_dict.keys()))
1872 index1 = self.protein_residue_dict[(protein1, residue1)]
1873 index2 = self.protein_residue_dict[(protein2, residue2)]
1877 if (protein1, residue1) != (protein2, residue2):
1881 if not xwalk_bin_path:
1883 gcpf.set_distance(distance+max_delta_distance)
1887 if not self.sites_weighted:
1888 self.sites_weighted = []
1889 for key
in self.reactivity_dictionary:
1890 r = self.reactivity_dictionary[key]
1891 self.sites_weighted.append((key, r))
1893 first_site = self.weighted_choice(self.sites_weighted)
1895 if self.euclidean_interacting_pairs
is None:
1896 self.euclidean_interacting_pairs = \
1897 gcpf.get_close_pairs(
1899 list(self.indexes_dict1.keys()),
1900 list(self.indexes_dict2.keys()))
1902 first_site_pairs = \
1903 [pair
for pair
in self.euclidean_interacting_pairs
1904 if self.indexes_dict1[pair[0]] == first_site
or
1905 self.indexes_dict2[pair[1]] == first_site]
1906 if len(first_site_pairs) == 0:
1909 second_sites_weighted = []
1910 for pair
in first_site_pairs:
1911 if self.indexes_dict1[pair[0]] == first_site:
1912 second_site = self.indexes_dict2[pair[1]]
1913 if self.indexes_dict2[pair[1]] == first_site:
1914 second_site = self.indexes_dict1[pair[0]]
1915 r = self.reactivity_dictionary[second_site]
1916 second_sites_weighted.append((second_site, r))
1917 second_site = self.weighted_choice(second_sites_weighted)
1918 protein1, residue1 = first_site
1919 protein2, residue2 = second_site
1920 print(
"CrossLinkDataBaseFromStructure."
1921 "get_random_residue_pair:",
1922 "First site", first_site,
1923 self.reactivity_dictionary[first_site],
1924 "Second site", second_site,
1925 self.reactivity_dictionary[second_site])
1928 [self.protein_residue_dict[first_site],
1929 self.protein_residue_dict[second_site]])
1934 if particle_distance < distance \
1935 and (protein1, residue1) != (protein2, residue2):
1937 elif (particle_distance >= distance
1938 and (protein1, residue1) != (protein2, residue2)
1939 and max_delta_distance):
1941 if particle_distance-distance < max_delta_distance:
1945 if not self.xwalk_interacting_pairs:
1946 self.xwalk_interacting_pairs = \
1947 self.get_xwalk_distances(xwalk_bin_path, distance)
1948 interacting_pairs_weighted = []
1950 for pair
in self.xwalk_interacting_pairs:
1956 -self.reactivity_dictionary[(protein1, residue1)]
1959 -self.reactivity_dictionary[(protein2, residue2)]
1961 interacting_pairs_weighted.append((pair, weight1*weight2))
1963 pair = self.weighted_choice(interacting_pairs_weighted)
1968 particle_distance = float(pair[4])
1970 return ((protein1, protein2, residue1, residue2)), particle_distance
1972 def get_xwalk_distances(self, xwalk_bin_path, distance):
1976 o.init_pdb(
"xwalk.pdb", self.representation.prot)
1977 o.write_pdb(
"xwalk.pdb")
1978 namechainiddict = o.dictchain[
"xwalk.pdb"]
1981 for key
in namechainiddict:
1982 chainiddict[namechainiddict[key]] = key
1983 xwalkout = os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path
1984 +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
1985 '-a1 cb -a2 cb -max '
1986 + str(distance) +
' -bb').read()
1988 output_list_of_distance = []
1989 for line
in xwalkout.split(
"\n")[0:-2]:
1990 tokens = line.split()
1993 distance = float(tokens[6])
1994 fs = first.split(
"-")
1995 ss = second.split(
"-")
1998 protein1 = chainiddict[chainid1]
1999 protein2 = chainiddict[chainid2]
2000 residue1 = int(fs[1])
2001 residue2 = int(ss[1])
2002 output_list_of_distance.append(
2003 (protein1, protein2, residue1, residue2, distance))
2004 return output_list_of_distance
2006 def weighted_choice(self, choices):
2009 total = sum(w
for c, w
in choices)
2010 r = random.uniform(0, total)
2012 for c, w
in choices:
2016 assert False,
"Shouldn't get here"
2018 def save_rmf_snapshot(self, filename, color_id=None):
2021 if color_id
is None:
2022 color_id =
"Reactivity"
2023 sorted_group_ids = sorted(list(self.cldb.data_base.keys()))
2026 for group
in sorted_group_ids:
2027 group_dists_particles = []
2028 for xl
in self.cldb.data_base[group]:
2029 xllabel = self.cldb.get_short_cross_link_string(xl)
2030 (c1, c2, r1, r2) = (xl[
"molecule_object1"],
2031 xl[
"molecule_object2"],
2032 xl[self.cldb.residue1_key],
2033 xl[self.cldb.residue2_key])
2035 index1 = self.protein_residue_dict[(c1, r1)]
2036 index2 = self.protein_residue_dict[(c2, r2)]
2039 mdist = xl[
"TargetDistance"]
2041 print(
"TypeError or missing chain/residue ",
2044 group_dists_particles.append((mdist, p1, p2, xllabel,
2045 float(xl[color_id])))
2046 if group_dists_particles:
2047 (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2048 = min(group_dists_particles, key=
lambda t: t[0])
2049 color_scores.append(mincolor_score)
2050 list_of_pairs.append((minp1, minp2, minxllabel,
2057 linear.set_slope(1.0)
2061 offset = min(color_scores)
2062 maxvalue = max(color_scores)
2063 for pair
in list_of_pairs:
2065 pr.set_name(pair[2])
2066 factor = (pair[3]-offset)/(maxvalue-offset)
2071 rslin.add_restraint(pr)
2074 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
def get_number_of_unique_crosslinked_sites
Returns the number of non redundant cross-link sites.
def filter_score
Get all cross-links with score greater than an input value.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
Get the particles from a list of indexes.
def create_new_keyword
This function creates a new keyword for the whole database and set the values from and existing keywo...
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
def merge
This function merges two cross-link datasets so that if two conflicting cross-links have the same cro...
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded cross-...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def set_value
This function changes the value for a given key in the database For instance one can change the name ...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
def filter_out_same_residues
This function remove cross-links applied to the same residue (ie, same chain name and residue number)...
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def offset_residue_index
This function offset the residue indexes of a given protein by a specified value. ...
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
def jackknife
this method returns a CrossLinkDataBase class containing a random subsample of the original cross-lin...
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def check_cross_link_consistency
This function checks the consistency of the dataset with the amino acid sequence. ...
Find all nearby pairs by testing all pairs.
Score a pair of particles based on the distance between their centers.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
def get_values
this function returns the list of values for a given key in the database alphanumerically sorted ...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Associate an integer "state" index with a hierarchy node.
def append_database
Append cross-link dataset to this one.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
Mapping between FASTA one-letter codes and residue types.
A class to handle different styles of site pairs parsers.
def classify_crosslinks_by_score
This function creates as many classes as in the input (number_of_classes: integer) and partition cros...
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
this class handles a cross-link dataset and do filtering operations, adding cross-links, merge datasets...
Support for the RMF file format for storing hierarchical molecular data and markup.
def rename_proteins
This function renames all proteins contained in the input dictionary from the old names (keys) to the...
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.
def align_sequence
This function parses an alignment file obtained previously, to map crosslink residues onto the sequen...
def create_set_from_file
if FixedFormatParser is not specified, the file is comma-separated-values