1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
22 from collections
import defaultdict
27 def set_json_default(obj):
28 if isinstance(obj, set):
36 def _handle_string_input(inp):
37 if not isinstance(inp, str):
38 raise TypeError(
"expecting a string")
42 class _CrossLinkDataBaseStandardKeys:
44 This class setup all the standard keys needed to
45 identify the cross-link features from the data sets
49 self.protein1_key =
"Protein1"
50 self.type[self.protein1_key] = str
51 self.protein2_key =
"Protein2"
52 self.type[self.protein2_key] = str
53 self.residue1_key =
"Residue1"
54 self.type[self.residue1_key] = int
55 self.residue2_key =
"Residue2"
56 self.type[self.residue2_key] = int
57 self.residue1_amino_acid_key =
"Residue1AminoAcid"
58 self.type[self.residue1_amino_acid_key] = str
59 self.residue2_amino_acid_key =
"Residue2AminoAcid"
60 self.type[self.residue2_amino_acid_key] = str
61 self.residue1_moiety_key =
"Residue1Moiety"
62 self.type[self.residue1_moiety_key] = str
63 self.residue2_moiety_key =
"Residue2Moiety"
64 self.type[self.residue2_moiety_key] = str
65 self.site_pairs_key =
"SitePairs"
66 self.type[self.site_pairs_key] = str
67 self.unique_id_key =
"XLUniqueID"
68 self.type[self.unique_id_key] = str
69 self.unique_sub_index_key =
"XLUniqueSubIndex"
70 self.type[self.unique_sub_index_key] = int
71 self.unique_sub_id_key =
"XLUniqueSubID"
72 self.type[self.unique_sub_id_key] = str
73 self.data_set_name_key =
"DataSetName"
74 self.type[self.data_set_name_key] = str
75 self.cross_linker_chemical_key =
"CrossLinkerChemical"
76 self.type[self.cross_linker_chemical_key] = str
77 self.id_score_key =
"IDScore"
78 self.type[self.id_score_key] = float
80 self.type[self.fdr_key] = float
81 self.quantitation_key =
"Quantitation"
82 self.type[self.quantitation_key] = float
83 self.redundancy_key =
"Redundancy"
84 self.type[self.redundancy_key] = int
85 self.redundancy_list_key =
"RedundancyList"
86 self.type[self.redundancy_key] = list
87 self.ambiguity_key =
"Ambiguity"
88 self.type[self.ambiguity_key] = int
89 self.residue1_links_number_key =
"Residue1LinksNumber"
90 self.type[self.residue1_links_number_key] = int
91 self.residue2_links_number_key =
"Residue2LinksNumber"
92 self.type[self.residue2_links_number_key] = int
93 self.type[self.ambiguity_key] = int
94 self.state_key =
"State"
95 self.type[self.state_key] = int
96 self.sigma1_key =
"Sigma1"
97 self.type[self.sigma1_key] = str
98 self.sigma2_key =
"Sigma2"
99 self.type[self.sigma2_key] = str
101 self.type[self.psi_key] = str
102 self.distance_key =
"Distance"
103 self.type[self.distance_key] = float
104 self.min_ambiguous_distance_key =
"MinAmbiguousDistance"
105 self.type[self.distance_key] = float
107 self.link_type_key =
"LinkType"
108 self.type[self.link_type_key] = str
110 self.ordered_key_list = [
111 self.data_set_name_key, self.unique_id_key,
112 self.unique_sub_index_key, self.unique_sub_id_key,
113 self.protein1_key, self.protein2_key, self.residue1_key,
114 self.residue2_key, self.residue1_amino_acid_key,
115 self.residue2_amino_acid_key, self.residue1_moiety_key,
116 self.residue2_moiety_key, self.cross_linker_chemical_key,
117 self.id_score_key, self.fdr_key, self.quantitation_key,
118 self.redundancy_key, self.redundancy_list_key, self.state_key,
119 self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
120 self.min_ambiguous_distance_key, self.link_type_key]
123 class _ProteinsResiduesArray(tuple):
125 This class is inherits from tuple, and it is a shorthand for a cross-link
126 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1
127 and protein2, r1 and r2 are residue1 and residue2.
130 def __new__(self, input_data):
132 @input_data can be a dict or a tuple
134 self.cldbsk = _CrossLinkDataBaseStandardKeys()
135 if type(input_data)
is dict:
137 p1 = input_data[self.cldbsk.protein1_key]
139 p2 = input_data[self.cldbsk.protein2_key]
142 r1 = input_data[self.cldbsk.residue1_key]
144 r2 = input_data[self.cldbsk.residue2_key]
150 t = (p1,
"", r1,
None)
151 elif type(input_data)
is tuple:
152 if len(input_data) > 4
or len(input_data) == 3 \
153 or len(input_data) == 1:
155 "_ProteinsResiduesArray: must have only 4 elements")
156 if len(input_data) == 4:
157 p1 = _handle_string_input(input_data[0])
158 p2 = _handle_string_input(input_data[1])
161 if (type(r1)
is not int)
and (r1
is not None):
163 "_ProteinsResiduesArray: residue1 must be a integer")
164 if (type(r2)
is not int)
and (r1
is not None):
166 "_ProteinsResiduesArray: residue2 must be a integer")
168 if len(input_data) == 2:
169 p1 = _handle_string_input(input_data[0])
171 if type(r1)
is not int:
173 "_ProteinsResiduesArray: residue1 must be a integer")
174 t = (p1,
"", r1,
None)
177 "_ProteinsResiduesArray: input must be a dict or tuple")
178 return tuple.__new__(_ProteinsResiduesArray, t)
180 def get_inverted(self):
182 Returns a _ProteinsResiduesArray instance with protein1 and
185 return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
188 outstr = self.cldbsk.protein1_key +
" " + str(self[0])
189 outstr +=
" " + self.cldbsk.protein2_key +
" " + str(self[1])
190 outstr +=
" " + self.cldbsk.residue1_key +
" " + str(self[2])
191 outstr +=
" " + self.cldbsk.residue2_key +
" " + str(self[3])
195 outstr = str(self[0]) +
"." + str(self[2]) +
"-" + str(self[1]) \
202 This class allows to create filter functions that can be passed to
203 the CrossLinkDataBase in this way:
205 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
207 where cldb is CrossLinkDataBase instance and it is only needed to get
208 the standard keywords
210 A filter operator can be evaluate on a CrossLinkDataBase item xl and
215 and it is used to filter the database
218 def __init__(self, argument1, operator, argument2):
220 (argument1,operator,argument2) can be either a
221 (keyword,operator.eq|lt|gt...,value)
222 or (FilterOperator1,operator.or|and...,FilterOperator2)
224 if isinstance(argument1, FilterOperator):
225 self.operations = [argument1, operator, argument2]
228 self.values = (argument1, operator, argument2)
230 def __or__(self, FilterOperator2):
233 def __and__(self, FilterOperator2):
236 def __invert__(self):
239 def evaluate(self, xl_item):
241 if len(self.operations) == 0:
242 keyword, operator, value = self.values
243 return operator(xl_item[keyword], value)
244 FilterOperator1, op, FilterOperator2 = self.operations
246 if FilterOperator2
is None:
247 return op(FilterOperator1.evaluate(xl_item))
249 return op(FilterOperator1.evaluate(xl_item),
250 FilterOperator2.evaluate(xl_item))
255 This class is needed to convert the keywords from a generic database
261 @param list_parser an instance of ResiduePairListParser, if any
265 self.backward_converter = {}
266 _CrossLinkDataBaseStandardKeys.__init__(self)
267 self.rplp = list_parser
268 if self.rplp
is None:
270 self.compulsory_keys = set([self.protein1_key,
275 self.compulsory_keys = self.rplp.get_compulsory_keys()
276 self.setup_keys = set()
280 Is a function that check whether necessary keys are setup
283 if self.compulsory_keys & setup_keys != self.compulsory_keys:
284 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup "
285 "all necessary keys")
289 Returns the keys that have been setup so far
291 return self.backward_converter.keys()
295 This sets up the standard compulsory keys as defined by
296 _CrossLinkDataBaseStandardKeys
298 for ck
in self.compulsory_keys:
299 self.converter[ck] = ck
300 self.backward_converter[ck] = ck
302 def set_unique_id_key(self, origin_key):
303 self.converter[origin_key] = self.unique_id_key
304 self.backward_converter[self.unique_id_key] = origin_key
306 def set_protein1_key(self, origin_key):
307 self.converter[origin_key] = self.protein1_key
308 self.backward_converter[self.protein1_key] = origin_key
310 def set_protein2_key(self, origin_key):
311 self.converter[origin_key] = self.protein2_key
312 self.backward_converter[self.protein2_key] = origin_key
314 def set_residue1_key(self, origin_key):
315 self.converter[origin_key] = self.residue1_key
316 self.backward_converter[self.residue1_key] = origin_key
318 def set_residue2_key(self, origin_key):
319 self.converter[origin_key] = self.residue2_key
320 self.backward_converter[self.residue2_key] = origin_key
322 def set_residue1_amino_acid_key(self, origin_key):
323 self.converter[origin_key] = self.residue1_amino_acid_key
324 self.backward_converter[self.residue1_amino_acid_key] = origin_key
326 def set_residue2_amino_acid_key(self, origin_key):
327 self.converter[origin_key] = self.residue2_amino_acid_key
328 self.backward_converter[self.residue2_amino_acid_key] = origin_key
330 def set_residue1_moiety_key(self, origin_key):
331 self.converter[origin_key] = self.residue1_moiety_key
332 self.backward_converter[self.residue1_moiety_key] = origin_key
334 def set_residue2_moiety_key(self, origin_key):
335 self.converter[origin_key] = self.residue2_moiety_key
336 self.backward_converter[self.residue2_moiety_key] = origin_key
338 def set_site_pairs_key(self, origin_key):
339 self.converter[origin_key] = self.site_pairs_key
340 self.backward_converter[self.site_pairs_key] = origin_key
342 def set_id_score_key(self, origin_key):
343 self.converter[origin_key] = self.id_score_key
344 self.backward_converter[self.id_score_key] = origin_key
346 def set_fdr_key(self, origin_key):
347 self.converter[origin_key] = self.fdr_key
348 self.backward_converter[self.fdr_key] = origin_key
350 def set_quantitation_key(self, origin_key):
351 self.converter[origin_key] = self.quantitation_key
352 self.backward_converter[self.quantitation_key] = origin_key
354 def set_psi_key(self, origin_key):
355 self.converter[origin_key] = self.psi_key
356 self.backward_converter[self.psi_key] = origin_key
358 def set_link_type_key(self, link_type_key):
359 self.converter[link_type_key] = self.link_type_key
360 self.backward_converter[self.link_type_key] = link_type_key
364 Returns the dictionary that convert the old keywords to the new ones
367 return self.converter
371 Returns the dictionary that convert the new keywords to the old ones
374 return self.backward_converter
379 A class to handle different styles of site pairs parsers.
381 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
382 [Y3-;K4-] for dead-ends
383 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
384 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
385 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
390 def __init__(self, style):
391 _CrossLinkDataBaseStandardKeys.__init__(self)
392 if style ==
"MSSTUDIO":
394 self.compulsory_keys = set([self.protein1_key,
396 self.site_pairs_key])
397 elif style ==
"XTRACT" or style ==
"QUANTITATION":
399 self.compulsory_keys = set([self.site_pairs_key])
400 elif style ==
"LAN_HUANG":
402 self.compulsory_keys = set([self.site_pairs_key])
404 raise Exception(
"ResiduePairListParser: unknown list parser style")
406 def get_compulsory_keys(self):
407 return self.compulsory_keys
411 This function returns a list of cross-linked residues and the
412 corresponding list of cross-linked chains. The latter list can be
413 empty, if the style doesn't have the corresponding information.
415 if self.style ==
"MSSTUDIO":
416 input_string = input_string.replace(
"[",
"")
417 input_string = input_string.replace(
"]",
"")
418 input_string_pairs = input_string.split(
";")
419 residue_pair_indexes = []
420 chain_pair_indexes = []
421 for s
in input_string_pairs:
423 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
424 r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
427 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
430 (residue_type_1, residue_index_1, residue_type_2,
431 residue_index_2) = m1.group(1, 2, 3, 4)
432 residue_pair_indexes.append((residue_index_1,
436 residue_type_1, residue_index_1 = m2.group(1, 2)
438 return residue_pair_indexes, chain_pair_indexes
439 if self.style ==
"XTRACT" or self.style ==
"QUANTITATION":
440 if ":x:" in input_string:
442 input_strings = input_string.split(
":x:")
443 first_peptides = input_strings[0].split(
":|:")
444 second_peptides = input_strings[1].split(
":|:")
445 first_peptides_identifiers = [
447 f.split(
":")[1])
for f
in first_peptides]
448 second_peptides_identifiers = [
450 f.split(
":")[1])
for f
in second_peptides]
451 residue_pair_indexes = []
452 chain_pair_indexes = []
453 for fpi
in first_peptides_identifiers:
454 for spi
in second_peptides_identifiers:
459 residue_pair_indexes.append((residue1, residue2))
460 chain_pair_indexes.append((chain1, chain2))
461 return residue_pair_indexes, chain_pair_indexes
464 first_peptides = input_string.split(
":|:")
465 first_peptides_identifiers = [
466 (f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
469 for fpi
in first_peptides_identifiers:
472 residue_indexes.append((residue1,))
473 chain_indexes.append((chain1,))
474 return residue_indexes, chain_indexes
475 if self.style ==
"LAN_HUANG":
476 input_strings = input_string.split(
"-")
477 chain1, first_series = input_strings[0].split(
":")
478 chain2, second_series = input_strings[1].split(
":")
480 first_residues = first_series.replace(
";",
"|").split(
"|")
481 second_residues = second_series.replace(
";",
"|").split(
"|")
482 residue_pair_indexes = []
483 chain_pair_indexes = []
484 for fpi
in first_residues:
485 for spi
in second_residues:
486 residue1 = self.re.sub(
"[^0-9]",
"", fpi)
487 residue2 = self.re.sub(
"[^0-9]",
"", spi)
488 residue_pair_indexes.append((residue1, residue2))
489 chain_pair_indexes.append((chain1, chain2))
490 return residue_pair_indexes, chain_pair_indexes
495 A class to handle different XL format with fixed format
496 currently support ProXL
498 def __init__(self, format):
499 _CrossLinkDataBaseStandardKeys.__init__(self)
500 if format ==
"PROXL":
503 raise Exception(
"FixedFormatParser: unknown list format name")
505 def get_data(self, input_string):
506 if self.format ==
"PROXL":
507 tokens = input_string.split(
"\t")
509 if tokens[0] ==
"SEARCH ID(S)":
512 xl[self.protein1_key] = tokens[2]
513 xl[self.protein2_key] = tokens[4]
514 xl[self.residue1_key] = int(tokens[3])
515 xl[self.residue2_key] = int(tokens[5])
521 this class handles a cross-link dataset and do filtering
522 operations, adding cross-links, merge datasets...
525 def __init__(self, converter=None, data_base=None, fasta_seq=None,
529 @param converter an instance of CrossLinkDataBaseKeywordsConverter
530 @param data_base an instance of CrossLinkDataBase to build the
532 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
533 protein fasta sequences to check cross-link consistency.
534 If not given consistency will not be checked
535 @param linkable_aa a tuple containing one-letter amino acids which
536 are linkable by the cross-linker; only used if the database
537 DOES NOT provide a value for a certain residueX_amino_acid_key
538 and if a fasta_seq is given
541 if data_base
is None:
544 self.data_base = data_base
546 _CrossLinkDataBaseStandardKeys.__init__(self)
547 if converter
is not None:
548 self.cldbkc = converter
549 self.list_parser = self.cldbkc.rplp
550 self.converter = converter.get_converter()
554 self.list_parser =
None
555 self.converter =
None
558 self.def_aa_tuple = linkable_aa
559 self.fasta_seq = fasta_seq
566 Update the whole dataset after changes
568 self.update_cross_link_unique_sub_index()
569 self.update_cross_link_redundancy()
570 self.update_residues_links_number()
574 sorted_ids = sorted(self.data_base.keys())
576 for xl
in self.data_base[k]:
579 def xlid_iterator(self):
580 sorted_ids = sorted(self.data_base.keys())
581 for xlid
in sorted_ids:
584 def __getitem__(self, xlid):
585 return self.data_base[xlid]
588 return len([xl
for xl
in self])
593 def set_name(self, name):
595 for k
in self.data_base:
596 new_data_base[k +
"." + name] = self.data_base[k]
597 self.data_base = new_data_base
601 def get_number_of_xlid(self):
602 return len(self.data_base)
605 FixedFormatParser=
None, encoding=
None):
607 if FixedFormatParser is not specified, the file is
608 comma-separated-values
610 @param file_name a txt file to be parsed
611 @param converter an instance of CrossLinkDataBaseKeywordsConverter
612 @param FixedFormatParser a parser for a fixed format
613 @param encoding the encoding of the file, if not the locale
614 default (usually UTF-8)
616 if not FixedFormatParser:
617 xl_list = IMP.pmi.tools.get_db_from_csv(file_name, encoding)
619 if converter
is not None:
620 self.cldbkc = converter
621 self.list_parser = self.cldbkc.rplp
622 self.converter = converter.get_converter()
624 if not self.list_parser:
628 for nxl, xl
in enumerate(xl_list):
631 if k
in self.converter:
632 new_xl[self.converter[k]] = \
633 self.type[self.converter[k]](xl[k])
636 if self.unique_id_key
in self.cldbkc.get_setup_keys():
637 if new_xl[self.unique_id_key]
not in new_xl_dict:
638 new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
640 new_xl_dict[new_xl[self.unique_id_key]].append(
643 if str(nxl)
not in new_xl_dict:
644 new_xl_dict[str(nxl)] = [new_xl]
646 new_xl_dict[str(nxl)].append(new_xl)
652 for nxl, entry
in enumerate(xl_list):
656 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
658 "CrossLinkDataBase: expecting a site_pairs_key "
659 "for the site pair list parser")
661 if k
in self.converter:
662 new_dict[self.converter[k]] = \
663 self.type[self.converter[k]](entry[k])
665 new_dict[k] = entry[k]
668 chain_pair_list) = self.list_parser.get_list(
669 new_dict[self.site_pairs_key])
672 for n, p
in enumerate(residue_pair_list):
679 new_xl[k] = new_dict[k]
680 new_xl[self.residue1_key] = \
681 self.type[self.residue1_key](p[0])
683 new_xl[self.residue2_key] = \
684 self.type[self.residue2_key](p[1])
686 if len(chain_pair_list) == len(residue_pair_list):
687 new_xl[self.protein1_key] = \
688 self.type[self.protein1_key](
689 chain_pair_list[n][0])
691 new_xl[self.protein2_key] = \
692 self.type[self.protein2_key](
693 chain_pair_list[n][1])
696 new_xl[self.link_type_key] =
"CROSSLINK"
698 new_xl[self.link_type_key] =
"MONOLINK"
700 if self.unique_id_key
in self.cldbkc.get_setup_keys():
701 if new_xl[self.unique_id_key]
not in new_xl_dict:
702 new_xl_dict[new_xl[self.unique_id_key]] = \
705 new_xl_dict[new_xl[self.unique_id_key]].append(
708 if str(nxl)
not in new_xl_dict:
709 new_xl[self.unique_id_key] = str(nxl+1)
710 new_xl_dict[str(nxl)] = [new_xl]
712 new_xl[self.unique_id_key] = str(nxl+1)
713 new_xl_dict[str(nxl)].append(new_xl)
717 if FixedFormatParser is defined
722 with open(file_name,
"r", encoding=encoding) as f:
724 xl = FixedFormatParser.get_data(line)
726 xl[self.unique_id_key] = str(nxl+1)
727 new_xl_dict[str(nxl)] = [xl]
730 self.data_base = new_xl_dict
731 self.name = file_name
732 loc = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
733 self.dataset = ihm.dataset.CXMSDataset(loc)
736 def update_cross_link_unique_sub_index(self):
737 for k
in self.data_base:
738 for n, xl
in enumerate(self.data_base[k]):
739 xl[self.ambiguity_key] = len(self.data_base[k])
740 xl[self.unique_sub_index_key] = n+1
741 xl[self.unique_sub_id_key] = k +
"." + str(n+1)
743 def update_cross_link_redundancy(self):
744 redundancy_data_base = {}
746 pra = _ProteinsResiduesArray(xl)
747 if pra
not in redundancy_data_base:
748 redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
749 redundancy_data_base[pra.get_inverted()] = \
750 [xl[self.unique_sub_id_key]]
752 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
753 redundancy_data_base[pra.get_inverted()].append(
754 xl[self.unique_sub_id_key])
756 pra = _ProteinsResiduesArray(xl)
757 xl[self.redundancy_key] = len(redundancy_data_base[pra])
758 xl[self.redundancy_list_key] = redundancy_data_base[pra]
760 def update_residues_links_number(self):
763 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
764 if (p1, r1)
not in residue_links:
765 residue_links[(p1, r1)] = set([(p2, r2)])
767 residue_links[(p1, r1)].add((p2, r2))
768 if (p2, r2)
not in residue_links:
769 residue_links[(p2, r2)] = set([(p1, r1)])
771 residue_links[(p2, r2)].add((p1, r1))
774 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
775 xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
776 xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
779 """This function checks the consistency of the dataset with the
780 amino acid sequence"""
781 if self.cldbkc
and self.fasta_seq:
782 cnt_matched, cnt_matched_file = 0, 0
786 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
787 b_matched_file =
False
788 if self.residue1_amino_acid_key
in xl:
791 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
792 b_matched = self._match_xlinks(p1, r1, aa_from_file)
793 b_matched_file = b_matched
797 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
799 matched, non_matched = self._update_matched_xlinks(
800 b_matched, p1, r1, matched, non_matched)
802 if self.residue2_amino_acid_key
in xl:
803 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
804 b_matched = self._match_xlinks(p2, r2, aa_from_file)
805 b_matched_file = b_matched
807 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
809 matched, non_matched = self._update_matched_xlinks(
810 b_matched, p2, r2, matched, non_matched)
814 cnt_matched_file += 1
816 percentage_matched = round(100*cnt_matched/len(self), 1)
817 percentage_matched_file = round(
818 100 * cnt_matched_file / len(self), 1)
819 if matched
or non_matched:
821 "check_cross_link_consistency: Out of %d cross-links "
822 "%d were matched to the fasta sequence (%f %%).\n "
823 "%d were matched by using the cross-link file (%f %%)."
824 % (len(self), cnt_matched, percentage_matched,
825 cnt_matched_file, percentage_matched_file))
827 print(
"check_cross_link_consistency: Warning: Non "
829 [(prot_name, sorted(list(non_matched[prot_name])))
830 for prot_name
in non_matched])
831 return matched, non_matched
833 def _match_xlinks(self, prot_name, res_index, aa_tuple):
837 amino_dict = IMP.pmi.alphabets.amino_acid
839 for amino_acid
in aa_tuple:
840 if len(amino_acid) == 3:
841 amino_acid = amino_dict.get_one_letter_code_from_residue_type(
843 if prot_name
in self.fasta_seq.sequences:
844 seq = self.fasta_seq.sequences[prot_name]
852 if res_index == 0
or res_index == 1:
854 if res_index < len(seq):
855 if amino_acid == seq[res_index]:
859 def _update_matched_xlinks(self, b_matched, prot, res, matched,
863 matched[prot].add(res)
865 matched[prot] = set([res])
867 if prot
in non_matched:
868 non_matched[prot].add(res)
870 non_matched[prot] = set([res])
871 return matched, non_matched
873 def get_cross_link_string(self, xl):
875 for k
in self.ordered_key_list:
877 string += str(k) +
":" + str(xl[k]) +
"|"
882 if k
not in self.ordered_key_list:
883 string += str(k) +
":" + str(xl[k]) +
"|"
887 def get_short_cross_link_string(self, xl):
889 list_of_keys = [self.data_set_name_key,
890 self.unique_sub_id_key,
898 for k
in list_of_keys:
900 string += str(xl[k]) +
"|"
906 def filter(self, FilterOperator):
908 for id
in self.data_base.keys():
909 for xl
in self.data_base[id]:
910 if FilterOperator.evaluate(xl):
911 if id
not in new_xl_dict:
912 new_xl_dict[id] = [xl]
914 new_xl_dict[id].append(xl)
917 cdb.dataset = self.dataset
922 '''Get all cross-links with score greater than an input value'''
924 self.id_score_key, operator.gt, score)
925 return self.filter(FilterOperator)
927 def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
929 This function merges two cross-link datasets so that if two
930 conflicting cross-links have the same cross-link UniqueIDS,
931 the cross-links will be appended under the same UniqueID slots
932 with different SubIDs
934 raise NotImplementedError()
937 """Append cross-link dataset to this one."""
939 for k
in self.data_base:
940 new_data_base[k] = self.data_base[k]
941 for k
in db.data_base:
942 new_data_base[k] = db.data_base[k]
943 self.data_base = new_data_base
946 def __iadd__(self, db):
950 def set_value(self, key, new_value, filter_operator=None):
952 This function changes the value for a given key in the database
953 For instance one can change the name of a protein
954 @param key: the key in the database that must be changed
955 @param new_value: the new value of the key
956 @param filter_operator: optional FilterOperator to change the value to
957 a subset of the database
959 example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
960 FO(cldb.protein1_key, operator.eq, "AAA"))`
964 if filter_operator
is not None:
965 if filter_operator.evaluate(xl):
973 this function returns the list of values for a given key in the
974 database alphanumerically sorted
979 return sorted(list(values))
983 This function offset the residue indexes of a given protein by
985 @param protein_name: the protein name that need to be changed
986 @param offset: the offset value
990 if xl[self.protein1_key] == protein_name:
991 xl[self.residue1_key] = xl[self.residue1_key] + offset
992 if xl[self.protein2_key] == protein_name:
993 xl[self.residue2_key] = xl[self.residue2_key] + offset
998 This function creates a new keyword for the whole database and
999 set the values from and existing keyword (optional), otherwise the
1000 values are set to None
1001 @param keyword the new keyword name:
1002 @param values_from_keyword the keyword from which we are copying
1006 if values_from_keyword
is not None:
1007 xl[keyword] = xl[values_from_keyword]
1013 protein_to_rename=
"both"):
1015 This function renames all proteins contained in the input dictionary
1016 from the old names (keys) to the new name (values)
1017 @param old_to_new_names_dictionary dictionary for converting old
1019 @param protein_to_rename specify whether to rename both or protein1
1023 for old_name
in old_to_new_names_dictionary:
1024 new_name = old_to_new_names_dictionary[old_name]
1025 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
1027 self.
set_value(self.protein1_key, new_name, fo2)
1028 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
1030 self.
set_value(self.protein2_key, new_name, fo2)
1034 This function parses an alignment file obtained previously, to map
1035 crosslink residues onto the sequence of a homolog proteins. It is
1036 useful if you want to map the crosslinks onto a known, homolog
1037 structure without building a comparative model.
1038 The txt file of the alignment should be structured with repeating
1041 >NAME_CROSSLINKED_PROTEIN
1042 >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1043 one-letter code sequence for cross-linked protein (one line)
1044 one-letter code sequence for homolog protein with known structure (one line)
1047 f = open(alignfile,
'r')
1049 def _assign_residues(seq1, seq2):
1057 resinds1.append(nres)
1059 resinds1.append(
None)
1061 resinds2 = [
None for i
in resinds1]
1064 for pos, r2
in enumerate(seq2):
1067 resinds2[pos] = nres
1069 if contiguousgap > 1:
1070 for i
in range(pos-int(contiguousgap/2), pos):
1075 resinds2[pos] = lastresind
1077 for n, r
in enumerate(resinds1):
1085 with open(alignfile,
'r') as f:
1091 protname = aa.replace(
">",
"").replace(
"\n",
"")
1092 seq1 = cc.replace(
",",
"").replace(
"\n",
"")
1093 seq2 = dd.replace(
",",
"").replace(
"\n",
"")
1099 for i
in range(len(seq1)):
1111 tmpl1.append([i, c1, seq1[i]])
1112 tmpl2.append([i, c2, seq2[i]])
1113 sequences1[protname] = tmpl1
1114 sequences2[protname] = tmpl2
1116 d = _assign_residues(seq1, seq2)
1122 if fo11.evaluate(xl):
1123 t = [item
for item
in sequences1[protname]
1124 if item[1] == xl[self.residue1_key]][0]
1125 sequences1[protname][t[0]] = [
1126 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1127 t = [item
for item
in sequences2[protname]
1128 if item[1] == d[xl[self.residue1_key]]][0]
1129 sequences2[protname][t[0]] = [
1130 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1132 xl[self.residue1_key] = d[xl[self.residue1_key]]
1134 if fo12.evaluate(xl):
1136 t = [item
for item
in sequences1[protname]
1137 if item[1] == xl[self.residue2_key]][0]
1138 sequences1[protname][t[0]] = [
1139 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1140 t = [item
for item
in sequences2[protname]
1141 if item[1] == d[xl[self.residue2_key]]][0]
1142 sequences2[protname][t[0]] = [
1143 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1145 xl[self.residue2_key] = d[xl[self.residue2_key]]
1148 print(aa.replace(
"\n",
""))
1149 print(
"".join([seq[2]
for seq
in sequences1[protname]]))
1150 print(bb.replace(
"\n",
""))
1151 print(
"".join([seq[2]
for seq
in sequences2[protname]]))
1155 This function creates as many classes as in the input
1156 (number_of_classes: integer) and partition cross-links according
1157 to their identification scores. Classes are defined in the psi key.
1160 if self.id_score_key
is not None:
1164 'The cross-link database does not contain score values')
1165 minscore = min(scores)
1166 maxscore = max(scores)
1167 scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1168 if self.psi_key
is None:
1171 score = xl[self.id_score_key]
1172 for n, classmin
in enumerate(scoreclasses[0:-1]):
1173 if score >= classmin
and score <= scoreclasses[n+1]:
1174 xl[self.psi_key] = str(
"CLASS_"+str(n))
1177 def clone_protein(self, protein_name, new_protein_name):
1178 for id
in self.data_base.keys():
1180 for xl
in self.data_base[id]:
1181 new_data_base.append(xl)
1182 if xl[self.protein1_key] == protein_name \
1183 and xl[self.protein2_key] != protein_name:
1185 new_xl[self.protein1_key] = new_protein_name
1186 new_data_base.append(new_xl)
1187 elif xl[self.protein1_key] != protein_name \
1188 and xl[self.protein2_key] == protein_name:
1190 new_xl[self.protein2_key] = new_protein_name
1191 new_data_base.append(new_xl)
1192 elif xl[self.protein1_key] == protein_name \
1193 and xl[self.protein2_key] == protein_name:
1195 new_xl[self.protein1_key] = new_protein_name
1196 new_data_base.append(new_xl)
1198 new_xl[self.protein2_key] = new_protein_name
1199 new_data_base.append(new_xl)
1201 new_xl[self.protein1_key] = new_protein_name
1202 new_xl[self.protein2_key] = new_protein_name
1203 new_data_base.append(new_xl)
1204 self.data_base[id] = new_data_base
1209 This function remove cross-links applied to the same residue
1210 (ie, same chain name and residue number)
1212 for id
in self.data_base.keys():
1214 for xl
in self.data_base[id]:
1215 if xl[self.protein1_key] == xl[self.protein2_key] \
1216 and xl[self.residue1_key] == xl[self.residue2_key]:
1219 new_data_base.append(xl)
1220 self.data_base[id] = new_data_base
1225 this method returns a CrossLinkDataBase class containing
1226 a random subsample of the original cross-link database.
1227 @param percentage float between 0 and 1, is the percentage of
1228 of spectra taken from the original list
1231 if percentage > 1.0
or percentage < 0.0:
1232 raise ValueError(
'the percentage of random cross-link spectra '
1233 'should be between 0 and 1')
1234 nspectra = self.get_number_of_xlid()
1235 nrandom_spectra = int(nspectra*percentage)
1236 random_keys = random.sample(sorted(self.data_base.keys()),
1239 for k
in random_keys:
1240 new_data_base[k] = self.data_base[k]
1245 sorted_ids = sorted(self.data_base.keys())
1247 for id
in sorted_ids:
1249 for xl
in self.data_base[id]:
1250 for k
in self.ordered_key_list:
1252 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1257 if k
not in self.ordered_key_list:
1258 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1259 outstr +=
"-------------\n"
1262 def plot(self, filename, **kwargs):
1264 matplotlib.use(
'Agg')
1265 import matplotlib.pyplot
as plt
1266 import matplotlib.colors
1268 if kwargs[
"type"] ==
"scatter":
1269 cmap = plt.get_cmap(
"rainbow")
1270 _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1271 xkey = kwargs[
"xkey"]
1272 ykey = kwargs[
"ykey"]
1273 if "colorkey" in kwargs:
1274 colorkey = kwargs[
"colorkey"]
1275 if "logyscale" in kwargs:
1276 logyscale = kwargs[
"logyscale"]
1284 xs.append(float(xl[xkey]))
1286 ys.append(math.log10(float(xl[ykey])))
1288 ys.append(float(xl[ykey]))
1289 colors.append(float(xl[colorkey]))
1291 print(
"CrossLinkDataBase.plot: Value error for "
1292 "cross-link %s" % (xl[self.unique_id_key]))
1296 for color
in colors:
1297 cindex = (color-min(colors))/(max(colors)-min(colors))
1298 cs.append(cmap(cindex))
1301 ax = fig.add_subplot(111)
1302 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker=
"o")
1305 plt.savefig(filename)
1309 if kwargs[
"type"] ==
"residue_links":
1313 molecule = kwargs[
"molecule"]
1315 length = len(molecule.sequence)
1316 molecule = molecule.get_name()
1319 sequences_object = kwargs[
"sequences_object"]
1320 sequence = sequences_object.sequences[molecule]
1321 length = len(sequence)
1323 histogram = [0]*length
1325 if xl[self.protein1_key] == molecule:
1326 histogram[xl[self.residue1_key]-1] = \
1327 xl[self.residue1_links_number_key]
1328 if xl[self.protein2_key] == molecule:
1329 histogram[xl[self.residue2_key]-1] = \
1330 xl[self.residue2_links_number_key]
1331 plt.bar(range(1, length+1), histogram)
1332 plt.savefig(filename)
1336 if kwargs[
"type"] ==
"histogram":
1337 valuekey = kwargs[
"valuekey"]
1338 valuename = valuekey
1339 bins = kwargs[
"bins"]
1343 values_list.append(float(xl[valuekey]))
1345 print(
"CrossLinkDataBase.plot: Value error for cross-link "
1346 "%s" % (xl[self.unique_id_key]))
1349 filename, [values_list], valuename=valuename, bins=bins,
1350 colors=
None, format=
"pdf",
1351 reference_xline=
None, yplotrange=
None,
1352 xplotrange=
None, normalized=
True,
1355 def dump(self, json_filename):
1357 with open(json_filename,
'w')
as fp:
1358 json.dump(self.data_base, fp, sort_keys=
True, indent=2,
1359 default=set_json_default)
1361 def load(self, json_filename):
1363 with open(json_filename,
'r') as fp:
1364 self.data_base = json.load(fp)
1367 def save_csv(self, filename):
1372 sorted_group_ids = sorted(self.data_base.keys())
1373 for group
in sorted_group_ids:
1375 for xl
in self.data_base[group]:
1377 sorted_ids = sorted(xl.keys())
1378 values = [xl[k]
for k
in sorted_ids]
1379 group_block.append(values)
1382 with open(filename,
'w')
as fp:
1383 a = csv.writer(fp, delimiter=
',')
1384 a.writerow(sorted_ids)
1389 Returns the number of non redundant cross-link sites
1393 pra = _ProteinsResiduesArray(xl)
1394 prai = pra.get_inverted()
1397 return len(list(praset))
1401 """This class allows to compute and plot the distance between datasets"""
1404 """Input a dictionary where keys are cldb names and values are cldbs"""
1405 import scipy.spatial.distance
1406 self.dbs = cldb_dictionary
1407 self.keylist = list(self.dbs.keys())
1408 self.distances = list()
1410 for i, key1
in enumerate(self.keylist):
1411 for key2
in self.keylist[i+1:]:
1412 distance = self.get_distance(key1, key2)
1413 self.distances.append(distance)
1415 self.distances = scipy.spatial.distance.squareform(self.distances)
1418 return 1.0 - self.
jaccard_index(self.dbs[key1], self.dbs[key2])
1421 """Similarity index between two datasets
1422 https://en.wikipedia.org/wiki/Jaccard_index"""
1426 for xl1
in CrossLinkDataBase1:
1427 a1f = _ProteinsResiduesArray(xl1)
1428 a1b = a1f.get_inverted()
1431 for xl2
in CrossLinkDataBase2:
1432 a2f = _ProteinsResiduesArray(xl2)
1433 a2b = a2f.get_inverted()
1436 return (float(len(set1 & set2)/2)
1437 / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1439 def plot_matrix(self, figurename="clustermatrix.pdf"):
1440 import matplotlib
as mpl
1443 import matplotlib.pylab
as pl
1444 from scipy.cluster
import hierarchy
as hrc
1446 raw_distance_matrix = self.distances
1447 labels = self.keylist
1451 ax = fig.add_subplot(1, 1, 1)
1452 dendrogram = hrc.dendrogram(
1453 hrc.linkage(raw_distance_matrix),
1456 leaves_order = dendrogram[
'leaves']
1457 ax.set_xlabel(
'Dataset')
1458 ax.set_ylabel(
'Jaccard Distance')
1460 pl.savefig(
"dendrogram."+figurename, dpi=300)
1465 ax = fig.add_subplot(1, 1, 1)
1466 cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1467 interpolation=
'nearest')
1468 cb = fig.colorbar(cax)
1469 cb.set_label(
'Jaccard Distance')
1470 ax.set_xlabel(
'Dataset')
1471 ax.set_ylabel(
'Dataset')
1472 ax.set_xticks(range(len(labels)))
1473 ax.set_xticklabels(numpy.array(labels)[leaves_order],
1474 rotation=
'vertical')
1475 ax.set_yticks(range(len(labels)))
1476 ax.set_yticklabels(numpy.array(labels)[leaves_order],
1477 rotation=
'horizontal')
1479 pl.savefig(
"matrix." + figurename, dpi=300)
1485 This class maps a CrossLinkDataBase on a given structure
1486 and save an rmf file with color-coded cross-links
1489 def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1490 self.CrossLinkDataBase = CrossLinkDataBase
1493 self.prots = rmf_or_stat_handler
1494 self.distances = defaultdict(list)
1495 self.array_to_id = {}
1496 self.id_to_array = {}
1498 print(
"computing distances for all cross-links and all structures")
1499 for i
in self.prots[::10]:
1500 self.compute_distances()
1501 for key, xl
in enumerate(self.CrossLinkDataBase):
1502 array = _ProteinsResiduesArray(xl)
1503 self.array_to_id[array] = key
1504 self.id_to_array[key] = array
1505 if xl[
"MinAmbiguousDistance"] !=
'None':
1506 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1508 def compute_distances(self):
1511 sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1512 for group
in sorted_group_ids:
1514 for xl
in self.CrossLinkDataBase.data_base[group]:
1516 sorted_ids = sorted(xl.keys())
1519 + [
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1520 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1522 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1523 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1533 values = [xl[k]
for k
in sorted_ids]
1534 values += [group, mdist]
1535 except KeyError
as e:
1536 print(
"MapCrossLinkDataBaseOnStructure "
1537 "KeyError: {0} in {1}".format(e, xl))
1538 group_dists.append(mdist)
1539 xl[
"Distance"] = mdist
1540 xl[
"State1"] = state1
1542 xl[
"State2"] = state2
1545 for xl
in self.CrossLinkDataBase.data_base[group]:
1546 xl[
"MinAmbiguousDistance"] = min(group_dists)
1548 def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1549 '''more robust and slower version of above'''
1552 selpart_1 = sel.get_selected_particles()
1553 if len(selpart_1) == 0:
1554 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1555 "selected for first site")
1559 selpart_2 = sel.get_selected_particles()
1560 if len(selpart_2) == 0:
1561 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1562 "selected for second site")
1565 for p1
in selpart_1:
1566 for p2
in selpart_2:
1567 if p1 == p2
and r1 == r2:
1575 h1 = h1.get_parent()
1578 h1 = h1.get_parent()
1582 h2 = h2.get_parent()
1585 h2 = h2.get_parent()
1588 results.append((dist, state_index1, copy_index1,
1589 state_index2, copy_index2, p1, p2))
1590 if len(results) == 0:
1592 results_sorted = sorted(results,
1593 key=operator.itemgetter(0, 1, 2, 3, 4))
1594 return ((results_sorted[0][0],
1595 results_sorted[0][5],
1596 results_sorted[0][6]),
1597 (results_sorted[0][1],
1598 results_sorted[0][2],
1599 results_sorted[0][3],
1600 results_sorted[0][4]))
1602 def save_rmf_snapshot(self, filename, color_id=None):
1603 sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1606 for group
in sorted_group_ids:
1607 group_dists_particles = []
1608 for xl
in self.CrossLinkDataBase.data_base[group]:
1610 self.CrossLinkDataBase.get_short_cross_link_string(xl)
1611 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1613 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1614 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1616 print(
"TypeError or missing chain/residue ",
1619 if color_id
is not None:
1620 group_dists_particles.append((mdist, p1, p2, xllabel,
1621 float(xl[color_id])))
1623 group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1624 if group_dists_particles:
1625 (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1626 min(group_dists_particles, key=
lambda t: t[0])
1627 color_scores.append(mincolor_score)
1628 list_of_pairs.append((minp1, minp2, minxllabel,
1634 linear.set_slope(1.0)
1637 'linear_dummy_restraints')
1639 offset = min(color_scores)
1640 maxvalue = max(color_scores)
1641 for pair
in list_of_pairs:
1644 pr.set_name(pair[2])
1645 if offset < maxvalue:
1646 factor = (pair[3]-offset)/(maxvalue-offset)
1653 rslin.add_restraint(pr)
1656 rh = RMF.create_rmf_file(filename)
1663 def boxplot_crosslink_distances(self, filename):
1665 keys = [self.id_to_array[k]
for k
in self.distances.keys()]
1666 medians = [numpy.median(self.distances[self.array_to_id[k]])
1668 dists = [self.distances[self.array_to_id[k]]
for k
in keys]
1669 distances_sorted_by_median = \
1670 [x
for _, x
in sorted(zip(medians, dists))]
1671 keys_sorted_by_median = [x
for _, x
in sorted(zip(medians, keys))]
1673 filename, distances_sorted_by_median,
1674 range(len(distances_sorted_by_median)),
1675 xlabels=keys_sorted_by_median,
1676 scale_plot_length=0.2)
1678 def get_crosslink_violations(self, threshold):
1679 violations = defaultdict(list)
1680 for k
in self.distances:
1682 viols = self.distances[k]
1683 violations[self.id_to_array[k]] = viols
1689 This class generates a CrossLinkDataBase from a given structure
1691 def __init__(self, system, residue_types_1=["K"],
1692 residue_types_2=[
"K"], reactivity_range=[0, 1], kt=1.0):
1696 cldbkc.set_protein1_key(
"Protein1")
1697 cldbkc.set_protein2_key(
"Protein2")
1698 cldbkc.set_residue1_key(
"Residue1")
1699 cldbkc.set_residue2_key(
"Residue2")
1701 self.system = system
1702 self.model = self.system.model
1703 self.residue_types_1 = residue_types_1
1704 self.residue_types_2 = residue_types_2
1706 self.indexes_dict1 = {}
1707 self.indexes_dict2 = {}
1708 self.protein_residue_dict = {}
1709 self.reactivity_dictionary = {}
1710 self.euclidean_interacting_pairs =
None
1711 self.xwalk_interacting_pairs =
None
1713 for state
in self.system.get_states():
1714 for moleculename, molecules
in state.get_molecules().items():
1715 for molecule
in molecules:
1718 seq = molecule.sequence
1719 residues = [i
for i
in range(1, len(seq)+1)
1720 if ((seq[i-1]
in self.residue_types_1)
1721 or (seq[i-1]
in self.residue_types_2))]
1727 rexp = numpy.random.exponential(0.00000001)
1728 prob = 1.0-math.exp(-rexp)
1729 self.reactivity_dictionary[(molecule, r)] = prob
1731 residues1 = [i
for i
in range(1, len(seq)+1)
1732 if seq[i-1]
in self.residue_types_1]
1733 residues2 = [i
for i
in range(1, len(seq)+1)
1734 if seq[i-1]
in self.residue_types_2]
1737 molecule.hier, residue_index=r, resolution=1)
1738 ps = s.get_selected_particles()
1740 index = p.get_index()
1741 self.indexes_dict1[index] = (molecule, r)
1742 self.protein_residue_dict[(molecule, r)] = index
1745 molecule.hier, residue_index=r, resolution=1)
1746 ps = s.get_selected_particles()
1748 index = p.get_index()
1749 self.indexes_dict2[index] = (molecule, r)
1750 self.protein_residue_dict[(molecule, r)] = index
1752 def get_all_possible_pairs(self):
1753 n = float(len(self.protein_residue_dict.keys()))
1754 return n*(n-1.0)/2.0
1756 def get_all_feasible_pairs(self, distance=21):
1758 particle_index_pairs = []
1760 for a, b
in itertools.combinations(
1761 self.protein_residue_dict.keys(), 2):
1763 index1 = self.protein_residue_dict[a]
1764 index2 = self.protein_residue_dict[b]
1768 if particle_distance <= distance:
1769 particle_index_pairs.append((index1, index2))
1770 new_xl[self.cldb.protein1_key] = a[0].get_name()
1771 new_xl[self.cldb.protein2_key] = b[0].get_name()
1772 new_xl[
"molecule_object1"] = a[0]
1773 new_xl[
"molecule_object2"] = b[0]
1774 new_xl[self.cldb.residue1_key] = a[1]
1775 new_xl[self.cldb.residue2_key] = b[1]
1776 self.cldb.data_base[str(nxl)] = [new_xl]
1781 def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1782 noise=0.01, distance=21, max_delta_distance=10.0,
1783 xwalk_bin_path=
None, confidence_false=0.75,
1784 confidence_true=0.75):
1786 from random
import random
1788 number_of_spectra = 1
1790 self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1791 self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1792 self.cldb.data_base[str(number_of_spectra)] = []
1793 self.sites_weighted =
None
1795 while number_of_spectra < total_number_of_spectra:
1796 if (random() > ambiguity_probability
1797 and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1799 number_of_spectra += 1
1800 self.cldb.data_base[str(number_of_spectra)] = []
1802 if random() > noise:
1804 pra, dist = self.get_random_residue_pair(
1805 distance, xwalk_bin_path, max_delta_distance)
1808 pra, dist = self.get_random_residue_pair(
None,
None,
None)
1812 new_xl[self.cldb.protein1_key] = pra[0].get_name()
1813 new_xl[self.cldb.protein2_key] = pra[1].get_name()
1814 new_xl[
"molecule_object1"] = pra[0]
1815 new_xl[
"molecule_object2"] = pra[1]
1816 new_xl[self.cldb.residue1_key] = pra[2]
1817 new_xl[self.cldb.residue2_key] = pra[3]
1818 new_xl[
"Noisy"] = noisy
1820 new_xl[
"Reactivity_Residue1"] = \
1821 self.reactivity_dictionary[(pra[0], pra[2])]
1822 new_xl[
"Reactivity_Residue2"] = \
1823 self.reactivity_dictionary[(pra[1], pra[3])]
1825 new_xl[
"Score"] = np.random.beta(1.0, self.beta_false)
1827 new_xl[
"Score"] = 1.0-np.random.beta(1.0, self.beta_true)
1828 new_xl[
"TargetDistance"] = dist
1829 new_xl[
"NoiseProbability"] = noise
1830 new_xl[
"AmbiguityProbability"] = ambiguity_probability
1833 self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1834 self.protein_residue_dict[(pra[1], pra[3])]])
1839 new_xl[
"InterRigidBody"] =
False
1844 new_xl[
"InterRigidBody"] =
True
1846 new_xl[
"InterRigidBody"] =
None
1848 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1852 def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1853 max_delta_distance=
None):
1856 from random
import choice
1857 if distance
is None:
1860 protein1, residue1 = choice(self.protein_residue_dict.keys())
1861 protein2, residue2 = choice(self.protein_residue_dict.keys())
1862 index1 = self.protein_residue_dict[(protein1, residue1)]
1863 index2 = self.protein_residue_dict[(protein2, residue2)]
1867 if (protein1, residue1) != (protein2, residue2):
1871 if not xwalk_bin_path:
1873 gcpf.set_distance(distance+max_delta_distance)
1877 if not self.sites_weighted:
1878 self.sites_weighted = []
1879 for key
in self.reactivity_dictionary:
1880 r = self.reactivity_dictionary[key]
1881 self.sites_weighted.append((key, r))
1883 first_site = self.weighted_choice(self.sites_weighted)
1885 if not self.euclidean_interacting_pairs:
1886 self.euclidean_interacting_pairs = \
1887 gcpf.get_close_pairs(
1889 self.indexes_dict1.keys(),
1890 self.indexes_dict2.keys())
1892 first_site_pairs = \
1893 [pair
for pair
in self.euclidean_interacting_pairs
1894 if self.indexes_dict1[pair[0]] == first_site
or
1895 self.indexes_dict2[pair[1]] == first_site]
1896 if len(first_site_pairs) == 0:
1899 second_sites_weighted = []
1900 for pair
in first_site_pairs:
1901 if self.indexes_dict1[pair[0]] == first_site:
1902 second_site = self.indexes_dict2[pair[1]]
1903 if self.indexes_dict2[pair[1]] == first_site:
1904 second_site = self.indexes_dict1[pair[0]]
1905 r = self.reactivity_dictionary[second_site]
1906 second_sites_weighted.append((second_site, r))
1907 second_site = self.weighted_choice(second_sites_weighted)
1908 protein1, residue1 = first_site
1909 protein2, residue2 = second_site
1910 print(
"CrossLinkDataBaseFromStructure."
1911 "get_random_residue_pair:",
1912 "First site", first_site,
1913 self.reactivity_dictionary[first_site],
1914 "Second site", second_site,
1915 self.reactivity_dictionary[second_site])
1918 [self.protein_residue_dict[first_site],
1919 self.protein_residue_dict[second_site]])
1924 if particle_distance < distance \
1925 and (protein1, residue1) != (protein2, residue2):
1927 elif (particle_distance >= distance
1928 and (protein1, residue1) != (protein2, residue2)
1929 and max_delta_distance):
1931 if particle_distance-distance < max_delta_distance:
1935 if not self.xwalk_interacting_pairs:
1936 self.xwalk_interacting_pairs = \
1937 self.get_xwalk_distances(xwalk_bin_path, distance)
1938 interacting_pairs_weighted = []
1940 for pair
in self.xwalk_interacting_pairs:
1946 -self.reactivity_dictionary[(protein1, residue1)]
1949 -self.reactivity_dictionary[(protein2, residue2)]
1951 interacting_pairs_weighted.append((pair, weight1*weight2))
1953 pair = self.weighted_choice(interacting_pairs_weighted)
1958 particle_distance = float(pair[4])
1960 return ((protein1, protein2, residue1, residue2)), particle_distance
1962 def get_xwalk_distances(self, xwalk_bin_path, distance):
1966 o.init_pdb(
"xwalk.pdb", self.representation.prot)
1967 o.write_pdb(
"xwalk.pdb")
1968 namechainiddict = o.dictchain[
"xwalk.pdb"]
1971 for key
in namechainiddict:
1972 chainiddict[namechainiddict[key]] = key
1973 xwalkout = os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path
1974 +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
1975 '-a1 cb -a2 cb -max '
1976 + str(distance) +
' -bb').read()
1978 output_list_of_distance = []
1979 for line
in xwalkout.split(
"\n")[0:-2]:
1980 tokens = line.split()
1983 distance = float(tokens[6])
1984 fs = first.split(
"-")
1985 ss = second.split(
"-")
1988 protein1 = chainiddict[chainid1]
1989 protein2 = chainiddict[chainid2]
1990 residue1 = int(fs[1])
1991 residue2 = int(ss[1])
1992 output_list_of_distance.append(
1993 (protein1, protein2, residue1, residue2, distance))
1994 return output_list_of_distance
1996 def weighted_choice(self, choices):
1999 total = sum(w
for c, w
in choices)
2000 r = random.uniform(0, total)
2002 for c, w
in choices:
2006 assert False,
"Shouldn't get here"
2008 def save_rmf_snapshot(self, filename, color_id=None):
2011 if color_id
is None:
2012 color_id =
"Reactivity"
2013 sorted_group_ids = sorted(self.cldb.data_base.keys())
2016 for group
in sorted_group_ids:
2017 group_dists_particles = []
2018 for xl
in self.cldb.data_base[group]:
2019 xllabel = self.cldb.get_short_cross_link_string(xl)
2020 (c1, c2, r1, r2) = (xl[
"molecule_object1"],
2021 xl[
"molecule_object2"],
2022 xl[self.cldb.residue1_key],
2023 xl[self.cldb.residue2_key])
2025 index1 = self.protein_residue_dict[(c1, r1)]
2026 index2 = self.protein_residue_dict[(c2, r2)]
2029 mdist = xl[
"TargetDistance"]
2031 print(
"TypeError or missing chain/residue ",
2034 group_dists_particles.append((mdist, p1, p2, xllabel,
2035 float(xl[color_id])))
2036 if group_dists_particles:
2037 (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2038 = min(group_dists_particles, key=
lambda t: t[0])
2039 color_scores.append(mincolor_score)
2040 list_of_pairs.append((minp1, minp2, minxllabel,
2047 linear.set_slope(1.0)
2051 offset = min(color_scores)
2052 maxvalue = max(color_scores)
2053 for pair
in list_of_pairs:
2055 pr.set_name(pair[2])
2056 factor = (pair[3]-offset)/(maxvalue-offset)
2061 rslin.add_restraint(pr)
2064 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
def get_number_of_unique_crosslinked_sites
Returns the number of non redundant cross-link sites.
def filter_score
Get all cross-links with score greater than an input value.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
Get the particles from a list of indexes.
def create_new_keyword
This function creates a new keyword for the whole database and set the values from and existing keywo...
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
def merge
This function merges two cross-link datasets so that if two conflicting cross-links have the same cro...
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded cross-...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def set_value
This function changes the value for a given key in the database For instance one can change the name ...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
def filter_out_same_residues
This function remove cross-links applied to the same residue (ie, same chain name and residue number)...
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def offset_residue_index
This function offset the residue indexes of a given protein by a specified value. ...
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
def jackknife
this method returns a CrossLinkDataBase class containing a random subsample of the original cross-lin...
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def check_cross_link_consistency
This function checks the consistency of the dataset with the amino acid sequence. ...
Find all nearby pairs by testing all pairs.
Score a pair of particles based on the distance between their centers.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
def get_values
this function returns the list of values for a given key in the database alphanumerically sorted ...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Associate an integer "state" index with a hierarchy node.
def append_database
Append cross-link dataset to this one.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
Mapping between FASTA one-letter codes and residue types.
A class to handle different styles of site pairs parsers.
def classify_crosslinks_by_score
This function creates as many classes as in the input (number_of_classes: integer) and partition cros...
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
this class handles a cross-link dataset and do filtering operations, adding cross-links, merge datasets...
Support for the RMF file format for storing hierarchical molecular data and markup.
def rename_proteins
This function renames all proteins contained in the input dictionary from the old names (keys) to the...
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.
def align_sequence
This function parses an alignment file obtained previously, to map crosslink residues onto the sequen...
def create_set_from_file
if FixedFormatParser is not specified, the file is comma-separated-values