1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
8 from __future__
import print_function
24 from collections
import defaultdict
29 def set_json_default(obj):
30 if isinstance(obj, set):
38 if sys.version_info[0] >= 3:
39 def _handle_string_input(inp):
40 if not isinstance(inp, str):
41 raise TypeError(
"expecting a string")
44 def _handle_string_input(inp):
45 if not isinstance(inp, (str, unicode)):
46 raise TypeError(
"expecting a string or unicode")
48 if isinstance(inp, unicode):
54 class _CrossLinkDataBaseStandardKeys(object):
56 This class setup all the standard keys needed to
57 identify the cross-link features from the data sets
61 self.protein1_key =
"Protein1"
62 self.type[self.protein1_key] = str
63 self.protein2_key =
"Protein2"
64 self.type[self.protein2_key] = str
65 self.residue1_key =
"Residue1"
66 self.type[self.residue1_key] = int
67 self.residue2_key =
"Residue2"
68 self.type[self.residue2_key] = int
69 self.residue1_amino_acid_key =
"Residue1AminoAcid"
70 self.type[self.residue1_amino_acid_key] = str
71 self.residue2_amino_acid_key =
"Residue2AminoAcid"
72 self.type[self.residue2_amino_acid_key] = str
73 self.residue1_moiety_key =
"Residue1Moiety"
74 self.type[self.residue1_moiety_key] = str
75 self.residue2_moiety_key =
"Residue2Moiety"
76 self.type[self.residue2_moiety_key] = str
77 self.site_pairs_key =
"SitePairs"
78 self.type[self.site_pairs_key] = str
79 self.unique_id_key =
"XLUniqueID"
80 self.type[self.unique_id_key] = str
81 self.unique_sub_index_key =
"XLUniqueSubIndex"
82 self.type[self.unique_sub_index_key] = int
83 self.unique_sub_id_key =
"XLUniqueSubID"
84 self.type[self.unique_sub_id_key] = str
85 self.data_set_name_key =
"DataSetName"
86 self.type[self.data_set_name_key] = str
87 self.cross_linker_chemical_key =
"CrossLinkerChemical"
88 self.type[self.cross_linker_chemical_key] = str
89 self.id_score_key =
"IDScore"
90 self.type[self.id_score_key] = float
92 self.type[self.fdr_key] = float
93 self.quantitation_key =
"Quantitation"
94 self.type[self.quantitation_key] = float
95 self.redundancy_key =
"Redundancy"
96 self.type[self.redundancy_key] = int
97 self.redundancy_list_key =
"RedundancyList"
98 self.type[self.redundancy_key] = list
99 self.ambiguity_key =
"Ambiguity"
100 self.type[self.ambiguity_key] = int
101 self.residue1_links_number_key =
"Residue1LinksNumber"
102 self.type[self.residue1_links_number_key] = int
103 self.residue2_links_number_key =
"Residue2LinksNumber"
104 self.type[self.residue2_links_number_key] = int
105 self.type[self.ambiguity_key] = int
106 self.state_key =
"State"
107 self.type[self.state_key] = int
108 self.sigma1_key =
"Sigma1"
109 self.type[self.sigma1_key] = str
110 self.sigma2_key =
"Sigma2"
111 self.type[self.sigma2_key] = str
113 self.type[self.psi_key] = str
114 self.distance_key =
"Distance"
115 self.type[self.distance_key] = float
116 self.min_ambiguous_distance_key =
"MinAmbiguousDistance"
117 self.type[self.distance_key] = float
119 self.link_type_key =
"LinkType"
120 self.type[self.link_type_key] = str
122 self.ordered_key_list = [
123 self.data_set_name_key, self.unique_id_key,
124 self.unique_sub_index_key, self.unique_sub_id_key,
125 self.protein1_key, self.protein2_key, self.residue1_key,
126 self.residue2_key, self.residue1_amino_acid_key,
127 self.residue2_amino_acid_key, self.residue1_moiety_key,
128 self.residue2_moiety_key, self.cross_linker_chemical_key,
129 self.id_score_key, self.fdr_key, self.quantitation_key,
130 self.redundancy_key, self.redundancy_list_key, self.state_key,
131 self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
132 self.min_ambiguous_distance_key, self.link_type_key]
135 class _ProteinsResiduesArray(tuple):
137 This class is inherits from tuple, and it is a shorthand for a cross-link
138 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1
139 and protein2, r1 and r2 are residue1 and residue2.
142 def __new__(self, input_data):
144 @input_data can be a dict or a tuple
146 self.cldbsk = _CrossLinkDataBaseStandardKeys()
147 if type(input_data)
is dict:
149 p1 = input_data[self.cldbsk.protein1_key]
151 p2 = input_data[self.cldbsk.protein2_key]
154 r1 = input_data[self.cldbsk.residue1_key]
156 r2 = input_data[self.cldbsk.residue2_key]
162 t = (p1,
"", r1,
None)
163 elif type(input_data)
is tuple:
164 if len(input_data) > 4
or len(input_data) == 3 \
165 or len(input_data) == 1:
167 "_ProteinsResiduesArray: must have only 4 elements")
168 if len(input_data) == 4:
169 p1 = _handle_string_input(input_data[0])
170 p2 = _handle_string_input(input_data[1])
173 if (type(r1)
is not int)
and (r1
is not None):
175 "_ProteinsResiduesArray: residue1 must be a integer")
176 if (type(r2)
is not int)
and (r1
is not None):
178 "_ProteinsResiduesArray: residue2 must be a integer")
180 if len(input_data) == 2:
181 p1 = _handle_string_input(input_data[0])
183 if type(r1)
is not int:
185 "_ProteinsResiduesArray: residue1 must be a integer")
186 t = (p1,
"", r1,
None)
189 "_ProteinsResiduesArray: input must be a dict or tuple")
190 return tuple.__new__(_ProteinsResiduesArray, t)
192 def get_inverted(self):
194 Returns a _ProteinsResiduesArray instance with protein1 and
197 return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
200 outstr = self.cldbsk.protein1_key +
" " + str(self[0])
201 outstr +=
" " + self.cldbsk.protein2_key +
" " + str(self[1])
202 outstr +=
" " + self.cldbsk.residue1_key +
" " + str(self[2])
203 outstr +=
" " + self.cldbsk.residue2_key +
" " + str(self[3])
207 outstr = str(self[0]) +
"." + str(self[2]) +
"-" + str(self[1]) \
214 This class allows to create filter functions that can be passed to
215 the CrossLinkDataBase in this way:
217 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
219 where cldb is CrossLinkDataBase instance and it is only needed to get
220 the standard keywords
222 A filter operator can be evaluate on a CrossLinkDataBase item xl and
227 and it is used to filter the database
230 def __init__(self, argument1, operator, argument2):
232 (argument1,operator,argument2) can be either a
233 (keyword,operator.eq|lt|gt...,value)
234 or (FilterOperator1,operator.or|and...,FilterOperator2)
236 if isinstance(argument1, FilterOperator):
237 self.operations = [argument1, operator, argument2]
240 self.values = (argument1, operator, argument2)
242 def __or__(self, FilterOperator2):
245 def __and__(self, FilterOperator2):
248 def __invert__(self):
251 def evaluate(self, xl_item):
253 if len(self.operations) == 0:
254 keyword, operator, value = self.values
255 return operator(xl_item[keyword], value)
256 FilterOperator1, op, FilterOperator2 = self.operations
258 if FilterOperator2
is None:
259 return op(FilterOperator1.evaluate(xl_item))
261 return op(FilterOperator1.evaluate(xl_item),
262 FilterOperator2.evaluate(xl_item))
267 This class is needed to convert the keywords from a generic database
273 @param list_parser an instance of ResiduePairListParser, if any
277 self.backward_converter = {}
278 _CrossLinkDataBaseStandardKeys.__init__(self)
279 self.rplp = list_parser
280 if self.rplp
is None:
282 self.compulsory_keys = set([self.protein1_key,
287 self.compulsory_keys = self.rplp.get_compulsory_keys()
288 self.setup_keys = set()
292 Is a function that check whether necessary keys are setup
295 if self.compulsory_keys & setup_keys != self.compulsory_keys:
296 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup "
297 "all necessary keys")
301 Returns the keys that have been setup so far
303 return self.backward_converter.keys()
307 This sets up the standard compulsory keys as defined by
308 _CrossLinkDataBaseStandardKeys
310 for ck
in self.compulsory_keys:
311 self.converter[ck] = ck
312 self.backward_converter[ck] = ck
314 def set_unique_id_key(self, origin_key):
315 self.converter[origin_key] = self.unique_id_key
316 self.backward_converter[self.unique_id_key] = origin_key
318 def set_protein1_key(self, origin_key):
319 self.converter[origin_key] = self.protein1_key
320 self.backward_converter[self.protein1_key] = origin_key
322 def set_protein2_key(self, origin_key):
323 self.converter[origin_key] = self.protein2_key
324 self.backward_converter[self.protein2_key] = origin_key
326 def set_residue1_key(self, origin_key):
327 self.converter[origin_key] = self.residue1_key
328 self.backward_converter[self.residue1_key] = origin_key
330 def set_residue2_key(self, origin_key):
331 self.converter[origin_key] = self.residue2_key
332 self.backward_converter[self.residue2_key] = origin_key
334 def set_residue1_amino_acid_key(self, origin_key):
335 self.converter[origin_key] = self.residue1_amino_acid_key
336 self.backward_converter[self.residue1_amino_acid_key] = origin_key
338 def set_residue2_amino_acid_key(self, origin_key):
339 self.converter[origin_key] = self.residue2_amino_acid_key
340 self.backward_converter[self.residue2_amino_acid_key] = origin_key
342 def set_residue1_moiety_key(self, origin_key):
343 self.converter[origin_key] = self.residue1_moiety_key
344 self.backward_converter[self.residue1_moiety_key] = origin_key
346 def set_residue2_moiety_key(self, origin_key):
347 self.converter[origin_key] = self.residue2_moiety_key
348 self.backward_converter[self.residue2_moiety_key] = origin_key
350 def set_site_pairs_key(self, origin_key):
351 self.converter[origin_key] = self.site_pairs_key
352 self.backward_converter[self.site_pairs_key] = origin_key
354 def set_id_score_key(self, origin_key):
355 self.converter[origin_key] = self.id_score_key
356 self.backward_converter[self.id_score_key] = origin_key
358 def set_fdr_key(self, origin_key):
359 self.converter[origin_key] = self.fdr_key
360 self.backward_converter[self.fdr_key] = origin_key
362 def set_quantitation_key(self, origin_key):
363 self.converter[origin_key] = self.quantitation_key
364 self.backward_converter[self.quantitation_key] = origin_key
366 def set_psi_key(self, origin_key):
367 self.converter[origin_key] = self.psi_key
368 self.backward_converter[self.psi_key] = origin_key
370 def set_link_type_key(self, link_type_key):
371 self.converter[link_type_key] = self.link_type_key
372 self.backward_converter[self.link_type_key] = link_type_key
376 Returns the dictionary that convert the old keywords to the new ones
379 return self.converter
383 Returns the dictionary that convert the new keywords to the old ones
386 return self.backward_converter
391 A class to handle different styles of site pairs parsers.
393 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
394 [Y3-;K4-] for dead-ends
395 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
396 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
397 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
402 def __init__(self, style):
403 _CrossLinkDataBaseStandardKeys.__init__(self)
404 if style ==
"MSSTUDIO":
406 self.compulsory_keys = set([self.protein1_key,
408 self.site_pairs_key])
409 elif style ==
"XTRACT" or style ==
"QUANTITATION":
411 self.compulsory_keys = set([self.site_pairs_key])
412 elif style ==
"LAN_HUANG":
414 self.compulsory_keys = set([self.site_pairs_key])
416 raise Exception(
"ResiduePairListParser: unknown list parser style")
418 def get_compulsory_keys(self):
419 return self.compulsory_keys
423 This function returns a list of cross-linked residues and the
424 corresponding list of cross-linked chains. The latter list can be
425 empty, if the style doesn't have the corresponding information.
427 if self.style ==
"MSSTUDIO":
428 input_string = input_string.replace(
"[",
"")
429 input_string = input_string.replace(
"]",
"")
430 input_string_pairs = input_string.split(
";")
431 residue_pair_indexes = []
432 chain_pair_indexes = []
433 for s
in input_string_pairs:
435 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
436 r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
439 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
442 (residue_type_1, residue_index_1, residue_type_2,
443 residue_index_2) = m1.group(1, 2, 3, 4)
444 residue_pair_indexes.append((residue_index_1,
448 residue_type_1, residue_index_1 = m2.group(1, 2)
450 return residue_pair_indexes, chain_pair_indexes
451 if self.style ==
"XTRACT" or self.style ==
"QUANTITATION":
452 if ":x:" in input_string:
454 input_strings = input_string.split(
":x:")
455 first_peptides = input_strings[0].split(
":|:")
456 second_peptides = input_strings[1].split(
":|:")
457 first_peptides_identifiers = [
459 f.split(
":")[1])
for f
in first_peptides]
460 second_peptides_identifiers = [
462 f.split(
":")[1])
for f
in second_peptides]
463 residue_pair_indexes = []
464 chain_pair_indexes = []
465 for fpi
in first_peptides_identifiers:
466 for spi
in second_peptides_identifiers:
471 residue_pair_indexes.append((residue1, residue2))
472 chain_pair_indexes.append((chain1, chain2))
473 return residue_pair_indexes, chain_pair_indexes
476 first_peptides = input_string.split(
":|:")
477 first_peptides_identifiers = [
478 (f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
481 for fpi
in first_peptides_identifiers:
484 residue_indexes.append((residue1,))
485 chain_indexes.append((chain1,))
486 return residue_indexes, chain_indexes
487 if self.style ==
"LAN_HUANG":
488 input_strings = input_string.split(
"-")
489 chain1, first_series = input_strings[0].split(
":")
490 chain2, second_series = input_strings[1].split(
":")
492 first_residues = first_series.replace(
";",
"|").split(
"|")
493 second_residues = second_series.replace(
";",
"|").split(
"|")
494 residue_pair_indexes = []
495 chain_pair_indexes = []
496 for fpi
in first_residues:
497 for spi
in second_residues:
498 residue1 = self.re.sub(
"[^0-9]",
"", fpi)
499 residue2 = self.re.sub(
"[^0-9]",
"", spi)
500 residue_pair_indexes.append((residue1, residue2))
501 chain_pair_indexes.append((chain1, chain2))
502 return residue_pair_indexes, chain_pair_indexes
507 A class to handle different XL format with fixed format
508 currently support ProXL
510 def __init__(self, format):
511 _CrossLinkDataBaseStandardKeys.__init__(self)
512 if format ==
"PROXL":
515 raise Exception(
"FixedFormatParser: unknown list format name")
517 def get_data(self, input_string):
518 if self.format ==
"PROXL":
519 tokens = input_string.split(
"\t")
521 if tokens[0] ==
"SEARCH ID(S)":
524 xl[self.protein1_key] = tokens[2]
525 xl[self.protein2_key] = tokens[4]
526 xl[self.residue1_key] = int(tokens[3])
527 xl[self.residue2_key] = int(tokens[5])
533 this class handles a cross-link dataset and do filtering
534 operations, adding cross-links, merge datasets...
537 def __init__(self, converter=None, data_base=None, fasta_seq=None,
541 @param converter an instance of CrossLinkDataBaseKeywordsConverter
542 @param data_base an instance of CrossLinkDataBase to build the
544 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
545 protein fasta sequences to check cross-link consistency.
546 If not given consistency will not be checked
547 @param linkable_aa a tuple containing one-letter amino acids which
548 are linkable by the cross-linker; only used if the database
549 DOES NOT provide a value for a certain residueX_amino_acid_key
550 and if a fasta_seq is given
553 if data_base
is None:
556 self.data_base = data_base
558 _CrossLinkDataBaseStandardKeys.__init__(self)
559 if converter
is not None:
560 self.cldbkc = converter
561 self.list_parser = self.cldbkc.rplp
562 self.converter = converter.get_converter()
566 self.list_parser =
None
567 self.converter =
None
570 self.def_aa_tuple = linkable_aa
571 self.fasta_seq = fasta_seq
578 Update the whole dataset after changes
580 self.update_cross_link_unique_sub_index()
581 self.update_cross_link_redundancy()
582 self.update_residues_links_number()
586 sorted_ids = sorted(self.data_base.keys())
588 for xl
in self.data_base[k]:
591 def xlid_iterator(self):
592 sorted_ids = sorted(self.data_base.keys())
593 for xlid
in sorted_ids:
596 def __getitem__(self, xlid):
597 return self.data_base[xlid]
600 return len([xl
for xl
in self])
605 def set_name(self, name):
607 for k
in self.data_base:
608 new_data_base[k +
"." + name] = self.data_base[k]
609 self.data_base = new_data_base
613 def get_number_of_xlid(self):
614 return len(self.data_base)
617 FixedFormatParser=
None):
619 if FixedFormatParser is not specified, the file is
620 comma-separated-values
622 @param file_name a txt file to be parsed
623 @param converter an instance of CrossLinkDataBaseKeywordsConverter
624 @param FixedFormatParser a parser for a fixed format
626 if not FixedFormatParser:
627 xl_list = IMP.pmi.tools.get_db_from_csv(file_name)
629 if converter
is not None:
630 self.cldbkc = converter
631 self.list_parser = self.cldbkc.rplp
632 self.converter = converter.get_converter()
634 if not self.list_parser:
638 for nxl, xl
in enumerate(xl_list):
641 if k
in self.converter:
642 new_xl[self.converter[k]] = \
643 self.type[self.converter[k]](xl[k])
646 if self.unique_id_key
in self.cldbkc.get_setup_keys():
647 if new_xl[self.unique_id_key]
not in new_xl_dict:
648 new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
650 new_xl_dict[new_xl[self.unique_id_key]].append(
653 if str(nxl)
not in new_xl_dict:
654 new_xl_dict[str(nxl)] = [new_xl]
656 new_xl_dict[str(nxl)].append(new_xl)
662 for nxl, entry
in enumerate(xl_list):
666 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
668 "CrossLinkDataBase: expecting a site_pairs_key "
669 "for the site pair list parser")
671 if k
in self.converter:
672 new_dict[self.converter[k]] = \
673 self.type[self.converter[k]](entry[k])
675 new_dict[k] = entry[k]
678 chain_pair_list) = self.list_parser.get_list(
679 new_dict[self.site_pairs_key])
682 for n, p
in enumerate(residue_pair_list):
689 new_xl[k] = new_dict[k]
690 new_xl[self.residue1_key] = \
691 self.type[self.residue1_key](p[0])
693 new_xl[self.residue2_key] = \
694 self.type[self.residue2_key](p[1])
696 if len(chain_pair_list) == len(residue_pair_list):
697 new_xl[self.protein1_key] = \
698 self.type[self.protein1_key](
699 chain_pair_list[n][0])
701 new_xl[self.protein2_key] = \
702 self.type[self.protein2_key](
703 chain_pair_list[n][1])
706 new_xl[self.link_type_key] =
"CROSSLINK"
708 new_xl[self.link_type_key] =
"MONOLINK"
710 if self.unique_id_key
in self.cldbkc.get_setup_keys():
711 if new_xl[self.unique_id_key]
not in new_xl_dict:
712 new_xl_dict[new_xl[self.unique_id_key]] = \
715 new_xl_dict[new_xl[self.unique_id_key]].append(
718 if str(nxl)
not in new_xl_dict:
719 new_xl[self.unique_id_key] = str(nxl+1)
720 new_xl_dict[str(nxl)] = [new_xl]
722 new_xl[self.unique_id_key] = str(nxl+1)
723 new_xl_dict[str(nxl)].append(new_xl)
727 if FixedFormatParser is defined
732 with open(file_name,
"r") as f:
734 xl = FixedFormatParser.get_data(line)
736 xl[self.unique_id_key] = str(nxl+1)
737 new_xl_dict[str(nxl)] = [xl]
740 self.data_base = new_xl_dict
741 self.name = file_name
742 loc = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
743 self.dataset = ihm.dataset.CXMSDataset(loc)
746 def update_cross_link_unique_sub_index(self):
747 for k
in self.data_base:
748 for n, xl
in enumerate(self.data_base[k]):
749 xl[self.ambiguity_key] = len(self.data_base[k])
750 xl[self.unique_sub_index_key] = n+1
751 xl[self.unique_sub_id_key] = k +
"." + str(n+1)
753 def update_cross_link_redundancy(self):
754 redundancy_data_base = {}
756 pra = _ProteinsResiduesArray(xl)
757 if pra
not in redundancy_data_base:
758 redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
759 redundancy_data_base[pra.get_inverted()] = \
760 [xl[self.unique_sub_id_key]]
762 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
763 redundancy_data_base[pra.get_inverted()].append(
764 xl[self.unique_sub_id_key])
766 pra = _ProteinsResiduesArray(xl)
767 xl[self.redundancy_key] = len(redundancy_data_base[pra])
768 xl[self.redundancy_list_key] = redundancy_data_base[pra]
770 def update_residues_links_number(self):
773 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
774 if (p1, r1)
not in residue_links:
775 residue_links[(p1, r1)] = set([(p2, r2)])
777 residue_links[(p1, r1)].add((p2, r2))
778 if (p2, r2)
not in residue_links:
779 residue_links[(p2, r2)] = set([(p1, r1)])
781 residue_links[(p2, r2)].add((p1, r1))
784 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
785 xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
786 xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
789 """This function checks the consistency of the dataset with the
790 amino acid sequence"""
791 if self.cldbkc
and self.fasta_seq:
792 cnt_matched, cnt_matched_file = 0, 0
796 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
797 b_matched_file =
False
798 if self.residue1_amino_acid_key
in xl:
801 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
802 b_matched = self._match_xlinks(p1, r1, aa_from_file)
803 b_matched_file = b_matched
807 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
809 matched, non_matched = self._update_matched_xlinks(
810 b_matched, p1, r1, matched, non_matched)
812 if self.residue2_amino_acid_key
in xl:
813 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
814 b_matched = self._match_xlinks(p2, r2, aa_from_file)
815 b_matched_file = b_matched
817 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
819 matched, non_matched = self._update_matched_xlinks(
820 b_matched, p2, r2, matched, non_matched)
824 cnt_matched_file += 1
826 percentage_matched = round(100*cnt_matched/len(self), 1)
827 percentage_matched_file = round(
828 100 * cnt_matched_file / len(self), 1)
829 if matched
or non_matched:
831 "check_cross_link_consistency: Out of %d cross-links "
832 "%d were matched to the fasta sequence (%f %%).\n "
833 "%d were matched by using the cross-link file (%f %%)."
834 % (len(self), cnt_matched, percentage_matched,
835 cnt_matched_file, percentage_matched_file))
837 print(
"check_cross_link_consistency: Warning: Non "
839 [(prot_name, sorted(list(non_matched[prot_name])))
840 for prot_name
in non_matched])
841 return matched, non_matched
843 def _match_xlinks(self, prot_name, res_index, aa_tuple):
847 amino_dict = IMP.pmi.alphabets.amino_acid
849 for amino_acid
in aa_tuple:
850 if len(amino_acid) == 3:
851 amino_acid = amino_dict.get_one_letter_code_from_residue_type(
853 if prot_name
in self.fasta_seq.sequences:
854 seq = self.fasta_seq.sequences[prot_name]
862 if res_index == 0
or res_index == 1:
864 if res_index < len(seq):
865 if amino_acid == seq[res_index]:
869 def _update_matched_xlinks(self, b_matched, prot, res, matched,
873 matched[prot].add(res)
875 matched[prot] = set([res])
877 if prot
in non_matched:
878 non_matched[prot].add(res)
880 non_matched[prot] = set([res])
881 return matched, non_matched
883 def get_cross_link_string(self, xl):
885 for k
in self.ordered_key_list:
887 string += str(k) +
":" + str(xl[k]) +
"|"
892 if k
not in self.ordered_key_list:
893 string += str(k) +
":" + str(xl[k]) +
"|"
897 def get_short_cross_link_string(self, xl):
899 list_of_keys = [self.data_set_name_key,
900 self.unique_sub_id_key,
908 for k
in list_of_keys:
910 string += str(xl[k]) +
"|"
916 def filter(self, FilterOperator):
918 for id
in self.data_base.keys():
919 for xl
in self.data_base[id]:
920 if FilterOperator.evaluate(xl):
921 if id
not in new_xl_dict:
922 new_xl_dict[id] = [xl]
924 new_xl_dict[id].append(xl)
927 cdb.dataset = self.dataset
931 '''Get all cross-links with score greater than an input value'''
933 self.id_score_key, operator.gt, score)
934 return self.filter(FilterOperator)
936 def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
938 This function merges two cross-link datasets so that if two
939 conflicting cross-links have the same cross-link UniqueIDS,
940 the cross-links will be appended under the same UniqueID slots
941 with different SubIDs
943 raise NotImplementedError()
946 """Append cross-link dataset to this one."""
948 for k
in self.data_base:
949 new_data_base[k] = self.data_base[k]
950 for k
in db.data_base:
951 new_data_base[k] = db.data_base[k]
952 self.data_base = new_data_base
955 def __iadd__(self, db):
959 def set_value(self, key, new_value, filter_operator=None):
961 This function changes the value for a given key in the database
962 For instance one can change the name of a protein
963 @param key: the key in the database that must be changed
964 @param new_value: the new value of the key
965 @param filter_operator: optional FilterOperator to change the value to
966 a subset of the database
968 example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
969 FO(cldb.protein1_key, operator.eq, "AAA"))`
973 if filter_operator
is not None:
974 if filter_operator.evaluate(xl):
982 this function returns the list of values for a given key in the
983 database alphanumerically sorted
988 return sorted(list(values))
992 This function offset the residue indexes of a given protein by
994 @param protein_name: the protein name that need to be changed
995 @param offset: the offset value
999 if xl[self.protein1_key] == protein_name:
1000 xl[self.residue1_key] = xl[self.residue1_key] + offset
1001 if xl[self.protein2_key] == protein_name:
1002 xl[self.residue2_key] = xl[self.residue2_key] + offset
1007 This function creates a new keyword for the whole database and
1008 set the values from and existing keyword (optional), otherwise the
1009 values are set to None
1010 @param keyword the new keyword name:
1011 @param values_from_keyword the keyword from which we are copying
1015 if values_from_keyword
is not None:
1016 xl[keyword] = xl[values_from_keyword]
1022 protein_to_rename=
"both"):
1024 This function renames all proteins contained in the input dictionary
1025 from the old names (keys) to the new name (values)
1026 @param old_to_new_names_dictionary dictionary for converting old
1028 @param protein_to_rename specify whether to rename both or protein1
1032 for old_name
in old_to_new_names_dictionary:
1033 new_name = old_to_new_names_dictionary[old_name]
1034 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
1036 self.
set_value(self.protein1_key, new_name, fo2)
1037 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
1039 self.
set_value(self.protein2_key, new_name, fo2)
1043 This function parses an alignment file obtained previously, to map
1044 crosslink residues onto the sequence of a homolog proteins. It is
1045 useful if you want to map the crosslinks onto a known, homolog
1046 structure without building a comparative model.
1047 The txt file of the alignment should be structured with repeating
1050 >NAME_CROSSLINKED_PROTEIN
1051 >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1052 one-letter code sequence for cross-linked protein (one line)
1053 one-letter code sequence for homolog protein with known structure (one line)
1056 f = open(alignfile,
'r')
1058 def _assign_residues(seq1, seq2):
1066 resinds1.append(nres)
1068 resinds1.append(
None)
1070 resinds2 = [
None for i
in resinds1]
1073 for pos, r2
in enumerate(seq2):
1076 resinds2[pos] = nres
1078 if contiguousgap > 1:
1079 for i
in range(pos-int(contiguousgap/2), pos):
1084 resinds2[pos] = lastresind
1086 for n, r
in enumerate(resinds1):
1094 with open(alignfile,
'r') as f:
1100 protname = aa.replace(
">",
"").replace(
"\n",
"")
1101 seq1 = cc.replace(
",",
"").replace(
"\n",
"")
1102 seq2 = dd.replace(
",",
"").replace(
"\n",
"")
1108 for i
in range(len(seq1)):
1120 tmpl1.append([i, c1, seq1[i]])
1121 tmpl2.append([i, c2, seq2[i]])
1122 sequences1[protname] = tmpl1
1123 sequences2[protname] = tmpl2
1125 d = _assign_residues(seq1, seq2)
1131 if fo11.evaluate(xl):
1132 t = [item
for item
in sequences1[protname]
1133 if item[1] == xl[self.residue1_key]][0]
1134 sequences1[protname][t[0]] = [
1135 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1136 t = [item
for item
in sequences2[protname]
1137 if item[1] == d[xl[self.residue1_key]]][0]
1138 sequences2[protname][t[0]] = [
1139 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1141 xl[self.residue1_key] = d[xl[self.residue1_key]]
1143 if fo12.evaluate(xl):
1145 t = [item
for item
in sequences1[protname]
1146 if item[1] == xl[self.residue2_key]][0]
1147 sequences1[protname][t[0]] = [
1148 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1149 t = [item
for item
in sequences2[protname]
1150 if item[1] == d[xl[self.residue2_key]]][0]
1151 sequences2[protname][t[0]] = [
1152 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1154 xl[self.residue2_key] = d[xl[self.residue2_key]]
1157 print(aa.replace(
"\n",
""))
1158 print(
"".join([seq[2]
for seq
in sequences1[protname]]))
1159 print(bb.replace(
"\n",
""))
1160 print(
"".join([seq[2]
for seq
in sequences2[protname]]))
1164 This function creates as many classes as in the input
1165 (number_of_classes: integer) and partition cross-links according
1166 to their identification scores. Classes are defined in the psi key.
1169 if self.id_score_key
is not None:
1173 'The cross-link database does not contain score values')
1174 minscore = min(scores)
1175 maxscore = max(scores)
1176 scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1177 if self.psi_key
is None:
1180 score = xl[self.id_score_key]
1181 for n, classmin
in enumerate(scoreclasses[0:-1]):
1182 if score >= classmin
and score <= scoreclasses[n+1]:
1183 xl[self.psi_key] = str(
"CLASS_"+str(n))
1186 def clone_protein(self, protein_name, new_protein_name):
1187 for id
in self.data_base.keys():
1189 for xl
in self.data_base[id]:
1190 new_data_base.append(xl)
1191 if xl[self.protein1_key] == protein_name \
1192 and xl[self.protein2_key] != protein_name:
1194 new_xl[self.protein1_key] = new_protein_name
1195 new_data_base.append(new_xl)
1196 elif xl[self.protein1_key] != protein_name \
1197 and xl[self.protein2_key] == protein_name:
1199 new_xl[self.protein2_key] = new_protein_name
1200 new_data_base.append(new_xl)
1201 elif xl[self.protein1_key] == protein_name \
1202 and xl[self.protein2_key] == protein_name:
1204 new_xl[self.protein1_key] = new_protein_name
1205 new_data_base.append(new_xl)
1207 new_xl[self.protein2_key] = new_protein_name
1208 new_data_base.append(new_xl)
1210 new_xl[self.protein1_key] = new_protein_name
1211 new_xl[self.protein2_key] = new_protein_name
1212 new_data_base.append(new_xl)
1213 self.data_base[id] = new_data_base
1218 This function remove cross-links applied to the same residue
1219 (ie, same chain name and residue number)
1221 for id
in self.data_base.keys():
1223 for xl
in self.data_base[id]:
1224 if xl[self.protein1_key] == xl[self.protein2_key] \
1225 and xl[self.residue1_key] == xl[self.residue2_key]:
1228 new_data_base.append(xl)
1229 self.data_base[id] = new_data_base
1234 this method returns a CrossLinkDataBase class containing
1235 a random subsample of the original cross-link database.
1236 @param percentage float between 0 and 1, is the percentage of
1237 of spectra taken from the original list
1240 if percentage > 1.0
or percentage < 0.0:
1241 raise ValueError(
'the percentage of random cross-link spectra '
1242 'should be between 0 and 1')
1243 nspectra = self.get_number_of_xlid()
1244 nrandom_spectra = int(nspectra*percentage)
1245 random_keys = random.sample(sorted(self.data_base.keys()),
1248 for k
in random_keys:
1249 new_data_base[k] = self.data_base[k]
1254 sorted_ids = sorted(self.data_base.keys())
1256 for id
in sorted_ids:
1258 for xl
in self.data_base[id]:
1259 for k
in self.ordered_key_list:
1261 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1266 if k
not in self.ordered_key_list:
1267 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1268 outstr +=
"-------------\n"
1271 def plot(self, filename, **kwargs):
1273 matplotlib.use(
'Agg')
1274 import matplotlib.pyplot
as plt
1275 import matplotlib.colors
1277 if kwargs[
"type"] ==
"scatter":
1278 cmap = plt.get_cmap(
"rainbow")
1279 _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1280 xkey = kwargs[
"xkey"]
1281 ykey = kwargs[
"ykey"]
1282 if "colorkey" in kwargs:
1283 colorkey = kwargs[
"colorkey"]
1284 if "logyscale" in kwargs:
1285 logyscale = kwargs[
"logyscale"]
1293 xs.append(float(xl[xkey]))
1295 ys.append(math.log10(float(xl[ykey])))
1297 ys.append(float(xl[ykey]))
1298 colors.append(float(xl[colorkey]))
1300 print(
"CrossLinkDataBase.plot: Value error for "
1301 "cross-link %s" % (xl[self.unique_id_key]))
1305 for color
in colors:
1306 cindex = (color-min(colors))/(max(colors)-min(colors))
1307 cs.append(cmap(cindex))
1310 ax = fig.add_subplot(111)
1311 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker=
"o")
1314 plt.savefig(filename)
1318 if kwargs[
"type"] ==
"residue_links":
1322 molecule = kwargs[
"molecule"]
1324 length = len(molecule.sequence)
1325 molecule = molecule.get_name()
1328 sequences_object = kwargs[
"sequences_object"]
1329 sequence = sequences_object.sequences[molecule]
1330 length = len(sequence)
1332 histogram = [0]*length
1334 if xl[self.protein1_key] == molecule:
1335 histogram[xl[self.residue1_key]-1] = \
1336 xl[self.residue1_links_number_key]
1337 if xl[self.protein2_key] == molecule:
1338 histogram[xl[self.residue2_key]-1] = \
1339 xl[self.residue2_links_number_key]
1340 plt.bar(range(1, length+1), histogram)
1341 plt.savefig(filename)
1345 if kwargs[
"type"] ==
"histogram":
1346 valuekey = kwargs[
"valuekey"]
1347 valuename = valuekey
1348 bins = kwargs[
"bins"]
1352 values_list.append(float(xl[valuekey]))
1354 print(
"CrossLinkDataBase.plot: Value error for cross-link "
1355 "%s" % (xl[self.unique_id_key]))
1358 filename, [values_list], valuename=valuename, bins=bins,
1359 colors=
None, format=
"pdf",
1360 reference_xline=
None, yplotrange=
None,
1361 xplotrange=
None, normalized=
True,
1364 def dump(self, json_filename):
1366 with open(json_filename,
'w')
as fp:
1367 json.dump(self.data_base, fp, sort_keys=
True, indent=2,
1368 default=set_json_default)
1370 def load(self, json_filename):
1372 with open(json_filename,
'r') as fp:
1373 self.data_base = json.load(fp)
1377 if sys.version_info[0] < 3:
1379 for k, v
in xl.iteritems():
1380 if type(k)
is unicode:
1382 if type(v)
is unicode:
1386 def save_csv(self, filename):
1391 sorted_group_ids = sorted(self.data_base.keys())
1392 for group
in sorted_group_ids:
1394 for xl
in self.data_base[group]:
1396 sorted_ids = sorted(xl.keys())
1397 values = [xl[k]
for k
in sorted_ids]
1398 group_block.append(values)
1401 with open(filename,
'w')
as fp:
1402 a = csv.writer(fp, delimiter=
',')
1403 a.writerow(sorted_ids)
1408 Returns the number of non redundant cross-link sites
1412 pra = _ProteinsResiduesArray(xl)
1413 prai = pra.get_inverted()
1416 return len(list(praset))
1420 """This class allows to compute and plot the distance between datasets"""
1423 """Input a dictionary where keys are cldb names and values are cldbs"""
1424 import scipy.spatial.distance
1425 self.dbs = cldb_dictionary
1426 self.keylist = list(self.dbs.keys())
1427 self.distances = list()
1429 for i, key1
in enumerate(self.keylist):
1430 for key2
in self.keylist[i+1:]:
1431 distance = self.get_distance(key1, key2)
1432 self.distances.append(distance)
1434 self.distances = scipy.spatial.distance.squareform(self.distances)
1437 return 1.0 - self.
jaccard_index(self.dbs[key1], self.dbs[key2])
1440 """Similarity index between two datasets
1441 https://en.wikipedia.org/wiki/Jaccard_index"""
1445 for xl1
in CrossLinkDataBase1:
1446 a1f = _ProteinsResiduesArray(xl1)
1447 a1b = a1f.get_inverted()
1450 for xl2
in CrossLinkDataBase2:
1451 a2f = _ProteinsResiduesArray(xl2)
1452 a2b = a2f.get_inverted()
1455 return (float(len(set1 & set2)/2)
1456 / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1458 def plot_matrix(self, figurename="clustermatrix.pdf"):
1459 import matplotlib
as mpl
1462 import matplotlib.pylab
as pl
1463 from scipy.cluster
import hierarchy
as hrc
1465 raw_distance_matrix = self.distances
1466 labels = self.keylist
1470 ax = fig.add_subplot(1, 1, 1)
1471 dendrogram = hrc.dendrogram(
1472 hrc.linkage(raw_distance_matrix),
1475 leaves_order = dendrogram[
'leaves']
1476 ax.set_xlabel(
'Dataset')
1477 ax.set_ylabel(
'Jaccard Distance')
1479 pl.savefig(
"dendrogram."+figurename, dpi=300)
1484 ax = fig.add_subplot(1, 1, 1)
1485 cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1486 interpolation=
'nearest')
1487 cb = fig.colorbar(cax)
1488 cb.set_label(
'Jaccard Distance')
1489 ax.set_xlabel(
'Dataset')
1490 ax.set_ylabel(
'Dataset')
1491 ax.set_xticks(range(len(labels)))
1492 ax.set_xticklabels(numpy.array(labels)[leaves_order],
1493 rotation=
'vertical')
1494 ax.set_yticks(range(len(labels)))
1495 ax.set_yticklabels(numpy.array(labels)[leaves_order],
1496 rotation=
'horizontal')
1498 pl.savefig(
"matrix." + figurename, dpi=300)
1504 This class maps a CrossLinkDataBase on a given structure
1505 and save an rmf file with color-coded cross-links
1508 def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1509 self.CrossLinkDataBase = CrossLinkDataBase
1512 self.prots = rmf_or_stat_handler
1513 self.distances = defaultdict(list)
1514 self.array_to_id = {}
1515 self.id_to_array = {}
1517 print(
"computing distances fro all cross-links and all structures")
1518 for i
in self.prots[::10]:
1519 self.compute_distances()
1520 for key, xl
in enumerate(self.CrossLinkDataBase):
1521 array = _ProteinsResiduesArray(xl)
1522 self.array_to_id[array] = key
1523 self.id_to_array[key] = array
1524 if xl[
"MinAmbiguousDistance"] !=
'None':
1525 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1527 def compute_distances(self):
1530 sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1531 for group
in sorted_group_ids:
1533 for xl
in self.CrossLinkDataBase.data_base[group]:
1535 sorted_ids = sorted(xl.keys())
1538 + [
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1539 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1541 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1542 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1552 values = [xl[k]
for k
in sorted_ids]
1553 values += [group, mdist]
1554 except KeyError
as e:
1555 print(
"MapCrossLinkDataBaseOnStructure "
1556 "KeyError: {0} in {1}".format(e, xl))
1557 group_dists.append(mdist)
1558 xl[
"Distance"] = mdist
1559 xl[
"State1"] = state1
1561 xl[
"State2"] = state2
1564 for xl
in self.CrossLinkDataBase.data_base[group]:
1565 xl[
"MinAmbiguousDistance"] = min(group_dists)
1567 def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1568 '''more robust and slower version of above'''
1571 selpart_1 = sel.get_selected_particles()
1572 if len(selpart_1) == 0:
1573 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1574 "selected for first site")
1578 selpart_2 = sel.get_selected_particles()
1579 if len(selpart_2) == 0:
1580 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1581 "selected for second site")
1584 for p1
in selpart_1:
1585 for p2
in selpart_2:
1586 if p1 == p2
and r1 == r2:
1594 h1 = h1.get_parent()
1597 h1 = h1.get_parent()
1601 h2 = h2.get_parent()
1604 h2 = h2.get_parent()
1607 results.append((dist, state_index1, copy_index1,
1608 state_index2, copy_index2, p1, p2))
1609 if len(results) == 0:
1611 results_sorted = sorted(results,
1612 key=operator.itemgetter(0, 1, 2, 3, 4))
1613 return ((results_sorted[0][0],
1614 results_sorted[0][5],
1615 results_sorted[0][6]),
1616 (results_sorted[0][1],
1617 results_sorted[0][2],
1618 results_sorted[0][3],
1619 results_sorted[0][4]))
1621 def save_rmf_snapshot(self, filename, color_id=None):
1622 sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1625 for group
in sorted_group_ids:
1626 group_dists_particles = []
1627 for xl
in self.CrossLinkDataBase.data_base[group]:
1629 self.CrossLinkDataBase.get_short_cross_link_string(xl)
1630 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1632 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1633 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1635 print(
"TypeError or missing chain/residue ",
1638 if color_id
is not None:
1639 group_dists_particles.append((mdist, p1, p2, xllabel,
1640 float(xl[color_id])))
1642 group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1643 if group_dists_particles:
1644 (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1645 min(group_dists_particles, key=
lambda t: t[0])
1646 color_scores.append(mincolor_score)
1647 list_of_pairs.append((minp1, minp2, minxllabel,
1653 linear.set_slope(1.0)
1656 'linear_dummy_restraints')
1658 offset = min(color_scores)
1659 maxvalue = max(color_scores)
1660 for pair
in list_of_pairs:
1663 pr.set_name(pair[2])
1664 if offset < maxvalue:
1665 factor = (pair[3]-offset)/(maxvalue-offset)
1672 rslin.add_restraint(pr)
1675 rh = RMF.create_rmf_file(filename)
1682 def boxplot_crosslink_distances(self, filename):
1684 keys = [self.id_to_array[k]
for k
in self.distances.keys()]
1685 medians = [numpy.median(self.distances[self.array_to_id[k]])
1687 dists = [self.distances[self.array_to_id[k]]
for k
in keys]
1688 distances_sorted_by_median = \
1689 [x
for _, x
in sorted(zip(medians, dists))]
1690 keys_sorted_by_median = [x
for _, x
in sorted(zip(medians, keys))]
1692 filename, distances_sorted_by_median,
1693 range(len(distances_sorted_by_median)),
1694 xlabels=keys_sorted_by_median,
1695 scale_plot_length=0.2)
1697 def get_crosslink_violations(self, threshold):
1698 violations = defaultdict(list)
1699 for k
in self.distances:
1701 viols = self.distances[k]
1702 violations[self.id_to_array[k]] = viols
1708 This class generates a CrossLinkDataBase from a given structure
1710 def __init__(self, system, residue_types_1=["K"],
1711 residue_types_2=[
"K"], reactivity_range=[0, 1], kt=1.0):
1715 cldbkc.set_protein1_key(
"Protein1")
1716 cldbkc.set_protein2_key(
"Protein2")
1717 cldbkc.set_residue1_key(
"Residue1")
1718 cldbkc.set_residue2_key(
"Residue2")
1720 self.system = system
1721 self.model = self.system.model
1722 self.residue_types_1 = residue_types_1
1723 self.residue_types_2 = residue_types_2
1725 self.indexes_dict1 = {}
1726 self.indexes_dict2 = {}
1727 self.protein_residue_dict = {}
1728 self.reactivity_dictionary = {}
1729 self.euclidean_interacting_pairs =
None
1730 self.xwalk_interacting_pairs =
None
1732 for state
in self.system.get_states():
1733 for moleculename, molecules
in state.get_molecules().items():
1734 for molecule
in molecules:
1737 seq = molecule.sequence
1738 residues = [i
for i
in range(1, len(seq)+1)
1739 if ((seq[i-1]
in self.residue_types_1)
1740 or (seq[i-1]
in self.residue_types_2))]
1746 rexp = numpy.random.exponential(0.00000001)
1747 prob = 1.0-math.exp(-rexp)
1748 self.reactivity_dictionary[(molecule, r)] = prob
1750 residues1 = [i
for i
in range(1, len(seq)+1)
1751 if seq[i-1]
in self.residue_types_1]
1752 residues2 = [i
for i
in range(1, len(seq)+1)
1753 if seq[i-1]
in self.residue_types_2]
1756 molecule.hier, residue_index=r, resolution=1)
1757 ps = s.get_selected_particles()
1759 index = p.get_index()
1760 self.indexes_dict1[index] = (molecule, r)
1761 self.protein_residue_dict[(molecule, r)] = index
1764 molecule.hier, residue_index=r, resolution=1)
1765 ps = s.get_selected_particles()
1767 index = p.get_index()
1768 self.indexes_dict2[index] = (molecule, r)
1769 self.protein_residue_dict[(molecule, r)] = index
1771 def get_all_possible_pairs(self):
1772 n = float(len(self.protein_residue_dict.keys()))
1773 return n*(n-1.0)/2.0
1775 def get_all_feasible_pairs(self, distance=21):
1777 particle_index_pairs = []
1779 for a, b
in itertools.combinations(
1780 self.protein_residue_dict.keys(), 2):
1782 index1 = self.protein_residue_dict[a]
1783 index2 = self.protein_residue_dict[b]
1787 if particle_distance <= distance:
1788 particle_index_pairs.append((index1, index2))
1789 new_xl[self.cldb.protein1_key] = a[0].get_name()
1790 new_xl[self.cldb.protein2_key] = b[0].get_name()
1791 new_xl[
"molecule_object1"] = a[0]
1792 new_xl[
"molecule_object2"] = b[0]
1793 new_xl[self.cldb.residue1_key] = a[1]
1794 new_xl[self.cldb.residue2_key] = b[1]
1795 self.cldb.data_base[str(nxl)] = [new_xl]
1800 def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1801 noise=0.01, distance=21, max_delta_distance=10.0,
1802 xwalk_bin_path=
None, confidence_false=0.75,
1803 confidence_true=0.75):
1805 from random
import random
1807 number_of_spectra = 1
1809 self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1810 self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1811 self.cldb.data_base[str(number_of_spectra)] = []
1812 self.sites_weighted =
None
1814 while number_of_spectra < total_number_of_spectra:
1815 if (random() > ambiguity_probability
1816 and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1818 number_of_spectra += 1
1819 self.cldb.data_base[str(number_of_spectra)] = []
1821 if random() > noise:
1823 pra, dist = self.get_random_residue_pair(
1824 distance, xwalk_bin_path, max_delta_distance)
1827 pra, dist = self.get_random_residue_pair(
None,
None,
None)
1831 new_xl[self.cldb.protein1_key] = pra[0].get_name()
1832 new_xl[self.cldb.protein2_key] = pra[1].get_name()
1833 new_xl[
"molecule_object1"] = pra[0]
1834 new_xl[
"molecule_object2"] = pra[1]
1835 new_xl[self.cldb.residue1_key] = pra[2]
1836 new_xl[self.cldb.residue2_key] = pra[3]
1837 new_xl[
"Noisy"] = noisy
1839 new_xl[
"Reactivity_Residue1"] = \
1840 self.reactivity_dictionary[(pra[0], pra[2])]
1841 new_xl[
"Reactivity_Residue2"] = \
1842 self.reactivity_dictionary[(pra[1], pra[3])]
1844 new_xl[
"Score"] = np.random.beta(1.0, self.beta_false)
1846 new_xl[
"Score"] = 1.0-np.random.beta(1.0, self.beta_true)
1847 new_xl[
"TargetDistance"] = dist
1848 new_xl[
"NoiseProbability"] = noise
1849 new_xl[
"AmbiguityProbability"] = ambiguity_probability
1852 self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1853 self.protein_residue_dict[(pra[1], pra[3])]])
1858 new_xl[
"InterRigidBody"] =
False
1863 new_xl[
"InterRigidBody"] =
True
1865 new_xl[
"InterRigidBody"] =
None
1867 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1871 def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1872 max_delta_distance=
None):
1875 from random
import choice
1876 if distance
is None:
1879 protein1, residue1 = choice(self.protein_residue_dict.keys())
1880 protein2, residue2 = choice(self.protein_residue_dict.keys())
1881 index1 = self.protein_residue_dict[(protein1, residue1)]
1882 index2 = self.protein_residue_dict[(protein2, residue2)]
1886 if (protein1, residue1) != (protein2, residue2):
1890 if not xwalk_bin_path:
1892 gcpf.set_distance(distance+max_delta_distance)
1896 if not self.sites_weighted:
1897 self.sites_weighted = []
1898 for key
in self.reactivity_dictionary:
1899 r = self.reactivity_dictionary[key]
1900 self.sites_weighted.append((key, r))
1902 first_site = self.weighted_choice(self.sites_weighted)
1904 if not self.euclidean_interacting_pairs:
1905 self.euclidean_interacting_pairs = \
1906 gcpf.get_close_pairs(
1908 self.indexes_dict1.keys(),
1909 self.indexes_dict2.keys())
1911 first_site_pairs = \
1912 [pair
for pair
in self.euclidean_interacting_pairs
1913 if self.indexes_dict1[pair[0]] == first_site
or
1914 self.indexes_dict2[pair[1]] == first_site]
1915 if len(first_site_pairs) == 0:
1918 second_sites_weighted = []
1919 for pair
in first_site_pairs:
1920 if self.indexes_dict1[pair[0]] == first_site:
1921 second_site = self.indexes_dict2[pair[1]]
1922 if self.indexes_dict2[pair[1]] == first_site:
1923 second_site = self.indexes_dict1[pair[0]]
1924 r = self.reactivity_dictionary[second_site]
1925 second_sites_weighted.append((second_site, r))
1926 second_site = self.weighted_choice(second_sites_weighted)
1927 protein1, residue1 = first_site
1928 protein2, residue2 = second_site
1929 print(
"CrossLinkDataBaseFromStructure."
1930 "get_random_residue_pair:",
1931 "First site", first_site,
1932 self.reactivity_dictionary[first_site],
1933 "Second site", second_site,
1934 self.reactivity_dictionary[second_site])
1937 [self.protein_residue_dict[first_site],
1938 self.protein_residue_dict[second_site]])
1943 if particle_distance < distance \
1944 and (protein1, residue1) != (protein2, residue2):
1946 elif (particle_distance >= distance
1947 and (protein1, residue1) != (protein2, residue2)
1948 and max_delta_distance):
1950 if particle_distance-distance < max_delta_distance:
1954 if not self.xwalk_interacting_pairs:
1955 self.xwalk_interacting_pairs = \
1956 self.get_xwalk_distances(xwalk_bin_path, distance)
1957 interacting_pairs_weighted = []
1959 for pair
in self.xwalk_interacting_pairs:
1965 -self.reactivity_dictionary[(protein1, residue1)]
1968 -self.reactivity_dictionary[(protein2, residue2)]
1970 interacting_pairs_weighted.append((pair, weight1*weight2))
1972 pair = self.weighted_choice(interacting_pairs_weighted)
1977 particle_distance = float(pair[4])
1979 return ((protein1, protein2, residue1, residue2)), particle_distance
1981 def get_xwalk_distances(self, xwalk_bin_path, distance):
1985 o.init_pdb(
"xwalk.pdb", self.representation.prot)
1986 o.write_pdb(
"xwalk.pdb")
1987 namechainiddict = o.dictchain[
"xwalk.pdb"]
1990 for key
in namechainiddict:
1991 chainiddict[namechainiddict[key]] = key
1992 xwalkout = os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path
1993 +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
1994 '-a1 cb -a2 cb -max '
1995 + str(distance) +
' -bb').read()
1997 output_list_of_distance = []
1998 for line
in xwalkout.split(
"\n")[0:-2]:
1999 tokens = line.split()
2002 distance = float(tokens[6])
2003 fs = first.split(
"-")
2004 ss = second.split(
"-")
2007 protein1 = chainiddict[chainid1]
2008 protein2 = chainiddict[chainid2]
2009 residue1 = int(fs[1])
2010 residue2 = int(ss[1])
2011 output_list_of_distance.append(
2012 (protein1, protein2, residue1, residue2, distance))
2013 return output_list_of_distance
2015 def weighted_choice(self, choices):
2018 total = sum(w
for c, w
in choices)
2019 r = random.uniform(0, total)
2021 for c, w
in choices:
2025 assert False,
"Shouldn't get here"
2027 def save_rmf_snapshot(self, filename, color_id=None):
2030 if color_id
is None:
2031 color_id =
"Reactivity"
2032 sorted_group_ids = sorted(self.cldb.data_base.keys())
2035 for group
in sorted_group_ids:
2036 group_dists_particles = []
2037 for xl
in self.cldb.data_base[group]:
2038 xllabel = self.cldb.get_short_cross_link_string(xl)
2039 (c1, c2, r1, r2) = (xl[
"molecule_object1"],
2040 xl[
"molecule_object2"],
2041 xl[self.cldb.residue1_key],
2042 xl[self.cldb.residue2_key])
2044 index1 = self.protein_residue_dict[(c1, r1)]
2045 index2 = self.protein_residue_dict[(c2, r2)]
2048 mdist = xl[
"TargetDistance"]
2050 print(
"TypeError or missing chain/residue ",
2053 group_dists_particles.append((mdist, p1, p2, xllabel,
2054 float(xl[color_id])))
2055 if group_dists_particles:
2056 (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2057 = min(group_dists_particles, key=
lambda t: t[0])
2058 color_scores.append(mincolor_score)
2059 list_of_pairs.append((minp1, minp2, minxllabel,
2066 linear.set_slope(1.0)
2070 offset = min(color_scores)
2071 maxvalue = max(color_scores)
2072 for pair
in list_of_pairs:
2074 pr.set_name(pair[2])
2075 factor = (pair[3]-offset)/(maxvalue-offset)
2080 rslin.add_restraint(pr)
2083 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
def get_number_of_unique_crosslinked_sites
Returns the number of non redundant cross-link sites.
def filter_score
Get all cross-links with score greater than an input value.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
Get the particles from a list of indexes.
def create_new_keyword
This function creates a new keyword for the whole database and set the values from and existing keywo...
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
def merge
This function merges two cross-link datasets so that if two conflicting cross-links have the same cro...
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded cross-...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def set_value
This function changes the value for a given key in the database For instance one can change the name ...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
def filter_out_same_residues
This function remove cross-links applied to the same residue (ie, same chain name and residue number)...
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def offset_residue_index
This function offset the residue indexes of a given protein by a specified value. ...
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
def jackknife
this method returns a CrossLinkDataBase class containing a random subsample of the original cross-lin...
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def check_cross_link_consistency
This function checks the consistency of the dataset with the amino acid sequence. ...
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
def get_values
this function returns the list of values for a given key in the database alphanumerically sorted ...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Associate an integer "state" index with a hierarchy node.
def append_database
Append cross-link dataset to this one.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
Mapping between FASTA one-letter codes and residue types.
A class to handle different styles of site pairs parsers.
def classify_crosslinks_by_score
This function creates as many classes as in the input (number_of_classes: integer) and partition cros...
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
this class handles a cross-link dataset and do filtering operations, adding cross-links, merge datasets...
Support for the RMF file format for storing hierarchical molecular data and markup.
def rename_proteins
This function renames all proteins contained in the input dictionary from the old names (keys) to the...
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.
def align_sequence
This function parses an alignment file obtained previously, to map crosslink residues onto the sequen...
def create_set_from_file
if FixedFormatParser is not specified, the file is comma-separated-values