1 """@namespace IMP.pmi.io.crosslink
2 Handles cross-link data sets.
4 Utilities are also provided to help in the analysis of models that
8 from __future__
import print_function
24 from collections
import defaultdict
29 def set_json_default(obj):
30 if isinstance(obj, set):
38 if sys.version_info[0] >= 3:
39 def _handle_string_input(inp):
40 if not isinstance(inp, str):
41 raise TypeError(
"expecting a string")
44 def _handle_string_input(inp):
45 if not isinstance(inp, (str, unicode)):
46 raise TypeError(
"expecting a string or unicode")
48 if isinstance(inp, unicode):
54 class _CrossLinkDataBaseStandardKeys(object):
56 This class setup all the standard keys needed to
57 identify the cross-link features from the data sets
61 self.protein1_key =
"Protein1"
62 self.type[self.protein1_key] = str
63 self.protein2_key =
"Protein2"
64 self.type[self.protein2_key] = str
65 self.residue1_key =
"Residue1"
66 self.type[self.residue1_key] = int
67 self.residue2_key =
"Residue2"
68 self.type[self.residue2_key] = int
69 self.residue1_amino_acid_key =
"Residue1AminoAcid"
70 self.type[self.residue1_amino_acid_key] = str
71 self.residue2_amino_acid_key =
"Residue2AminoAcid"
72 self.type[self.residue2_amino_acid_key] = str
73 self.residue1_moiety_key =
"Residue1Moiety"
74 self.type[self.residue1_moiety_key] = str
75 self.residue2_moiety_key =
"Residue2Moiety"
76 self.type[self.residue2_moiety_key] = str
77 self.site_pairs_key =
"SitePairs"
78 self.type[self.site_pairs_key] = str
79 self.unique_id_key =
"XLUniqueID"
80 self.type[self.unique_id_key] = str
81 self.unique_sub_index_key =
"XLUniqueSubIndex"
82 self.type[self.unique_sub_index_key] = int
83 self.unique_sub_id_key =
"XLUniqueSubID"
84 self.type[self.unique_sub_id_key] = str
85 self.data_set_name_key =
"DataSetName"
86 self.type[self.data_set_name_key] = str
87 self.cross_linker_chemical_key =
"CrossLinkerChemical"
88 self.type[self.cross_linker_chemical_key] = str
89 self.id_score_key =
"IDScore"
90 self.type[self.id_score_key] = float
92 self.type[self.fdr_key] = float
93 self.quantitation_key =
"Quantitation"
94 self.type[self.quantitation_key] = float
95 self.redundancy_key =
"Redundancy"
96 self.type[self.redundancy_key] = int
97 self.redundancy_list_key =
"RedundancyList"
98 self.type[self.redundancy_key] = list
99 self.ambiguity_key =
"Ambiguity"
100 self.type[self.ambiguity_key] = int
101 self.residue1_links_number_key =
"Residue1LinksNumber"
102 self.type[self.residue1_links_number_key] = int
103 self.residue2_links_number_key =
"Residue2LinksNumber"
104 self.type[self.residue2_links_number_key] = int
105 self.type[self.ambiguity_key] = int
106 self.state_key =
"State"
107 self.type[self.state_key] = int
108 self.sigma1_key =
"Sigma1"
109 self.type[self.sigma1_key] = str
110 self.sigma2_key =
"Sigma2"
111 self.type[self.sigma2_key] = str
113 self.type[self.psi_key] = str
114 self.distance_key =
"Distance"
115 self.type[self.distance_key] = float
116 self.min_ambiguous_distance_key =
"MinAmbiguousDistance"
117 self.type[self.distance_key] = float
119 self.link_type_key =
"LinkType"
120 self.type[self.link_type_key] = str
122 self.ordered_key_list = [
123 self.data_set_name_key, self.unique_id_key,
124 self.unique_sub_index_key, self.unique_sub_id_key,
125 self.protein1_key, self.protein2_key, self.residue1_key,
126 self.residue2_key, self.residue1_amino_acid_key,
127 self.residue2_amino_acid_key, self.residue1_moiety_key,
128 self.residue2_moiety_key, self.cross_linker_chemical_key,
129 self.id_score_key, self.fdr_key, self.quantitation_key,
130 self.redundancy_key, self.redundancy_list_key, self.state_key,
131 self.sigma1_key, self.sigma2_key, self.psi_key, self.distance_key,
132 self.min_ambiguous_distance_key, self.link_type_key]
135 class _ProteinsResiduesArray(tuple):
137 This class is inherits from tuple, and it is a shorthand for a cross-link
138 (p1,p2,r1,r2) or a monolink (p1,r1) where p1 and p2 are protein1
139 and protein2, r1 and r2 are residue1 and residue2.
142 def __new__(self, input_data):
144 @input_data can be a dict or a tuple
146 self.cldbsk = _CrossLinkDataBaseStandardKeys()
147 if type(input_data)
is dict:
149 p1 = input_data[self.cldbsk.protein1_key]
151 p2 = input_data[self.cldbsk.protein2_key]
154 r1 = input_data[self.cldbsk.residue1_key]
156 r2 = input_data[self.cldbsk.residue2_key]
162 t = (p1,
"", r1,
None)
163 elif type(input_data)
is tuple:
164 if len(input_data) > 4
or len(input_data) == 3 \
165 or len(input_data) == 1:
167 "_ProteinsResiduesArray: must have only 4 elements")
168 if len(input_data) == 4:
169 p1 = _handle_string_input(input_data[0])
170 p2 = _handle_string_input(input_data[1])
173 if (type(r1)
is not int)
and (r1
is not None):
175 "_ProteinsResiduesArray: residue1 must be a integer")
176 if (type(r2)
is not int)
and (r1
is not None):
178 "_ProteinsResiduesArray: residue2 must be a integer")
180 if len(input_data) == 2:
181 p1 = _handle_string_input(input_data[0])
183 if type(r1)
is not int:
185 "_ProteinsResiduesArray: residue1 must be a integer")
186 t = (p1,
"", r1,
None)
189 "_ProteinsResiduesArray: input must be a dict or tuple")
190 return tuple.__new__(_ProteinsResiduesArray, t)
192 def get_inverted(self):
194 Returns a _ProteinsResiduesArray instance with protein1 and
197 return _ProteinsResiduesArray((self[1], self[0], self[3], self[2]))
200 outstr = self.cldbsk.protein1_key +
" " + str(self[0])
201 outstr +=
" " + self.cldbsk.protein2_key +
" " + str(self[1])
202 outstr +=
" " + self.cldbsk.residue1_key +
" " + str(self[2])
203 outstr +=
" " + self.cldbsk.residue2_key +
" " + str(self[3])
207 outstr = str(self[0]) +
"." + str(self[2]) +
"-" + str(self[1]) \
214 This class allows to create filter functions that can be passed to
215 the CrossLinkDataBase in this way:
217 fo=FilterOperator(cldb.protein1_key,operator.eq,"AAA")|FilterOperator(cldb.protein2_key,operator.eq,"BBB")
219 where cldb is CrossLinkDataBase instance and it is only needed to get
220 the standard keywords
222 A filter operator can be evaluate on a CrossLinkDataBase item xl and
227 and it is used to filter the database
230 def __init__(self, argument1, operator, argument2):
232 (argument1,operator,argument2) can be either a
233 (keyword,operator.eq|lt|gt...,value)
234 or (FilterOperator1,operator.or|and...,FilterOperator2)
236 if isinstance(argument1, FilterOperator):
237 self.operations = [argument1, operator, argument2]
240 self.values = (argument1, operator, argument2)
242 def __or__(self, FilterOperator2):
245 def __and__(self, FilterOperator2):
248 def __invert__(self):
251 def evaluate(self, xl_item):
253 if len(self.operations) == 0:
254 keyword, operator, value = self.values
255 return operator(xl_item[keyword], value)
256 FilterOperator1, op, FilterOperator2 = self.operations
258 if FilterOperator2
is None:
259 return op(FilterOperator1.evaluate(xl_item))
261 return op(FilterOperator1.evaluate(xl_item),
262 FilterOperator2.evaluate(xl_item))
267 This class is needed to convert the keywords from a generic database
273 @param list_parser an instance of ResiduePairListParser, if any
277 self.backward_converter = {}
278 _CrossLinkDataBaseStandardKeys.__init__(self)
279 self.rplp = list_parser
280 if self.rplp
is None:
282 self.compulsory_keys = set([self.protein1_key,
287 self.compulsory_keys = self.rplp.get_compulsory_keys()
288 self.setup_keys = set()
292 Is a function that check whether necessary keys are setup
295 if self.compulsory_keys & setup_keys != self.compulsory_keys:
296 raise KeyError(
"CrossLinkDataBaseKeywordsConverter: must setup "
297 "all necessary keys")
301 Returns the keys that have been setup so far
303 return self.backward_converter.keys()
307 This sets up the standard compulsory keys as defined by
308 _CrossLinkDataBaseStandardKeys
310 for ck
in self.compulsory_keys:
311 self.converter[ck] = ck
312 self.backward_converter[ck] = ck
314 def set_unique_id_key(self, origin_key):
315 self.converter[origin_key] = self.unique_id_key
316 self.backward_converter[self.unique_id_key] = origin_key
318 def set_protein1_key(self, origin_key):
319 self.converter[origin_key] = self.protein1_key
320 self.backward_converter[self.protein1_key] = origin_key
322 def set_protein2_key(self, origin_key):
323 self.converter[origin_key] = self.protein2_key
324 self.backward_converter[self.protein2_key] = origin_key
326 def set_residue1_key(self, origin_key):
327 self.converter[origin_key] = self.residue1_key
328 self.backward_converter[self.residue1_key] = origin_key
330 def set_residue2_key(self, origin_key):
331 self.converter[origin_key] = self.residue2_key
332 self.backward_converter[self.residue2_key] = origin_key
334 def set_residue1_amino_acid_key(self, origin_key):
335 self.converter[origin_key] = self.residue1_amino_acid_key
336 self.backward_converter[self.residue1_amino_acid_key] = origin_key
338 def set_residue2_amino_acid_key(self, origin_key):
339 self.converter[origin_key] = self.residue2_amino_acid_key
340 self.backward_converter[self.residue2_amino_acid_key] = origin_key
342 def set_residue1_moiety_key(self, origin_key):
343 self.converter[origin_key] = self.residue1_moiety_key
344 self.backward_converter[self.residue1_moiety_key] = origin_key
346 def set_residue2_moiety_key(self, origin_key):
347 self.converter[origin_key] = self.residue2_moiety_key
348 self.backward_converter[self.residue2_moiety_key] = origin_key
350 def set_site_pairs_key(self, origin_key):
351 self.converter[origin_key] = self.site_pairs_key
352 self.backward_converter[self.site_pairs_key] = origin_key
354 def set_id_score_key(self, origin_key):
355 self.converter[origin_key] = self.id_score_key
356 self.backward_converter[self.id_score_key] = origin_key
358 def set_fdr_key(self, origin_key):
359 self.converter[origin_key] = self.fdr_key
360 self.backward_converter[self.fdr_key] = origin_key
362 def set_quantitation_key(self, origin_key):
363 self.converter[origin_key] = self.quantitation_key
364 self.backward_converter[self.quantitation_key] = origin_key
366 def set_psi_key(self, origin_key):
367 self.converter[origin_key] = self.psi_key
368 self.backward_converter[self.psi_key] = origin_key
370 def set_link_type_key(self, link_type_key):
371 self.converter[link_type_key] = self.link_type_key
372 self.backward_converter[self.link_type_key] = link_type_key
376 Returns the dictionary that convert the old keywords to the new ones
379 return self.converter
383 Returns the dictionary that convert the new keywords to the old ones
386 return self.backward_converter
391 A class to handle different styles of site pairs parsers.
393 MSSTUDIO: [Y3-S756;Y3-K759;K4-S756;K4-K759] for cross-links
394 [Y3-;K4-] for dead-ends
395 QUANTITATION: sp|P33298|PRS6B_YEAST:280:x:sp|P33298|PRS6B_YEAST:337
396 QUANTITATION (with ambiguity separator :|:): Fbw7:107:|:StrepII2x-Fbw7fl:408:x:Nedd8:48
397 LAN_HUANG: PROT1:C88-PROT2:C448 ambiguous separators | or ;
402 def __init__(self, style):
403 _CrossLinkDataBaseStandardKeys.__init__(self)
404 if style ==
"MSSTUDIO":
406 self.compulsory_keys = set([self.protein1_key,
408 self.site_pairs_key])
409 elif style ==
"XTRACT" or style ==
"QUANTITATION":
411 self.compulsory_keys = set([self.site_pairs_key])
412 elif style ==
"LAN_HUANG":
414 self.compulsory_keys = set([self.site_pairs_key])
416 raise Exception(
"ResiduePairListParser: unknown list parser style")
418 def get_compulsory_keys(self):
419 return self.compulsory_keys
423 This function returns a list of cross-linked residues and the
424 corresponding list of cross-linked chains. The latter list can be
425 empty, if the style doesn't have the corresponding information.
427 if self.style ==
"MSSTUDIO":
428 input_string = input_string.replace(
"[",
"")
429 input_string = input_string.replace(
"]",
"")
430 input_string_pairs = input_string.split(
";")
431 residue_pair_indexes = []
432 chain_pair_indexes = []
433 for s
in input_string_pairs:
435 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)'
436 r'(\d+)-(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)$',
439 r'^(A|C|D|E|F|G|H|I|K|L|M|N|O|P|Q|R|S|T|Y|X|W)(\d+)-$', s)
442 (residue_type_1, residue_index_1, residue_type_2,
443 residue_index_2) = m1.group(1, 2, 3, 4)
444 residue_pair_indexes.append((residue_index_1,
448 residue_type_1, residue_index_1 = m2.group(1, 2)
450 return residue_pair_indexes, chain_pair_indexes
451 if self.style ==
"XTRACT" or self.style ==
"QUANTITATION":
452 if ":x:" in input_string:
454 input_strings = input_string.split(
":x:")
455 first_peptides = input_strings[0].split(
":|:")
456 second_peptides = input_strings[1].split(
":|:")
457 first_peptides_identifiers = [
459 f.split(
":")[1])
for f
in first_peptides]
460 second_peptides_identifiers = [
462 f.split(
":")[1])
for f
in second_peptides]
463 residue_pair_indexes = []
464 chain_pair_indexes = []
465 for fpi
in first_peptides_identifiers:
466 for spi
in second_peptides_identifiers:
471 residue_pair_indexes.append((residue1, residue2))
472 chain_pair_indexes.append((chain1, chain2))
473 return residue_pair_indexes, chain_pair_indexes
476 first_peptides = input_string.split(
":|:")
477 first_peptides_identifiers = [
478 (f.split(
":")[0], f.split(
":")[1])
for f
in first_peptides]
481 for fpi
in first_peptides_identifiers:
484 residue_indexes.append((residue1,))
485 chain_indexes.append((chain1,))
486 return residue_indexes, chain_indexes
487 if self.style ==
"LAN_HUANG":
488 input_strings = input_string.split(
"-")
489 chain1, first_series = input_strings[0].split(
":")
490 chain2, second_series = input_strings[1].split(
":")
492 first_residues = first_series.replace(
";",
"|").split(
"|")
493 second_residues = second_series.replace(
";",
"|").split(
"|")
494 residue_pair_indexes = []
495 chain_pair_indexes = []
496 for fpi
in first_residues:
497 for spi
in second_residues:
498 residue1 = self.re.sub(
"[^0-9]",
"", fpi)
499 residue2 = self.re.sub(
"[^0-9]",
"", spi)
500 residue_pair_indexes.append((residue1, residue2))
501 chain_pair_indexes.append((chain1, chain2))
502 return residue_pair_indexes, chain_pair_indexes
507 A class to handle different XL format with fixed format
508 currently support ProXL
510 def __init__(self, format):
511 _CrossLinkDataBaseStandardKeys.__init__(self)
512 if format ==
"PROXL":
515 raise Exception(
"FixedFormatParser: unknown list format name")
517 def get_data(self, input_string):
518 if self.format ==
"PROXL":
519 tokens = input_string.split(
"\t")
521 if tokens[0] ==
"SEARCH ID(S)":
524 xl[self.protein1_key] = tokens[2]
525 xl[self.protein2_key] = tokens[4]
526 xl[self.residue1_key] = int(tokens[3])
527 xl[self.residue2_key] = int(tokens[5])
533 this class handles a cross-link dataset and do filtering
534 operations, adding cross-links, merge datasets...
537 def __init__(self, converter=None, data_base=None, fasta_seq=None,
541 @param converter an instance of CrossLinkDataBaseKeywordsConverter
542 @param data_base an instance of CrossLinkDataBase to build the
544 @param fasta_seq an instance of IMP.pmi.topology.Sequences containing
545 protein fasta sequences to check cross-link consistency.
546 If not given consistency will not be checked
547 @param linkable_aa a tuple containing one-letter amino acids which
548 are linkable by the cross-linker; only used if the database
549 DOES NOT provide a value for a certain residueX_amino_acid_key
550 and if a fasta_seq is given
553 if data_base
is None:
556 self.data_base = data_base
558 _CrossLinkDataBaseStandardKeys.__init__(self)
559 if converter
is not None:
560 self.cldbkc = converter
561 self.list_parser = self.cldbkc.rplp
562 self.converter = converter.get_converter()
566 self.list_parser =
None
567 self.converter =
None
570 self.def_aa_tuple = linkable_aa
571 self.fasta_seq = fasta_seq
578 Update the whole dataset after changes
580 self.update_cross_link_unique_sub_index()
581 self.update_cross_link_redundancy()
582 self.update_residues_links_number()
586 sorted_ids = sorted(self.data_base.keys())
588 for xl
in self.data_base[k]:
591 def xlid_iterator(self):
592 sorted_ids = sorted(self.data_base.keys())
593 for xlid
in sorted_ids:
596 def __getitem__(self, xlid):
597 return self.data_base[xlid]
600 return len([xl
for xl
in self])
605 def set_name(self, name):
607 for k
in self.data_base:
608 new_data_base[k +
"." + name] = self.data_base[k]
609 self.data_base = new_data_base
613 def get_number_of_xlid(self):
614 return len(self.data_base)
617 FixedFormatParser=
None, encoding=
None):
619 if FixedFormatParser is not specified, the file is
620 comma-separated-values
622 @param file_name a txt file to be parsed
623 @param converter an instance of CrossLinkDataBaseKeywordsConverter
624 @param FixedFormatParser a parser for a fixed format
625 @param encoding the encoding of the file, if not the locale
626 default (usually UTF-8)
628 if not FixedFormatParser:
629 xl_list = IMP.pmi.tools.get_db_from_csv(file_name, encoding)
631 if converter
is not None:
632 self.cldbkc = converter
633 self.list_parser = self.cldbkc.rplp
634 self.converter = converter.get_converter()
636 if not self.list_parser:
640 for nxl, xl
in enumerate(xl_list):
643 if k
in self.converter:
644 new_xl[self.converter[k]] = \
645 self.type[self.converter[k]](xl[k])
648 if self.unique_id_key
in self.cldbkc.get_setup_keys():
649 if new_xl[self.unique_id_key]
not in new_xl_dict:
650 new_xl_dict[new_xl[self.unique_id_key]] = [new_xl]
652 new_xl_dict[new_xl[self.unique_id_key]].append(
655 if str(nxl)
not in new_xl_dict:
656 new_xl_dict[str(nxl)] = [new_xl]
658 new_xl_dict[str(nxl)].append(new_xl)
664 for nxl, entry
in enumerate(xl_list):
668 if self.site_pairs_key
not in self.cldbkc.get_setup_keys():
670 "CrossLinkDataBase: expecting a site_pairs_key "
671 "for the site pair list parser")
673 if k
in self.converter:
674 new_dict[self.converter[k]] = \
675 self.type[self.converter[k]](entry[k])
677 new_dict[k] = entry[k]
680 chain_pair_list) = self.list_parser.get_list(
681 new_dict[self.site_pairs_key])
684 for n, p
in enumerate(residue_pair_list):
691 new_xl[k] = new_dict[k]
692 new_xl[self.residue1_key] = \
693 self.type[self.residue1_key](p[0])
695 new_xl[self.residue2_key] = \
696 self.type[self.residue2_key](p[1])
698 if len(chain_pair_list) == len(residue_pair_list):
699 new_xl[self.protein1_key] = \
700 self.type[self.protein1_key](
701 chain_pair_list[n][0])
703 new_xl[self.protein2_key] = \
704 self.type[self.protein2_key](
705 chain_pair_list[n][1])
708 new_xl[self.link_type_key] =
"CROSSLINK"
710 new_xl[self.link_type_key] =
"MONOLINK"
712 if self.unique_id_key
in self.cldbkc.get_setup_keys():
713 if new_xl[self.unique_id_key]
not in new_xl_dict:
714 new_xl_dict[new_xl[self.unique_id_key]] = \
717 new_xl_dict[new_xl[self.unique_id_key]].append(
720 if str(nxl)
not in new_xl_dict:
721 new_xl[self.unique_id_key] = str(nxl+1)
722 new_xl_dict[str(nxl)] = [new_xl]
724 new_xl[self.unique_id_key] = str(nxl+1)
725 new_xl_dict[str(nxl)].append(new_xl)
729 if FixedFormatParser is defined
731 if sys.version_info[0] == 2:
732 def open_with_encoding(fname, mode, encoding):
733 return open(fname, mode)
735 open_with_encoding = open
739 with open_with_encoding(file_name,
"r", encoding=encoding) as f:
741 xl = FixedFormatParser.get_data(line)
743 xl[self.unique_id_key] = str(nxl+1)
744 new_xl_dict[str(nxl)] = [xl]
747 self.data_base = new_xl_dict
748 self.name = file_name
749 loc = ihm.location.InputFileLocation(file_name, details=
'Crosslinks')
750 self.dataset = ihm.dataset.CXMSDataset(loc)
753 def update_cross_link_unique_sub_index(self):
754 for k
in self.data_base:
755 for n, xl
in enumerate(self.data_base[k]):
756 xl[self.ambiguity_key] = len(self.data_base[k])
757 xl[self.unique_sub_index_key] = n+1
758 xl[self.unique_sub_id_key] = k +
"." + str(n+1)
760 def update_cross_link_redundancy(self):
761 redundancy_data_base = {}
763 pra = _ProteinsResiduesArray(xl)
764 if pra
not in redundancy_data_base:
765 redundancy_data_base[pra] = [xl[self.unique_sub_id_key]]
766 redundancy_data_base[pra.get_inverted()] = \
767 [xl[self.unique_sub_id_key]]
769 redundancy_data_base[pra].append(xl[self.unique_sub_id_key])
770 redundancy_data_base[pra.get_inverted()].append(
771 xl[self.unique_sub_id_key])
773 pra = _ProteinsResiduesArray(xl)
774 xl[self.redundancy_key] = len(redundancy_data_base[pra])
775 xl[self.redundancy_list_key] = redundancy_data_base[pra]
777 def update_residues_links_number(self):
780 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
781 if (p1, r1)
not in residue_links:
782 residue_links[(p1, r1)] = set([(p2, r2)])
784 residue_links[(p1, r1)].add((p2, r2))
785 if (p2, r2)
not in residue_links:
786 residue_links[(p2, r2)] = set([(p1, r1)])
788 residue_links[(p2, r2)].add((p1, r1))
791 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
792 xl[self.residue1_links_number_key] = len(residue_links[(p1, r1)])
793 xl[self.residue2_links_number_key] = len(residue_links[(p2, r2)])
796 """This function checks the consistency of the dataset with the
797 amino acid sequence"""
798 if self.cldbkc
and self.fasta_seq:
799 cnt_matched, cnt_matched_file = 0, 0
803 (p1, p2, r1, r2) = _ProteinsResiduesArray(xl)
804 b_matched_file =
False
805 if self.residue1_amino_acid_key
in xl:
808 aa_from_file = (xl[self.residue1_amino_acid_key].upper(),)
809 b_matched = self._match_xlinks(p1, r1, aa_from_file)
810 b_matched_file = b_matched
814 b_matched = self._match_xlinks(p1, r1, self.def_aa_tuple)
816 matched, non_matched = self._update_matched_xlinks(
817 b_matched, p1, r1, matched, non_matched)
819 if self.residue2_amino_acid_key
in xl:
820 aa_from_file = (xl[self.residue2_amino_acid_key].upper(), )
821 b_matched = self._match_xlinks(p2, r2, aa_from_file)
822 b_matched_file = b_matched
824 b_matched = self._match_xlinks(p2, r2, self.def_aa_tuple)
826 matched, non_matched = self._update_matched_xlinks(
827 b_matched, p2, r2, matched, non_matched)
831 cnt_matched_file += 1
833 percentage_matched = round(100*cnt_matched/len(self), 1)
834 percentage_matched_file = round(
835 100 * cnt_matched_file / len(self), 1)
836 if matched
or non_matched:
838 "check_cross_link_consistency: Out of %d cross-links "
839 "%d were matched to the fasta sequence (%f %%).\n "
840 "%d were matched by using the cross-link file (%f %%)."
841 % (len(self), cnt_matched, percentage_matched,
842 cnt_matched_file, percentage_matched_file))
844 print(
"check_cross_link_consistency: Warning: Non "
846 [(prot_name, sorted(list(non_matched[prot_name])))
847 for prot_name
in non_matched])
848 return matched, non_matched
850 def _match_xlinks(self, prot_name, res_index, aa_tuple):
854 amino_dict = IMP.pmi.alphabets.amino_acid
856 for amino_acid
in aa_tuple:
857 if len(amino_acid) == 3:
858 amino_acid = amino_dict.get_one_letter_code_from_residue_type(
860 if prot_name
in self.fasta_seq.sequences:
861 seq = self.fasta_seq.sequences[prot_name]
869 if res_index == 0
or res_index == 1:
871 if res_index < len(seq):
872 if amino_acid == seq[res_index]:
876 def _update_matched_xlinks(self, b_matched, prot, res, matched,
880 matched[prot].add(res)
882 matched[prot] = set([res])
884 if prot
in non_matched:
885 non_matched[prot].add(res)
887 non_matched[prot] = set([res])
888 return matched, non_matched
890 def get_cross_link_string(self, xl):
892 for k
in self.ordered_key_list:
894 string += str(k) +
":" + str(xl[k]) +
"|"
899 if k
not in self.ordered_key_list:
900 string += str(k) +
":" + str(xl[k]) +
"|"
904 def get_short_cross_link_string(self, xl):
906 list_of_keys = [self.data_set_name_key,
907 self.unique_sub_id_key,
915 for k
in list_of_keys:
917 string += str(xl[k]) +
"|"
923 def filter(self, FilterOperator):
925 for id
in self.data_base.keys():
926 for xl
in self.data_base[id]:
927 if FilterOperator.evaluate(xl):
928 if id
not in new_xl_dict:
929 new_xl_dict[id] = [xl]
931 new_xl_dict[id].append(xl)
934 cdb.dataset = self.dataset
939 '''Get all cross-links with score greater than an input value'''
941 self.id_score_key, operator.gt, score)
942 return self.filter(FilterOperator)
944 def merge(self, CrossLinkDataBase1, CrossLinkDataBase2):
946 This function merges two cross-link datasets so that if two
947 conflicting cross-links have the same cross-link UniqueIDS,
948 the cross-links will be appended under the same UniqueID slots
949 with different SubIDs
951 raise NotImplementedError()
954 """Append cross-link dataset to this one."""
956 for k
in self.data_base:
957 new_data_base[k] = self.data_base[k]
958 for k
in db.data_base:
959 new_data_base[k] = db.data_base[k]
960 self.data_base = new_data_base
963 def __iadd__(self, db):
967 def set_value(self, key, new_value, filter_operator=None):
969 This function changes the value for a given key in the database
970 For instance one can change the name of a protein
971 @param key: the key in the database that must be changed
972 @param new_value: the new value of the key
973 @param filter_operator: optional FilterOperator to change the value to
974 a subset of the database
976 example: `cldb1.set_value(cldb1.protein1_key, 'FFF',
977 FO(cldb.protein1_key, operator.eq, "AAA"))`
981 if filter_operator
is not None:
982 if filter_operator.evaluate(xl):
990 this function returns the list of values for a given key in the
991 database alphanumerically sorted
996 return sorted(list(values))
1000 This function offset the residue indexes of a given protein by
1002 @param protein_name: the protein name that need to be changed
1003 @param offset: the offset value
1007 if xl[self.protein1_key] == protein_name:
1008 xl[self.residue1_key] = xl[self.residue1_key] + offset
1009 if xl[self.protein2_key] == protein_name:
1010 xl[self.residue2_key] = xl[self.residue2_key] + offset
1015 This function creates a new keyword for the whole database and
1016 set the values from and existing keyword (optional), otherwise the
1017 values are set to None
1018 @param keyword the new keyword name:
1019 @param values_from_keyword the keyword from which we are copying
1023 if values_from_keyword
is not None:
1024 xl[keyword] = xl[values_from_keyword]
1030 protein_to_rename=
"both"):
1032 This function renames all proteins contained in the input dictionary
1033 from the old names (keys) to the new name (values)
1034 @param old_to_new_names_dictionary dictionary for converting old
1036 @param protein_to_rename specify whether to rename both or protein1
1040 for old_name
in old_to_new_names_dictionary:
1041 new_name = old_to_new_names_dictionary[old_name]
1042 if protein_to_rename ==
"both" or protein_to_rename ==
"protein1":
1044 self.
set_value(self.protein1_key, new_name, fo2)
1045 if protein_to_rename ==
"both" or protein_to_rename ==
"protein2":
1047 self.
set_value(self.protein2_key, new_name, fo2)
1051 This function parses an alignment file obtained previously, to map
1052 crosslink residues onto the sequence of a homolog proteins. It is
1053 useful if you want to map the crosslinks onto a known, homolog
1054 structure without building a comparative model.
1055 The txt file of the alignment should be structured with repeating
1058 >NAME_CROSSLINKED_PROTEIN
1059 >NAME_HOMOLOG_PROTEIN_WITH_KNOWN_STRUCTURE
1060 one-letter code sequence for cross-linked protein (one line)
1061 one-letter code sequence for homolog protein with known structure (one line)
1064 f = open(alignfile,
'r')
1066 def _assign_residues(seq1, seq2):
1074 resinds1.append(nres)
1076 resinds1.append(
None)
1078 resinds2 = [
None for i
in resinds1]
1081 for pos, r2
in enumerate(seq2):
1084 resinds2[pos] = nres
1086 if contiguousgap > 1:
1087 for i
in range(pos-int(contiguousgap/2), pos):
1092 resinds2[pos] = lastresind
1094 for n, r
in enumerate(resinds1):
1102 with open(alignfile,
'r') as f:
1108 protname = aa.replace(
">",
"").replace(
"\n",
"")
1109 seq1 = cc.replace(
",",
"").replace(
"\n",
"")
1110 seq2 = dd.replace(
",",
"").replace(
"\n",
"")
1116 for i
in range(len(seq1)):
1128 tmpl1.append([i, c1, seq1[i]])
1129 tmpl2.append([i, c2, seq2[i]])
1130 sequences1[protname] = tmpl1
1131 sequences2[protname] = tmpl2
1133 d = _assign_residues(seq1, seq2)
1139 if fo11.evaluate(xl):
1140 t = [item
for item
in sequences1[protname]
1141 if item[1] == xl[self.residue1_key]][0]
1142 sequences1[protname][t[0]] = [
1143 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1144 t = [item
for item
in sequences2[protname]
1145 if item[1] == d[xl[self.residue1_key]]][0]
1146 sequences2[protname][t[0]] = [
1147 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1149 xl[self.residue1_key] = d[xl[self.residue1_key]]
1151 if fo12.evaluate(xl):
1153 t = [item
for item
in sequences1[protname]
1154 if item[1] == xl[self.residue2_key]][0]
1155 sequences1[protname][t[0]] = [
1156 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1157 t = [item
for item
in sequences2[protname]
1158 if item[1] == d[xl[self.residue2_key]]][0]
1159 sequences2[protname][t[0]] = [
1160 t[0], t[1],
'\033[91m' + t[2] +
'\033[0m']
1162 xl[self.residue2_key] = d[xl[self.residue2_key]]
1165 print(aa.replace(
"\n",
""))
1166 print(
"".join([seq[2]
for seq
in sequences1[protname]]))
1167 print(bb.replace(
"\n",
""))
1168 print(
"".join([seq[2]
for seq
in sequences2[protname]]))
1172 This function creates as many classes as in the input
1173 (number_of_classes: integer) and partition cross-links according
1174 to their identification scores. Classes are defined in the psi key.
1177 if self.id_score_key
is not None:
1181 'The cross-link database does not contain score values')
1182 minscore = min(scores)
1183 maxscore = max(scores)
1184 scoreclasses = numpy.linspace(minscore, maxscore, number_of_classes+1)
1185 if self.psi_key
is None:
1188 score = xl[self.id_score_key]
1189 for n, classmin
in enumerate(scoreclasses[0:-1]):
1190 if score >= classmin
and score <= scoreclasses[n+1]:
1191 xl[self.psi_key] = str(
"CLASS_"+str(n))
1194 def clone_protein(self, protein_name, new_protein_name):
1195 for id
in self.data_base.keys():
1197 for xl
in self.data_base[id]:
1198 new_data_base.append(xl)
1199 if xl[self.protein1_key] == protein_name \
1200 and xl[self.protein2_key] != protein_name:
1202 new_xl[self.protein1_key] = new_protein_name
1203 new_data_base.append(new_xl)
1204 elif xl[self.protein1_key] != protein_name \
1205 and xl[self.protein2_key] == protein_name:
1207 new_xl[self.protein2_key] = new_protein_name
1208 new_data_base.append(new_xl)
1209 elif xl[self.protein1_key] == protein_name \
1210 and xl[self.protein2_key] == protein_name:
1212 new_xl[self.protein1_key] = new_protein_name
1213 new_data_base.append(new_xl)
1215 new_xl[self.protein2_key] = new_protein_name
1216 new_data_base.append(new_xl)
1218 new_xl[self.protein1_key] = new_protein_name
1219 new_xl[self.protein2_key] = new_protein_name
1220 new_data_base.append(new_xl)
1221 self.data_base[id] = new_data_base
1226 This function remove cross-links applied to the same residue
1227 (ie, same chain name and residue number)
1229 for id
in self.data_base.keys():
1231 for xl
in self.data_base[id]:
1232 if xl[self.protein1_key] == xl[self.protein2_key] \
1233 and xl[self.residue1_key] == xl[self.residue2_key]:
1236 new_data_base.append(xl)
1237 self.data_base[id] = new_data_base
1242 this method returns a CrossLinkDataBase class containing
1243 a random subsample of the original cross-link database.
1244 @param percentage float between 0 and 1, is the percentage of
1245 of spectra taken from the original list
1248 if percentage > 1.0
or percentage < 0.0:
1249 raise ValueError(
'the percentage of random cross-link spectra '
1250 'should be between 0 and 1')
1251 nspectra = self.get_number_of_xlid()
1252 nrandom_spectra = int(nspectra*percentage)
1253 random_keys = random.sample(sorted(self.data_base.keys()),
1256 for k
in random_keys:
1257 new_data_base[k] = self.data_base[k]
1262 sorted_ids = sorted(self.data_base.keys())
1264 for id
in sorted_ids:
1266 for xl
in self.data_base[id]:
1267 for k
in self.ordered_key_list:
1269 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1274 if k
not in self.ordered_key_list:
1275 outstr +=
"--- " + str(k) +
" " + str(xl[k]) +
"\n"
1276 outstr +=
"-------------\n"
1279 def plot(self, filename, **kwargs):
1281 matplotlib.use(
'Agg')
1282 import matplotlib.pyplot
as plt
1283 import matplotlib.colors
1285 if kwargs[
"type"] ==
"scatter":
1286 cmap = plt.get_cmap(
"rainbow")
1287 _ = matplotlib.colors.Normalize(vmin=0.0, vmax=1.0)
1288 xkey = kwargs[
"xkey"]
1289 ykey = kwargs[
"ykey"]
1290 if "colorkey" in kwargs:
1291 colorkey = kwargs[
"colorkey"]
1292 if "logyscale" in kwargs:
1293 logyscale = kwargs[
"logyscale"]
1301 xs.append(float(xl[xkey]))
1303 ys.append(math.log10(float(xl[ykey])))
1305 ys.append(float(xl[ykey]))
1306 colors.append(float(xl[colorkey]))
1308 print(
"CrossLinkDataBase.plot: Value error for "
1309 "cross-link %s" % (xl[self.unique_id_key]))
1313 for color
in colors:
1314 cindex = (color-min(colors))/(max(colors)-min(colors))
1315 cs.append(cmap(cindex))
1318 ax = fig.add_subplot(111)
1319 ax.scatter(xs, ys, s=50.0, c=cs, alpha=0.8, marker=
"o")
1322 plt.savefig(filename)
1326 if kwargs[
"type"] ==
"residue_links":
1330 molecule = kwargs[
"molecule"]
1332 length = len(molecule.sequence)
1333 molecule = molecule.get_name()
1336 sequences_object = kwargs[
"sequences_object"]
1337 sequence = sequences_object.sequences[molecule]
1338 length = len(sequence)
1340 histogram = [0]*length
1342 if xl[self.protein1_key] == molecule:
1343 histogram[xl[self.residue1_key]-1] = \
1344 xl[self.residue1_links_number_key]
1345 if xl[self.protein2_key] == molecule:
1346 histogram[xl[self.residue2_key]-1] = \
1347 xl[self.residue2_links_number_key]
1348 plt.bar(range(1, length+1), histogram)
1349 plt.savefig(filename)
1353 if kwargs[
"type"] ==
"histogram":
1354 valuekey = kwargs[
"valuekey"]
1355 valuename = valuekey
1356 bins = kwargs[
"bins"]
1360 values_list.append(float(xl[valuekey]))
1362 print(
"CrossLinkDataBase.plot: Value error for cross-link "
1363 "%s" % (xl[self.unique_id_key]))
1366 filename, [values_list], valuename=valuename, bins=bins,
1367 colors=
None, format=
"pdf",
1368 reference_xline=
None, yplotrange=
None,
1369 xplotrange=
None, normalized=
True,
1372 def dump(self, json_filename):
1374 with open(json_filename,
'w')
as fp:
1375 json.dump(self.data_base, fp, sort_keys=
True, indent=2,
1376 default=set_json_default)
1378 def load(self, json_filename):
1380 with open(json_filename,
'r') as fp:
1381 self.data_base = json.load(fp)
1385 if sys.version_info[0] < 3:
1387 for k, v
in xl.iteritems():
1388 if type(k)
is unicode:
1390 if type(v)
is unicode:
1394 def save_csv(self, filename):
1399 sorted_group_ids = sorted(self.data_base.keys())
1400 for group
in sorted_group_ids:
1402 for xl
in self.data_base[group]:
1404 sorted_ids = sorted(xl.keys())
1405 values = [xl[k]
for k
in sorted_ids]
1406 group_block.append(values)
1409 with open(filename,
'w')
as fp:
1410 a = csv.writer(fp, delimiter=
',')
1411 a.writerow(sorted_ids)
1416 Returns the number of non redundant cross-link sites
1420 pra = _ProteinsResiduesArray(xl)
1421 prai = pra.get_inverted()
1424 return len(list(praset))
1428 """This class allows to compute and plot the distance between datasets"""
1431 """Input a dictionary where keys are cldb names and values are cldbs"""
1432 import scipy.spatial.distance
1433 self.dbs = cldb_dictionary
1434 self.keylist = list(self.dbs.keys())
1435 self.distances = list()
1437 for i, key1
in enumerate(self.keylist):
1438 for key2
in self.keylist[i+1:]:
1439 distance = self.get_distance(key1, key2)
1440 self.distances.append(distance)
1442 self.distances = scipy.spatial.distance.squareform(self.distances)
1445 return 1.0 - self.
jaccard_index(self.dbs[key1], self.dbs[key2])
1448 """Similarity index between two datasets
1449 https://en.wikipedia.org/wiki/Jaccard_index"""
1453 for xl1
in CrossLinkDataBase1:
1454 a1f = _ProteinsResiduesArray(xl1)
1455 a1b = a1f.get_inverted()
1458 for xl2
in CrossLinkDataBase2:
1459 a2f = _ProteinsResiduesArray(xl2)
1460 a2b = a2f.get_inverted()
1463 return (float(len(set1 & set2)/2)
1464 / (len(set1)/2+len(set2)/2-len(set1 & set2)/2))
1466 def plot_matrix(self, figurename="clustermatrix.pdf"):
1467 import matplotlib
as mpl
1470 import matplotlib.pylab
as pl
1471 from scipy.cluster
import hierarchy
as hrc
1473 raw_distance_matrix = self.distances
1474 labels = self.keylist
1478 ax = fig.add_subplot(1, 1, 1)
1479 dendrogram = hrc.dendrogram(
1480 hrc.linkage(raw_distance_matrix),
1483 leaves_order = dendrogram[
'leaves']
1484 ax.set_xlabel(
'Dataset')
1485 ax.set_ylabel(
'Jaccard Distance')
1487 pl.savefig(
"dendrogram."+figurename, dpi=300)
1492 ax = fig.add_subplot(1, 1, 1)
1493 cax = ax.imshow(raw_distance_matrix[leaves_order, :][:, leaves_order],
1494 interpolation=
'nearest')
1495 cb = fig.colorbar(cax)
1496 cb.set_label(
'Jaccard Distance')
1497 ax.set_xlabel(
'Dataset')
1498 ax.set_ylabel(
'Dataset')
1499 ax.set_xticks(range(len(labels)))
1500 ax.set_xticklabels(numpy.array(labels)[leaves_order],
1501 rotation=
'vertical')
1502 ax.set_yticks(range(len(labels)))
1503 ax.set_yticklabels(numpy.array(labels)[leaves_order],
1504 rotation=
'horizontal')
1506 pl.savefig(
"matrix." + figurename, dpi=300)
1512 This class maps a CrossLinkDataBase on a given structure
1513 and save an rmf file with color-coded cross-links
1516 def __init__(self, CrossLinkDataBase, rmf_or_stat_handler):
1517 self.CrossLinkDataBase = CrossLinkDataBase
1520 self.prots = rmf_or_stat_handler
1521 self.distances = defaultdict(list)
1522 self.array_to_id = {}
1523 self.id_to_array = {}
1525 print(
"computing distances fro all cross-links and all structures")
1526 for i
in self.prots[::10]:
1527 self.compute_distances()
1528 for key, xl
in enumerate(self.CrossLinkDataBase):
1529 array = _ProteinsResiduesArray(xl)
1530 self.array_to_id[array] = key
1531 self.id_to_array[key] = array
1532 if xl[
"MinAmbiguousDistance"] !=
'None':
1533 self.distances[key].append(xl[
"MinAmbiguousDistance"])
1535 def compute_distances(self):
1538 sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1539 for group
in sorted_group_ids:
1541 for xl
in self.CrossLinkDataBase.data_base[group]:
1543 sorted_ids = sorted(xl.keys())
1546 + [
"UniqueID",
"Distance",
"MinAmbiguousDistance"])
1547 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1549 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1550 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1560 values = [xl[k]
for k
in sorted_ids]
1561 values += [group, mdist]
1562 except KeyError
as e:
1563 print(
"MapCrossLinkDataBaseOnStructure "
1564 "KeyError: {0} in {1}".format(e, xl))
1565 group_dists.append(mdist)
1566 xl[
"Distance"] = mdist
1567 xl[
"State1"] = state1
1569 xl[
"State2"] = state2
1572 for xl
in self.CrossLinkDataBase.data_base[group]:
1573 xl[
"MinAmbiguousDistance"] = min(group_dists)
1575 def _get_distance_and_particle_pair(self, r1, c1, r2, c2):
1576 '''more robust and slower version of above'''
1579 selpart_1 = sel.get_selected_particles()
1580 if len(selpart_1) == 0:
1581 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1582 "selected for first site")
1586 selpart_2 = sel.get_selected_particles()
1587 if len(selpart_2) == 0:
1588 print(
"MapCrossLinkDataBaseOnStructure: Warning: no particle "
1589 "selected for second site")
1592 for p1
in selpart_1:
1593 for p2
in selpart_2:
1594 if p1 == p2
and r1 == r2:
1602 h1 = h1.get_parent()
1605 h1 = h1.get_parent()
1609 h2 = h2.get_parent()
1612 h2 = h2.get_parent()
1615 results.append((dist, state_index1, copy_index1,
1616 state_index2, copy_index2, p1, p2))
1617 if len(results) == 0:
1619 results_sorted = sorted(results,
1620 key=operator.itemgetter(0, 1, 2, 3, 4))
1621 return ((results_sorted[0][0],
1622 results_sorted[0][5],
1623 results_sorted[0][6]),
1624 (results_sorted[0][1],
1625 results_sorted[0][2],
1626 results_sorted[0][3],
1627 results_sorted[0][4]))
1629 def save_rmf_snapshot(self, filename, color_id=None):
1630 sorted_group_ids = sorted(self.CrossLinkDataBase.data_base.keys())
1633 for group
in sorted_group_ids:
1634 group_dists_particles = []
1635 for xl
in self.CrossLinkDataBase.data_base[group]:
1637 self.CrossLinkDataBase.get_short_cross_link_string(xl)
1638 c1, c2, r1, r2 = _ProteinsResiduesArray(xl)
1640 (mdist, p1, p2), (state1, copy1, state2, copy2) = \
1641 self._get_distance_and_particle_pair(r1, c1, r2, c2)
1643 print(
"TypeError or missing chain/residue ",
1646 if color_id
is not None:
1647 group_dists_particles.append((mdist, p1, p2, xllabel,
1648 float(xl[color_id])))
1650 group_dists_particles.append((mdist, p1, p2, xllabel, 0.0))
1651 if group_dists_particles:
1652 (minmdist, minp1, minp2, minxllabel, mincolor_score) = \
1653 min(group_dists_particles, key=
lambda t: t[0])
1654 color_scores.append(mincolor_score)
1655 list_of_pairs.append((minp1, minp2, minxllabel,
1661 linear.set_slope(1.0)
1664 'linear_dummy_restraints')
1666 offset = min(color_scores)
1667 maxvalue = max(color_scores)
1668 for pair
in list_of_pairs:
1671 pr.set_name(pair[2])
1672 if offset < maxvalue:
1673 factor = (pair[3]-offset)/(maxvalue-offset)
1680 rslin.add_restraint(pr)
1683 rh = RMF.create_rmf_file(filename)
1690 def boxplot_crosslink_distances(self, filename):
1692 keys = [self.id_to_array[k]
for k
in self.distances.keys()]
1693 medians = [numpy.median(self.distances[self.array_to_id[k]])
1695 dists = [self.distances[self.array_to_id[k]]
for k
in keys]
1696 distances_sorted_by_median = \
1697 [x
for _, x
in sorted(zip(medians, dists))]
1698 keys_sorted_by_median = [x
for _, x
in sorted(zip(medians, keys))]
1700 filename, distances_sorted_by_median,
1701 range(len(distances_sorted_by_median)),
1702 xlabels=keys_sorted_by_median,
1703 scale_plot_length=0.2)
1705 def get_crosslink_violations(self, threshold):
1706 violations = defaultdict(list)
1707 for k
in self.distances:
1709 viols = self.distances[k]
1710 violations[self.id_to_array[k]] = viols
1716 This class generates a CrossLinkDataBase from a given structure
1718 def __init__(self, system, residue_types_1=["K"],
1719 residue_types_2=[
"K"], reactivity_range=[0, 1], kt=1.0):
1723 cldbkc.set_protein1_key(
"Protein1")
1724 cldbkc.set_protein2_key(
"Protein2")
1725 cldbkc.set_residue1_key(
"Residue1")
1726 cldbkc.set_residue2_key(
"Residue2")
1728 self.system = system
1729 self.model = self.system.model
1730 self.residue_types_1 = residue_types_1
1731 self.residue_types_2 = residue_types_2
1733 self.indexes_dict1 = {}
1734 self.indexes_dict2 = {}
1735 self.protein_residue_dict = {}
1736 self.reactivity_dictionary = {}
1737 self.euclidean_interacting_pairs =
None
1738 self.xwalk_interacting_pairs =
None
1740 for state
in self.system.get_states():
1741 for moleculename, molecules
in state.get_molecules().items():
1742 for molecule
in molecules:
1745 seq = molecule.sequence
1746 residues = [i
for i
in range(1, len(seq)+1)
1747 if ((seq[i-1]
in self.residue_types_1)
1748 or (seq[i-1]
in self.residue_types_2))]
1754 rexp = numpy.random.exponential(0.00000001)
1755 prob = 1.0-math.exp(-rexp)
1756 self.reactivity_dictionary[(molecule, r)] = prob
1758 residues1 = [i
for i
in range(1, len(seq)+1)
1759 if seq[i-1]
in self.residue_types_1]
1760 residues2 = [i
for i
in range(1, len(seq)+1)
1761 if seq[i-1]
in self.residue_types_2]
1764 molecule.hier, residue_index=r, resolution=1)
1765 ps = s.get_selected_particles()
1767 index = p.get_index()
1768 self.indexes_dict1[index] = (molecule, r)
1769 self.protein_residue_dict[(molecule, r)] = index
1772 molecule.hier, residue_index=r, resolution=1)
1773 ps = s.get_selected_particles()
1775 index = p.get_index()
1776 self.indexes_dict2[index] = (molecule, r)
1777 self.protein_residue_dict[(molecule, r)] = index
1779 def get_all_possible_pairs(self):
1780 n = float(len(self.protein_residue_dict.keys()))
1781 return n*(n-1.0)/2.0
1783 def get_all_feasible_pairs(self, distance=21):
1785 particle_index_pairs = []
1787 for a, b
in itertools.combinations(
1788 self.protein_residue_dict.keys(), 2):
1790 index1 = self.protein_residue_dict[a]
1791 index2 = self.protein_residue_dict[b]
1795 if particle_distance <= distance:
1796 particle_index_pairs.append((index1, index2))
1797 new_xl[self.cldb.protein1_key] = a[0].get_name()
1798 new_xl[self.cldb.protein2_key] = b[0].get_name()
1799 new_xl[
"molecule_object1"] = a[0]
1800 new_xl[
"molecule_object2"] = b[0]
1801 new_xl[self.cldb.residue1_key] = a[1]
1802 new_xl[self.cldb.residue2_key] = b[1]
1803 self.cldb.data_base[str(nxl)] = [new_xl]
1808 def get_data_base(self, total_number_of_spectra, ambiguity_probability=0.1,
1809 noise=0.01, distance=21, max_delta_distance=10.0,
1810 xwalk_bin_path=
None, confidence_false=0.75,
1811 confidence_true=0.75):
1813 from random
import random
1815 number_of_spectra = 1
1817 self.beta_true = -1.4427*math.log(0.5*(1.0-confidence_true))
1818 self.beta_false = -1.4427*math.log(0.5*(1.0-confidence_false))
1819 self.cldb.data_base[str(number_of_spectra)] = []
1820 self.sites_weighted =
None
1822 while number_of_spectra < total_number_of_spectra:
1823 if (random() > ambiguity_probability
1824 and len(self.cldb.data_base[str(number_of_spectra)]) != 0):
1826 number_of_spectra += 1
1827 self.cldb.data_base[str(number_of_spectra)] = []
1829 if random() > noise:
1831 pra, dist = self.get_random_residue_pair(
1832 distance, xwalk_bin_path, max_delta_distance)
1835 pra, dist = self.get_random_residue_pair(
None,
None,
None)
1839 new_xl[self.cldb.protein1_key] = pra[0].get_name()
1840 new_xl[self.cldb.protein2_key] = pra[1].get_name()
1841 new_xl[
"molecule_object1"] = pra[0]
1842 new_xl[
"molecule_object2"] = pra[1]
1843 new_xl[self.cldb.residue1_key] = pra[2]
1844 new_xl[self.cldb.residue2_key] = pra[3]
1845 new_xl[
"Noisy"] = noisy
1847 new_xl[
"Reactivity_Residue1"] = \
1848 self.reactivity_dictionary[(pra[0], pra[2])]
1849 new_xl[
"Reactivity_Residue2"] = \
1850 self.reactivity_dictionary[(pra[1], pra[3])]
1852 new_xl[
"Score"] = np.random.beta(1.0, self.beta_false)
1854 new_xl[
"Score"] = 1.0-np.random.beta(1.0, self.beta_true)
1855 new_xl[
"TargetDistance"] = dist
1856 new_xl[
"NoiseProbability"] = noise
1857 new_xl[
"AmbiguityProbability"] = ambiguity_probability
1860 self.model, [self.protein_residue_dict[(pra[0], pra[2])],
1861 self.protein_residue_dict[(pra[1], pra[3])]])
1866 new_xl[
"InterRigidBody"] =
False
1871 new_xl[
"InterRigidBody"] =
True
1873 new_xl[
"InterRigidBody"] =
None
1875 self.cldb.data_base[str(number_of_spectra)].append(new_xl)
1879 def get_random_residue_pair(self, distance=None, xwalk_bin_path=None,
1880 max_delta_distance=
None):
1883 from random
import choice
1884 if distance
is None:
1887 protein1, residue1 = choice(self.protein_residue_dict.keys())
1888 protein2, residue2 = choice(self.protein_residue_dict.keys())
1889 index1 = self.protein_residue_dict[(protein1, residue1)]
1890 index2 = self.protein_residue_dict[(protein2, residue2)]
1894 if (protein1, residue1) != (protein2, residue2):
1898 if not xwalk_bin_path:
1900 gcpf.set_distance(distance+max_delta_distance)
1904 if not self.sites_weighted:
1905 self.sites_weighted = []
1906 for key
in self.reactivity_dictionary:
1907 r = self.reactivity_dictionary[key]
1908 self.sites_weighted.append((key, r))
1910 first_site = self.weighted_choice(self.sites_weighted)
1912 if not self.euclidean_interacting_pairs:
1913 self.euclidean_interacting_pairs = \
1914 gcpf.get_close_pairs(
1916 self.indexes_dict1.keys(),
1917 self.indexes_dict2.keys())
1919 first_site_pairs = \
1920 [pair
for pair
in self.euclidean_interacting_pairs
1921 if self.indexes_dict1[pair[0]] == first_site
or
1922 self.indexes_dict2[pair[1]] == first_site]
1923 if len(first_site_pairs) == 0:
1926 second_sites_weighted = []
1927 for pair
in first_site_pairs:
1928 if self.indexes_dict1[pair[0]] == first_site:
1929 second_site = self.indexes_dict2[pair[1]]
1930 if self.indexes_dict2[pair[1]] == first_site:
1931 second_site = self.indexes_dict1[pair[0]]
1932 r = self.reactivity_dictionary[second_site]
1933 second_sites_weighted.append((second_site, r))
1934 second_site = self.weighted_choice(second_sites_weighted)
1935 protein1, residue1 = first_site
1936 protein2, residue2 = second_site
1937 print(
"CrossLinkDataBaseFromStructure."
1938 "get_random_residue_pair:",
1939 "First site", first_site,
1940 self.reactivity_dictionary[first_site],
1941 "Second site", second_site,
1942 self.reactivity_dictionary[second_site])
1945 [self.protein_residue_dict[first_site],
1946 self.protein_residue_dict[second_site]])
1951 if particle_distance < distance \
1952 and (protein1, residue1) != (protein2, residue2):
1954 elif (particle_distance >= distance
1955 and (protein1, residue1) != (protein2, residue2)
1956 and max_delta_distance):
1958 if particle_distance-distance < max_delta_distance:
1962 if not self.xwalk_interacting_pairs:
1963 self.xwalk_interacting_pairs = \
1964 self.get_xwalk_distances(xwalk_bin_path, distance)
1965 interacting_pairs_weighted = []
1967 for pair
in self.xwalk_interacting_pairs:
1973 -self.reactivity_dictionary[(protein1, residue1)]
1976 -self.reactivity_dictionary[(protein2, residue2)]
1978 interacting_pairs_weighted.append((pair, weight1*weight2))
1980 pair = self.weighted_choice(interacting_pairs_weighted)
1985 particle_distance = float(pair[4])
1987 return ((protein1, protein2, residue1, residue2)), particle_distance
1989 def get_xwalk_distances(self, xwalk_bin_path, distance):
1993 o.init_pdb(
"xwalk.pdb", self.representation.prot)
1994 o.write_pdb(
"xwalk.pdb")
1995 namechainiddict = o.dictchain[
"xwalk.pdb"]
1998 for key
in namechainiddict:
1999 chainiddict[namechainiddict[key]] = key
2000 xwalkout = os.popen(
'java -Xmx256m -cp ' + xwalk_bin_path
2001 +
' Xwalk -infile xwalk.pdb -aa1 lys -aa2 lys '
2002 '-a1 cb -a2 cb -max '
2003 + str(distance) +
' -bb').read()
2005 output_list_of_distance = []
2006 for line
in xwalkout.split(
"\n")[0:-2]:
2007 tokens = line.split()
2010 distance = float(tokens[6])
2011 fs = first.split(
"-")
2012 ss = second.split(
"-")
2015 protein1 = chainiddict[chainid1]
2016 protein2 = chainiddict[chainid2]
2017 residue1 = int(fs[1])
2018 residue2 = int(ss[1])
2019 output_list_of_distance.append(
2020 (protein1, protein2, residue1, residue2, distance))
2021 return output_list_of_distance
2023 def weighted_choice(self, choices):
2026 total = sum(w
for c, w
in choices)
2027 r = random.uniform(0, total)
2029 for c, w
in choices:
2033 assert False,
"Shouldn't get here"
2035 def save_rmf_snapshot(self, filename, color_id=None):
2038 if color_id
is None:
2039 color_id =
"Reactivity"
2040 sorted_group_ids = sorted(self.cldb.data_base.keys())
2043 for group
in sorted_group_ids:
2044 group_dists_particles = []
2045 for xl
in self.cldb.data_base[group]:
2046 xllabel = self.cldb.get_short_cross_link_string(xl)
2047 (c1, c2, r1, r2) = (xl[
"molecule_object1"],
2048 xl[
"molecule_object2"],
2049 xl[self.cldb.residue1_key],
2050 xl[self.cldb.residue2_key])
2052 index1 = self.protein_residue_dict[(c1, r1)]
2053 index2 = self.protein_residue_dict[(c2, r2)]
2056 mdist = xl[
"TargetDistance"]
2058 print(
"TypeError or missing chain/residue ",
2061 group_dists_particles.append((mdist, p1, p2, xllabel,
2062 float(xl[color_id])))
2063 if group_dists_particles:
2064 (minmdist, minp1, minp2, minxllabel, mincolor_score) \
2065 = min(group_dists_particles, key=
lambda t: t[0])
2066 color_scores.append(mincolor_score)
2067 list_of_pairs.append((minp1, minp2, minxllabel,
2074 linear.set_slope(1.0)
2078 offset = min(color_scores)
2079 maxvalue = max(color_scores)
2080 for pair
in list_of_pairs:
2082 pr.set_name(pair[2])
2083 factor = (pair[3]-offset)/(maxvalue-offset)
2088 rslin.add_restraint(pr)
2091 rh = RMF.create_rmf_file(filename)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
static bool get_is_setup(const IMP::ParticleAdaptor &p)
def __init__
Input a dictionary where keys are cldb names and values are cldbs.
def get_number_of_unique_crosslinked_sites
Returns the number of non redundant cross-link sites.
def filter_score
Get all cross-links with score greater than an input value.
RMF::FrameID save_frame(RMF::FileHandle file, std::string name="")
Save the current state of the linked objects as a new RMF frame.
def plot_field_histogram
Plot a list of histograms from a value list.
def plot_fields_box_plots
Plot time series as boxplots.
Color get_rgb_color(double f)
Return the color for f from the RGB color map.
This class generates a CrossLinkDataBase from a given structure.
ParticlesTemp get_particles(Model *m, const ParticleIndexes &ps)
Get the particles from a list of indexes.
def create_new_keyword
This function creates a new keyword for the whole database and set the values from and existing keywo...
static bool get_is_setup(const IMP::ParticleAdaptor &p)
double get_distance(XYZR a, XYZR b)
Compute the sphere distance between a and b.
def jaccard_index
Similarity index between two datasets https://en.wikipedia.org/wiki/Jaccard_index.
def merge
This function merges two cross-link datasets so that if two conflicting cross-links have the same cro...
This class is needed to convert the keywords from a generic database to the standard ones...
Object used to hold a set of restraints.
Stores a named protein chain.
def get_backward_converter
Returns the dictionary that convert the new keywords to the old ones.
This class maps a CrossLinkDataBase on a given structure and save an rmf file with color-coded cross-...
A decorator for keeping track of copies of a molecule.
The standard decorator for manipulating molecular structures.
This class allows to create filter functions that can be passed to the CrossLinkDataBase in this way:...
def set_value
This function changes the value for a given key in the database For instance one can change the name ...
This class allows to compute and plot the distance between datasets.
def get_setup_keys
Returns the keys that have been setup so far.
def get_converter
Returns the dictionary that convert the old keywords to the new ones.
def filter_out_same_residues
This function remove cross-links applied to the same residue (ie, same chain name and residue number)...
A decorator for a particle with x,y,z coordinates.
def set_standard_keys
This sets up the standard compulsory keys as defined by _CrossLinkDataBaseStandardKeys.
void add_hierarchies(RMF::NodeHandle fh, const atom::Hierarchies &hs)
def offset_residue_index
This function offset the residue indexes of a given protein by a specified value. ...
Class for easy writing of PDBs, RMFs, and stat files.
int get_state_index(Hierarchy h)
Walk up the hierarchy to find the current state.
def check_keys
Is a function that check whether necessary keys are setup.
void add_geometries(RMF::NodeHandle parent, const display::GeometriesTemp &r)
Add geometries to a given parent node.
def jackknife
this method returns a CrossLinkDataBase class containing a random subsample of the original cross-lin...
void add_restraints(RMF::NodeHandle fh, const Restraints &hs)
def check_cross_link_consistency
This function checks the consistency of the dataset with the amino acid sequence. ...
Find all nearby pairs by testing all pairs.
Classes for writing output files and processing them.
Simple implementation of segments in 3D.
def __init__
(argument1,operator,argument2) can be either a (keyword,operator.eq|lt|gt...,value) or (FilterOperato...
def get_list
This function returns a list of cross-linked residues and the corresponding list of cross-linked chai...
def get_values
this function returns the list of values for a given key in the database alphanumerically sorted ...
Basic functionality that is expected to be used by a wide variety of IMP users.
General purpose algebraic and geometric methods that are expected to be used by a wide variety of IMP...
The general base class for IMP exceptions.
Associate an integer "state" index with a hierarchy node.
def append_database
Append cross-link dataset to this one.
class to link stat files to several rmf files
class to allow more advanced handling of RMF files.
Mapping between FASTA one-letter codes and residue types.
A class to handle different styles of site pairs parsers.
def classify_crosslinks_by_score
This function creates as many classes as in the input (number_of_classes: integer) and partition cros...
int get_copy_index(Hierarchy h)
Walk up the hierarchy to find the current copy index.
Applies a PairScore to a Pair.
Python classes to represent, score, sample and analyze models.
Output IMP model data in various file formats.
Functionality for loading, creating, manipulating and scoring atomic structures.
Select hierarchy particles identified by the biological name.
this class handles a cross-link dataset and do filtering operations, adding cross-links, merge datasets...
Support for the RMF file format for storing hierarchical molecular data and markup.
def rename_proteins
This function renames all proteins contained in the input dictionary from the old names (keys) to the...
double get_distance(const Line3D &s, const Vector3D &p)
Get closest distance between a line and a point.
def align_sequence
This function parses an alignment file obtained previously, to map crosslink residues onto the sequen...
def create_set_from_file
if FixedFormatParser is not specified, the file is comma-separated-values