IMP logo
IMP Reference Guide  develop.50fdd7fa33,2025/09/05
The Integrative Modeling Platform
evaluate_output_database.py
1 import pandas as pd
2 import numpy as np
3 from . import tools
4 import sys
5 import IMP
6 import IMP.pmi
7 import IMP.pmi.topology
8 import os
9 import math
10 
11 
def read_sequences(sequence_file):
    """Load a sequence file and index its entries by chain identifier.

    The file is parsed with ``IMP.pmi.topology.Sequences``. For each entry
    name, the text after the last ``|`` is taken as the chain field. If
    that field contains the word "Chains", its second whitespace-separated
    token is split on commas to obtain the chain IDs; otherwise every
    whitespace-separated token of the field is used as a chain ID.

    Returns a dict mapping each chain ID to the sequence of the entry it
    was found in. A chain ID seen in several entries keeps the last one.
    """
    parsed = IMP.pmi.topology.Sequences(sequence_file)
    by_chain = {}
    for header in parsed.sequences.keys():
        chain_field = header.split("|")[-1].strip()
        if "Chains" not in chain_field:
            chain_ids = chain_field.split()
        else:
            chain_ids = chain_field.split()[1].split(",")
        # Every chain listed in this header shares the same sequence.
        by_chain.update(
            (chain_id, parsed[header]) for chain_id in chain_ids)
    return by_chain
24 
25 
def tint(x, n=3):
    """Truncate ``x`` toward zero to ``n`` decimal places.

    ``x`` is the number to truncate and ``n`` the number of decimal places
    to keep (default 3). NaN is returned unchanged, because ``int()``
    cannot convert it.

    The precision was previously hard-coded to 3, so ``n`` had no effect.
    """
    if math.isnan(x):
        return x
    scale = 10 ** n
    # int() drops the fractional part, so this truncates rather than rounds.
    return int(x * scale) / scale
31 
32 
def main():
    """Score every structure element in a database and write a report.

    Command-line arguments:
        1. Path to the scoring-function database, a whitespace-delimited
           table read with pandas (the ``EMDB`` column is kept as text and
           ``resid`` is cast to int).
        2. Path of the output file to write.

    For each EMDB entry returned by ``tools.get_emdbs``, the sequences are
    read from ``./<emdb>/0system/<emdb>.fasta``. For each structure element
    of that entry, one line is written containing: the EMDB id, the
    resolution, the element id, the rank / fraction / value of the
    element's own score together with the min, mean and max over all
    candidate scores, the same six quantities for the prior score, and
    finally the element's sequence.

    Entries with no structure elements are reported on stdout and skipped.
    Structure elements whose secondary-structure code is neither "H" nor
    "S" are also reported and skipped, because no prior table exists for
    them.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: evaluate_output_database.py "
                 "<score_database> <output_file>")

    # Log residue propensities, one table per secondary-structure code.
    priorh = {}
    priors = {}
    for r in tools.residues:
        priorh[r] = np.log(tools.h_residue_propensities[r])
        priors[r] = np.log(tools.s_residue_propensities[r])
    priors_by_ss = {"H": priorh, "S": priors}

    dbh_file = sys.argv[1]
    out_file = sys.argv[2]
    database_home_ = "."

    # Open output file. Line buffered, so we actually see the printout before
    # the system buffering kicks in. The context manager guarantees the file
    # is closed even if scoring raises part-way through.
    with open(out_file, "w", buffering=1) as of:
        # 1 - open scoring function database
        dtypes = {'EMDB': 'object'}
        dfs = pd.read_csv(dbh_file, sep=r'\s+', dtype=dtypes)
        dfs.resid = dfs.resid.astype(int)
        # 2 - Get list of EMDBs
        emdbs = tools.get_emdbs(dfs)

        print("Total number of systems:", len(emdbs))

        # 3 - Cycle through EMDBs
        for emdb in emdbs:
            sedfs = tools.get_emdb_SEs_from_db(dfs, emdb)
            fasta_file = f"{emdb}.fasta"
            seqs = read_sequences(
                os.path.join(database_home_, emdb, "0system", fasta_file))

            if len(sedfs) == 0:
                print("EMDB", emdb, "has no structure elements")
                continue

            seids = list(sedfs.keys())
            resolution = float(sedfs[seids[0]]["resolution"].values[0])
            print(resolution)
            for seid in seids:
                print(seid)
                sedf = sedfs[seid]
                ss = tools.get_se_ss(sedf)
                seq = tools.get_se_sequence(sedf)
                print(seq)

                # Previously an unrecognised code either raised an unbound
                # variable error or silently reused the prior left over
                # from the preceding element.
                prior = priors_by_ss.get(ss)
                if prior is None:
                    print("EMDB", emdb, "element", seid,
                          "has unsupported secondary structure", ss)
                    continue

                all_score_dict = tools.score_sesf_over_against_sequences(
                    sedf, seqs, log=False)
                prior_score_dict = tools.score_prior_over_against_sequences(
                    sedf, seqs, prior)
                actual_prior_score = tools.score_prior_sequence(prior, seq)
                all_scores = list(tools.NestedDictValues(all_score_dict))
                all_prior_scores = list(
                    tools.NestedDictValues(prior_score_dict))
                actual_score = tools.score_sequence(sedf, seq, log=False)

                # Rank = number of candidates whose absolute score is below
                # the element's own score.
                # NOTE(review): np.abs is applied to the candidate scores
                # but not to the actual score - confirm this is intended,
                # especially for the log-based prior scores.
                act_rank = sum(np.abs(all_scores) < actual_score)
                act_pct = act_rank / float(len(all_scores))
                prior_rank = sum(
                    np.abs(all_prior_scores) < actual_prior_score)
                prior_pct = prior_rank / float(len(all_prior_scores))

                outstring = tools.catstring(
                    [emdb, resolution, seid, "|", act_rank,
                     tint(act_pct, 3), tint(actual_score, 2),
                     tint(min(all_scores)),
                     tint(np.average(all_scores)),
                     tint(max(all_scores)), "|",
                     prior_rank, tint(prior_pct, 3),
                     tint(actual_prior_score, 2),
                     tint(min(all_prior_scores)),
                     tint(np.average(all_prior_scores)),
                     tint(max(all_prior_scores)), "|"] + seq)
                of.write(outstring+"\n")
106 
107 
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Set of Python classes to create a multi-state, multi-resolution IMP hierarchy.
Python classes to represent, score, sample and analyze models.
A dictionary-like wrapper for reading and storing sequence data.