IMP logo
IMP Reference Guide  develop.50fdd7fa33, 2025/09/05
The Integrative Modeling Platform
emseqfinder/tools.py
1 import pandas as pd
2 import numpy as np
3 import os
4 
# Three-letter codes for the 20 standard amino acids.  These double as the
# per-residue probability column names appended after the feature columns.
residues = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS',
            'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP',
            'TYR', 'VAL']
# Named (non-voxel) columns of the voxel database.
features = ["EMDB", "resolution", "chain", "resid", "resname", "ss", "se_ccc",
            "res_ccc"]
# Full column layout: feature columns first, then one column per residue.
all_columns = features+residues
# One-letter -> three-letter (title-case) amino acid code map.
one_to_three = {"A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp",
                "C": "Cys", "E": "Glu", "Q": "Gln", "G": "Gly",
                "H": "His", "I": "Ile", "L": "Leu", "K": "Lys",
                "M": "Met", "F": "Phe", "P": "Pro", "S": "Ser",
                "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val"}

# Directory holding downloaded EMDB XML header files (site-specific path).
# Headers are read from <emdb_header_home>/headers/<EMDB>.xml.
emdb_header_home = "/wynton/group/sali/saltzberg/database/emdb_headers"

# Background propensity of each residue type within helix ("H") elements.
# NOTE(review): presumably derived from a training database; provenance is
# not shown in this file -- confirm before reuse.
h_residue_propensities = {
    'ALA': 0.09743660383807928, 'ARG': 0.052206653248949665,
    'ASN': 0.033516232631581634, 'ASP': 0.03697014685512692,
    'CYS': 0.014795201253000493, 'GLN': 0.04344432285169852,
    'GLU': 0.06778753070641984, 'GLY': 0.03754920042140815,
    'HIS': 0.01862409730140631, 'ILE': 0.07440455692198593,
    'LEU': 0.14053655562613038, 'LYS': 0.05310967070473625,
    'MET': 0.030689839012904986, 'PHE': 0.05122200709659481,
    'PRO': 0.017251714840352126, 'SER': 0.05328313168494384,
    'THR': 0.04712016509403881, 'TRP': 0.016744086383568144,
    'TYR': 0.038248146135774035, 'VAL': 0.07506013739129991}

# Background propensity of each residue type within strand ("S") elements.
# NOTE(review): same provenance caveat as h_residue_propensities.
s_residue_propensities = {
    'ALA': 0.06003398703023674, 'ARG': 0.044378467067739666,
    'ASN': 0.03278576451285257, 'ASP': 0.03541292288460036,
    'CYS': 0.018204547230252364, 'GLN': 0.03322525197281038,
    'GLU': 0.04347995937182592, 'GLY': 0.04848035002734589,
    'HIS': 0.017638096726306743, 'ILE': 0.09542737713883898,
    'LEU': 0.10406086413001016, 'LYS': 0.04244472224392531,
    'MET': 0.02161301664192515, 'PHE': 0.05539495273068208,
    'PRO': 0.019483944058129542, 'SER': 0.06141104773810454,
    'THR': 0.0748593640128135, 'TRP': 0.015216032502539261,
    'TYR': 0.045706695835612154, 'VAL': 0.13074263614344872}
42 
43 
def read_sequences(sequence_file):
    """Read a FASTA file with IMP.pmi and map each chain ID to its sequence.

    The chain list is taken from the last "|"-delimited field of each
    FASTA header: either "Chains A,B,C" or a plain space-separated list.

    Args:
        sequence_file: Path to a FASTA file readable by
            IMP.pmi.topology.Sequences.

    Returns:
        dict: chain ID -> sequence string.
    """
    import IMP
    import IMP.pmi
    import IMP.pmi.topology
    fasta = IMP.pmi.topology.Sequences(sequence_file)
    seqs = {}
    for header in fasta.sequences.keys():
        chain_field = header.split("|")[-1].strip()
        if "Chains" in chain_field:
            # Header like "... | Chains A,B,C"
            chain_ids = chain_field.split()[1].split(",")
        else:
            # Header ends in a bare space-separated chain list.
            chain_ids = chain_field.split()
        for cid in chain_ids:
            seqs[cid] = fasta[header]
    return seqs
59 
60 
def tint(x, n=3):
    """Truncate *x* to *n* decimal places (toward zero, not rounded).

    Bug fix: the original ignored *n* and always used 3.

    Args:
        x: Numeric value to truncate.
        n: Number of decimal places to keep (default 3).

    Returns:
        float: *x* truncated to *n* decimals.
    """
    factor = 10 ** n
    return int(x * factor) / factor
64 
def process_input_database(db, reslow=0.0, reshigh=10.0, normalize=True,
                           threshold=True, nvoxels=2744):
    """Load and clean a voxel database, returning helix and strand subsets.

    Args:
        db: Path to the whitespace-delimited voxel database file.
        reslow, reshigh: Keep only entries with reslow < resolution < reshigh.
        normalize: If True, standardize each map's voxel values using the
            average/std recorded in its EMDB XML header.
        threshold: If True, zero out voxels below the header's contour level.
        nvoxels: Number of trailing voxel columns in the database.

    Returns:
        tuple: (df_h, df_s) DataFrames of helix ("H") and strand ("S")
        entries with NaN rows dropped.
    """
    df = import_voxel_file(db)
    print("Imported voxel file of length", len(df))

    # Remove unknown residues
    df = df[df["resname"] != "UNK"]

    all_voxel_cols = df.columns[-1 * nvoxels:]
    for emdb in df.EMDB.unique():
        print(emdb)
        # Contour threshold and voxel statistics from the EMDB header.
        xml = os.path.join(emdb_header_home, "headers", emdb + ".xml")
        thresh = get_threshold_from_xml(xml)
        (vmin, vmax, vavg, vstd) = get_statistics_from_xml(xml)
        if thresh is None or vavg is None or vstd == 0:
            # No usable header statistics: drop this map entirely.
            df.drop(df[df['EMDB'] == emdb].index, inplace=True)
            continue

        # Bug fix: the original indexed df.EMDB (a Series of accession
        # strings) with voxel column names, which raises KeyError, and its
        # write-back (df.EMDB[v] = ...) never updated the voxel columns.
        # Select this map's rows and its voxel columns explicitly.
        rows = df["EMDB"] == emdb
        df_vals = df.loc[rows, all_voxel_cols]

        if threshold:
            # Zero out voxels below the recommended contour level.
            # (The `threshold` flag was previously accepted but ignored.)
            df_vals[df_vals < float(thresh)] = 0

        if normalize:
            # Standardize by the map's mean and std.  NOTE(review): the
            # original computed v/std - avg; the conventional z-score
            # (v - avg)/std is used here -- confirm against training data.
            df_vals = (df_vals - float(vavg)) / float(vstd)

        df.loc[rows, all_voxel_cols] = df_vals

    # Resolution window filter (exclusive bounds).
    df_x = df[(df["resolution"] < reshigh) & (df["resolution"] > reslow)]
    print("Filtered database #entries:", len(df_x))

    # Split by secondary structure and drop incomplete rows.
    df_h = df_x[df_x["ss"] == "H"].dropna()
    df_s = df_x[df_x["ss"] == "S"].dropna()
    return df_h, df_s
107 
108 
def import_voxel_file(fname):
    """Parse a whitespace-delimited voxel database file into a DataFrame.

    Data lines are recognized by starting with a digit (the EMDB
    accession); all other lines are skipped.  The first 8 fields populate
    the named feature columns; the remainder become voxel columns
    v0, v1, ...

    Args:
        fname: Path to the voxel database file.

    Returns:
        pandas.DataFrame: One row per parsed data line.  If the file has
        no data lines, an empty DataFrame with only the feature columns is
        returned (the original raised IndexError on data[0]).
    """
    columns = ["EMDB", "resolution", "chain", "resid", "resname", "ss",
               "se_ccc", "res_ccc"]
    data = []
    # `with` guarantees the handle is closed (the original leaked it).
    with open(fname, "r") as f:
        for line in f:
            try:
                int(line[0])  # data lines start with a digit
                dlist = []
                for i, field in enumerate(line.split()):
                    if i == 3:
                        dlist.append(int(field))    # resid
                    elif i in [0, 2, 4, 5]:
                        dlist.append(field)         # EMDB, chain, resname, ss
                    else:
                        dlist.append(float(field))  # numeric fields/voxels
                data.append(dlist)
            except (ValueError, IndexError):
                # Header/comment/blank lines and malformed records are
                # skipped (narrowed from the original bare except).
                continue

    if data:
        # Name the trailing voxel columns from the first record's width.
        for i in range(len(data[0]) - len(columns)):
            columns.append("v" + str(i))
    return pd.DataFrame(data, columns=columns)
134 
135 
def get_xml_line(xml, field):
    """Return the first line of file *xml* containing *field*, or None.

    Args:
        xml: Path to the (XML) file to scan.
        field: Substring to search for (e.g. "contourLevel").

    Returns:
        str or None: The matching line (newline included), or None if no
        line contains *field*.
    """
    # `with` closes the file on every path (the original closed manually).
    with open(xml, "r") as f:
        for line in f:
            if field in line:
                return line
    return None
145 
146 
def get_feature_points(fdict, feature):
    """Return the coordinates of each point belonging to *feature*.

    Bug fix: the original looked up features["Coordinates"], where
    `features` is the module-level *list* of column names (a TypeError at
    runtime); the coordinate table lives in the *fdict* argument.

    Args:
        fdict: Dictionary containing a "Coordinates" key (point id ->
            coordinate) plus one key per feature listing its point ids.
        feature: Name of the feature whose coordinates are requested.

    Returns:
        list: Coordinates for each point id in fdict[feature], in order.
    """
    cdict = fdict["Coordinates"]
    return [cdict[v] for v in fdict[feature]]
156 
157 
def get_threshold_from_xml(xml):
    """Parse the recommended contour level from an EMDB XML header.

    Args:
        xml: Path to the EMDB header XML file.

    Returns:
        float or None: The contourLevel value, or None if the tag is
        absent from the header.
    """
    match = get_xml_line(xml, "contourLevel")
    if match is None:
        return None
    # Extract the text between the opening and closing tags.
    value = match.strip().split(">")[1].split("<")[0]
    return float(value)
163 
164 
def get_statistics_from_xml(xml):
    """Return voxel minimum, maximum, average and SD from an EMDB header.

    Bug fix: the original crashed with TypeError when a tag was missing,
    yet its caller (process_input_database) explicitly tests the average
    for None; a missing tag now yields None for that statistic.

    Args:
        xml: Path to the EMDB header XML file.

    Returns:
        tuple: (minimum, maximum, average, std) as floats, with None in
        place of any statistic whose tag is absent.
    """
    def _value(tag):
        # Extract the text between <tag> and </tag>, or None if missing.
        line = get_xml_line(xml, "<" + tag + ">")
        if line is None:
            return None
        return float(line.strip().split(">")[1].split("<")[0])

    return (_value("minimum"), _value("maximum"),
            _value("average"), _value("std"))
176 
177 
def make_simulated_scores(sequence, sens=0.05, acc=0.5):
    # Unimplemented stub: returns None.  From the name and signature it is
    # presumably intended to generate simulated per-residue scores for
    # `sequence` with sensitivity `sens` and accuracy `acc` -- TODO confirm
    # once implemented.
    pass
180 
181 
def get_auc(pcts, binwidth=0.05):
    """Integrate a histogram-based AUC-style statistic over *pcts*.

    Removes the original's stray debug print of the histogram counts; the
    arithmetic itself is preserved unchanged.

    Args:
        pcts: Sequence of values expected to lie in [0, 1].
        binwidth: Histogram bin width (default 0.05).

    Returns:
        float: Accumulated area term.  NOTE(review): the running sum is
        normalized by len(histbins) (edge count, not bin count) and the
        upper bin edge hb[i+1] is subtracted from it -- the exact statistic
        intended could not be confirmed from this file.
    """
    histbins = np.arange(0, 1 + binwidth, binwidth)
    # Density-normalized histogram of the input values.
    hh, hb = np.histogram(pcts, bins=histbins, density=True)
    auch = 0
    sumh = 0
    for i in range(len(hb) - 1):
        sumh += hh[i] / len(histbins)
        auch += (sumh - hb[i + 1]) * binwidth
    return auch
194 
195 
def score_prior_over_against_sequences(sedf, sequences, prior,
                                       unavailable=None):
    """Score a residue-propensity prior against every sequence window.

    Args:
        sedf: Structure-element DataFrame; only its length (the window
            size) is used.
        sequences: dict of chain ID -> one-letter sequence string.
        prior: dict of three-letter residue name -> propensity.
        unavailable: If not None, trailing positions whose window would
            run past the end of the sequence are assigned this value.

    Returns:
        dict: chain ID -> {1-based start residue -> score}.
    """
    se_len = len(sedf)
    scores = {}

    # Loop over all sequence chains
    for c in sequences.keys():
        seq = sequences[c]
        scores[c] = {}

        # Score each full-length window starting at 0-based residue r.
        for r in range(len(seq) - se_len):
            try:
                segment = [one_to_three[s].upper() for s in seq[r:r + se_len]]
                scores[c][r + 1] = score_prior_sequence(prior, segment)
            except KeyError:
                # Non-standard one-letter code, or residue missing from the
                # prior: skip this start position (narrowed from the
                # original bare except).
                continue
        if unavailable is not None:
            # Mark the trailing start positions that cannot fit a window.
            for r in range(len(seq) - se_len, len(seq)):
                scores[c][r + 1] = unavailable
    return scores
218 
219 
def score_sesf_over_against_sequences(sedf, sequences, count=False,
                                      unavailable=None, log=True):
    """Score a structure element's residue probabilities over all sequences.

    Args:
        sedf: Structure-element DataFrame with per-position "p_<RES>"
            columns, scored via score_sequence.
        sequences: dict of chain ID -> one-letter sequence string.
        count: If True, print progress every 100 chains.
        unavailable: If not None, trailing positions whose window would
            run past the end of the sequence are assigned this value.
        log: Passed through to score_sequence.

    Returns:
        dict: chain ID -> {1-based start residue -> score}.
    """
    se_len = len(sedf)
    scores = {}
    skeys = list(sequences.keys())
    # Loop over all sequence chains (enumerate replaces range(len(...))).
    for ci, sk in enumerate(skeys):
        if count and ci % 100 == 0:
            print("Doing sequence", ci, "out of", len(skeys))
        seq = sequences[sk]
        scores[sk] = {}

        # Score each full-length window starting at 0-based residue r.
        for r in range(len(seq) - se_len):
            try:
                segment = [one_to_three[s].upper() for s in seq[r:r + se_len]]
                scores[sk][r + 1] = score_sequence(sedf, segment, log)
            except KeyError:
                # Unknown one-letter code or missing p_<RES> column: skip
                # (narrowed from the original bare except).
                continue
        if unavailable is not None:
            # Mark the trailing start positions that cannot fit a window.
            for r in range(len(seq) - se_len, len(seq)):
                scores[sk][r + 1] = unavailable
    return scores
246 
247 
def NestedDictValues(d):
    """Yield every leaf (non-dict) value of a nested dictionary, depth-first."""
    for value in d.values():
        if not isinstance(value, dict):
            yield value
        else:
            # Recurse into nested dictionaries.
            yield from NestedDictValues(value)
254 
255 
def catstring(stuff, delimiter=" "):
    """Join the stringified items of *stuff* with *delimiter*.

    Bug fix: the original appended the delimiter after every item and then
    chopped exactly one character off the end, leaving a partial trailing
    delimiter whenever len(delimiter) > 1 (and building the string with
    quadratic +=).  str.join is correct for any delimiter and returns ""
    for an empty iterable.

    Args:
        stuff: Iterable of items; each is passed through str().
        delimiter: Separator placed between items (default " ").

    Returns:
        str: The joined string.
    """
    return delimiter.join(str(s) for s in stuff)
262 
263 
def get_se_ss(df_se):
    """Return the secondary-structure code of one structure element.

    A structure element should be uniform in "ss"; if more than one value
    is present a warning is printed and the first (lexicographically
    smallest) is returned.  NOTE(review): the warning reads an "emdb"
    column while the rest of the module uses "EMDB" -- confirm the column
    name actually present in df_se.
    """
    unique_ss = np.unique(df_se["ss"].values)
    if len(unique_ss) > 1:
        print("Multiple secondary structures in this element! That's bad!!",
              df_se["emdb"].values[0])
        # print(df_se["resid"])
    return unique_ss[0]
271 
272 
def get_se_sequence(sedf):
    """Return the residue names of a structure element, in row order."""
    return list(sedf["resname"])
278 
279 
def score_prior_sequence(prior, seq):
    """Return the negative sum of prior propensities over *seq*.

    Args:
        prior: dict of three-letter residue name -> propensity.
        seq: Iterable of three-letter residue names.

    Returns:
        Negative accumulated propensity (lower = more favorable prior).
    """
    return sum(-prior[resname] for resname in seq)
285 
286 
def score_sequence(sedf, seq, log=True):
    """Score *seq* against the per-position "p_<RES>" columns of *sedf*.

    Args:
        sedf: DataFrame with one row per position and "p_<RES>" columns.
        seq: List of three-letter residue names, same length as sedf.
        log: NOTE(review): when log is False the negated *logarithm* of
            the stored value is accumulated; when True the stored value is
            negated directly.  This reads inverted unless the p_<RES>
            columns already hold log-probabilities -- behavior is
            preserved unchanged here; confirm with the database builder.

    Returns:
        The accumulated (negative) score.

    Raises:
        Exception: If seq and sedf differ in length.
    """
    if len(seq) != len(sedf):
        raise Exception("Input sequence and structure element must be "
                        "the same size")
    total = 0
    for pos, resname in enumerate(seq):
        p = sedf.iloc[pos]["p_" + resname]
        if log:
            total += -1 * p
        else:
            total += -1 * np.log(p)
    return total
298 
299 
def get_emdbs(df):
    """Return the distinct EMDB accessions in *df*, in first-seen order."""
    return df["EMDB"].unique().tolist()
302 
303 
def get_correct_scores(df):
    """Map each row index to the score stored under its own residue name.

    For every row, looks up the column named by that row's "resname"
    value, i.e. the score assigned to the correct residue type.

    Returns:
        dict: row index -> score.
    """
    return {idx: row[row["resname"]] for idx, row in df.iterrows()}
309 
310 
def get_emdb_SEs_from_db(df, emdb):
    # Finds StructureElements (contiguous sequence elements) in a database
    # Returns a list of DFs with those entries in it
    #
    # An SE is a run of consecutive resids on one chain sharing the same
    # secondary-structure code; runs longer than 4 residues are kept.
    # Keys are "<ss>_<chain>_<start_resid>_<length>".
    df_emdb = df[df["EMDB"] == emdb]
    if len(df_emdb) == 0:
        print("No entries for EMDB", emdb, "found")

    chains = np.unique(df_emdb["chain"].values)

    # Dictionary of structure elements
    ses = {}
    for c in chains:
        df_chain = df_emdb[df_emdb["chain"] == c]
        sorted_resids = np.sort(df_chain["resid"].values)
        # rid_0/rid_1 bracket the current run; ss is its SS code.
        rid_0 = sorted_resids[0]
        ss = df_chain[df_chain["resid"] == rid_0]["ss"].values[0]
        rid_1 = rid_0

        for i in range(1, len(sorted_resids)):
            ridx = sorted_resids[i]
            this_ss = df_chain[df_chain["resid"] == ridx]["ss"].values[0]
            # A gap in resids or an SS change ends the current run.
            if ridx - rid_1 > 1 or this_ss != ss:
                if rid_1-rid_0 > 3:
                    seid = (ss+"_"+c+"_"+str(int(rid_0))+"_"
                            + str(int(rid_1-rid_0)+1))

                    idx = np.where((df_chain['resid'] >= rid_0)
                                   & (df_chain['resid'] <= rid_1))
                    # Keep only runs with exactly one row per resid
                    # (guards against duplicate resid entries).
                    if len(idx[0]) == rid_1-rid_0+1:
                        ses[seid] = df_chain.iloc[idx[0]]
                # Start a new run at the current residue.
                rid_0 = ridx
                rid_1 = rid_0
                ss = this_ss
            else:
                rid_1 = sorted_resids[i]

        # for last seid
        # NOTE(review): indentation was ambiguous in the rendered source;
        # this block is placed inside the chain loop, where df_chain and
        # rid_0/rid_1 are per-chain coherent -- confirm against upstream.
        if rid_1-rid_0 > 3:
            seid = ss+"_"+c+"_"+str(int(rid_0))+"_"+str(int(rid_1-rid_0)+1)
            idx = np.where((df_chain['resid'] >= rid_0)
                           & (df_chain['resid'] <= rid_1))
            if len(idx[0]) == rid_1-rid_0+1:
                ses[seid] = df_chain.iloc[idx[0]]

    return ses
Set of Python classes to create a multi-state, multi-resolution IMP hierarchy.
Python classes to represent, score, sample and analyze models.
A dictionary-like wrapper for reading and storing sequence data.