# Three-letter codes for the 20 standard amino acids, alphabetical order.
# NOTE(review): the tail of this list ('TYR', 'VAL' and the closing bracket)
# was lost in extraction; restored to match the 20 keys of the propensity
# dictionaries defined later in this file — confirm against the full source.
residues = [
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS',
    'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE', 'PRO',
    'SER', 'THR', 'TRP', 'TYR', 'VAL',
]
# Metadata columns expected at the front of the voxel database rows.
# NOTE(review): the closing bracket was lost in extraction and is restored
# here; confirm against the full source that no trailing entries were lost.
features = [
    "EMDB",
    "resolution",
    "chain",
    "resid",
    "resname",
    "ss",
    "se_ccc",
]
# Full column set: metadata features followed by the 20 residue columns.
all_columns = features + residues
# Map one-letter amino-acid codes to their mixed-case three-letter names
# (callers upper-case these when matching the `residues` codes).
one_to_three = {
    "A": "Ala", "R": "Arg", "N": "Asn", "D": "Asp", "C": "Cys",
    "E": "Glu", "Q": "Gln", "G": "Gly", "H": "His", "I": "Ile",
    "L": "Leu", "K": "Lys", "M": "Met", "F": "Phe", "P": "Pro",
    "S": "Ser", "T": "Thr", "W": "Trp", "Y": "Tyr", "V": "Val",
}
# Root directory of the cached EMDB XML header files (site-specific path
# on the Wynton cluster; headers live under "<root>/headers/<EMDB>.xml").
emdb_header_home = "/wynton/group/sali/saltzberg/database/emdb_headers"
# Observed frequency of each residue type within helical ("H") secondary
# structure elements; values sum to ~1 over the 20 residues.
h_residue_propensities = {
    'ALA': 0.09743660383807928, 'ARG': 0.052206653248949665,
    'ASN': 0.033516232631581634, 'ASP': 0.03697014685512692,
    'CYS': 0.014795201253000493, 'GLN': 0.04344432285169852,
    'GLU': 0.06778753070641984, 'GLY': 0.03754920042140815,
    'HIS': 0.01862409730140631, 'ILE': 0.07440455692198593,
    'LEU': 0.14053655562613038, 'LYS': 0.05310967070473625,
    'MET': 0.030689839012904986, 'PHE': 0.05122200709659481,
    'PRO': 0.017251714840352126, 'SER': 0.05328313168494384,
    'THR': 0.04712016509403881, 'TRP': 0.016744086383568144,
    'TYR': 0.038248146135774035, 'VAL': 0.07506013739129991,
}
# Observed frequency of each residue type within strand ("S") secondary
# structure elements; values sum to ~1 over the 20 residues.
s_residue_propensities = {
    'ALA': 0.06003398703023674, 'ARG': 0.044378467067739666,
    'ASN': 0.03278576451285257, 'ASP': 0.03541292288460036,
    'CYS': 0.018204547230252364, 'GLN': 0.03322525197281038,
    'GLU': 0.04347995937182592, 'GLY': 0.04848035002734589,
    'HIS': 0.017638096726306743, 'ILE': 0.09542737713883898,
    'LEU': 0.10406086413001016, 'LYS': 0.04244472224392531,
    'MET': 0.02161301664192515, 'PHE': 0.05539495273068208,
    'PRO': 0.019483944058129542, 'SER': 0.06141104773810454,
    'THR': 0.0748593640128135, 'TRP': 0.015216032502539261,
    'TYR': 0.045706695835612154, 'VAL': 0.13074263614344872,
}
# NOTE(review): incomplete fragment — several original source lines are
# missing from this extraction (the sequence-reader setup, loop body, and
# return). The surviving lines suggest it splits FASTA-style headers on "|",
# takes the chain list after the first whitespace field, and maps each chain
# ID to its sequence. Recover the full source before modifying.
44 def read_sequences(sequence_file):
50 for s
in sequences.sequences.keys():
51 s_cs = s.split(
"|")[-1].strip()
53 chains = s_cs.split()[1].split(
",")
57 seqs[c] = sequences[s]
# NOTE(review): orphaned return — the enclosing def is missing from this
# extraction. It truncates x toward zero at 3 decimal places.
62 return int(x*10**3)/10**3
# NOTE(review): incomplete fragment — many original lines are missing (the
# per-EMDB loop header, normalize/threshold guards, and the return).
# Visible logic: import the voxel database, drop "UNK" residues, take the
# last `nvoxels` columns as voxel values, then per EMDB entry read the
# contour threshold and map statistics from the cached XML header, drop
# entries with unusable statistics, zero voxels below threshold, rescale,
# filter by resolution window, and split by secondary structure H/S.
# NOTE(review): `df_vals / float(vstd) - float(vavg)` looks suspect — a
# z-score would be (x - avg) / std; likewise `df.EMDB[all_voxel_cols]` and
# `df.EMDB[v] = ...` look like incorrect column indexing. Confirm against
# the full source before fixing.
65 def process_input_database(db, reslow=0.0, reshigh=10.0, normalize=True,
66 threshold=
True, nvoxels=2744):
67 df = import_voxel_file(db)
68 print(
"Imported voxel file of length", len(df))
71 df = df[df[
"resname"] !=
"UNK"]
73 all_voxel_cols = df.columns[-1*nvoxels:]
75 emdbs = df.EMDB.unique()
80 xml = os.path.join(emdb_header_home,
"headers", emdb+
".xml")
81 thresh = get_threshold_from_xml(xml)
82 (vmin, vmax, vavg, vstd) = get_statistics_from_xml(xml)
83 if thresh
is None or vavg
is None or vstd == 0:
84 index_names = df[df[
'EMDB'] == emdb].index
85 df.drop(index_names, inplace=
True)
89 df_vals = df.EMDB[all_voxel_cols]
91 df_vals.loc[df_vals < float(thresh)] = 0
94 df_vals = df_vals / float(vstd) - float(vavg)
95 for v
in all_voxel_cols:
96 df.EMDB[v] = df_vals[v]
98 df_x = df[(df[
"resolution"] < reshigh) & (df[
"resolution"] > reslow)]
100 print(
"Filtered database #entries:", len(df_x))
101 df_h = df_x[df_x[
"ss"] ==
"H"]
102 df_s = df_x[df_x[
"ss"] ==
"S"]
# NOTE(review): incomplete fragment — the file open, the data-list setup,
# the int-field condition line, and the return are missing. Visible logic:
# parse a whitespace-delimited voxel file; some fields become ints, fields
# 0/2/4/5 stay strings, the rest become floats; surplus columns beyond the
# named metadata are labelled "v0", "v1", ... and a pandas DataFrame is
# built from the accumulated rows.
109 def import_voxel_file(fname):
111 columns = ["EMDB",
"resolution",
"chain",
"resid",
"resname",
"ss",
114 for line
in f.readlines():
118 fields = line.split()
119 for i
in range(len(fields)):
121 dlist.append(int(fields[i]))
122 elif i
in [0, 2, 4, 5]:
123 dlist.append(fields[i])
125 dlist.append(float(fields[i]))
130 for i
in range(len(data[0])-len(columns)):
131 columns.append(
"v"+str(i))
132 database = pd.DataFrame(data, columns=columns)
# NOTE(review): incomplete fragment — only the signature and the file read
# loop survive; the open, field-matching test, and return are missing.
# Presumably returns the first line of the XML file containing `field`.
136 def get_xml_line(xml, field):
139 for line
in f.readlines():
# NOTE(review): incomplete fragment — coords setup and return are missing.
# NOTE(review): `features["Coordinates"]` indexes the MODULE-LEVEL list
# `features` (which would raise TypeError) rather than the `fdict`
# parameter — almost certainly a bug; confirm against the full source.
147 def get_feature_points(fdict, feature):
148 cdict = features[
"Coordinates"]
152 for v
in fdict[feature]:
153 coords.append(cdict[v])
# Parse the recommended contour level from an EMDB XML header.
# NOTE(review): two original lines between the lookup and the return are
# missing from this extraction (likely a None/absent-tag guard); recover
# the full source before editing. The value is taken from between the
# first '>' and the following '<' of the matched line.
158 def get_threshold_from_xml(xml):
159 line = get_xml_line(xml,
"contourLevel")
162 return float(line.strip().split(
">")[1].split(
"<")[0])
def get_statistics_from_xml(xml):
    """Parse the map-statistics block of an EMDB XML header.

    Returns (minimum, maximum, average, std) as floats, each read from the
    line matching its tag. The four identical parse steps of the original
    are factored into one local helper; behavior is unchanged.

    NOTE(review): assumes get_xml_line returns the matching line as a str
    (the original would also fail on a missing tag) — confirm upstream.
    """
    def _field(tag):
        # The numeric value sits between the first '>' and the next '<'.
        line = get_xml_line(xml, tag)
        return float(line.strip().split(">")[1].split("<")[0])

    minimum = _field("<minimum>")
    maximum = _field("<maximum>")
    average = _field("<average>")
    std = _field("<std>")
    return (minimum, maximum, average, std)
# NOTE(review): the body of this function is entirely missing from the
# extraction — only the signature survives. `sens`/`acc` presumably tune a
# simulated scoring model; recover the full source.
178 def make_simulated_scores(sequence, sens=0.05, acc=0.5):
# NOTE(review): incomplete fragment — the accumulator initializations
# (sumh/auch) and the return are missing. Visible logic: histogram `pcts`
# into bins of width `binwidth` on [0, 1] (density-normalized), then
# accumulate an AUC-like running sum over the bins.
182 def get_auc(pcts, binwidth=0.05):
183 histbins = np.arange(0, 1 + binwidth, binwidth)
185 hh, hb = np.histogram(pcts, bins=histbins, density=
True)
189 for i
in range(len(hb)-1):
190 sumh += hh[i]/len(histbins)
191 auch += (sumh-hb[i+1])*binwidth
# NOTE(review): incomplete fragment — the rest of the signature (defaults,
# an `unavailable` parameter), the scores-dict setup, the `seq`/`se_len`
# definitions and the return are missing. Visible logic: slide a window of
# length se_len over each sequence, convert one-letter codes to upper-case
# three-letter codes, score each window with score_prior_sequence, and mark
# the trailing (unscorable) positions with `unavailable` when given.
196 def score_prior_over_against_sequences(sedf, sequences, prior,
202 for c
in sequences.keys():
207 for r
in range(len(seq)-se_len):
209 segment = [one_to_three[s].upper()
for s
in seq[r:r+se_len]]
210 score = score_prior_sequence(prior, segment)
211 scores[c][r+1] = score
214 if unavailable
is not None:
215 for r
in range(len(seq)-se_len, len(seq)):
216 scores[c][r+1] = unavailable
# NOTE(review): incomplete fragment — the scores setup, `sk`/`seq`/`se_len`
# definitions, progress-print condition and the return are missing.
# Visible logic: for every sequence key, slide a window of length se_len,
# convert to upper-case three-letter codes, score each window against the
# structure-element DataFrame with score_sequence (log or linear), and mark
# trailing positions with `unavailable` when given.
220 def score_sesf_over_against_sequences(sedf, sequences, count=False,
221 unavailable=
None, log=
True):
224 skeys = list(sequences.keys())
226 for c
in range(len(skeys)):
229 print(
"Doing sequence", c,
"out of", len(skeys))
235 for r
in range(len(seq)-se_len):
237 segment = [one_to_three[s].upper()
for s
in seq[r:r+se_len]]
238 score = score_sequence(sedf, segment, log)
239 scores[sk][r+1] = score
242 if unavailable
is not None:
243 for r
in range(len(seq)-se_len, len(seq)):
244 scores[sk][r+1] = unavailable
def NestedDictValues(d):
    """Yield every leaf value of a (possibly nested) dict, depth-first.

    NOTE(review): the loop header and the leaf-value `yield` were missing
    from the extracted source; restored as the standard recursive recipe
    implied by the surviving `isinstance`/`yield from` lines.
    """
    for v in d.values():
        if isinstance(v, dict):
            # Recurse into sub-dicts so only non-dict leaves are yielded.
            yield from NestedDictValues(v)
        else:
            yield v
def catstring(stuff, delimiter=" "):
    """Join the string forms of the items in *stuff* with *delimiter*.

    Replaces the original quadratic concatenate-then-trim-last-character
    loop (`outstring += str(s)+delimiter; return outstring[0:-1]`) with a
    linear-time str.join; returns "" for empty input, as before.
    """
    return delimiter.join(str(s) for s in stuff)
# NOTE(review): incomplete fragment — the condition guarding the warning
# (presumably len(ss) > 1) and the return are missing. Visible logic:
# collect the unique "ss" values of one structure element and warn when it
# mixes secondary-structure types.
# NOTE(review): uses a lowercase "emdb" column here while the rest of the
# file uses "EMDB" — confirm which is correct against the full source.
264 def get_se_ss(df_se):
265 ss = np.unique(df_se[
"ss"].values)
267 print(
"Multiple secondary structures in this element! That's bad!!",
268 df_se[
"emdb"].values[0])
def get_se_sequence(sedf):
    """Return the residue names of a structure element as a list.

    NOTE(review): the accumulator setup and return were missing from the
    extracted source; this direct column read is equivalent to the
    surviving row-by-row `seq.append(sedf.iloc[i]["resname"])` loop.
    """
    return list(sedf["resname"])
def score_prior_sequence(prior, seq):
    """Score a residue sequence against a propensity prior.

    Each residue contributes -prior[residue]; more negative means higher
    total prior propensity. NOTE(review): the accumulator initialization
    and return were missing from the extracted source; restored as the
    negated sum implied by the surviving `score += -1*prior[seq[i]]` line.

    Raises KeyError if a residue of *seq* is absent from *prior*.
    """
    return -sum(prior[r] for r in seq)
# NOTE(review): incomplete fragment — the tail of the exception message,
# the `if log:` / `else:` branch lines, the score initialization and the
# return are missing. Visible logic: require len(seq) == len(sedf), then
# accumulate per-position scores from the "p_<RESNAME>" columns, negated,
# with an optional -log transform.
287 def score_sequence(sedf, seq, log=True):
288 if len(seq) != len(sedf):
289 raise Exception(
"Input sequence and structure element must be "
292 for i
in range(len(seq)):
294 score += -1*np.log(sedf.iloc[i][
"p_" + seq[i]])
296 score += -1*sedf.iloc[i][
"p_" + seq[i]]
# NOTE(review): orphaned return — the enclosing def (apparently a helper
# listing the unique EMDB ids of the database) is missing from the
# extraction.
301 return list(df[
"EMDB"].unique())
def get_correct_scores(df):
    """Map each row index to that row's score under its own residue type.

    For every row, look up the column named by the row's "resname" value —
    i.e. the score the row assigns to its true residue identity.

    NOTE(review): the dict initialization line was missing from the
    extracted source; restored. Raises KeyError if a "resname" value has
    no matching column.
    """
    correct_scores = {}
    for i, row in df.iterrows():
        correct_scores[i] = row[row["resname"]]
    return correct_scores
# NOTE(review): incomplete fragment — substantial pieces are missing (the
# `ses` dict setup, the per-chain loop header, `rid_1` initialization, the
# early return for empty entries, the segment-reset lines, and the final
# return). Visible logic: for one EMDB id, walk each chain's sorted residue
# ids and cut them into contiguous, single-secondary-structure segments;
# each complete segment is stored in `ses` keyed "<ss>_<chain>_<start>_<len>",
# but only when the residue range is gap-free (len(idx[0]) check). The
# trailing block repeats the segment-store logic for the final open segment.
# Too many lines are missing to edit this safely — recover the full source.
311 def get_emdb_SEs_from_db(df, emdb):
314 df_emdb = df[df[
"EMDB"] == emdb]
315 if len(df_emdb) == 0:
316 print(
"No entries for EMDB", emdb,
"found")
318 chains = np.unique(df_emdb[
"chain"].values)
323 df_chain = df_emdb[df_emdb[
"chain"] == c]
324 sorted_resids = np.sort(df_chain[
"resid"].values)
325 rid_0 = sorted_resids[0]
326 ss = df_chain[df_chain[
"resid"] == rid_0][
"ss"].values[0]
329 for i
in range(1, len(sorted_resids)):
330 ridx = sorted_resids[i]
331 this_ss = df_chain[df_chain[
"resid"] == ridx][
"ss"].values[0]
332 if ridx - rid_1 > 1
or this_ss != ss:
334 seid = (ss+
"_"+c+
"_"+str(int(rid_0))+
"_"
335 + str(int(rid_1-rid_0)+1))
337 idx = np.where((df_chain[
'resid'] >= rid_0)
338 & (df_chain[
'resid'] <= rid_1))
339 if len(idx[0]) == rid_1-rid_0+1:
340 ses[seid] = df_chain.iloc[idx[0]]
345 rid_1 = sorted_resids[i]
349 seid = ss+
"_"+c+
"_"+str(int(rid_0))+
"_"+str(int(rid_1-rid_0)+1)
350 idx = np.where((df_chain[
'resid'] >= rid_0)
351 & (df_chain[
'resid'] <= rid_1))
352 if len(idx[0]) == rid_1-rid_0+1:
353 ses[seid] = df_chain.iloc[idx[0]]
Set of Python classes to create a multi-state, multi-resolution IMP hierarchy.
Python classes to represent, score, sample and analyze models.
A dictionary-like wrapper for reading and storing sequence data.