12 def read_sequences(sequence_file):
15 for s
in sequences.sequences.keys():
16 s_cs = s.split(
"|")[-1].strip()
18 chains = s_cs.split()[1].split(
",")
22 seqs[c] = sequences[s]
28 return int(x*10**3)/10**3
36 for r
in tools.residues:
37 priorh[r] = np.log(tools.h_residue_propensities[r])
38 priors[r] = np.log(tools.s_residue_propensities[r])
40 dbh_file = sys.argv[1]
41 out_file = sys.argv[2]
45 of = open(out_file,
"w", buffering=1)
48 dtypes = {
'EMDB':
'object'}
49 dfs = pd.read_csv(dbh_file, sep=
r'\s+', dtype=dtypes)
50 dfs.resid = dfs.resid.astype(int)
52 emdbs = tools.get_emdbs(dfs)
54 print(
"Total number of systems:", len(emdbs))
58 sedfs = tools.get_emdb_SEs_from_db(dfs, emdb)
61 fasta_file = f
"{emdb}.fasta"
62 seqs = read_sequences(os.path.join(database_home_, emdb,
"0system",
67 print(
"EMDB", emdb,
"has no structure elements")
70 seids = list(sedfs.keys())
71 resolution = float(sedfs[seids[0]][
"resolution"].values[0])
77 ss = tools.get_se_ss(sedf)
78 seq = tools.get_se_sequence(sedf)
80 all_score_dict = tools.score_sesf_over_against_sequences(
81 sedf, seqs, log=
False)
86 prior_score_dict = tools.score_prior_over_against_sequences(
88 actual_prior_score = tools.score_prior_sequence(prior, seq)
89 all_scores = list(tools.NestedDictValues(all_score_dict))
90 all_prior_scores = list(tools.NestedDictValues(prior_score_dict))
91 actual_score = tools.score_sequence(sedf, seq, log=
False)
92 act_rank = sum(np.abs(all_scores) < actual_score)
93 act_pct = act_rank / float(len(all_scores))
94 prior_rank = sum(np.abs(all_prior_scores) < actual_prior_score)
95 prior_pct = prior_rank / float(len(all_prior_scores))
96 outstring = tools.catstring(
97 [emdb, resolution, seid,
"|", act_rank, tint(act_pct, 3),
98 tint(actual_score, 2), tint(min(all_scores)),
99 tint(np.average(all_scores)), tint(max(all_scores)),
"|",
100 prior_rank, tint(prior_pct, 3), tint(actual_prior_score, 2),
101 tint(min(all_prior_scores)),
102 tint(np.average(all_prior_scores)),
103 tint(max(all_prior_scores)),
"|"] + seq)
104 of.write(outstring+
"\n")
108 if __name__ ==
'__main__':
Set of Python classes to create a multi-state, multi-resolution IMP hierarchy.
Python classes to represent, score, sample and analyze models.
A dictionary-like wrapper for reading and storing sequence data.