import sys
import time

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

# Standard amino-acid residue names used as prediction classes
# ("UNK" rows are dropped at import time).
residues = ["GLY", "ALA", "THR", "HIS", "SER", "ASP", "ASN", "GLU", "GLN",
            "VAL", "LEU", "ILE", "LYS", "ARG", "TRP", "TYR", "PHE", "CYS",
            "MET", "PRO"]  # MET/PRO assumed to complete the 20 standard residues

def split_df_random(df_in, x=0.8, y=0.1, z=0.1):
    # Shuffle the rows, then split into three parts with fractions x, y and z
    # (z simply takes whatever remains after the first two slices).
    shuffled = df_in.sample(frac=1)
    nx = int(x*len(shuffled))
    ny = int(y*len(shuffled))
    return shuffled.iloc[0:nx], shuffled.iloc[nx:nx+ny], shuffled.iloc[nx+ny:]
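
# Minimal usage sketch for split_df_random, kept as a comment (the DataFrame is
# hypothetical). With the default 0.8/0.1/0.1 fractions a 100-row frame splits 80/10/10:
#
#   df = pd.DataFrame({"resolution": np.random.rand(100)})
#   train, test, val = split_df_random(df)
#   print(len(train), len(test), len(val))  # -> 80 10 10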

def import_voxel_file_pandas(fname, other_columns, n_voxels, read_rows='all'):
    print("Extracting data from "+fname, flush=True)
    t0 = time.time()
    columns = [oc[0] for oc in other_columns]
    dtype = [oc[1] for oc in other_columns]
    for i in range(n_voxels):
        dtype.append("float32")
        columns.append("v"+str(i))
    # Build a column -> dtype mapping for pandas.
    dtypes = {}
    for col in columns:
        dtypes[col] = dtype[columns.index(col)]
    if read_rows == 'all':
        database = pd.read_csv(fname, sep=" ", names=columns,
                               na_values=["None"], dtype=dtypes)
    elif isinstance(read_rows, (int, float)):
        database = pd.read_csv(fname, sep=" ", names=columns, nrows=int(read_rows),
                               na_values=["None"], dtype=dtypes)
    else:
        print("Processing all lines in the file", fname)
        database = pd.read_csv(fname, sep=" ", names=columns,
                               na_values=["None"], dtype=dtypes)
    print("Extracted "+str(len(database))+" lines in "+str(time.time()-t0),
          flush=True)
    # Drop rows whose residue name is unknown.
    database.drop(database[database["resname"] == "UNK"].index, inplace=True)
    print("Processed database in "+str(time.time()-t0), flush=True)
    return database
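
# Example call, kept as a comment because the path and columns are hypothetical.
# The parser above implies a whitespace-separated file whose fields are the
# `other_columns` names followed by v0..v(n_voxels-1), with "None" marking missing values:
#
#   df = import_voxel_file_pandas("voxels.txt",
#                                 other_columns=[("resname", "object"),
#                                                ("resolution", "float32")],
#                                 n_voxels=14*14*14)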

def import_voxel_file_pickle(fname, other_columns, n_voxels):
    print("Extracting data from "+fname, flush=True)
    t0 = time.time()
    columns = [oc[0] for oc in other_columns]
    dtype = [oc[1] for oc in other_columns]
    for i in range(n_voxels):
        dtype.append("float32")
        columns.append("v"+str(i))
    # Column -> numpy dtype mapping built from the declared dtype strings
    # (the pickle already stores dtypes, so this is informational).
    dtypes = {}
    for d in range(len(dtype)):
        if dtype[d] == "float32":
            dtypes[columns[d]] = np.float32
        elif dtype[d] == "object":
            dtypes[columns[d]] = str
        else:
            dtypes[columns[d]] = np.int16
    database = pd.read_pickle(fname)
    print("Extracted "+str(len(database))+" lines in "+str(time.time()-t0),
          flush=True)
    print("Database size (MB):", str(sys.getsizeof(database)/1024/1024),
          flush=True)
    print("Processed database in "+str(time.time()-t0), flush=True)
    return database

def import_voxel_file(fname, other_columns, n_voxels):
    # Line-by-line parser; expects single-character dtype codes in other_columns
    # ("i" for int, "s" for string, anything else is read as float).
    print("Extracting data from "+fname, flush=True)
    t0 = time.time()
    columns = [oc[0] for oc in other_columns]
    dtype = [oc[1] for oc in other_columns]
    for i in range(n_voxels):
        dtype.append("f")
        columns.append("v"+str(i))
    # Column -> numpy dtype mapping built from the dtype codes.
    dtypes = {}
    for d in range(len(dtype)):
        if dtype[d] == "f":
            dtypes[columns[d]] = np.float32
        elif dtype[d] == "s":
            dtypes[columns[d]] = str
        else:
            dtypes[columns[d]] = np.int16
    data = []
    linenum = 0
    k = 0
    with open(fname, "r") as fh:
        for line in fh:
            k += 1
            if k % 100000 == 0:
                # Periodic progress report (the reporting interval is an assumption).
                print("-- Line "+str(k)+" entries: "+str(linenum)+" in "
                      + str(time.time()-t0), flush=True)
                print(k, line[0], line.split()[0:8], flush=True)
            fields = line.split()
            dlist = []
            for i in range(len(fields)):
                if dtype[i] == "i":
                    dlist.append(int(fields[i]))
                elif dtype[i] == "s":
                    dlist.append(fields[i])
                else:
                    # Float fields; "None" marks a missing value.
                    if fields[i] == "None":
                        dlist.append(np.nan)
                    else:
                        dlist.append(float(fields[i]))
            data.append(dlist)
            linenum += 1
    print("Extracted "+str(len(data))+" lines in "+str(time.time()-t0),
          flush=True)
    database = pd.DataFrame(data, columns=columns)
    # Drop rows whose residue name is unknown.
    database.drop(database[database["resname"] == "UNK"].index, inplace=True)
    print("Processed database in "+str(time.time()-t0), flush=True)
    return database

def reshape_df(df, dims):
    # Reshape each flattened voxel row into an array of shape `dims`.
    outlist = []
    for i, row in df.iterrows():
        outlist.append(np.reshape(row.values, dims))
    return np.array(outlist)

def split_image_and_other_features(df, image_prefix="v",
                                   other_columns=["resolution"]):
    # Voxel columns are named v0, v1, ..., so select them by their prefix character.
    vcols = [c for c in df.columns if c[0] == image_prefix]
    print("SIOF", other_columns)
    image_features = df[vcols]
    other_features = df[other_columns]
    return image_features, other_features
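
# Sketch of how the two helpers above combine to build network-ready arrays
# (DataFrame and column names are illustrative only):
#
#   images, extras = split_image_and_other_features(df, other_columns=["resolution"])
#   image_stack = reshape_df(images, (14, 14, 14))  # shape: (n_samples, 14, 14, 14)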

class ResidueVoxelDataset:
    def __init__(self, infile, classes=residues, target=None,
                 voxel_dim=(14, 14, 14),
                 other_columns=[("EMDB", "object"),
                                ("resolution", "float32"),
                                ("pdbname", "object"),
                                ("resname", "object"),
                                ("ss", "object")],  # "ss" assumed: binarize_ss() reads it
                 train_features=["resolution", "H", "S"], binarize_ss=True):
        self.infile = infile
        self.target = target
        self.voxel_dim = voxel_dim
        self.other_columns = other_columns
        self.n_voxels = voxel_dim[0]*voxel_dim[1]*voxel_dim[2]
        self.train_features = train_features
        self.data_df = import_voxel_file_pickle(
            self.infile, other_columns=other_columns,
            n_voxels=self.n_voxels)
        self.pred_classes = classes
        self.zb = self.get_binarizer()
        if binarize_ss:
            # "H" and "S" are already listed in the default train_features,
            # so only the one-hot columns are created here.
            self.binarize_ss(self.data_df, add_to_training_features=False)

    def get_voxel_columns(self):
        vcols = []
        for i in range(self.n_voxels):
            vcols.append("v"+str(i))
        return vcols

    def drop_non_residues(self):
        # Keep only rows whose target label is one of the known residue classes.
        self.data_df = self.data_df[self.data_df[self.target].isin(residues)]

    def binarize_ss(self, df, add_to_training_features=True):
        # One-hot encode the secondary-structure column into per-category flag columns.
        ss_cats = ["H", "S"]  # assumed categories, matching the default train_features
        for c in ss_cats:
            bc = self.data_df["ss"] == c
            self.data_df[c] = bc.astype(int)
        if add_to_training_features:
            self.train_features = [y for x in [self.train_features, ss_cats]
                                   for y in x]

    def split_features_and_reshape(self, df):
        # Split voxel (image) columns from the scalar training features and
        # reshape the voxels into per-sample 3D arrays.
        ximages, xother = split_image_and_other_features(
            df, other_columns=self.train_features)
        ximages_reshape = reshape_df(ximages, self.voxel_dim)
        if self.target is not None:
            labels = self.zb.transform(df[self.target])
            return {"image_features": ximages_reshape,
                    "other_features": xother,
                    "labels": labels}  # key name for the binarized target is assumed
        return {"image_features": ximages_reshape,
                "other_features": xother}

    def get_train_test_val_sets(self, train=0.75, test=0.15, val=0.1):
        df_train, df_test, df_val = split_df_random(
            self.data_df, train, test, val)
        train_test_val = []
        for dft in [df_train, df_test, df_val]:
            if len(dft) > 0:
                train_test_val.append(self.split_features_and_reshape(dft))
            else:
                train_test_val.append(None)
        return train_test_val

    def get_binarizer(self):
        return LabelBinarizer().fit(self.pred_classes)

    def get_weights_by_class(self, ser):
        # Weight each class inversely to its frequency in the series `ser`.
        class_weights = {}
        for i in range(len(self.pred_classes)):
            res = self.pred_classes[i]
            res_frac = sum(x == res for x in ser) / len(ser)
            class_weights[i] = (1/len(self.pred_classes)) / res_frac
        return class_weights
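

if __name__ == "__main__":
    # Minimal usage sketch. The pickle path is hypothetical; it is expected to hold a
    # DataFrame with the `other_columns` fields plus v0..v2743 flattened voxel columns.
    ds = ResidueVoxelDataset("voxel_database.pkl", target="resname")
    train_set, test_set, val_set = ds.get_train_test_val_sets()
    print(train_set["image_features"].shape)  # -> (n_train, 14, 14, 14)
    print(ds.get_weights_by_class(ds.data_df["resname"]))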