IMP logo
IMP Reference Guide  develop.50fdd7fa33,2025/09/05
The Integrative Modeling Platform
data_v2_for_parts.py
1 import sys
2 import pandas as pd
3 import numpy as np
4 from sklearn.preprocessing import LabelBinarizer
5 import time
6 # This script contains all important scripts for preparing input file
7 # and running the ML model
8 
# Three-letter codes of the 20 standard amino acids.  Used as the default
# prediction classes for the ML model and for filtering out non-residue rows.
residues = ["GLY", "ALA", "THR", "HIS", "SER", "ASP", "ASN", "GLU", "GLN",
            "VAL", "LEU", "ILE", "LYS", "ARG", "TRP", "TYR", "PHE", "CYS",
            "MET", "PRO"]
12 
13 
def split_df_random(df_in, x=0.8, y=0.1, z=0.1):
    """Randomly partition *df_in* into three frames of fractions x, y, z.

    Rows are shuffled first.  The third frame receives every row left
    after the first two, so z is accepted for symmetry only.
    """
    shuffled_rows = df_in.sample(frac=1)
    n_total = len(df_in)
    n_first = int(x * n_total)
    n_second = int(y * n_total)
    first_part = shuffled_rows.iloc[:n_first]
    second_part = shuffled_rows.iloc[n_first:n_first + n_second]
    third_part = shuffled_rows.iloc[n_first + n_second:]
    return first_part, second_part, third_part
20 
21 
def import_voxel_file_pandas(fname, other_columns, n_voxels, read_rows='all'):
    """Read a whitespace-separated voxel file into a typed DataFrame.

    Parameters
    ----------
    fname : str
        Path to the space-separated data file (no header row).
    other_columns : list of (name, dtype) tuples
        Leading non-voxel columns, in file order.
    n_voxels : int
        Number of voxel columns; named "v0".."v{n-1}", stored as float32.
    read_rows : 'all' or int/float
        Number of rows to read; any other value falls back to all rows.

    Returns
    -------
    pandas.DataFrame with all rows whose resname is "UNK" removed.
    """
    print("Extracting data from "+fname, flush=True)
    columns = [oc[0] for oc in other_columns]
    dtype = [oc[1] for oc in other_columns]
    for i in range(n_voxels):
        dtype.append("float32")
        columns.append("v"+str(i))
    # Map each column name to its dtype in one pass.  (The previous
    # columns.index() lookup was O(n^2) and wrong for duplicate names.)
    dtypes = dict(zip(columns, dtype))
    # Start timing BEFORE the read so the "Extracted ... in" report is
    # meaningful (it previously started after read_csv and printed ~0).
    t0 = time.time()
    if read_rows == 'all':
        database = pd.read_csv(fname, sep=" ", names=columns,
                               na_values=["None"], dtype=dtypes)
    elif isinstance(read_rows, (int, float)):
        # nrows must be an integer; accept floats for convenience
        database = pd.read_csv(fname, sep=" ", names=columns,
                               nrows=int(read_rows),
                               na_values=["None"], dtype=dtypes)
    else:
        print("Processing all lines in the file", fname)
        database = pd.read_csv(fname, sep=" ", names=columns,
                               na_values=["None"], dtype=dtypes)
    print("Extracted "+str(len(database))+" lines in "+str(time.time()-t0),
          flush=True)

    # Drop all instances of "UNK" (unknown residue type)
    database.drop(database[database["resname"] == "UNK"].index, inplace=True)
    print("Processed database in "+str(time.time()-t0), flush=True)

    return database
51 
52 
def import_voxel_file_pickle(fname, other_columns, n_voxels):
    """Load a pickled voxel DataFrame from *fname*.

    other_columns and n_voxels are accepted for interface compatibility
    with import_voxel_file() / import_voxel_file_pandas(), but are not
    needed here: the pickle already stores column names and dtypes.
    (The previous version built an unused dtypes mapping from them.)

    Returns
    -------
    pandas.DataFrame as stored in the pickle (no filtering applied).
    """
    print("Extracting data from "+fname, flush=True)
    t0 = time.time()
    database = pd.read_pickle(fname)
    print("Extracted "+str(len(database))+" lines in "+str(time.time()-t0),
          flush=True)
    t0 = time.time()
    # NOTE: sys.getsizeof is shallow; this under-reports the true footprint
    print("Database size (MB):", str(sys.getsizeof(database)/1024/1024),
          len(database))

    print("Processed database in "+str(time.time()-t0), flush=True)
    return database
80 
81 
def import_voxel_file(fname, other_columns, n_voxels):
    """Parse a whitespace-separated voxel text file into a DataFrame.

    Lines whose first character is not a digit (headers/comments) are
    reported and skipped.  Fields are converted according to the dtype
    codes in other_columns: "i" -> int, "s" -> str, anything else ->
    float ("None" or unparsable values become None).  Rows with resname
    "UNK" are dropped before returning.

    Fixes vs. the original: the dead dtypes loop (it iterated an empty
    dict via range(len(dtypes)) and the result was never used) is gone,
    the two bare excepts are narrowed to ValueError, and the input file
    is closed via a context manager.
    """
    print("Extracting data from "+fname, flush=True)
    t0 = time.time()
    data = []
    columns = [oc[0] for oc in other_columns]
    dtype = [oc[1] for oc in other_columns]
    for i in range(n_voxels):
        dtype.append("f")
        columns.append("v"+str(i))

    k = 0
    linenum = 0
    with open(fname, "r") as fh:
        for line in fh:
            k += 1
            if k % 100000 == 0:
                print("-- Line "+str(k)+" entries: "+str(linenum)+" in "
                      + str(time.time()-t0), flush=True)
            # Data lines are expected to start with a digit; report and
            # skip anything else (headers, comments).
            try:
                int(line[0])
            except ValueError:
                print(k, line[0], line.split()[0:8], flush=True)
                continue
            dlist = []
            fields = line.split()
            for i in range(len(fields)):
                if dtype[i] == "i":
                    dlist.append(int(fields[i]))
                elif dtype[i] == "s":
                    dlist.append(fields[i])
                elif fields[i] == "None":
                    dlist.append(None)
                else:
                    try:
                        dlist.append(float(fields[i]))
                    except ValueError:
                        # Unparsable float -> treat as missing value
                        dlist.append(None)
            data.append(dlist)
            linenum += 1
    print("Extracted "+str(len(data))+" lines in "+str(time.time()-t0),
          flush=True)
    t0 = time.time()
    database = pd.DataFrame(data, columns=columns)

    # Drop all instances of "UNK"
    database.drop(database[database["resname"] == "UNK"].index, inplace=True)
    print("Processed database in "+str(time.time()-t0), flush=True)

    return database
140 
141 
def reshape_df(df, dims):
    """Reshape each row of *df* into an array of shape *dims*.

    Returns a numpy array of shape (len(df),) + tuple(dims).  Uses a
    single vectorized reshape instead of a per-row Python loop, which
    is O(n) in C rather than O(n) DataFrame row iterations.
    """
    if len(df) == 0:
        # Match the old row-loop behavior: empty frame -> empty 1-D array
        return np.array([])
    return df.to_numpy().reshape((len(df),) + tuple(dims))
147 
148 
def split_image_and_other_features(df, image_prefix="v",
                                   other_columns=None):
    """Split *df* into image-voxel columns and other feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
    image_prefix : str
        Columns whose names start with this prefix are treated as image
        (voxel) columns.  startswith() is used so prefixes longer than
        one character also work (the old code compared only c[0]).
    other_columns : list of str or None
        Column labels to extract and return in other_features - these
        should be ones used for learning.  Defaults to ["resolution"]
        (None sentinel avoids a shared mutable default argument).

    Returns
    -------
    (image_features, other_features) : two DataFrames.
    """
    if other_columns is None:
        other_columns = ["resolution"]
    vcols = [c for c in df.columns if c.startswith(image_prefix)]
    print("SIOF", other_columns)
    print(df.columns)
    image_features = df[vcols]
    other_features = df[other_columns]
    return image_features, other_features
161 
162 
class ResidueVoxelDataset:
    """Wrap a pickled voxel database and prepare ML-ready inputs.

    Loads the DataFrame at construction via import_voxel_file_pickle()
    and provides helpers to binarize secondary-structure labels, build
    train/test/validation splits and compute per-class weights.
    """

    def __init__(self, infile, classes=residues, target=None,
                 voxel_dim=(14, 14, 14),
                 other_columns=[("EMDB", "object"),
                                ("resolution", "float32"),
                                ("pdbname", "object"),
                                ("chain", "object"),
                                ("resid", "int16"),
                                ("resname", "object"),
                                ("ss", "object")],
                 train_features=["resolution", "H", "S"], binarize_ss=True):
        # NOTE(review): the binarize_ss flag is currently ignored -- the
        # binarize_ss() method must be called explicitly.  Confirm whether
        # construction should trigger binarization when it is True.
        self.infile = infile
        self.target = target
        self.voxel_dim = voxel_dim
        self.other_columns = other_columns
        # Total number of flat voxel columns v0..v{n-1}
        self.n_voxels = voxel_dim[0]*voxel_dim[1]*voxel_dim[2]
        self.train_features = train_features
        self.data_df = import_voxel_file_pickle(
            self.infile, other_columns=other_columns,
            n_voxels=self.n_voxels)
        self.pred_classes = classes
        self.zb = self.get_binarizer()

    def get_voxel_columns(self):
        """Return the list of voxel column names ("v0".."v{n-1}")."""
        return ["v"+str(i) for i in range(self.n_voxels)]

    def drop_non_residues(self):
        """Keep only rows whose target column holds a standard residue.

        Bug fix: the original used ``series in residues``, which raises
        (a Series has no unambiguous truth value); Series.isin() is the
        correct element-wise membership test.
        """
        self.data_df = self.data_df[self.data_df[self.target].isin(residues)]

    def binarize_ss(self, df, add_to_training_features=True):
        """Add binary H/S secondary-structure indicator columns to data_df.

        df is accepted for interface compatibility but the columns are
        added to self.data_df.  Returns the list of new column names.
        """
        ss_cats = ["H", "S"]
        for c in ss_cats:
            # 1 where the "ss" label equals this category, else 0
            self.data_df[c] = (self.data_df["ss"] == c).astype(int)

        if add_to_training_features:
            # Rebind (not mutate) so a shared default list is never touched
            self.train_features = list(self.train_features) + ss_cats
        return ss_cats

    def split_features_and_reshape(self, df):
        """Split df into reshaped voxel images, other features and target.

        Returns a dict with "image_features" (array of shape
        (n, *voxel_dim)), "other_features" (DataFrame restricted to
        self.train_features) and "target" (Series, or None when no
        target column was configured).
        """
        # First, split the image voxels from the other features
        ximages, xother = split_image_and_other_features(
            df, other_columns=self.train_features)

        # Reshape the flat voxel vectors to voxel_dim boxes
        ximages_reshape = reshape_df(ximages, self.voxel_dim)

        y = df[self.target] if self.target is not None else None

        return {"image_features": ximages_reshape,
                "other_features": xother,
                "target": y}

    def get_train_test_val_sets(self, train=0.75, test=0.15, val=0.1):
        """Randomly split data_df and prepare each non-empty subset.

        Returns a 3-list [train, test, val]; an empty subset is None.
        """
        df_train, df_test, df_val = split_df_random(
            self.data_df, train, test, val)

        train_test_val = []
        for dft in (df_train, df_test, df_val):
            if len(dft) > 0:
                train_test_val.append(self.split_features_and_reshape(dft))
            else:
                train_test_val.append(None)

        return train_test_val

    def get_binarizer(self):
        """Return a LabelBinarizer fitted on the prediction classes."""
        return LabelBinarizer().fit(self.pred_classes)

    def get_weights_by_class(self, ser):
        """Return {class_index: weight} balancing class frequencies.

        The weight is (1/n_classes) / observed_fraction, so rare classes
        get larger weights.  NOTE(review): a class absent from ser still
        raises ZeroDivisionError, as in the original -- confirm the
        desired policy before guarding.
        """
        class_weights = {}
        n_classes = len(self.pred_classes)
        for i, res in enumerate(self.pred_classes):
            res_frac = sum(x == res for x in ser) / len(ser)
            class_weights[i] = (1/n_classes) / res_frac

        return class_weights