IMP logo
IMP Reference Guide  develop.50fdd7fa33,2025/09/05
The Integrative Modeling Platform
convert_MLDB_topkl.py
1 import pandas as pd
2 import sys
3 import time
4 import numpy as np
5 import os
6 
7 # usage: python convert_MLDB_topkl.py 10k.dat [output_basename]
8 
9 
def import_voxel_file_pickle(fname, other_columns, n_voxels):
    """Load a space-separated voxel table into a pandas DataFrame.

    Args:
        fname: Path of the text file to read.
        other_columns: Sequence of ``(column name, type code)`` pairs
            describing the leading columns. Codes are ``"f"`` (float32),
            ``"s"`` (str) and ``"i"`` (int16); a column with any other
            code gets no explicit dtype.
        n_voxels: Number of trailing float32 columns, named ``v0``,
            ``v1``, ... in order.

    Returns:
        The DataFrame, with the literal ``None`` read as a missing value
        and every row whose ``resname`` equals ``"UNK"`` removed.
    """
    print("Extracting data from "+fname, flush=True)

    # Single-letter type codes and the dtypes handed to pandas.
    code_to_type = {"f": np.float32, "s": str, "i": np.int16}

    names = [entry[0] for entry in other_columns]
    codes = [entry[1] for entry in other_columns]
    # The voxel columns follow the metadata columns and are all floats.
    names.extend("v" + str(k) for k in range(n_voxels))
    codes.extend("f" for _ in range(n_voxels))

    dtypes = {
        name: code_to_type[code]
        for name, code in zip(names, codes)
        if code in code_to_type
    }

    database = pd.read_csv(
        fname,
        sep=" ",
        names=names,
        na_values=["None"],
        dtype=dtypes,
    )

    # Drop the unknown residues in place, keeping the original row labels.
    unknown_rows = database.index[database["resname"] == "UNK"]
    database.drop(unknown_rows, inplace=True)
    return database
30 
31 
def main():
    """Convert a space-separated voxel database file to a pandas pickle.

    Command line: ``convert_MLDB_topkl.py <input.dat> [output_basename]``.
    The table is written to ``<output_basename>.pkl``. When the second
    argument is missing or empty, the base name is the input file name up
    to its first ``.``.

    Raises:
        SystemExit: If no input file is given on the command line.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: python convert_MLDB_topkl.py input.dat "
                 "[output_basename]")
    fname = sys.argv[1]

    # The output base name is optional. Indexing sys.argv[2] unguarded
    # raises IndexError for the one-argument invocation, so check the
    # length first.
    if len(sys.argv) > 2 and sys.argv[2]:
        basename = sys.argv[2]
    else:
        basename = os.path.basename(fname).split('.')[0]

    # Number of voxel columns per row (2744 == 14**3; presumably a
    # 14x14x14 cube -- confirm against the file producer).
    n_voxels = 2744
    # Metadata columns preceding the voxel values, used for parts fitting.
    # Codes: "s" = str, "f" = float, "i" = int.
    other_columns = [("EMDB", "s"), ("resolution", "f"), ("pdbname", "s"),
                     ("chain", "s"), ("resid", "i"), ("resname", "s"),
                     ("ss", "s")]

    # Read the text database, write it as a pickle, and report the time
    # the conversion took.
    start = time.time()
    f_name = basename + '.pkl'
    print(f_name)
    df = import_voxel_file_pickle(fname, other_columns, n_voxels)
    df.to_pickle(f_name)
    end = time.time()
    print('for pickle convert time', end-start)
53 
54 
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()