IMP logo
IMP Reference Guide  develop.50fdd7fa33,2025/09/05
The Integrative Modeling Platform
batch.py
1 import subprocess
2 import shutil
3 import contextlib
4 import pathlib
5 import tempfile
6 import gzip
7 import os
8 import sys
9 from . import get_data_path
10 from IMP import ArgumentParser
11 from IMP.emseqfinder.compute_dynamic_threshold import compute_threshold
12 from IMP.emseqfinder.calculate_seq_match_batch import calculate_seq_match
13 
14 
15 __doc__ = "Perform all steps of the emseqfinder protocol."
16 
17 
18 def process_pdb(pdbfile, resolution, database_home, reference_map,
19  reference_pdb, final_output_file):
20  base = pathlib.Path(pdbfile.stem)
21  mapfile = pathlib.Path('cryoem_maps') / base.with_suffix('.map')
22  fastafile = pathlib.Path('fasta_files') / base.with_suffix('.fasta')
23 
24  # Check file existence
25  if not mapfile.exists():
26  print(f"[WARNING] Missing map file for {base!r}. Skipping...")
27  return
28  if not fastafile.exists():
29  print(f"[WARNING] Missing FASTA file for {base!r}. Skipping...")
30  return
31 
32  print("===============================================")
33  print(f"[INFO] Processing {base}")
34 
35  threshold = compute_threshold(mapfile)
36  print(f"[INFO] Using threshold: {threshold:.4f}")
37 
38  # Clean and recreate project directory
39  shutil.rmtree(base, ignore_errors=True)
40  (base / '0system').mkdir(parents=True)
41  (base / '1structure_elements').mkdir()
42 
43  # STRIDE assignment
44  stride_file = base / base.with_suffix('.stride')
45  stride_file.unlink(missing_ok=True)
46  p = subprocess.run(["stride", pdbfile, "-rA", "-f" + str(stride_file)])
47  if p.returncode != 0:
48  print(f"[ERROR] STRIDE failed for {base}. Skipping...")
49  return
50 
51  # Create unique fragment directory
52  frag_dir = pathlib.Path(f"fragments_{base}")
53  shutil.rmtree(frag_dir, ignore_errors=True)
54  frag_dir.mkdir(parents=True)
55  p = subprocess.run(
56  [sys.executable, '-m',
57  'IMP.emseqfinder.fragdb.get_fraglib_from_native', pdbfile,
58  stride_file, frag_dir])
59  if p.returncode != 0:
60  print(f"[ERROR] Fragment generation failed. Skipping {base}.")
61  return
62 
63  # Copy system files
64  shutil.copy(pdbfile, base / "0system" / "native.pdb")
65  shutil.copy(mapfile, base / "0system" / "emdb.map")
66  shutil.copy(fastafile, base / "0system")
67  for frag in frag_dir.glob("*.pdb"):
68  shutil.copy(frag, base / "1structure_elements")
69 
70  # Normalize map
71  p = subprocess.run(
72  [sys.executable, '-m',
73  'IMP.emseqfinder.mldb.normalize_map_for_parts_fitting', base,
74  '--thresh', str(threshold),
75  '--database_home', database_home,
76  '--reference_map', reference_map])
77  if p.returncode != 0:
78  print(f"[ERROR] Normalization failed. Skipping {base}.")
79  return
80 
81  # Remove old database files if exist
82  for p in (f"{base}_ML_side.dat", f"{base}_ML_side.pkl",
83  f"{base}_ML_side_ML_prob.dat", f"{base}_ML_output.txt"):
84  pathlib.Path(p).unlink(missing_ok=True)
85 
86  # Create database
87  p = subprocess.run(
88  [sys.executable, '-m',
89  'IMP.emseqfinder.mldb.get_database_for_one_emdb_using_parts', base,
90  '--database_home', database_home,
91  '--reference_pdb', reference_pdb,
92  f"{base}_ML_side.dat", str(resolution)])
93  if p.returncode != 0:
94  print(f"[ERROR] ML DB generation failed. Skipping {base}.")
95  return
96 
97  # Convert to pickle
98  p = subprocess.run(
99  [sys.executable, '-m', 'IMP.emseqfinder.convert_MLDB_topkl',
100  f"{base}_ML_side.dat", f"{base}_ML_side"])
101  if p.returncode != 0:
102  print(f"[ERROR] PKL conversion failed. Skipping {base}.")
103  return
104 
105  # Prediction
106  p = subprocess.run(
107  [sys.executable, '-m', 'IMP.emseqfinder.predict',
108  f"{base}_ML_side.pkl", '10000'])
109  if p.returncode != 0:
110  print(f"[ERROR] Prediction failed. Skipping {base}.")
111  return
112 
113  # Evaluate prediction
114  p = subprocess.run(
115  [sys.executable, '-m', 'IMP.emseqfinder.evaluate_output_database',
116  f"{base}_ML_side_ML_prob.dat", f"{base}_ML_output.txt"])
117  if p.returncode != 0:
118  print(f"[ERROR] Evaluation failed. Skipping {base}.")
119  return
120 
121  # Calculate and append sequence match
122  calculate_seq_match([f"{base}_ML_output.txt"], final_output_file)
123 
124  # Cleanup
125  shutil.rmtree(frag_dir, ignore_errors=True)
126 
127  print(f"[INFO] Finished processing {base} ✅")
128  print("===============================================")
129  print("")
130 
131 
132 def parse_args():
133  parser = ArgumentParser(
134  description="Perform all steps of the emseqfinder protocol")
135  parser.add_argument(
136  "--db-resolution", dest="resolution", type=float,
137  help="Resolution used for database generation", default=4.0)
138  parser.add_argument(
139  "--database_home", dest="database_home", type=str,
140  help="Directory containing all data files used in the protocol",
141  default=".")
142  parser.add_argument(
143  "--reference_map", dest="reference_map", type=str,
144  help="reference map for voxel data extraction",
145  default=get_data_path('reference/ref.mrc.gz'))
146  parser.add_argument(
147  "--reference_pdb", type=str, help="Reference PDB",
148  default=get_data_path('reference/ref.pdb'))
149 
150  return parser.parse_args()
151 
152 
153 @contextlib.contextmanager
154 def get_reference_map(args):
155  if args.reference_map.endswith('.gz'):
156  # IMP's mrc reader cannot read compressed files, so decompress
157  # to a temporary location and return that instead
158  with tempfile.TemporaryDirectory() as tmpdir:
159  tmpfile = os.path.join(tmpdir, 'ref.mrc')
160  with gzip.open(args.reference_map, 'rb') as fh_in:
161  with open(tmpfile, 'wb') as fh_out:
162  shutil.copyfileobj(fh_in, fh_out)
163  yield tmpfile
164  else:
165  yield args.reference_map
166 
167 
168 def main():
169  args = parse_args()
170 
171  # Output file for overall results
172  final_output_file = pathlib.Path("batch_matching_results.txt")
173 
174  with get_reference_map(args) as reference_map:
175  for pdbfile in pathlib.Path("pdb_files").glob("*.pdb"):
176  process_pdb(pdbfile, args.resolution, args.database_home,
177  reference_map, args.reference_pdb, final_output_file)
178 
179 
180 if __name__ == '__main__':
181  main()
std::string get_data_path(std::string file_name)
Return the full path to one of this module's data files.
Determine a threshold for an EM map.
Determine percentage sequence overlap for multiple results files.