9 from .
import get_data_path
10 from IMP
import ArgumentParser
15 __doc__ =
"Perform all steps of the emseqfinder protocol."
18 def process_pdb(pdbfile, resolution, database_home, reference_map,
19 reference_pdb, final_output_file):
20 base = pathlib.Path(pdbfile.stem)
21 mapfile = pathlib.Path(
'cryoem_maps') / base.with_suffix(
'.map')
22 fastafile = pathlib.Path(
'fasta_files') / base.with_suffix(
'.fasta')
25 if not mapfile.exists():
26 print(f
"[WARNING] Missing map file for {base!r}. Skipping...")
28 if not fastafile.exists():
29 print(f
"[WARNING] Missing FASTA file for {base!r}. Skipping...")
32 print(
"===============================================")
33 print(f
"[INFO] Processing {base}")
35 threshold = compute_threshold(mapfile)
36 print(f
"[INFO] Using threshold: {threshold:.4f}")
39 shutil.rmtree(base, ignore_errors=
True)
40 (base /
'0system').mkdir(parents=
True)
41 (base /
'1structure_elements').mkdir()
44 stride_file = base / base.with_suffix(
'.stride')
45 stride_file.unlink(missing_ok=
True)
46 p = subprocess.run([
"stride", pdbfile,
"-rA",
"-f" + str(stride_file)])
48 print(f
"[ERROR] STRIDE failed for {base}. Skipping...")
52 frag_dir = pathlib.Path(f
"fragments_{base}")
53 shutil.rmtree(frag_dir, ignore_errors=
True)
54 frag_dir.mkdir(parents=
True)
56 [sys.executable,
'-m',
57 'IMP.emseqfinder.fragdb.get_fraglib_from_native', pdbfile,
58 stride_file, frag_dir])
60 print(f
"[ERROR] Fragment generation failed. Skipping {base}.")
64 shutil.copy(pdbfile, base /
"0system" /
"native.pdb")
65 shutil.copy(mapfile, base /
"0system" /
"emdb.map")
66 shutil.copy(fastafile, base /
"0system")
67 for frag
in frag_dir.glob(
"*.pdb"):
68 shutil.copy(frag, base /
"1structure_elements")
72 [sys.executable,
'-m',
73 'IMP.emseqfinder.mldb.normalize_map_for_parts_fitting', base,
74 '--thresh', str(threshold),
75 '--database_home', database_home,
76 '--reference_map', reference_map])
78 print(f
"[ERROR] Normalization failed. Skipping {base}.")
82 for p
in (f
"{base}_ML_side.dat", f
"{base}_ML_side.pkl",
83 f
"{base}_ML_side_ML_prob.dat", f
"{base}_ML_output.txt"):
84 pathlib.Path(p).unlink(missing_ok=
True)
88 [sys.executable,
'-m',
89 'IMP.emseqfinder.mldb.get_database_for_one_emdb_using_parts', base,
90 '--database_home', database_home,
91 '--reference_pdb', reference_pdb,
92 f
"{base}_ML_side.dat", str(resolution)])
94 print(f
"[ERROR] ML DB generation failed. Skipping {base}.")
99 [sys.executable,
'-m',
'IMP.emseqfinder.convert_MLDB_topkl',
100 f
"{base}_ML_side.dat", f
"{base}_ML_side"])
101 if p.returncode != 0:
102 print(f
"[ERROR] PKL conversion failed. Skipping {base}.")
107 [sys.executable,
'-m',
'IMP.emseqfinder.predict',
108 f
"{base}_ML_side.pkl",
'10000'])
109 if p.returncode != 0:
110 print(f
"[ERROR] Prediction failed. Skipping {base}.")
115 [sys.executable,
'-m',
'IMP.emseqfinder.evaluate_output_database',
116 f
"{base}_ML_side_ML_prob.dat", f
"{base}_ML_output.txt"])
117 if p.returncode != 0:
118 print(f
"[ERROR] Evaluation failed. Skipping {base}.")
122 calculate_seq_match([f
"{base}_ML_output.txt"], final_output_file)
125 shutil.rmtree(frag_dir, ignore_errors=
True)
127 print(f
"[INFO] Finished processing {base} ✅")
128 print(
"===============================================")
133 parser = ArgumentParser(
134 description=
"Perform all steps of the emseqfinder protocol")
136 "--db-resolution", dest=
"resolution", type=float,
137 help=
"Resolution used for database generation", default=4.0)
139 "--database_home", dest=
"database_home", type=str,
140 help=
"Directory containing all data files used in the protocol",
143 "--reference_map", dest=
"reference_map", type=str,
144 help=
"reference map for voxel data extraction",
147 "--reference_pdb", type=str, help=
"Reference PDB",
150 return parser.parse_args()
153 @contextlib.contextmanager
154 def get_reference_map(args):
155 if args.reference_map.endswith(
'.gz'):
158 with tempfile.TemporaryDirectory()
as tmpdir:
159 tmpfile = os.path.join(tmpdir,
'ref.mrc')
160 with gzip.open(args.reference_map,
'rb')
as fh_in:
161 with open(tmpfile,
'wb')
as fh_out:
162 shutil.copyfileobj(fh_in, fh_out)
165 yield args.reference_map
172 final_output_file = pathlib.Path(
"batch_matching_results.txt")
174 with get_reference_map(args)
as reference_map:
175 for pdbfile
in pathlib.Path(
"pdb_files").glob(
"*.pdb"):
176 process_pdb(pdbfile, args.resolution, args.database_home,
177 reference_map, args.reference_pdb, final_output_file)
180 if __name__ ==
'__main__':
std::string get_data_path(std::string file_name)
Return the full path to one of this module's data files.
Determine a threshold for an EM map.
Determine percentage sequence overlap for multiple results files.