IMP logo
IMP Reference Guide  develop.50fdd7fa33,2025/09/05
The Integrative Modeling Platform
calculate_seq_match_batch.py
1 """@namespace IMP.emseqfinder.calculate_seq_match_batch
2  Determine percentage sequence overlap for multiple results files."""
3 
4 
5 import sys
6 import os
7 
8 
def _parse_result_line(raw_line, result_file):
    """Parse one non-empty data line of a results file.

    The line is split on '|'. The third whitespace-separated token of the
    first field (the "seid") supplies the SE length as the integer after
    its last '_'. The first whitespace-separated token of the second field
    is the rank.

    @param raw_line    the line as read from the file
    @param result_file name of the file being read (used in messages only)
    @return a (se_length, rank) tuple of ints, or None if the line has to
            be skipped; a diagnostic is printed for every skipped line.
    """
    fields = raw_line.strip().split('|')

    # The first field must hold at least three tokens; only the third
    # one (the seid) is needed here.
    split_data = fields[0].strip().split()
    if len(split_data) < 3:
        print(f"[WARNING] Skipping malformed line in "
              f"{result_file}: {fields[0]}")
        return None
    seid = split_data[2]

    try:
        se_length = int(seid.split('_')[-1])
    except ValueError:
        print(f"[WARNING] Could not parse SE length from: "
              f"{seid} in {result_file}")
        return None

    # No '|' separator at all: there is no second field to read
    if len(fields) < 2:
        print(f"[WARNING] Skipping malformed line (missing "
              f"expected values) in {result_file}: {fields}")
        return None

    if 'nan' in fields[1]:
        print(f"[INFO] Skipping line with 'nan' values "
              f"in {result_file}: {fields}")
        return None

    # An empty or whitespace-only second field has no rank token;
    # indexing it directly would raise IndexError.
    rank_tokens = fields[1].split()
    if not rank_tokens:
        print(f"[WARNING] Skipping malformed line (missing "
              f"expected values) in {result_file}: {fields}")
        return None

    try:
        rank = int(rank_tokens[0])
    except ValueError:
        print(f"[WARNING] Invalid rank value in "
              f"{result_file}: {rank_tokens[0]}")
        return None

    return se_length, rank


def calculate_seq_match(result_files, final_output_file):
    """Write one summary row per results file to a tab-separated file.

    For every file, the SE lengths of all usable lines are summed. Two
    percentages of that sum are reported, each rounded to 3 decimals:
    the share belonging to lines whose rank is at most a third of the SE
    length, and the share belonging to lines whose rank is exactly 0.
    Both are 0.0 when a file has no usable lines.

    A line is only counted once it has been fully validated, so lines
    that are skipped with a warning never contribute to the totals.

    @param result_files      iterable of paths of results files to read
    @param final_output_file path of the summary file; a header row is
           written only if the file does not exist yet, and one row per
           results file is then appended.
    """
    # Ensure output file exists and write headers only once
    if not os.path.exists(final_output_file):
        with open(final_output_file, "w") as outfile:
            # Use tab separation
            outfile.write("Result_File\tTotal_Percentage_Matched\t"
                          "Total_Abs_Percentage_Matched\n")

    # Process each input file
    for result_file in result_files:
        total_residue = 0
        total_residue_matched = 0
        total_residue_matched_abs = 0

        with open(result_file, 'r') as infile:
            for i, raw_line in enumerate(infile):
                # Skip header line (only recognised on the first line)
                if i == 0 and "pdbname" in raw_line.lower():
                    print(f"[INFO] Skipping header in {result_file}")
                    continue

                # Skip empty lines
                if not raw_line.strip():
                    continue

                parsed = _parse_result_line(raw_line, result_file)
                if parsed is None:
                    continue
                se_length, rank = parsed

                total_residue += se_length
                if rank <= se_length / 3:
                    total_residue_matched += se_length
                if rank == 0:
                    total_residue_matched_abs += se_length

        # Prevent division by zero error
        if total_residue == 0:
            total_percentage_matched = 0.0
            total_abs_percentage_matched = 0.0
        else:
            total_percentage_matched = round(
                100 * (total_residue_matched / total_residue), 3)
            total_abs_percentage_matched = round(
                100 * (total_residue_matched_abs / total_residue), 3)

        # Append new result to the common output file
        with open(final_output_file, "a") as outfile:
            # Use tab separation
            outfile.write(f"{result_file}\t{total_percentage_matched}\t"
                          f"{total_abs_percentage_matched}\n")
96 
97 
if __name__ == '__main__':
    # Script entry point: every command-line argument is taken as a
    # results file; the summary goes to a fixed file name in the
    # current working directory.
    cli_result_files = sys.argv[1:]
    calculate_seq_match(cli_result_files, "seq_matching_results.txt")