1 """@namespace IMP.emseqfinder.calculate_seq_match_batch
2 Determine percentage sequence overlap for multiple results files."""
def _tally_result_file(result_file):
    """Return ``(total, matched, matched_abs)`` residue counts for one file.

    Each record is pipe-delimited.  The first field must hold at least
    three whitespace-separated tokens; the third one (the SE id) ends in
    ``_<length>``.  The first token of the second field is read as the rank.
    Records that cannot be interpreted are reported on stdout and skipped.
    """
    total_residue = 0
    total_residue_matched = 0
    total_residue_matched_abs = 0

    with open(result_file, 'r') as infile:
        for i, lines in enumerate(infile):
            # Only the very first line may be a header row.
            if i == 0 and "pdbname" in lines.lower():
                print(f"[INFO] Skipping header in {result_file}")
                continue

            # Blank lines (e.g. a trailing newline) carry no record.
            if not lines.strip():
                continue

            line = lines.strip().split('|')
            split_data = line[0].strip().split()

            if len(split_data) < 3:
                print(f"[WARNING] Skipping malformed line in "
                      f"{result_file}: {line[0]}")
                continue

            seid = split_data[2]
            try:
                # The SE length is the integer after the last underscore.
                se_length = int(seid.split('_')[-1])
            except ValueError:
                print(f"[WARNING] Could not parse SE length from: "
                      f"{seid} in {result_file}")
                continue

            # The rank is read from the second field, so that field must
            # exist and contain at least one token.
            rank_fields = line[1].split() if len(line) > 1 else []
            if not rank_fields:
                print(f"[WARNING] Skipping malformed line (missing "
                      f"expected values) in {result_file}: {line}")
                continue

            # Only the value fields are inspected so that an identifier
            # which merely contains the letters "nan" is not discarded.
            if any('nan' in field for field in line[1:]):
                print(f"[INFO] Skipping line with 'nan' values "
                      f"in {result_file}: {line}")
                continue

            # From here on the record counts towards the denominator, even
            # if its rank turns out to be unreadable.
            total_residue += se_length
            rank = rank_fields[0]

            try:
                rank_value = int(rank)
            except ValueError:
                print(f"[WARNING] Invalid rank value in "
                      f"{result_file}: {rank}")
                continue

            if rank_value <= se_length / 3:
                total_residue_matched += se_length
                # NOTE(review): the source as received shows no separate
                # condition for the "absolute" counter, so it shares this
                # guard -- confirm the intended criterion.
                total_residue_matched_abs += se_length

    return total_residue, total_residue_matched, total_residue_matched_abs


def calculate_seq_match(result_files, final_output_file):
    """Append one sequence-match summary row per results file.

    For every path in ``result_files`` the records are tallied and a
    tab-separated row (path, percentage matched, absolute percentage
    matched) is appended to ``final_output_file``.  The header row is
    written first if that file does not exist yet.  Percentages are
    weighted by SE length and rounded to three decimals; a file that
    contributes no residues is reported as ``0.0`` for both columns.

    Args:
        result_files: Iterable of paths to pipe-delimited results files.
        final_output_file: Path of the summary file to create or extend.

    Raises:
        OSError: If a results file or the summary file cannot be opened.
    """
    # Write the header only once so repeated runs keep appending rows.
    if not os.path.exists(final_output_file):
        with open(final_output_file, "w") as outfile:
            outfile.write(
                "Result_File\tTotal_Percentage_Matched\t"
                "Total_Abs_Percentage_Matched\n")

    for result_file in result_files:
        (total_residue, total_residue_matched,
         total_residue_matched_abs) = _tally_result_file(result_file)

        if total_residue == 0:
            # Avoid dividing by zero for empty or fully skipped files.
            total_percentage_matched = 0.0
            total_abs_percentage_matched = 0.0
        else:
            total_percentage_matched = round(
                100 * (total_residue_matched / total_residue), 3)
            total_abs_percentage_matched = round(
                100 * (total_residue_matched_abs / total_residue), 3)

        with open(final_output_file, "a") as outfile:
            outfile.write(f"{result_file}\t{total_percentage_matched}\t"
                          f"{total_abs_percentage_matched}\n")
if __name__ == '__main__':
    # Every command-line argument is treated as a results file; the summary
    # rows are appended to a fixed file name resolved against the current
    # working directory.
    calculate_seq_match(sys.argv[1:], "seq_matching_results.txt")