annotate variant4b.py @ 0:bc556481a1fb draft default tip

planemo upload commit e12eb56d1744da8f7af8ca1819e2617c83fb17a8
author estrain
date Fri, 13 Mar 2026 12:01:17 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
bc556481a1fb planemo upload commit e12eb56d1744da8f7af8ca1819e2617c83fb17a8
estrain
parents:
diff changeset
1 import sys
bc556481a1fb planemo upload commit e12eb56d1744da8f7af8ca1819e2617c83fb17a8
estrain
parents:
diff changeset
2
bc556481a1fb planemo upload commit e12eb56d1744da8f7af8ca1819e2617c83fb17a8
estrain
parents:
diff changeset
def identify_variants_with_genes(input_file_path, output_file_path, simple_name):
    """Tag rows whose marker genes are all "FULL" as serotype "4b variant".

    Reads a tab-separated table whose first line is a header, prepends a
    ``FileName`` column holding ``simple_name`` to every row, and overwrites
    the ``SEROTYPE`` cell with ``"4b variant"`` on each row where the
    ``LMO0737``, ``ORF2110`` and ``ORF2819`` cells all equal ``"FULL"``.

    Args:
        input_file_path: Path of the tab-separated input table.
        output_file_path: Path the annotated table is written to.
        simple_name: Value placed in the new leading ``FileName`` column.

    Returns:
        None. When the input has no data rows a message is printed and no
        output file is written.

    Raises:
        ValueError: If the header lacks ``SEROTYPE`` or any marker-gene column.
    """
    # Marker genes that must all be reported as "FULL" for a row to be tagged.
    genes_of_interest = ['LMO0737', 'ORF2110', 'ORF2819']

    with open(input_file_path, 'r') as file:
        lines = file.readlines()

    # A lone header (or an empty file) carries nothing to annotate.
    if len(lines) <= 1:
        print("Input file does not contain enough data.")
        return

    # Strip only the line terminator: a full strip() would also eat leading or
    # trailing tabs, dropping empty cells and shifting every column index.
    header_text = lines[0].rstrip('\r\n')
    headers = [name.strip() for name in header_text.split('\t')]

    missing = [name for name in genes_of_interest + ['SEROTYPE']
               if name not in headers]
    if missing:
        raise ValueError(
            "Input header is missing required column(s): " + ", ".join(missing)
        )

    gene_indices = [headers.index(gene) for gene in genes_of_interest]
    serotype_index = headers.index('SEROTYPE')
    # Smallest row width for which every index used below is valid.
    required_width = max(gene_indices + [serotype_index]) + 1

    modified_lines = [f"FileName\t{header_text}\n"]

    # NOTE(review): this block was recovered from a view that had lost its
    # indentation; it assumes only the SEROTYPE overwrite is conditional and
    # that every data row is written out - confirm against the original file.
    for line in lines[1:]:
        row_text = line.rstrip('\r\n')
        if not row_text.strip():
            # Blank lines (e.g. a trailing newline) are not data rows.
            continue
        data = row_text.split('\t')
        # Rows too short to hold every required column are passed through
        # unchanged instead of raising IndexError.
        if len(data) >= required_width and all(
            data[index].strip() == 'FULL' for index in gene_indices
        ):
            data[serotype_index] = "4b variant"
        modified_lines.append(f"{simple_name}\t" + '\t'.join(data) + '\n')

    with open(output_file_path, 'w') as file:
        file.writelines(modified_lines)

    print(f'Results written to {output_file_path}')
bc556481a1fb planemo upload commit e12eb56d1744da8f7af8ca1819e2617c83fb17a8
estrain
parents:
diff changeset
44
bc556481a1fb planemo upload commit e12eb56d1744da8f7af8ca1819e2617c83fb17a8
estrain
parents:
diff changeset
if __name__ == "__main__":
    # Command-line entry point: exactly three positional arguments are
    # expected after the script name.
    if len(sys.argv) != 4:
        print("Usage: python script.py <input_file_path> <output_file_path> <simple_name>")
        sys.exit(1)

    # Safe to unpack: the length check above guarantees three values.
    input_file_path, output_file_path, simple_name = sys.argv[1:]
    identify_variants_with_genes(input_file_path, output_file_path, simple_name)
bc556481a1fb planemo upload commit e12eb56d1744da8f7af8ca1819e2617c83fb17a8
estrain
parents:
diff changeset
54