annotate 0.4.2/bin/process_centrifuge_output.py @ 0:082e0091e813 draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:27:47 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
1 #!/usr/bin/env python
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
2
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
3 import os
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
4 import argparse
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
5 import logging as log
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
6 import pandas as pd
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
7 import numpy as np
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
8 from Bio import SeqIO
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
9
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
10
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
11 def main():
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
12 # READ IN ARGUMENTS
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
13 desc = """
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
14 This script is part of the centriflaken pipeline: It processes centrifuge
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
15 output and produces either a filtered FASTQ or a text file of FASTQ IDs based
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
16 on the supplied taxa/bug
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
17 """
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
18 parser = argparse.ArgumentParser(prog='process_centrifuge_output.py', description=desc)
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
19 parser.add_argument("-v", dest='verbose', action="store_true", help="For more verbose output")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
20 parser.add_argument("-i", dest='input_fastq', required=False,
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
21 help="Path to input FASTQ file (same as input to centrifuge). If not mentioned, \
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
22 a text file of sequence IDs are produced instead of a FASTQ file")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
23 parser.add_argument("-t", dest='taxa_filtered_fastq_file', required=True,
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
24 help="Path to output FASTQ or output text file filtered by the taxa specified")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
25 parser.add_argument("-r", dest='cent_report', required=True, help="Path to centrifuge report")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
26 parser.add_argument("-o", dest='cent_output', required=True, help="Path to centrifuge output")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
27 parser.add_argument("-b", dest='bug', required=True,
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
28 help="Name or fragment of name of the bug by which reads are extracted")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
29 args = parser.parse_args()
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
30
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
31 # MORE INFO IF VERBOSE
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
32 if args.verbose:
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
33 log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
34 else:
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
35 log.basicConfig(format="%(levelname)s: %(message)s")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
36
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
37 # ASSIGN VARIABLES
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
38 input_fastq = args.input_fastq
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
39 taxa_filtered_fastq_file = args.taxa_filtered_fastq_file
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
40 cent_report = args.cent_report
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
41 cent_output = args.cent_output
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
42 bug = args.bug
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
43 report_col_list = ["name", "taxID"]
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
44 output_col_list = ["taxID", "readID"]
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
45
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
46 # Match and filter taxa names and ids from centrifuge report file
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
47 report_df = pd.read_csv(cent_report, delimiter="\t", usecols=report_col_list)
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
48 report_df['name'] = report_df['name'].str.lower()
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
49 filt_report_df = report_df[report_df['name'].str.contains(bug.lower())]
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
50 #print("\nMatching taxa names and ids:\n",filt_report_df)
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
51 taxID_list = filt_report_df['taxID']
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
52
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
53 # Match the above tax ids to read ids from centrifuge output file and deduplicate
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
54 output_df = pd.read_csv(cent_output, delimiter="\t", usecols=output_col_list)
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
55 filt_output_df = output_df.loc[output_df['taxID'].isin(taxID_list)]
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
56 readID_list = filt_output_df['readID']
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
57 readID_dedup_list = np.unique(readID_list)
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
58 TF=open(taxa_filtered_fastq_file, "w")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
59
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
60 if (not input_fastq):
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
61 # print("\nFILTERED READ ID LIST:\n", readID_dedup_list)
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
62 for ID in readID_dedup_list:
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
63 TF.write(f"{ID}\n")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
64 else:
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
65 # Extract filtered reads from input fastq and write to output fastq
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
66 print ("Indexing reads..")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
67 rec = SeqIO.index(input_fastq,"fastq")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
68 for i in readID_dedup_list:
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
69 if i in rec:
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
70 SeqIO.write(rec[i], TF, "fastq")
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
71
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
72 TF.close()
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
73
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
74 if __name__ == "__main__":
082e0091e813 planemo upload
galaxytrakr
parents:
diff changeset
75 main()