Mercurial > repos > galaxytrakr > hfp_centriflaken_awsbatch
comparison 0.4.2/bin/process_centrifuge_output.py @ 0:082e0091e813 draft default tip
planemo upload
| author | galaxytrakr |
|---|---|
| date | Fri, 29 May 2026 13:27:47 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:082e0091e813 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import os | |
| 4 import argparse | |
| 5 import logging as log | |
| 6 import pandas as pd | |
| 7 import numpy as np | |
| 8 from Bio import SeqIO | |
| 9 | |
| 10 | |
| 11 def main(): | |
| 12 # READ IN ARGUMENTS | |
| 13 desc = """ | |
| 14 This script is part of the centriflaken pipeline: It processes centrifuge | |
| 15 output and produces either a filtered FASTQ or a text file of FASTQ IDs based | |
| 16 on the supplied taxa/bug | |
| 17 """ | |
| 18 parser = argparse.ArgumentParser(prog='process_centrifuge_output.py', description=desc) | |
| 19 parser.add_argument("-v", dest='verbose', action="store_true", help="For more verbose output") | |
| 20 parser.add_argument("-i", dest='input_fastq', required=False, | |
| 21 help="Path to input FASTQ file (same as input to centrifuge). If not mentioned, \ | |
| 22 a text file of sequence IDs are produced instead of a FASTQ file") | |
| 23 parser.add_argument("-t", dest='taxa_filtered_fastq_file', required=True, | |
| 24 help="Path to output FASTQ or output text file filtered by the taxa specified") | |
| 25 parser.add_argument("-r", dest='cent_report', required=True, help="Path to centrifuge report") | |
| 26 parser.add_argument("-o", dest='cent_output', required=True, help="Path to centrifuge output") | |
| 27 parser.add_argument("-b", dest='bug', required=True, | |
| 28 help="Name or fragment of name of the bug by which reads are extracted") | |
| 29 args = parser.parse_args() | |
| 30 | |
| 31 # MORE INFO IF VERBOSE | |
| 32 if args.verbose: | |
| 33 log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG) | |
| 34 else: | |
| 35 log.basicConfig(format="%(levelname)s: %(message)s") | |
| 36 | |
| 37 # ASSIGN VARIABLES | |
| 38 input_fastq = args.input_fastq | |
| 39 taxa_filtered_fastq_file = args.taxa_filtered_fastq_file | |
| 40 cent_report = args.cent_report | |
| 41 cent_output = args.cent_output | |
| 42 bug = args.bug | |
| 43 report_col_list = ["name", "taxID"] | |
| 44 output_col_list = ["taxID", "readID"] | |
| 45 | |
| 46 # Match and filter taxa names and ids from centrifuge report file | |
| 47 report_df = pd.read_csv(cent_report, delimiter="\t", usecols=report_col_list) | |
| 48 report_df['name'] = report_df['name'].str.lower() | |
| 49 filt_report_df = report_df[report_df['name'].str.contains(bug.lower())] | |
| 50 #print("\nMatching taxa names and ids:\n",filt_report_df) | |
| 51 taxID_list = filt_report_df['taxID'] | |
| 52 | |
| 53 # Match the above tax ids to read ids from centrifuge output file and deduplicate | |
| 54 output_df = pd.read_csv(cent_output, delimiter="\t", usecols=output_col_list) | |
| 55 filt_output_df = output_df.loc[output_df['taxID'].isin(taxID_list)] | |
| 56 readID_list = filt_output_df['readID'] | |
| 57 readID_dedup_list = np.unique(readID_list) | |
| 58 TF=open(taxa_filtered_fastq_file, "w") | |
| 59 | |
| 60 if (not input_fastq): | |
| 61 # print("\nFILTERED READ ID LIST:\n", readID_dedup_list) | |
| 62 for ID in readID_dedup_list: | |
| 63 TF.write(f"{ID}\n") | |
| 64 else: | |
| 65 # Extract filtered reads from input fastq and write to output fastq | |
| 66 print ("Indexing reads..") | |
| 67 rec = SeqIO.index(input_fastq,"fastq") | |
| 68 for i in readID_dedup_list: | |
| 69 if i in rec: | |
| 70 SeqIO.write(rec[i], TF, "fastq") | |
| 71 | |
| 72 TF.close() | |
| 73 | |
| 74 if __name__ == "__main__": | |
| 75 main() |
