kkonganti@0: #!/usr/bin/env python kkonganti@0: kkonganti@0: import os kkonganti@0: import argparse kkonganti@0: import logging as log kkonganti@0: import pandas as pd kkonganti@0: import numpy as np kkonganti@0: from Bio import SeqIO kkonganti@0: kkonganti@0: kkonganti@0: def main(): kkonganti@0: # READ IN ARGUMENTS kkonganti@0: desc = """ kkonganti@1: This script is part of the CPIPES - centriflaken pipeline: It processes centrifuge kkonganti@0: output and produces either a filtered FASTQ or a text file of FASTQ IDs based kkonganti@0: on the supplied taxa/bug kkonganti@0: """ kkonganti@0: parser = argparse.ArgumentParser(prog='process_centrifuge_output.py', description=desc) kkonganti@0: parser.add_argument("-v", dest='verbose', action="store_true", help="For more verbose output") kkonganti@0: parser.add_argument("-i", dest='input_fastq', required=False, kkonganti@0: help="Path to input FASTQ file (same as input to centrifuge). If not mentioned, \ kkonganti@0: a text file of sequence IDs are produced instead of a FASTQ file") kkonganti@0: parser.add_argument("-t", dest='taxa_filtered_fastq_file', required=True, kkonganti@0: help="Path to output FASTQ or output text file filtered by the taxa specified") kkonganti@0: parser.add_argument("-r", dest='cent_report', required=True, help="Path to centrifuge report") kkonganti@0: parser.add_argument("-o", dest='cent_output', required=True, help="Path to centrifuge output") kkonganti@0: parser.add_argument("-b", dest='bug', required=True, kkonganti@0: help="Name or fragment of name of the bug by which reads are extracted") kkonganti@0: args = parser.parse_args() kkonganti@0: kkonganti@0: # MORE INFO IF VERBOSE kkonganti@0: if args.verbose: kkonganti@0: log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG) kkonganti@0: else: kkonganti@0: log.basicConfig(format="%(levelname)s: %(message)s") kkonganti@0: kkonganti@0: # ASSIGN VARIABLES kkonganti@0: input_fastq = args.input_fastq kkonganti@0: taxa_filtered_fastq_file = args.taxa_filtered_fastq_file kkonganti@0: cent_report = args.cent_report kkonganti@0: cent_output = args.cent_output kkonganti@0: bug = args.bug kkonganti@0: report_col_list = ["name", "taxID"] kkonganti@0: output_col_list = ["taxID", "readID"] kkonganti@0: kkonganti@0: # Match and filter taxa names and ids from centrifuge report file kkonganti@0: report_df = pd.read_csv(cent_report, delimiter="\t", usecols=report_col_list) kkonganti@0: report_df['name'] = report_df['name'].str.lower() kkonganti@0: filt_report_df = report_df[report_df['name'].str.contains(bug.lower())] kkonganti@0: #print("\nMatching taxa names and ids:\n",filt_report_df) kkonganti@0: taxID_list = filt_report_df['taxID'] kkonganti@0: kkonganti@0: # Match the above tax ids to read ids from centrifuge output file and deduplicate kkonganti@0: output_df = pd.read_csv(cent_output, delimiter="\t", usecols=output_col_list) kkonganti@0: filt_output_df = output_df.loc[output_df['taxID'].isin(taxID_list)] kkonganti@0: readID_list = filt_output_df['readID'] kkonganti@0: readID_dedup_list = np.unique(readID_list) kkonganti@0: TF=open(taxa_filtered_fastq_file, "w") kkonganti@0: kkonganti@0: if (not input_fastq): kkonganti@0: # print("\nFILTERED READ ID LIST:\n", readID_dedup_list) kkonganti@0: for ID in readID_dedup_list: kkonganti@0: TF.write(f"{ID}\n") kkonganti@0: else: kkonganti@0: # Extract filtered reads from input fastq and write to output fastq kkonganti@0: print ("Indexing reads..") kkonganti@0: rec = SeqIO.index(input_fastq,"fastq") kkonganti@0: for i in readID_dedup_list: kkonganti@0: if i in rec: kkonganti@0: SeqIO.write(rec[i], TF, "fastq") kkonganti@0: kkonganti@0: TF.close() kkonganti@0: kkonganti@0: if __name__ == "__main__": kkonganti@1: main()