kkonganti@0: #!/usr/bin/env python
kkonganti@0: 
kkonganti@0: import os
kkonganti@0: import argparse
kkonganti@0: import logging as log
kkonganti@0: import pandas as pd
kkonganti@0: import numpy as np
kkonganti@0: from Bio import SeqIO
kkonganti@0: 
kkonganti@0: 
kkonganti@0: def main():
kkonganti@0:     # READ IN ARGUMENTS
kkonganti@0:     desc = """
kkonganti@1:     This script is part of the CPIPES - centriflaken pipeline: It processes centrifuge
kkonganti@0:     output and produces either a filtered FASTQ or a text file of FASTQ IDs based
kkonganti@0:     on the supplied taxa/bug
kkonganti@0:     """
kkonganti@0:     parser = argparse.ArgumentParser(prog='process_centrifuge_output.py', description=desc)
kkonganti@0:     parser.add_argument("-v", dest='verbose', action="store_true", help="For more verbose output")
kkonganti@0:     parser.add_argument("-i", dest='input_fastq', required=False,
kkonganti@0:         help="Path to input FASTQ file (same as input to centrifuge). If not mentioned, \
kkonganti@0:             a text file of sequence IDs are produced instead of a FASTQ file")
kkonganti@0:     parser.add_argument("-t", dest='taxa_filtered_fastq_file', required=True,
kkonganti@0:         help="Path to output FASTQ or output text file filtered by the taxa specified")
kkonganti@0:     parser.add_argument("-r", dest='cent_report', required=True, help="Path to centrifuge report")
kkonganti@0:     parser.add_argument("-o", dest='cent_output', required=True, help="Path to centrifuge output")
kkonganti@0:     parser.add_argument("-b", dest='bug', required=True,
kkonganti@0:         help="Name or fragment of name of the bug by which reads are extracted")
kkonganti@0:     args = parser.parse_args()
kkonganti@0: 
kkonganti@0:     # MORE INFO IF VERBOSE
kkonganti@0:     if args.verbose:
kkonganti@0:         log.basicConfig(format="%(levelname)s: %(message)s", level=log.DEBUG)
kkonganti@0:     else:
kkonganti@0:         log.basicConfig(format="%(levelname)s: %(message)s")
kkonganti@0: 
kkonganti@0:     # ASSIGN VARIABLES
kkonganti@0:     input_fastq = args.input_fastq
kkonganti@0:     taxa_filtered_fastq_file = args.taxa_filtered_fastq_file
kkonganti@0:     cent_report = args.cent_report
kkonganti@0:     cent_output = args.cent_output
kkonganti@0:     bug = args.bug
kkonganti@0:     report_col_list = ["name", "taxID"]
kkonganti@0:     output_col_list = ["taxID", "readID"]
kkonganti@0: 
kkonganti@0:     # Match and filter taxa names and ids from centrifuge report file
kkonganti@0:     report_df = pd.read_csv(cent_report, delimiter="\t", usecols=report_col_list)
kkonganti@0:     report_df['name'] = report_df['name'].str.lower()
kkonganti@0:     filt_report_df = report_df[report_df['name'].str.contains(bug.lower())]
kkonganti@0:     #print("\nMatching taxa names and ids:\n",filt_report_df)
kkonganti@0:     taxID_list = filt_report_df['taxID']
kkonganti@0: 
kkonganti@0:     # Match the above tax ids to read ids from centrifuge output file and deduplicate
kkonganti@0:     output_df = pd.read_csv(cent_output, delimiter="\t", usecols=output_col_list)
kkonganti@0:     filt_output_df = output_df.loc[output_df['taxID'].isin(taxID_list)]
kkonganti@0:     readID_list = filt_output_df['readID']
kkonganti@0:     readID_dedup_list = np.unique(readID_list)
kkonganti@0:     TF=open(taxa_filtered_fastq_file, "w")
kkonganti@0: 
kkonganti@0:     if (not input_fastq):
kkonganti@0:         # print("\nFILTERED READ ID LIST:\n", readID_dedup_list)
kkonganti@0:         for ID in readID_dedup_list:
kkonganti@0:             TF.write(f"{ID}\n")
kkonganti@0:     else:
kkonganti@0:         # Extract filtered reads from input fastq and write to output fastq
kkonganti@0:         print ("Indexing reads..")
kkonganti@0:         rec = SeqIO.index(input_fastq,"fastq")
kkonganti@0:         for i in readID_dedup_list:
kkonganti@0:             if i in rec:
kkonganti@0:                 SeqIO.write(rec[i], TF, "fastq")
kkonganti@0: 
kkonganti@0:     TF.close()
kkonganti@0: 
kkonganti@0: if __name__ == "__main__":
kkonganti@1:     main()