#!/usr/bin/env python3
"""SalmID - rapid k-mer based Salmonella identifier from sequence data.

Reads FASTQ files (plain or gzip-compressed), collects reads whose central
k-mer occurs in the invA or rpoB k-mer sets shipped next to this script, and
reports per-clade k-mer recovery (percentage), mean k-mer coverage, or a
taxonomic call.
"""

import gzip
import io
import pickle
import os
import sys

from argparse import ArgumentParser
try:
    from version import SalmID_version
except ImportError:
    # The version module is optional; fall back to a placeholder string.
    SalmID_version = "version unknown"


# IUPAC nucleotide codes (upper case only) mapped to their complements.
_COMPLEMENT = {
    'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'M': 'K', 'R': 'Y',
    'W': 'W', 'S': 'S', 'Y': 'R', 'K': 'M', 'V': 'B', 'H': 'D', 'D': 'H',
    'B': 'V',
}

# k-mer length passed to kmer_lists() by main().
_KMER_SIZE = 27

# In 'quick' mode reading stops once the summed length of reads matching the
# second k-mer dictionary reaches this value.
_QUICK_MODE_COVERAGE_CUTOFF = 800000

# Divisors used for the "expected coverage" column of the taxonomy report.
# NOTE(review): presumably approximate genome sizes in bp (Listeria vs.
# Salmonella/Escherichia) - confirm.
_EXPECTED_COV_DIVISOR_LISTERIA = 3000000
_EXPECTED_COV_DIVISOR_DEFAULT = 5000000

_TAXA = [
    'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.',
    'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)',
    'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)',
    'S. enterica subsp. salamae (invA: clade a)',
    'S. enterica subsp. salamae (invA: clade b)',
    'S. enterica subsp. arizonae (invA)',
    'S. enterica subsp. diarizonae (invA)',
    'S. enterica subsp. houtenae (invA)',
    'S. enterica subsp. indica (invA)',
    'S. enterica subsp. VII (invA)',
    'S. enterica subsp. salamae (invA: clade VIII)',
]

# Header of the percentage / coverage table; kept byte-for-byte as emitted by
# earlier versions so downstream parsers keep working.
_TABLE_HEADER = (
    'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' +
    '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' +
    ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' +
    '\tsubsp. II (clade VIII : invA)')


def reverse_complement(sequence):
    """Return the reverse complement of an upper-case IUPAC nucleotide string.

    :param sequence: string of upper-case IUPAC nucleotide codes
    :return: reverse-complemented string
    :raises KeyError: if the sequence contains a character that is not an
        upper-case IUPAC code
    """
    return "".join(_COMPLEMENT[base] for base in reversed(sequence))


def parse_args():
    "Parse the input arguments, use '-h' for help."
    parser = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data')
    # inputs
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SalmID_version)
    parser.add_argument(
        '-i', '--input_file', type=str, required=False, default='None', metavar='your_fastqgz',
        help='Single fastq.gz file input, include path to file if file is not in same directory ')
    parser.add_argument(
        '-e', '--extension', type=str, required=False, default='.fastq.gz', metavar='file_extension',
        help='File extension, if specified without "--input_dir", SalmID will attempt to ID all files\n' +
             ' with this extension in current directory, otherwise files in input directory')

    parser.add_argument(
        '-d', '--input_dir', type=str, required=False, default='.', metavar='directory',
        help='Directory which contains data for identification, when not specified files in current directory will be analyzed.')
    parser.add_argument(
        '-r', '--report', type=str, required=False, default='percentage',
        metavar='percentage, coverage or taxonomy',
        help='Report either percentage ("percentage") of clade specific kmers recovered, average kmer-coverage ("cov"), or '
             'taxonomy (taxonomic species ID, plus observed mean k-mer coverages and expected coverage).')
    parser.add_argument(
        '-m', '--mode', type=str, required=False, default='quick', metavar='quick or thorough',
        help='Quick [quick] or thorough [thorough] mode')
    # Called without any argument: show the help text instead of running on
    # the defaults.
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    return parser.parse_args()


def _open_fastq(path):
    """Open a FASTQ file for line-wise reading.

    Files ending in ".gz" are opened as gzip and yield bytes lines; any other
    file is opened in text mode and yields str lines.
    """
    if path.endswith(".gz"):
        return io.BufferedReader(gzip.open(path))
    return open(path, "r")


def get_av_read_length(file, max_reads=100):
    """Return the mean read length over the first reads of a FASTQ file.

    Only the sequence line (2nd line of every 4-line record) is measured,
    with surrounding whitespace stripped.

    :param file: path to a FASTQ file; gzip is assumed if it ends in ".gz"
    :param max_reads: maximum number of reads to sample (default 100)
    :return: mean length of the sampled reads, or 0 if the file has no reads
    """
    n_reads = 0
    total_length = 0
    with _open_fastq(file) as handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number % 4 != 2:
                continue
            total_length += len(line.strip())
            n_reads += 1
            if n_reads >= max_reads:
                break
    if n_reads == 0:
        return 0
    # Divide by the number of reads actually seen, so files holding fewer
    # than max_reads reads still give a correct average.
    return total_length / n_reads


def createKmerDict_reads(list_of_strings, kmer):
    """Count all k-mers, and their reverse complements, in a list of reads.

    Reads are stripped of surrounding whitespace and upper-cased before
    k-mers are extracted, so lower-case input is counted under upper-case
    keys. Reads shorter than the k-mer length contribute nothing.

    :param list_of_strings: iterable of read sequences (str)
    :param kmer: k-mer length
    :return: dict mapping k-mer -> number of times it was seen on either
        strand
    :raises KeyError: if a read contains a non-IUPAC character
    """
    kmer_table = {}
    for string in list_of_strings:
        sequence = string.strip().upper()
        for i in range(len(sequence) - kmer + 1):
            new_mer = sequence[i:i + kmer]
            new_mer_rc = reverse_complement(new_mer)
            kmer_table[new_mer] = kmer_table.get(new_mer, 0) + 1
            kmer_table[new_mer_rc] = kmer_table.get(new_mer_rc, 0) + 1
    return kmer_table


def _kmerize_reads(reads, k):
    """Return (k-mer count dict, mean count per distinct k-mer) for reads.

    The mean is None when there are no reads or no k-mers.
    """
    if not reads:
        return {}, None
    table = createKmerDict_reads(reads, k)
    if not table:
        return table, None
    return table, sum(table.values()) / len(table)


def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
    """Collect reads hitting either of two k-mer sets and k-merize them.

    For every read the k-mer in the middle of the sequence line is looked up
    in both dictionaries; reads with a hit are kept for that dictionary and
    afterwards broken into k-mers (both strands).

    :param file: path to a FASTQ file; gzip is assumed if it ends in ".gz"
    :param k: k-mer length
    :param kmerDict_1: container of target k-mers for the first locus
    :param kmerDict_2: container of target k-mers for the second locus
    :param mode: 'quick' stops reading once enough reads matching
        kmerDict_2 were collected; any other value reads the whole file
    :return: tuple (k-mer counts locus 1, k-mer counts locus 2,
        mean count locus 1 or None, mean count locus 2 or None,
        number of reads inspected)
    """
    reads_1 = []
    reads_2 = []
    total_coverage_2 = 0
    total_reads = 0
    is_gz = file.endswith(".gz")
    with _open_fastq(file) as handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number % 4 != 2:
                continue
            total_reads += 1
            if is_gz:
                line = line.decode()
            # The line still carries its newline here; the offset is computed
            # on that length, exactly as earlier versions did.
            start = (len(line) - k) // 2
            s1 = line[start:k + start]
            if s1 in kmerDict_1:
                reads_1.append(line)
            if s1 in kmerDict_2:
                total_coverage_2 += len(line)
                reads_2.append(line)
            if mode == 'quick' and total_coverage_2 >= _QUICK_MODE_COVERAGE_CUTOFF:
                break

    kmer_Dict1, mean_1 = _kmerize_reads(reads_1, k)
    kmer_Dict2, mean_2 = _kmerize_reads(reads_2, k)
    return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads


def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
    '''
    Given an iterable (list, set, dictionary) returns mean coverage for the kmers in iterable
    :param iterable: set, list or dictionary containing kmers
    :param kmer_dict: dictionary with kmers as keys, kmer-frequency as value
    :param clade_specific_kmers: list, dict or set of clade specific kmers
    :return: mean frequency as float (0 if iterable is empty)
    '''
    if len(iterable) == 0:
        return 0
    # The denominator is the size of the full clade-specific set, so k-mers
    # that were not observed count as zero coverage.
    return sum(kmer_dict[value] for value in iterable) / len(clade_specific_kmers)


def _percent_recovered(clade_kmers, observed_kmers):
    """Return the percentage of clade_kmers present in observed_kmers."""
    if not clade_kmers:
        return 0.0
    return (len(clade_kmers & observed_kmers) / len(clade_kmers)) * 100


def kmer_lists(query_fastq_gz, k,
               allmers, allmers_rpoB,
               uniqmers_bongori,
               uniqmers_I,
               uniqmers_IIa,
               uniqmers_IIb,
               uniqmers_IIIa,
               uniqmers_IIIb,
               uniqmers_IV,
               uniqmers_VI,
               uniqmers_VII,
               uniqmers_VIII,
               uniqmers_bongori_rpoB,
               uniqmers_S_enterica_rpoB,
               uniqmers_Escherichia_rpoB,
               uniqmers_Listeria_ss_rpoB,
               uniqmers_Lmono_rpoB,
               mode):
    """Score one FASTQ file against all clade-specific k-mer sets.

    :param query_fastq_gz: path to the FASTQ file to analyse
    :param k: k-mer length
    :param allmers: all invA k-mers, used to select invA reads
    :param allmers_rpoB: all rpoB k-mers, used to select rpoB reads
    :param uniqmers_*: sets of clade-specific k-mers (invA or rpoB)
    :param mode: 'quick' or 'thorough', forwarded to
        target_read_kmerizer_multi
    :return: tuple (locus_scores, coverages, total_reads); both lists hold
        15 values ordered as the five rpoB clades (Listeria ss,
        L. monocytogenes, Escherichia, S. bongori, S. enterica) followed by
        the ten invA clades (bongori, I, IIa, IIb, IIIa, IIIb, IV, VI, VII,
        VIII)
    """
    dict_invA, dict_rpoB, _, _, total_reads = target_read_kmerizer_multi(
        query_fastq_gz, k, allmers, allmers_rpoB, mode)
    target_mers_invA = set(dict_invA)
    target_mers_rpoB = set(dict_rpoB)
    if not target_mers_invA:
        # Warn on stderr so the tab-separated report on stdout stays clean;
        # the scores below are simply all zero for invA in this case.
        print('No reads found matching invA, no Salmonella in sample?', file=sys.stderr)

    # (clade-specific k-mers, observed k-mers, observed k-mer counts) in
    # report column order.
    rpoB_clades = [uniqmers_Listeria_ss_rpoB, uniqmers_Lmono_rpoB,
                   uniqmers_Escherichia_rpoB, uniqmers_bongori_rpoB,
                   uniqmers_S_enterica_rpoB]
    invA_clades = [uniqmers_bongori, uniqmers_I, uniqmers_IIa, uniqmers_IIb,
                   uniqmers_IIIa, uniqmers_IIIb, uniqmers_IV, uniqmers_VI,
                   uniqmers_VII, uniqmers_VIII]
    loci = ([(clade, target_mers_rpoB, dict_rpoB) for clade in rpoB_clades] +
            [(clade, target_mers_invA, dict_invA) for clade in invA_clades])

    locus_scores = []
    coverages = []
    for clade_kmers, observed_kmers, counts in loci:
        locus_scores.append(_percent_recovered(clade_kmers, observed_kmers))
        coverages.append(
            mean_cov_selected_kmers(clade_kmers & observed_kmers, counts, clade_kmers))
    return locus_scores, coverages, total_reads


def _argmax(values):
    """Return the index of the first maximal element of a non-empty list."""
    return values.index(max(values))


def report_taxon(locus_covs, average_read_length, number_of_reads):
    """Turn per-clade coverages into a taxonomic call.

    :param locus_covs: list of 15 coverages; indices 0-4 are the rpoB clades
        and 5-14 the invA clades, in the order returned by kmer_lists
    :param average_read_length: mean read length of the sample
    :param number_of_reads: number of reads inspected
    :return: tuple (rpoB call, invA call, expected coverage) where each call
        is a (taxon name, coverage) tuple
    """
    if sum(locus_covs) < 1:
        return ('No rpoB matches!', 0), ('No invA matches!', 0), 0.0

    rpoB_covs = locus_covs[0:5]
    if sum(rpoB_covs) > 0:
        # Best clade when the generic "Listeria ss" entry (index 0) is left out.
        best_rpoB = _argmax(locus_covs[1:5]) + 1
        all_rpoB = _argmax(rpoB_covs)
        if locus_covs[best_rpoB] != 0 and all_rpoB == 0:
            rpoB = (_TAXA[best_rpoB], locus_covs[best_rpoB])
        elif all_rpoB == 0 and round(sum(locus_covs[1:5]), 1) < 1:
            # Only the generic Listeria k-mers were recovered.
            rpoB = (_TAXA[0], locus_covs[0])
        else:
            rpoB = (_TAXA[best_rpoB], locus_covs[best_rpoB])
    else:
        rpoB = ('No rpoB matches!', 0)

    invA_covs = locus_covs[5:]
    if sum(invA_covs) > 0:
        best_invA = _argmax(invA_covs) + 5
        invA = (_TAXA[best_invA], locus_covs[best_invA])
    else:
        invA = ('No invA matches!', 0)

    total_bases = average_read_length * number_of_reads
    if 'Listeria' in rpoB[0]:
        return rpoB, invA, total_bases / _EXPECTED_COV_DIVISOR_LISTERIA
    return rpoB, invA, total_bases / _EXPECTED_COV_DIVISOR_DEFAULT


def _load_kmer_sets(path):
    """Load a pickled dictionary of k-mer sets.

    NOTE: pickle can execute arbitrary code while loading; only the data
    files shipped with SalmID should be loaded here.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


def main():
    """Command-line entry point: analyse the requested files and print a report."""
    ex_dir = os.path.dirname(os.path.realpath(__file__))
    args = parse_args()
    if args.input_file != 'None':
        files = [args.input_file]
    else:
        files = [args.input_dir + '/' + name
                 for name in os.listdir(args.input_dir)
                 if name.endswith(args.extension)]
    report = args.report

    sets_dict_invA = _load_kmer_sets(ex_dir + "/invA_mers_dict")
    sets_dict = _load_kmer_sets(ex_dir + "/rpoB_mers_dict")

    # Positional arguments of kmer_lists() after the file name and k.
    kmer_args = (
        sets_dict_invA['allmers'],
        sets_dict['allmers'],
        sets_dict_invA['uniqmers_bongori'],
        sets_dict_invA['uniqmers_I'],
        sets_dict_invA['uniqmers_IIa'],
        sets_dict_invA['uniqmers_IIb'],
        sets_dict_invA['uniqmers_IIIa'],
        sets_dict_invA['uniqmers_IIIb'],
        sets_dict_invA['uniqmers_IV'],
        sets_dict_invA['uniqmers_VI'],
        sets_dict_invA['uniqmers_VII'],
        sets_dict_invA['uniqmers_VIII'],
        sets_dict['uniqmers_bongori'],
        sets_dict['uniqmers_S_enterica'],
        sets_dict['uniqmers_Escherichia'],
        sets_dict['uniqmers_Listeria_ss'],
        sets_dict['uniqmers_L_mono'],
        args.mode,
    )

    if report == 'taxonomy':
        print('file\trpoB\tinvA\texpected coverage')
    else:
        print(_TABLE_HEADER)

    for path in files:
        locus_scores, coverages, reads = kmer_lists(path, _KMER_SIZE, *kmer_args)
        file_name = path.split('/')[-1]
        if report == 'taxonomy':
            pretty_covs = [round(cov, 1) for cov in coverages]
            rpoB, invA, expected_cov = report_taxon(
                pretty_covs, get_av_read_length(path), reads)
            columns = [file_name,
                       '{}[{}]'.format(rpoB[0], rpoB[1]),
                       '{}[{}]'.format(invA[0], invA[1]),
                       str(round(expected_cov, 1))]
        elif report == 'percentage':
            columns = [file_name] + [str(round(score)) for score in locus_scores]
        else:
            # Any other report value yields the mean k-mer coverage table.
            columns = [file_name] + [str(round(cov, 1)) for cov in coverages]
        print('\t'.join(columns))


if __name__ == '__main__':
    main()