Mercurial > repos > jpayne > seqsero_v2
changeset 12:4ac593d4b40f
planemo upload
author | jpayne |
---|---|
date | Wed, 14 Aug 2019 17:45:41 -0400 |
parents | f6f0702de3b4 |
children | 3513b93ebe6a |
files | SalmID/LICENSE SalmID/README.md SalmID/SalmID.py SalmID/__pycache__/version.cpython-36.pyc SalmID/invA_mers_dict SalmID/rpoB_mers_dict SalmID/version.py SeqSero2/SalmID/LICENSE SeqSero2/SalmID/README.md SeqSero2/SalmID/SalmID.py SeqSero2/SalmID/__pycache__/version.cpython-36.pyc SeqSero2/SalmID/invA_mers_dict SeqSero2/SalmID/rpoB_mers_dict SeqSero2/SalmID/version.py SeqSero2/SeqSero2_package.py seqsero2.xml test-data/forward_250k.fastq.gz test-data/reverse_250k.fastq.gz |
diffstat | 18 files changed, 452 insertions(+), 447 deletions(-) [+] |
line wrap: on
line diff
--- a/SalmID/LICENSE Tue Aug 13 08:40:57 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2017 Henk den Bakker - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE.
--- a/SalmID/README.md Tue Aug 13 08:40:57 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,49 +0,0 @@ -[![DOI](https://zenodo.org/badge/97020646.svg)](https://zenodo.org/badge/latestdoi/97020646) - -# SalmID -Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia). - -## Requirements: -Python 3 - -## Installation: -The easy way with homebrew ([Linux](http://linuxbrew.sh/) or [MacOS](https://brew.sh/)): -``` -brew install brewsci/bio/salmid -``` -Big thanks to [Torsten Seemann](https://tseemann.github.io/) for including this in homebrew! - -Alernatively git clone to your machine: -``` -git clone --recursive https://github.com/hcdenbakker/SalmID.git -``` - -Make SalmID executable: -``` -cd SalmID -``` - -``` -chmod +x SalmID.py -``` - - -Add the SalmID folder to your path - -To execute: -``` -SalmID.py -e .fastq.gz -``` -This will perform a SalmID run on all fastq.gz files in the current directory. -``` -SalmID.py -i your_fastq_gz.fastq.gz -``` -This will perform a SalmID run on an individual file (i.e., your_fastq_gz.fastq.gz) -``` -SalmID.py -d directory_with_data -e _1.fastq.gz -``` -This will perform a SalmID run on all files in directory 'directory_with_data' with extension '_1.fastq.gz' - -## Todo's and thoughts for future releases: -- Provide coverage estimates for genomes in sample based on kmer frequencies -- Write code to use SalmID on long read (minion, pacbio) platforms
--- a/SalmID/SalmID.py Tue Aug 13 08:40:57 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,371 +0,0 @@ -#!/usr/bin/env python3 - - -import gzip -import io -import pickle -import os -import sys - -from argparse import ArgumentParser -try: - from version import SalmID_version -except: - SalmID_version = "version unknown" - - -def reverse_complement(sequence): - complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'M': 'K', 'R': 'Y', 'W': 'W', - 'S': 'S', 'Y': 'R', 'K': 'M', 'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V'} - return "".join(complement[base] for base in reversed(sequence)) - - -def parse_args(): - "Parse the input arguments, use '-h' for help." - parser = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data') - # inputs - parser.add_argument('-v','--version', action='version', version='%(prog)s ' + SalmID_version) - parser.add_argument( - '-i','--input_file', type=str, required=False, default= 'None', metavar = 'your_fastqgz', - help='Single fastq.gz file input, include path to file if file is not in same directory ') - parser.add_argument( - '-e', '--extension', type=str, required=False, default= '.fastq.gz', metavar = 'file_extension', - help='File extension, if specified without "--input_dir", SalmID will attempt to ID all files\n' + - ' with this extension in current directory, otherwise files in input directory') - - parser.add_argument( - '-d','--input_dir', type=str, required=False, default='.', metavar = 'directory', - help='Directory which contains data for identification, when not specified files in current directory will be analyzed.') - parser.add_argument( - '-r', '--report', type=str, required=False, default='percentage', metavar = 'percentage, coverage or taxonomy', - help='Report either percentage ("percentage") of clade specific kmers recovered, average kmer-coverage ("cov"), or ' - 'taxonomy (taxonomic species ID, plus observed mean k-mer coverages and expected coverage).') - parser.add_argument( - '-m', '--mode', type=str, required=False, default='quick', metavar = 'quick or thorough', - help='Quick [quick] or thorough [thorough] mode') - if len(sys.argv)==1: - parser.print_help(sys.stderr) - sys.exit(1) - return parser.parse_args() - -def get_av_read_length(file): - i = 1 - n_reads = 0 - total_length = 0 - if file.endswith(".gz"): - file_content=io.BufferedReader(gzip.open(file)) - else: - file_content=open(file,"r").readlines() - for line in file_content: - if i % 4 == 2: - total_length += len(line.strip()) - n_reads +=1 - i += 1 - if n_reads == 100: - break - return total_length/100 - - -def createKmerDict_reads(list_of_strings, kmer): - kmer_table = {} - for string in list_of_strings: - sequence = string.strip('\n') - for i in range(len(sequence)-kmer+1): - new_mer =sequence[i:i+kmer] - new_mer_rc = reverse_complement(new_mer) - if new_mer in kmer_table: - kmer_table[new_mer.upper()] += 1 - else: - kmer_table[new_mer.upper()] = 1 - if new_mer_rc in kmer_table: - kmer_table[new_mer_rc.upper()] += 1 - else: - kmer_table[new_mer_rc.upper()] = 1 - return kmer_table - - -def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode): - mean_1 = None - mean_2 = None - i = 1 - n_reads_1 = 0 - n_reads_2 = 0 - total_coverage_1 = 0 - total_coverage_2 = 0 - reads_1 = [] - reads_2 = [] - total_reads = 0 - if file.endswith(".gz"): - file_content=io.BufferedReader(gzip.open(file)) - else: - file_content=open(file,"r").readlines() - for line in file_content: - start = int((len(line) - k) // 2) - if i % 4 == 2: - total_reads += 1 - if file.endswith(".gz"): - s1 = line[start:k + start].decode() - line=line.decode() - else: - s1 = line[start:k + start] - if s1 in kmerDict_1: - n_reads_1 += 1 - total_coverage_1 += len(line) - reads_1.append(line) - if s1 in kmerDict_2: - n_reads_2 += 1 - total_coverage_2 += len(line) - reads_2.append(line) - i += 1 - if mode == 'quick': - if total_coverage_2 >= 800000: - break - - if len(reads_1) == 0: - kmer_Dict1 = {} - else: - kmer_Dict1 = createKmerDict_reads(reads_1, k) - mers_1 = set([key for key in kmer_Dict1]) - mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1])/len(mers_1) - if len(reads_2) == 0: - kmer_Dict2 = {} - else: - kmer_Dict2 = createKmerDict_reads(reads_2, k) - mers_2 = set([key for key in kmer_Dict2]) - mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2])/len(mers_2) - return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads - -def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers): - ''' - Given an iterable (list, set, dictrionary) returns mean coverage for the kmers in iterable - :param iterable: set, list or dictionary containing kmers - :param kmer_dict: dictionary with kmers as keys, kmer-frequency as value - :param clade_specific_kmers: list, dict or set of clade specific kmers - :return: mean frequency as float - ''' - if len(iterable) == 0: - return 0 - return sum([kmer_dict[value] for value in iterable])/len(clade_specific_kmers) - -def kmer_lists(query_fastq_gz, k, - allmers,allmers_rpoB, - uniqmers_bongori, - uniqmers_I, - uniqmers_IIa, - uniqmers_IIb, - uniqmers_IIIa, - uniqmers_IIIb, - uniqmers_IV, - uniqmers_VI, - uniqmers_VII, - uniqmers_VIII, - uniqmers_bongori_rpoB, - uniqmers_S_enterica_rpoB, - uniqmers_Escherichia_rpoB, - uniqmers_Listeria_ss_rpoB, - uniqmers_Lmono_rpoB, - mode): - dict_invA, dict_rpoB, mean_invA, mean_rpoB , total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers, - allmers_rpoB, mode) - target_mers_invA = set([key for key in dict_invA]) - target_mers_rpoB = set([key for key in dict_rpoB]) - if target_mers_invA == 0: - print('No reads found matching invA, no Salmonella in sample?') - else: - p_bongori = (len(uniqmers_bongori & target_mers_invA) / len(uniqmers_bongori)) * 100 - p_I = (len(uniqmers_I & target_mers_invA) / len(uniqmers_I)) * 100 - p_IIa = (len(uniqmers_IIa & target_mers_invA) / len(uniqmers_IIa)) * 100 - p_IIb = (len(uniqmers_IIb & target_mers_invA) / len(uniqmers_IIb)) * 100 - p_IIIa = (len(uniqmers_IIIa & target_mers_invA) / len(uniqmers_IIIa)) * 100 - p_IIIb = (len(uniqmers_IIIb & target_mers_invA) / len(uniqmers_IIIb)) * 100 - p_VI = (len(uniqmers_VI & target_mers_invA) / len(uniqmers_VI)) * 100 - p_IV = (len(uniqmers_IV & target_mers_invA) / len(uniqmers_IV)) * 100 - p_VII = (len(uniqmers_VII & target_mers_invA) / len(uniqmers_VII)) * 100 - p_VIII = (len(uniqmers_VIII & target_mers_invA) / len(uniqmers_VIII)) * 100 - p_bongori_rpoB = (len(uniqmers_bongori_rpoB & target_mers_rpoB) / len(uniqmers_bongori_rpoB)) * 100 - p_Senterica = (len(uniqmers_S_enterica_rpoB & target_mers_rpoB) / len(uniqmers_S_enterica_rpoB)) * 100 - p_Escherichia = (len(uniqmers_Escherichia_rpoB & target_mers_rpoB) / len(uniqmers_Escherichia_rpoB)) * 100 - p_Listeria_ss = (len(uniqmers_Listeria_ss_rpoB & target_mers_rpoB) / len(uniqmers_Listeria_ss_rpoB)) * 100 - p_Lmono = (len(uniqmers_Lmono_rpoB & target_mers_rpoB) / len(uniqmers_Lmono_rpoB)) * 100 - bongori_invA_cov = mean_cov_selected_kmers(uniqmers_bongori & target_mers_invA, dict_invA, uniqmers_bongori) - I_invA_cov = mean_cov_selected_kmers(uniqmers_I & target_mers_invA, dict_invA, uniqmers_I) - IIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIa & target_mers_invA, dict_invA, uniqmers_IIa) - IIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIb & target_mers_invA, dict_invA, uniqmers_IIb) - IIIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIIa & target_mers_invA, dict_invA, uniqmers_IIIa) - IIIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIIb & target_mers_invA, dict_invA, uniqmers_IIIb) - IV_invA_cov = mean_cov_selected_kmers(uniqmers_IV & target_mers_invA, dict_invA, uniqmers_IV) - VI_invA_cov = mean_cov_selected_kmers(uniqmers_VI & target_mers_invA, dict_invA, uniqmers_VI) - VII_invA_cov = mean_cov_selected_kmers(uniqmers_VII & target_mers_invA, dict_invA, uniqmers_VII) - VIII_invA_cov = mean_cov_selected_kmers(uniqmers_VIII & target_mers_invA, dict_invA, uniqmers_VIII) - S_enterica_rpoB_cov = mean_cov_selected_kmers((uniqmers_S_enterica_rpoB & target_mers_rpoB), dict_rpoB, - uniqmers_S_enterica_rpoB) - S_bongori_rpoB_cov = mean_cov_selected_kmers((uniqmers_bongori_rpoB & target_mers_rpoB), dict_rpoB, - uniqmers_bongori_rpoB) - Escherichia_rpoB_cov = mean_cov_selected_kmers((uniqmers_Escherichia_rpoB & target_mers_rpoB), dict_rpoB, - uniqmers_Escherichia_rpoB) - Listeria_ss_rpoB_cov = mean_cov_selected_kmers((uniqmers_Listeria_ss_rpoB & target_mers_rpoB), dict_rpoB, - uniqmers_Listeria_ss_rpoB) - Lmono_rpoB_cov = mean_cov_selected_kmers((uniqmers_Lmono_rpoB & target_mers_rpoB), dict_rpoB, - uniqmers_Lmono_rpoB) - coverages = [Listeria_ss_rpoB_cov, Lmono_rpoB_cov, Escherichia_rpoB_cov, S_bongori_rpoB_cov, - S_enterica_rpoB_cov, bongori_invA_cov, I_invA_cov, IIa_invA_cov, IIb_invA_cov, - IIIa_invA_cov, IIIb_invA_cov, IV_invA_cov, VI_invA_cov, VII_invA_cov, VIII_invA_cov] - locus_scores = [p_Listeria_ss, p_Lmono, p_Escherichia, p_bongori_rpoB, p_Senterica, p_bongori, - p_I, p_IIa,p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII] - return locus_scores, coverages, total_reads - -def report_taxon(locus_covs, average_read_length, number_of_reads): - list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.', - 'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)', - 'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)', - 'S. enterica subsp. salamae (invA: clade a)','S. enterica subsp. salamae (invA: clade b)', - 'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)', - 'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)', - 'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)'] - if sum(locus_covs) < 1: - rpoB = ('No rpoB matches!', 0) - invA = ('No invA matches!', 0) - return rpoB, invA, 0.0 - else: - # given list of scores get taxon - if sum(locus_covs[0:5]) > 0: - best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x])+1 - all_rpoB = max(range(len(locus_covs[0:5])), key=lambda x: locus_covs[0:5][x]) - if (locus_covs[best_rpoB] != 0) & (all_rpoB == 0): - rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB]) - elif (all_rpoB == 0) & (round(sum(locus_covs[1:5]),1) < 1): - rpoB = (list_taxa[0], locus_covs[0]) - else: - rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB]) - else: - rpoB = ('No rpoB matches!', 0) - if sum(locus_covs[5:]) > 0: - best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x])+5 - invA = (list_taxa[best_invA], locus_covs[best_invA]) - else: - invA = ('No invA matches!', 0) - if 'Listeria' in rpoB[0]: - return rpoB, invA, (average_read_length * number_of_reads) / 3000000 - else: - return rpoB, invA, (average_read_length * number_of_reads) / 5000000 - - - -def main(): - ex_dir = os.path.dirname(os.path.realpath(__file__)) - args = parse_args() - input_file = args.input_file - if input_file != 'None': - files = [input_file] - else: - extension = args.extension - inputdir = args.input_dir - files = [inputdir + '/'+ f for f in os.listdir(inputdir) if f.endswith(extension)] - report = args.report - mode = args.mode - f_invA = open(ex_dir + "/invA_mers_dict", "rb") - sets_dict_invA = pickle.load(f_invA) - f_invA.close() - allmers = sets_dict_invA['allmers'] - uniqmers_I = sets_dict_invA['uniqmers_I'] - uniqmers_IIa = sets_dict_invA['uniqmers_IIa'] - uniqmers_IIb = sets_dict_invA['uniqmers_IIb'] - uniqmers_IIIa = sets_dict_invA['uniqmers_IIIa'] - uniqmers_IIIb = sets_dict_invA['uniqmers_IIIb'] - uniqmers_IV = sets_dict_invA['uniqmers_IV'] - uniqmers_VI = sets_dict_invA['uniqmers_VI'] - uniqmers_VII = sets_dict_invA['uniqmers_VII'] - uniqmers_VIII = sets_dict_invA['uniqmers_VIII'] - uniqmers_bongori = sets_dict_invA['uniqmers_bongori'] - - f = open(ex_dir + "/rpoB_mers_dict", "rb") - sets_dict = pickle.load(f) - f.close() - - allmers_rpoB = sets_dict['allmers'] - uniqmers_bongori_rpoB = sets_dict['uniqmers_bongori'] - uniqmers_S_enterica_rpoB = sets_dict['uniqmers_S_enterica'] - uniqmers_Escherichia_rpoB = sets_dict['uniqmers_Escherichia'] - uniqmers_Listeria_ss_rpoB = sets_dict['uniqmers_Listeria_ss'] - uniqmers_Lmono_rpoB = sets_dict['uniqmers_L_mono'] - #todo: run kmer_lists() once, create list of tuples containing data to be used fro different reports - if report == 'taxonomy': - print('file\trpoB\tinvA\texpected coverage') - for f in files: - locus_scores, coverages, reads = kmer_lists(f, 27, - allmers, allmers_rpoB, - uniqmers_bongori, - uniqmers_I, - uniqmers_IIa, - uniqmers_IIb, - uniqmers_IIIa, - uniqmers_IIIb, - uniqmers_IV, - uniqmers_VI, - uniqmers_VII, - uniqmers_VIII, - uniqmers_bongori_rpoB, - uniqmers_S_enterica_rpoB, - uniqmers_Escherichia_rpoB, - uniqmers_Listeria_ss_rpoB, - uniqmers_Lmono_rpoB, - mode) - pretty_covs = [round(cov, 1) for cov in coverages] - report = report_taxon(pretty_covs, get_av_read_length(f), reads) - print(f.split('/')[-1] + '\t' + report[0][0] + '[' + str(report[0][1]) + ']' + '\t' + report[1][0] + - '[' + str(report[1][1]) + ']' + - '\t' + str(round(report[2], 1))) - else: - print( - 'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' + - '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' + - ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' + - '\tsubsp. II (clade VIII : invA)') - if report == 'percentage': - for f in files: - locus_scores, coverages , reads = kmer_lists( f, 27, - allmers,allmers_rpoB, - uniqmers_bongori, - uniqmers_I, - uniqmers_IIa, - uniqmers_IIb, - uniqmers_IIIa, - uniqmers_IIIb, - uniqmers_IV, - uniqmers_VI, - uniqmers_VII, - uniqmers_VIII, - uniqmers_bongori_rpoB, - uniqmers_S_enterica_rpoB, - uniqmers_Escherichia_rpoB, - uniqmers_Listeria_ss_rpoB, - uniqmers_Lmono_rpoB, - mode) - pretty_scores = [str(round(score)) for score in locus_scores] - print(f.split('/')[-1] +'\t' + '\t'.join(pretty_scores)) - else: - for f in files: - locus_scores, coverages , reads = kmer_lists( f, 27, - allmers,allmers_rpoB, - uniqmers_bongori, - uniqmers_I, - uniqmers_IIa, - uniqmers_IIb, - uniqmers_IIIa, - uniqmers_IIIb, - uniqmers_IV, - uniqmers_VI, - uniqmers_VII, - uniqmers_VIII, - uniqmers_bongori_rpoB, - uniqmers_S_enterica_rpoB, - uniqmers_Escherichia_rpoB, - uniqmers_Listeria_ss_rpoB, - uniqmers_Lmono_rpoB, - mode) - pretty_covs = [str(round(cov, 1)) for cov in coverages] - print(f.split('/')[-1] + '\t' + '\t'.join(pretty_covs)) - -if __name__ == '__main__': - main() -
--- a/SalmID/version.py Tue Aug 13 08:40:57 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -SalmID_version = '0.122'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SeqSero2/SalmID/LICENSE Wed Aug 14 17:45:41 2019 -0400 @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Henk den Bakker + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SeqSero2/SalmID/README.md Wed Aug 14 17:45:41 2019 -0400 @@ -0,0 +1,49 @@ +[![DOI](https://zenodo.org/badge/97020646.svg)](https://zenodo.org/badge/latestdoi/97020646) + +# SalmID +Rapid tool to check taxonomic ID of single isolate samples. Currently only IDs Salmonella species and subspecies, and some common contaminants (Listeria, Escherichia). + +## Requirements: +Python 3 + +## Installation: +The easy way with homebrew ([Linux](http://linuxbrew.sh/) or [MacOS](https://brew.sh/)): +``` +brew install brewsci/bio/salmid +``` +Big thanks to [Torsten Seemann](https://tseemann.github.io/) for including this in homebrew! + +Alernatively git clone to your machine: +``` +git clone --recursive https://github.com/hcdenbakker/SalmID.git +``` + +Make SalmID executable: +``` +cd SalmID +``` + +``` +chmod +x SalmID.py +``` + + +Add the SalmID folder to your path + +To execute: +``` +SalmID.py -e .fastq.gz +``` +This will perform a SalmID run on all fastq.gz files in the current directory. +``` +SalmID.py -i your_fastq_gz.fastq.gz +``` +This will perform a SalmID run on an individual file (i.e., your_fastq_gz.fastq.gz) +``` +SalmID.py -d directory_with_data -e _1.fastq.gz +``` +This will perform a SalmID run on all files in directory 'directory_with_data' with extension '_1.fastq.gz' + +## Todo's and thoughts for future releases: +- Provide coverage estimates for genomes in sample based on kmer frequencies +- Write code to use SalmID on long read (minion, pacbio) platforms
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SeqSero2/SalmID/SalmID.py Wed Aug 14 17:45:41 2019 -0400 @@ -0,0 +1,371 @@ +#!/usr/bin/env python3 + + +import gzip +import io +import pickle +import os +import sys + +from argparse import ArgumentParser +try: + from version import SalmID_version +except: + SalmID_version = "version unknown" + + +def reverse_complement(sequence): + complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'M': 'K', 'R': 'Y', 'W': 'W', + 'S': 'S', 'Y': 'R', 'K': 'M', 'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V'} + return "".join(complement[base] for base in reversed(sequence)) + + +def parse_args(): + "Parse the input arguments, use '-h' for help." + parser = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data') + # inputs + parser.add_argument('-v','--version', action='version', version='%(prog)s ' + SalmID_version) + parser.add_argument( + '-i','--input_file', type=str, required=False, default= 'None', metavar = 'your_fastqgz', + help='Single fastq.gz file input, include path to file if file is not in same directory ') + parser.add_argument( + '-e', '--extension', type=str, required=False, default= '.fastq.gz', metavar = 'file_extension', + help='File extension, if specified without "--input_dir", SalmID will attempt to ID all files\n' + + ' with this extension in current directory, otherwise files in input directory') + + parser.add_argument( + '-d','--input_dir', type=str, required=False, default='.', metavar = 'directory', + help='Directory which contains data for identification, when not specified files in current directory will be analyzed.') + parser.add_argument( + '-r', '--report', type=str, required=False, default='percentage', metavar = 'percentage, coverage or taxonomy', + help='Report either percentage ("percentage") of clade specific kmers recovered, average kmer-coverage ("cov"), or ' + 'taxonomy (taxonomic species ID, plus observed mean k-mer coverages and expected coverage).') + parser.add_argument( + '-m', '--mode', type=str, required=False, default='quick', metavar = 'quick or thorough', + help='Quick [quick] or thorough [thorough] mode') + if len(sys.argv)==1: + parser.print_help(sys.stderr) + sys.exit(1) + return parser.parse_args() + +def get_av_read_length(file): + i = 1 + n_reads = 0 + total_length = 0 + if file.endswith(".gz"): + file_content=io.BufferedReader(gzip.open(file)) + else: + file_content=open(file,"r").readlines() + for line in file_content: + if i % 4 == 2: + total_length += len(line.strip()) + n_reads +=1 + i += 1 + if n_reads == 100: + break + return total_length/100 + + +def createKmerDict_reads(list_of_strings, kmer): + kmer_table = {} + for string in list_of_strings: + sequence = string.strip('\n') + for i in range(len(sequence)-kmer+1): + new_mer =sequence[i:i+kmer] + new_mer_rc = reverse_complement(new_mer) + if new_mer in kmer_table: + kmer_table[new_mer.upper()] += 1 + else: + kmer_table[new_mer.upper()] = 1 + if new_mer_rc in kmer_table: + kmer_table[new_mer_rc.upper()] += 1 + else: + kmer_table[new_mer_rc.upper()] = 1 + return kmer_table + + +def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode): + mean_1 = None + mean_2 = None + i = 1 + n_reads_1 = 0 + n_reads_2 = 0 + total_coverage_1 = 0 + total_coverage_2 = 0 + reads_1 = [] + reads_2 = [] + total_reads = 0 + if file.endswith(".gz"): + file_content=io.BufferedReader(gzip.open(file)) + else: + file_content=open(file,"r").readlines() + for line in file_content: + start = int((len(line) - k) // 2) + if i % 4 == 2: + total_reads += 1 + if file.endswith(".gz"): + s1 = line[start:k + start].decode() + line=line.decode() + else: + s1 = line[start:k + start] + if s1 in kmerDict_1: + n_reads_1 += 1 + total_coverage_1 += len(line) + reads_1.append(line) + if s1 in kmerDict_2: + n_reads_2 += 1 + total_coverage_2 += len(line) + reads_2.append(line) + i += 1 + if mode == 'quick': + if total_coverage_2 >= 800000: + break + + if len(reads_1) == 0: + kmer_Dict1 = {} + else: + kmer_Dict1 = createKmerDict_reads(reads_1, k) + mers_1 = set([key for key in kmer_Dict1]) + mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1])/len(mers_1) + if len(reads_2) == 0: + kmer_Dict2 = {} + else: + kmer_Dict2 = createKmerDict_reads(reads_2, k) + mers_2 = set([key for key in kmer_Dict2]) + mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2])/len(mers_2) + return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads + +def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers): + ''' + Given an iterable (list, set, dictrionary) returns mean coverage for the kmers in iterable + :param iterable: set, list or dictionary containing kmers + :param kmer_dict: dictionary with kmers as keys, kmer-frequency as value + :param clade_specific_kmers: list, dict or set of clade specific kmers + :return: mean frequency as float + ''' + if len(iterable) == 0: + return 0 + return sum([kmer_dict[value] for value in iterable])/len(clade_specific_kmers) + +def kmer_lists(query_fastq_gz, k, + allmers,allmers_rpoB, + uniqmers_bongori, + uniqmers_I, + uniqmers_IIa, + uniqmers_IIb, + uniqmers_IIIa, + uniqmers_IIIb, + uniqmers_IV, + uniqmers_VI, + uniqmers_VII, + uniqmers_VIII, + uniqmers_bongori_rpoB, + uniqmers_S_enterica_rpoB, + uniqmers_Escherichia_rpoB, + uniqmers_Listeria_ss_rpoB, + uniqmers_Lmono_rpoB, + mode): + dict_invA, dict_rpoB, mean_invA, mean_rpoB , total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers, + allmers_rpoB, mode) + target_mers_invA = set([key for key in dict_invA]) + target_mers_rpoB = set([key for key in dict_rpoB]) + if target_mers_invA == 0: + print('No reads found matching invA, no Salmonella in sample?') + else: + p_bongori = (len(uniqmers_bongori & target_mers_invA) / len(uniqmers_bongori)) * 100 + p_I = (len(uniqmers_I & target_mers_invA) / len(uniqmers_I)) * 100 + p_IIa = (len(uniqmers_IIa & target_mers_invA) / len(uniqmers_IIa)) * 100 + p_IIb = (len(uniqmers_IIb & target_mers_invA) / len(uniqmers_IIb)) * 100 + p_IIIa = (len(uniqmers_IIIa & target_mers_invA) / len(uniqmers_IIIa)) * 100 + p_IIIb = (len(uniqmers_IIIb & target_mers_invA) / len(uniqmers_IIIb)) * 100 + p_VI = (len(uniqmers_VI & target_mers_invA) / len(uniqmers_VI)) * 100 + p_IV = (len(uniqmers_IV & target_mers_invA) / len(uniqmers_IV)) * 100 + p_VII = (len(uniqmers_VII & target_mers_invA) / len(uniqmers_VII)) * 100 + p_VIII = (len(uniqmers_VIII & target_mers_invA) / len(uniqmers_VIII)) * 100 + p_bongori_rpoB = (len(uniqmers_bongori_rpoB & target_mers_rpoB) / len(uniqmers_bongori_rpoB)) * 100 + p_Senterica = (len(uniqmers_S_enterica_rpoB & target_mers_rpoB) / len(uniqmers_S_enterica_rpoB)) * 100 + p_Escherichia = (len(uniqmers_Escherichia_rpoB & target_mers_rpoB) / len(uniqmers_Escherichia_rpoB)) * 100 + p_Listeria_ss = (len(uniqmers_Listeria_ss_rpoB & target_mers_rpoB) / len(uniqmers_Listeria_ss_rpoB)) * 100 + p_Lmono = (len(uniqmers_Lmono_rpoB & target_mers_rpoB) / len(uniqmers_Lmono_rpoB)) * 100 + bongori_invA_cov = mean_cov_selected_kmers(uniqmers_bongori & target_mers_invA, dict_invA, uniqmers_bongori) + I_invA_cov = mean_cov_selected_kmers(uniqmers_I & target_mers_invA, dict_invA, uniqmers_I) + IIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIa & target_mers_invA, dict_invA, uniqmers_IIa) + IIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIb & target_mers_invA, dict_invA, uniqmers_IIb) + IIIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIIa & target_mers_invA, dict_invA, uniqmers_IIIa) + IIIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIIb & target_mers_invA, dict_invA, uniqmers_IIIb) + IV_invA_cov = mean_cov_selected_kmers(uniqmers_IV & target_mers_invA, dict_invA, uniqmers_IV) + VI_invA_cov = mean_cov_selected_kmers(uniqmers_VI & target_mers_invA, dict_invA, uniqmers_VI) + VII_invA_cov = mean_cov_selected_kmers(uniqmers_VII & target_mers_invA, dict_invA, uniqmers_VII) + VIII_invA_cov = mean_cov_selected_kmers(uniqmers_VIII & target_mers_invA, dict_invA, uniqmers_VIII) + S_enterica_rpoB_cov = mean_cov_selected_kmers((uniqmers_S_enterica_rpoB & target_mers_rpoB), dict_rpoB, + uniqmers_S_enterica_rpoB) + S_bongori_rpoB_cov = mean_cov_selected_kmers((uniqmers_bongori_rpoB & target_mers_rpoB), dict_rpoB, + uniqmers_bongori_rpoB) + Escherichia_rpoB_cov = mean_cov_selected_kmers((uniqmers_Escherichia_rpoB & target_mers_rpoB), dict_rpoB, + uniqmers_Escherichia_rpoB) + Listeria_ss_rpoB_cov = mean_cov_selected_kmers((uniqmers_Listeria_ss_rpoB & target_mers_rpoB), dict_rpoB, + uniqmers_Listeria_ss_rpoB) + Lmono_rpoB_cov = mean_cov_selected_kmers((uniqmers_Lmono_rpoB & target_mers_rpoB), dict_rpoB, + uniqmers_Lmono_rpoB) + coverages = [Listeria_ss_rpoB_cov, Lmono_rpoB_cov, Escherichia_rpoB_cov, S_bongori_rpoB_cov, + S_enterica_rpoB_cov, bongori_invA_cov, I_invA_cov, IIa_invA_cov, IIb_invA_cov, + IIIa_invA_cov, IIIb_invA_cov, IV_invA_cov, VI_invA_cov, VII_invA_cov, VIII_invA_cov] + locus_scores = [p_Listeria_ss, p_Lmono, p_Escherichia, p_bongori_rpoB, p_Senterica, p_bongori, + p_I, p_IIa,p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII] + return locus_scores, coverages, total_reads + +def report_taxon(locus_covs, average_read_length, number_of_reads): + list_taxa = [ 'Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.', + 'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)', + 'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)', + 'S. enterica subsp. salamae (invA: clade a)','S. enterica subsp. salamae (invA: clade b)', + 'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)', + 'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)', + 'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)'] + if sum(locus_covs) < 1: + rpoB = ('No rpoB matches!', 0) + invA = ('No invA matches!', 0) + return rpoB, invA, 0.0 + else: + # given list of scores get taxon + if sum(locus_covs[0:5]) > 0: + best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x])+1 + all_rpoB = max(range(len(locus_covs[0:5])), key=lambda x: locus_covs[0:5][x]) + if (locus_covs[best_rpoB] != 0) & (all_rpoB == 0): + rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB]) + elif (all_rpoB == 0) & (round(sum(locus_covs[1:5]),1) < 1): + rpoB = (list_taxa[0], locus_covs[0]) + else: + rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB]) + else: + rpoB = ('No rpoB matches!', 0) + if sum(locus_covs[5:]) > 0: + best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x])+5 + invA = (list_taxa[best_invA], locus_covs[best_invA]) + else: + invA = ('No invA matches!', 0) + if 'Listeria' in rpoB[0]: + return rpoB, invA, (average_read_length * number_of_reads) / 3000000 + else: + return rpoB, invA, (average_read_length * number_of_reads) / 5000000 + + + +def main(): + ex_dir = os.path.dirname(os.path.realpath(__file__)) + args = parse_args() + input_file = args.input_file + if input_file != 'None': + files = [input_file] + else: + extension = args.extension + inputdir = args.input_dir + files = [inputdir + '/'+ f for f in os.listdir(inputdir) if f.endswith(extension)] + report = args.report + mode = args.mode + f_invA = open(ex_dir + "/invA_mers_dict", "rb") + sets_dict_invA = pickle.load(f_invA) + f_invA.close() + allmers = sets_dict_invA['allmers'] + uniqmers_I = sets_dict_invA['uniqmers_I'] + uniqmers_IIa = sets_dict_invA['uniqmers_IIa'] + uniqmers_IIb = sets_dict_invA['uniqmers_IIb'] + uniqmers_IIIa = sets_dict_invA['uniqmers_IIIa'] + uniqmers_IIIb = sets_dict_invA['uniqmers_IIIb'] + uniqmers_IV = sets_dict_invA['uniqmers_IV'] + uniqmers_VI = sets_dict_invA['uniqmers_VI'] + uniqmers_VII = sets_dict_invA['uniqmers_VII'] + uniqmers_VIII = sets_dict_invA['uniqmers_VIII'] + uniqmers_bongori = sets_dict_invA['uniqmers_bongori'] + + f = open(ex_dir + "/rpoB_mers_dict", "rb") + sets_dict = pickle.load(f) + f.close() + + allmers_rpoB = sets_dict['allmers'] + uniqmers_bongori_rpoB = sets_dict['uniqmers_bongori'] + uniqmers_S_enterica_rpoB = sets_dict['uniqmers_S_enterica'] + uniqmers_Escherichia_rpoB = sets_dict['uniqmers_Escherichia'] + uniqmers_Listeria_ss_rpoB = sets_dict['uniqmers_Listeria_ss'] + uniqmers_Lmono_rpoB = sets_dict['uniqmers_L_mono'] + #todo: run kmer_lists() once, create list of tuples containing data to be used fro different reports + if report == 'taxonomy': + print('file\trpoB\tinvA\texpected coverage') + for f in files: + locus_scores, coverages, reads = kmer_lists(f, 27, + allmers, allmers_rpoB, + uniqmers_bongori, + uniqmers_I, + uniqmers_IIa, + uniqmers_IIb, + uniqmers_IIIa, + uniqmers_IIIb, + uniqmers_IV, + uniqmers_VI, + uniqmers_VII, + uniqmers_VIII, + uniqmers_bongori_rpoB, + uniqmers_S_enterica_rpoB, + uniqmers_Escherichia_rpoB, + uniqmers_Listeria_ss_rpoB, + uniqmers_Lmono_rpoB, + mode) + pretty_covs = [round(cov, 1) for cov in coverages] + report = report_taxon(pretty_covs, get_av_read_length(f), reads) + print(f.split('/')[-1] + '\t' + report[0][0] + '[' + str(report[0][1]) + ']' + '\t' + report[1][0] + + '[' + str(report[1][1]) + ']' + + '\t' + str(round(report[2], 1))) + else: + print( + 'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica' + + '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' + + ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp.IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' + + '\tsubsp. II (clade VIII : invA)') + if report == 'percentage': + for f in files: + locus_scores, coverages , reads = kmer_lists( f, 27, + allmers,allmers_rpoB, + uniqmers_bongori, + uniqmers_I, + uniqmers_IIa, + uniqmers_IIb, + uniqmers_IIIa, + uniqmers_IIIb, + uniqmers_IV, + uniqmers_VI, + uniqmers_VII, + uniqmers_VIII, + uniqmers_bongori_rpoB, + uniqmers_S_enterica_rpoB, + uniqmers_Escherichia_rpoB, + uniqmers_Listeria_ss_rpoB, + uniqmers_Lmono_rpoB, + mode) + pretty_scores = [str(round(score)) for score in locus_scores] + print(f.split('/')[-1] +'\t' + '\t'.join(pretty_scores)) + else: + for f in files: + locus_scores, coverages , reads = kmer_lists( f, 27, + allmers,allmers_rpoB, + uniqmers_bongori, + uniqmers_I, + uniqmers_IIa, + uniqmers_IIb, + uniqmers_IIIa, + uniqmers_IIIb, + uniqmers_IV, + uniqmers_VI, + uniqmers_VII, + uniqmers_VIII, + uniqmers_bongori_rpoB, + uniqmers_S_enterica_rpoB, + uniqmers_Escherichia_rpoB, + uniqmers_Listeria_ss_rpoB, + uniqmers_Lmono_rpoB, + mode) + pretty_covs = [str(round(cov, 1)) for cov in coverages] + print(f.split('/')[-1] + '\t' + '\t'.join(pretty_covs)) + +if __name__ == '__main__': + main() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SeqSero2/SalmID/version.py Wed Aug 14 17:45:41 2019 -0400 @@ -0,0 +1,1 @@ +SalmID_version = '0.122'
--- a/SeqSero2/SeqSero2_package.py Tue Aug 13 08:40:57 2019 -0400 +++ b/SeqSero2/SeqSero2_package.py Wed Aug 14 17:45:41 2019 -0400 @@ -1172,6 +1172,7 @@ t="4" else: t=threads + new_fasta = None if os.path.getsize(combined_fq)>100 and (fnameB=="" or os.path.getsize(mapped_fq1)>100):#if not, then it's "-:-:-" if fnameB!="": subprocess.check_output("spades.py --careful --pe1-s "+combined_fq+" --pe1-1 "+mapped_fq1+" --pe1-2 "+mapped_fq2+" -t "+t+" -o "+outdir,shell=True, stderr=subprocess.STDOUT) @@ -1191,7 +1192,7 @@ def judge_subspecies(fnameA): #seqsero2 -a; judge subspecies on just forward raw reads fastq - salmID_output=subprocess.check_output("../SalmID/SalmID.py -i "+fnameA, shell=True, stderr=subprocess.STDOUT) + salmID_output=subprocess.check_output(os.path.join(dirpath, "SalmID/SalmID.py") + ' -i ' +fnameA, shell=True, stderr=subprocess.STDOUT) #out, err = salmID_output.communicate() #out=out.decode("utf-8") out = salmID_output.decode("utf-8") @@ -1256,6 +1257,7 @@ ############################begin the real analysis if analysis_mode=="a": if data_type in ["1","2","3"]:#use allele mode + H1_cont_stat_list = H2_cont_stat_list = [] for_fq,rev_fq=get_input_files(make_dir,input_file,data_type,dirpath) os.chdir(make_dir) ###add a function to tell input files
--- a/seqsero2.xml Tue Aug 13 08:40:57 2019 -0400 +++ b/seqsero2.xml Wed Aug 14 17:45:41 2019 -0400 @@ -7,7 +7,7 @@ <requirement type="package" version="1.9">samtools</requirement> <requirement type="package" version="2.9.1">sra-tools</requirement> <requirement type="package" version="0.7.17">bwa</requirement> - <requirement type="package" version="3.12.0">spades</requirement> + <requirement type="package" version="3.13.1">spades</requirement> <requirement type="package" version="2.27.1">bedtools</requirement> </requirements> <command detect_errors="exit_code"><![CDATA[ @@ -34,6 +34,7 @@ ln -s $forward ${name}_1.fastq; ln -s $reverse ${name}_2.fastq; mkdir ./output; + touch output/SeqSero_log.txt ; python $__tool_directory__/SeqSero2/SeqSero2_package.py -p \${GALAXY_SLOTS:-4} -t 2 @@ -127,9 +128,11 @@ <test> <param name="mode" value="a" /> <param name="reads_select" value="history" /> - <param name="forward" value="forward_25k.fastq.gz" ftype="fastqsanger" /> - <param name="reverse" value="reverse_25k.fastq.gz" ftype="fastqsanger" /> - <output name="results" file="Seqsero_result_25k.tsv" /> + <param name="forward" value="forward_250k.fastq.gz" ftype="fastqsanger" /> + <param name="reverse" value="reverse_250k.fastq.gz" ftype="fastqsanger" /> + <assert_stdout> + <has_text text="predicted antigenic profile does not exist" /> + </assert_stdout> </test> <!-- <test> <param name="mode" value="a" />