comparison cfsan_bettercallsal.xml @ 1:365849f031fd

"planemo upload"
author kkonganti
date Mon, 05 Jun 2023 18:48:51 -0400
parents a4b1ee4b68b1
children 4678c2cd1c9e
comparison
equal deleted inserted replaced
0:a4b1ee4b68b1 1:365849f031fd
1 <tool id="cfsan_centriflaken" name="Centriflaken" version="0.2.0+galaxy0"> 1 <tool id="cfsan_bettercallsal" name="bettercallsal" version="0.2.0+galaxy0">
2 <description>An automated pipeline to generate a MAG of interest (E.coli or Salmonella) and perform serotyping.</description> 2 <description>An automated workflow to assign Salmonella serotype based on NCBI Pathogen Detection Project for Salmonella.</description>
3 <requirements> 3 <requirements>
4 <requirement type="package" version="22.04">nextflow</requirement> 4 <requirement type="package" version="22.10">nextflow</requirement>
5 <requirement type="package" version="1.0.0">micromamba</requirement>
5 <requirement type="package">graphviz</requirement> 6 <requirement type="package">graphviz</requirement>
6 </requirements> 7 </requirements>
7 <version_command>nextflow -version</version_command> 8 <version_command>nextflow -version</version_command>
8 <command detect_errors="exit_code"><![CDATA[ 9 <command detect_errors="exit_code"><![CDATA[
9 mkdir -p cpipes-input || exit 1; 10 mkdir -p cpipes-input || exit 1;
32 #end if 33 #end if
33 ln -sf '$pair.forward' './cpipes-input/$read_R1'; 34 ln -sf '$pair.forward' './cpipes-input/$read_R1';
34 ln -sf '$pair.reverse' './cpipes-input/$read_R2'; 35 ln -sf '$pair.reverse' './cpipes-input/$read_R2';
35 #end for 36 #end for
36 #end if 37 #end if
37 $__tool_directory__/0.4.0/cpipes 38 $__tool_directory__/0.5.0/cpipes
38 --pipeline $input_read_type_cond.pipeline_cond.pipeline 39 --pipeline $input_read_type_cond.pipeline_cond.pipeline
39 #if ($input_read_type_cond.pipeline_cond.pipeline == "centriflaken"): 40 --input \${pwd_path}/cpipes-input
40 --fq_single_end true
41 --flye_genome_size '${genome_size}'
42 #if ($input_read_type_cond.pipeline_cond.long_read_platform == "nanopore_corr"):
43 --flye_nano_corr true --flye_nano_raw false
44 #elif ($input_read_type_cond.pipeline_cond.long_read_platform == "nanopore_hq"):
45 --flye_nano_hq true --flye_nano_raw false
46 #elif ($input_read_type_cond.pipeline_cond.long_read_platform == "pacbio_raw"):
47 --flye_pacbio_raw true --flye_nano_raw false
48 #elif ($input_read_type_cond.pipeline_cond.long_read_platform == "pacbio_corr"):
49 --flye_pacbio_corr true --flye_nano_raw false
50 #elif ($input_read_type_cond.pipeline_cond.long_read_platform == "pacbio_hifi"):
51 --flye_pacbio_hifi true --flye_nano_raw false
52 #end if
53 #elif ($input_read_type_cond.pipeline_cond.pipeline == "centriflaken_hy"):
54 #if (str($input_read_type_cond.input_read_type) == "single_long"):
55 --fq_single_end true
56 #elif (str($input_read_type_cond.input_read_type) == "paired"):
57 --fq_single_end false --fq2_suffix '${input_read_type_cond.fq2_suffix}'
58 #end if
59 #end if
60 --input \${pwd_path}/cpipes-input
61 --output \${pwd_path}/cpipes-output 41 --output \${pwd_path}/cpipes-output
62 --fq_suffix '${input_read_type_cond.fq_suffix}' 42 --fq_suffix '${input_read_type_cond.fq_suffix}'
63 #if ($fq_filter_by_len != ""): 43 #if (str($input_read_type_cond.input_read_type) == "single_long"):
64 --fq_filter_by_len $fq_filter_by_len 44 --fq_single_end true
45 #elif (str($input_read_type_cond.input_read_type) == "paired"):
46 --fq_single_end false --fq2_suffix '${input_read_type_cond.fq2_suffix}'
65 #end if 47 #end if
48 --tuspy_n $tuspy_n
49 #if ($sourmash_cond.run == "true"):
50 --sfhpy_fcv $sourmash_cond.sfhpy_fcv
51 #end if
52 --bcs_thresholds $bcs_thresholds
66 --fq_filename_delim '${fq_filename_delim}' 53 --fq_filename_delim '${fq_filename_delim}'
67 --fq_filename_delim_idx $fq_filename_delim_idx 54 --fq_filename_delim_idx $fq_filename_delim_idx
68 --centrifuge_extract_bug '${centrifuge_extract_bug}'
69 #if (str($input_read_type_cond.pipeline_cond.rm_dup_seqs) == "true"):
70 --seqkit_rmdup_run true
71 #end if
72 -profile kondagac; 55 -profile kondagac;
73 mv './cpipes-output/${input_read_type_cond.pipeline_cond.pipeline}-multiqc/multiqc_report.html' './multiqc_report.html' > /dev/null 2>&1 || exit 1; 56 mv './cpipes-output/${input_read_type_cond.pipeline_cond.pipeline}-multiqc/multiqc_report.html' './multiqc_report.html' > /dev/null 2>&1 || exit 1;
74 mv './cpipes-output/${input_read_type_cond.pipeline_cond.pipeline}-results/kraken2_extract_contigs' kraken2_extract_contigs > /dev/null 2>&1 || exit 1;
75 rm -rf ./cpipes-output > /dev/null 2>&1 || exit 1; 57 rm -rf ./cpipes-output > /dev/null 2>&1 || exit 1;
76 rm -rf ./work > /dev/null 2>&1 || exit 1 58 rm -rf ./work > /dev/null 2>&1 || exit 1
77 ]]></command> 59 ]]></command>
78 <inputs> 60 <inputs>
79 <conditional name="input_read_type_cond"> 61 <conditional name="input_read_type_cond">
80 <param name="input_read_type" type="select" label="Select the read collection type"> 62 <param name="input_read_type" type="select" label="Select the read collection type">
81 <option value="single_long" selected="true">Unpaired reads (i.e. Single-End short reads or Long reads)</option> 63 <option value="single_long" selected="true">Single-End short reads</option>
82 <option value="paired">Paired-End reads</option> 64 <option value="paired">Paired-End short reads</option>
83 </param> 65 </param>
84 <when value="single_long"> 66 <when value="single_long">
85 <param name="input" type="data_collection" collection_type="list" format="fastq,fastq.gz" 67 <param name="input" type="data_collection" collection_type="list" format="fastq,fastq.gz"
86 label="Dataset list of unpaired short reads or long reads" /> 68 label="Dataset list of unpaired short reads or long reads" />
87 <conditional name="pipeline_cond"> 69 <param name="fq_suffix" value=".fastq.gz" type="text" label="Suffix of the Single-End FASTQ"/>
88 <param name="pipeline" type="select" label="CPIPES Workflow name"
89 help="centriflaken: for long reads (Nanopore or PacBio). centriflaken_hy: for unpaired short reads. Default: centriflaken">
90 <option value="centriflaken" selected="true">centriflaken</option>
91 <option value="centriflaken_hy">centriflaken_hy</option>
92 </param>
93 <when value="centriflaken">
94 <param name="long_read_platform" type="select" label="Mention long read sequencing platform and type">
95 <option value="nanopore_raw" selected="true">Nanopore raw reads, pre-Guppy5 (&lt;20% error)</option>
96 <option value="nanopore_corr">Nanopore reads that were corrected with other methods (&lt;3% error)</option>
97 <option value="nanopore_hq">Nanopore high-quality reads, Guppy5+ SUP or Q20 (5% error)</option>
98 <option value="pacbio_raw">PacBio regular CLR reads (&lt;20% error)</option>
99 <option value="pacbio_corr">PacBio reads that were corrected with other methods (&lt;3% error)</option>
100 <option value="pacbio_hifi">PacBio HiFi reads (&lt;1% error)</option>
101 </param>
102 <param name="rm_dup_seqs" type="select" label="Remove duplicate sequences"
103 help="THIS OPTION IS IGNORED IF THE INPUT READS ARE LONG READS.">
104 <option value="NA" selected="true">N/A</option>
105 </param>
106 </when>
107 <when value="centriflaken_hy">
108 <param name="long_read_platform" type="select" label="Mention long read sequencing platform and type"
109 help="THIS OPTION IS IGNORED IF THE INPUT READS ARE SHORT READS.">
110 <option value="NA" selected="true">N/A</option>
111 </param>
112 <param name="rm_dup_seqs" type="select" label="Remove duplicate sequences"
113 help="Selecting yes will compare sequence content and remove identical sequences i.e. only the first occured sequence record will be saved.">
114 <option value="true">yes</option>
115 <option value="false" selected="true">no</option>
116 </param>
117 </when>
118 </conditional>
119 <param name="fq_suffix" value=".fastq.gz" type="text" label="Suffix of the Unpaired FASTQ"/>
120 </when> 70 </when>
121 <when value="paired"> 71 <when value="paired">
122 <param name="input_pair" type="data_collection" collection_type="list:paired" format="fastq,fastq.gz" label="List of Dataset pairs" /> 72 <param name="input_pair" type="data_collection" collection_type="list:paired" format="fastq,fastq.gz" label="List of Dataset pairs" />
123 <conditional name="pipeline_cond">
124 <param name="pipeline" type="select" label="CPIPES Workflow name"
125 help="Auto selected centriflaken_hy workflow for paired-end short reads.">
126 <option value="centriflaken_hy" selected="true">centriflaken_hy</option>
127 </param>
128 <when value="centriflaken_hy">
129 <param name="long_read_platform" type="select" label="Mention long read sequencing platform and type"
130 help="THIS OPTION IS IGNORED IF THE INPUT READS ARE SHORT READS.">
131 <option value="NA" selected="true">N/A</option>
132 </param>
133 <param name="rm_dup_seqs" type="select" label="Remove duplicate sequences"
134 help="Selecting yes will compare sequence content and remove identical sequences i.e. only the first occured sequence record will be saved.">
135 <option value="true">yes</option>
136 <option value="false" selected="true">no</option>
137 </param>
138 </when>
139 </conditional>
140 <param name="fq_suffix" value="_R1_001.fastq.gz" type="text" label="Suffix of the R1 FASTQ"/> 73 <param name="fq_suffix" value="_R1_001.fastq.gz" type="text" label="Suffix of the R1 FASTQ"/>
141 <param name="fq2_suffix" value="_R2_001.fastq.gz" type="text" label="Suffix of the R2 FASTQ"/> 74 <param name="fq2_suffix" value="_R2_001.fastq.gz" type="text" label="Suffix of the R2 FASTQ"/>
142 </when> 75 </when>
143 </conditional> 76 </conditional>
144 <param name="fq_filter_by_len" optional="true" value="" type="integer" label="Enter minimum read length to retain before starting the analysis" 77 <param name="tuspy_n" optional="true" value="10" type="integer" label="Enter the number of top unique serotypes to retain after initial MASH screen step."
145 help="Keep this option empty to use default values. Default for centriflaken (long reads) is 4000 bp and for centriflaken_hy (short reads) is 75 bp."/> 78 help="The default value of 10 is suitable for almost all scenarios."/>
79 <param name="bcs_thresholds" type="select" label="Enter the type of base quality thresholds to be set with bettercallsal."
80 help="The default value sets the strictest thresholds, which tend to filter out most of the false positive hits.">
81 <option value="strict" selected="true">strict</option>
82 <option value="relax">relax</option>
83 </param>
84 <conditional name="sourmash_cond">
85 <param name="run" type="select" label="Run sourmash"
86 help="Should sourmash be used for additional genome fraction filtering.">
87 <option value="true" selected="true">yes</option>
88 <option value="false">no</option>
89 </param>
90 <when value="true">
91 <param name="sfhpy_fcv" type="text" value="0.1" label="Enter the minimum coverage match with sourmash before a serotype hit is considered for further processing."
92 help="The default value is set at 10% coverage threshold."/>
93 </when>
94 <when value="false">
95 <param name="sfhpy_fcv" type="select" label="Enter the minimum coverage match with sourmash before a serotype hit is considered for further processing."
96 help="THIS OPTION IS IGNORED IF SOURMASH TOOL IS DISABLED.">
97 <option value="NA" selected="true">N/A</option>
98 </param>
99 </when>
100 </conditional>
146 <param name="fq_filename_delim" type="text" value="_" label="File name delimitor by which samples are grouped together (--fq_filename_delim)" 101 <param name="fq_filename_delim" type="text" value="_" label="File name delimitor by which samples are grouped together (--fq_filename_delim)"
147 help="This is the delimitor by which samples are grouped together to display in the final MultiQC report. For example, if your input data sets are mango_replicate1.fastq.gz, mango_replicate2.fastq.gz, orange_replicate1_maryland.fastq.gz, orange_replicate2_maryland.fastq.gz, then to create 2 samples mango and orange, the value for --fq_filename_delim would be _ (underscore) and the value for --fq_filename_delim_idx would be 1, since you want to group by the first word (i.e. mango or orange) after splitting the filename based on _ (underscore)."/> 102 help="This is the delimitor by which samples are grouped together to display in the final MultiQC report. For example, if your input data sets are mango_replicate1.fastq.gz, mango_replicate2.fastq.gz, orange_replicate1_maryland.fastq.gz, orange_replicate2_maryland.fastq.gz, then to create 2 samples mango and orange, the value for --fq_filename_delim would be _ (underscore) and the value for --fq_filename_delim_idx would be 1, since you want to group by the first word (i.e. mango or orange) after splitting the filename based on _ (underscore)."/>
148 <param name="fq_filename_delim_idx" type="integer" value="1" label="File name delimitor index (--fq_filename_delim_idx)" /> 103 <param name="fq_filename_delim_idx" type="integer" value="1" label="File name delimitor index (--fq_filename_delim_idx)" />
149 <param name="centrifuge_extract_bug" type="text" value="Escherichia coli" label="Reads belonging to this taxa are extracted and a MAG is generated to allow for serotyping"/>
150 <param name="genome_size" type="text" optional="true" value="5.5m" label="Estimated genome size" help="For example, 5m or 2.6g.">
151 <validator type="regex" message="Genome size must be a float or integer, optionally followed by the a unit prefix (kmg)">^([0-9]*[.])?[0-9]+[kmg]?$</validator>
152 </param>
153 <!-- <param name="runtime_profile" type="select" label="Run time profile"> 104 <!-- <param name="runtime_profile" type="select" label="Run time profile">
154 <option value="kondagac" selected="true">conda</option> 105 <option value="kondagac" selected="true">conda</option>
155 <option value="cingularitygac">singularity</option> 106 <option value="cingularitygac">singularity</option>
156 </param> --> 107 </param> -->
157 </inputs> 108 </inputs>
158 <outputs> 109 <outputs>
159 <data name="multiqc_report" format="html" label="${input_read_type_cond.pipeline_cond.pipeline}: MultiQC Report on ${on_string}" from_work_dir="multiqc_report.html"/> 110 <data name="multiqc_report" format="html" label="${input_read_type_cond.pipeline_cond.pipeline}: MultiQC Report on ${on_string}" from_work_dir="multiqc_report.html"/>
160 <collection name="assembled_mags" type="list" label="${input_read_type_cond.pipeline_cond.pipeline}: Assembled MAGs on ${on_string}">
161 <discover_datasets pattern="(?P&lt;name&gt;.*)\.assembly_filtered_contigs\.fasta" ext="fasta" directory="kraken2_extract_contigs"/>
162 </collection>
163 </outputs> 111 </outputs>
164 <tests> 112 <tests>
165 <!--Test 01: long reads--> 113 <!--Test 01: long reads-->
166 <test expect_num_outputs="2"> 114 <test expect_num_outputs="2">
167 <param name="input"> 115 <param name="input">
180 128
181 .. class:: infomark 129 .. class:: infomark
182 130
183 **Purpose** 131 **Purpose**
184 132
185 Centriflaken suite of automated data analysis pipelines are based on Nextflow DSL2 developed at CFSAN, FDA. These pipelines allow rapid 133 bettercallsal is an automated workflow to assign Salmonella serotype based on NCBI Pathogen Detection Project for Salmonella.
186 and effective construction of metagenomic assembled genomes (MAGs) to enable bacterial source-tracking. It is based on methods described in our 134 It uses MASH to reduce the search space followed by additional genome filtering with sourmash. It then performs genome based
187 previous publication (Maguire *et al*, 2021. doi: https://doi.org/10.1371/journal.pone.0245172). 135 alignment with kma followed by count generation using salmon. This workflow can be used to analyze shotgun metagenomics
136 datasets, quasi-metagenomic datasets (enriched for Salmonella) and target enriched datasets (enriched with molecular baits specific for Salmonella)
137 and is especially useful when a sample is a mixture of multiple serovars.
138
139 It is written in Nextflow and is part of the modular data analysis pipelines (CFSAN PIPELINES or CPIPES for short) at CFSAN.
140
188 141
189 ---- 142 ----
190 143
191 .. class:: infomark 144 .. class:: infomark
192 145
193 **Testing and Validation** 146 **Testing and Validation**
194 147
195 The CPIPES - Centriflaken Nextflow pipeline has been wrapped to make it work in Galaxy. It takes in either paired or unpaired short reads or long reads, generates MAGs and performs 148 The CPIPES - bettercallsal Nextflow pipeline has been wrapped to make it work in Galaxy. It takes a list of either paired or unpaired short reads as input
196 in silico-based analysis (i.e., virulence gene finding). Additionally, AMR gene finding analysis is also included in Centriflaken and performed on MAGs 149 and generates a MultiQC report in the final step. The pipeline has been tested on 2x300 bp MiSeq and 2x150 bp NextSeq reads and has been shown to call multiple
197 of interest. The final summary plots and tables can be downloaded from the provided MultiQC HTML report generated as part of the pipeline. 150 Salmonella serotypes with up to ~95% accuracy. The pipeline has also been tested on metagenomics data sets from Peach and Papaya outbreaks as discussed in
198 The Centriflaken pipeline was validated with data from our previously published method (Maguire *et al*, 2021. doi: https://doi.org/10.1371/journal.pone.0245172) and was able to replicate the detection 151 our preprint (https://www.biorxiv.org/content/10.1101/2023.04.06.535929v1.full). All the original testing and validation was
199 and classification of STECs for each sample. We tested the pipeline with Nanopore data obtained from 21 additional enriched samples from
200 irrigation water and was able to perform the entire precision metagenomics analysis in less than 5 hours for all of them. All the original testing and validation was
201 done on the command line on the CFSAN Raven2 HPC Cluster. 152 done on the command line on the CFSAN Raven2 HPC Cluster.
202 153
203 154
204 ---- 155 ----
205 156
213 164
214 - MultiQC Report: Contains a brief summary report including any serotyping and AMR result tables. 165 - MultiQC Report: Contains a brief summary report including any serotyping and AMR result tables.
215 Please note that due to MultiQC customizations, the preview (eye icon) will not 166 Please note that due to MultiQC customizations, the preview (eye icon) will not
216 work within Galaxy for the MultiQC report. Please download the file by clicking 167 work within Galaxy for the MultiQC report. Please download the file by clicking
217 on the floppy icon and view it in your browser on your local desktop/workstation. 168 on the floppy icon and view it in your browser on your local desktop/workstation.
218 - Final assembly: contains contigs and possibly scaffolds. 169 You can export the tables and plots from the downloaded MultiQC report.
219 170
220 ]]></help> 171 ]]></help>
221 <citations> 172 <citations>
222 <citation type="bibtex"> 173 <citation type="bibtex">
223 @misc{gitlabCPIPES, 174 @misc{bettercallsal,
224 author = {Konganti, Kranti}, 175 author = {Konganti, Kranti},
225 year = {2022}, 176 year = {2023},
226 title = {CPIPES - Centriflaken}, 177 title = {bettercallsal: better calling of Salmonella serotypes from enrichment cultures using shotgun metagenomic profiling and its application in an outbreak setting},
227 publisher = {GitLab}, 178 publisher = {Cold Spring Harbor Laboratory},
228 journal = {GitLab repository}, 179 journal = {bioRxiv},
229 url = {https://cfsan-git.fda.gov/Kranti.Konganti/cpipes}} 180 url = {https://www.biorxiv.org/content/10.1101/2023.04.06.535929v1.full}}
230 </citation> 181 </citation>
231 </citations> 182 </citations>
232 </tool> 183 </tool>