comparison cfsan_cronology.xml @ 0:c8597e9e1a97

"planemo upload"
author kkonganti
date Mon, 27 Nov 2023 12:37:44 -0500
parents
children c6327baca625
comparison
equal deleted inserted replaced
-1:000000000000 0:c8597e9e1a97
1 <tool id="cfsan_cronology" name="cronology" version="0.1.0">
2 <description>An automated workflow for Cronobacter isolate assembly, sequence typing and traceback.</description>
3 <requirements> <!-- packages resolved by Galaxy's dependency system before the command runs -->
4 <requirement type="package" version="23.04">nextflow</requirement>
5 <requirement type="package" version="1.0.0">micromamba</requirement>
6 <requirement type="package">graphviz</requirement> <!-- NOTE(review): no version is pinned here, unlike the two packages above; consider pinning for reproducibility -->
7 </requirements>
8 <version_command>nextflow -version</version_command>
9 <command detect_errors="exit_code"><![CDATA[
10 mkdir -p cpipes-input || exit 1; ## staging directory that receives the symlinked reads
11 pwd_path=\$(pwd); ## job working directory, reused for the input and output paths below
12 #import re
13 #if (str($input_read_type_cond.input_read_type) == "single_long"):
14 #for _, $unpaired in enumerate($input_read_type_cond.input):
15 #set read1 = str($unpaired.name)
16 #if not str($unpaired.name).endswith(('.fastq', '.fastq.gz')):
17 #set read1_ext = re.sub('fastqsanger', 'fastq', str($unpaired.ext))
18 #set read1 = str($unpaired.name) + str('.') + $read1_ext
19 #end if
20 ln -sf '$unpaired' './cpipes-input/$read1'; ## link the dataset under its name, extended with the datatype extension when the name lacked one
21 #end for
22 #elif (str($input_read_type_cond.input_read_type) == "paired"):
23 #for _, $pair in enumerate($input_read_type_cond.input_pair)
24 #set read_R1 = re.sub('\:forward', '_forward', str($pair.forward.name))
25 #set read_R2 = re.sub('\:reverse', '_reverse', str($pair.reverse.name))
26 #set read_R1_ext = re.sub('fastqsanger', 'fastq', str($pair.forward.ext))
27 #set read_R2_ext = re.sub('fastqsanger', 'fastq', str($pair.reverse.ext))
28 #if not str($pair.forward.name).endswith(('.fastq', '.fastq.gz')):
29 #set read_R1 = $read_R1 + str('.') + $read_R1_ext
30 #end if
31 #if not str($pair.reverse.name).endswith(('.fastq', '.fastq.gz')):
32 #set read_R2 = $read_R2 + str('.') + $read_R2_ext
33 #end if
34 ln -sf '$pair.forward' './cpipes-input/$read_R1'; ## forward mate, with any colon-separated direction tag rewritten to an underscore form
35 ln -sf '$pair.reverse' './cpipes-input/$read_R2'; ## reverse mate, same renaming rule
36 #end for
37 #end if
38 $__tool_directory__/0.1.0/cpipes
39 --pipeline cronology
40 --input \${pwd_path}/cpipes-input
41 --output \${pwd_path}/cpipes-output
42 --fq_suffix '${input_read_type_cond.fq_suffix}'
43 #if (str($input_read_type_cond.input_read_type) == "single_long"):
44 --fq_single_end true
45 #elif (str($input_read_type_cond.input_read_type) == "paired"):
46 --fq_single_end false --fq2_suffix '${input_read_type_cond.fq2_suffix}'
47 #end if
48 --ref_acc '$refgenome' ## free-text value, single-quoted like the other text parameters so the shell keeps it as one argument
49 --tuspy_n $tuspy_n
50 --fq_filename_delim '${fq_filename_delim}'
51 --fq_filename_delim_idx $fq_filename_delim_idx
52 -profile kondagac;
53 mv './cpipes-output/cronology-multiqc/multiqc_report.html' './multiqc_report.html' > /dev/null 2>&1 || exit 1; ## a missing report fails the job
54 mv './cpipes-output/mashtree/hitsTree.dnd' './hitsTree.dnd' > /dev/null 2>&1 || exit 1; ## a missing tree fails the job
55 ]]></command>
56 <inputs> <!-- read collection selector followed by the pipeline options passed on the command line -->
57 <conditional name="input_read_type_cond">
58 <param name="input_read_type" type="select" label="Select the read collection type">
59 <option value="single_long" selected="true">Single-End short reads</option>
60 <option value="paired">Paired-End short reads</option>
61 </param>
62 <when value="single_long">
63 <param name="input" type="data_collection" collection_type="list" format="fastq,fastq.gz"
64 label="Dataset list of unpaired short reads or long reads" />
65 <param name="fq_suffix" value=".fastq.gz" type="text" label="Suffix of the Single-End FASTQ"/>
66 </when>
67 <when value="paired">
68 <param name="input_pair" type="data_collection" collection_type="list:paired" format="fastq,fastq.gz" label="List of Dataset pairs" />
69 <param name="fq_suffix" value="_R1_001.fastq.gz" type="text" label="Suffix of the R1 FASTQ"
70 help="For any data sets downloaded from NCBI into Galaxy, change this to _forward.fastq.gz suffix."/>
71 <param name="fq2_suffix" value="_R2_001.fastq.gz" type="text" label="Suffix of the R2 FASTQ"
72 help="For any data sets downloaded from NCBI into Galaxy, change this to _reverse.fastq.gz suffix."/>
73 </when>
74 </conditional>
75 <param name="refgenome" optional="true" value="GCF_003516125" type="text"
76 label="NCBI reference genome accession"
77 help="Is the reference genome other than &lt;i&gt;Cronobacter sakazakii&lt;/i&gt;? Reference genome FASTA is used as a model for gene prediction. DO NOT ENTER THE DECIMAL PART (Ex: enter GCF_003516125, not GCF_003516125.1)." />
78 <param name="tuspy_n" optional="true" value="10" type="integer" label="Enter the number of top unique hits to retain after initial MASH screen step"
79 help="These hits will be used to build a genome distance based tree for your experiment run. Default value of 10 is suitable for almost all scenarios."/>
80 <param name="fq_filename_delim" type="text" value="_" label="File name delimiter by which samples are grouped together (--fq_filename_delim)"
81 help="This is the delimiter by which samples are grouped together to display in the final MultiQC report. For example, if your input data sets are mango_replicate1.fastq.gz, mango_replicate2.fastq.gz, orange_replicate1_maryland.fastq.gz, orange_replicate2_maryland.fastq.gz, then to create 2 samples mango and orange, the value for --fq_filename_delim would be _ (underscore) and the value for --fq_filename_delim_idx would be 1, since you want to group by the first word (i.e. mango or orange) after splitting the filename based on _ (underscore)."/>
82 <param name="fq_filename_delim_idx" type="integer" value="1" label="File name delimiter index (--fq_filename_delim_idx)" />
83 </inputs>
84 <outputs> <!-- two fixed files picked up from the job directory, plus three collections discovered under cpipes-output -->
85 <data name="multiqc_report" format="html" label="cronology: MultiQC Report on ${on_string}" from_work_dir="multiqc_report.html"/>
86 <data name="mashtree" format="newick" label="cronology: Genome distance based tree on ${on_string}" from_work_dir="hitsTree.dnd"/>
87 <collection name="itol_metadata" type="list" label="cronology: iTOL Metadata: ${on_string}">
88 <discover_datasets pattern="(?P&lt;name&gt;.*)\.txt" ext="txt" match_relative_path="true" directory="./cpipes-output/cat_unique"/>
89 </collection>
90 <collection name="gene_models" type="list" label="cronology: Predicted gene models: ${on_string}">
91 <discover_datasets pattern="(?P&lt;name&gt;.*)\.gff" ext="gff" match_relative_path="true" recurse="true" directory="./cpipes-output/prokka"/>
92 </collection>
93 <collection name="assemblies" type="list" label="cronology: Polished genome assemblies: ${on_string}">
94 <discover_datasets pattern="(?P&lt;name&gt;.*)\.fa" ext="fasta" match_relative_path="true" directory="./cpipes-output/polypolish"/>
95 </collection>
96 </outputs>
97 <tests>
98 <!--Test 01: long reads-->
99 <test expect_num_outputs="2"> <!-- NOTE(review): the tool declares two data outputs and three collections; confirm whether this count should include the collections -->
100 <param name="input">
101 <collection type="list">
102 <element name="FAL11127.fastq.gz" value="FAL11127.fastq.gz" />
103 <element name="FAL11341.fastq.gz" value="FAL11341.fastq.gz" />
104 <element name="FAL11342.fastq.gz" value="FAL11342.fastq.gz" />
105 </collection>
106 </param>
107 <param name="fq_suffix" value=".fastq.gz"/>
108 <output name="multiqc_report" file="multiqc_report.html" ftype="html" compare="sim_size"/>
109 <!-- <output name="assembled_mags" file="FAL11127.assembly_filtered.contigs.fasta" ftype="fasta" compare="sim_size"/> -->
110 </test>
111 </tests>
112 <help><![CDATA[
113
114 .. class:: infomark
115
116 **Purpose**
117
118 cronology is an automated workflow for Cronobacter isolate assembly, sequence typing and traceback.
119 It uses MASH to reduce the search space followed by additional genome filtering with sourmash. It then performs genome based
120 alignment with kma followed by count generation using salmon. This workflow can be used to analyze shotgun metagenomics
121 datasets, quasi-metagenomic datasets (enriched for Salmonella) and target enriched datasets (enriched with molecular baits specific for Salmonella)
122 and is especially useful in a case where a sample is of multi-serovar mixture.
123
124 It is written in Nextflow and is part of the modular data analysis pipelines (CFSAN PIPELINES or CPIPES for short) at CFSAN.
125
126
127 ----
128
129 .. class:: infomark
130
131 **Testing and Validation**
132
133 The CPIPES - cronology Nextflow pipeline has been wrapped to make it work in Galaxy. It takes in either paired or unpaired short reads list as an input
134 and generates a MultiQC report in the final step. The pipeline has been tested on 2x300 bp MiSeq and 2x150 bp NextSeq simulated reads and has been shown to call multiple
135 Salmonella serotypes with up to ~95% accuracy. The pipeline has also been tested on metagenomics data sets from Peach and Papaya outbreaks as discussed in
136 our publication (https://www.frontiersin.org/articles/10.3389/fmicb.2023.1200983/full). All the original testing and validation was
137 done on the command line on the CFSAN Raven2 HPC Cluster.
138
139
140 ----
141
142 .. class:: infomark
143
144 **Outputs**
145
146 The main output file is a:
147
148 ::
149
150 - MultiQC Report: Contains a brief summary report including any serotyping and AMR result tables.
151 Please note that due to MultiQC customizations, the preview (eye icon) will not
152 work within Galaxy for the MultiQC report. Please download the file by clicking
153 on the floppy icon and view it in your browser on your local desktop/workstation.
154 You can export the tables and plots from the downloaded MultiQC report.
155
156 ]]></help>
157 <citations>
158 <citation type="bibtex">
159 @article{cronology,
160 author = {Konganti, Kranti},
161 year = {2023},
162 month = {August},
163 title = {bettercallsal: better calling of Salmonella serotypes from enrichment cultures using shotgun metagenomic profiling and its application in an outbreak setting},
164 journal = {Frontiers in Microbiology},
165 doi = {10.3389/fmicb.2023.1200983},
166 url = {https://www.frontiersin.org/articles/10.3389/fmicb.2023.1200983/full}}
167 </citation>
168 </citations>
169 </tool>