comparison hfp_centriflaken.xml @ 0:082e0091e813 draft default tip

planemo upload
author galaxytrakr
date Fri, 29 May 2026 13:27:47 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:082e0091e813
1 <tool id="hfp_centriflaken_awsbatch" name="centriflaken" version="0.4.2+awsbatch">
2 <description>An automated pipeline to generate a MAG of interest (E. coli or Salmonella) and perform serotyping.</description>
3 <requirements>
4 <container type="docker">quay.io/galaxytrakr/mulled-v2-ebd88135862aa647eeae73d4d8e6ea8ec81245cd:v5.0</container>
5 </requirements>
6 <version_command>nextflow -version</version_command>
7 <command detect_errors="exit_code"><![CDATA[
8 export MAMBA_ROOT_PREFIX="/server/galaxy/data/nextflow-micromamba-cache"; ## absolute, site-specific path used as the micromamba root prefix
9 export NXF_HOME=\$(pwd)"/.nextflow-home"; ## keep the Nextflow home inside the job working directory
10 input_path=\$(pwd)"/cpipes-input"; ## staging directory that receives symlinks to the selected reads
11 mkdir -p "\${input_path}" || exit 1;
12 #import re
13 #if (str($input_read_type_cond.input_read_type) == "single_long"):
14 #for _, $unpaired in enumerate($input_read_type_cond.input):
15 #set read1 = str($unpaired.name)
16 #if not str($unpaired.name).endswith(('.fastq', '.fastq.gz')):
17 #set read1_ext = re.sub('fastqsanger', 'fastq', str($unpaired.ext))
18 #set read1 = str($unpaired.name) + str('.') + $read1_ext
19 #end if
20 ln -sf '$unpaired' "\${input_path}/$read1"; ## NOTE(review): the dataset name is interpolated unescaped into a double-quoted shell string (also for the paired links below) - confirm names cannot contain double quotes, backticks or slashes
21 #end for
22 #elif (str($input_read_type_cond.input_read_type) == "paired"):
23 #for _, $pair in enumerate($input_read_type_cond.input_pair)
24 #set read_R1 = re.sub('\:forward', '_forward', str($pair.forward.name))
25 #set read_R2 = re.sub('\:reverse', '_reverse', str($pair.reverse.name))
26 #set read_R1_ext = re.sub('fastqsanger', 'fastq', str($pair.forward.ext))
27 #set read_R2_ext = re.sub('fastqsanger', 'fastq', str($pair.reverse.ext))
28 #if not str($pair.forward.name).endswith(('.fastq', '.fastq.gz')):
29 #set read_R1 = $read_R1 + str('.') + $read_R1_ext
30 #end if
31 #if not str($pair.reverse.name).endswith(('.fastq', '.fastq.gz')):
32 #set read_R2 = $read_R2 + str('.') + $read_R2_ext
33 #end if
34 ln -sf '$pair.forward' "\${input_path}/$read_R1";
35 ln -sf '$pair.reverse' "\${input_path}/$read_R2";
36 #end for
37 #end if
38 $__tool_directory__/0.4.2/cpipes
39 --pipeline $input_read_type_cond.pipeline_cond.pipeline
40 #if ($input_read_type_cond.pipeline_cond.pipeline == "centriflaken"):
41 --fq_single_end true
42 --flye_genome_size '${genome_size}'
43 #if ($input_read_type_cond.pipeline_cond.long_read_platform == "nanopore_corr"):
44 --flye_nano_corr true --flye_nano_raw false
45 #elif ($input_read_type_cond.pipeline_cond.long_read_platform == "nanopore_hq"):
46 --flye_nano_hq true --flye_nano_raw false
47 #elif ($input_read_type_cond.pipeline_cond.long_read_platform == "pacbio_raw"):
48 --flye_pacbio_raw true --flye_nano_raw false
49 #elif ($input_read_type_cond.pipeline_cond.long_read_platform == "pacbio_corr"):
50 --flye_pacbio_corr true --flye_nano_raw false
51 #elif ($input_read_type_cond.pipeline_cond.long_read_platform == "pacbio_hifi"):
52 --flye_pacbio_hifi true --flye_nano_raw false
53 #end if
54 #elif ($input_read_type_cond.pipeline_cond.pipeline == "centriflaken_hy"):
55 #if (str($input_read_type_cond.input_read_type) == "single_long"):
56 --fq_single_end true
57 #elif (str($input_read_type_cond.input_read_type) == "paired"):
58 --fq_single_end false --fq2_suffix '${input_read_type_cond.fq2_suffix}'
59 #end if
60 #end if
61 --input "\${input_path}"
62 --output cpipes-output
63 --fq_suffix '${input_read_type_cond.fq_suffix}'
64 #if ($fq_filter_by_len != ""):
65 --fq_filter_by_len $fq_filter_by_len
66 #end if
67 --fq_filename_delim '${fq_filename_delim}'
68 --fq_filename_delim_idx $fq_filename_delim_idx
69 --centrifuge_extract_bug '${centrifuge_extract_bug}'
70 #if (str($input_read_type_cond.pipeline_cond.rm_dup_seqs) == "true"):
71 --seqkit_rmdup_run true
72 #end if
73 -profile stdkondagac || exit 1; ## stop here when the pipeline fails so the mv steps below cannot mask its exit status
74 mv "./cpipes-output/${input_read_type_cond.pipeline_cond.pipeline}-multiqc/CPIPES-Report_multiqc_report.html" "./multiqc_report.html" || exit 1;
75 mv "./cpipes-output/${input_read_type_cond.pipeline_cond.pipeline}-results/kraken2_extract_contigs" "kraken2_extract_contigs" || exit 1;
76 ]]></command>
77 <inputs>
78 <conditional name="input_read_type_cond"> <!-- read layout selector; each branch exposes its own collection input, pipeline choice and FASTQ suffix settings -->
79 <param name="input_read_type" type="select" label="Select the read collection type">
80 <option value="single_long" selected="true">Unpaired reads (i.e. Single-End short reads or Long reads)</option>
81 <option value="paired">Paired-End reads</option>
82 </param>
83 <when value="single_long">
84 <param name="input" type="data_collection" collection_type="list" format="fastq,fastq.gz"
85 label="Dataset list of unpaired short reads or long reads" />
86 <conditional name="pipeline_cond">
87 <param name="pipeline" type="select" label="CPIPES Workflow name"
88 help="centriflaken: for long reads (Nanopore or PacBio). centriflaken_hy: for unpaired short reads. Default: centriflaken">
89 <option value="centriflaken" selected="true">centriflaken</option>
90 <option value="centriflaken_hy">centriflaken_hy</option>
91 </param>
92 <when value="centriflaken">
93 <param name="long_read_platform" type="select" label="Mention long read sequencing platform and type">
94 <option value="nanopore_raw" selected="true">Nanopore raw reads, pre-Guppy5 (&lt;20% error)</option> <!-- default; the command template adds no extra Flye flags for this value -->
95 <option value="nanopore_corr">Nanopore reads that were corrected with other methods (&lt;3% error)</option>
96 <option value="nanopore_hq">Nanopore high-quality reads, Guppy5+ SUP or Q20 (&lt;5% error)</option>
97 <option value="pacbio_raw">PacBio regular CLR reads (&lt;20% error)</option>
98 <option value="pacbio_corr">PacBio reads that were corrected with other methods (&lt;3% error)</option>
99 <option value="pacbio_hifi">PacBio HiFi reads (&lt;1% error)</option>
100 </param>
101 <param name="rm_dup_seqs" type="select" label="Remove duplicate sequences"
102 help="THIS OPTION IS IGNORED IF THE INPUT READS ARE LONG READS.">
103 <option value="NA" selected="true">N/A</option>
104 </param> <!-- single N/A placeholder: the command template reads rm_dup_seqs for every pipeline, so it must exist in this branch as well -->
105 </when>
106 <when value="centriflaken_hy">
107 <param name="long_read_platform" type="select" label="Mention long read sequencing platform and type"
108 help="THIS OPTION IS IGNORED IF THE INPUT READS ARE SHORT READS.">
109 <option value="NA" selected="true">N/A</option>
110 </param>
111 <param name="rm_dup_seqs" type="select" label="Remove duplicate sequences"
112 help="Selecting yes will compare sequence content and remove identical sequences i.e. only the first occurred sequence record will be saved.">
113 <option value="true">yes</option>
114 <option value="false" selected="true">no</option>
115 </param>
116 </when>
117 </conditional>
118 <param name="fq_suffix" value=".fastq.gz" type="text" label="Suffix of the Unpaired FASTQ"/>
119 </when>
120 <when value="paired">
121 <param name="input_pair" type="data_collection" collection_type="list:paired" format="fastq,fastq.gz" label="List of Dataset pairs" />
122 <conditional name="pipeline_cond">
123 <param name="pipeline" type="select" label="CPIPES Workflow name"
124 help="Auto selected centriflaken_hy workflow for paired-end short reads.">
125 <option value="centriflaken_hy" selected="true">centriflaken_hy</option>
126 </param>
127 <when value="centriflaken_hy">
128 <param name="long_read_platform" type="select" label="Mention long read sequencing platform and type"
129 help="THIS OPTION IS IGNORED IF THE INPUT READS ARE SHORT READS.">
130 <option value="NA" selected="true">N/A</option>
131 </param>
132 <param name="rm_dup_seqs" type="select" label="Remove duplicate sequences"
133 help="Selecting yes will compare sequence content and remove identical sequences i.e. only the first occurred sequence record will be saved.">
134 <option value="true">yes</option>
135 <option value="false" selected="true">no</option>
136 </param>
137 </when>
138 </conditional>
139 <param name="fq_suffix" value="_R1_001.fastq.gz" type="text" label="Suffix of the R1 FASTQ"/>
140 <param name="fq2_suffix" value="_R2_001.fastq.gz" type="text" label="Suffix of the R2 FASTQ"/>
141 </when>
142 </conditional>
143 <param name="fq_filter_by_len" optional="true" value="" type="integer" label="Enter minimum read length to retain before starting the analysis"
144 help="Keep this option empty to use default values. Default for centriflaken (long reads) is 4000 bp and for centriflaken_hy (short reads) is 75 bp."/>
145 <param name="fq_filename_delim" type="text" value="_" label="File name delimiter by which samples are grouped together (--fq_filename_delim)"
146 help="This is the delimiter by which samples are grouped together to display in the final MultiQC report. For example, if your input data sets are mango_replicate1.fastq.gz, mango_replicate2.fastq.gz, orange_replicate1_maryland.fastq.gz, orange_replicate2_maryland.fastq.gz, then to create 2 samples mango and orange, the value for --fq_filename_delim would be _ (underscore) and the value for --fq_filename_delim_idx would be 1, since you want to group by the first word (i.e. mango or orange) after splitting the filename based on _ (underscore)."/>
147 <param name="fq_filename_delim_idx" type="integer" value="1" label="File name delimiter index (--fq_filename_delim_idx)" />
148 <param name="centrifuge_extract_bug" type="text" value="Escherichia coli" label="Reads belonging to this taxa are extracted and a MAG is generated to allow for serotyping"/>
149 <param name="genome_size" type="text" optional="true" value="5.5m" label="Estimated genome size" help="For example, 5m or 2.6g."> <!-- only passed to the pipeline (as flye_genome_size) when the centriflaken workflow is selected -->
150 <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit suffix (k, m or g)">^([0-9]*[.])?[0-9]+[kmg]?$</validator>
151 </param>
152 <!-- <param name="runtime_profile" type="select" label="Run time profile">
153 <option value="kondagac" selected="true">conda</option>
154 <option value="cingularitygac">singularity</option>
155 </param> -->
156 </inputs>
157 <outputs>
158 <data name="multiqc_report" format="html" label="${input_read_type_cond.pipeline_cond.pipeline}: MultiQC Report on ${on_string}" from_work_dir="multiqc_report.html"/> <!-- multiqc_report.html is moved into the working directory by the last steps of the command block -->
159 <collection name="assembled_mags" type="list" label="${input_read_type_cond.pipeline_cond.pipeline}: Assembled MAGs on ${on_string}">
160 <discover_datasets pattern="(?P&lt;name&gt;.*)\.assembly_filtered_contigs\.fasta" ext="fasta" directory="kraken2_extract_contigs"/> <!-- one list element per matching FASTA in kraken2_extract_contigs; the text before the suffix becomes the element name. NOTE(review): the commented-out test expectation spells the file assembly_filtered.contigs.fasta (dot instead of underscore) - confirm which name the pipeline emits -->
161 </collection>
162 </outputs>
163 <tests>
164 <!-- Test 01: a list of three long-read FASTQ files run with the default selections (unpaired reads, centriflaken pipeline); only the MultiQC report is compared, and only by approximate size -->
165 <test expect_num_outputs="2">
166 <param name="input">
167 <collection type="list">
168 <element name="FAL11127.fastq.gz" value="FAL11127.fastq.gz" />
169 <element name="FAL11341.fastq.gz" value="FAL11341.fastq.gz" />
170 <element name="FAL11342.fastq.gz" value="FAL11342.fastq.gz" />
171 </collection>
172 </param>
173 <param name="fq_suffix" value=".fastq.gz"/>
174 <output name="multiqc_report" file="multiqc_report.html" ftype="html" compare="sim_size"/>
175 <!-- <output name="assembled_mags" file="FAL11127.assembly_filtered.contigs.fasta" ftype="fasta" compare="sim_size"/> -->
176 </test>
177 </tests>
178 <help><![CDATA[
179
180 .. class:: infomark
181
182 **Purpose**
183
184 The Centriflaken suite of automated data analysis pipelines is based on Nextflow DSL2 and was developed at CFSAN, FDA. These pipelines allow rapid
185 and effective construction of metagenomic assembled genomes (MAGs) to enable bacterial source-tracking. It is based on methods described in our
186 previous publication (Maguire *et al*, 2021. doi: https://doi.org/10.1371/journal.pone.0245172).
187
188 ----
189
190 .. class:: infomark
191
192 **Testing and Validation**
193
194 The CPIPES - Centriflaken Nextflow pipeline has been wrapped to make it work in Galaxy. It takes in either paired or unpaired short reads or long reads, generates MAGs and performs
195 in silico-based analysis (i.e., virulence gene finding). Additionally, AMR gene finding analysis is also included in Centriflaken and performed on MAGs
196 of interest. The final summary plots and tables can be downloaded from the provided MultiQC HTML report generated as part of the pipeline.
197 The Centriflaken pipeline was validated with data from our previously published method (Maguire *et al*, 2021. doi: https://doi.org/10.1371/journal.pone.0245172) and was able to replicate the detection
198 and classification of STECs for each sample. We tested the pipeline with Nanopore data obtained from 21 additional enriched samples from
199 irrigation water and were able to perform the entire precision metagenomics analysis in less than 5 hours for all of them. All the original testing and validation was
200 done on the command line on the CFSAN Raven2 HPC Cluster.
201
202
203 ----
204
205 .. class:: infomark
206
207 **Outputs**
208
209 The main output files are:
210
211 ::
212
213 - MultiQC Report: Contains a brief summary report including any serotyping and AMR result tables.
214 Please note that due to MultiQC customizations, the preview (eye icon) will not
215 work within Galaxy for the MultiQC report. Please download the file by clicking
216 on the floppy icon and view it in your browser on your local desktop/workstation.
217 - Final assembly: contains contigs and possibly scaffolds.
218
219 ]]></help>
220 <citations> <!-- single BibTeX record pointing at the CPIPES GitLab repository -->
221 <citation type="bibtex">
222 @misc{gitlabCPIPES,
223 author = {Konganti, Kranti},
224 year = {2022},
225 title = {CPIPES - Centriflaken},
226 publisher = {GitLab},
227 journal = {GitLab repository},
228 url = {https://cfsan-git.fda.gov/Kranti.Konganti/cpipes}}
229 </citation>
230 </citations>
231 </tool>