annotate cfsan_centriflaken.xml @ 54:d6539bfd7a46

"planemo upload"
author kkonganti
date Tue, 12 Jul 2022 16:53:01 -0400
parents 2d0e3b6c693a
children 98ed61dc2d4c
rev   line source
kkonganti@0 1 <tool id="cfsan_centriflaken" name="Centriflaken" version="0.2.0+galaxy0">
kkonganti@0 2 <description>An automated pipeline to generate a MAG of interest (E.coli or Salmonella) and perform serotyping.</description>
kkonganti@0 3 <requirements> <!-- Package dependencies declared for this tool: nextflow and graphviz. -->
kkonganti@0 4 <requirement type="package" version="22.04">nextflow</requirement>
kkonganti@0 5 <requirement type="package">graphviz</requirement> <!-- NOTE(review): no version is pinned here, unlike nextflow; confirm whether that is intentional. -->
kkonganti@0 6 </requirements>
kkonganti@0 7 <version_command>nextflow -version</version_command>
kkonganti@0 8 <!-- Links every collection element into cpipes-input, runs cpipes for the selected pipeline, moves the MultiQC report and the kraken2_extract_contigs directory into the job directory, then removes cpipes-output and work. All steps are chained with the shell AND operator so that a failure in any step becomes the exit code checked by detect_errors. --> <command detect_errors="exit_code"><![CDATA[
kkonganti@50 9 mkdir -p cpipes-input &&
kkonganti@4 10 #for $key in $input.keys()
kkonganti@4 11 ln -sf '$input[$key]' './cpipes-input/$key' &&
kkonganti@0 12 #end for
kkonganti@16 13 pwd_path=\$(pwd) &&
kkonganti@0 14 '$__tool_directory__/0.2.1/cpipes'
kkonganti@26 15 --pipeline $pipeline
kkonganti@4 16 #if ($pipeline == "centriflaken"):
kkonganti@4 17 --fq_single_end true
kkonganti@33 18 --flye_genome_size '${genome_size}'
kkonganti@33 19 #if ($long_read_platform == "nanopore_corr"):
kkonganti@33 20 --flye_nano_corr true --flye_nano_raw false
kkonganti@33 21 #elif ($long_read_platform == "nanopore_hq"):
kkonganti@33 22 --flye_nano_hq true --flye_nano_raw false
kkonganti@33 23 #elif ($long_read_platform == "pacbio_raw"):
kkonganti@33 24 --flye_pacbio_raw true --flye_nano_raw false
kkonganti@33 25 #elif ($long_read_platform == "pacbio_corr"):
kkonganti@33 26 --flye_pacbio_corr true --flye_nano_raw false
kkonganti@33 27 #elif ($long_read_platform == "pacbio_hifi"):
kkonganti@33 28 --flye_pacbio_hifi true --flye_nano_raw false
kkonganti@33 29 #end if
kkonganti@16 30 #elif ($pipeline == "centriflaken_hy"):
kkonganti@4 31 #if ($reads_lib_layout == "single"):
kkonganti@4 32 --fq_single_end true
kkonganti@30 33 #elif ($reads_lib_layout == "paired"):
kkonganti@32 34 --fq_single_end false --fq2_suffix '${fq2_suffix}'
kkonganti@4 35 #end if
kkonganti@0 36 #end if
kkonganti@0 37 --input "\${pwd_path}/cpipes-input"
kkonganti@0 38 --output "\${pwd_path}/cpipes-output"
kkonganti@4 39 --fq_suffix '${fq_suffix}'
kkonganti@41 40 #if ($fq_filter_by_len != ""):
kkonganti@39 41 --fq_filter_by_len $fq_filter_by_len
kkonganti@39 42 #end if
kkonganti@0 43 --fq_filename_delim '${fq_filename_delim}'
kkonganti@0 44 --fq_filename_delim_idx $fq_filename_delim_idx
kkonganti@0 45 --centrifuge_extract_bug '${centrifuge_extract_bug}'
kkonganti@47 46 -profile kondagac &&
kkonganti@24 47 mv './cpipes-output/${pipeline}-multiqc/multiqc_report.html' './multiqc_report.html' &&
kkonganti@24 48 mv './cpipes-output/${pipeline}-results/kraken2_extract_contigs' kraken2_extract_contigs &&
kkonganti@46 49 rm -rf ./cpipes-output &&
kkonganti@46 50 rm -rf ./work
kkonganti@0 51 ]]></command>
kkonganti@0 52 <inputs> <!-- User-facing parameters; each param name is referenced as a Cheetah variable in the command template. -->
kkonganti@3 53 <param name="input" type="data_collection" collection_type="list" format="fastq,fastq.gz,fastqsanger.gz,fastqsanger" label="Input read collection" />
kkonganti@31 54 <param name="pipeline" type="select" label="CPIPES Workflow name"
kkonganti@9 55 help="centriflaken: for long reads (Nanopore or PacBio). centriflaken_hy: for short reads (paired or unpaired). Default: centriflaken">
kkonganti@31 56 <option value="centriflaken" selected="true">centriflaken</option>
kkonganti@4 57 <option value="centriflaken_hy">centriflaken_hy</option>
kkonganti@4 58 </param>
kkonganti@31 59 <param name="reads_lib_layout" type="select" label="Short Read Library Layout"
kkonganti@28 60 help="Leave this option as Single-End for centriflaken. If the pipeline is centriflaken_hy (i.e for short reads), what is the library layout? Default: Single-End">
kkonganti@31 61 <option value="single" selected="true">Single-End</option>
kkonganti@4 62 <option value="paired">Paired-End</option>
kkonganti@4 63 </param>
kkonganti@31 64 <param name="long_read_platform" type="select" label="Mention long read sequencing platform and type"
kkonganti@25 65 help="THIS OPTION IS IGNORED IF THE INPUT READS ARE SHORT READS.">
kkonganti@31 66 <option value="nanopore_raw" selected="true">Nanopore raw reads, pre-Guppy5 (&lt;20% error)</option>
kkonganti@25 67 <option value="nanopore_corr">Nanopore reads that were corrected with other methods (&lt;3% error)</option>
kkonganti@25 68 <option value="nanopore_hq">Nanopore high-quality reads, Guppy5+ SUP or Q20 (&lt;5% error)</option>
kkonganti@25 69 <option value="pacbio_raw">PacBio regular CLR reads (&lt;20% error)</option>
kkonganti@25 70 <option value="pacbio_corr">PacBio reads that were corrected with other methods (&lt;3% error)</option>
kkonganti@25 71 <option value="pacbio_hifi">PacBio HiFi reads (&lt;1% error)</option>
kkonganti@25 72 </param>
kkonganti@4 73 <param name="fq_suffix" value=".fastq.gz" type="text" label="Suffix of the R1 FASTQ or Unpaired FASTQ"/>
kkonganti@47 74 <param name="fq2_suffix" value="_R2_001.fastq.gz" type="text" label="Suffix of the R2 FASTQ"
kkonganti@47 75 help="THIS OPTION IS IGNORED IF THE INPUT READS ARE UNPAIRED/LONG READS."/>
kkonganti@44 76 <param name="fq_filter_by_len" optional="true" value="" type="integer" label="Enter minimum read length to retain before starting the analysis"
kkonganti@48 77 help="Keep this option empty to use default values. Default for centriflaken (long reads) is 4000 bp and for centriflaken_hy (short reads) is 75 bp."/>
kkonganti@40 78 <param name="fq_filename_delim" type="text" value="_" label="File name delimiter by which samples are grouped together (--fq_filename_delim)"
kkonganti@48 79 help="This is the delimiter by which samples are grouped together to display in the final MultiQC report. For example, if your input data sets are mango_replicate1.fastq.gz, mango_replicate2.fastq.gz, orange_replicate1_maryland.fastq.gz, orange_replicate2_maryland.fastq.gz, then to create 2 samples mango and orange, the value for --fq_filename_delim would be _ (underscore) and the value for --fq_filename_delim_idx would be 1, since you want to group by the first word (i.e. mango or orange) after splitting the filename based on _ (underscore)."/>
kkonganti@6 80 <param name="fq_filename_delim_idx" type="integer" value="1" label="File name delimiter index (--fq_filename_delim_idx)" />
kkonganti@0 81 <param name="centrifuge_extract_bug" type="text" value="Escherichia coli" label="Reads belonging to this taxa are extracted and a MAG is generated to allow for serotyping"/>
kkonganti@0 82 <param name="genome_size" type="text" optional="true" value="5.5m" label="Estimated genome size" help="For example, 5m or 2.6g.">
kkonganti@0 83 <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit prefix (k, m, or g)">^([0-9]*[.])?[0-9]+[kmg]?$</validator>
kkonganti@0 84 </param>
kkonganti@47 85 <!-- <param name="runtime_profile" type="select" label="Run time profile">
kkonganti@31 86 <option value="kondagac" selected="true">conda</option>
kkonganti@12 87 <option value="cingularitygac">singularity</option>
kkonganti@47 88 </param> -->
kkonganti@0 89 </inputs>
kkonganti@0 90 <outputs> <!-- One HTML report taken from the job directory, plus a list collection of FASTA files discovered in kraken2_extract_contigs. -->
kkonganti@36 91 <data name="multiqc_report" format="html" label="${pipeline}: MultiQC Report on ${on_string}" from_work_dir="multiqc_report.html"/>
kkonganti@36 92 <collection name="assembled_mags" type="list" label="${pipeline}: Assembled MAGs on ${on_string}">
kkonganti@24 93 <discover_datasets pattern="(?P&lt;name&gt;.*)\.assembly_filtered_contigs\.fasta" ext="fasta" directory="kraken2_extract_contigs"/> <!-- The named group "name" captures the file name up to .assembly_filtered_contigs.fasta. -->
kkonganti@18 94 </collection>
kkonganti@0 95 </outputs>
kkonganti@3 96 <tests> <!-- Single test: a list of three gzipped FASTQ files with fq_suffix set to .fastq.gz and every other parameter left unset; only the MultiQC report is compared, by similar size. -->
kkonganti@3 97 <!--Test 01: long reads-->
kkonganti@3 98 <test expect_num_outputs="2">
kkonganti@4 99 <param name="input">
kkonganti@3 100 <collection type="list">
kkonganti@4 101 <element name="FAL11127.fastq.gz" value="FAL11127.fastq.gz" />
kkonganti@4 102 <element name="FAL11341.fastq.gz" value="FAL11341.fastq.gz" />
kkonganti@4 103 <element name="FAL11342.fastq.gz" value="FAL11342.fastq.gz" />
kkonganti@3 104 </collection>
kkonganti@3 105 </param>
kkonganti@3 106 <param name="fq_suffix" value=".fastq.gz"/>
kkonganti@3 107 <output name="multiqc_report" file="multiqc_report.html" ftype="html" compare="sim_size"/>
kkonganti@18 108 <!-- <output name="assembled_mags" file="FAL11127.assembly_filtered.contigs.fasta" ftype="fasta" compare="sim_size"/> --> <!-- NOTE(review): the disabled check above names "assembly_filtered.contigs" while the discover_datasets pattern in outputs expects "assembly_filtered_contigs"; reconcile before enabling. -->
kkonganti@3 109 </test>
kkonganti@3 110 </tests>
kkonganti@0 111 <help><![CDATA[
kkonganti@0 112
kkonganti@0 113 .. class:: infomark
kkonganti@0 114
kkonganti@0 115 **Purpose**
kkonganti@0 116
kkonganti@50 117 The Centriflaken suite of automated data analysis pipelines is based on Nextflow DSL2 and was developed at CFSAN, FDA. These pipelines allow rapid
kkonganti@0 118 and effective construction of metagenomic assembled genomes (MAGs) to enable bacterial source-tracking. It is based on methods described in our
kkonganti@53 119 previous publication (Maguire *et al*, 2021. doi: https://doi.org/10.1371/journal.pone.0245172).
kkonganti@14 120
kkonganti@0 121 ----
kkonganti@0 122
kkonganti@0 123 .. class:: infomark
kkonganti@0 124
kkonganti@0 125 **Testing and Validation**
kkonganti@0 126
kkonganti@47 127 The CPIPES - Centriflaken Nextflow pipeline has been wrapped to make it work in Galaxy. It takes in either paired or unpaired short reads or long reads, generates MAGs and performs
kkonganti@0 128 in silico-based analysis (i.e., virulence gene finding). Additionally, AMR gene finding analysis is also included in Centriflaken and performed on MAGs
kkonganti@0 129 of interest. The final summary plots and tables can be downloaded from the provided MultiQC HTML report generated as part of the pipeline.
kkonganti@53 130 The Centriflaken pipeline was validated with data from our previously published method (Maguire *et al*, 2021. doi: https://doi.org/10.1371/journal.pone.0245172) and was able to replicate the detection
kkonganti@47 131 and classification of STECs for each sample. We tested the pipeline with Nanopore data obtained from 21 additional enriched samples from
kkonganti@0 132 irrigation water and were able to perform the entire precision metagenomics analysis in less than 5 hours for all of them. All the original testing and validation was
kkonganti@0 133 done on the command line on the CFSAN Raven2 HPC Cluster.
kkonganti@0 134
kkonganti@0 135
kkonganti@0 136 ----
kkonganti@0 137
kkonganti@0 138 .. class:: infomark
kkonganti@0 139
kkonganti@0 140 **Outputs**
kkonganti@0 141
kkonganti@0 142 The main output files are:
kkonganti@0 143
kkonganti@0 144 ::
kkonganti@0 145
kkonganti@54 146 - MultiQC Report: Contains a brief summary report including any serotyping and AMR result tables. Please note that due to MultiQC customizations, the preview (eye icon) will not work within Galaxy for the MultiQC report. Please download the file by clicking on the floppy icon and view it in your browser on your local desktop/workstation.
kkonganti@27 147 - Final assembly: contains contigs and possibly scaffolds.
kkonganti@0 148
kkonganti@0 149 ]]></help>
kkonganti@0 150 <citations> <!-- Single BibTeX entry pointing to the CPIPES GitLab repository. -->
kkonganti@0 151 <citation type="bibtex">
kkonganti@0 152 @misc{gitlabCPIPES,
kkonganti@0 153 author = {Konganti, Kranti},
kkonganti@0 154 year = {2022},
kkonganti@0 155 title = {CPIPES - Centriflaken},
kkonganti@0 156 publisher = {GitLab},
kkonganti@0 157 journal = {GitLab repository},
kkonganti@0 158 url = {https://cfsan-git.fda.gov/Kranti.Konganti/cpipes}}
kkonganti@0 159 </citation>
kkonganti@0 160 </citations>
kkonganti@0 161 </tool>