Mercurial > repos > kkonganti > cfsan_centriflaken

<tool id="cfsan_centriflaken" name="Centriflaken" version="0.2.0+galaxy0">
    <!-- One-line summary displayed next to the tool name. -->
    <description>An automated pipeline to generate a MAG of interest (E.coli or Salmonella) and perform serotyping.</description>
    <!-- Packages that must be resolvable in the job environment. -->
    <requirements>
        <requirement type="package" version="22.04">nextflow</requirement>
        <requirement type="package">graphviz</requirement>
    </requirements>
    <!-- Command whose output is recorded as the tool version for a job. -->
    <version_command>nextflow -version</version_command>
    <!-- Cheetah template for the job command line: stages the input collection
         into ./cpipes-input and runs the bundled cpipes launcher on it. -->
    <command detect_errors="exit_code"><![CDATA[
        ## Stage every element of the input collection as
        ## ./cpipes-input/<element identifier>. The identifier must come from
        ## the loop variable; taking it from the collection itself would give
        ## every symlink the same name.
        mkdir -p cpipes-input &&
        #for $input_dataset in $input
            ln -sf '$input_dataset' './cpipes-input/${input_dataset.element_identifier}' &&
        #end for
        ## Absolute working directory, so --input and --output are absolute paths.
        pwd_path=\$(pwd) &&
        '$__tool_directory__/0.2.1/cpipes'
        ## Long reads select the centriflaken pipeline, short reads centriflaken_hy.
        #if str($reads.type) == "long"
            --pipeline centriflaken
        #else
            --pipeline centriflaken_hy
        #end if
        --input \${pwd_path}/cpipes-input
        --output \${pwd_path}/cpipes-output
        ## The reads_lib conditional only exists in the short-read branch, so it
        ## is only dereferenced there. Long reads are always run as single-end.
        #if str($reads.type) == "short"
            #if str($reads.reads_lib.paired_end) == "true"
                --fq_single_end false
                --fq_suffix '${reads.reads_lib.fq_suffix}'
                --fq2_suffix '${reads.reads_lib.fq2_suffix}'
            #else
                --fq_single_end true
                --fq_suffix '${reads.reads_lib.fq_suffix}'
            #end if
        #else
            --fq_single_end true
            --fq_suffix '${reads.fq_suffix}'
        #end if
        --fq_filename_delim '${fq_filename_delim}'
        --fq_filename_delim_idx $fq_filename_delim_idx
        --centrifuge_extract_bug '${centrifuge_extract_bug}'
        ## genome_size is an optional parameter: skip the flag when it is blank
        ## instead of passing an empty string.
        #if str($genome_size).strip()
            --flye_genome_size '${genome_size}'
        #end if
        ## The select parameter is named runtime_profile.
        -profile $runtime_profile
    ]]></command>
    <!-- User-facing parameters. The names are referenced by the command
         template and by the tests, so they must stay as they are. -->
    <inputs>
        <param name="input" type="data_collection" collection_type="list" format="fastq,fastq.gz,fastqsanger.gz,fastqsanger" label="Input read collection" />
        <!-- Read type decides which pipeline runs and which suffix fields are shown. -->
        <conditional name="reads">
            <param name="type" type="select" label="Sequencing Read Library Type" value="long">
                <option value="long">Long reads</option>
                <option value="short">Short reads</option>
            </param>
            <when value="short">
                <!-- Library layout only applies to short reads. -->
                <conditional name="reads_lib">
                    <param name="paired_end" type="select" label="Sequencing Read Library Layout" value="false">
                        <option value="false">Short read Single-End</option>
                        <option value="true">Short read Paired-End</option>
                    </param>
                    <when value="true">
                        <param name="fq_suffix" value="_R1_001.fastq.gz" type="text" label="Suffix of the FASTQ R1 file of Paired-End reads."/>
                        <param name="fq2_suffix" value="_R2_001.fastq.gz" type="text" label="Suffix of the FASTQ R2 file of Paired-End reads."/>
                    </when>
                    <when value="false">
                        <param name="fq_suffix" value="_R1_001.fastq.gz" type="text" label="Suffix of the FASTQ file of Single-End reads."/>
                    </when>
                </conditional>
            </when>
            <when value="long">
                <param name="fq_suffix" value=".fastq.gz" type="text" label="Suffix of the FASTQ file of Long reads."/>
            </when>
        </conditional>
        <param name="fq_filename_delim" type="text" value="_" label="File name delimiter by which samples are grouped together (--fq_filename_delim)"
            help="This is the delimiter by which samples are grouped together to display in the final MultiQC report. For example, if your input data sets are mango_replicate1.fastq.gz, mango_replicate2.fastq.gz, orange_replicate1_maryland.fastq.gz, orange_replicate2_maryland.fastq.gz, then to create 2 samples mango and orange, the value for --fq_filename_delim would be _ (underscore) and the value for --fq_filename_delim_idx would be 1, since you want to group by the first word (i.e. mango or orange) after splitting the filename based on _ (underscore)"/>
        <param name="fq_filename_delim_idx" type="integer" value="1" label="File name delimiter index (--fq_filename_delim_idx)" />
        <param name="centrifuge_extract_bug" type="text" value="Escherichia coli" label="Reads belonging to this taxa are extracted and a MAG is generated to allow for serotyping"/>
        <!-- Optional; when given it must be a number with an optional k, m or g suffix. -->
        <param name="genome_size" type="text" optional="true" value="5.5m" label="Estimated genome size" help="For example, 5m or 2.6g.">
            <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit suffix (k, m or g)">^([0-9]*[.])?[0-9]+[kmg]?$</validator>
        </param>
        <!-- The option values are passed verbatim to the launcher as its profile. -->
        <param name="runtime_profile" type="select" label="Run time profile" value="kondagac">
            <option value="kondagac">conda</option>
            <option value="cingularitygac">singularity</option>
        </param>
    </inputs>
    <!-- Both outputs are discovered by file name inside cpipes-output. -->
    <outputs>
        <!-- HTML summary report; the matched file becomes this primary output. -->
        <data name="multiqc_report"
              format="html"
              label="MultiQC Report on ${on_string}">
            <discover_datasets directory="cpipes-output"
                               pattern="multiqc_report.html"
                               assign_primary_output="true"/>
        </data>
        <!-- Assembled MAG FASTA files. -->
        <!-- NOTE(review): this pattern expects "assembly_filtered_contigs", while the
             test below compares against "FAL11127.assembly_filtered.contigs.fasta";
             confirm which spelling the pipeline really writes. -->
        <data name="assembled_mags"
              format="fasta"
              label="CENTRIFLAKEN: Assembled MAGs">
            <discover_datasets directory="cpipes-output"
                               pattern=".*\.assembly_filtered_contigs.fasta"
                               visible="true"/>
        </data>
    </outputs>
    <tests>
        <!-- Test 01: long reads. Three FASTQ files are supplied as one list
             collection and both outputs are compared by approximate size. -->
        <!-- NOTE(review): the expected MAG file is named "assembly_filtered.contigs",
             whereas the output discovery pattern uses "assembly_filtered_contigs";
             confirm which one matches the pipeline. -->
        <test expect_num_outputs="2">
            <param name="input" value="FAL11127.fastq.gz">
                <collection type="list">
                    <element name="file1" value="FAL11127.fastq.gz"/>
                    <element name="file2" value="FAL11341.fastq.gz"/>
                    <element name="file3" value="FAL11342.fastq.gz"/>
                </collection>
            </param>
            <param name="fq_suffix" value=".fastq.gz"/>
            <output name="multiqc_report"
                    file="multiqc_report.html"
                    ftype="html"
                    compare="sim_size"/>
            <output name="assembled_mags"
                    file="FAL11127.assembly_filtered.contigs.fasta"
                    ftype="fasta"
                    compare="sim_size"/>
        </test>
    </tests>
    <help><![CDATA[

.. class:: infomark

**Purpose**

Centriflaken is a suite of automated data analysis pipelines based on Nextflow DSL2 developed at CFSAN, FDA. These pipelines allow rapid
and effective construction of metagenome-assembled genomes (MAGs) to enable bacterial source-tracking. It is based on methods described in our
previous publication (https://doi.org/10.1371/journal.pone.0245172).

----

.. class:: infomark

**Testing and Validation**

The pipeline has been wrapped to make it work in Galaxy. It takes in either paired or unpaired short reads or long reads, generates MAGs and performs
in silico-based analysis (i.e., virulence gene finding). Additionally, AMR gene finding analysis is also included in Centriflaken and performed on MAGs
of interest. The final summary plots and tables can be downloaded from the provided MultiQC HTML report generated as part of the pipeline.
The Centriflaken pipeline was validated with data from our previously published method (Maguire et al, 2021) and was able to replicate the detection
and classification of STECs for each sample. We tested the pipeline with nanopore data obtained from 21 additional enriched samples from
irrigation water and were able to perform the entire precision metagenomics analysis in less than 5 hours for all of them. All the original testing and validation was
done on the command line on the CFSAN Raven2 HPC Cluster.


----

.. class:: infomark

**Outputs**

The main output files are:

    ::

        - MultiQC Report: Contains a brief summary report including any serotyping and AMR result tables.
        - Final assembly: contains contigs and possibly scaffolds (see below).

  ]]></help>
    <!-- Reference shown to users who want to cite the pipeline. -->
    <citations>
        <citation type="bibtex">
            @misc{gitlabCPIPES,
                author = {Konganti, Kranti},
                year = {2022},
                title = {CPIPES - Centriflaken},
                publisher = {GitLab},
                journal = {GitLab repository},
                url = {https://cfsan-git.fda.gov/Kranti.Konganti/cpipes}
            }
        </citation>
    </citations>
</tool>
author	kkonganti
date	Mon, 27 Jun 2022 18:19:22 -0400
parents	29a590703d3e
children	34d3ef477de3