Mercurial > repos > kkonganti > cfsan_centriflaken

<tool id="cfsan_centriflaken" name="Centriflaken" version="0.2.0+galaxy0">
    <!-- Short summary of what the tool does. -->
    <description>An automated pipeline to generate a MAG of interest (E.coli or Salmonella) and perform serotyping.</description>
    <!-- Declared package dependencies: nextflow is pinned to 22.04, graphviz carries no version. -->
    <requirements>
        <requirement version="22.04" type="package">nextflow</requirement>
        <requirement type="package">graphviz</requirement>
    </requirements>
    <!-- Command used to report the version of the underlying workflow engine. -->
    <version_command>nextflow -version</version_command>
    <!--
        Command template (Cheetah). It symlinks every element of the input
        collection into ./cpipes-input, then runs the bundled cpipes script
        with that directory as input and ./cpipes-output as output.
        A non-zero exit code is treated as a job failure.
    -->
    <command detect_errors="exit_code"><![CDATA[
        ## Stage each collection element under ./cpipes-input, named by its element key.
        mkdir -p cpipes-input;
        #for $key in $input.keys()
            ln -sf '$input[$key]' './cpipes-input/$key';
        #end for
        ## Capture the job working directory so absolute paths can be passed below.
        pwd_path=\$(pwd);
        ## Quoted so that a tool directory containing spaces does not split the command.
        '$__tool_directory__/0.2.1/cpipes'
        #if ($pipeline == "centriflaken"):
            --pipeline $pipeline
            --fq_single_end true
        #elif ($pipeline == "centriflaken_hy"):
            --pipeline $pipeline
            ## The layout selector is declared as read_lib_layout in the inputs section;
            ## referencing any other name fails at template evaluation time.
            #if ($read_lib_layout == "single"):
                --fq_single_end true
            #else:
                --fq_single_end false
            #end if
        #end if
        --input \${pwd_path}/cpipes-input
        --output \${pwd_path}/cpipes-output
        --fq_suffix '${fq_suffix}'
        ## NOTE(review): fq2_suffix is collected in the inputs section but is not passed
        ## to cpipes here; confirm whether paired-end runs need it.
        --fq_filename_delim '${fq_filename_delim}'
        --fq_filename_delim_idx $fq_filename_delim_idx
        --centrifuge_extract_bug '${centrifuge_extract_bug}'
        --flye_genome_size '${genome_size}'
        -profile $runtime_profile
        -resume
    ]]></command>
    <inputs>
        <!-- Reads to analyse; the command links each collection element into its input directory. -->
        <param name="input" type="data_collection" collection_type="list" format="fastq,fastq.gz,fastqsanger.gz,fastqsanger" label="Input read collection" />
        <!-- Workflow selector; the command template branches on this value. selected="true" states the default explicitly and matches the value attribute. -->
        <param name="pipeline" type="select" label="CPIPES Workflow name" value="centriflaken"
            help="centriflaken: for long reads (Nanopore or PacBio). centriflaken_hy: for short reads (paired or unpaired). Default: centriflaken">
            <option value="centriflaken" selected="true">centriflaken</option>
            <option value="centriflaken_hy">centriflaken_hy</option>
        </param>
        <!-- Only consulted by the command template when the pipeline is centriflaken_hy. -->
        <param name="read_lib_layout" type="select" label="Short Read Library Layout" value="single"
            help="If the pipeline is centriflaken_hy (i.e., for short reads), what is the library layout? Default: Single-End">
            <option value="single" selected="true">Single-End</option>
            <option value="paired">Paired-End</option>
        </param>
        <param name="fq_suffix" value=".fastq.gz" type="text" label="Suffix of the R1 FASTQ or Unpaired FASTQ"/>
        <param name="fq2_suffix" value="_R2_001.fastq.gz" type="text" label="Suffix of the R2 FASTQ"/>
        <!-- Delimiter and index together decide how file names are grouped into samples (see the help text). -->
        <param name="fq_filename_delim" type="text" value="_" label="File name delimiter by which samples are grouped together (--fq_filename_delim)"
            help="This is the delimiter by which samples are grouped together to display in the final MultiQC report. For example, if your input data sets are mango_replicate1.fastq.gz, mango_replicate2.fastq.gz, orange_replicate1_maryland.fastq.gz, orange_replicate2_maryland.fastq.gz, then to create 2 samples mango and orange, the value for --fq_filename_delim would be _ (underscore) and the value for --fq_filename_delim_idx would be 1, since you want to group by the first word (i.e. mango or orange) after splitting the filename based on _ (underscore)"/>
        <param name="fq_filename_delim_idx" type="integer" value="1" label="File name delimiter index (--fq_filename_delim_idx)" />
        <param name="centrifuge_extract_bug" type="text" value="Escherichia coli" label="Reads belonging to this taxon are extracted and a MAG is generated to allow for serotyping"/>
        <!-- Passed to the pipeline as the Flye genome size; the regex accepts an integer or decimal with an optional k, m or g suffix. -->
        <param name="genome_size" type="text" optional="true" value="5.5m" label="Estimated genome size" help="For example, 5m or 2.6g.">
            <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit prefix (kmg)">^([0-9]*[.])?[0-9]+[kmg]?$</validator>
        </param>
        <!-- The option value is handed to the workflow as its profile name; the option text is the label shown to the user. -->
        <param name="runtime_profile" type="select" label="Run time profile" value="kondagac">
            <option value="kondagac" selected="true">conda</option>
            <option value="cingularitygac">singularity</option>
        </param>
    </inputs>
    <outputs>
        <!-- MultiQC HTML report taken from the pipeline-specific directory under ./cpipes-output. -->
        <data name="multiqc_report" format="html" label="MultiQC Report on ${on_string}" from_work_dir="./cpipes-output/${pipeline}-multiqc/multiqc_report.html"/>
        <!--
            One list element per file matching the pattern; the captured prefix becomes the element name.
            format="fasta" assigns the fasta datatype to the discovered files instead of leaving it undeclared.
        -->
        <collection name="assembled_mags" type="list" label="CENTRIFLAKEN: Assembled MAGs on ${on_string}">
            <discover_datasets pattern="(?P&lt;name&gt;.*)\.assembly_filtered_contigs\.fasta" format="fasta" directory="./cpipes-output/${pipeline}-results/kraken2_extract_contigs"/>
        </collection>
    </outputs>
    <tests>
        <!-- Test 01: long reads. Only the input collection and fq_suffix are set; all other parameters use their defaults. -->
        <test expect_num_outputs="2">
            <param name="input">
                <collection type="list">
                    <element value="FAL11127.fastq.gz" name="FAL11127.fastq.gz"/>
                    <element value="FAL11341.fastq.gz" name="FAL11341.fastq.gz"/>
                    <element value="FAL11342.fastq.gz" name="FAL11342.fastq.gz"/>
                </collection>
            </param>
            <param value=".fastq.gz" name="fq_suffix"/>
            <!-- The report is compared by approximate size rather than by exact content. -->
            <output ftype="html" name="multiqc_report" file="multiqc_report.html" compare="sim_size"/>
            <!-- Disabled assertion on the MAG collection, kept for reference: -->
            <!-- <output name="assembled_mags" file="FAL11127.assembly_filtered.contigs.fasta" ftype="fasta" compare="sim_size"/> -->
        </test>
    </tests>
    <help><![CDATA[

.. class:: infomark

**Purpose**

The Centriflaken suite of automated data analysis pipelines is based on Nextflow DSL2 and was developed at CFSAN, FDA. These pipelines allow rapid
and effective construction of metagenomic assembled genomes (MAGs) to enable bacterial source-tracking. It is based on methods described in our
previous publication (https://doi.org/10.1371/journal.pone.0245172).

----

.. class:: infomark

**Testing and Validation**

The pipeline has been wrapped to make it work in Galaxy. It takes in either paired or unpaired short reads or long reads, generates MAGs and performs
in silico-based analysis (i.e., virulence gene finding). Additionally, AMR gene finding analysis is also included in Centriflaken and performed on MAGs
of interest. The final summary plots and tables can be downloaded from the provided MultiQC HTML report generated as part of the pipeline.
The Centriflaken pipeline was validated with data from our previously published method (Maguire et al, 2021) and was able to replicate the detection
and classification of STECs for each sample. We tested the pipeline with nanopore data obtained from 21 additional enriched samples from
irrigation water and were able to perform the entire precision metagenomics analysis in less than 5 hours for all of them. All the original testing and validation was
done on the command line on the CFSAN Raven2 HPC Cluster.


----

.. class:: infomark

**Outputs**

The main output files are:

    ::

        - MultiQC Report: Contains a brief summary report including any serotyping and AMR result tables.
        - Final assembly: contains contigs and possibly scaffolds (see below).

  ]]></help>
    <!-- Citation for the CPIPES / Centriflaken code repository, given as a single BibTeX @misc entry. -->
    <citations>
        <citation type="bibtex">
            @misc{gitlabCPIPES,
            author = {Konganti, Kranti},
            year = {2022},
            title = {CPIPES - Centriflaken},
            publisher = {GitLab},
            journal = {GitLab repository},
            url = {https://cfsan-git.fda.gov/Kranti.Konganti/cpipes}}
        </citation>
    </citations>
</tool>
author	kkonganti
date	Wed, 29 Jun 2022 09:15:39 -0400
parents	4f31b641f6fd
children	91480ddc3fcd