Mercurial > repos > kkonganti > cfsan_bettercallsal
view cfsan_bettercallsal.xml @ 1:365849f031fd
"planemo upload"
author | kkonganti |
---|---|
date | Mon, 05 Jun 2023 18:48:51 -0400 |
parents | a4b1ee4b68b1 |
children | 4678c2cd1c9e |
line wrap: on
line source
<tool id="cfsan_bettercallsal" name="bettercallsal" version="0.2.0+galaxy0">
    <!--
        Galaxy wrapper around the CPIPES "bettercallsal" Nextflow pipeline.
        The command stages the selected FASTQ collection into ./cpipes-input,
        runs the bundled cpipes launcher, and exposes the resulting MultiQC
        HTML report as the single output dataset.
    -->
    <description>An automated workflow to assign Salmonella serotype based on NCBI Pathogen Detection Project for Salmonella.</description>
    <requirements>
        <requirement type="package" version="22.10">nextflow</requirement>
        <requirement type="package" version="1.0.0">micromamba</requirement>
        <requirement type="package">graphviz</requirement>
    </requirements>
    <version_command>nextflow -version</version_command>
    <command detect_errors="exit_code"><![CDATA[
        mkdir -p cpipes-input || exit 1;
        pwd_path=\$(pwd);
        #import re
        ## Stage every input dataset as a symlink whose name carries a FASTQ
        ## extension ('fastqsanger' in the Galaxy datatype is rewritten to 'fastq').
        #if (str($input_read_type_cond.input_read_type) == "single_long"):
            #for _, $unpaired in enumerate($input_read_type_cond.input):
                #set read1 = str($unpaired.name)
                #if not str($unpaired.name).endswith(('.fastq', '.fastq.gz')):
                    #set read1_ext = re.sub('fastqsanger', 'fastq', str($unpaired.ext))
                    #set read1 = str($unpaired.name) + str('.') + $read1_ext
                #end if
                ln -sf '$unpaired' './cpipes-input/$read1';
            #end for
        #elif (str($input_read_type_cond.input_read_type) == "paired"):
            #for _, $pair in enumerate($input_read_type_cond.input_pair)
                ## Replace the ':forward' / ':reverse' part of the element names
                ## with '_forward' / '_reverse' before using them as file names.
                #set read_R1 = re.sub('\:forward', '_forward', str($pair.forward.name))
                #set read_R2 = re.sub('\:reverse', '_reverse', str($pair.reverse.name))
                #set read_R1_ext = re.sub('fastqsanger', 'fastq', str($pair.forward.ext))
                #set read_R2_ext = re.sub('fastqsanger', 'fastq', str($pair.reverse.ext))
                #if not str($pair.forward.name).endswith(('.fastq', '.fastq.gz')):
                    #set read_R1 = $read_R1 + str('.') + $read_R1_ext
                #end if
                #if not str($pair.reverse.name).endswith(('.fastq', '.fastq.gz')):
                    #set read_R2 = $read_R2 + str('.') + $read_R2_ext
                #end if
                ln -sf '$pair.forward' './cpipes-input/$read_R1';
                ln -sf '$pair.reverse' './cpipes-input/$read_R2';
            #end for
        #end if
        ## The pipeline name is fixed: this wrapper defines no parameter for
        ## choosing a pipeline, so it is not read from the tool inputs.
        $__tool_directory__/0.5.0/cpipes
            --pipeline bettercallsal
            --input \${pwd_path}/cpipes-input
            --output \${pwd_path}/cpipes-output
            --fq_suffix '${input_read_type_cond.fq_suffix}'
        #if (str($input_read_type_cond.input_read_type) == "single_long"):
            --fq_single_end true
        #elif (str($input_read_type_cond.input_read_type) == "paired"):
            --fq_single_end false --fq2_suffix '${input_read_type_cond.fq2_suffix}'
        #end if
            --tuspy_n $tuspy_n
        ## Only pass the sourmash coverage threshold when sourmash is enabled;
        ## in the disabled branch the parameter holds the placeholder value 'NA'.
        #if (str($sourmash_cond.run) == "true"):
            --sfhpy_fcv $sourmash_cond.sfhpy_fcv
        #end if
            --bcs_thresholds $bcs_thresholds
            --fq_filename_delim '${fq_filename_delim}'
            --fq_filename_delim_idx $fq_filename_delim_idx
            -profile kondagac;
        ## Keep only the MultiQC report in the job directory, then remove the
        ## pipeline output and work directories.
        mv './cpipes-output/bettercallsal-multiqc/multiqc_report.html' './multiqc_report.html' > /dev/null 2>&1 || exit 1;
        rm -rf ./cpipes-output > /dev/null 2>&1 || exit 1;
        rm -rf ./work > /dev/null 2>&1 || exit 1
    ]]></command>
    <inputs>
        <!-- Read layout selector; each branch supplies its own collection and FASTQ suffix(es). -->
        <conditional name="input_read_type_cond">
            <param name="input_read_type" type="select" label="Select the read collection type">
                <option value="single_long" selected="true">Single-End short reads</option>
                <option value="paired">Paired-End short reads</option>
            </param>
            <when value="single_long">
                <param name="input"
                       type="data_collection"
                       collection_type="list"
                       format="fastq,fastq.gz"
                       label="Dataset list of unpaired short reads or long reads" />
                <param name="fq_suffix" value=".fastq.gz" type="text" label="Suffix of the Single-End FASTQ"/>
            </when>
            <when value="paired">
                <param name="input_pair"
                       type="data_collection"
                       collection_type="list:paired"
                       format="fastq,fastq.gz"
                       label="List of Dataset pairs" />
                <param name="fq_suffix" value="_R1_001.fastq.gz" type="text" label="Suffix of the R1 FASTQ"/>
                <param name="fq2_suffix" value="_R2_001.fastq.gz" type="text" label="Suffix of the R2 FASTQ"/>
            </when>
        </conditional>
        <param name="tuspy_n"
               optional="true"
               value="10"
               type="integer"
               label="Enter the number of top unique serotypes to retain after initial MASH screen step."
               help="The default value of 10 is suitable for almost all scenarios."/>
        <param name="bcs_thresholds"
               type="select"
               label="Enter the type of base quality thresholds to be set with bettercallsal."
               help="The default value sets strictest thresholds that tends to filter out most of the false positive hits.">
            <option value="strict" selected="true">strict</option>
            <option value="relax">relax</option>
        </param>
        <!-- Sourmash toggle; the coverage threshold is only forwarded to the pipeline in the "true" branch. -->
        <conditional name="sourmash_cond">
            <param name="run"
                   type="select"
                   label="Run sourmash"
                   help="Should sourmash be used for additional genome fraction filtering.">
                <option value="true" selected="true">yes</option>
                <option value="false">no</option>
            </param>
            <when value="true">
                <param name="sfhpy_fcv"
                       type="text"
                       value="0.1"
                       label="Enter the minimum coverage match with sourmash before a serotype hit is considered for further processing."
                       help="The default value is set at 10% coverage threshold."/>
            </when>
            <when value="false">
                <param name="sfhpy_fcv"
                       type="select"
                       label="Enter the minimum coverage match with sourmash before a serotype hit is considered for further processing."
                       help="THIS OPTION IS IGNORED IF SOURMASH TOOL IS DISABLED.">
                    <option value="NA" selected="true">N/A</option>
                </param>
            </when>
        </conditional>
        <param name="fq_filename_delim"
               type="text"
               value="_"
               label="File name delimitor by which samples are grouped together (--fq_filename_delim)"
               help="This is the delimitor by which samples are grouped together to display in the final MultiQC report. For example, if your input data sets are mango_replicate1.fastq.gz, mango_replicate2.fastq.gz, orange_replicate1_maryland.fastq.gz, orange_replicate2_maryland.fastq.gz, then to create 2 samples mango and orange, the value for --fq_filename_delim would be _ (underscore) and the value for --fq_filename_delim_idx would be 1, since you want to group by the first word (i.e. mango or orange) after splitting the filename based on _ (underscore)."/>
        <param name="fq_filename_delim_idx"
               type="integer"
               value="1"
               label="File name delimitor index (--fq_filename_delim_idx)" />
        <!-- <param name="runtime_profile" type="select" label="Run time profile">
            <option value="kondagac" selected="true">conda</option>
            <option value="cingularitygac">singularity</option>
        </param> -->
    </inputs>
    <outputs>
        <!-- Collected from the job working directory, where the command moves the report. -->
        <data name="multiqc_report"
              format="html"
              label="bettercallsal: MultiQC Report on ${on_string}"
              from_work_dir="multiqc_report.html"/>
    </outputs>
    <tests>
        <!--Test 01: long reads-->
        <!-- The tool declares exactly one output dataset (multiqc_report). -->
        <test expect_num_outputs="1">
            <param name="input">
                <collection type="list">
                    <element name="FAL11127.fastq.gz" value="FAL11127.fastq.gz" />
                    <element name="FAL11341.fastq.gz" value="FAL11341.fastq.gz" />
                    <element name="FAL11342.fastq.gz" value="FAL11342.fastq.gz" />
                </collection>
            </param>
            <param name="fq_suffix" value=".fastq.gz"/>
            <output name="multiqc_report" file="multiqc_report.html" ftype="html" compare="sim_size"/>
            <!-- <output name="assembled_mags" file="FAL11127.assembly_filtered.contigs.fasta" ftype="fasta" compare="sim_size"/> -->
        </test>
    </tests>
    <help><![CDATA[

.. class:: infomark

**Purpose**

bettercallsal is an automated workflow to assign Salmonella serotype based on NCBI Pathogen Detection Project for Salmonella.
It uses MASH to reduce the search space followed by additional genome filtering with sourmash. It then performs genome based
alignment with kma followed by count generation using salmon. This workflow can be used to analyze shotgun metagenomics
datasets, quasi-metagenomic datasets (enriched for Salmonella) and target enriched datasets (enriched with molecular baits specific for Salmonella)
and is especially useful in a case where a sample is of multi-serovar mixture.

It is written in Nextflow and is part of the modular data analysis pipelines (CFSAN PIPELINES or CPIPES for short) at CFSAN.

----

.. class:: infomark

**Testing and Validation**

The CPIPES - bettercallsal Nextflow pipeline has been wrapped to make it work in Galaxy. It takes in either paired or unpaired short reads list as an input
and generates a MultiQC report in the final step. The pipeline has been tested on 2x300 bp MiSeq and 2x150 bp NextSeq reads and has shown to call multiple
Salmonella serotypes with up to ~95% accuracy. The pipeline has also been tested on metagenomics data sets from Peach and Papaya outbreaks as discussed in
our preprint (https://www.biorxiv.org/content/10.1101/2023.04.06.535929v1.full). All the original testing and validation was done on the command line on the
CFSAN Raven2 HPC Cluster.

----

.. class:: infomark

**Outputs**

The main output files are:

::

    - MultiQC Report: Contains a brief summary report including any serotyping and AMR result tables.
          Please note that due to MultiQC customizations, the preview (eye icon) will not
          work within Galaxy for the MultiQC report. Please download the file by clicking
          on the floppy icon and view it in your browser on your local desktop/workstation.
          You can export the tables and plots from the downloaded MultiQC report.

    ]]></help>
    <citations>
        <citation type="bibtex">
            @misc{bettercallsal,
            author = {Konganti, Kranti},
            year = {2023},
            title = {bettercallsal: better calling of Salmonella serotypes from enrichment cultures using shotgun metagenomic profiling and its application in an outbreak setting},
            publisher = {Cold Spring Harbor Laboratory},
            journal = {bioRxiv},
            url = {https://www.biorxiv.org/content/10.1101/2023.04.06.535929v1.full}}
        </citation>
    </citations>
</tool>