Mercurial > repos > estrain > gtdbtk_classify_wf
changeset 0:a20cd9311046 draft
planemo upload commit bdb45cf3a98e21f5002866b6789a1457f521bf5d
| author | estrain |
|---|---|
| date | Thu, 12 Mar 2026 20:07:08 +0000 |
| parents | |
| children | 353347ef2386 |
| files | gtdbtk_classify_wf.xml |
| diffstat | 1 files changed, 155 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gtdbtk_classify_wf.xml Thu Mar 12 20:07:08 2026 +0000 @@ -0,0 +1,155 @@ +<tool id="gtdbtk_classify_wf" name="GTDB-Tk Classify genomes" version="0.1.3" profile="24"> + <description>by placement in GTDB reference tree</description> + <requirements> + <requirement type="package" version="2.5.2">gtdbtk</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ +#import re + +mkdir input_dir && +mkdir output_dir && +## GTDBTK can process *.fna. and $.fna.gz but unzipping everying to simplify the workflow +#for $i in $input: + #set cleaned = re.sub(r'\.fna|\.fasta|\.fa|\.gz', '', $i.element_identifier) + #set final_name = cleaned + '.fna' + #if $i.ext in ['fasta.gz']: + gunzip -c '${i}' > input_dir/'${final_name}' && + #else: + ln -s '${i}' input_dir/'${final_name}' && + #end if +#end for +export GTDBTK_DATA_PATH=$gtdbtk_db.fields.path && +gtdbtk classify_wf +--genome_dir input_dir +--out_dir output_dir +--mash_db $gtdbtk_db.fields.path +--cpus \${GALAXY_SLOTS:-4} + +#if str($advanced.output_process_log) == 'yes': + && cat output_dir/gtdbtk.warnings.log output_dir/gtdbtk.log > '$process_log' +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="fasta,fasta.gz" multiple="true" label="Fasta (Genome) files"/> + <param name="gtdbtk_db" type="select" label="GTDB-Tk database"> + <options from_data_table="gtdbtk_database_versioned"> + <validator type="no_options" message="No locally cached GTDB-Tk database is available"/> + </options> + </param> + <section name="advanced" title="Advanced options"> + <param argument="--min_perc_aa" type="integer" min="0" max="100" value="10" label="Exclude genomes that do not have at least this percentage of AA in the MSA" help="Inclusive bound"/> + <param argument="--force" type="boolean" truevalue="--force" falsevalue="" checked="false" label="Continue processing if an error occurs on a single genome?"/> + <param argument="--min_af" type="float" min="0" max="1" value="0.65" label="Minimum alignment fraction to consider closest genome"/> + <param name="output_process_log" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Output process log file?"/> + </section> + </inputs> + <outputs> + <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)"> + <filter>advanced['output_process_log']</filter> + </data> + <collection name="output_align" type="list" format="fasta.gz" label="${tool.name} on ${on_string} (align)"> + <discover_datasets pattern="(?P<designation>.+)\.fasta.gz" ext="fasta.gz" directory="output_dir/align"/> + </collection> + <collection name="output_identfy" type="list" format="tsv" label="${tool.name} on ${on_string} (identify)"> + <discover_datasets pattern="(?P<designation>.+)\.tsv" ext="tsv" directory="output_dir/identify"/> + </collection> + <collection name="output_classify" type="list" format="newick" label="${tool.name} on ${on_string} (classify)"> + <discover_datasets pattern="(?P<designation>.+)\.tree" ext="newick" directory="output_dir/classify"/> + </collection> + <collection name="output_summary" type="list" format="tsv" label="${tool.name} on ${on_string} (summary)"> + <discover_datasets pattern="(?P<designation>.+)\.tsv" ext="tsv" directory="output_dir"/> + </collection> + </outputs> + <tests> + <!-- The commented test here is valid if we could store the GTDB-Tk database --> + <!-- + <test expect_num_outputs="4"> + <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> + <param name="gtdbtk_db" value="gtdbtk214"/> + <output_collection name="output_summary" type="list" count="1"> + <element name="gtdbtk.ar53.summary" ftype="tsv"> + <assert_contents> + <has_text text="user_genome"/> + </assert_contents> + </element> + </output_collection> + <output_collection name="output_identfy" type="list" count="4"> + <element name="gtdbtk.ar53.markers_summary" ftype="tsv"> + <assert_contents> + <has_text text="number_unique_genes"/> + </assert_contents> + </element> + <element name="gtdbtk.bac120.markers_summary" ftype="tsv"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + <element name="gtdbtk.failed_genomes" ftype="tsv"> + <assert_contents> + <has_size value="0"/> + </assert_contents> + </element> + <element name="gtdbtk.translation_table_summary" ftype="tsv"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + </output_collection> + <output_collection name="output_classify" type="list" count="1"> + <element name="gtdbtk.ar53.classify" ftype="newick"> + <assert_contents> + <has_text text="GB_GCA_"/> + </assert_contents> + </element> + </output_collection> + <output_collection name="output_align" type="list" count="2"> + <element name="gtdbtk.ar53.msa" ftype="fasta.gz" decompress="true"> + <assert_contents> + <has_text text="GB_GCA_000008085"/> + </assert_contents> + </element> + <element name="gtdbtk.ar53.user_msa" ftype="fasta.gz" decompress="true"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + </output_collection> + </test> + --> + <!-- GTDB-Tk databases are far too large to test currently --> + <test expect_failure="true"> + <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> + <param name="gtdbtk_db" value="gtdbtk214"/> + <assert_stderr> + <has_text text="Fatal error: Exit code 1"/> + </assert_stderr> + </test> + </tests> + <help><![CDATA[ +**What it does** + +GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes +based on the Genome Database Taxonomy GTDB. It is designed to work with recent advances that allow hundreds or +thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also +be applied to isolate and single-cell genomes. + +This tool accepts one or more fasta (genome) files and determines taxonomic classification of genomes by +maximum-likelihood (ML) placement. The classification workflow consists of three steps: identify, align, and +classify. + +The identify step calls genes using Prodigal, and uses HMM models and the HMMER package to identify the 120 bacterial +and 122 archaeal marker genes used for phylogenetic inference. Multiple sequence alignments (MSA) are obtained by +aligning marker genes to their respective HMM model. + +The align step concatenates the aligned marker genes and filters the concatenated MSA to approximately 5,000 +amino acids. + +Finally, the classify step uses pplacer to find the maximum-likelihood placement of each genome in the GTDB-Tk +reference tree. GTDB-Tk classifies each genome based on its placement in the reference tree, its relative evolutionary +divergence, and/or average nucleotide identity (ANI) to reference genomes. + +Results can be impacted by a lack of marker genes or contamination. + ]]></help> + <expand macro="citations"/> +</tool> +
