comparison gtdbtk_classify_wf.xml @ 0:a20cd9311046 draft

planemo upload commit bdb45cf3a98e21f5002866b6789a1457f521bf5d
author estrain
date Thu, 12 Mar 2026 20:07:08 +0000
parents
children 353347ef2386
comparison
equal deleted inserted replaced
-1:000000000000 0:a20cd9311046
1 <tool id="gtdbtk_classify_wf" name="GTDB-Tk Classify genomes" version="0.1.3" profile="24">
2 <description>by placement in GTDB reference tree</description>
3 <requirements> <!-- runtime dependency: the gtdbtk package, pinned to version 2.5.2 -->
4 <requirement type="package" version="2.5.2">gtdbtk</requirement>
5 </requirements>
6 <command detect_errors="exit_code"><![CDATA[
7 #import re
8 ## Stage each input as input_dir/NAME.fna, run classify_wf with the advanced form options, then optionally write the process log
9 mkdir input_dir &&
10 mkdir output_dir &&
11 ## GTDB-Tk can read *.fna and *.fna.gz, but everything is staged uncompressed to simplify the workflow; only a trailing run of .fna/.fasta/.fa/.gz extensions is stripped from the element identifier
12 #for $i in $input:
13 #set cleaned = re.sub(r'(\.(fna|fasta|fa|gz))+\Z', '', $i.element_identifier)
14 #set final_name = cleaned + '.fna'
15 #if $i.ext in ['fasta.gz']:
16 gunzip -c '${i}' > input_dir/'${final_name}' &&
17 #else:
18 ln -s '${i}' input_dir/'${final_name}' &&
19 #end if
20 #end for
21 export GTDBTK_DATA_PATH='$gtdbtk_db.fields.path' &&
22 gtdbtk classify_wf
23 --genome_dir input_dir
24 --out_dir output_dir
25 --mash_db '$gtdbtk_db.fields.path'
26 --cpus \${GALAXY_SLOTS:-4}
27 --min_perc_aa $advanced.min_perc_aa --min_af $advanced.min_af $advanced.force
28 #if str($advanced.output_process_log) == 'yes':
29 && cat output_dir/gtdbtk.warnings.log output_dir/gtdbtk.log > '$process_log'
30 #end if
31 ]]></command>
32 <inputs>
33 <param name="input" type="data" format="fasta,fasta.gz" multiple="true" label="Fasta (Genome) files"/> <!-- one or more genomes; the command stages each one in input_dir with a .fna extension -->
34 <param name="gtdbtk_db" type="select" label="GTDB-Tk database"> <!-- the command uses the selected row's "path" field as GTDBTK_DATA_PATH and as the mash_db argument -->
35 <options from_data_table="gtdbtk_database_versioned">
36 <validator type="no_options" message="No locally cached GTDB-Tk database is available"/>
37 </options>
38 </param>
39 <section name="advanced" title="Advanced options">
40 <param argument="--min_perc_aa" type="integer" min="0" max="100" value="10" label="Exclude genomes that do not have at least this percentage of AA in the MSA" help="Inclusive bound"/>
41 <param argument="--force" type="boolean" truevalue="--force" falsevalue="" checked="false" label="Continue processing if an error occurs on a single genome?"/>
42 <param argument="--min_af" type="float" min="0" max="1" value="0.65" label="Minimum alignment fraction to consider closest genome"/>
43 <param name="output_process_log" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Output process log file?"/> <!-- when "yes", the command concatenates gtdbtk.warnings.log and gtdbtk.log into the process_log output -->
44 </section>
45 </inputs>
46 <outputs>
47 <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)"> <!-- only produced when the advanced "Output process log file?" option is enabled -->
48 <filter>advanced['output_process_log']</filter>
49 </data>
50 <collection name="output_align" type="list" format="fasta.gz" label="${tool.name} on ${on_string} (align)"> <!-- each collection gathers files from one directory under output_dir; the "designation" group (text before the extension) names the element -->
51 <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fasta\.gz" ext="fasta.gz" directory="output_dir/align"/>
52 </collection>
53 <collection name="output_identfy" type="list" format="tsv" label="${tool.name} on ${on_string} (identify)"> <!-- name spelling kept as-is: existing tests and workflows refer to it -->
54 <discover_datasets pattern="(?P&lt;designation&gt;.+)\.tsv" ext="tsv" directory="output_dir/identify"/>
55 </collection>
56 <collection name="output_classify" type="list" format="newick" label="${tool.name} on ${on_string} (classify)">
57 <discover_datasets pattern="(?P&lt;designation&gt;.+)\.tree" ext="newick" directory="output_dir/classify"/>
58 </collection>
59 <collection name="output_summary" type="list" format="tsv" label="${tool.name} on ${on_string} (summary)"> <!-- summary tables sit directly in output_dir -->
60 <discover_datasets pattern="(?P&lt;designation&gt;.+)\.tsv" ext="tsv" directory="output_dir"/>
61 </collection>
62 </outputs>
63 <tests>
64 <!-- The commented-out test below would be valid if the GTDB-Tk database could be stored with the test data -->
65 <!--
66 <test expect_num_outputs="4">
67 <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/>
68 <param name="gtdbtk_db" value="gtdbtk214"/>
69 <output_collection name="output_summary" type="list" count="1">
70 <element name="gtdbtk.ar53.summary" ftype="tsv">
71 <assert_contents>
72 <has_text text="user_genome"/>
73 </assert_contents>
74 </element>
75 </output_collection>
76 <output_collection name="output_identfy" type="list" count="4">
77 <element name="gtdbtk.ar53.markers_summary" ftype="tsv">
78 <assert_contents>
79 <has_text text="number_unique_genes"/>
80 </assert_contents>
81 </element>
82 <element name="gtdbtk.bac120.markers_summary" ftype="tsv">
83 <assert_contents>
84 <has_text text="genome_1_fna_gz"/>
85 </assert_contents>
86 </element>
87 <element name="gtdbtk.failed_genomes" ftype="tsv">
88 <assert_contents>
89 <has_size value="0"/>
90 </assert_contents>
91 </element>
92 <element name="gtdbtk.translation_table_summary" ftype="tsv">
93 <assert_contents>
94 <has_text text="genome_1_fna_gz"/>
95 </assert_contents>
96 </element>
97 </output_collection>
98 <output_collection name="output_classify" type="list" count="1">
99 <element name="gtdbtk.ar53.classify" ftype="newick">
100 <assert_contents>
101 <has_text text="GB_GCA_"/>
102 </assert_contents>
103 </element>
104 </output_collection>
105 <output_collection name="output_align" type="list" count="2">
106 <element name="gtdbtk.ar53.msa" ftype="fasta.gz" decompress="true">
107 <assert_contents>
108 <has_text text="GB_GCA_000008085"/>
109 </assert_contents>
110 </element>
111 <element name="gtdbtk.ar53.user_msa" ftype="fasta.gz" decompress="true">
112 <assert_contents>
113 <has_text text="genome_1_fna_gz"/>
114 </assert_contents>
115 </element>
116 </output_collection>
117 </test>
118 -->
119 <!-- GTDB-Tk databases are far too large to ship as test data, so this test only checks that the job fails with exit code 1 -->
120 <test expect_failure="true">
121 <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/>
122 <param name="gtdbtk_db" value="gtdbtk214"/>
123 <assert_stderr>
124 <has_text text="Fatal error: Exit code 1"/>
125 </assert_stderr>
126 </test>
127 </tests>
128 <help><![CDATA[
129 **What it does**
130
131 GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes
132 based on the Genome Taxonomy Database (GTDB). It is designed to work with recent advances that allow hundreds or
133 thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also
134 be applied to isolate and single-cell genomes.
135
136 This tool accepts one or more fasta (genome) files and determines taxonomic classification of genomes by
137 maximum-likelihood (ML) placement. The classification workflow consists of three steps: identify, align, and
138 classify.
139
140 The identify step calls genes using Prodigal, and uses HMM models and the HMMER package to identify the 120 bacterial
141 and 53 archaeal marker genes used for phylogenetic inference. Multiple sequence alignments (MSA) are obtained by
142 aligning marker genes to their respective HMM model.
143
144 The align step concatenates the aligned marker genes and filters the concatenated MSA to approximately 5,000
145 amino acids.
146
147 Finally, the classify step uses pplacer to find the maximum-likelihood placement of each genome in the GTDB-Tk
148 reference tree. GTDB-Tk classifies each genome based on its placement in the reference tree, its relative evolutionary
149 divergence, and/or average nucleotide identity (ANI) to reference genomes.
150
151 Results can be impacted by a lack of marker genes or contamination.
152 ]]></help>
153 <citations><citation type="doi">10.1093/bioinformatics/btac672</citation></citations> <!-- inlined: this file has no macros section, so the former expand of a "citations" macro had nothing to resolve against -->
154 </tool>
155