Mercurial > repos > estrain > gtdbtk_classify_wf
comparison gtdbtk_classify_wf.xml @ 0:a20cd9311046 draft
planemo upload commit bdb45cf3a98e21f5002866b6789a1457f521bf5d
| author | estrain |
|---|---|
| date | Thu, 12 Mar 2026 20:07:08 +0000 |
| parents | |
| children | 353347ef2386 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a20cd9311046 |
|---|---|
| 1 <tool id="gtdbtk_classify_wf" name="GTDB-Tk Classify genomes" version="0.1.3" profile="24"> | |
| 2 <description>by placement in GTDB reference tree</description> | |
| 3 <requirements> | |
| 4 <requirement type="package" version="2.5.2">gtdbtk</requirement> | |
| 5 </requirements> | |
| 6 <command detect_errors="exit_code"><![CDATA[ | |
| 7 #import re | |
| 8 ## Stage every input as input_dir/NAME.fna, then run the complete classify workflow on that directory. | |
| 9 mkdir input_dir && | |
| 10 mkdir output_dir && | |
| 11 ## GTDB-Tk can read both *.fna and *.fna.gz, but everything is staged uncompressed to simplify the workflow. Only a trailing FASTA/gzip extension is stripped (anchored at the end of the name); characters outside A-Z a-z 0-9 _ . - become "_" so the name is safe inside the single-quoted shell path. | |
| 12 #for $i in $input: | |
| 13 #set cleaned = re.sub(r'[^A-Za-z0-9_.\-]', '_', re.sub(r'(\.(fna|fasta|fa))?(\.gz)?\Z', '', str($i.element_identifier))) | |
| 14 #set final_name = cleaned + '.fna' | |
| 15 #if $i.ext in ['fasta.gz']: | |
| 16 gunzip -c '${i}' > input_dir/'${final_name}' && | |
| 17 #else: | |
| 18 ln -s '${i}' input_dir/'${final_name}' && | |
| 19 #end if | |
| 20 #end for | |
| 21 export GTDBTK_DATA_PATH='$gtdbtk_db.fields.path' && | |
| 22 gtdbtk classify_wf | |
| 23 --genome_dir input_dir | |
| 24 --out_dir output_dir | |
| 25 --mash_db '$gtdbtk_db.fields.path' | |
| 26 --cpus \${GALAXY_SLOTS:-4} | |
| 27 --min_perc_aa $advanced.min_perc_aa --min_af $advanced.min_af $advanced.force | |
| 28 #if str($advanced.output_process_log) == 'yes': | |
| 29 && cat output_dir/gtdbtk.warnings.log output_dir/gtdbtk.log > '$process_log' | |
| 30 #end if | |
| 31 ]]></command> | |
| 32 <inputs> <!-- Tool form: genomes to classify, reference database choice, and optional tuning --> | |
| 33 <param name="input" type="data" format="fasta,fasta.gz" multiple="true" label="Fasta (Genome) files"/> <!-- one or more plain or gzipped FASTA datasets --> | |
| 34 <param name="gtdbtk_db" type="select" label="GTDB-Tk database"> <!-- choices come from the gtdbtk_database_versioned data table; the command reads the selected entry's "path" field --> | |
| 35 <options from_data_table="gtdbtk_database_versioned"> | |
| 36 <validator type="no_options" message="No locally cached GTDB-Tk database is available"/> <!-- reject the form when the data table has no entries --> | |
| 37 </options> | |
| 38 </param> | |
| 39 <section name="advanced" title="Advanced options"> <!-- values are addressed as advanced.NAME in the command template and output filter --> | |
| 40 <param argument="--min_perc_aa" type="integer" min="0" max="100" value="10" label="Exclude genomes that do not have at least this percentage of AA in the MSA" help="Inclusive bound"/> | |
| 41 <param argument="--force" type="boolean" truevalue="--force" falsevalue="" checked="false" label="Continue processing if an error occurs on a single genome?"/> | |
| 42 <param argument="--min_af" type="float" min="0" max="1" value="0.65" label="Minimum alignment fraction to consider closest genome"/> | |
| 43 <param name="output_process_log" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Output process log file?"/> <!-- the command template compares this value against 'yes' before writing the process log --> | |
| 44 </section> | |
| 45 </inputs> | |
| 46 <outputs> <!-- One optional log dataset plus four list collections discovered from output_dir after the run --> | |
| 47 <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)"> | |
| 48 <filter>advanced['output_process_log']</filter> <!-- dataset is only created when the "Output process log file?" option is enabled --> | |
| 49 </data> | |
| 50 <collection name="output_align" type="list" format="fasta.gz" label="${tool.name} on ${on_string} (align)"> <!-- gzipped FASTA files found in output_dir/align --> | |
| 51 <discover_datasets pattern="(?P<designation>.+)\.fasta.gz" ext="fasta.gz" directory="output_dir/align"/> | |
| 52 </collection> | |
| 53 <collection name="output_identfy" type="list" format="tsv" label="${tool.name} on ${on_string} (identify)"> <!-- NOTE(review): "output_identfy" looks like a typo of "output_identify"; left unchanged because the name is part of the tool's output interface --> | |
| 54 <discover_datasets pattern="(?P<designation>.+)\.tsv" ext="tsv" directory="output_dir/identify"/> | |
| 55 </collection> | |
| 56 <collection name="output_classify" type="list" format="newick" label="${tool.name} on ${on_string} (classify)"> <!-- *.tree files found in output_dir/classify, typed as newick --> | |
| 57 <discover_datasets pattern="(?P<designation>.+)\.tree" ext="newick" directory="output_dir/classify"/> | |
| 58 </collection> | |
| 59 <collection name="output_summary" type="list" format="tsv" label="${tool.name} on ${on_string} (summary)"> <!-- *.tsv files found directly in output_dir --> | |
| 60 <discover_datasets pattern="(?P<designation>.+)\.tsv" ext="tsv" directory="output_dir"/> | |
| 61 </collection> | |
| 62 </outputs> | |
| 63 <tests> | |
| 64 <!-- The commented-out test below would be valid if the GTDB-Tk database could be stored with the test data --> | |
| 65 <!-- | |
| 66 <test expect_num_outputs="4"> | |
| 67 <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> | |
| 68 <param name="gtdbtk_db" value="gtdbtk214"/> | |
| 69 <output_collection name="output_summary" type="list" count="1"> | |
| 70 <element name="gtdbtk.ar53.summary" ftype="tsv"> | |
| 71 <assert_contents> | |
| 72 <has_text text="user_genome"/> | |
| 73 </assert_contents> | |
| 74 </element> | |
| 75 </output_collection> | |
| 76 <output_collection name="output_identfy" type="list" count="4"> | |
| 77 <element name="gtdbtk.ar53.markers_summary" ftype="tsv"> | |
| 78 <assert_contents> | |
| 79 <has_text text="number_unique_genes"/> | |
| 80 </assert_contents> | |
| 81 </element> | |
| 82 <element name="gtdbtk.bac120.markers_summary" ftype="tsv"> | |
| 83 <assert_contents> | |
| 84 <has_text text="genome_1_fna_gz"/> | |
| 85 </assert_contents> | |
| 86 </element> | |
| 87 <element name="gtdbtk.failed_genomes" ftype="tsv"> | |
| 88 <assert_contents> | |
| 89 <has_size value="0"/> | |
| 90 </assert_contents> | |
| 91 </element> | |
| 92 <element name="gtdbtk.translation_table_summary" ftype="tsv"> | |
| 93 <assert_contents> | |
| 94 <has_text text="genome_1_fna_gz"/> | |
| 95 </assert_contents> | |
| 96 </element> | |
| 97 </output_collection> | |
| 98 <output_collection name="output_classify" type="list" count="1"> | |
| 99 <element name="gtdbtk.ar53.classify" ftype="newick"> | |
| 100 <assert_contents> | |
| 101 <has_text text="GB_GCA_"/> | |
| 102 </assert_contents> | |
| 103 </element> | |
| 104 </output_collection> | |
| 105 <output_collection name="output_align" type="list" count="2"> | |
| 106 <element name="gtdbtk.ar53.msa" ftype="fasta.gz" decompress="true"> | |
| 107 <assert_contents> | |
| 108 <has_text text="GB_GCA_000008085"/> | |
| 109 </assert_contents> | |
| 110 </element> | |
| 111 <element name="gtdbtk.ar53.user_msa" ftype="fasta.gz" decompress="true"> | |
| 112 <assert_contents> | |
| 113 <has_text text="genome_1_fna_gz"/> | |
| 114 </assert_contents> | |
| 115 </element> | |
| 116 </output_collection> | |
| 117 </test> | |
| 118 --> | |
| 119 <!-- GTDB-Tk reference databases are far too large to ship as test data, so this test only asserts that the run fails with exit code 1 (presumably because the gtdbtk214 database is absent in the test environment; confirm) --> | |
| 120 <test expect_failure="true"> | |
| 121 <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> | |
| 122 <param name="gtdbtk_db" value="gtdbtk214"/> | |
| 123 <assert_stderr> | |
| 124 <has_text text="Fatal error: Exit code 1"/> | |
| 125 </assert_stderr> | |
| 126 </test> | |
| 127 </tests> | |
| 128 <help><![CDATA[ | |
| 129 **What it does** | |
| 130 | |
| 131 GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes | |
| 132 based on the Genome Taxonomy Database (GTDB). It is designed to work with recent advances that allow hundreds or | |
| 133 thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also | |
| 134 be applied to isolate and single-cell genomes. | |
| 135 | |
| 136 This tool accepts one or more fasta (genome) files and determines taxonomic classification of genomes by | |
| 137 maximum-likelihood (ML) placement. The classification workflow consists of three steps: identify, align, and | |
| 138 classify. | |
| 139 | |
| 140 The identify step calls genes using Prodigal, and uses HMM models and the HMMER package to identify the 120 bacterial | |
| 141 and 53 archaeal marker genes used for phylogenetic inference. Multiple sequence alignments (MSA) are obtained by | |
| 142 aligning marker genes to their respective HMM model. | |
| 143 | |
| 144 The align step concatenates the aligned marker genes and filters the concatenated MSA to approximately 5,000 | |
| 145 amino acids. | |
| 146 | |
| 147 Finally, the classify step uses pplacer to find the maximum-likelihood placement of each genome in the GTDB-Tk | |
| 148 reference tree. GTDB-Tk classifies each genome based on its placement in the reference tree, its relative evolutionary | |
| 149 divergence, and/or average nucleotide identity (ANI) to reference genomes. | |
| 150 | |
| 151 Results can be impacted by a lack of marker genes or contamination. | |
| 152 ]]></help> | |
| 153 <!-- Citations are declared inline because this file defines and imports no macros, so an expand of a "citations" macro could not be resolved --> <citations><citation type="doi">10.1093/bioinformatics/btac672</citation><citation type="doi">10.1093/bioinformatics/btz848</citation></citations> | |
| 154 </tool> | |
| 155 |
