Mercurial > repos > kkonganti > cfsan_flye
comparison cfsan_flye.xml @ 0:96bb0635f0a0
"planemo upload"
author | kkonganti |
---|---|
date | Fri, 24 Jun 2022 14:18:37 -0400 |
parents | |
children | 5c18b16d6ac1 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:96bb0635f0a0 |
---|---|
1 <tool id="cfsan_flye" name="CFSAN_Flye" version="2.8.1+galaxy0" profile="20.01"> | |
2 <description>de novo assembler for single molecule sequencing reads</description> | |
3 <!-- NOTE(review): flye is pinned to 2.8.1, yet the help text of the scaffold input further down says scaffolding became opt-in "starting from the version 2.9"; confirm that the pinned release accepts every flag this wrapper can emit (for example the scaffold and nano-hq options). --> <requirements> | |
4 <requirement type="package" version="2.8.1">flye</requirement> | |
5 </requirements> | |
6 <version_command>flye --version</version_command> | |
7 <!-- Cheetah template: picks a file extension from the input datatype, symlinks the dataset under a name ending in that extension (presumably so flye can recognise the format; confirm), then runs flye with the selected mode and options. Shell paths are quoted so unusual dataset paths cannot split into several arguments. --> <command detect_errors="exit_code"><![CDATA[ | |
8 #if $inputs.is_of_type('fastqsanger', 'fastq'): | |
9 #set $ext = 'fastq' | |
10 #elif $inputs.is_of_type('fastqsanger.gz', 'fastq.gz'): | |
11 #set $ext = 'fastq.gz' | |
12 #elif $inputs.is_of_type('fasta.gz'): | |
13 #set $ext = 'fasta.gz' | |
14 #elif $inputs.is_of_type('fasta'): | |
15 #set $ext = 'fasta' | |
16 #end if | |
17 infile="\$(basename '$inputs' '.dat').${ext}" && | |
18 ln -s '$inputs' "./\${infile}" && | |
19 flye | |
20 $mode_conditional.mode | |
21 "\${infile}" | |
22 -o out_dir | |
23 -t \${GALAXY_SLOTS:-4} | |
24 -i $iterations | |
25 #if $mode_conditional.mode == '--pacbio-hifi' and $mode_conditional.hifi_error: | |
26 --hifi-error $mode_conditional.hifi_error | |
27 #end if | |
28 #if $min_overlap: | |
29 -m $min_overlap | |
30 #end if | |
31 #if $asm.asm_select == 'true': | |
32 --asm-coverage $asm.asm_coverage | |
33 -g '${asm.genome_size}' | |
34 #end if | |
35 $meta | |
36 $scaffold $keep_haplotypes | |
37 ]]></command> | |
38 <!-- User-facing parameters: input reads, sequencing mode (with the HiFi error rate only in the pacbio-hifi branch), polishing and overlap tuning, assembly flags, optional coverage reduction, and the log toggle used by the outputs filter. --> <inputs> | |
39 <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" label="Input reads" /> | |
40 <conditional name="mode_conditional"> | |
41 <param name="mode" type="select" label="Mode"> | |
42 <option value="--nano-raw">Nanopore raw (--nano-raw)</option> | |
43 <option value="--nano-corr">Nanopore corrected (--nano-corr)</option> | |
44 <option value="--nano-hq">Nanopore HQ (--nano-hq)</option> | |
45 <option value="--pacbio-raw">PacBio raw (--pacbio-raw)</option> | |
46 <option value="--pacbio-corr">PacBio corrected (--pacbio-corr)</option> | |
47 <option value="--pacbio-hifi">PacBio HiFi (--pacbio-hifi)</option> | |
48 </param> | |
49 <when value="--nano-raw"/> | |
50 <when value="--nano-corr"/> | |
51 <when value="--nano-hq"/> | |
52 <when value="--pacbio-raw"/> | |
53 <when value="--pacbio-corr"/> | |
54 <when value="--pacbio-hifi"> | |
55 <param argument="--hifi-error" type="float" min="0" max="1" optional="true" label="Expected HiFi reads error rate" help="Default: 0.01"/> | |
56 </when> | |
57 </conditional> | |
58 <param argument="--iterations" type="integer" min="0" value="1" label="Number of polishing iterations" | |
59 help="Polishing is performed as the final assembly stage. By default, Flye runs one polishing iteration. Additional iterations | |
60 might correct a small number of extra errors (due to improvements on how reads may align to the corrected assembly). If the | |
61 parameter is set to 0, the polishing is not performed"/> | |
62 <param argument="--min-overlap" type="integer" min="1000" max="10000" optional="true" label="Minimum overlap between reads" | |
63 help="This sets a minimum overlap length for two reads to be considered overlapping. By default it is chosen | |
64 automatically based on the read length distribution (reads N90) and does not require manual setting. Typical | |
65 value is 3k-5k (and down to 1k for datasets with shorter read length). Intuitively, we want to set this | |
66 parameter as high as possible, so the repeat graph is less tangled. However, higher values might lead to assembly gaps. | |
67 In some rare cases it makes sense to manually increase minimum overlap for assemblies of big genomes with long reads and high coverage." /> | |
68 <param argument="--keep-haplotypes" type="boolean" truevalue="--keep-haplotypes" falsevalue="" checked="False" label="Keep haplotypes" | |
69 help="By default, Flye collapses graph structures caused by alternative haplotypes (bubbles, superbubbles, roundabouts) to produce longer | |
70 consensus contigs. This option retains the alternative paths on the graph, producing less contiguous, but more detailed assembly."/> | |
71 <param argument="--scaffold" type="boolean" truevalue="--scaffold" falsevalue="" checked="False" label="Enable scaffolding using graph" | |
72 help="Starting from the version 2.9 Flye does not perform scaffolding by default, which guarantees that all assembled sequences do not have any gaps" /> | |
73 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="Perform metagenomic assembly" | |
74 help="It is designed for highly non-uniform coverage and is sensitive to underrepresented sequence at low coverage (as low as 2x). | |
75 In some examples of simple metagenomes, we observed that the normal mode assembled more contiguous bacterial | |
76 consensus sequence, while the metagenome mode was slightly more fragmented, but revealed strain mixtures"/> | |
77 <conditional name="asm"> | |
78 <param name="asm_select" type="select" label="Reduced contig assembly coverage" help="Typically, assemblies of large genomes at high coverage require hundreds of GB of RAM. For high coverage assemblies, | |
79 you can reduce memory usage by using only a subset of longest reads for initial contig extension stage (usually, the memory bottleneck)"> | |
80 <option value="true">Enable reduced coverage for initial disjointing assembly</option> | |
81 <option value="false" selected="true">Disable reduced coverage for initial disjointing assembly</option> | |
82 </param> | |
83 <when value="true"> | |
84 <param argument="--asm-coverage" type="integer" min="0" value="30" | |
85 label="Reduced coverage for initial disjointing assembly" | |
86 help="This parameter specifies the target coverage of the longest reads. For a typical assembly, 30x is enough to produce good | |
87 initial contigs. Regardless of this parameter, all reads will be used at the later pipeline stages."/> | |
88 <param argument="--genome-size" type="text" optional="true" label="Estimated genome size" | |
89 help="For example, 5m or 2.6g. No longer required as input. However, it must be used in conjunction with --asm-coverage option."> | |
90 <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit prefix (k, m or g)">^([0-9]*[.])?[0-9]+[kmg]?$</validator> | |
91 </param> | |
92 </when> | |
93 <when value="false" /> | |
94 </conditional> | |
95 <param name="generate_log" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a log file"/> | |
96 </inputs> | |
97 <!-- Result files picked up from out_dir in the job working directory; the log dataset is only created when generate_log is true. --> <outputs> | |
98 <data name="consensus" label="${tool.name} on ${on_string}: consensus" format="fasta" from_work_dir="out_dir/assembly.fasta"/> | |
99 <data name="assembly_graph" label="${tool.name} on ${on_string}: assembly graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv"/> | |
100 <data name="assembly_gfa" label="${tool.name} on ${on_string}: graphical fragment assembly" format="txt" from_work_dir="out_dir/assembly_graph.gfa"/> | |
101 <data name="assembly_info" label="${tool.name} on ${on_string}: assembly info" format="tabular" from_work_dir="out_dir/assembly_info.txt"/> | |
102 <data name="flye_log" label="${tool.name} on ${on_string}: log" format="txt" from_work_dir="out_dir/flye.log"> | |
103 <filter>generate_log</filter> | |
104 </data> | |
105 </outputs> | |
106 <tests> | |
107 <!-- Test 01: PacBio raw mode on gzipped FASTQ with polishing disabled (iterations=0) and the log requested; all five outputs are compared to reference files by size (sim_size). --> | |
108 <test expect_num_outputs="5"> | |
109 <param name="inputs" ftype="fastq.gz" value="ecoli.fastq.gz"/> | |
110 <param name="mode" value="--pacbio-raw"/> | |
111 <param name="iterations" value="0"/> | |
112 <param name="generate_log" value="true"/> | |
113 <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/> | |
114 <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> | |
115 <output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="txt" compare="sim_size"/> | |
116 <output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/> | |
117 <output name="flye_log" file="result1.log" ftype="txt" compare="sim_size"/> | |
118 </test> | |
119 <!-- Test 02: Nanopore raw mode on gzipped FASTA with polishing disabled (iterations=0); the four default outputs are checked by size only, each within a 100-byte delta. --> | |
120 <test expect_num_outputs="4"> | |
121 <param name="inputs" ftype="fasta.gz" value="nanopore.fasta.gz"/> | |
122 <param name="mode" value="--nano-raw"/> | |
123 <param name="iterations" value="0"/> | |
124 <output name="assembly_info" ftype="tabular"> | |
125 <assert_contents> | |
126 <has_size value="95" delta="100"/> | |
127 </assert_contents> | |
128 </output> | |
129 <output name="assembly_graph" ftype="graph_dot"> | |
130 <assert_contents> | |
131 <has_size value="803" delta="100"/> | |
132 </assert_contents> | |
133 </output> | |
134 <output name="assembly_gfa" ftype="txt"> | |
135 <assert_contents> | |
136 <has_size value="35047" delta="100"/> | |
137 </assert_contents> | |
138 </output> | |
139 <output name="consensus" ftype="fasta"> | |
140 <assert_contents> | |
141 <has_size value="35573" delta="100"/> | |
142 </assert_contents> | |
143 </output> | |
144 </test> | |
145 <!-- Test 03: reduced-coverage assembly (asm_select=true) with an explicit asm_coverage of 30 and a genome size; the coverage parameter is addressed by its real input name so the value is actually applied. --> | |
146 <test expect_num_outputs="4"> | |
147 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/> | |
148 <conditional name="mode_conditional"> | |
149 <param name="mode" value="--nano-raw"/> | |
150 </conditional> | |
151 <conditional name="asm"> | |
152 <param name="asm_select" value="true" /> | |
153 <param name="asm_coverage" value="30"/> | |
154 <param name="genome_size" value="3980000"/> | |
155 </conditional> | |
156 <output name="assembly_info" ftype="tabular"> | |
157 <assert_contents> | |
158 <has_size value="286" delta="100"/> | |
159 </assert_contents> | |
160 </output> | |
161 <output name="assembly_graph" ftype="graph_dot"> | |
162 <assert_contents> | |
163 <has_size value="1840" delta="100"/> | |
164 </assert_contents> | |
165 </output> | |
166 <output name="assembly_gfa" ftype="txt"> | |
167 <assert_contents> | |
168 <has_size value="420752" delta="100"/> | |
169 </assert_contents> | |
170 </output> | |
171 <output name="consensus" ftype="fasta"> | |
172 <assert_contents> | |
173 <has_size value="427580" delta="100"/> | |
174 </assert_contents> | |
175 </output> | |
176 </test> | |
177 <!-- Test 04: metagenomic mode (meta=true) with PacBio raw reads; the four default outputs are checked by size only, each within a 100-byte delta. --> | |
178 <test expect_num_outputs="4"> | |
179 <param name="inputs" ftype="fastq.gz" value="ecoli.fastq.gz"/> | |
180 <conditional name="mode_conditional"> | |
181 <param name="mode" value="--pacbio-raw"/> | |
182 </conditional> | |
183 <param name="meta" value="true"/> | |
184 <output name="assembly_info" ftype="tabular"> | |
185 <assert_contents> | |
186 <has_size value="95" delta="100"/> | |
187 </assert_contents> | |
188 </output> | |
189 <output name="assembly_graph" ftype="graph_dot"> | |
190 <assert_contents> | |
191 <has_size value="367" delta="100"/> | |
192 </assert_contents> | |
193 </output> | |
194 <output name="assembly_gfa" ftype="txt"> | |
195 <assert_contents> | |
196 <has_size value="418729" delta="100"/> | |
197 </assert_contents> | |
198 </output> | |
199 <output name="consensus" ftype="fasta"> | |
200 <assert_contents> | |
201 <has_size value="425667" delta="100"/> | |
202 </assert_contents> | |
203 </output> | |
204 </test> | |
205 <!-- Test 05: Nanopore HQ mode with the minimum overlap set to 1000; the four default outputs are checked by size only, each within a 100-byte delta. --> | |
206 <test expect_num_outputs="4"> | |
207 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/> | |
208 <conditional name="mode_conditional"> | |
209 <param name="mode" value="--nano-hq"/> | |
210 </conditional> | |
211 <param name="min_overlap" value="1000"/> | |
212 <output name="assembly_info" ftype="tabular"> | |
213 <assert_contents> | |
214 <has_size value="286" delta="100"/> | |
215 </assert_contents> | |
216 </output> | |
217 <output name="assembly_graph" ftype="graph_dot"> | |
218 <assert_contents> | |
219 <has_size value="1248" delta="100"/> | |
220 </assert_contents> | |
221 </output> | |
222 <output name="assembly_gfa" ftype="txt"> | |
223 <assert_contents> | |
224 <has_size value="420252" delta="100"/> | |
225 </assert_contents> | |
226 </output> | |
227 <output name="consensus" ftype="fasta"> | |
228 <assert_contents> | |
229 <has_size value="427129" delta="100"/> | |
230 </assert_contents> | |
231 </output> | |
232 </test> | |
233 <!-- Test 06: PacBio HiFi mode with an explicit HiFi error rate (0.21) and the minimum overlap set to 1000; the four default outputs are checked by size only, each within a 100-byte delta. --> | |
234 <test expect_num_outputs="4"> | |
235 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/> | |
236 <conditional name="mode_conditional"> | |
237 <param name="mode" value="--pacbio-hifi"/> | |
238 <param name="hifi_error" value="0.21"/> | |
239 </conditional> | |
240 <param name="min_overlap" value="1000"/> | |
241 <output name="assembly_info" ftype="tabular"> | |
242 <assert_contents> | |
243 <has_size value="286" delta="100"/> | |
244 </assert_contents> | |
245 </output> | |
246 <output name="assembly_graph" ftype="graph_dot"> | |
247 <assert_contents> | |
248 <has_size value="1273" delta="100"/> | |
249 </assert_contents> | |
250 </output> | |
251 <output name="assembly_gfa" ftype="txt"> | |
252 <assert_contents> | |
253 <has_size value="420252" delta="100"/> | |
254 </assert_contents> | |
255 </output> | |
256 <output name="consensus" ftype="fasta"> | |
257 <assert_contents> | |
258 <has_size value="427129" delta="100"/> | |
259 </assert_contents> | |
260 </output> | |
261 </test> | |
262 <!-- Test 07: keep haplotypes, using PacBio corrected mode with the minimum overlap set to 1000; the flag is addressed by its real input name (keep_haplotypes) so the option is actually switched on. --> | |
263 <test expect_num_outputs="4"> | |
264 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/> | |
265 <conditional name="mode_conditional"> | |
266 <param name="mode" value="--pacbio-corr"/> | |
267 <!-- hifi_error exists only in the pacbio-hifi branch, so it is not set for this mode. NOTE(review): the expected sizes below were recorded while the haplotype flag was not being applied; re-check them against a real run. --> | |
268 </conditional> | |
269 <param name="min_overlap" value="1000"/> | |
270 <param name="keep_haplotypes" value="true"/> | |
271 <output name="assembly_info" ftype="tabular"> | |
272 <assert_contents> | |
273 <has_size value="286" delta="100"/> | |
274 </assert_contents> | |
275 </output> | |
276 <output name="assembly_graph" ftype="graph_dot"> | |
277 <assert_contents> | |
278 <has_size value="1273" delta="100"/> | |
279 </assert_contents> | |
280 </output> | |
281 <output name="assembly_gfa" ftype="txt"> | |
282 <assert_contents> | |
283 <has_size value="420252" delta="100"/> | |
284 </assert_contents> | |
285 </output> | |
286 <output name="consensus" ftype="fasta"> | |
287 <assert_contents> | |
288 <has_size value="427129" delta="100"/> | |
289 </assert_contents> | |
290 </output> | |
291 </test> | |
292 <!-- Test 08: scaffolding enabled in Nanopore HQ mode with the minimum overlap set to 1000; the flag is addressed by its real input name (scaffold). NOTE(review): the expected sizes were recorded while the flag was not being applied, and the pinned flye release must accept the scaffold option; re-check against a real run. --> | |
293 <test expect_num_outputs="4"> | |
294 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/> | |
295 <param name="mode" value="--nano-hq"/> | |
296 <param name="min_overlap" value="1000"/> | |
297 <param name="scaffold" value="true"/> | |
298 <output name="assembly_info" ftype="tabular"> | |
299 <assert_contents> | |
300 <has_size value="286" delta="100"/> | |
301 </assert_contents> | |
302 </output> | |
303 <output name="assembly_graph" ftype="graph_dot"> | |
304 <assert_contents> | |
305 <has_size value="1248" delta="100"/> | |
306 </assert_contents> | |
307 </output> | |
308 <output name="assembly_gfa" ftype="txt"> | |
309 <assert_contents> | |
310 <has_size value="420252" delta="100"/> | |
311 </assert_contents> | |
312 </output> | |
313 <output name="consensus" ftype="fasta"> | |
314 <assert_contents> | |
315 <has_size value="427129" delta="100"/> | |
316 </assert_contents> | |
317 </output> | |
318 </test> | |
319 </tests> | |
320 <help><![CDATA[ | |
321 | |
322 .. class:: infomark | |
323 | |
324 **Purpose** | |
325 | |
326 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies. | |
327 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents | |
328 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome | |
329 assembly. | |
330 | |
331 ---- | |
332 | |
333 .. class:: infomark | |
334 | |
335 **Quick usage** | |
336 | |
337 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads | |
338 (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily | |
339 developed to run on raw reads. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta* | |
340 option enables the mode for metagenome/uneven coverage assembly. | |
341 | |
342 Genome size estimate is no longer a required option. You need to provide an estimate if using *--asm-coverage* option. | |
343 | |
344 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by | |
345 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs. | |
346 | |
347 ---- | |
348 | |
349 .. class:: infomark | |
350 | |
351 **Outputs** | |
352 | |
353 The main output files are: | |
354 | |
355 :: | |
356 | |
357 - Final assembly: contains contigs and possibly scaffolds (see below). | |
358 - Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges. | |
359 - Extra information about contigs (such as length or coverage). | |
360 | |
361 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus, | |
362 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to unitig-contig relation in | |
363 OLC assemblers. In a rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will be also output in the assembly file. | |
364 | |
365 Sometimes it is possible to further order contigs into scaffolds based on the repeat graph structure. These ordered contigs will be output as a part of scaffold in | |
366 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns. | |
367 assembly_info.txt file (below) contains additional information about how scaffolds were formed. | |
368 | |
369 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows: | |
370 | |
371 :: | |
372 | |
373 - Contig/scaffold id | |
374 - Length | |
375 - Coverage | |
376 - Is circular, (Y)es or (N)o | |
377 - Is repetitive, (Y)es or (N)o | |
378 - Multiplicity (based on coverage) | |
379 - Alternative group | |
380 - Graph path (graph path corresponding to this contig/scaffold). | |
381 | |
382 Scaffold gaps are marked with ?? symbols, and * symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt. | |
383 group ID. Primary contigs are marked by *. | |
384 | |
385 ---- | |
386 | |
387 .. class:: infomark | |
388 | |
389 **Algorithm Description** | |
390 | |
391 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows: | |
392 | |
393 :: | |
394 | |
395 - K-mer counting / erroneous k-mer pre-filtering | |
396 - Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous) | |
397 - Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers). | |
398 | |
399 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft | |
400 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows: | |
401 | |
402 :: | |
403 | |
404 - Repeat graph is constructed from the (possibly misassembled) contigs | |
405 - In this graph all repeats longer than minimum overlap are collapsed | |
406 - The algorithm resolves repeats using the read information and graph structure | |
407 - The unbranching paths in the graph are output as contigs | |
408 | |
409 If enabled, after resolving bridged repeats, Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies. | |
410 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors: | |
411 | |
412 :: | |
413 | |
414 - Alignment of all reads to the current assembly using minimap2 | |
415 - Partition the alignment into mini-alignments (bubbles) | |
416 - Error correction of each bubble using a maximum likelihood approach | |
417 | |
418 | |
419 The polishing steps could be repeated, which might slightly increase quality for some datasets. | |
420 | |
421 | |
422 ]]></help> | |
423 <!-- References shown to users: one DOI citation plus a BibTeX entry for the Flye GitHub repository. --> <citations> | |
424 <citation type="doi">10.1073/pnas.1604560113</citation> | |
425 <citation type="bibtex"> | |
426 @misc{githubFlye, | |
427 author = {Kolmogorov, Mikhail}, | |
428 year = {2021}, | |
429 title = {Flye}, | |
430 publisher = {GitHub}, | |
431 journal = {GitHub repository}, | |
432 url = {https://github.com/fenderglass/Flye}} | |
433 </citation> | |
434 </citations> | |
435 </tool> |