comparison cfsan_flye.xml @ 0:96bb0635f0a0

"planemo upload"
author kkonganti
date Fri, 24 Jun 2022 14:18:37 -0400
parents
children 5c18b16d6ac1
comparison
equal deleted inserted replaced
-1:000000000000 0:96bb0635f0a0
1 <tool id="cfsan_flye" name="CFSAN_Flye" version="2.8.1+galaxy0" profile="20.01">
2 <description>de novo assembler for single molecule sequencing reads</description>
3 <requirements> <!-- Package dependencies that Galaxy resolves before running the command below -->
4 <requirement type="package" version="2.8.1">flye</requirement> <!-- NOTE(review): the help text of the scaffold parameter describes Flye 2.9 behaviour while 2.8.1 is pinned here; confirm that every exposed option (e.g. nano-hq mode, scaffold) is accepted by 2.8.1 -->
5 </requirements>
6 <version_command>flye --version</version_command>
7 <command detect_errors="exit_code"><![CDATA[
8 #if $inputs.is_of_type('fastqsanger', 'fastq'):
9 #set $ext = 'fastq'
10 #elif $inputs.is_of_type('fastqsanger.gz', 'fastq.gz'):
11 #set $ext = 'fastq.gz'
12 #elif $inputs.is_of_type('fasta.gz'):
13 #set $ext = 'fasta.gz'
14 #elif $inputs.is_of_type('fasta'):
15 #set $ext = 'fasta'
16 #end if
17 infile=\$(basename '$inputs' '.dat').${ext} &&
18 ln -s '$inputs' ./\${infile} &&
19 flye
20 $mode_conditional.mode
21 \${infile}
22 -o out_dir
23 -t \${GALAXY_SLOTS:-4}
24 -i $iterations
25 #if $mode_conditional.mode == '--pacbio-hifi' and $mode_conditional.hifi_error:
26 --hifi-error $mode_conditional.hifi_error
27 #end if
28 #if $min_overlap:
29 -m $min_overlap
30 #end if
31 #if $asm.asm_select == 'true':
32 --asm-coverage $asm.asm_coverage
33 -g '${asm.genome_size}'
34 #end if
35 $meta $keep_haplotypes
36 $scaffold
37 ]]></command> <!-- Links the input under a name carrying the real extension, then runs flye into out_dir. keep_haplotypes is emitted next to meta so the "Keep haplotypes" checkbox actually reaches flye; dataset paths are single-quoted for shell safety. -->
38 <inputs>
39 <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" label="Input reads" />
40 <conditional name="mode_conditional"> <!-- Read-type selector; the selected option value is inserted as-is into the flye command line by the command template -->
41 <param name="mode" type="select" label="Mode">
42 <option value="--nano-raw">Nanopore raw (--nano-raw)</option>
43 <option value="--nano-corr">Nanopore corrected (--nano-corr)</option>
44 <option value="--nano-hq">Nanopore HQ (--nano-hq)</option>
45 <option value="--pacbio-raw">PacBio raw (--pacbio-raw)</option>
46 <option value="--pacbio-corr">PacBio corrected (--pacbio-corr)</option>
47 <option value="--pacbio-hifi">PacBio HiFi (--pacbio-hifi)</option>
48 </param>
49 <when value="--nano-raw"/>
50 <when value="--nano-corr"/>
51 <when value="--nano-hq"/>
52 <when value="--pacbio-raw"/>
53 <when value="--pacbio-corr"/>
54 <when value="--pacbio-hifi"> <!-- Only the HiFi branch carries an extra parameter (hifi_error) -->
55 <param argument="--hifi-error" type="float" min="0" max="1" optional="true" label="Expected HiFi reads error rate" help="Default: 0.01"/>
56 </when>
57 </conditional>
58 <param argument="--iterations" type="integer" min="0" value="1" label="Number of polishing iterations"
59 help="Polishing is performed as the final assembly stage. By default, Flye runs one polishing iteration. Additional iterations
60 might correct a small number of extra errors (due to improvements on how reads may align to the corrected assembly). If the
61 parameter is set to 0, the polishing is not performed"/> <!-- min="0" rejects negative iteration counts in the form; 0 disables polishing -->
62 <param argument="--min-overlap" type="integer" min="1000" max="10000" optional="true" label="Minimum overlap between reads"
63 help="This sets a minimum overlap length for two reads to be considered overlapping. By default it is chosen
64 automatically based on the read length distribution (reads N90) and does not require manual setting. Typical
65 value is 3k-5k (and down to 1k for datasets with shorter read length). Intuitively, we want to set this
66 parameter as high as possible, so the repeat graph is less tangled. However, higher values might lead to assembly gaps.
67 In some rare cases it makes sense to manually increase minimum overlap for assemblies of big genomes with long reads and high coverage." />
68 <param argument="--keep-haplotypes" type="boolean" truevalue="--keep-haplotypes" falsevalue="" checked="False" label="Keep haplotypes"
69 help="By default, Flye collapses graph structures caused by alternative haplotypes (bubbles, superbubbles, roundabouts) to produce longer
70 consensus contigs. This option retains the alternative paths on the graph, producing less contiguous, but more detailed assembly."/> <!-- Boolean flag: expands to the flye option when checked and to an empty string otherwise -->
71 <param argument="--scaffold" type="boolean" truevalue="--scaffold" falsevalue="" label="Enable scaffolding using graph"
72 help="Starting from the version 2.9 Flye does not perform scaffolding by default, which guarantees that all assembled sequences do not have any gaps" />
73 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="Perform metagenomic assembly"
74 help="It is designed for highly non-uniform coverage and is sensitive to underrepresented sequence at low coverage (as low as 2x).
75 In some examples of simple metagenomes, we observed that the normal mode assembled more contiguous bacterial
76 consensus sequence, while the metagenome mode was slightly more fragmented, but revealed strain mixtures"/> <!-- Boolean flag: expands to the flye option when checked and to an empty string otherwise -->
77 <conditional name="asm"> <!-- Optional memory-saving mode: when enabled, the command template passes asm_coverage and genome_size to flye -->
78 <param name="asm_select" type="select" label="Reduced contig assembly coverage" help="Typically, assemblies of large genomes at high coverage require hundreds of GB of RAM. For high coverage assemblies,
79 you can reduce memory usage by using only a subset of longest reads for initial contig extension stage (usually, the memory bottleneck)">
80 <option value="true">Enable reduced coverage for initial disjointig assembly</option>
81 <option value="false" selected="true">Disable reduced coverage for initial disjointig assembly</option>
82 </param>
83 <when value="true">
84 <param argument="--asm-coverage" type="integer" min="0" value="30"
85 label="Reduced coverage for initial disjointig assembly"
86 help="This parameter specifies the target coverage of the longest reads. For a typical assembly, 30x is enough to produce good
87 initial contigs. Regardless of this parameter, all reads will be used at the later pipeline stages."/>
88 <param argument="--genome-size" type="text" optional="true" label="Estimated genome size"
89 help="For example, 5m or 2.6g. No longer required as input. However, it must be used in conjunction with --asm-coverage option.">
90 <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit prefix (k, m or g)">^([0-9]*[.])?[0-9]+[kmg]?$</validator>
91 </param>
92 </when>
93 <when value="false" />
94 </conditional>
95 <param name="generate_log" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a log file"/>
96 </inputs>
97 <outputs> <!-- Every dataset is collected from the out_dir directory that the command passes to flye via -o -->
98 <data name="consensus" format="fasta" from_work_dir="out_dir/assembly.fasta" label="${tool.name} on ${on_string}: consensus"/>
99 <data name="assembly_graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv" label="${tool.name} on ${on_string}: assembly graph"/>
100 <data name="assembly_gfa" format="txt" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string}: graphical fragment assembly"/>
101 <data name="assembly_info" format="tabular" from_work_dir="out_dir/assembly_info.txt" label="${tool.name} on ${on_string}: assembly info"/>
102 <data name="flye_log" format="txt" from_work_dir="out_dir/flye.log" label="${tool.name} on ${on_string}: log"> <!-- Only created when the generate_log checkbox is ticked (see filter) -->
103 <filter>generate_log</filter>
104 </data>
105 </outputs>
106 <tests>
107 <!--Test 01: pacbio-raw mode on gzipped FASTQ with polishing disabled (iterations=0) and the optional log output enabled, hence 5 outputs; every output is compared against a reference file by similar size-->
108 <test expect_num_outputs="5">
109 <param name="inputs" ftype="fastq.gz" value="ecoli.fastq.gz"/>
110 <param name="mode" value="--pacbio-raw"/>
111 <param name="iterations" value="0"/>
112 <param name="generate_log" value="true"/>
113 <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/>
114 <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/>
115 <output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="txt" compare="sim_size"/>
116 <output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/>
117 <output name="flye_log" file="result1.log" ftype="txt" compare="sim_size"/>
118 </test>
119 <!--Test 02: nano-raw mode on gzipped FASTA with polishing disabled (iterations=0); outputs are checked by size only (delta 100 bytes)-->
120 <test expect_num_outputs="4">
121 <param name="inputs" ftype="fasta.gz" value="nanopore.fasta.gz"/>
122 <param name="mode" value="--nano-raw"/>
123 <param name="iterations" value="0"/>
124 <output name="assembly_info" ftype="tabular">
125 <assert_contents>
126 <has_size value="95" delta="100"/>
127 </assert_contents>
128 </output>
129 <output name="assembly_graph" ftype="graph_dot">
130 <assert_contents>
131 <has_size value="803" delta="100"/>
132 </assert_contents>
133 </output>
134 <output name="assembly_gfa" ftype="txt">
135 <assert_contents>
136 <has_size value="35047" delta="100"/>
137 </assert_contents>
138 </output>
139 <output name="consensus" ftype="fasta">
140 <assert_contents>
141 <has_size value="35573" delta="100"/>
142 </assert_contents>
143 </output>
144 </test>
145 <!--Test 03: reduced coverage (asm_select=true with asm_coverage and genome_size set); outputs are checked by size only-->
146 <test expect_num_outputs="4">
147 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
148 <conditional name="mode_conditional">
149 <param name="mode" value="--nano-raw"/>
150 </conditional>
151 <conditional name="asm">
152 <param name="asm_select" value="true" />
153 <param name="asm_coverage" value="30"/>
154 <param name="genome_size" value="3980000"/>
155 </conditional>
156 <output name="assembly_info" ftype="tabular">
157 <assert_contents>
158 <has_size value="286" delta="100"/>
159 </assert_contents>
160 </output>
161 <output name="assembly_graph" ftype="graph_dot">
162 <assert_contents>
163 <has_size value="1840" delta="100"/>
164 </assert_contents>
165 </output>
166 <output name="assembly_gfa" ftype="txt">
167 <assert_contents>
168 <has_size value="420752" delta="100"/>
169 </assert_contents>
170 </output>
171 <output name="consensus" ftype="fasta">
172 <assert_contents>
173 <has_size value="427580" delta="100"/>
174 </assert_contents>
175 </output>
176 </test>
177 <!--Test 04: metagenomic mode (meta flag enabled) with pacbio-raw reads; outputs are checked by size only (delta 100 bytes)-->
178 <test expect_num_outputs="4">
179 <param name="inputs" ftype="fastq.gz" value="ecoli.fastq.gz"/>
180 <conditional name="mode_conditional">
181 <param name="mode" value="--pacbio-raw"/>
182 </conditional>
183 <param name="meta" value="true"/>
184 <output name="assembly_info" ftype="tabular">
185 <assert_contents>
186 <has_size value="95" delta="100"/>
187 </assert_contents>
188 </output>
189 <output name="assembly_graph" ftype="graph_dot">
190 <assert_contents>
191 <has_size value="367" delta="100"/>
192 </assert_contents>
193 </output>
194 <output name="assembly_gfa" ftype="txt">
195 <assert_contents>
196 <has_size value="418729" delta="100"/>
197 </assert_contents>
198 </output>
199 <output name="consensus" ftype="fasta">
200 <assert_contents>
201 <has_size value="425667" delta="100"/>
202 </assert_contents>
203 </output>
204 </test>
205 <!--Test 05: nanopore HQ mode with min_overlap set to 1000; outputs are checked by size only (delta 100 bytes)-->
206 <test expect_num_outputs="4">
207 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
208 <conditional name="mode_conditional">
209 <param name="mode" value="--nano-hq"/>
210 </conditional>
211 <param name="min_overlap" value="1000"/>
212 <output name="assembly_info" ftype="tabular">
213 <assert_contents>
214 <has_size value="286" delta="100"/>
215 </assert_contents>
216 </output>
217 <output name="assembly_graph" ftype="graph_dot">
218 <assert_contents>
219 <has_size value="1248" delta="100"/>
220 </assert_contents>
221 </output>
222 <output name="assembly_gfa" ftype="txt">
223 <assert_contents>
224 <has_size value="420252" delta="100"/>
225 </assert_contents>
226 </output>
227 <output name="consensus" ftype="fasta">
228 <assert_contents>
229 <has_size value="427129" delta="100"/>
230 </assert_contents>
231 </output>
232 </test>
233 <!--Test 06: PacBio HiFi mode with the hifi_error option set to 0.21 and min_overlap set to 1000; outputs are checked by size only (delta 100 bytes)-->
234 <test expect_num_outputs="4">
235 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
236 <conditional name="mode_conditional">
237 <param name="mode" value="--pacbio-hifi"/>
238 <param name="hifi_error" value="0.21"/>
239 </conditional>
240 <param name="min_overlap" value="1000"/>
241 <output name="assembly_info" ftype="tabular">
242 <assert_contents>
243 <has_size value="286" delta="100"/>
244 </assert_contents>
245 </output>
246 <output name="assembly_graph" ftype="graph_dot">
247 <assert_contents>
248 <has_size value="1273" delta="100"/>
249 </assert_contents>
250 </output>
251 <output name="assembly_gfa" ftype="txt">
252 <assert_contents>
253 <has_size value="420252" delta="100"/>
254 </assert_contents>
255 </output>
256 <output name="consensus" ftype="fasta">
257 <assert_contents>
258 <has_size value="427129" delta="100"/>
259 </assert_contents>
260 </output>
261 </test>
262 <!--Test 07: keep haplotypes in pacbio-corr mode; the parameter is addressed by its real name keep_haplotypes. NOTE(review): hifi_error is only defined in the pacbio-hifi branch, so it appears to have no effect here; the expected sizes may need regenerating once the option is actually applied-->
263 <test expect_num_outputs="4">
264 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
265 <conditional name="mode_conditional">
266 <param name="mode" value="--pacbio-corr"/>
267 <param name="hifi_error" value="0.21"/>
268 </conditional>
269 <param name="min_overlap" value="1000"/>
270 <param name="keep_haplotypes" value="true"/>
271 <output name="assembly_info" ftype="tabular">
272 <assert_contents>
273 <has_size value="286" delta="100"/>
274 </assert_contents>
275 </output>
276 <output name="assembly_graph" ftype="graph_dot">
277 <assert_contents>
278 <has_size value="1273" delta="100"/>
279 </assert_contents>
280 </output>
281 <output name="assembly_gfa" ftype="txt">
282 <assert_contents>
283 <has_size value="420252" delta="100"/>
284 </assert_contents>
285 </output>
286 <output name="consensus" ftype="fasta">
287 <assert_contents>
288 <has_size value="427129" delta="100"/>
289 </assert_contents>
290 </output>
291 </test>
292 <!--Test 08: scaffolding mode in nanopore HQ mode; the parameter is addressed by its real name scaffold. NOTE(review): confirm that the pinned flye version accepts the scaffold option and regenerate the expected sizes if they change-->
293 <test expect_num_outputs="4">
294 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
295 <param name="mode" value="--nano-hq"/>
296 <param name="min_overlap" value="1000"/>
297 <param name="scaffold" value="true"/>
298 <output name="assembly_info" ftype="tabular">
299 <assert_contents>
300 <has_size value="286" delta="100"/>
301 </assert_contents>
302 </output>
303 <output name="assembly_graph" ftype="graph_dot">
304 <assert_contents>
305 <has_size value="1248" delta="100"/>
306 </assert_contents>
307 </output>
308 <output name="assembly_gfa" ftype="txt">
309 <assert_contents>
310 <has_size value="420252" delta="100"/>
311 </assert_contents>
312 </output>
313 <output name="consensus" ftype="fasta">
314 <assert_contents>
315 <has_size value="427129" delta="100"/>
316 </assert_contents>
317 </output>
318 </test>
319 </tests>
320 <help><![CDATA[
321
322 .. class:: infomark
323
324 **Purpose**
325
326 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies.
327 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents
328 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome
329 assembly.
330
331 ----
332
333 .. class:: infomark
334
335 **Quick usage**
336
337 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads
338 (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily
339 developed to run on raw reads. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta*
340 option enables the mode for metagenome/uneven coverage assembly.
341
342 Genome size estimate is no longer a required option. You need to provide an estimate if using *--asm-coverage* option.
343
344 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by
345 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs.
346
347 ----
348
349 .. class:: infomark
350
351 **Outputs**
352
353 The main output files are:
354
355 ::
356
357 - Final assembly: contains contigs and possibly scaffolds (see below).
358 - Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges.
359 - Extra information about contigs (such as length or coverage).
360
361 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus,
362 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to unitig-contig relation in
363 OLC assemblers. In a rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will be also output in the assembly file.
364
365 Sometimes it is possible to further order contigs into scaffolds based on the repeat graph structure. These ordered contigs will be output as a part of scaffold in
366 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns.
367 assembly_info.txt file (below) contains additional information about how scaffolds were formed.
368
369 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows:
370
371 ::
372
373 - Contig/scaffold id
374 - Length
375 - Coverage
376 - Is circular, (Y)es or (N)o
377 - Is repetitive, (Y)es or (N)o
378 - Multiplicity (based on coverage)
379 - Alternative group
380 - Graph path (graph path corresponding to this contig/scaffold).
381
382 Scaffold gaps are marked with ?? symbols, and * symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt.
383 group ID. Primary contigs are marked by *.
384
385 ----
386
387 .. class:: infomark
388
389 **Algorithm Description**
390
391 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows:
392
393 ::
394
395 - K-mer counting / erroneous k-mer pre-filtering
396 - Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous)
397 - Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers).
398
399 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft
400 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows:
401
402 ::
403
404 - Repeat graph is constructed from the (possibly misassembled) contigs
405 - In this graph all repeats longer than minimum overlap are collapsed
406 - The algorithm resolves repeats using the read information and graph structure
407 - The unbranching paths in the graph are output as contigs
408
409 If enabled, after resolving bridged repeats, Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies.
410 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors:
411
412 ::
413
414 - Alignment of all reads to the current assembly using minimap2
415 - Partition the alignment into mini-alignments (bubbles)
416 - Error correction of each bubble using a maximum likelihood approach
417
418
419 The polishing steps could be repeated, which might slightly increase quality for some datasets.
420
421
422 ]]></help>
423 <citations> <!-- One DOI citation plus a BibTeX entry for the Flye GitHub repository -->
424 <citation type="doi">10.1073/pnas.1604560113</citation>
425 <citation type="bibtex">
426 @misc{githubFlye,
427 author = {Kolmogorov, Mikhail},
428 year = {2021},
429 title = {Flye},
430 publisher = {GitHub},
431 journal = {GitHub repository},
432 url = {https://github.com/fenderglass/Flye}}
433 </citation>
434 </citations>
435 </tool>