kkonganti@1
|
<tool id="cfsan_flye" name="CFSAN_Flye" version="2.8.1+galaxy0">
    <!-- Galaxy tool wrapper around the flye command line assembler. -->
    <!-- NOTE(review): the requirement below pins flye 2.8.1, while the help text of the
         scaffold parameter in this file describes behaviour "starting from the version 2.9".
         Confirm that every mode and option offered by this wrapper exists in the pinned release. -->
    <description>de novo assembler for single molecule sequencing reads</description>
    <requirements>
        <requirement type="package" version="2.8.1">flye</requirement>
    </requirements>
    <!-- Command used to report the version of the underlying program. -->
    <version_command>flye --version</version_command>
|
kkonganti@0
|
<command detect_errors="exit_code"><![CDATA[
    ## Galaxy stores datasets as *.dat files. Link the input into the working
    ## directory under a name whose extension matches its Galaxy datatype, and
    ## hand that link to flye instead of the raw dataset path.
    #if $inputs.is_of_type('fastqsanger', 'fastq'):
        #set $ext = 'fastq'
    #elif $inputs.is_of_type('fastqsanger.gz', 'fastq.gz'):
        #set $ext = 'fastq.gz'
    #elif $inputs.is_of_type('fasta.gz'):
        #set $ext = 'fasta.gz'
    #elif $inputs.is_of_type('fasta'):
        #set $ext = 'fasta'
    #end if
    infile=\$(basename '$inputs' '.dat').${ext} &&
    ln -s '$inputs' "./\${infile}" &&
    flye
        $mode_conditional.mode
        "\${infile}"
        -o out_dir
        -t \${GALAXY_SLOTS:-4}
        -i $iterations
        ## --hifi-error only exists in the pacbio-hifi branch of the conditional.
        #if $mode_conditional.mode == '--pacbio-hifi' and $mode_conditional.hifi_error:
            --hifi-error $mode_conditional.hifi_error
        #end if
        ## Optional: left out entirely when the user gives no value.
        #if $min_overlap:
            -m $min_overlap
        #end if
        #if $asm.asm_select == 'true':
            --asm-coverage $asm.asm_coverage
            -g '${asm.genome_size}'
        #end if
        ## Boolean parameters expand to their flag when checked and to an empty
        ## string otherwise. keep_haplotypes was declared in the inputs but was
        ## missing here, so the checkbox had no effect.
        $meta
        $keep_haplotypes
        $scaffold
]]></command>
|
kkonganti@0
|
<inputs>
    <!-- Read set to assemble; the command section derives a file extension from its datatype. -->
    <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" label="Input reads" />
    <!-- Read technology. The selected option value is the flag placed on the flye command line. -->
    <conditional name="mode_conditional">
        <param name="mode" type="select" label="Mode">
            <option value="--nano-raw">Nanopore raw (--nano-raw)</option>
            <option value="--nano-corr">Nanopore corrected (--nano-corr)</option>
            <option value="--nano-hq">Nanopore HQ (--nano-hq)</option>
            <option value="--pacbio-raw">PacBio raw (--pacbio-raw)</option>
            <option value="--pacbio-corr">PacBio corrected (--pacbio-corr)</option>
            <option value="--pacbio-hifi">PacBio HiFi (--pacbio-hifi)</option>
        </param>
        <when value="--nano-raw"/>
        <when value="--nano-corr"/>
        <when value="--nano-hq"/>
        <when value="--pacbio-raw"/>
        <when value="--pacbio-corr"/>
        <!-- Only the HiFi mode exposes an extra, optional parameter. -->
        <when value="--pacbio-hifi">
            <param argument="--hifi-error" type="float" min="0" max="1" optional="true" label="Expected HiFi reads error rate" help="Default: 0.01"/>
        </when>
    </conditional>
    <param argument="--iterations" type="integer" value="1" label="Number of polishing iterations"
        help="Polishing is performed as the final assembly stage. By default, Flye runs one polishing iteration. Additional iterations might correct a small number of extra errors (due to improvements on how reads may align to the corrected assembly). If the parameter is set to 0, the polishing is not performed"/>
    <param argument="--min-overlap" type="integer" min="1000" max="10000" optional="true" label="Minimum overlap between reads"
        help="This sets a minimum overlap length for two reads to be considered overlapping. By default it is chosen automatically based on the read length distribution (reads N90) and does not require manual setting. Typical value is 3k-5k (and down to 1k for datasets with shorter read length). Intuitively, we want to set this parameter as high as possible, so the repeat graph is less tangled. However, higher values might lead to assembly gaps. In some rare cases it makes sense to manually increase minimum overlap for assemblies of big genomes with long reads and high coverage." />
    <!-- The three booleans below expand to their flag when checked and to an empty string otherwise. -->
    <param argument="--keep-haplotypes" type="boolean" truevalue="--keep-haplotypes" falsevalue="" checked="False" label="Keep haplotypes"
        help="By default, Flye collapses graph structures caused by alternative haplotypes (bubbles, superbubbles, roundabouts) to produce longer consensus contigs. This option retains the alternative paths on the graph, producing less contiguous, but more detailed assembly."/>
    <param argument="--scaffold" type="boolean" truevalue="--scaffold" falsevalue="" checked="false" label="Enable scaffolding using graph"
        help="Starting from the version 2.9 Flye does not perform scaffolding by default, which guarantees that all assembled sequences do not have any gaps" />
    <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="Perform metagenomic assembly"
        help="It is designed for highly non-uniform coverage and is sensitive to underrepresented sequence at low coverage (as low as 2x). In some examples of simple metagenomes, we observed that the normal mode assembled more contiguous bacterial consensus sequence, while the metagenome mode was slightly more fragmented, but revealed strain mixtures"/>
    <!-- Reduced-coverage switch; its two sub-parameters are only shown and used when it is set to "true". -->
    <conditional name="asm">
        <param name="asm_select" type="select" label="Reduced contig assembly coverage"
            help="Typically, assemblies of large genomes at high coverage require hundreds of GB of RAM. For high coverage assemblies, you can reduce memory usage by using only a subset of longest reads for initial contig extension stage (usually, the memory bottleneck)">
            <option value="true">Enable reduced coverage for initial disjointig assembly</option>
            <option value="false" selected="true">Disable reduced coverage for initial disjointig assembly</option>
        </param>
        <when value="true">
            <param argument="--asm-coverage" type="integer" min="0" value="30"
                label="Reduced coverage for initial disjointig assembly"
                help="This parameter specifies the target coverage of the longest reads. For a typical assembly, 30x is enough to produce good initial contigs. Regardless of this parameter, all reads will be used at the later pipeline stages."/>
            <param argument="--genome-size" type="text" optional="true" label="Estimated genome size"
                help="For example, 5m or 2.6g. No longer required as input. However, it must be used in conjunction with --asm-coverage option.">
                <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit prefix (kmg)">^([0-9]*[.])?[0-9]+[kmg]?$</validator>
            </param>
        </when>
        <when value="false" />
    </conditional>
    <!-- Controls whether the flye log is exposed as an additional output dataset. -->
    <param name="generate_log" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a log file"/>
</inputs>
|
kkonganti@0
|
<outputs>
    <!-- Every dataset is collected from out_dir, the directory the command passes to flye via -o. -->
    <data name="consensus" format="fasta" from_work_dir="out_dir/assembly.fasta" label="${tool.name} on ${on_string}: consensus"/>
    <data name="assembly_graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv" label="${tool.name} on ${on_string}: assembly graph"/>
    <data name="assembly_gfa" format="txt" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string}: graphical fragment assembly"/>
    <data name="assembly_info" format="tabular" from_work_dir="out_dir/assembly_info.txt" label="${tool.name} on ${on_string}: assembly info"/>
    <data name="flye_log" format="txt" from_work_dir="out_dir/flye.log" label="${tool.name} on ${on_string}: log">
        <!-- Only produced when the generate_log checkbox is ticked. -->
        <filter>generate_log</filter>
    </data>
</outputs>
|
kkonganti@0
|
106 <tests>
|
kkonganti@0
|
<!--Test 01: pacbio-raw-->
<test expect_num_outputs="5">
    <param name="inputs" ftype="fastq.gz" value="ecoli.fastq.gz"/>
    <!-- mode belongs to mode_conditional; nest it the same way the other tests do. -->
    <conditional name="mode_conditional">
        <param name="mode" value="--pacbio-raw"/>
    </conditional>
    <param name="iterations" value="0"/>
    <!-- Enabling generate_log adds the log dataset, which is why five outputs are expected. -->
    <param name="generate_log" value="true"/>
    <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/>
    <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/>
    <output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="txt" compare="sim_size"/>
    <output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/>
    <output name="flye_log" file="result1.log" ftype="txt" compare="sim_size"/>
</test>
|
kkonganti@0
|
<!--Test 02: nano raw-->
<test expect_num_outputs="4">
    <param name="inputs" ftype="fasta.gz" value="nanopore.fasta.gz"/>
    <!-- mode belongs to mode_conditional; nest it the same way the other tests do. -->
    <conditional name="mode_conditional">
        <param name="mode" value="--nano-raw"/>
    </conditional>
    <!-- Zero polishing iterations. -->
    <param name="iterations" value="0"/>
    <output name="assembly_info" ftype="tabular">
        <assert_contents>
            <has_size value="95" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_graph" ftype="graph_dot">
        <assert_contents>
            <has_size value="803" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_gfa" ftype="txt">
        <assert_contents>
            <has_size value="35047" delta="100"/>
        </assert_contents>
    </output>
    <output name="consensus" ftype="fasta">
        <assert_contents>
            <has_size value="35573" delta="100"/>
        </assert_contents>
    </output>
</test>
|
kkonganti@0
|
<!--Test 03: reduce coverage-->
<test expect_num_outputs="4">
    <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
    <conditional name="mode_conditional">
        <param name="mode" value="--nano-raw"/>
    </conditional>
    <conditional name="asm">
        <param name="asm_select" value="true" />
        <!-- The parameter is asm_coverage; the former name "asm" matched no input. -->
        <param name="asm_coverage" value="30"/>
        <param name="genome_size" value="3980000"/>
    </conditional>
    <output name="assembly_info" ftype="tabular">
        <assert_contents>
            <has_size value="286" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_graph" ftype="graph_dot">
        <assert_contents>
            <has_size value="1840" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_gfa" ftype="txt">
        <assert_contents>
            <has_size value="420752" delta="100"/>
        </assert_contents>
    </output>
    <output name="consensus" ftype="fasta">
        <assert_contents>
            <has_size value="427580" delta="100"/>
        </assert_contents>
    </output>
</test>
|
kkonganti@0
|
<!--Test 04: metagenomic mode-->
<test expect_num_outputs="4">
    <param name="inputs" ftype="fastq.gz" value="ecoli.fastq.gz"/>
    <conditional name="mode_conditional">
        <param name="mode" value="--pacbio-raw"/>
    </conditional>
    <!-- Ticks the meta checkbox, whose true value is the metagenome flag. -->
    <param name="meta" value="true"/>
    <!-- Outputs are checked by size only, each within a tolerance of 100 bytes. -->
    <output name="assembly_info" ftype="tabular">
        <assert_contents>
            <has_size value="95" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_graph" ftype="graph_dot">
        <assert_contents>
            <has_size value="367" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_gfa" ftype="txt">
        <assert_contents>
            <has_size value="418729" delta="100"/>
        </assert_contents>
    </output>
    <output name="consensus" ftype="fasta">
        <assert_contents>
            <has_size value="425667" delta="100"/>
        </assert_contents>
    </output>
</test>
|
kkonganti@0
|
<!--Test 05: nanopore HQ mode-->
<test expect_num_outputs="4">
    <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
    <conditional name="mode_conditional">
        <param name="mode" value="--nano-hq"/>
    </conditional>
    <!-- 1000 is the lowest value the min_overlap input accepts. -->
    <param name="min_overlap" value="1000"/>
    <!-- Outputs are checked by size only, each within a tolerance of 100 bytes. -->
    <output name="assembly_info" ftype="tabular">
        <assert_contents>
            <has_size value="286" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_graph" ftype="graph_dot">
        <assert_contents>
            <has_size value="1248" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_gfa" ftype="txt">
        <assert_contents>
            <has_size value="420252" delta="100"/>
        </assert_contents>
    </output>
    <output name="consensus" ftype="fasta">
        <assert_contents>
            <has_size value="427129" delta="100"/>
        </assert_contents>
    </output>
</test>
|
kkonganti@0
|
<!--Test 06: hifi error option-->
<test expect_num_outputs="4">
    <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
    <conditional name="mode_conditional">
        <param name="mode" value="--pacbio-hifi"/>
        <!-- hifi_error is only defined in the pacbio-hifi branch of this conditional. -->
        <param name="hifi_error" value="0.21"/>
    </conditional>
    <param name="min_overlap" value="1000"/>
    <!-- Outputs are checked by size only, each within a tolerance of 100 bytes. -->
    <output name="assembly_info" ftype="tabular">
        <assert_contents>
            <has_size value="286" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_graph" ftype="graph_dot">
        <assert_contents>
            <has_size value="1273" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_gfa" ftype="txt">
        <assert_contents>
            <has_size value="420252" delta="100"/>
        </assert_contents>
    </output>
    <output name="consensus" ftype="fasta">
        <assert_contents>
            <has_size value="427129" delta="100"/>
        </assert_contents>
    </output>
</test>
|
kkonganti@0
|
<!--Test 07: keep haplotypes-->
<test expect_num_outputs="4">
    <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
    <conditional name="mode_conditional">
        <!-- The pacbio-corr branch defines no extra parameters, so the stray hifi_error entry was dropped. -->
        <param name="mode" value="--pacbio-corr"/>
    </conditional>
    <param name="min_overlap" value="1000"/>
    <!-- The input is named keep_haplotypes (derived from its argument); the hyphenated
         spelling used before matched no parameter, so the option was never switched on.
         NOTE(review): the expected sizes below were recorded while the option was not
         being set; re-run the test and refresh them if they drift. -->
    <param name="keep_haplotypes" value="true"/>
    <output name="assembly_info" ftype="tabular">
        <assert_contents>
            <has_size value="286" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_graph" ftype="graph_dot">
        <assert_contents>
            <has_size value="1273" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_gfa" ftype="txt">
        <assert_contents>
            <has_size value="420252" delta="100"/>
        </assert_contents>
    </output>
    <output name="consensus" ftype="fasta">
        <assert_contents>
            <has_size value="427129" delta="100"/>
        </assert_contents>
    </output>
</test>
|
kkonganti@0
|
<!--Test 08: scaffolding mode-->
<test expect_num_outputs="4">
    <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
    <!-- mode belongs to mode_conditional; nest it the same way the other tests do. -->
    <conditional name="mode_conditional">
        <param name="mode" value="--nano-hq"/>
    </conditional>
    <param name="min_overlap" value="1000"/>
    <!-- The input is named scaffold; "scaffolding" matched no parameter, so the option
         was never switched on.
         NOTE(review): the expected sizes below were recorded while the option was not
         being set; re-run the test and refresh them if they drift. -->
    <param name="scaffold" value="true"/>
    <output name="assembly_info" ftype="tabular">
        <assert_contents>
            <has_size value="286" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_graph" ftype="graph_dot">
        <assert_contents>
            <has_size value="1248" delta="100"/>
        </assert_contents>
    </output>
    <output name="assembly_gfa" ftype="txt">
        <assert_contents>
            <has_size value="420252" delta="100"/>
        </assert_contents>
    </output>
    <output name="consensus" ftype="fasta">
        <assert_contents>
            <has_size value="427129" delta="100"/>
        </assert_contents>
    </output>
</test>
|
kkonganti@0
|
319 </tests>
|
kkonganti@0
|
320 <help><![CDATA[
|
kkonganti@0
|
321
|
kkonganti@0
|
322 .. class:: infomark
|
kkonganti@0
|
323
|
kkonganti@0
|
324 **Purpose**
|
kkonganti@0
|
325
|
kkonganti@0
|
326 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies.
|
kkonganti@0
|
327 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents
|
kkonganti@0
|
328 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome
|
kkonganti@0
|
329 assembly.
|
kkonganti@0
|
330
|
kkonganti@0
|
331 ----
|
kkonganti@0
|
332
|
kkonganti@0
|
333 .. class:: infomark
|
kkonganti@0
|
334
|
kkonganti@0
|
335 **Quick usage**
|
kkonganti@0
|
336
|
kkonganti@0
|
337 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads
|
kkonganti@0
|
338 (raw, corrected, HQ) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily
|
kkonganti@0
|
339 developed to run on raw reads. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta*
|
kkonganti@0
|
340 option enables the mode for metagenome/uneven coverage assembly.
|
kkonganti@0
|
341
|
kkonganti@0
|
342 Genome size estimate is no longer a required option. You need to provide an estimate if using *--asm-coverage* option.
|
kkonganti@0
|
343
|
kkonganti@0
|
344 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by
|
kkonganti@0
|
345 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs.
|
kkonganti@0
|
346
|
kkonganti@0
|
347 ----
|
kkonganti@0
|
348
|
kkonganti@0
|
349 .. class:: infomark
|
kkonganti@0
|
350
|
kkonganti@0
|
351 **Outputs**
|
kkonganti@0
|
352
|
kkonganti@0
|
353 The main output files are:
|
kkonganti@0
|
354
|
kkonganti@0
|
355 ::
|
kkonganti@0
|
356
|
kkonganti@0
|
357 - Final assembly: contains contigs and possibly scaffolds (see below).
|
kkonganti@0
|
358 - Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges.
|
kkonganti@0
|
359 - Extra information about contigs (such as length or coverage).
|
kkonganti@0
|
360
|
kkonganti@0
|
361 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus,
|
kkonganti@0
|
362 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to the unitig-contig relation in
|
kkonganti@0
|
363 OLC assemblers. In a rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will be also output in the assembly file.
|
kkonganti@0
|
364
|
kkonganti@0
|
365 Sometimes it is possible to further order contigs into scaffolds based on the repeat graph structure. These ordered contigs will be output as a part of scaffold in
|
kkonganti@0
|
366 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns.
|
kkonganti@0
|
367 assembly_info.txt file (below) contains additional information about how scaffolds were formed.
|
kkonganti@0
|
368
|
kkonganti@0
|
369 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows:
|
kkonganti@0
|
370
|
kkonganti@0
|
371 ::
|
kkonganti@0
|
372
|
kkonganti@0
|
373 - Contig/scaffold id
|
kkonganti@0
|
374 - Length
|
kkonganti@0
|
375 - Coverage
|
kkonganti@0
|
376 - Is circular, (Y)es or (N)o
|
kkonganti@0
|
377 - Is repetitive, (Y)es or (N)o
|
kkonganti@0
|
378 - Multiplicity (based on coverage)
|
kkonganti@0
|
379 - Alternative group
|
kkonganti@0
|
380 - Graph path (graph path corresponding to this contig/scaffold).
|
kkonganti@0
|
381
|
kkonganti@0
|
382 Scaffold gaps are marked with ?? symbols, and * symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt.
|
kkonganti@0
|
383 group ID. Primary contigs are marked by *.
|
kkonganti@0
|
384
|
kkonganti@0
|
385 ----
|
kkonganti@0
|
386
|
kkonganti@0
|
387 .. class:: infomark
|
kkonganti@0
|
388
|
kkonganti@0
|
389 **Algorithm Description**
|
kkonganti@0
|
390
|
kkonganti@0
|
391 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows:
|
kkonganti@0
|
392
|
kkonganti@0
|
393 ::
|
kkonganti@0
|
394
|
kkonganti@0
|
395 - K-mer counting / erroneous k-mer pre-filtering
|
kkonganti@0
|
396 - Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous)
|
kkonganti@0
|
397 - Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers).
|
kkonganti@0
|
398
|
kkonganti@0
|
399 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft
|
kkonganti@0
|
400 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows:
|
kkonganti@0
|
401
|
kkonganti@0
|
402 ::
|
kkonganti@0
|
403
|
kkonganti@0
|
404 - Repeat graph is constructed from the (possibly misassembled) contigs
|
kkonganti@0
|
405 - In this graph all repeats longer than minimum overlap are collapsed
|
kkonganti@0
|
406 - The algorithm resolves repeats using the read information and graph structure
|
kkonganti@0
|
407 - The unbranching paths in the graph are output as contigs
|
kkonganti@0
|
408
|
kkonganti@0
|
409 If enabled, after resolving bridged repeats, Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies.
|
kkonganti@0
|
410 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors:
|
kkonganti@0
|
411
|
kkonganti@0
|
412 ::
|
kkonganti@0
|
413
|
kkonganti@0
|
414 - Alignment of all reads to the current assembly using minimap2
|
kkonganti@0
|
415 - Partition the alignment into mini-alignments (bubbles)
|
kkonganti@0
|
416 - Error correction of each bubble using a maximum likelihood approach
|
kkonganti@0
|
417
|
kkonganti@0
|
418
|
kkonganti@0
|
419 The polishing steps could be repeated, which might slightly increase quality for some datasets.
|
kkonganti@0
|
420
|
kkonganti@0
|
421
|
kkonganti@0
|
422 ]]></help>
|
kkonganti@0
|
<citations>
    <!-- One DOI reference plus a BibTeX entry pointing at the source repository. -->
    <citation type="doi">10.1073/pnas.1604560113</citation>
    <citation type="bibtex">
        @misc{githubFlye,
        author = {Kolmogorov, Mikhail},
        year = {2021},
        title = {Flye},
        publisher = {GitHub},
        journal = {GitHub repository},
        url = {https://github.com/fenderglass/Flye}}
    </citation>
</citations>
|
kkonganti@0
|
435 </tool>
|