Mercurial > repos > estrain > flye_cpu15mem59
comparison flye_cpu15mem59.xml @ 0:e1e6ef58f334 draft
planemo upload commit bdb45cf3a98e21f5002866b6789a1457f521bf5d
| author | estrain |
|---|---|
| date | Thu, 12 Mar 2026 20:06:54 +0000 |
| parents | |
| children | ab6ffc360b78 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e1e6ef58f334 |
|---|---|
| 1 <tool id="flye" name="Flye" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01"> | |
| 2 <description>de novo assembler for single molecule sequencing reads</description> | |
| 3 <macros> | |
| 4 <import>macros.xml</import> | |
| 5 </macros> | |
| 6 <expand macro="edam_ontology"/> | |
| 7 <expand macro="xrefs"/> | |
| 8 <expand macro="requirements" /> | |
| 9 <version_command>flye --version</version_command> | |
| 10 <command detect_errors="exit_code"><![CDATA[ | |
| 11 #if $input.is_of_type('fastqsanger', 'fastq'): | |
| 12 #set ext = 'fastq' | |
| 13 #elif $input.is_of_type('fastqsanger.gz', 'fastq.gz'): | |
| 14 #set ext = 'fastq.gz' | |
| 15 #elif $input.is_of_type('fasta.gz'): | |
| 16 #set ext = 'fasta.gz' | |
| 17 #elif $input.is_of_type('fasta'): | |
| 18 #set ext = 'fasta' | |
| 19 #else: | |
| 20 #set ext = 'dat' | |
| 21 #end if | |
| 22 ## Link the dataset into the job directory under a name that carries the detected extension. | |
| 23 ln -sf '$input' ./input_0.${ext} && | |
| 24 flye $mode_conditional.mode ./input_0.${ext} | |
| 25 -o out_dir | |
| 26 -t \${GALAXY_SLOTS:-4} | |
| 27 -i $iterations | |
| 28 #if $mode_conditional.mode == '--pacbio-hifi' and $mode_conditional.hifi_error: | |
| 29 --hifi-error $mode_conditional.hifi_error | |
| 30 #end if | |
| 31 #if $min_overlap: | |
| 32 -m $min_overlap | |
| 33 #end if | |
| 34 #if $asm.asm_select == 'true': | |
| 35 --asm-coverage $asm.asm_coverage | |
| 36 ## Pass -g only when a size was entered, so flye never receives an empty '' argument. | |
| 37 #if $asm.genome_size: | |
| 38 -g '$asm.genome_size' | |
| 39 #end if | |
| 40 #end if | |
| 41 ## Boolean flags render their truevalue when checked and an empty string otherwise, so they are | |
| 42 ## emitted directly; keep_haplotypes must be listed or the "Keep haplotypes" option has no effect. | |
| 43 $meta | |
| 44 $scaffold | |
| 45 $keep_haplotypes | |
| 46 $no_alt_contigs | |
| 47 ]]></command> | |
| 48 <inputs> | |
| 49 <param name="input" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" label="Read file" /> | |
| 50 <conditional name="mode_conditional"> | |
| 51 <param name="mode" type="select" label="Mode"> | |
| 52 <option value="--nano-raw">Nanopore raw (--nano-raw)</option> | |
| 53 <option value="--nano-corr">Nanopore corrected (--nano-corr)</option> | |
| 54 <option value="--nano-hq">Nanopore HQ (--nano-hq)</option> | |
| 55 <option value="--pacbio-raw">PacBio raw (--pacbio-raw)</option> | |
| 56 <option value="--pacbio-corr">PacBio corrected (--pacbio-corr)</option> | |
| 57 <option value="--pacbio-hifi">PacBio HiFi (--pacbio-hifi)</option> | |
| 58 </param> | |
| 59 <when value="--nano-raw"/> | |
| 60 <when value="--nano-corr"/> | |
| 61 <when value="--nano-hq"/> | |
| 62 <when value="--pacbio-raw"/> | |
| 63 <when value="--pacbio-corr"/> | |
| 64 <when value="--pacbio-hifi"> | |
| 65 <param argument="--hifi-error" type="float" min="0" max="1" optional="true" label="Expected HiFi reads error rate" help="Default: 0.01"/> | |
| 66 </when> | |
| 67 </conditional> | |
| 68 <param argument="--iterations" type="integer" value="1" label="Number of polishing iterations" | |
| 69 help="Polishing is performed as the final assembly stage. By default, Flye runs one polishing iteration. Additional iterations | |
| 70 might correct a small number of extra errors (due to improvements on how reads may align to the corrected assembly). If the | |
| 71 parameter is set to 0, the polishing is not performed"/> | |
| 72 <param argument="--min-overlap" type="integer" min="1000" max="10000" optional="true" label="Minimum overlap between reads" | |
| 73 help="This sets a minimum overlap length for two reads to be considered overlapping. By default it is chosen | |
| 74 automatically based on the read length distribution (reads N90) and does not require manual setting. Typical | |
| 75 value is 3k-5k (and down to 1k for datasets with shorter read length). Intuitively, we want to set this | |
| 76 parameter as high as possible, so the repeat graph is less tangled. However, higher values might lead to assembly gaps. | |
| 77 In some rare cases it makes sense to manually increase minimum overlap for assemblies of big genomes with long reads and high coverage." /> | |
| 78 <param argument="--keep-haplotypes" type="boolean" truevalue="--keep-haplotypes" falsevalue="" checked="False" label="Keep haplotypes" | |
| 79 help="By default, Flye collapses graph structures caused by alternative haplotypes (bubbles, superbubbles, roundabouts) to produce longer | |
| 80 consensus contigs. This option retains the alternative paths on the graph, producing a less contiguous, but more detailed assembly."/> | |
| 81 <param argument="--scaffold" type="boolean" truevalue="--scaffold" falsevalue="" label="Enable scaffolding using graph" | |
| 82 help="Starting from the version 2.9 Flye does not perform scaffolding by default, which guarantees that all assembled sequences do not have any gaps" /> | |
| 83 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="Perform metagenomic assembly" | |
| 84 help="It is designed for highly non-uniform coverage and is sensitive to underrepresented sequence at low coverage (as low as 2x). | |
| 85 In some examples of simple metagenomes, we observed that the normal mode assembled more contiguous bacterial | |
| 86 consensus sequence, while the metagenome mode was slightly more fragmented, but revealed strain mixtures"/> | |
| 87 <conditional name="asm"> | |
| 88 <param name="asm_select" type="select" label="Reduced contig assembly coverage" help="Typically, assemblies of large genomes at high coverage require a large amount of RAM. For high coverage assemblies, you can reduce memory usage by using only a subset of longest reads for initial contig extension stage (which is often the memory bottleneck)"> | |
| 89 <option value="true">Enable reduced coverage for initial disjointig assembly</option> | |
| 90 <option value="false" selected="true">Disable reduced coverage for initial disjointig assembly</option> | |
| 91 </param> | |
| 92 <when value="true"> | |
| 93 <param argument="--asm-coverage" type="integer" min="0" value="30" | |
| 94 label="Reduced coverage for initial disjointig assembly" | |
| 95 help="This parameter specifies the target coverage of the longest reads. For a typical assembly, 30x is enough to produce good | |
| 96 initial contigs. Regardless of this parameter, all reads will be used at the later pipeline stages."/> | |
| 97 <param argument="--genome-size" type="text" optional="true" label="Estimated genome size" | |
| 98 help="For example, 5m or 2.6g. No longer required as input. However, it must be used in conjunction with --asm-coverage option."> | |
| 99 <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit prefix (kmg)">^([0-9]*[.])?[0-9]+[kmg]?$</validator> | |
| 100 </param> | |
| 101 </when> | |
| 102 <when value="false" /> | |
| 103 </conditional> | |
| 104 <param argument="--no-alt-contigs" type="boolean" truevalue="--no-alt-contigs" falsevalue="" checked="false" label="Remove all non-primary contigs from the assembly"/> | |
| 105 <param name="generate_log" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a log file"/> | |
| 106 </inputs> | |
| 107 <outputs> <!-- Every dataset is collected from a fixed path under out_dir, the directory passed to flye with -o --> | |
| 108 <data name="consensus" format="fasta" from_work_dir="out_dir/assembly.fasta" label="${tool.name} on ${on_string}: consensus"/> | |
| 109 <data name="assembly_graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv" label="${tool.name} on ${on_string}: assembly graph"/> | |
| 110 <data name="assembly_gfa" format="gfa1" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string}: graphical fragment assembly"/> | |
| 111 <data name="assembly_info" format="tabular" from_work_dir="out_dir/assembly_info.txt" label="${tool.name} on ${on_string}: assembly info"/> | |
| 112 <data name="flye_log" format="txt" from_work_dir="out_dir/flye.log" label="${tool.name} on ${on_string}: log"> | |
| 113 <filter>generate_log</filter> <!-- the log dataset is only created when the "Generate a log file" checkbox is set --> | |
| 114 </data> | |
| 115 </outputs> | |
| 116 <tests> | |
| 117 <!--Test 01: pacbio-raw--> | |
| 118 <test expect_num_outputs="5"> | |
| 119 <param name="inputs" ftype="fastq.gz" value="ecoli_01.fastq.gz,ecoli_02.fastq.gz,ecoli_03.fastq.gz,ecoli_04.fastq.gz,ecoli_05.fastq.gz,ecoli_06.fastq.gz,ecoli_07.fastq.gz"/> | |
| 120 <conditional name="mode_conditional"> | |
| 121 <param name="mode" value="--pacbio-raw"/> | |
| 122 </conditional> | |
| 123 <param name="iterations" value="0"/> | |
| 124 <param name="generate_log" value="true"/> | |
| 125 <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/> | |
| 126 <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/> | |
| 127 <output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="gfa1" compare="diff" lines_diff="10"/> | |
| 128 <output name="consensus" ftype="fasta"> | |
| 129 <assert_contents> | |
| 130 <has_line line=">contig_1"/> | |
| 131 </assert_contents> | |
| 132 </output> | |
| 133 <output name="flye_log" file="result1.log" ftype="txt" compare="sim_size"/> | |
| 134 </test> | |
| 135 <!--Test 02: nano raw--> | |
| 136 <test expect_num_outputs="4"> | |
| 137 <param name="input" ftype="fasta.gz" value="nanopore.fasta.gz"/> <!-- name must match the tool's "input" data parameter --> | |
| 138 <conditional name="mode_conditional"> | |
| 139 <param name="mode" value="--nano-raw"/> | |
| 140 </conditional> | |
| 141 <param name="iterations" value="0"/> | |
| 142 <output name="assembly_info" ftype="tabular"> | |
| 143 <assert_contents> | |
| 144 <has_size value="95" delta="100"/> | |
| 145 </assert_contents> | |
| 146 </output> | |
| 147 <output name="assembly_graph" ftype="graph_dot"> | |
| 148 <assert_contents> | |
| 149 <has_size value="803" delta="100"/> | |
| 150 </assert_contents> | |
| 151 </output> | |
| 152 <output name="assembly_gfa" ftype="gfa1"> | |
| 153 <assert_contents> | |
| 154 <has_size value="35047" delta="100"/> | |
| 155 </assert_contents> | |
| 156 </output> | |
| 157 <output name="consensus" ftype="fasta"> | |
| 158 <assert_contents> | |
| 159 <has_size value="35573" delta="100"/> | |
| 160 </assert_contents> | |
| 161 </output> | |
| 162 </test> | |
| 163 <!--Test 03: reduce coverage--> | |
| 164 <test expect_num_outputs="4"> | |
| 165 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | |
| 166 <conditional name="mode_conditional"> | |
| 167 <param name="mode" value="--nano-raw"/> | |
| 168 </conditional> | |
| 169 <conditional name="asm"> | |
| 170 <param name="asm_select" value="true" /> | |
| 171 <param name="asm_coverage" value="30"/> | |
| 172 <param name="genome_size" value="3980000"/> | |
| 173 </conditional> | |
| 174 <output name="assembly_info" ftype="tabular"> | |
| 175 <assert_contents> | |
| 176 <has_size value="286" delta="100"/> | |
| 177 </assert_contents> | |
| 178 </output> | |
| 179 <output name="assembly_graph" ftype="graph_dot"> | |
| 180 <assert_contents> | |
| 181 <has_size value="1840" delta="100"/> | |
| 182 </assert_contents> | |
| 183 </output> | |
| 184 <output name="assembly_gfa" ftype="gfa1"> | |
| 185 <assert_contents> | |
| 186 <has_size value="420752" delta="100"/> | |
| 187 </assert_contents> | |
| 188 </output> | |
| 189 <output name="consensus" ftype="fasta"> | |
| 190 <assert_contents> | |
| 191 <has_size value="427580" delta="100"/> | |
| 192 </assert_contents> | |
| 193 </output> | |
| 194 </test> | |
| 195 <!--Test 04: metagenomic mode--> | |
| 196 <test expect_num_outputs="4"> | |
| 197 <param name="inputs" ftype="fastq.gz" value="ecoli_01.fastq.gz,ecoli_02.fastq.gz,ecoli_03.fastq.gz,ecoli_04.fastq.gz,ecoli_05.fastq.gz,ecoli_06.fastq.gz,ecoli_07.fastq.gz"/> | |
| 198 <conditional name="mode_conditional"> | |
| 199 <param name="mode" value="--pacbio-raw"/> | |
| 200 </conditional> | |
| 201 <param name="meta" value="true"/> | |
| 202 <output name="assembly_info" ftype="tabular"> | |
| 203 <assert_contents> | |
| 204 <has_size value="95" delta="100"/> | |
| 205 </assert_contents> | |
| 206 </output> | |
| 207 <output name="assembly_graph" ftype="graph_dot"> | |
| 208 <assert_contents> | |
| 209 <has_size value="367" delta="100"/> | |
| 210 </assert_contents> | |
| 211 </output> | |
| 212 <output name="assembly_gfa" ftype="gfa1"> | |
| 213 <assert_contents> | |
| 214 <has_size value="418729" delta="100"/> | |
| 215 </assert_contents> | |
| 216 </output> | |
| 217 <output name="consensus" ftype="fasta"> | |
| 218 <assert_contents> | |
| 219 <has_size value="425667" delta="100"/> | |
| 220 </assert_contents> | |
| 221 </output> | |
| 222 </test> | |
| 223 <!--Test 05: nanopore HQ mode--> | |
| 224 <test expect_num_outputs="4"> | |
| 225 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | |
| 226 <conditional name="mode_conditional"> | |
| 227 <param name="mode" value="--nano-hq"/> | |
| 228 </conditional> | |
| 229 <param name="min_overlap" value="1000"/> | |
| 230 <output name="assembly_info" ftype="tabular"> | |
| 231 <assert_contents> | |
| 232 <has_size value="286" delta="100"/> | |
| 233 </assert_contents> | |
| 234 </output> | |
| 235 <output name="assembly_graph" ftype="graph_dot"> | |
| 236 <assert_contents> | |
| 237 <has_size value="1248" delta="100"/> | |
| 238 </assert_contents> | |
| 239 </output> | |
| 240 <output name="assembly_gfa" ftype="gfa1"> | |
| 241 <assert_contents> | |
| 242 <has_size value="419414" delta="1000"/> | |
| 243 </assert_contents> | |
| 244 </output> | |
| 245 <output name="consensus" ftype="fasta"> | |
| 246 <assert_contents> | |
| 247 <has_size value="426277" delta="1000"/> | |
| 248 </assert_contents> | |
| 249 </output> | |
| 250 </test> | |
| 251 <!--Test 06: hifi error option--> | |
| 252 <test expect_num_outputs="4"> | |
| 253 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | |
| 254 <conditional name="mode_conditional"> | |
| 255 <param name="mode" value="--pacbio-hifi"/> | |
| 256 <param name="hifi_error" value="0.21"/> | |
| 257 </conditional> | |
| 258 <param name="min_overlap" value="1000"/> | |
| 259 <output name="assembly_info" ftype="tabular"> | |
| 260 <assert_contents> | |
| 261 <has_size value="286" delta="100"/> | |
| 262 </assert_contents> | |
| 263 </output> | |
| 264 <output name="assembly_graph" ftype="graph_dot"> | |
| 265 <assert_contents> | |
| 266 <has_size value="1248" delta="500"/> | |
| 267 </assert_contents> | |
| 268 </output> | |
| 269 <output name="assembly_gfa" ftype="gfa1"> | |
| 270 <assert_contents> | |
| 271 <has_size value="420254" delta="2000"/> | |
| 272 </assert_contents> | |
| 273 </output> | |
| 274 <output name="consensus" ftype="fasta"> | |
| 275 <assert_contents> | |
| 276 <has_size value="427131" delta="2000"/> | |
| 277 </assert_contents> | |
| 278 </output> | |
| 279 </test> | |
| 280 <!--Test 07: keep haplotypes--> | |
| 281 <test expect_num_outputs="4"> | |
| 282 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | |
| 283 <conditional name="mode_conditional"> | |
| 284 <param name="mode" value="--pacbio-corr"/> | |
| 285 </conditional> | |
| 286 <param name="min_overlap" value="1000"/> | |
| 287 <param name="keep_haplotypes" value="true"/> | |
| 288 <output name="assembly_info" ftype="tabular"> | |
| 289 <assert_contents> | |
| 290 <has_size value="286" delta="200"/> | |
| 291 </assert_contents> | |
| 292 </output> | |
| 293 <output name="assembly_graph" ftype="graph_dot"> | |
| 294 <assert_contents> | |
| 295 <has_size value="1273" delta="500"/> | |
| 296 </assert_contents> | |
| 297 </output> | |
| 298 <output name="assembly_gfa" ftype="gfa1"> | |
| 299 <assert_contents> | |
| 300 <has_size value="420254" delta="3000"/> | |
| 301 </assert_contents> | |
| 302 </output> | |
| 303 <output name="consensus" ftype="fasta"> | |
| 304 <assert_contents> | |
| 305 <has_size value="427131" delta="3000"/> | |
| 306 </assert_contents> | |
| 307 </output> | |
| 308 </test> | |
| 309 <!--Test 08: scaffolding mode--> | |
| 310 <test expect_num_outputs="4"> | |
| 311 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi_01.fastq.gz,ecoli_hifi_02.fastq.gz,ecoli_hifi_03.fastq.gz,ecoli_hifi_04.fastq.gz,ecoli_hifi_05.fastq.gz,ecoli_hifi_06.fastq.gz,ecoli_hifi_07.fastq.gz,ecoli_hifi_08.fastq.gz,ecoli_hifi_09.fastq.gz"/> | |
| 312 <conditional name="mode_conditional"> | |
| 313 <param name="mode" value="--nano-hq"/> | |
| 314 </conditional> | |
| 315 <param name="min_overlap" value="1000"/> | |
| 316 <param name="scaffold" value="true"/> | |
| 317 <output name="assembly_info" ftype="tabular"> | |
| 318 <assert_contents> | |
| 319 <has_size value="286" delta="100"/> | |
| 320 </assert_contents> | |
| 321 </output> | |
| 322 <output name="assembly_graph" ftype="graph_dot"> | |
| 323 <assert_contents> | |
| 324 <has_size value="1248" delta="100"/> | |
| 325 </assert_contents> | |
| 326 </output> | |
| 327 <output name="assembly_gfa" ftype="gfa1"> | |
| 328 <assert_contents> | |
| 329 <has_size value="419414" delta="2000"/> | |
| 330 </assert_contents> | |
| 331 </output> | |
| 332 <output name="consensus" ftype="fasta"> | |
| 333 <assert_contents> | |
| 334 <has_size value="426277" delta="2000"/> | |
| 335 </assert_contents> | |
| 336 </output> | |
| 337 </test> | |
| 338 <!--Test 09: no-alt-contigs parameter--> | |
| 339 <test expect_num_outputs="4"> | |
| 340 <param name="input" ftype="fasta.gz" value="nanopore.fasta.gz"/> <!-- name must match the tool's "input" data parameter --> | |
| 341 <conditional name="mode_conditional"> | |
| 342 <param name="mode" value="--nano-raw"/> | |
| 343 </conditional> | |
| 344 <param name="iterations" value="0"/> | |
| 345 <param name="no_alt_contigs" value="true"/> | |
| 346 <output name="assembly_info" ftype="tabular"> | |
| 347 <assert_contents> | |
| 348 <has_size value="151" delta="100"/> | |
| 349 </assert_contents> | |
| 350 </output> | |
| 351 <output name="assembly_graph" ftype="graph_dot"> | |
| 352 <assert_contents> | |
| 353 <has_size value="217" delta="100"/> | |
| 354 </assert_contents> | |
| 355 </output> | |
| 356 <output name="assembly_gfa" ftype="gfa1"> | |
| 357 <assert_contents> | |
| 358 <has_size value="5110" delta="100"/> | |
| 359 </assert_contents> | |
| 360 </output> | |
| 361 <output name="consensus" ftype="fasta"> | |
| 362 <assert_contents> | |
| 363 <has_size value="5123" delta="100"/> | |
| 364 </assert_contents> | |
| 365 </output> | |
| 366 </test> | |
| 367 </tests> | |
| 368 <help><![CDATA[ | |
| 369 | |
| 370 **Purpose** | |
| 371 | |
| 372 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies. | |
| 373 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents | |
| 374 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome | |
| 375 assembly. | |
| 376 | |
| 377 ---- | |
| 378 | |
| 379 **Quick usage** | |
| 380 | |
| 381 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads | |
| 382 (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily | |
| 383 developed to run on raw reads. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta* | |
| 384 option enables the mode for metagenome/uneven coverage assembly. | |
| 385 | |
| 386 Genome size estimate is no longer a required option. You need to provide an estimate if using *--asm-coverage* option. | |
| 387 | |
| 388 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by | |
| 389 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs. | |
| 390 | |
| 391 ---- | |
| 392 | |
| 393 **Outputs** | |
| 394 | |
| 395 The main output files are: | |
| 396 | |
| 397 * Final assembly: contains contigs and possibly scaffolds (see below). | |
| 398 * Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges. | |
| 399 * Extra information about contigs (such as length or coverage). | |
| 400 | |
| 401 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus, | |
| 402 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to unitig-contig relation in | |
| 403 OLC assemblers. In a rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will be also output in the assembly file. | |
| 404 | |
| 405 Sometimes it is possible to further order contigs into scaffolds based on the repeat graph structure. These ordered contigs will be output as a part of scaffold in | |
| 406 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns. | |
| 407 assembly_info.txt file (below) contains additional information about how scaffolds were formed. | |
| 408 | |
| 409 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows: | |
| 410 | |
| 411 * Contig/scaffold id | |
| 412 * Length | |
| 413 * Coverage | |
| 414 * Is circular, (Y)es or (N)o | |
| 415 * Is repetitive, (Y)es or (N)o | |
| 416 * Multiplicity (based on coverage) | |
| 417 * Alternative group | |
| 418 * Graph path (graph path corresponding to this contig/scaffold). | |
| 419 | |
| 420 Scaffold gaps are marked with `??` symbols, and `*` symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt. | |
| 421 group ID. Primary contigs are marked by `*`. | |
| 422 | |
| 423 ---- | |
| 424 | |
| 425 **Algorithm Description** | |
| 426 | |
| 427 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows: | |
| 428 | |
| 429 * K-mer counting / erroneous k-mer pre-filtering | |
| 430 * Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous) | |
| 431 * Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers). | |
| 432 | |
| 433 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft | |
| 434 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows: | |
| 435 | |
| 436 * Repeat graph is constructed from the (possibly misassembled) contigs | |
| 437 * In this graph all repeats longer than minimum overlap are collapsed | |
| 438 * The algorithm resolves repeats using the read information and graph structure | |
| 439 * The unbranching paths in the graph are output as contigs | |
| 440 | |
| 441 If enabled, after resolving bridged repeats, Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies. | |
| 442 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors: | |
| 443 | |
| 444 * Alignment of all reads to the current assembly using minimap2 | |
| 445 * Partition the alignment into mini-alignments (bubbles) | |
| 446 * Error correction of each bubble using a maximum likelihood approach | |
| 447 | |
| 448 The polishing steps could be repeated, which might slightly increase quality for some datasets. | |
| 449 | |
| 450 | |
| 451 ]]></help> | |
| 452 <expand macro="citations" /> | |
| 453 </tool> |
