annotate cfsan_flye.xml @ 1:5c18b16d6ac1 tip

"planemo upload"
author kkonganti
date Fri, 24 Jun 2022 14:19:52 -0400
parents 96bb0635f0a0
children
rev   line source
kkonganti@1 1 <tool id="cfsan_flye" name="CFSAN_Flye" version="2.8.1+galaxy0">
kkonganti@0 2 <description>de novo assembler for single molecule sequencing reads</description>
kkonganti@0 3 <requirements>
kkonganti@0 4 <requirement type="package" version="2.8.1">flye</requirement> <!-- NOTE(review): this wrapper offers a nano-hq mode and a scaffold switch, and the scaffold help text describes behaviour "starting from the version 2.9"; confirm both options exist in the pinned 2.8.1 package. -->
kkonganti@0 5 </requirements>
kkonganti@0 6 <version_command>flye --version</version_command>
kkonganti@0 7 <!-- Symlinks the input dataset under a name whose extension reflects its datatype (presumably so flye can recognise the format from the file name), then runs flye with the chosen read mode, threads from GALAXY_SLOTS (default 4), the polishing iteration count and the optional switches. The input path is single-quoted for the shell, an explicitly set hifi_error of 0 is still forwarded, and the keep_haplotypes checkbox value is passed through. --> <command detect_errors="exit_code"><![CDATA[
kkonganti@0 8 #if $inputs.is_of_type('fastqsanger', 'fastq'):
kkonganti@0 9 #set $ext = 'fastq'
kkonganti@0 10 #elif $inputs.is_of_type('fastqsanger.gz', 'fastq.gz'):
kkonganti@0 11 #set $ext = 'fastq.gz'
kkonganti@0 12 #elif $inputs.is_of_type('fasta.gz'):
kkonganti@0 13 #set $ext = 'fasta.gz'
kkonganti@0 14 #elif $inputs.is_of_type('fasta'):
kkonganti@0 15 #set $ext = 'fasta'
kkonganti@0 16 #end if
kkonganti@0 17 infile=\$(basename '$inputs' '.dat').${ext} &&
kkonganti@0 18 ln -s '$inputs' ./\${infile} &&
kkonganti@0 19 flye
kkonganti@0 20 $mode_conditional.mode
kkonganti@0 21 \${infile}
kkonganti@0 22 -o out_dir
kkonganti@0 23 -t \${GALAXY_SLOTS:-4}
kkonganti@0 24 -i $iterations
kkonganti@0 25 #if $mode_conditional.mode == '--pacbio-hifi' and str($mode_conditional.hifi_error) != '':
kkonganti@0 26 --hifi-error $mode_conditional.hifi_error
kkonganti@0 27 #end if
kkonganti@0 28 #if $min_overlap:
kkonganti@0 29 -m $min_overlap
kkonganti@0 30 #end if
kkonganti@0 31 #if $asm.asm_select == 'true':
kkonganti@0 32 --asm-coverage $asm.asm_coverage
kkonganti@0 33 -g '${asm.genome_size}'
kkonganti@0 34 #end if
kkonganti@0 35 $meta
kkonganti@0 36 $scaffold $keep_haplotypes
kkonganti@0 37 ]]></command>
kkonganti@0 38 <inputs> <!-- User-facing parameters. Params declared with an argument attribute are referenced in the command template by that argument with hyphens turned into underscores (for example min_overlap, asm_coverage). Genome size is mandatory inside the reduced-coverage branch because the command always emits the g option there. -->
kkonganti@0 39 <param name="inputs" type="data" format="fasta,fasta.gz,fastq,fastq.gz,fastqsanger.gz,fastqsanger" label="Input reads" />
kkonganti@0 40 <conditional name="mode_conditional">
kkonganti@0 41 <param name="mode" type="select" label="Mode">
kkonganti@0 42 <option value="--nano-raw">Nanopore raw (--nano-raw)</option>
kkonganti@0 43 <option value="--nano-corr">Nanopore corrected (--nano-corr)</option>
kkonganti@0 44 <option value="--nano-hq">Nanopore HQ (--nano-hq)</option>
kkonganti@0 45 <option value="--pacbio-raw">PacBio raw (--pacbio-raw)</option>
kkonganti@0 46 <option value="--pacbio-corr">PacBio corrected (--pacbio-corr)</option>
kkonganti@0 47 <option value="--pacbio-hifi">PacBio HiFi (--pacbio-hifi)</option>
kkonganti@0 48 </param>
kkonganti@0 49 <when value="--nano-raw"/>
kkonganti@0 50 <when value="--nano-corr"/>
kkonganti@0 51 <when value="--nano-hq"/>
kkonganti@0 52 <when value="--pacbio-raw"/>
kkonganti@0 53 <when value="--pacbio-corr"/>
kkonganti@0 54 <when value="--pacbio-hifi">
kkonganti@0 55 <param argument="--hifi-error" type="float" min="0" max="1" optional="true" label="Expected HiFi reads error rate" help="Default: 0.01"/>
kkonganti@0 56 </when>
kkonganti@0 57 </conditional>
kkonganti@0 58 <param argument="--iterations" type="integer" value="1" label="Number of polishing iterations"
kkonganti@0 59 help="Polishing is performed as the final assembly stage. By default, Flye runs one polishing iteration. Additional iterations
kkonganti@0 60 might correct a small number of extra errors (due to improvements on how reads may align to the corrected assembly). If the
kkonganti@0 61 parameter is set to 0, the polishing is not performed"/>
kkonganti@0 62 <param argument="--min-overlap" type="integer" min="1000" max="10000" optional="true" label="Minimum overlap between reads"
kkonganti@0 63 help="This sets a minimum overlap length for two reads to be considered overlapping. By default it is chosen
kkonganti@0 64 automatically based on the read length distribution (reads N90) and does not require manual setting. Typical
kkonganti@0 65 value is 3k-5k (and down to 1k for datasets with shorter read length). Intuitively, we want to set this
kkonganti@0 66 parameter as high as possible, so the repeat graph is less tangled. However, higher values might lead to assembly gaps.
kkonganti@0 67 In some rare cases it makes sense to manually increase minimum overlap for assemblies of big genomes with long reads and high coverage." />
kkonganti@0 68 <param argument="--keep-haplotypes" type="boolean" truevalue="--keep-haplotypes" falsevalue="" checked="False" label="Keep haplotypes"
kkonganti@0 69 help="By default, Flye collapses graph structures caused by alternative haplotypes (bubbles, superbubbles, roundabouts) to produce longer
kkonganti@0 70 consensus contigs. This option retains the alternative paths on the graph, producing less contiguous, but more detailed assembly."/>
kkonganti@0 71 <param argument="--scaffold" type="boolean" truevalue="--scaffold" falsevalue="" label="Enable scaffolding using graph"
kkonganti@0 72 help="Starting from the version 2.9 Flye does not perform scaffolding by default, which guarantees that all assembled sequences do not have any gaps" />
kkonganti@0 73 <param argument="--meta" type="boolean" truevalue="--meta" falsevalue="" checked="False" label="Perform metagenomic assembly"
kkonganti@0 74 help="It is designed for highly non-uniform coverage and is sensitive to underrepresented sequence at low coverage (as low as 2x).
kkonganti@0 75 In some examples of simple metagenomes, we observed that the normal mode assembled more contiguous bacterial
kkonganti@0 76 consensus sequence, while the metagenome mode was slightly more fragmented, but revealed strain mixtures"/>
kkonganti@0 77 <conditional name="asm">
kkonganti@0 78 <param name="asm_select" type="select" label="Reduced contig assembly coverage" help="Typically, assemblies of large genomes at high coverage require hundreds of GB of RAM. For high coverage assemblies,
kkonganti@0 79 you can reduce memory usage by using only a subset of longest reads for initial contig extension stage (usually, the memory bottleneck)">
kkonganti@0 80 <option value="true">Enable reduced coverage for initial disjointig assembly</option>
kkonganti@0 81 <option value="false" selected="true">Disable reduced coverage for initial disjointig assembly</option>
kkonganti@0 82 </param>
kkonganti@0 83 <when value="true">
kkonganti@0 84 <param argument="--asm-coverage" type="integer" min="0" value="30"
kkonganti@0 85 label="Reduced coverage for initial disjointig assembly"
kkonganti@0 86 help="This parameter specifies the target coverage of the longest reads. For a typical assembly, 30x is enough to produce good
kkonganti@0 87 initial contigs. Regardless of this parameter, all reads will be used at the later pipeline stages."/>
kkonganti@0 88 <param argument="--genome-size" type="text" optional="false" label="Estimated genome size"
kkonganti@0 89 help="For example, 5m or 2.6g. No longer required as input. However, it must be used in conjunction with --asm-coverage option.">
kkonganti@0 90 <validator type="regex" message="Genome size must be a float or integer, optionally followed by a unit prefix (kmg)">^([0-9]*[.])?[0-9]+[kmg]?$</validator>
kkonganti@0 91 </param>
kkonganti@0 92 </when>
kkonganti@0 93 <when value="false" />
kkonganti@0 94 </conditional>
kkonganti@0 95 <param name="generate_log" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a log file"/>
kkonganti@0 96 </inputs>
kkonganti@0 97 <outputs> <!-- Every dataset is collected from the out_dir working directory that the command passes to flye via the o option. -->
kkonganti@0 98 <data name="consensus" format="fasta" from_work_dir="out_dir/assembly.fasta" label="${tool.name} on ${on_string}: consensus"/>
kkonganti@0 99 <data name="assembly_graph" format="graph_dot" from_work_dir="out_dir/assembly_graph.gv" label="${tool.name} on ${on_string}: assembly graph"/>
kkonganti@0 100 <data name="assembly_gfa" format="txt" from_work_dir="out_dir/assembly_graph.gfa" label="${tool.name} on ${on_string}: graphical fragment assembly"/>
kkonganti@0 101 <data name="assembly_info" format="tabular" from_work_dir="out_dir/assembly_info.txt" label="${tool.name} on ${on_string}: assembly info"/>
kkonganti@0 102 <data name="flye_log" format="txt" from_work_dir="out_dir/flye.log" label="${tool.name} on ${on_string}: log">
kkonganti@0 103 <filter>generate_log</filter> <!-- the log dataset is only created when the generate_log checkbox is set -->
kkonganti@0 104 </data>
kkonganti@0 105 </outputs>
kkonganti@0 106 <tests>
kkonganti@0 107 <!-- Test 01: PacBio raw mode on a gzipped FASTQ with polishing disabled (iterations=0) and the log requested, so all five outputs are expected; each one is compared with a reference file by size (sim_size). -->
kkonganti@0 108 <test expect_num_outputs="5">
kkonganti@0 109 <param name="inputs" ftype="fastq.gz" value="ecoli.fastq.gz"/>
kkonganti@0 110 <param name="mode" value="--pacbio-raw"/>
kkonganti@0 111 <param name="iterations" value="0"/>
kkonganti@0 112 <param name="generate_log" value="true"/>
kkonganti@0 113 <output name="assembly_info" file="result1_assembly_info.txt" ftype="tabular" compare="sim_size"/>
kkonganti@0 114 <output name="assembly_graph" file="result1_assembly_graph.dot" ftype="graph_dot" compare="sim_size"/>
kkonganti@0 115 <output name="assembly_gfa" file="result1_assembly_graph.gfa" ftype="txt" compare="sim_size"/>
kkonganti@0 116 <output name="consensus" file="result1_assembly.fasta" ftype="fasta" compare="sim_size"/>
kkonganti@0 117 <output name="flye_log" file="result1.log" ftype="txt" compare="sim_size"/>
kkonganti@0 118 </test>
kkonganti@0 119 <!-- Test 02: Nanopore raw mode on a gzipped FASTA with polishing disabled (iterations=0); no log is requested, so four outputs are expected, each checked with has_size and a delta of 100. -->
kkonganti@0 120 <test expect_num_outputs="4">
kkonganti@0 121 <param name="inputs" ftype="fasta.gz" value="nanopore.fasta.gz"/>
kkonganti@0 122 <param name="mode" value="--nano-raw"/>
kkonganti@0 123 <param name="iterations" value="0"/>
kkonganti@0 124 <output name="assembly_info" ftype="tabular">
kkonganti@0 125 <assert_contents>
kkonganti@0 126 <has_size value="95" delta="100"/>
kkonganti@0 127 </assert_contents>
kkonganti@0 128 </output>
kkonganti@0 129 <output name="assembly_graph" ftype="graph_dot">
kkonganti@0 130 <assert_contents>
kkonganti@0 131 <has_size value="803" delta="100"/>
kkonganti@0 132 </assert_contents>
kkonganti@0 133 </output>
kkonganti@0 134 <output name="assembly_gfa" ftype="txt">
kkonganti@0 135 <assert_contents>
kkonganti@0 136 <has_size value="35047" delta="100"/>
kkonganti@0 137 </assert_contents>
kkonganti@0 138 </output>
kkonganti@0 139 <output name="consensus" ftype="fasta">
kkonganti@0 140 <assert_contents>
kkonganti@0 141 <has_size value="35573" delta="100"/>
kkonganti@0 142 </assert_contents>
kkonganti@0 143 </output>
kkonganti@0 144 </test>
kkonganti@0 145 <!-- Test 03: reduced coverage branch of the asm conditional, Nanopore raw mode. The coverage value is addressed by the real parameter name asm_coverage; the former name "asm" matched no input. -->
kkonganti@0 146 <test expect_num_outputs="4">
kkonganti@0 147 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
kkonganti@0 148 <conditional name="mode_conditional">
kkonganti@0 149 <param name="mode" value="--nano-raw"/>
kkonganti@0 150 </conditional>
kkonganti@0 151 <conditional name="asm">
kkonganti@0 152 <param name="asm_select" value="true" />
kkonganti@0 153 <param name="asm_coverage" value="30"/>
kkonganti@0 154 <param name="genome_size" value="3980000"/>
kkonganti@0 155 </conditional>
kkonganti@0 156 <output name="assembly_info" ftype="tabular">
kkonganti@0 157 <assert_contents>
kkonganti@0 158 <has_size value="286" delta="100"/>
kkonganti@0 159 </assert_contents>
kkonganti@0 160 </output>
kkonganti@0 161 <output name="assembly_graph" ftype="graph_dot">
kkonganti@0 162 <assert_contents>
kkonganti@0 163 <has_size value="1840" delta="100"/>
kkonganti@0 164 </assert_contents>
kkonganti@0 165 </output>
kkonganti@0 166 <output name="assembly_gfa" ftype="txt">
kkonganti@0 167 <assert_contents>
kkonganti@0 168 <has_size value="420752" delta="100"/>
kkonganti@0 169 </assert_contents>
kkonganti@0 170 </output>
kkonganti@0 171 <output name="consensus" ftype="fasta">
kkonganti@0 172 <assert_contents>
kkonganti@0 173 <has_size value="427580" delta="100"/>
kkonganti@0 174 </assert_contents>
kkonganti@0 175 </output>
kkonganti@0 176 </test>
kkonganti@0 177 <!-- Test 04: metagenomic assembly (meta checkbox set) in PacBio raw mode on a gzipped FASTQ; four outputs are expected, each checked with has_size and a delta of 100. -->
kkonganti@0 178 <test expect_num_outputs="4">
kkonganti@0 179 <param name="inputs" ftype="fastq.gz" value="ecoli.fastq.gz"/>
kkonganti@0 180 <conditional name="mode_conditional">
kkonganti@0 181 <param name="mode" value="--pacbio-raw"/>
kkonganti@0 182 </conditional>
kkonganti@0 183 <param name="meta" value="true"/>
kkonganti@0 184 <output name="assembly_info" ftype="tabular">
kkonganti@0 185 <assert_contents>
kkonganti@0 186 <has_size value="95" delta="100"/>
kkonganti@0 187 </assert_contents>
kkonganti@0 188 </output>
kkonganti@0 189 <output name="assembly_graph" ftype="graph_dot">
kkonganti@0 190 <assert_contents>
kkonganti@0 191 <has_size value="367" delta="100"/>
kkonganti@0 192 </assert_contents>
kkonganti@0 193 </output>
kkonganti@0 194 <output name="assembly_gfa" ftype="txt">
kkonganti@0 195 <assert_contents>
kkonganti@0 196 <has_size value="418729" delta="100"/>
kkonganti@0 197 </assert_contents>
kkonganti@0 198 </output>
kkonganti@0 199 <output name="consensus" ftype="fasta">
kkonganti@0 200 <assert_contents>
kkonganti@0 201 <has_size value="425667" delta="100"/>
kkonganti@0 202 </assert_contents>
kkonganti@0 203 </output>
kkonganti@0 204 </test>
kkonganti@0 205 <!-- Test 05: Nanopore HQ mode with the minimum overlap set to 1000 on a gzipped FASTQ; four outputs are expected, each checked with has_size and a delta of 100. -->
kkonganti@0 206 <test expect_num_outputs="4">
kkonganti@0 207 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
kkonganti@0 208 <conditional name="mode_conditional">
kkonganti@0 209 <param name="mode" value="--nano-hq"/>
kkonganti@0 210 </conditional>
kkonganti@0 211 <param name="min_overlap" value="1000"/>
kkonganti@0 212 <output name="assembly_info" ftype="tabular">
kkonganti@0 213 <assert_contents>
kkonganti@0 214 <has_size value="286" delta="100"/>
kkonganti@0 215 </assert_contents>
kkonganti@0 216 </output>
kkonganti@0 217 <output name="assembly_graph" ftype="graph_dot">
kkonganti@0 218 <assert_contents>
kkonganti@0 219 <has_size value="1248" delta="100"/>
kkonganti@0 220 </assert_contents>
kkonganti@0 221 </output>
kkonganti@0 222 <output name="assembly_gfa" ftype="txt">
kkonganti@0 223 <assert_contents>
kkonganti@0 224 <has_size value="420252" delta="100"/>
kkonganti@0 225 </assert_contents>
kkonganti@0 226 </output>
kkonganti@0 227 <output name="consensus" ftype="fasta">
kkonganti@0 228 <assert_contents>
kkonganti@0 229 <has_size value="427129" delta="100"/>
kkonganti@0 230 </assert_contents>
kkonganti@0 231 </output>
kkonganti@0 232 </test>
kkonganti@0 233 <!-- Test 06: PacBio HiFi mode with an explicit hifi_error of 0.21 and the minimum overlap set to 1000; four outputs are expected, each checked with has_size and a delta of 100. -->
kkonganti@0 234 <test expect_num_outputs="4">
kkonganti@0 235 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
kkonganti@0 236 <conditional name="mode_conditional">
kkonganti@0 237 <param name="mode" value="--pacbio-hifi"/>
kkonganti@0 238 <param name="hifi_error" value="0.21"/>
kkonganti@0 239 </conditional>
kkonganti@0 240 <param name="min_overlap" value="1000"/>
kkonganti@0 241 <output name="assembly_info" ftype="tabular">
kkonganti@0 242 <assert_contents>
kkonganti@0 243 <has_size value="286" delta="100"/>
kkonganti@0 244 </assert_contents>
kkonganti@0 245 </output>
kkonganti@0 246 <output name="assembly_graph" ftype="graph_dot">
kkonganti@0 247 <assert_contents>
kkonganti@0 248 <has_size value="1273" delta="100"/>
kkonganti@0 249 </assert_contents>
kkonganti@0 250 </output>
kkonganti@0 251 <output name="assembly_gfa" ftype="txt">
kkonganti@0 252 <assert_contents>
kkonganti@0 253 <has_size value="420252" delta="100"/>
kkonganti@0 254 </assert_contents>
kkonganti@0 255 </output>
kkonganti@0 256 <output name="consensus" ftype="fasta">
kkonganti@0 257 <assert_contents>
kkonganti@0 258 <has_size value="427129" delta="100"/>
kkonganti@0 259 </assert_contents>
kkonganti@0 260 </output>
kkonganti@0 261 </test>
kkonganti@0 262 <!-- Test 07: keep haplotypes in PacBio corrected mode. The checkbox is addressed as keep_haplotypes (the hyphenated name matched no input), and hifi_error is not set because it only exists in the PacBio HiFi branch. NOTE(review): the expected sizes are identical to Test 06; confirm they still hold when haplotypes are kept. -->
kkonganti@0 263 <test expect_num_outputs="4">
kkonganti@0 264 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
kkonganti@0 265 <conditional name="mode_conditional">
kkonganti@0 266 <param name="mode" value="--pacbio-corr"/>
kkonganti@0 268 </conditional>
kkonganti@0 269 <param name="min_overlap" value="1000"/>
kkonganti@0 270 <param name="keep_haplotypes" value="true"/>
kkonganti@0 271 <output name="assembly_info" ftype="tabular">
kkonganti@0 272 <assert_contents>
kkonganti@0 273 <has_size value="286" delta="100"/>
kkonganti@0 274 </assert_contents>
kkonganti@0 275 </output>
kkonganti@0 276 <output name="assembly_graph" ftype="graph_dot">
kkonganti@0 277 <assert_contents>
kkonganti@0 278 <has_size value="1273" delta="100"/>
kkonganti@0 279 </assert_contents>
kkonganti@0 280 </output>
kkonganti@0 281 <output name="assembly_gfa" ftype="txt">
kkonganti@0 282 <assert_contents>
kkonganti@0 283 <has_size value="420252" delta="100"/>
kkonganti@0 284 </assert_contents>
kkonganti@0 285 </output>
kkonganti@0 286 <output name="consensus" ftype="fasta">
kkonganti@0 287 <assert_contents>
kkonganti@0 288 <has_size value="427129" delta="100"/>
kkonganti@0 289 </assert_contents>
kkonganti@0 290 </output>
kkonganti@0 291 </test>
kkonganti@0 292 <!-- Test 08: scaffolding enabled in Nanopore HQ mode. The checkbox is addressed by its real name scaffold; the former name "scaffolding" matched no input. NOTE(review): the expected sizes are identical to Test 05; confirm they still hold when scaffolding is on. -->
kkonganti@0 293 <test expect_num_outputs="4">
kkonganti@0 294 <param name="inputs" ftype="fastq.gz" value="ecoli_hifi.fastq.gz"/>
kkonganti@0 295 <param name="mode" value="--nano-hq"/>
kkonganti@0 296 <param name="min_overlap" value="1000"/>
kkonganti@0 297 <param name="scaffold" value="true"/>
kkonganti@0 298 <output name="assembly_info" ftype="tabular">
kkonganti@0 299 <assert_contents>
kkonganti@0 300 <has_size value="286" delta="100"/>
kkonganti@0 301 </assert_contents>
kkonganti@0 302 </output>
kkonganti@0 303 <output name="assembly_graph" ftype="graph_dot">
kkonganti@0 304 <assert_contents>
kkonganti@0 305 <has_size value="1248" delta="100"/>
kkonganti@0 306 </assert_contents>
kkonganti@0 307 </output>
kkonganti@0 308 <output name="assembly_gfa" ftype="txt">
kkonganti@0 309 <assert_contents>
kkonganti@0 310 <has_size value="420252" delta="100"/>
kkonganti@0 311 </assert_contents>
kkonganti@0 312 </output>
kkonganti@0 313 <output name="consensus" ftype="fasta">
kkonganti@0 314 <assert_contents>
kkonganti@0 315 <has_size value="427129" delta="100"/>
kkonganti@0 316 </assert_contents>
kkonganti@0 317 </output>
kkonganti@0 318 </test>
kkonganti@0 319 </tests>
kkonganti@0 320 <help><![CDATA[
kkonganti@0 321
kkonganti@0 322 .. class:: infomark
kkonganti@0 323
kkonganti@0 324 **Purpose**
kkonganti@0 325
kkonganti@0 326 Flye is a de novo assembler for single molecule sequencing reads, such as those produced by PacBio and Oxford Nanopore Technologies.
kkonganti@0 327 It is designed for a wide range of datasets, from small bacterial projects to large mammalian-scale assemblies. The package represents
kkonganti@0 328 a complete pipeline: it takes raw PacBio/ONT reads as input and outputs polished contigs. Flye also has a special mode for metagenome
kkonganti@0 329 assembly.
kkonganti@0 330
kkonganti@0 331 ----
kkonganti@0 332
kkonganti@0 333 .. class:: infomark
kkonganti@0 334
kkonganti@0 335 **Quick usage**
kkonganti@0 336
kkonganti@0 337 Input reads can be in FASTA or FASTQ format, uncompressed or compressed with gz. Currently, PacBio (raw, corrected, HiFi) and ONT reads
kkonganti@0 338 (raw, corrected) are supported. Expected error rates are <30% for raw, <3% for corrected, and <1% for HiFi. Note that Flye was primarily
kkonganti@0 339 developed to run on raw reads. You may specify multiple files with reads (separated by spaces). Mixing different read types is not yet supported. The *--meta*
kkonganti@0 340 option enables the mode for metagenome/uneven coverage assembly.
kkonganti@0 341
kkonganti@0 342 Genome size estimate is no longer a required option. You need to provide an estimate if using *--asm-coverage* option.
kkonganti@0 343
kkonganti@0 344 To reduce memory consumption for large genome assemblies, you can use a subset of the longest reads for initial disjointig assembly by
kkonganti@0 345 specifying *--asm-coverage* and *--genome-size* options. Typically, 40x coverage is enough to produce good disjointigs.
kkonganti@0 346
kkonganti@0 347 ----
kkonganti@0 348
kkonganti@0 349 .. class:: infomark
kkonganti@0 350
kkonganti@0 351 **Outputs**
kkonganti@0 352
kkonganti@0 353 The main output files are:
kkonganti@0 354
kkonganti@0 355 ::
kkonganti@0 356
kkonganti@0 357 - Final assembly: contains contigs and possibly scaffolds (see below).
kkonganti@0 358 - Final repeat graph: note that the edge sequences might be different (shorter) than contig sequences, because contigs might include multiple graph edges.
kkonganti@0 359 - Extra information about contigs (such as length or coverage).
kkonganti@0 360
kkonganti@0 361 Each contig is formed by a single unique graph edge. If possible, unique contigs are extended with the sequence from flanking unresolved repeats on the graph. Thus,
kkonganti@0 362 a contig fully contains the corresponding graph edge (with the same id), but might be longer than this edge. This is somewhat similar to unitig-contig relation in
kkonganti@0 363 OLC assemblers. In a rare case when a repetitive graph edge is not covered by the set of "extended" contigs, it will be also output in the assembly file.
kkonganti@0 364
kkonganti@0 365 Sometimes it is possible to further order contigs into scaffolds based on the repeat graph structure. These ordered contigs will be output as a part of scaffold in
kkonganti@0 366 the assembly file (with a scaffold prefix). Since it is hard to give a reliable estimate of the gap size, those gaps are represented with the default 100 Ns.
kkonganti@0 367 assembly_info.txt file (below) contains additional information about how scaffolds were formed.
kkonganti@0 368
kkonganti@0 369 Extra information about contigs/scaffolds is output into the assembly_info.txt file. It is a tab-delimited table with the columns as follows:
kkonganti@0 370
kkonganti@0 371 ::
kkonganti@0 372
kkonganti@0 373 - Contig/scaffold id
kkonganti@0 374 - Length
kkonganti@0 375 - Coverage
kkonganti@0 376 - Is circular, (Y)es or (N)o
kkonganti@0 377 - Is repetitive, (Y)es or (N)o
kkonganti@0 378 - Multiplicity (based on coverage)
kkonganti@0 379 - Alternative group
kkonganti@0 380 - Graph path (graph path corresponding to this contig/scaffold).
kkonganti@0 381
kkonganti@0 382 Scaffold gaps are marked with ?? symbols, and * symbol denotes a terminal graph node. Alternative contigs (representing alternative haplotypes) will have the same alt.
kkonganti@0 383 group ID. Primary contigs are marked by *.
kkonganti@0 384
kkonganti@0 385 ----
kkonganti@0 386
kkonganti@0 387 .. class:: infomark
kkonganti@0 388
kkonganti@0 389 **Algorithm Description**
kkonganti@0 390
kkonganti@0 391 This is a brief description of the Flye algorithm. Please refer to the manuscript for more detailed information. The draft contig extension is organized as follows:
kkonganti@0 392
kkonganti@0 393 ::
kkonganti@0 394
kkonganti@0 395 - K-mer counting / erroneous k-mer pre-filtering
kkonganti@0 396 - Solid k-mer selection (k-mers with sufficient frequency, which are unlikely to be erroneous)
kkonganti@0 397 - Contig extension. The algorithm starts from a single read and extends it with a next overlapping read (overlaps are dynamically detected using the selected solid k-mers).
kkonganti@0 398
kkonganti@0 399 Note that we do not attempt to resolve repeats at this stage, thus the reconstructed contigs might contain misassemblies. Flye then aligns the reads on these draft
kkonganti@0 400 contigs using minimap2 and calls a consensus. Afterwards, Flye performs repeat analysis as follows:
kkonganti@0 401
kkonganti@0 402 ::
kkonganti@0 403
kkonganti@0 404 - Repeat graph is constructed from the (possibly misassembled) contigs
kkonganti@0 405 - In this graph all repeats longer than minimum overlap are collapsed
kkonganti@0 406 - The algorithm resolves repeats using the read information and graph structure
kkonganti@0 407 - The unbranching paths in the graph are output as contigs
kkonganti@0 408
kkonganti@0 409 If enabled, after resolving bridged repeats, Trestle module attempts to resolve simple unbridged repeats (of multiplicity 2) using the heterogeneities between repeat copies.
kkonganti@0 410 Finally, Flye performs polishing of the resulting assembly to correct the remaining errors:
kkonganti@0 411
kkonganti@0 412 ::
kkonganti@0 413
kkonganti@0 414 - Alignment of all reads to the current assembly using minimap2
kkonganti@0 415 - Partition the alignment into mini-alignments (bubbles)
kkonganti@0 416 - Error correction of each bubble using a maximum likelihood approach
kkonganti@0 417
kkonganti@0 418
kkonganti@0 419 The polishing steps could be repeated, which might slightly increase quality for some datasets.
kkonganti@0 420
kkonganti@0 421
kkonganti@0 422 ]]></help>
kkonganti@0 423 <citations>
kkonganti@0 424 <citation type="doi">10.1073/pnas.1604560113</citation>
kkonganti@0 425 <citation type="bibtex">
kkonganti@0 426 @misc{githubFlye,
kkonganti@0 427 author = {Kolmogorov, Mikhail},
kkonganti@0 428 year = {2021},
kkonganti@0 429 title = {Flye},
kkonganti@0 430 publisher = {GitHub},
kkonganti@0 431 journal = {GitHub repository},
kkonganti@0 432 url = {https://github.com/fenderglass/Flye}}
kkonganti@0 433 </citation>
kkonganti@0 434 </citations>
kkonganti@0 435 </tool>