# HG changeset patch # User galaxytrakr # Date 1774274975 0 # Node ID a4afe551dfc9d6cdd580ff178501248d2f3f5dd8 planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit c55c06b92c0ee0429047bcff1992bf2ec293284a diff -r 000000000000 -r a4afe551dfc9 aws_sra.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/aws_sra.xml Mon Mar 23 14:09:35 2026 +0000 @@ -0,0 +1,360 @@ + + Fetch SRA data files from NCBI's public AWS S3 buckets + + + awscli + sra-tools + pigz + + + aws --version + + '$output_list' + + ## ── DOWNLOAD RAW mode ──────────────────────────────────────────────────── + #elif $action.mode == 'copy' + aws s3 cp + --no-sign-request + #if $action.recursive + --recursive + #end if + '${s3_base}/${ $action.s3_key.strip("/") }' + '$output_data' + + ## ── FASTQ DUMP mode (sra-pub-run-odp only) ─────────────────────────────── + #elif $action.mode == 'fastq_dump' + #set $acc = $action.accession.strip() + + ## 1. Download the .sra file from S3 + mkdir -p sra_cache && + aws s3 cp + --no-sign-request + '${s3_base}/sra/${acc}/${acc}.sra' + ./sra_cache/${acc}.sra && + + ## 2. Convert with fasterq-dump --split-files. + ## Paired runs → _1.fastq + _2.fastq + ## Single runs → .fastq (no _1/_2 suffix) + ## We always use --split-files; single-end runs simply produce one file. + mkdir -p fastq_out && + fasterq-dump + ./sra_cache/${acc}.sra + --outdir ./fastq_out + --temp . + --threads \${GALAXY_SLOTS:-4} + --split-files + && + + ## 3. Compress with pigz (fasterq-dump does not gzip natively) + pigz -p \${GALAXY_SLOTS:-4} ./fastq_out/*.fastq && + + ## 4. Stage outputs + #if $action.layout == 'paired' + cp ./fastq_out/${acc}_1.fastq.gz '$output_r1' && + cp ./fastq_out/${acc}_2.fastq.gz '$output_r2' + #else + cp ./fastq_out/${acc}.fastq.gz '$output_r1' + #end if + #end if + ]]> + + +

+ + + + + + + + + + + + + + +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + action['mode'] == 'list' + + + + + action['mode'] == 'copy' + + + + + action['mode'] == 'fastq_dump' + + + + + action['mode'] == 'fastq_dump' and action['layout'] == 'paired' + + + + + + +

+ + +

+ + + + + + + + + + + + + +

+ +

+ + + + + + + + + + + + + + + + + + + +

+ +

+ + + + + + + + + + + + + + _1`` (R1 / forward) and ``_2`` (R2 / reverse) +for paired-end runs, matching ``fasterq-dump``'s native ``--split-files`` naming. +Single-end runs produce only ``_1``. + +*Fetching multiple accessions and building a paired collection* + +Run this tool **once per accession** — either manually or by using Galaxy's dataset +collection mapping to fan out over a list of accession identifiers. Keeping one job per +accession means a failed download does not affect the others. + +Once all jobs are complete your history will contain datasets labelled:: + + SRR000001_1 SRR000001_2 + SRR000002_1 SRR000002_2 + ... + +Use **Galaxy's "Build List of Dataset Pairs"** tool to assemble these into a +``list:paired`` collection. Galaxy will auto-detect the ``_1`` / ``_2`` suffixes +and propose pairings — confirm and name the collection, then pass it directly to +any downstream tool that accepts a paired collection (aligners, QC tools, etc.). + +.. warning:: + + This tool cannot auto-detect read layout from the accession. Check the SRA record + at https://www.ncbi.nlm.nih.gov/sra before running. Selecting the wrong layout will + produce incorrect output. + +----- + +**Notes** + +- All S3 requests are made without AWS credentials (``--no-sign-request``). +- There is typically a **1–2 day lag** between an accession appearing in SRA Search and + being available in the S3 buckets. +- Controlled-access dbGaP data (``sra-ca-run-odp``) requires AWS credentials and is + **not** supported by this tool. +- ``fasterq-dump`` and ``pigz`` both use ``${GALAXY_SLOTS}`` threads. Allocate more + cores in your job configuration to speed up conversion of large runs. + +.. 
_NCBI Sequence Read Archive (SRA): https://www.ncbi.nlm.nih.gov/sra + ]]> + + + +@misc{ncbi_sra_aws, + title = {{NCBI} {SRA} on {AWS} Open Data}, + author = {{National Center for Biotechnology Information}}, + howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}}, + note = {Accessed via AWS S3 without credentials} +} + + +@article{sra_toolkit, + title = {The {Sequence} {Read} {Archive}}, + author = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and + {International Nucleotide Sequence Database Collaboration}}, + journal = {Nucleic Acids Research}, + volume = {39}, + number = {suppl\_1}, + pages = {D19--D21}, + year = {2011}, + doi = {10.1093/nar/gkq1019} +} + + + +