aws_sra: aws_sra.xml comparison

comparison aws_sra.xml @ 0:a4afe551dfc9 draft

planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit c55c06b92c0ee0429047bcff1992bf2ec293284a

author	galaxytrakr
date	Mon, 23 Mar 2026 14:09:35 +0000
parents
children	ddfdc4c465e7

comparison

equal deleted inserted replaced

--1:000000000000
+:a4afe551dfc9
+<tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.3.0+gt_0.1" profile="23.0">
+<description>Fetch SRA data files from NCBI's public AWS S3 buckets</description>
+<!-- Runtime dependencies resolved by Galaxy: awscli for S3 access, sra-tools for
+fasterq-dump, and pigz for parallel gzip compression of the FASTQ output. -->
+<!-- NOTE(review): confirm that the targeted Galaxy tool schema accepts the
+"channel" attribute on requirement elements. -->
+<requirements>
+<requirement type="package" channel="conda-forge" version="2.34.8">awscli</requirement>
+<requirement type="package" channel="bioconda" version="3.2.1">sra-tools</requirement>
+<requirement type="package" channel="conda-forge" version="2.8">pigz</requirement>
+</requirements>
+<!-- The tool version reported to Galaxy is that of the AWS CLI. -->
+<version_command>aws --version</version_command>
+<command detect_errors="exit_code"><![CDATA[
+## Builds one of three shell pipelines, selected by the "action" conditional:
+##   list       → aws s3 ls, stdout captured into the listing dataset
+##   copy       → aws s3 cp of one object, or a tar archive of a recursive download
+##   fastq_dump → download one run, convert with fasterq-dump, gzip with pigz, stage outputs
+## Free-text parameters are rendered with str() before any further string handling
+## so that the parameter sanitizer (which removes quote characters) is applied;
+## calling .strip() on the parameter wrapper itself reads the raw value instead.
+## ── Resolve bucket base URL ──────────────────────────────────────────────
+#if $source.bucket == 'sra_pub_run_odp'
+#set $s3_base = 's3://sra-pub-run-odp'
+#elif $source.bucket == 'sra_pub_src_1'
+#set $s3_base = 's3://sra-pub-src-1'
+#elif $source.bucket == 'sra_pub_src_2'
+#set $s3_base = 's3://sra-pub-src-2'
+#elif $source.bucket == 'sra_pub_metadata'
+#set $s3_base = 's3://sra-pub-metadata-us-east-1/sra/metadata'
+#end if
+## ── LIST mode ────────────────────────────────────────────────────────────
+#if $action.mode == 'list'
+## Normalise the optional prefix; a prefix made only of slashes counts as empty.
+#set $prefix = ''
+#if $source.prefix
+#set $prefix = str($source.prefix).strip().strip('/')
+#end if
+aws s3 ls
+--no-sign-request
+#if $prefix
+'${s3_base}/${prefix}/'
+#else
+'${s3_base}/'
+#end if
+#if $action.recursive
+--recursive
+#end if
+> '$output_list'
+## ── DOWNLOAD RAW mode ────────────────────────────────────────────────────
+#elif $action.mode == 'copy'
+#set $key = str($action.s3_key).strip().strip('/')
+#if $action.recursive
+## A recursive copy yields a directory tree, which cannot be written onto the
+## single-file dataset path. Download into a scratch directory and archive it.
+mkdir -p s3_download &&
+aws s3 cp
+--no-sign-request
+--recursive
+'${s3_base}/${key}'
+./s3_download/ &&
+tar -cf '$output_data' -C ./s3_download .
+#else
+aws s3 cp
+--no-sign-request
+'${s3_base}/${key}'
+'$output_data'
+#end if
+## ── FASTQ DUMP mode (sra-pub-run-odp only) ───────────────────────────────
+#elif $action.mode == 'fastq_dump'
+#set $acc = str($action.accession).strip()
+## FASTQ conversion is defined for sra-pub-run-odp only, so the bucket selector
+## is deliberately ignored in this mode.
+#set $run_base = 's3://sra-pub-run-odp'
+## 1. Download the run object from S3. The extensionless key sra/<acc>/<acc> is
+##    tried first; the <acc>.sra key is kept as a fallback.
+mkdir -p sra_cache &&
+(
+aws s3 cp
+--no-sign-request
+'${run_base}/sra/${acc}/${acc}'
+./sra_cache/${acc}.sra
+||
+aws s3 cp
+--no-sign-request
+'${run_base}/sra/${acc}/${acc}.sra'
+./sra_cache/${acc}.sra
+) &&
+## 2. Convert with fasterq-dump --split-files.
+##    Paired runs  → <acc>_1.fastq + <acc>_2.fastq
+##    Single runs  → <acc>.fastq or <acc>_1.fastq (both names are handled below)
+mkdir -p fastq_out &&
+fasterq-dump
+./sra_cache/${acc}.sra
+--outdir ./fastq_out
+--temp .
+--threads \${GALAXY_SLOTS:-4}
+--split-files
+&&
+## 3. Compress with pigz (fasterq-dump does not gzip natively)
+pigz -p \${GALAXY_SLOTS:-4} ./fastq_out/*.fastq &&
+## 4. Stage outputs, failing with an explicit message when the files produced
+##    do not match the layout the user selected.
+#if $action.layout == 'paired'
+if [ ! -f ./fastq_out/${acc}_1.fastq.gz ] || [ ! -f ./fastq_out/${acc}_2.fastq.gz ]; then
+echo "Paired FASTQ files were not produced for ${acc}. The run may be single-end." >&2;
+exit 1;
+fi &&
+cp ./fastq_out/${acc}_1.fastq.gz '$output_r1' &&
+cp ./fastq_out/${acc}_2.fastq.gz '$output_r2'
+#else
+if [ -f ./fastq_out/${acc}_2.fastq.gz ]; then
+echo "A second read file was produced for ${acc}. The run appears to be paired-end." >&2;
+exit 1;
+elif [ -f ./fastq_out/${acc}.fastq.gz ]; then
+cp ./fastq_out/${acc}.fastq.gz '$output_r1';
+else
+cp ./fastq_out/${acc}_1.fastq.gz '$output_r1';
+fi
+#end if
+#end if
+]]></command>
+<inputs>
+<!-- Data source: which public bucket to address, plus an optional key prefix
+that is only consulted when listing. -->
+<section name="source" title="Data Source" expanded="true">
+<param name="bucket" type="select" label="SRA S3 Bucket"
+help="Select the NCBI SRA AWS Open Data bucket to access. FASTQ conversion is only available for sra-pub-run-odp.">
+<option value="sra_pub_run_odp" selected="true">
+sra-pub-run-odp — Open-access SRA runs (.sra format)
+</option>
+<option value="sra_pub_src_1">
+sra-pub-src-1 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 1]
+</option>
+<option value="sra_pub_src_2">
+sra-pub-src-2 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 2]
+</option>
+<option value="sra_pub_metadata">
+sra-pub-metadata-us-east-1 — SRA metadata (Parquet/CSV for Athena/Glue)
+</option>
+</param>
+<param name="prefix" type="text" value="" optional="true"
+label="S3 key prefix (optional)"
+help="Restrict listing to a sub-path, e.g. 'sra/SRR000001'. Used only in List mode.">
+<!-- Quote characters are removed because the value is placed inside a
+single-quoted shell argument by the command template. -->
+<sanitizer invalid_char="">
+<valid initial="string.printable">
+<remove value="'"/>
+<remove value='"'/>
+</valid>
+</sanitizer>
+</param>
+</section>
+<!-- Action selector: each branch exposes only the parameters its mode needs. -->
+<conditional name="action">
+<param name="mode" type="select" label="Action">
+<option value="list" selected="true">List objects</option>
+<option value="copy">Download raw file(s)</option>
+<option value="fastq_dump">Download and convert to FASTQ (sra-pub-run-odp only)</option>
+</param>
+<!-- ── LIST ── -->
+<when value="list">
+<param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
+checked="false" label="List recursively"
+help="List all objects under the prefix, not just the immediate level."/>
+</when>
+<!-- ── COPY ── -->
+<when value="copy">
+<param name="s3_key" type="text" label="S3 key to download"
+help="Full key of the object to download, e.g. 'sra/SRR000001/SRR000001' (run objects in sra-pub-run-odp carry no file extension).">
+<validator type="empty_field" message="An S3 key is required for download."/>
+<!-- Same quote-stripping policy as the listing prefix. -->
+<sanitizer invalid_char="">
+<valid initial="string.printable">
+<remove value="'"/>
+<remove value='"'/>
+</valid>
+</sanitizer>
+</param>
+<param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
+checked="false" label="Download recursively"
+help="Download all objects with this prefix rather than a single object."/>
+</when>
+<!-- ── FASTQ DUMP ── -->
+<when value="fastq_dump">
+<!-- The regex restricts the accession to SRR/ERR/DRR followed by digits. -->
+<param name="accession" type="text" label="SRA Accession"
+help="SRA run accession to fetch and convert, e.g. SRR000001. Must be present in sra-pub-run-odp.">
+<validator type="empty_field" message="An SRA accession is required."/>
+<validator type="regex"
+message="Must be a valid SRA run accession (SRR, ERR, or DRR followed by digits)."
+expression="^[SED]RR[0-9]+$"/>
+</param>
+<!-- The layout choice decides whether one or two FASTQ datasets are created. -->
+<param name="layout" type="select" label="Read layout"
+help="Paired-end produces two datasets labelled accession_1 and accession_2. Single-end produces one dataset. Check the SRA record to confirm layout before running.">
+<option value="paired" selected="true">Paired-end (R1 + R2)</option>
+<option value="single">Single-end</option>
+</param>
+</when>
+</conditional>
+</inputs>
+<outputs>
+<!-- Every output is gated by a filter on the selected action, so only the
+datasets relevant to the chosen mode are created. -->
+<!-- List output: the captured listing text. -->
+<data name="output_list" format="txt"
+label="SRA S3 listing: ${source.prefix}">
+<filter>action['mode'] == 'list'</filter>
+</data>
+<!-- Raw download: the object can be of any type, so the datatype is sniffed
+after the job finishes (auto_format) instead of being declared up front. -->
+<data name="output_data" auto_format="true"
+label="SRA download: ${action.s3_key}">
+<filter>action['mode'] == 'copy'</filter>
+</data>
+<!-- FASTQ R1 / single.
+Label matches fasterq-dump's native _1 suffix so Galaxy's
+"Build List of Dataset Pairs" can auto-detect pairings. -->
+<data name="output_r1" format="fastqsanger.gz"
+label="${action.accession}_1">
+<filter>action['mode'] == 'fastq_dump'</filter>
+</data>
+<!-- FASTQ R2 (paired-end only) -->
+<data name="output_r2" format="fastqsanger.gz"
+label="${action.accession}_2">
+<filter>action['mode'] == 'fastq_dump' and action['layout'] == 'paired'</filter>
+</data>
+</outputs>
+<tests>
+<!-- All tests contact the public S3 buckets and therefore need network access. -->
+<!-- Test 1: list mode -->
+<test expect_num_outputs="1">
+<section name="source">
+<param name="bucket" value="sra_pub_run_odp"/>
+<param name="prefix" value="sra/SRR000001"/>
+</section>
+<conditional name="action">
+<param name="mode" value="list"/>
+<param name="recursive" value="false"/>
+</conditional>
+<output name="output_list">
+<assert_contents>
+<has_text text="SRR000001"/>
+</assert_contents>
+</output>
+</test>
+<!-- Test 2: fastq_dump paired.
+The outputs are gzip-compressed, so decompress="true" is required for the
+text assertions to run against the FASTQ content instead of the gzip bytes. -->
+<test expect_num_outputs="2">
+<section name="source">
+<param name="bucket" value="sra_pub_run_odp"/>
+</section>
+<conditional name="action">
+<param name="mode" value="fastq_dump"/>
+<param name="accession" value="SRR000001"/>
+<param name="layout" value="paired"/>
+</conditional>
+<output name="output_r1" decompress="true">
+<assert_contents>
+<has_text text="@SRR000001"/>
+</assert_contents>
+</output>
+<output name="output_r2" decompress="true">
+<assert_contents>
+<has_text text="@SRR000001"/>
+</assert_contents>
+</output>
+</test>
+<!-- Test 3: fastq_dump single-end.
+NOTE(review): this reuses the accession from test 2 with the opposite layout;
+confirm the real layout of SRR000001 and switch one of the two tests to an
+accession of the matching layout. -->
+<test expect_num_outputs="1">
+<section name="source">
+<param name="bucket" value="sra_pub_run_odp"/>
+</section>
+<conditional name="action">
+<param name="mode" value="fastq_dump"/>
+<param name="accession" value="SRR000001"/>
+<param name="layout" value="single"/>
+</conditional>
+<output name="output_r1" decompress="true">
+<assert_contents>
+<has_text text="@SRR000001"/>
+</assert_contents>
+</output>
+</test>
+</tests>
+<help><![CDATA[
+**NCBI SRA AWS Fetch**
+This tool fetches data from the `NCBI Sequence Read Archive (SRA)`_ hosted on Amazon S3
+as part of the AWS Open Data program. No AWS account is required.
+-----
+**Available Buckets**
++------------------------------+------------------------------------------------------------+
+| Bucket                       | Contents                                                   |
++==============================+============================================================+
+| sra-pub-run-odp              | All open-access SRA runs in SRA Normalized format (.sra).  |
+|                              | Supports FASTQ conversion via this tool.                   |
++------------------------------+------------------------------------------------------------+
+| sra-pub-src-1                | Source BAM, CRAM, and FASTQ files from PacBio, ONT, 10X.   |
++------------------------------+------------------------------------------------------------+
+| sra-pub-src-2                | Same as above (second bucket for source submissions).      |
++------------------------------+------------------------------------------------------------+
+| sra-pub-metadata-us-east-1   | SRA metadata in Parquet/CSV format (for Athena / Glue).    |
++------------------------------+------------------------------------------------------------+
+-----
+**Listing objects**
+Select **List objects** to see what files are available under a given prefix, e.g. ``sra/SRR000001``.
+Leave the prefix blank to browse the bucket root (may return a very large listing).
+-----
+**Downloading raw files**
+Select **Download raw file(s)** and provide the full S3 key. Run objects in
+``sra-pub-run-odp`` are stored without a file extension, e.g.::
+sra/SRR000001/SRR000001
+-----
+**Download and convert to FASTQ**
+Fetches a single SRA run from ``sra-pub-run-odp`` and converts it to gzip-compressed
+FASTQ using ``fasterq-dump``.
+Outputs are labelled ``<accession>_1`` (R1 / forward) and ``<accession>_2`` (R2 / reverse)
+for paired-end runs, matching ``fasterq-dump``'s native ``--split-files`` naming.
+Single-end runs produce only ``<accession>_1``.
+*Fetching multiple accessions and building a paired collection*
+Run this tool **once per accession** — either manually or by using Galaxy's dataset
+collection mapping to fan out over a list of accession identifiers. Keeping one job per
+accession means a failed download does not affect the others.
+Once all jobs are complete your history will contain datasets labelled::
+SRR000001_1    SRR000001_2
+SRR000002_1    SRR000002_2
+...
+Use **Galaxy's "Build List of Dataset Pairs"** tool to assemble these into a
+``list:paired`` collection. Galaxy will auto-detect the ``_1`` / ``_2`` suffixes
+and propose pairings — confirm and name the collection, then pass it directly to
+any downstream tool that accepts a paired collection (aligners, QC tools, etc.).
+.. warning::
+This tool cannot auto-detect read layout from the accession. Check the SRA record
+at https://www.ncbi.nlm.nih.gov/sra before running. Selecting the wrong layout will
+produce incorrect output.
+-----
+**Notes**
+- All S3 requests are made without AWS credentials (``--no-sign-request``).
+- There is typically a **1–2 day lag** between an accession appearing in SRA Search and
+being available in the S3 buckets.
+- Controlled-access dbGaP data (``sra-ca-run-odp``) requires AWS credentials and is
+**not** supported by this tool.
+- ``fasterq-dump`` and ``pigz`` both run with the number of threads given by the
+``GALAXY_SLOTS`` environment variable (4 if it is unset). Allocate more
+cores in your job configuration to speed up conversion of large runs.
+.. _NCBI Sequence Read Archive (SRA): https://www.ncbi.nlm.nih.gov/sra
+]]></help>
+<citations>
+<!-- Data source: the NCBI SRA entry in the AWS Open Data registry. -->
+<citation type="bibtex">
+@misc{ncbi_sra_aws,
+title        = {{NCBI} {SRA} on {AWS} Open Data},
+author       = {{National Center for Biotechnology Information}},
+howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}},
+note         = {Accessed via AWS S3 without credentials}
+}
+</citation>
+<!-- Archive reference: the title is that of the article registered under the DOI below. -->
+<citation type="bibtex">
+@article{sra_toolkit,
+title   = {The {Sequence} {Read} {Archive}},
+author  = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and
+{International Nucleotide Sequence Database Collaboration}},
+journal = {Nucleic Acids Research},
+volume  = {39},
+number  = {suppl\_1},
+pages   = {D19--D21},
+year    = {2011},
+doi     = {10.1093/nar/gkq1019}
+}
+</citation>
+</citations>
+</tool>

Mercurial > repos > galaxytrakr > aws_sra

comparison aws_sra.xml @ 0:a4afe551dfc9 draft