view aws_sra.xml @ 5:313a1e088e09 draft

planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit f79fae538cabdbb93082069dc04d9f203500a481
author galaxytrakr
date Mon, 23 Mar 2026 15:52:57 +0000
parents 0e6eba56289f
children 8c60cd4c0ca7
line wrap: on
line source

<tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.3.0+gt_0.5" profile="23.0">
    <description>Fetch SRA data files from NCBI's public AWS S3 buckets</description>

    <!-- Pinned conda packages: awscli for anonymous S3 access, sra-tools for
         fasterq-dump conversion, pigz for parallel gzip of the FASTQ output. -->
    <requirements>
        <requirement type="package" version="2.34.8">awscli</requirement>
        <requirement type="package" version="3.2.1">sra-tools</requirement>
        <requirement type="package" version="2.8">pigz</requirement>
    </requirements>

    <!-- Reported tool version is taken from the awscli binary. -->
    <version_command>aws --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
        ## ── Resolve bucket base URL ──────────────────────────────────────────────
        #if $source.bucket == 'sra_pub_run_odp'
            #set $s3_base = 's3://sra-pub-run-odp'
        #elif $source.bucket == 'sra_pub_src_1'
            #set $s3_base = 's3://sra-pub-src-1'
        #elif $source.bucket == 'sra_pub_src_2'
            #set $s3_base = 's3://sra-pub-src-2'
        #elif $source.bucket == 'sra_pub_metadata'
            #set $s3_base = 's3://sra-pub-metadata-us-east-1/sra/metadata'
        #else
            ## Unreachable with the current select options, but default
            ## defensively: an undefined Cheetah name aborts template rendering
            ## with an opaque error instead of a clean job failure.
            #set $s3_base = 's3://sra-pub-run-odp'
        #end if

        ## ── LIST mode ────────────────────────────────────────────────────────────
        #if $action.mode == 'list'
            aws s3 ls
                --no-sign-request
                #if $source.prefix
                    '${s3_base}/${ $source.prefix.strip("/") }/'
                #else
                    '${s3_base}/'
                #end if
                #if $action.recursive
                    --recursive
                #end if
            > '$output_list'

        ## ── DOWNLOAD RAW mode ────────────────────────────────────────────────────
        #elif $action.mode == 'copy'
            ## NOTE(review): with --recursive, `aws s3 cp` treats the destination
            ## as a directory; a single data output cannot receive a recursive
            ## download correctly — confirm whether recursive copy belongs here.
            aws s3 cp
                --no-sign-request
                #if $action.recursive
                    --recursive
                #end if
                '${s3_base}/${ $action.s3_key.strip("/") }'
                '$output_data'

        ## ── FASTQ DUMP mode (sra-pub-run-odp only) ───────────────────────────────
        #elif $action.mode == 'fastq_dump'
            #set $acc = $action.accession.strip()
            ## The UI states FASTQ conversion is only available for
            ## sra-pub-run-odp, but nothing enforced it: pin the bucket here so
            ## a stray bucket selection cannot build a bogus S3 key.
            #set $odp_base = 's3://sra-pub-run-odp'

            ## 1. Download the .sra file from S3.
            ## NOTE(review): some odp keys are laid out as sra/<acc>/<acc>
            ## without a .sra extension — confirm the key layout for target runs.
            mkdir -p sra_cache &&
            aws s3 cp
                --no-sign-request
                '${odp_base}/sra/${acc}/${acc}.sra'
                './sra_cache/${acc}.sra' &&

            ## 2. Convert with fasterq-dump --split-files.
            ##    Paired runs  → <acc>_1.fastq + <acc>_2.fastq
            ##    Single runs  → <acc>.fastq   (no _1/_2 suffix)
            ##    We always use --split-files; single-end runs simply produce one file.
            mkdir -p fastq_out &&
            fasterq-dump
                './sra_cache/${acc}.sra'
                --outdir ./fastq_out
                --temp .
                --threads \${GALAXY_SLOTS:-4}
                --split-files
            &&

            ## 3. Compress with pigz (fasterq-dump does not gzip natively)
            pigz -p \${GALAXY_SLOTS:-4} ./fastq_out/*.fastq &&

            ## 4. Stage outputs
            #if $action.layout == 'paired'
                cp './fastq_out/${acc}_1.fastq.gz' '$output_r1' &&
                cp './fastq_out/${acc}_2.fastq.gz' '$output_r2'
            #else
                cp './fastq_out/${acc}.fastq.gz' '$output_r1'
            #end if
        #end if
    ]]></command>

    <inputs>
        <section name="source" title="Data Source" expanded="true">
            <param name="bucket" type="select" label="SRA S3 Bucket"
                   help="Select the NCBI SRA AWS Open Data bucket to access. FASTQ conversion is only available for sra-pub-run-odp.">
                <option value="sra_pub_run_odp" selected="true">
                    sra-pub-run-odp — Open-access SRA runs (.sra format)
                </option>
                <option value="sra_pub_src_1">
                    sra-pub-src-1 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 1]
                </option>
                <option value="sra_pub_src_2">
                    sra-pub-src-2 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 2]
                </option>
                <option value="sra_pub_metadata">
                    sra-pub-metadata-us-east-1 — SRA metadata (Parquet/CSV for Athena/Glue)
                </option>
            </param>
            <param name="prefix" type="text" value="" optional="true"
                   label="S3 key prefix (optional)"
                   help="Restrict listing to a sub-path, e.g. 'sra/SRR000001'. Used only in List mode.">
                <sanitizer invalid_char="">
                    <valid initial="string.printable">
                        <remove value="'"/>
                        <remove value='"'/>
                    </valid>
                </sanitizer>
            </param>
        </section>

        <conditional name="action">
            <param name="mode" type="select" label="Action">
                <option value="list" selected="true">List objects</option>
                <option value="copy">Download raw file(s)</option>
                <option value="fastq_dump">Download and convert to FASTQ (sra-pub-run-odp only)</option>
            </param>

            <!-- ── LIST ── -->
            <when value="list">
                <param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
                       checked="false" label="List recursively"
                       help="List all objects under the prefix, not just the immediate level."/>
            </when>

            <!-- ── COPY ── -->
            <when value="copy">
                <param name="s3_key" type="text" label="S3 key to download"
                       help="Full key of the object to download, e.g. 'sra/SRR000001/SRR000001.sra'.">
                    <validator type="empty_field" message="An S3 key is required for download."/>
                    <sanitizer invalid_char="">
                        <valid initial="string.printable">
                            <remove value="'"/>
                            <remove value='"'/>
                        </valid>
                    </sanitizer>
                </param>
                <param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
                       checked="false" label="Download recursively"
                       help="Download all objects with this prefix rather than a single object."/>
            </when>

            <!-- ── FASTQ DUMP ── -->
            <when value="fastq_dump">
                <param name="accession" type="text" label="SRA Accession"
                       help="SRA run accession to fetch and convert, e.g. SRR000001. Must be present in sra-pub-run-odp.">
                    <validator type="empty_field" message="An SRA accession is required."/>
                    <!-- FIX: brackets must not be backslash-escaped here. The
                         previous pattern ^\[SED\]RR\[0-9\]+$ matched only the
                         literal text "[SED]RR[0-9]+" and rejected every real
                         accession. [SED]RR matches the SRR/ERR/DRR prefixes. -->
                    <validator type="regex" message="Must be a valid SRA run accession (SRR, ERR, or DRR followed by digits).">^[SED]RR[0-9]+$</validator>
                </param>
                <param name="layout" type="select" label="Read layout"
                       help="Paired-end produces two datasets labelled accession_1 and accession_2. Single-end produces one dataset. Check the SRA record to confirm layout before running.">
                    <option value="paired" selected="true">Paired-end (R1 + R2)</option>
                    <option value="single">Single-end</option>
                </param>
            </when>
        </conditional>
    </inputs>

    <outputs>
        <!-- List output. Fall back to a fixed label when no prefix was given,
             so the label does not end in a dangling colon. -->
        <data name="output_list" format="txt"
              label="SRA S3 listing: ${source.prefix or '(bucket root)'}">
            <filter>action['mode'] == 'list'</filter>
        </data>

        <!-- Raw download. FIX: format="auto" is not a valid datatype name; the
             schema attribute for sniffing the downloaded file is auto_format. -->
        <data name="output_data" auto_format="true"
              label="SRA download: ${action.s3_key}">
            <filter>action['mode'] == 'copy'</filter>
        </data>

        <!-- FASTQ R1 / single.
             Label matches fasterq-dump's native _1 suffix so Galaxy's
             "Build List of Dataset Pairs" can auto-detect pairings. -->
        <data name="output_r1" format="fastqsanger.gz"
              label="${action.accession}_1">
            <filter>action['mode'] == 'fastq_dump'</filter>
        </data>

        <!-- FASTQ R2 (paired-end only) -->
        <data name="output_r2" format="fastqsanger.gz"
              label="${action.accession}_2">
            <filter>action['mode'] == 'fastq_dump' and action['layout'] == 'paired'</filter>
        </data>
    </outputs>

    <tests>
        <!-- NOTE(review): all three tests perform live, unauthenticated requests
             against the public AWS S3 buckets; they require network access and
             will fail in an offline CI environment. -->

        <!-- Test 1: list mode -->
        <test expect_num_outputs="1">
            <section name="source">
                <param name="bucket" value="sra_pub_run_odp"/>
                <param name="prefix" value="sra/SRR000001"/>
            </section>
            <conditional name="action">
                <param name="mode" value="list"/>
                <param name="recursive" value="false"/>
            </conditional>
            <output name="output_list">
                <assert_contents>
                    <has_text text="SRR000001"/>
                </assert_contents>
            </output>
        </test>

        <!-- Test 2: fastq_dump paired -->
        <!-- NOTE(review): tests 2 and 3 run the same accession with both
             layouts; whether SRR000001 is genuinely paired-end is not
             established here — confirm against the SRA record. -->
        <test expect_num_outputs="2">
            <section name="source">
                <param name="bucket" value="sra_pub_run_odp"/>
            </section>
            <conditional name="action">
                <param name="mode" value="fastq_dump"/>
                <param name="accession" value="SRR000001"/>
                <param name="layout" value="paired"/>
            </conditional>
            <output name="output_r1">
                <assert_contents>
                    <has_text text="@SRR000001"/>
                </assert_contents>
            </output>
            <output name="output_r2">
                <assert_contents>
                    <has_text text="@SRR000001"/>
                </assert_contents>
            </output>
        </test>

        <!-- Test 3: fastq_dump single-end -->
        <test expect_num_outputs="1">
            <section name="source">
                <param name="bucket" value="sra_pub_run_odp"/>
            </section>
            <conditional name="action">
                <param name="mode" value="fastq_dump"/>
                <param name="accession" value="SRR000001"/>
                <param name="layout" value="single"/>
            </conditional>
            <output name="output_r1">
                <assert_contents>
                    <has_text text="@SRR000001"/>
                </assert_contents>
            </output>
        </test>
    </tests>

    <help><![CDATA[
**NCBI SRA AWS Fetch**

This tool fetches data from the `NCBI Sequence Read Archive (SRA)`_ hosted on Amazon S3
as part of the AWS Open Data program. No AWS account is required.

-----

**Available Buckets**

+------------------------------+------------------------------------------------------------+
| Bucket                       | Contents                                                   |
+==============================+============================================================+
| sra-pub-run-odp              | All open-access SRA runs in SRA Normalized format (.sra).  |
|                              | Supports FASTQ conversion via this tool.                   |
+------------------------------+------------------------------------------------------------+
| sra-pub-src-1                | Source BAM, CRAM, and FASTQ files from PacBio, ONT, 10X.   |
+------------------------------+------------------------------------------------------------+
| sra-pub-src-2                | Same as above (second bucket for source submissions).      |
+------------------------------+------------------------------------------------------------+
| sra-pub-metadata-us-east-1   | SRA metadata in Parquet/CSV format (for Athena / Glue).    |
+------------------------------+------------------------------------------------------------+


-----

**Listing objects**

Select **List objects** to see what files are available under a given prefix, e.g. ``sra/SRR000001``.
Leave the prefix blank to browse the bucket root (may return a very large listing).

-----

**Downloading raw files**

Select **Download raw file(s)** and provide the full S3 key, e.g.::

    sra/SRR000001/SRR000001.sra

-----

**Download and convert to FASTQ**

Fetches a single SRA run from ``sra-pub-run-odp`` and converts it to gzip-compressed
FASTQ using ``fasterq-dump``.

Outputs are labelled ``<accession>_1`` (R1 / forward) and ``<accession>_2`` (R2 / reverse)
for paired-end runs, matching ``fasterq-dump``'s native ``--split-files`` naming.
Single-end runs produce only ``<accession>_1``.

*Fetching multiple accessions and building a paired collection*

Run this tool **once per accession** — either manually or by using Galaxy's dataset
collection mapping to fan out over a list of accession identifiers. Keeping one job per
accession means a failed download does not affect the others.

Once all jobs are complete your history will contain datasets labelled::

    SRR000001_1    SRR000001_2
    SRR000002_1    SRR000002_2
    ...

Use Galaxy's **"Build List of Dataset Pairs"** feature (select the datasets in
your history and choose it from the selection menu — it is a history operation,
not a tool) to assemble these into a ``list:paired`` collection. Galaxy will
auto-detect the ``_1`` / ``_2`` suffixes and propose pairings — confirm and name
the collection, then pass it directly to any downstream tool that accepts a
paired collection (aligners, QC tools, etc.).

.. warning::

   This tool cannot auto-detect read layout from the accession. Check the SRA record
   at https://www.ncbi.nlm.nih.gov/sra before running. Selecting the wrong layout will
   produce incorrect output.

-----

**Notes**

- All S3 requests are made without AWS credentials (``--no-sign-request``).
- There is typically a **1–2 day lag** between an accession appearing in SRA Search and
  being available in the S3 buckets.
- Controlled-access dbGaP data (``sra-ca-run-odp``) requires AWS credentials and is
  **not** supported by this tool.
- ``fasterq-dump`` and ``pigz`` both use ``$GALAXY_SLOTS`` threads. Allocate more
  cores in your job configuration to speed up conversion of large runs.

.. _NCBI Sequence Read Archive (SRA): https://www.ncbi.nlm.nih.gov/sra
    ]]></help>

    <citations>
        <!-- Data source: the NCBI SRA mirror in the AWS Open Data registry. -->
        <citation type="bibtex">
@misc{ncbi_sra_aws,
  title        = {{NCBI} {SRA} on {AWS} Open Data},
  author       = {{National Center for Biotechnology Information}},
  howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}},
  note         = {Accessed via AWS S3 without credentials}
}
        </citation>
        <!-- Primary SRA publication, covering the archive and its toolkit. -->
        <citation type="bibtex">
@article{sra_toolkit,
  title   = {The {NCBI} {SRA} and portable data in biology},
  author  = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and
             {International Nucleotide Sequence Database Collaboration}},
  journal = {Nucleic Acids Research},
  volume  = {39},
  number  = {suppl\_1},
  pages   = {D19--D21},
  year    = {2011},
  doi     = {10.1093/nar/gkq1019}
}
        </citation>
    </citations>

</tool>