Mercurial > repos > galaxytrakr > aws_sra
diff aws_sra.xml @ 0:a4afe551dfc9 draft
planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit c55c06b92c0ee0429047bcff1992bf2ec293284a
| author | galaxytrakr |
|---|---|
| date | Mon, 23 Mar 2026 14:09:35 +0000 |
| parents | |
| children | ddfdc4c465e7 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/aws_sra.xml Mon Mar 23 14:09:35 2026 +0000
@@ -0,0 +1,382 @@
+<tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.3.0+gt_0.1" profile="23.0">
+    <description>Fetch SRA data files from NCBI's public AWS S3 buckets</description>
+
+    <!--
+        Three actions against NCBI's public SRA buckets: list objects, download a
+        raw object, or download one run and convert it to gzip-compressed FASTQ.
+        Every S3 request is unsigned, so no AWS credentials are needed.
+    -->
+
+    <requirements>
+        <requirement type="package" version="2.34.8" channel="conda-forge">awscli</requirement>
+        <requirement type="package" version="3.2.1" channel="bioconda">sra-tools</requirement>
+        <requirement type="package" version="2.8" channel="conda-forge">pigz</requirement>
+    </requirements>
+
+    <version_command>aws --version</version_command>
+
+    <command detect_errors="exit_code"><![CDATA[
+        ## ── Resolve bucket base URL ──────────────────────────────────────────────
+        ## Select values use underscores; map each to its real S3 location.
+        #if $source.bucket == 'sra_pub_run_odp'
+            #set $s3_base = 's3://sra-pub-run-odp'
+        #elif $source.bucket == 'sra_pub_src_1'
+            #set $s3_base = 's3://sra-pub-src-1'
+        #elif $source.bucket == 'sra_pub_src_2'
+            #set $s3_base = 's3://sra-pub-src-2'
+        #elif $source.bucket == 'sra_pub_metadata'
+            #set $s3_base = 's3://sra-pub-metadata-us-east-1/sra/metadata'
+        #end if
+
+        ## ── LIST mode ────────────────────────────────────────────────────────────
+        #if $action.mode == 'list'
+            aws s3 ls
+                --no-sign-request
+                #if $source.prefix
+                    '${s3_base}/${ $source.prefix.strip("/") }/'
+                #else
+                    '${s3_base}/'
+                #end if
+                #if $action.recursive
+                    --recursive
+                #end if
+                > '$output_list'
+
+        ## ── DOWNLOAD RAW mode ────────────────────────────────────────────────────
+        ## NOTE(review): with recursion enabled the destination is still the single
+        ## output dataset path; confirm aws s3 cp accepts a file destination for a
+        ## recursive copy, or route recursive downloads to a discovered collection.
+        #elif $action.mode == 'copy'
+            aws s3 cp
+                --no-sign-request
+                #if $action.recursive
+                    --recursive
+                #end if
+                '${s3_base}/${ $action.s3_key.strip("/") }'
+                '$output_data'
+
+        ## ── FASTQ DUMP mode (sra-pub-run-odp only) ───────────────────────────────
+        #elif $action.mode == 'fastq_dump'
+            #if $source.bucket == 'sra_pub_run_odp'
+                #set $acc = $action.accession.strip()
+
+                ## 1. Download the .sra file from S3
+                mkdir -p sra_cache &&
+                aws s3 cp
+                    --no-sign-request
+                    '${s3_base}/sra/${acc}/${acc}.sra'
+                    ./sra_cache/${acc}.sra &&
+
+                ## 2. Convert with fasterq-dump --split-files.
+                ##    Paired runs → <acc>_1.fastq + <acc>_2.fastq
+                ##    Single runs → <acc>.fastq  (no _1/_2 suffix)
+                ##    We always use --split-files; single-end runs simply produce one file.
+                mkdir -p fastq_out &&
+                fasterq-dump
+                    ./sra_cache/${acc}.sra
+                    --outdir ./fastq_out
+                    --temp .
+                    --threads \${GALAXY_SLOTS:-4}
+                    --split-files
+                &&
+
+                ## 3. Compress with pigz (fasterq-dump does not gzip natively)
+                pigz -p \${GALAXY_SLOTS:-4} ./fastq_out/*.fastq &&
+
+                ## 4. Stage outputs
+                #if $action.layout == 'paired'
+                    cp ./fastq_out/${acc}_1.fastq.gz '$output_r1' &&
+                    cp ./fastq_out/${acc}_2.fastq.gz '$output_r2'
+                #else
+                    cp ./fastq_out/${acc}.fastq.gz '$output_r1'
+                #end if
+            #else
+                ## The tool documents FASTQ conversion as sra-pub-run-odp only, so
+                ## fail fast with a clear message instead of an opaque S3 error.
+                echo 'FASTQ conversion is only available for the sra-pub-run-odp bucket.' >&2 &&
+                exit 1
+            #end if
+        #end if
+    ]]></command>
+
+    <inputs>
+        <section name="source" title="Data Source" expanded="true">
+            <param name="bucket" type="select" label="SRA S3 Bucket"
+                   help="Select the NCBI SRA AWS Open Data bucket to access. FASTQ conversion is only available for sra-pub-run-odp.">
+                <option value="sra_pub_run_odp" selected="true">
+                    sra-pub-run-odp — Open-access SRA runs (.sra format)
+                </option>
+                <option value="sra_pub_src_1">
+                    sra-pub-src-1 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 1]
+                </option>
+                <option value="sra_pub_src_2">
+                    sra-pub-src-2 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 2]
+                </option>
+                <option value="sra_pub_metadata">
+                    sra-pub-metadata-us-east-1 — SRA metadata (Parquet/CSV for Athena/Glue)
+                </option>
+            </param>
+            <!-- Quotes are stripped because the value is embedded in a single-quoted shell argument. -->
+            <param name="prefix" type="text" value="" optional="true"
+                   label="S3 key prefix (optional)"
+                   help="Restrict listing to a sub-path, e.g. 'sra/SRR000001'. Used only in List mode.">
+                <sanitizer invalid_char="">
+                    <valid initial="string.printable">
+                        <remove value="'"/>
+                        <remove value='"'/>
+                    </valid>
+                </sanitizer>
+            </param>
+        </section>
+
+        <conditional name="action">
+            <param name="mode" type="select" label="Action">
+                <option value="list" selected="true">List objects</option>
+                <option value="copy">Download raw file(s)</option>
+                <option value="fastq_dump">Download and convert to FASTQ (sra-pub-run-odp only)</option>
+            </param>
+
+            <!-- ── LIST ── -->
+            <when value="list">
+                <param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
+                       checked="false" label="List recursively"
+                       help="List all objects under the prefix, not just the immediate level."/>
+            </when>
+
+            <!-- ── COPY ── -->
+            <when value="copy">
+                <param name="s3_key" type="text" label="S3 key to download"
+                       help="Full key of the object to download, e.g. 'sra/SRR000001/SRR000001.sra'.">
+                    <validator type="empty_field" message="An S3 key is required for download."/>
+                    <sanitizer invalid_char="">
+                        <valid initial="string.printable">
+                            <remove value="'"/>
+                            <remove value='"'/>
+                        </valid>
+                    </sanitizer>
+                </param>
+                <param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
+                       checked="false" label="Download recursively"
+                       help="Download all objects with this prefix rather than a single object."/>
+            </when>
+
+            <!-- ── FASTQ DUMP ── -->
+            <when value="fastq_dump">
+                <param name="accession" type="text" label="SRA Accession"
+                       help="SRA run accession to fetch and convert, e.g. SRR000001. Must be present in sra-pub-run-odp.">
+                    <validator type="empty_field" message="An SRA accession is required."/>
+                    <validator type="regex"
+                               message="Must be a valid SRA run accession (SRR, ERR, or DRR followed by digits)."
+                               expression="^[SED]RR[0-9]+$"/>
+                </param>
+                <param name="layout" type="select" label="Read layout"
+                       help="Paired-end produces two datasets labelled accession_1 and accession_2. Single-end produces one dataset. Check the SRA record to confirm layout before running.">
+                    <option value="paired" selected="true">Paired-end (R1 + R2)</option>
+                    <option value="single">Single-end</option>
+                </param>
+            </when>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <!-- List output -->
+        <data name="output_list" format="txt"
+              label="SRA S3 listing: ${source.prefix}">
+            <filter>action['mode'] == 'list'</filter>
+        </data>
+
+        <!-- Raw download -->
+        <data name="output_data" format="auto"
+              label="SRA download: ${action.s3_key}">
+            <filter>action['mode'] == 'copy'</filter>
+        </data>
+
+        <!-- FASTQ R1 / single.
+             Label matches fasterq-dump's native _1 suffix so Galaxy's
+             "Build List of Dataset Pairs" can auto-detect pairings. -->
+        <data name="output_r1" format="fastqsanger.gz"
+              label="${action.accession}_1">
+            <filter>action['mode'] == 'fastq_dump'</filter>
+        </data>
+
+        <!-- FASTQ R2 (paired-end only) -->
+        <data name="output_r2" format="fastqsanger.gz"
+              label="${action.accession}_2">
+            <filter>action['mode'] == 'fastq_dump' and action['layout'] == 'paired'</filter>
+        </data>
+    </outputs>
+
+    <tests>
+        <!-- Test 1: list mode -->
+        <test expect_num_outputs="1">
+            <section name="source">
+                <param name="bucket" value="sra_pub_run_odp"/>
+                <param name="prefix" value="sra/SRR000001"/>
+            </section>
+            <conditional name="action">
+                <param name="mode" value="list"/>
+                <param name="recursive" value="false"/>
+            </conditional>
+            <output name="output_list">
+                <assert_contents>
+                    <has_text text="SRR000001"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- Test 2: fastq_dump paired.
+             Outputs are gzip-compressed, so decompress="true" is needed for the
+             text assertions to see FASTQ records rather than gzip bytes. -->
+        <test expect_num_outputs="2">
+            <section name="source">
+                <param name="bucket" value="sra_pub_run_odp"/>
+            </section>
+            <conditional name="action">
+                <param name="mode" value="fastq_dump"/>
+                <param name="accession" value="SRR000001"/>
+                <param name="layout" value="paired"/>
+            </conditional>
+            <output name="output_r1" decompress="true">
+                <assert_contents>
+                    <has_text text="@SRR000001"/>
+                </assert_contents>
+            </output>
+            <output name="output_r2" decompress="true">
+                <assert_contents>
+                    <has_text text="@SRR000001"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- Test 3: fastq_dump single-end.
+             NOTE(review): this reuses the accession from the paired test; one run
+             cannot be both layouts, so swap in a confirmed single-end accession. -->
+        <test expect_num_outputs="1">
+            <section name="source">
+                <param name="bucket" value="sra_pub_run_odp"/>
+            </section>
+            <conditional name="action">
+                <param name="mode" value="fastq_dump"/>
+                <param name="accession" value="SRR000001"/>
+                <param name="layout" value="single"/>
+            </conditional>
+            <output name="output_r1" decompress="true">
+                <assert_contents>
+                    <has_text text="@SRR000001"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+**NCBI SRA AWS Fetch**
+
+This tool fetches data from the `NCBI Sequence Read Archive (SRA)`_ hosted on Amazon S3
+as part of the AWS Open Data program. No AWS account is required.
+
+-----
+
+**Available Buckets**
+
++------------------------------+------------------------------------------------------------+
+| Bucket                       | Contents                                                   |
++==============================+============================================================+
+| sra-pub-run-odp              | All open-access SRA runs in SRA Normalized format (.sra).  |
+|                              | Supports FASTQ conversion via this tool.                   |
++------------------------------+------------------------------------------------------------+
+| sra-pub-src-1                | Source BAM, CRAM, and FASTQ files from PacBio, ONT, 10X.   |
++------------------------------+------------------------------------------------------------+
+| sra-pub-src-2                | Same as above (second bucket for source submissions).      |
++------------------------------+------------------------------------------------------------+
+| sra-pub-metadata-us-east-1   | SRA metadata in Parquet/CSV format (for Athena / Glue).    |
++------------------------------+------------------------------------------------------------+
+
+-----
+
+**Listing objects**
+
+Select **List objects** to see what files are available under a given prefix, e.g. ``sra/SRR000001``.
+Leave the prefix blank to browse the bucket root (may return a very large listing).
+
+-----
+
+**Downloading raw files**
+
+Select **Download raw file(s)** and provide the full S3 key, e.g.::
+
+    sra/SRR000001/SRR000001.sra
+
+-----
+
+**Download and convert to FASTQ**
+
+Fetches a single SRA run from ``sra-pub-run-odp`` and converts it to gzip-compressed
+FASTQ using ``fasterq-dump``.
+
+Outputs are labelled ``<accession>_1`` (R1 / forward) and ``<accession>_2`` (R2 / reverse)
+for paired-end runs, matching ``fasterq-dump``'s native ``--split-files`` naming.
+Single-end runs produce only ``<accession>_1``.
+
+*Fetching multiple accessions and building a paired collection*
+
+Run this tool **once per accession** — either manually or by using Galaxy's dataset
+collection mapping to fan out over a list of accession identifiers. Keeping one job per
+accession means a failed download does not affect the others.
+
+Once all jobs are complete your history will contain datasets labelled::
+
+    SRR000001_1   SRR000001_2
+    SRR000002_1   SRR000002_2
+    ...
+
+Use **Galaxy's "Build List of Dataset Pairs"** tool to assemble these into a
+``list:paired`` collection. Galaxy will auto-detect the ``_1`` / ``_2`` suffixes
+and propose pairings — confirm and name the collection, then pass it directly to
+any downstream tool that accepts a paired collection (aligners, QC tools, etc.).
+
+.. warning::
+
+    This tool cannot auto-detect read layout from the accession. Check the SRA record
+    at https://www.ncbi.nlm.nih.gov/sra before running. Selecting the wrong layout will
+    produce incorrect output.
+
+-----
+
+**Notes**
+
+- All S3 requests are made without AWS credentials (``--no-sign-request``).
+- There is typically a **1–2 day lag** between an accession appearing in SRA Search and
+  being available in the S3 buckets.
+- Controlled-access dbGaP data (``sra-ca-run-odp``) requires AWS credentials and is
+  **not** supported by this tool.
+- ``fasterq-dump`` and ``pigz`` both use ``\${GALAXY_SLOTS}`` threads. Allocate more
+  cores in your job configuration to speed up conversion of large runs.
+
+.. _NCBI Sequence Read Archive (SRA): https://www.ncbi.nlm.nih.gov/sra
+    ]]></help>
+
+    <citations>
+        <citation type="bibtex">
+@misc{ncbi_sra_aws,
+  title        = {{NCBI} {SRA} on {AWS} Open Data},
+  author       = {{National Center for Biotechnology Information}},
+  howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}},
+  note         = {Accessed via AWS S3 without credentials}
+}
+        </citation>
+        <citation type="bibtex">
+@article{sra_toolkit,
+  title   = {The {NCBI} {SRA} and portable data in biology},
+  author  = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and
+             {International Nucleotide Sequence Database Collaboration}},
+  journal = {Nucleic Acids Research},
+  volume  = {39},
+  number  = {suppl\_1},
+  pages   = {D19--D21},
+  year    = {2011},
+  doi     = {10.1093/nar/gkq1019}
+}
+        </citation>
+    </citations>
+
+</tool>
