Mercurial > repos > galaxytrakr > aws_sra
comparison aws_sra.xml @ 13:2897d365dd62 draft
planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit 619ebd7e6a24be0ec6c2728511290f43b0bad89f
| author | galaxytrakr |
|---|---|
| date | Mon, 23 Mar 2026 20:04:52 +0000 |
| parents | 76192dc490d2 |
| children | 27569ff426e0 |
comparison
equal
deleted
inserted
replaced
| 12:76192dc490d2 | 13:2897d365dd62 |
|---|---|
| 1 <tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.3.0+gt_0.12" profile="23.0"> | 1 <tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.3.0+gt_0.13" profile="23.0"> |
| 2 <description>Fetch SRA data files from NCBI's public AWS S3 buckets</description> | 2 <description>Fetches SRA runs from AWS and converts them to FASTQ</description> |
| 3 | 3 |
| 4 <requirements> | 4 <requirements> |
| 5 <requirement type="package" version="2.34.8">awscli</requirement> | 5 <requirement type="package" version="2.34.8">awscli</requirement> |
| 6 <requirement type="package" version="3.2.1">sra-tools</requirement> | 6 <requirement type="package" version="3.2.1">sra-tools</requirement> |
| 7 <requirement type="package" version="2.8">pigz</requirement> | 7 <requirement type="package" version="2.8">pigz</requirement> |
| 8 </requirements> | 8 </requirements> |
| 9 | 9 |
| 10 <version_command>aws --version</version_command> | 10 <version_command>fasterq-dump --version</version_command> |
| 11 | 11 |
| 12 <command detect_errors="exit_code"><![CDATA[ | 12 <command detect_errors="aggressive"><![CDATA[ |
| 13 ## ── Resolve bucket base URL ────────────────────────────────────────────── | 13 ## This loop handles both 'single' and 'batch' modes. |
| 14 #if $source.bucket == 'sra_pub_run_odp' | 14 #for $acc_line in $run_type.mode == 'single' and str($run_type.accession).split() or $run_type.accession_list.lines: |
| 15 #set $s3_base = 's3://sra-pub-run-odp' | 15 #set $acc = $acc_line.strip() |
| 16 #elif $source.bucket == 'sra_pub_src_1' | 16 #if $acc: |
| 17 #set $s3_base = 's3://sra-pub-src-1' | |
| 18 #elif $source.bucket == 'sra_pub_src_2' | |
| 19 #set $s3_base = 's3://sra-pub-src-2' | |
| 20 #elif $source.bucket == 'sra_pub_metadata' | |
| 21 #set $s3_base = 's3://sra-pub-metadata-us-east-1/sra/metadata' | |
| 22 #end if | |
| 23 | 17 |
| 24 ## ── LIST mode ──────────────────────────────────────────────────────────── | 18 echo "Processing accession: $acc" && |
| 25 #if $action.mode == 'list' | 19 |
| 26 #set $s3_path = $s3_base | 20 ## 1. Create unique directories for this accession |
| 27 #if $source.prefix | 21 mkdir -p sra_cache_${acc} fastq_out_${acc} && |
| 28 #set $s3_path = $s3_path + '/' + $source.prefix.strip("/") | 22 |
| 23 ## 2. Download the file from S3 using the discovered path format | |
| 24 aws s3 cp --no-sign-request 's3://sra-pub-run-odp/sra/${acc}/${acc}' ./sra_cache_${acc}/ && | |
| 25 | |
| 26 ## 3. Convert with fasterq-dump, using the correct argument order | |
| 27 fasterq-dump --outdir ./fastq_out_${acc} --temp . --threads \${GALAXY_SLOTS:-4} --split-files ./sra_cache_${acc}/${acc} && | |
| 28 | |
| 29 ## 4. Compress with pigz | |
| 30 pigz -p \${GALAXY_SLOTS:-4} ./fastq_out_${acc}/*.fastq && | |
| 31 | |
| 32 ## 5. Move outputs into the job working directory, which is where discover_datasets looks when no directory attribute is given | |
| 33 #if $layout == 'paired' | |
| 34 mv ./fastq_out_${acc}/${acc}_1.fastq.gz ./${acc}_1.fastq.gz && | |
| 35 mv ./fastq_out_${acc}/${acc}_2.fastq.gz ./${acc}_2.fastq.gz && | |
| 36 #else | |
| 37 mv ./fastq_out_${acc}/*.fastq.gz ./${acc}_1.fastq.gz && | |
| 38 #end if | |
| 39 | |
| 40 ## 6. Clean up temporary files, then stop the whole job if any step for this accession failed (each shell statement is terminated explicitly so loop iterations cannot run together) | |
| 41 rm -rf sra_cache_${acc} fastq_out_${acc} ; if [ \$? -ne 0 ]; then exit 1; fi ; | |
| 42 | |
| 29 #end if | 43 #end if |
| 30 aws s3 ls | 44 #end for |
| 31 --no-sign-request | |
| 32 #if $action.recursive | |
| 33 --recursive | |
| 34 #end if | |
| 35 $s3_path/ | |
| 36 > '$output_list' | |
| 37 | |
| 38 ## ── DOWNLOAD RAW mode ──────────────────────────────────────────────────── | |
| 39 #elif $action.mode == 'copy' | |
| 40 aws s3 cp | |
| 41 --no-sign-request | |
| 42 #if $action.recursive | |
| 43 --recursive | |
| 44 #end if | |
| 45 '${s3_base}/${ $action.s3_key.strip("/") }' | |
| 46 '$output_data' | |
| 47 | |
| 48 ## ── FASTQ DUMP mode (sra-pub-run-odp only) ─────────────────────────────── | |
| 49 #elif $action.mode == 'fastq_dump' | |
| 50 #set $acc = $action.accession.strip() | |
| 51 | |
| 52 mkdir -p sra_cache && | |
| 53 aws s3 cp --no-sign-request '${s3_base}/sra/${acc}/${acc}' ./sra_cache/${acc} && | |
| 54 mkdir -p fastq_out && | |
| 55 fasterq-dump --outdir ./fastq_out --temp . --threads \${GALAXY_SLOTS:-4} --split-files ./sra_cache/${acc} && | |
| 56 pigz -p \${GALAXY_SLOTS:-4} ./fastq_out/*.fastq && | |
| 57 #if $action.layout == 'paired' | |
| 58 cp ./fastq_out/${acc}_1.fastq.gz '$output_r1' && | |
| 59 cp ./fastq_out/${acc}_2.fastq.gz '$output_r2' | |
| 60 #else | |
| 61 cp ./fastq_out/${acc}.fastq.gz '$output_r1' | |
| 62 #end if | |
| 63 #end if | |
| 64 ]]></command> | 45 ]]></command> |
| 65 | 46 |
| 66 <inputs> | 47 <inputs> |
| 67 <section name="source" title="Data Source" expanded="true"> | 48 <!-- This conditional allows the user to choose a single run or a list of runs --> |
| 68 <param name="bucket" type="select" label="SRA S3 Bucket" | 49 <conditional name="run_type"> |
| 69 help="Select the NCBI SRA AWS Open Data bucket to access. FASTQ conversion is only available for sra-pub-run-odp."> | 50 <param name="mode" type="select" label="Execution Mode" help="Run on a single accession or a list of accessions from a file."> |
| 70 <option value="sra_pub_run_odp" selected="true"> | 51 <option value="single" selected="true">Single Accession</option> |
| 71 sra-pub-run-odp — Open-access SRA runs (.sra format) | 52 <option value="batch">Batch of Accessions</option> |
| 72 </option> | |
| 73 <option value="sra_pub_src_1"> | |
| 74 sra-pub-src-1 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 1] | |
| 75 </option> | |
| 76 <option value="sra_pub_src_2"> | |
| 77 sra-pub-src-2 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 2] | |
| 78 </option> | |
| 79 <option value="sra_pub_metadata"> | |
| 80 sra-pub-metadata-us-east-1 — SRA metadata (Parquet/CSV for Athena/Glue) | |
| 81 </option> | |
| 82 </param> | 53 </param> |
| 83 <param name="prefix" type="text" value="" optional="true" | 54 <when value="single"> |
| 84 label="S3 key prefix (optional)" | 55 <param name="accession" type="text" label="SRA Accession" help="e.g., SRR13333333"/> |
| 85 help="Restrict listing to a sub-path, e.g. 'sra/SRR000001'. Used only in List mode."> | |
| 86 <sanitizer invalid_char=""> | |
| 87 <valid initial="string.printable"> | |
| 88 <remove value="'"/> | |
| 89 <remove value='"'/> | |
| 90 </valid> | |
| 91 </sanitizer> | |
| 92 </param> | |
| 93 </section> | |
| 94 | |
| 95 <conditional name="action"> | |
| 96 <param name="mode" type="select" label="Action"> | |
| 97 <option value="list" selected="true">List objects</option> | |
| 98 <option value="copy">Download raw file(s)</option> | |
| 99 <option value="fastq_dump">Download and convert to FASTQ (sra-pub-run-odp only)</option> | |
| 100 </param> | |
| 101 | |
| 102 <!-- ── LIST ── --> | |
| 103 <when value="list"> | |
| 104 <param name="recursive" type="boolean" truevalue="--recursive" falsevalue="" | |
| 105 checked="false" label="List recursively" | |
| 106 help="List all objects under the prefix, not just the immediate level."/> | |
| 107 </when> | 56 </when> |
| 108 | 57 <when value="batch"> |
| 109 <!-- ── COPY ── --> | 58 <param name="accession_list" type="data" format="txt" label="List of SRA Accessions" help="A plain text file with one SRA accession per line."/> |
| 110 <when value="copy"> | |
| 111 <param name="s3_key" type="text" label="S3 key to download" | |
| 112 help="Full key of the object to download, e.g. 'sra/SRR000001/SRR000001.sra'."> | |
| 113 <validator type="empty_field" message="An S3 key is required for download."/> | |
| 114 <sanitizer invalid_char=""> | |
| 115 <valid initial="string.printable"> | |
| 116 <remove value="'"/> | |
| 117 <remove value='"'/> | |
| 118 </valid> | |
| 119 </sanitizer> | |
| 120 </param> | |
| 121 <param name="recursive" type="boolean" truevalue="--recursive" falsevalue="" | |
| 122 checked="false" label="Download recursively" | |
| 123 help="Download all objects with this prefix rather than a single object."/> | |
| 124 </when> | |
| 125 | |
| 126 <!-- ── FASTQ DUMP ── --> | |
| 127 <when value="fastq_dump"> | |
| 128 <param name="accession" type="text" label="SRA Accession" | |
| 129 help="SRA run accession to fetch and convert, e.g. SRR000001. Must be present in sra-pub-run-odp."> | |
| 130 </param> | |
| 131 <param name="layout" type="select" label="Read layout" | |
| 132 help="Paired-end produces two datasets labelled accession_1 and accession_2. Single-end produces one dataset. Check the SRA record to confirm layout before running."> | |
| 133 <option value="paired" selected="true">Paired-end (R1 + R2)</option> | |
| 134 <option value="single">Single-end</option> | |
| 135 </param> | |
| 136 </when> | 59 </when> |
| 137 </conditional> | 60 </conditional> |
| 61 | |
| 62 <!-- This layout parameter is always required --> | |
| 63 <param name="layout" type="select" label="Read layout" help="Check the SRA record to confirm layout before running."> | |
| 64 <option value="paired" selected="true">Paired-end (R1 + R2)</option> | |
| 65 <option value="single">Single-end</option> | |
| 66 </param> | |
| 138 </inputs> | 67 </inputs> |
| 139 | 68 |
| 140 <outputs> | 69 <outputs> |
| 141 <!-- List output --> | 70 <!-- Each collection gathers the files whose names match its discover_datasets pattern; the named group is entity-escaped because a raw less-than sign is illegal inside an XML attribute value --> |
| 142 <data name="output_list" format="txt" | 71 <collection name="output_r1" type="list" label="${tool.name}: FASTQ Reads (R1)"> |
| 143 label="SRA S3 listing: ${source.prefix}"> | 72 <discover_datasets pattern="(?P&lt;designation&gt;.+)_1\.fastq\.gz" format="fastqsanger.gz" /> |
| 144 <filter>action['mode'] == 'list'</filter> | 73 </collection> |
| 145 </data> | 74 <collection name="output_r2" type="list" label="${tool.name}: FASTQ Reads (R2)"> |
| 146 | 75 <discover_datasets pattern="(?P&lt;designation&gt;.+)_2\.fastq\.gz" format="fastqsanger.gz" /> |
| 147 <!-- Raw download --> | 76 <filter>layout == 'paired'</filter> |
| 148 <data name="output_data" format="auto" | 77 </collection> |
| 149 label="SRA download: ${action.s3_key}"> | |
| 150 <filter>action['mode'] == 'copy'</filter> | |
| 151 </data> | |
| 152 | |
| 153 <!-- FASTQ R1 / single. | |
| 154 Label matches fasterq-dump's native _1 suffix so Galaxy's | |
| 155 "Build List of Dataset Pairs" can auto-detect pairings. --> | |
| 156 <data name="output_r1" format="fastqsanger.gz" | |
| 157 label="${action.accession}_1"> | |
| 158 <filter>action['mode'] == 'fastq_dump'</filter> | |
| 159 </data> | |
| 160 | |
| 161 <!-- FASTQ R2 (paired-end only) --> | |
| 162 <data name="output_r2" format="fastqsanger.gz" | |
| 163 label="${action.accession}_2"> | |
| 164 <filter>action['mode'] == 'fastq_dump' and action['layout'] == 'paired'</filter> | |
| 165 </data> | |
| 166 </outputs> | 78 </outputs> |
| 167 | 79 |
| 168 <tests> | 80 <tests> |
| 169 <!-- Test 1: list mode --> | |
| 170 <test expect_num_outputs="1"> | |
| 171 <section name="source"> | |
| 172 <param name="bucket" value="sra_pub_run_odp"/> | |
| 173 <param name="prefix" value="sra/SRR000001"/> | |
| 174 </section> | |
| 175 <conditional name="action"> | |
| 176 <param name="mode" value="list"/> | |
| 177 <param name="recursive" value="false"/> | |
| 178 </conditional> | |
| 179 <output name="output_list"> | |
| 180 <assert_contents> | |
| 181 <has_text text="SRR000001"/> | |
| 182 </assert_contents> | |
| 183 </output> | |
| 184 </test> | |
| 185 | |
| 186 <!-- Test 2: fastq_dump paired --> | |
| 187 <test expect_num_outputs="2"> | 81 <test expect_num_outputs="2"> <!-- element names are the discovered designation, i.e. the bare accession without the read suffix --> |
| 188 <section name="source"> | 82 <param name="mode" value="single"/> |
| 189 <param name="bucket" value="sra_pub_run_odp"/> | 83 <param name="accession" value="SRR13333333"/> |
| 190 </section> | 84 <param name="layout" value="paired"/> |
| 191 <conditional name="action"> | 85 <output_collection name="output_r1" type="list" count="1"> |
| 192 <param name="mode" value="fastq_dump"/> | 86 <element name="SRR13333333" ftype="fastqsanger.gz" decompress="true"><assert_contents><has_text text="@SRR13333333"/></assert_contents></element> |
| 193 <param name="accession" value="SRR000001"/> | 87 </output_collection> |
| 194 <param name="layout" value="paired"/> | 88 <output_collection name="output_r2" type="list" count="1"> |
| 195 </conditional> | 89 <element name="SRR13333333" ftype="fastqsanger.gz" decompress="true"><assert_contents><has_text text="@SRR13333333"/></assert_contents></element> |
| 196 <output name="output_r1"> | 90 </output_collection> |
| 197 <assert_contents> | |
| 198 <has_text text="@SRR000001"/> | |
| 199 </assert_contents> | |
| 200 </output> | |
| 201 <output name="output_r2"> | |
| 202 <assert_contents> | |
| 203 <has_text text="@SRR000001"/> | |
| 204 </assert_contents> | |
| 205 </output> | |
| 206 </test> | |
| 207 | |
| 208 <!-- Test 3: fastq_dump single-end --> | |
| 209 <test expect_num_outputs="1"> | |
| 210 <section name="source"> | |
| 211 <param name="bucket" value="sra_pub_run_odp"/> | |
| 212 </section> | |
| 213 <conditional name="action"> | |
| 214 <param name="mode" value="fastq_dump"/> | |
| 215 <param name="accession" value="SRR000001"/> | |
| 216 <param name="layout" value="single"/> | |
| 217 </conditional> | |
| 218 <output name="output_r1"> | |
| 219 <assert_contents> | |
| 220 <has_text text="@SRR000001"/> | |
| 221 </assert_contents> | |
| 222 </output> | |
| 223 </test> | 91 </test> |
| 224 </tests> | 92 </tests> |
| 225 | 93 |
| 226 <help><![CDATA[ | 94 <help><![CDATA[ |
| 227 **NCBI SRA AWS Fetch** | 95 **NCBI SRA AWS Fetch** |
| 228 | 96 |
| 229 This tool fetches data from the `NCBI Sequence Read Archive (SRA)`_ hosted on Amazon S3 | 97 Fetches SRA runs from the public ``sra-pub-run-odp`` bucket on Amazon S3 and converts them to gzip-compressed FASTQ using ``fasterq-dump``. |
| 230 as part of the AWS Open Data program. No AWS account is required. | |
| 231 | 98 |
| 232 ----- | 99 This tool can be run on a single SRA accession or a list of accessions provided as a text file (one per line). |
| 233 | 100 |
| 234 **Available Buckets** | 101 Outputs are automatically organized into collections suitable for downstream analysis. |
| 235 | |
| 236 +------------------------------+------------------------------------------------------------+ | |
| 237 | Bucket | Contents | | |
| 238 +==============================+============================================================+ | |
| 239 | sra-pub-run-odp | All open-access SRA runs in SRA Normalized format (.sra). | | |
| 240 | | Supports FASTQ conversion via this tool. | | |
| 241 +------------------------------+------------------------------------------------------------+ | |
| 242 | sra-pub-src-1 | Source BAM, CRAM, and FASTQ files from PacBio, ONT, 10X. | | |
| 243 +------------------------------+------------------------------------------------------------+ | |
| 244 | sra-pub-src-2 | Same as above (second bucket for source submissions). | | |
| 245 +------------------------------+------------------------------------------------------------+ | |
| 246 | sra-pub-metadata-us-east-1 | SRA metadata in Parquet/CSV format (for Athena / Glue). | | |
| 247 +------------------------------+------------------------------------------------------------+ | |
| 248 | |
| 249 | |
| 250 ----- | |
| 251 | |
| 252 **Listing objects** | |
| 253 | |
| 254 Select **List objects** to see what files are available under a given prefix, e.g. ``sra/SRR000001``. | |
| 255 Leave the prefix blank to browse the bucket root (may return a very large listing). | |
| 256 | |
| 257 ----- | |
| 258 | |
| 259 **Downloading raw files** | |
| 260 | |
| 261 Select **Download raw file(s)** and provide the full S3 key, e.g.:: | |
| 262 | |
| 263 sra/SRR000001/SRR000001.sra | |
| 264 | |
| 265 ----- | |
| 266 | |
| 267 **Download and convert to FASTQ** | |
| 268 | |
| 269 Fetches a single SRA run from ``sra-pub-run-odp`` and converts it to gzip-compressed | |
| 270 FASTQ using ``fasterq-dump``. | |
| 271 | |
| 272 Outputs are labelled ``<accession>_1`` (R1 / forward) and ``<accession>_2`` (R2 / reverse) | |
| 273 for paired-end runs, matching ``fasterq-dump``'s native ``--split-files`` naming. | |
| 274 Single-end runs produce only ``<accession>_1``. | |
| 275 | |
| 276 *Fetching multiple accessions and building a paired collection* | |
| 277 | |
| 278 Run this tool **once per accession** — either manually or by using Galaxy's dataset | |
| 279 collection mapping to fan out over a list of accession identifiers. Keeping one job per | |
| 280 accession means a failed download does not affect the others. | |
| 281 | |
| 282 Once all jobs are complete your history will contain datasets labelled:: | |
| 283 | |
| 284 SRR000001_1 SRR000001_2 | |
| 285 SRR000002_1 SRR000002_2 | |
| 286 ... | |
| 287 | |
| 288 Use **Galaxy's "Build List of Dataset Pairs"** tool to assemble these into a | |
| 289 ``list:paired`` collection. Galaxy will auto-detect the ``_1`` / ``_2`` suffixes | |
| 290 and propose pairings — confirm and name the collection, then pass it directly to | |
| 291 any downstream tool that accepts a paired collection (aligners, QC tools, etc.). | |
| 292 | |
| 293 .. warning:: | |
| 294 | |
| 295 This tool cannot auto-detect read layout from the accession. Check the SRA record | |
| 296 at https://www.ncbi.nlm.nih.gov/sra before running. Selecting the wrong layout will | |
| 297 produce incorrect output. | |
| 298 | |
| 299 ----- | |
| 300 | |
| 301 **Notes** | |
| 302 | |
| 303 - All S3 requests are made without AWS credentials (``--no-sign-request``). | |
| 304 - There is typically a **1–2 day lag** between an accession appearing in SRA Search and | |
| 305 being available in the S3 buckets. | |
| 306 - Controlled-access dbGaP data (``sra-ca-run-odp``) requires AWS credentials and is | |
| 307 **not** supported by this tool. | |
| 308 - ``fasterq-dump`` and ``pigz`` both use ``\${GALAXY_SLOTS}`` threads. Allocate more | |
| 309 cores in your job configuration to speed up conversion of large runs. | |
| 310 | |
| 311 .. _NCBI Sequence Read Archive (SRA): https://www.ncbi.nlm.nih.gov/sra | |
| 312 ]]></help> | 102 ]]></help> |
| 313 | 103 |
| 314 <citations> | 104 <citations> <!-- BibTeX entries; TeX commands and escapes take a single backslash, since nothing unescapes this text before BibTeX parsing --> |
| 315 <citation type="bibtex"> | 105 <citation type="bibtex"> |
| 316 @misc{ncbi_sra_aws, | 106 @misc{ncbi_sra_aws, |
| 317 title = {{NCBI} {SRA} on {AWS} Open Data}, | 107 title = {{NCBI} {SRA} on {AWS} Open Data}, |
| 318 author = {{National Center for Biotechnology Information}}, | 108 author = {{National Center for Biotechnology Information}}, |
| 319 howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}}, | 109 howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}}, |
| 320 note = {Accessed via AWS S3 without credentials} | 110 note = {Accessed via AWS S3 without credentials} |
| 321 } | 111 } |
| 322 </citation> | 112 </citation> |
| 323 <citation type="bibtex"> | 113 <citation type="bibtex"> |
| 324 @article{sra_toolkit, | 114 @article{sra_toolkit, |
| 325 title = {The {NCBI} {SRA} and portable data in biology}, | 115 title = {The {Sequence} {Read} {Archive}}, |
| 326 author = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and | 116 author = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and |
| 327 {International Nucleotide Sequence Database Collaboration}}, | 117 {International Nucleotide Sequence Database Collaboration}}, |
| 328 journal = {Nucleic Acids Research}, | 118 journal = {Nucleic Acids Research}, |
| 329 volume = {39}, | 119 volume = {39}, |
| 330 number = {suppl\_1}, | 120 number = {suppl\_1}, |
| 331 pages = {D19--D21}, | 121 pages = {D19--D21}, |
| 332 year = {2011}, | 122 year = {2011}, |
| 333 doi = {10.1093/nar/gkq1019} | 123 doi = {10.1093/nar/gkq1019} |
| 334 } | 124 } |
| 335 </citation> | 125 </citation> |
| 336 </citations> | 126 </citations> |
| 337 | |
| 338 </tool> | 127 </tool> |
