comparison aws_sra.xml @ 13:2897d365dd62 draft

planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit 619ebd7e6a24be0ec6c2728511290f43b0bad89f
author galaxytrakr
date Mon, 23 Mar 2026 20:04:52 +0000
parents 76192dc490d2
children 27569ff426e0
comparison
equal deleted inserted replaced
12:76192dc490d2 13:2897d365dd62
1 <tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.3.0+gt_0.12" profile="23.0"> 1 <tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.3.0+gt_0.13" profile="23.0">
2 <description>Fetch SRA data files from NCBI's public AWS S3 buckets</description> 2 <description>Fetches SRA runs from AWS and converts them to FASTQ</description>
3 3
4 <requirements> 4 <requirements>
5 <requirement type="package" version="2.34.8">awscli</requirement> 5 <requirement type="package" version="2.34.8">awscli</requirement>
6 <requirement type="package" version="3.2.1">sra-tools</requirement> 6 <requirement type="package" version="3.2.1">sra-tools</requirement>
7 <requirement type="package" version="2.8">pigz</requirement> 7 <requirement type="package" version="2.8">pigz</requirement>
8 </requirements> 8 </requirements>
9 9
10 <version_command>aws --version</version_command> 10 <version_command>fasterq-dump --version</version_command>
11 11
12 <command detect_errors="exit_code"><![CDATA[ 12 <command detect_errors="aggressive"><![CDATA[
13 ## ── Resolve bucket base URL ────────────────────────────────────────────── 13 ## This loop handles both 'single' and 'batch' modes.
14 #if $source.bucket == 'sra_pub_run_odp' 14 #for $acc_line in $run_type.mode == 'single' and str($run_type.accession).split() or $run_type.accession_list.lines:
15 #set $s3_base = 's3://sra-pub-run-odp' 15 #set $acc = $acc_line.strip()
16 #elif $source.bucket == 'sra_pub_src_1' 16 #if $acc:
17 #set $s3_base = 's3://sra-pub-src-1'
18 #elif $source.bucket == 'sra_pub_src_2'
19 #set $s3_base = 's3://sra-pub-src-2'
20 #elif $source.bucket == 'sra_pub_metadata'
21 #set $s3_base = 's3://sra-pub-metadata-us-east-1/sra/metadata'
22 #end if
23 17
24 ## ── LIST mode ──────────────────────────────────────────────────────────── 18 echo "Processing accession: $acc" &&
25 #if $action.mode == 'list' 19
26 #set $s3_path = $s3_base 20 ## 1. Create unique directories for this accession
27 #if $source.prefix 21 mkdir -p sra_cache_${acc} fastq_out_${acc} &&
28 #set $s3_path = $s3_path + '/' + $source.prefix.strip("/") 22
23 ## 2. Download the file from S3 using the discovered path format
24 aws s3 cp --no-sign-request 's3://sra-pub-run-odp/sra/${acc}/${acc}' ./sra_cache_${acc}/ &&
25
26 ## 3. Convert with fasterq-dump, using the correct argument order
27 fasterq-dump --outdir ./fastq_out_${acc} --temp . --threads \${GALAXY_SLOTS:-4} --split-files ./sra_cache_${acc}/${acc} &&
28
29 ## 4. Compress with pigz
30 pigz -p \${GALAXY_SLOTS:-4} ./fastq_out_${acc}/*.fastq &&
31
32 ## 5. Move outputs to special directories Galaxy can discover
33 #if $layout == 'paired'
34 mv ./fastq_out_${acc}/${acc}_1.fastq.gz '$output_r1.files_path/${acc}_1.fastq.gz' &&
35 mv ./fastq_out_${acc}/${acc}_2.fastq.gz '$output_r2.files_path/${acc}_2.fastq.gz'
36 #else
37 mv ./fastq_out_${acc}/*.fastq.gz '$output_r1.files_path/${acc}.fastq.gz'
38 #end if &&
39
40 ## 6. Clean up temporary files
41 rm -rf sra_cache_${acc} fastq_out_${acc}
42
29 #end if 43 #end if
30 aws s3 ls 44 #end for
31 --no-sign-request
32 #if $action.recursive
33 --recursive
34 #end if
35 $s3_path/
36 > '$output_list'
37
38 ## ── DOWNLOAD RAW mode ────────────────────────────────────────────────────
39 #elif $action.mode == 'copy'
40 aws s3 cp
41 --no-sign-request
42 #if $action.recursive
43 --recursive
44 #end if
45 '${s3_base}/${ $action.s3_key.strip("/") }'
46 '$output_data'
47
48 ## ── FASTQ DUMP mode (sra-pub-run-odp only) ───────────────────────────────
49 #elif $action.mode == 'fastq_dump'
50 #set $acc = $action.accession.strip()
51
52 mkdir -p sra_cache &&
53 aws s3 cp --no-sign-request '${s3_base}/sra/${acc}/${acc}' ./sra_cache/${acc} &&
54 mkdir -p fastq_out &&
55 fasterq-dump --outdir ./fastq_out --temp . --threads \${GALAXY_SLOTS:-4} --split-files ./sra_cache/${acc} &&
56 pigz -p \${GALAXY_SLOTS:-4} ./fastq_out/*.fastq &&
57 #if $action.layout == 'paired'
58 cp ./fastq_out/${acc}_1.fastq.gz '$output_r1' &&
59 cp ./fastq_out/${acc}_2.fastq.gz '$output_r2'
60 #else
61 cp ./fastq_out/${acc}.fastq.gz '$output_r1'
62 #end if
63 #end if
64 ]]></command> 45 ]]></command>
65 46
66 <inputs> 47 <inputs>
67 <section name="source" title="Data Source" expanded="true"> 48 <!-- This conditional allows the user to choose a single run or a list of runs -->
68 <param name="bucket" type="select" label="SRA S3 Bucket" 49 <conditional name="run_type">
69 help="Select the NCBI SRA AWS Open Data bucket to access. FASTQ conversion is only available for sra-pub-run-odp."> 50 <param name="mode" type="select" label="Execution Mode" help="Run on a single accession or a list of accessions from a file.">
70 <option value="sra_pub_run_odp" selected="true"> 51 <option value="single" selected="true">Single Accession</option>
71 sra-pub-run-odp — Open-access SRA runs (.sra format) 52 <option value="batch">Batch of Accessions</option>
72 </option>
73 <option value="sra_pub_src_1">
74 sra-pub-src-1 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 1]
75 </option>
76 <option value="sra_pub_src_2">
77 sra-pub-src-2 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 2]
78 </option>
79 <option value="sra_pub_metadata">
80 sra-pub-metadata-us-east-1 — SRA metadata (Parquet/CSV for Athena/Glue)
81 </option>
82 </param> 53 </param>
83 <param name="prefix" type="text" value="" optional="true" 54 <when value="single">
84 label="S3 key prefix (optional)" 55 <param name="accession" type="text" label="SRA Accession" help="e.g., SRR13333333"/>
85 help="Restrict listing to a sub-path, e.g. 'sra/SRR000001'. Used only in List mode.">
86 <sanitizer invalid_char="">
87 <valid initial="string.printable">
88 <remove value="'"/>
89 <remove value='"'/>
90 </valid>
91 </sanitizer>
92 </param>
93 </section>
94
95 <conditional name="action">
96 <param name="mode" type="select" label="Action">
97 <option value="list" selected="true">List objects</option>
98 <option value="copy">Download raw file(s)</option>
99 <option value="fastq_dump">Download and convert to FASTQ (sra-pub-run-odp only)</option>
100 </param>
101
102 <!-- ── LIST ── -->
103 <when value="list">
104 <param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
105 checked="false" label="List recursively"
106 help="List all objects under the prefix, not just the immediate level."/>
107 </when> 56 </when>
108 57 <when value="batch">
109 <!-- ── COPY ── --> 58 <param name="accession_list" type="data" format="txt" label="List of SRA Accessions" help="A plain text file with one SRA accession per line."/>
110 <when value="copy">
111 <param name="s3_key" type="text" label="S3 key to download"
112 help="Full key of the object to download, e.g. 'sra/SRR000001/SRR000001.sra'.">
113 <validator type="empty_field" message="An S3 key is required for download."/>
114 <sanitizer invalid_char="">
115 <valid initial="string.printable">
116 <remove value="'"/>
117 <remove value='"'/>
118 </valid>
119 </sanitizer>
120 </param>
121 <param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
122 checked="false" label="Download recursively"
123 help="Download all objects with this prefix rather than a single object."/>
124 </when>
125
126 <!-- ── FASTQ DUMP ── -->
127 <when value="fastq_dump">
128 <param name="accession" type="text" label="SRA Accession"
129 help="SRA run accession to fetch and convert, e.g. SRR000001. Must be present in sra-pub-run-odp.">
130 </param>
131 <param name="layout" type="select" label="Read layout"
132 help="Paired-end produces two datasets labelled accession_1 and accession_2. Single-end produces one dataset. Check the SRA record to confirm layout before running.">
133 <option value="paired" selected="true">Paired-end (R1 + R2)</option>
134 <option value="single">Single-end</option>
135 </param>
136 </when> 59 </when>
137 </conditional> 60 </conditional>
61
62 <!-- This layout parameter is always required -->
63 <param name="layout" type="select" label="Read layout" help="Check the SRA record to confirm layout before running.">
64 <option value="paired" selected="true">Paired-end (R1 + R2)</option>
65 <option value="single">Single-end</option>
66 </param>
138 </inputs> 67 </inputs>
139 68
140 <outputs> 69 <outputs>
141 <!-- List output --> 70 <!-- These collections will gather all the files produced by the loop -->
142 <data name="output_list" format="txt" 71 <collection name="output_r1" type="list" label="${run_type.accession or 'FASTQ Reads (R1)'}">
143 label="SRA S3 listing: ${source.prefix}"> 72 <discover_datasets pattern="(?P<designation>.+)_1\.fastq\.gz" format="fastqsanger.gz" />
144 <filter>action['mode'] == 'list'</filter> 73 </collection>
145 </data> 74 <collection name="output_r2" type="list" label="${run_type.accession or 'FASTQ Reads (R2)'}">
146 75 <discover_datasets pattern="(?P<designation>.+)_2\.fastq\.gz" format="fastqsanger.gz" />
147 <!-- Raw download --> 76 <filter>layout == 'paired'</filter>
148 <data name="output_data" format="auto" 77 </collection>
149 label="SRA download: ${action.s3_key}">
150 <filter>action['mode'] == 'copy'</filter>
151 </data>
152
153 <!-- FASTQ R1 / single.
154 Label matches fasterq-dump's native _1 suffix so Galaxy's
155 "Build List of Dataset Pairs" can auto-detect pairings. -->
156 <data name="output_r1" format="fastqsanger.gz"
157 label="${action.accession}_1">
158 <filter>action['mode'] == 'fastq_dump'</filter>
159 </data>
160
161 <!-- FASTQ R2 (paired-end only) -->
162 <data name="output_r2" format="fastqsanger.gz"
163 label="${action.accession}_2">
164 <filter>action['mode'] == 'fastq_dump' and action['layout'] == 'paired'</filter>
165 </data>
166 </outputs> 78 </outputs>
167 79
168 <tests> 80 <tests>
169 <!-- Test 1: list mode -->
170 <test expect_num_outputs="1">
171 <section name="source">
172 <param name="bucket" value="sra_pub_run_odp"/>
173 <param name="prefix" value="sra/SRR000001"/>
174 </section>
175 <conditional name="action">
176 <param name="mode" value="list"/>
177 <param name="recursive" value="false"/>
178 </conditional>
179 <output name="output_list">
180 <assert_contents>
181 <has_text text="SRR000001"/>
182 </assert_contents>
183 </output>
184 </test>
185
186 <!-- Test 2: fastq_dump paired -->
187 <test expect_num_outputs="2"> 81 <test expect_num_outputs="2">
188 <section name="source"> 82 <param name="mode" value="single"/>
189 <param name="bucket" value="sra_pub_run_odp"/> 83 <param name="accession" value="SRR13333333"/>
190 </section> 84 <param name="layout" value="paired"/>
191 <conditional name="action"> 85 <output_collection name="output_r1" type="list" count="1">
192 <param name="mode" value="fastq_dump"/> 86 <element name="SRR13333333_1" ftype="fastqsanger.gz" has_text="@SRR13333333"/>
193 <param name="accession" value="SRR000001"/> 87 </output_collection>
194 <param name="layout" value="paired"/> 88 <output_collection name="output_r2" type="list" count="1">
195 </conditional> 89 <element name="SRR13333333_2" ftype="fastqsanger.gz" has_text="@SRR13333333"/>
196 <output name="output_r1"> 90 </output_collection>
197 <assert_contents>
198 <has_text text="@SRR000001"/>
199 </assert_contents>
200 </output>
201 <output name="output_r2">
202 <assert_contents>
203 <has_text text="@SRR000001"/>
204 </assert_contents>
205 </output>
206 </test>
207
208 <!-- Test 3: fastq_dump single-end -->
209 <test expect_num_outputs="1">
210 <section name="source">
211 <param name="bucket" value="sra_pub_run_odp"/>
212 </section>
213 <conditional name="action">
214 <param name="mode" value="fastq_dump"/>
215 <param name="accession" value="SRR000001"/>
216 <param name="layout" value="single"/>
217 </conditional>
218 <output name="output_r1">
219 <assert_contents>
220 <has_text text="@SRR000001"/>
221 </assert_contents>
222 </output>
223 </test> 91 </test>
224 </tests> 92 </tests>
225 93
226 <help><![CDATA[ 94 <help><![CDATA[
227 **NCBI SRA AWS Fetch** 95 **NCBI SRA AWS Fetch**
228 96
229 This tool fetches data from the `NCBI Sequence Read Archive (SRA)`_ hosted on Amazon S3 97 Fetches SRA runs from the public `sra-pub-run-odp` bucket on Amazon S3 and converts them to gzip-compressed FASTQ using `fasterq-dump`.
230 as part of the AWS Open Data program. No AWS account is required.
231 98
232 ----- 99 This tool can be run on a single SRA accession or a list of accessions provided as a text file (one per line).
233 100
234 **Available Buckets** 101 Outputs are automatically organized into collections suitable for downstream analysis.
235
236 +------------------------------+------------------------------------------------------------+
237 | Bucket | Contents |
238 +==============================+============================================================+
239 | sra-pub-run-odp | All open-access SRA runs in SRA Normalized format (.sra). |
240 | | Supports FASTQ conversion via this tool. |
241 +------------------------------+------------------------------------------------------------+
242 | sra-pub-src-1 | Source BAM, CRAM, and FASTQ files from PacBio, ONT, 10X. |
243 +------------------------------+------------------------------------------------------------+
244 | sra-pub-src-2 | Same as above (second bucket for source submissions). |
245 +------------------------------+------------------------------------------------------------+
246 | sra-pub-metadata-us-east-1 | SRA metadata in Parquet/CSV format (for Athena / Glue). |
247 +------------------------------+------------------------------------------------------------+
248
249
250 -----
251
252 **Listing objects**
253
254 Select **List objects** to see what files are available under a given prefix, e.g. ``sra/SRR000001``.
255 Leave the prefix blank to browse the bucket root (may return a very large listing).
256
257 -----
258
259 **Downloading raw files**
260
261 Select **Download raw file(s)** and provide the full S3 key, e.g.::
262
263 sra/SRR000001/SRR000001.sra
264
265 -----
266
267 **Download and convert to FASTQ**
268
269 Fetches a single SRA run from ``sra-pub-run-odp`` and converts it to gzip-compressed
270 FASTQ using ``fasterq-dump``.
271
272 Outputs are labelled ``<accession>_1`` (R1 / forward) and ``<accession>_2`` (R2 / reverse)
273 for paired-end runs, matching ``fasterq-dump``'s native ``--split-files`` naming.
274 Single-end runs produce only ``<accession>_1``.
275
276 *Fetching multiple accessions and building a paired collection*
277
278 Run this tool **once per accession** — either manually or by using Galaxy's dataset
279 collection mapping to fan out over a list of accession identifiers. Keeping one job per
280 accession means a failed download does not affect the others.
281
282 Once all jobs are complete your history will contain datasets labelled::
283
284 SRR000001_1 SRR000001_2
285 SRR000002_1 SRR000002_2
286 ...
287
288 Use **Galaxy's "Build List of Dataset Pairs"** tool to assemble these into a
289 ``list:paired`` collection. Galaxy will auto-detect the ``_1`` / ``_2`` suffixes
290 and propose pairings — confirm and name the collection, then pass it directly to
291 any downstream tool that accepts a paired collection (aligners, QC tools, etc.).
292
293 .. warning::
294
295 This tool cannot auto-detect read layout from the accession. Check the SRA record
296 at https://www.ncbi.nlm.nih.gov/sra before running. Selecting the wrong layout will
297 produce incorrect output.
298
299 -----
300
301 **Notes**
302
303 - All S3 requests are made without AWS credentials (``--no-sign-request``).
304 - There is typically a **1–2 day lag** between an accession appearing in SRA Search and
305 being available in the S3 buckets.
306 - Controlled-access dbGaP data (``sra-ca-run-odp``) requires AWS credentials and is
307 **not** supported by this tool.
308 - ``fasterq-dump`` and ``pigz`` both use ``\${GALAXY_SLOTS}`` threads. Allocate more
309 cores in your job configuration to speed up conversion of large runs.
310
311 .. _NCBI Sequence Read Archive (SRA): https://www.ncbi.nlm.nih.gov/sra
312 ]]></help> 102 ]]></help>
313 103
314 <citations> 104 <citations>
315 <citation type="bibtex"> 105 <citation type="bibtex">
316 @misc{ncbi_sra_aws, 106 @misc{ncbi_sra_aws,
317 title = {{NCBI} {SRA} on {AWS} Open Data}, 107 title = {{NCBI} {SRA} on {AWS} Open Data},
318 author = {{National Center for Biotechnology Information}}, 108 author = {{National Center for Biotechnology Information}},
319 howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}}, 109 howpublished = {\\url{https://registry.opendata.aws/ncbi-sra/}},
320 note = {Accessed via AWS S3 without credentials} 110 note = {Accessed via AWS S3 without credentials}
321 } 111 }
322 </citation> 112 </citation>
323 <citation type="bibtex"> 113 <citation type="bibtex">
324 @article{sra_toolkit, 114 @article{sra_toolkit,
325 title = {The {NCBI} {SRA} and portable data in biology}, 115 title = {The {NCBI} {SRA} and portable data in biology},
326 author = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and 116 author = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and
327 {International Nucleotide Sequence Database Collaboration}}, 117 {International Nucleotide Sequence Database Collaboration}},
328 journal = {Nucleic Acids Research}, 118 journal = {Nucleic Acids Research},
329 volume = {39}, 119 volume = {39},
330 number = {suppl\_1}, 120 number = {suppl\\\_1},
331 pages = {D19--D21}, 121 pages = {D19--D21},
332 year = {2011}, 122 year = {2011},
333 doi = {10.1093/nar/gkq1019} 123 doi = {10.1093/nar/gkq1019}
334 } 124 }
335 </citation> 125 </citation>
336 </citations> 126 </citations>
337
338 </tool> 127 </tool>