comparison aws_sra.xml @ 0:a4afe551dfc9 draft

planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit c55c06b92c0ee0429047bcff1992bf2ec293284a
author galaxytrakr
date Mon, 23 Mar 2026 14:09:35 +0000
parents
children ddfdc4c465e7
comparison
equal deleted inserted replaced
-1:000000000000 0:a4afe551dfc9
1 <tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.3.0+gt_0.1" profile="23.0">
<!-- Short summary of what the tool does. -->
<description>Fetch SRA data files from NCBI's public AWS S3 buckets</description>
3
<!-- Packages the job environment must provide. -->
<requirements>
    <!-- AWS CLI: performs the "aws s3 ls" and "aws s3 cp" calls. -->
    <requirement channel="conda-forge" type="package" version="2.34.8">awscli</requirement>
    <!-- sra-tools: supplies fasterq-dump for the FASTQ conversion mode. -->
    <requirement channel="bioconda" type="package" version="3.2.1">sra-tools</requirement>
    <!-- pigz: multi-threaded gzip applied to the converted FASTQ files. -->
    <requirement channel="conda-forge" type="package" version="2.8">pigz</requirement>
</requirements>
9
<!-- Command whose output is recorded as the tool's version string (the AWS CLI release). -->
<version_command><![CDATA[aws --version]]></version_command>
11
<command detect_errors="exit_code"><![CDATA[
    ## ---------------------------------------------------------------------
    ## Builds one shell command line for the selected action:
    ##   list       -> "aws s3 ls" into $output_list
    ##   copy       -> "aws s3 cp" into $output_data
    ##   fastq_dump -> download a run, fasterq-dump, pigz, stage R1/R2
    ## All S3 requests are anonymous (--no-sign-request).
    ##
    ## Free-text parameters are always read through str(). str() returns
    ## the value after the parameter's <sanitizer> has been applied, while
    ## calling a string method directly on the parameter (for example
    ## $source.prefix.strip("/")) is forwarded to the raw, unsanitized
    ## value and would let quote characters reach the single-quoted shell
    ## arguments below.
    ## ---------------------------------------------------------------------

    ## Resolve the S3 base URL of the bucket chosen under "Data Source".
    #if $source.bucket == 'sra_pub_run_odp'
        #set $s3_base = 's3://sra-pub-run-odp'
    #elif $source.bucket == 'sra_pub_src_1'
        #set $s3_base = 's3://sra-pub-src-1'
    #elif $source.bucket == 'sra_pub_src_2'
        #set $s3_base = 's3://sra-pub-src-2'
    #elif $source.bucket == 'sra_pub_metadata'
        #set $s3_base = 's3://sra-pub-metadata-us-east-1/sra/metadata'
    #end if

    ## ---- LIST mode ------------------------------------------------------
    #if $action.mode == 'list'
        ## Normalise the optional prefix. A prefix made only of slashes is
        ## treated as empty so the URL never ends in a double slash.
        #set $prefix = ''
        #if $source.prefix
            #set $prefix = str($source.prefix).strip('/')
        #end if
        aws s3 ls
            --no-sign-request
            #if $prefix
                '${s3_base}/${prefix}/'
            #else
                '${s3_base}/'
            #end if
            #if $action.recursive
                --recursive
            #end if
            > '$output_list'

    ## ---- DOWNLOAD RAW mode ----------------------------------------------
    #elif $action.mode == 'copy'
        #set $key = str($action.s3_key).strip('/')
        #if $action.recursive
            ## A recursive copy needs a directory as its destination, but a
            ## Galaxy dataset is a single file. Download into a scratch
            ## directory and bundle the result into one tar archive.
            mkdir -p s3_download &&
            aws s3 cp
                --no-sign-request
                --recursive
                '${s3_base}/${key}'
                ./s3_download/ &&
            tar -cf '$output_data' -C ./s3_download .
        #else
            aws s3 cp
                --no-sign-request
                '${s3_base}/${key}'
                '$output_data'
        #end if

    ## ---- FASTQ DUMP mode (sra-pub-run-odp only) -------------------------
    #elif $action.mode == 'fastq_dump'
        #set $acc = str($action.accession).strip()

        ## 1. Download the run object from S3.
        ##    Conversion is only offered for sra-pub-run-odp, so that bucket
        ##    is addressed directly, whatever bucket is selected above.
        ##    Run objects are keyed sra/<acc>/<acc> with no file extension;
        ##    the local copy is given a .sra suffix.
        mkdir -p sra_cache &&
        aws s3 cp
            --no-sign-request
            's3://sra-pub-run-odp/sra/${acc}/${acc}'
            './sra_cache/${acc}.sra' &&

        ## 2. Convert with fasterq-dump --split-files.
        ##    Paired runs are expected as <acc>_1.fastq + <acc>_2.fastq.
        ##    Single runs are expected as <acc>.fastq; step 4 also accepts
        ##    a lone <acc>_1.fastq.
        mkdir -p fastq_out &&
        fasterq-dump
            './sra_cache/${acc}.sra'
            --outdir ./fastq_out
            --temp .
            --threads \${GALAXY_SLOTS:-4}
            --split-files
        &&

        ## 3. Compress with pigz (fasterq-dump writes plain FASTQ).
        pigz -p \${GALAXY_SLOTS:-4} ./fastq_out/*.fastq &&

        ## 4. Stage outputs into the Galaxy datasets.
        #if $action.layout == 'paired'
            cp './fastq_out/${acc}_1.fastq.gz' '$output_r1' &&
            cp './fastq_out/${acc}_2.fastq.gz' '$output_r2'
        #else
            ## Accept either name for a single-read run, but refuse to hand
            ## back only R1 when a second read file exists: that means the
            ## run is paired and the wrong layout was selected.
            if [ -f './fastq_out/${acc}.fastq.gz' ]; then
                cp './fastq_out/${acc}.fastq.gz' '$output_r1';
            elif [ -f './fastq_out/${acc}_1.fastq.gz' ] && [ ! -f './fastq_out/${acc}_2.fastq.gz' ]; then
                cp './fastq_out/${acc}_1.fastq.gz' '$output_r1';
            else
                echo 'Single-end layout was selected, but no single FASTQ file was produced for ${acc}. Check the read layout of this run.' >&2;
                exit 1;
            fi
        #end if
    #end if
]]></command>
84
<!-- User-facing parameters: the bucket to read from, then the action to perform on it. -->
<inputs>
    <section name="source" title="Data Source" expanded="true">
        <!-- Symbolic bucket name; the command template maps it to an s3:// URL. -->
        <param name="bucket" type="select" label="SRA S3 Bucket"
               help="Select the NCBI SRA AWS Open Data bucket to access. FASTQ conversion is only available for sra-pub-run-odp.">
            <option value="sra_pub_run_odp" selected="true">
                sra-pub-run-odp — Open-access SRA runs (.sra format)
            </option>
            <option value="sra_pub_src_1">
                sra-pub-src-1 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 1]
            </option>
            <option value="sra_pub_src_2">
                sra-pub-src-2 — Source submissions: BAM/CRAM/FASTQ (PacBio, ONT, 10X) [bucket 2]
            </option>
            <option value="sra_pub_metadata">
                sra-pub-metadata-us-east-1 — SRA metadata (Parquet/CSV for Athena/Glue)
            </option>
        </param>
        <!-- Optional key prefix. Quote characters are removed because the value
             is placed inside a single-quoted shell argument. -->
        <param name="prefix" type="text" value="" optional="true"
               label="S3 key prefix (optional)"
               help="Restrict listing to a sub-path, e.g. 'sra/SRR000001'. Used only in List mode.">
            <sanitizer invalid_char="">
                <valid initial="string.printable">
                    <remove value="'"/>
                    <remove value='"'/>
                </valid>
            </sanitizer>
        </param>
    </section>

    <conditional name="action">
        <param name="mode" type="select" label="Action">
            <option value="list" selected="true">List objects</option>
            <option value="copy">Download raw file(s)</option>
            <option value="fastq_dump">Download and convert to FASTQ (sra-pub-run-odp only)</option>
        </param>

        <!-- List mode -->
        <when value="list">
            <param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
                   checked="false" label="List recursively"
                   help="List all objects under the prefix, not just the immediate level."/>
        </when>

        <!-- Copy mode -->
        <when value="copy">
            <!-- The example key has no file extension, matching how run objects
                 are keyed in sra-pub-run-odp. -->
            <param name="s3_key" type="text" label="S3 key to download"
                   help="Full key of the object to download, e.g. 'sra/SRR000001/SRR000001'.">
                <validator type="empty_field" message="An S3 key is required for download."/>
                <sanitizer invalid_char="">
                    <valid initial="string.printable">
                        <remove value="'"/>
                        <remove value='"'/>
                    </valid>
                </sanitizer>
            </param>
            <param name="recursive" type="boolean" truevalue="--recursive" falsevalue=""
                   checked="false" label="Download recursively"
                   help="Download all objects with this prefix rather than a single object."/>
        </when>

        <!-- FASTQ dump mode -->
        <when value="fastq_dump">
            <!-- The regex restricts the value to a run accession, so it is safe
                 to embed in file names and S3 keys. -->
            <param name="accession" type="text" label="SRA Accession"
                   help="SRA run accession to fetch and convert, e.g. SRR000001. Must be present in sra-pub-run-odp.">
                <validator type="empty_field" message="An SRA accession is required."/>
                <validator type="regex"
                           message="Must be a valid SRA run accession (SRR, ERR, or DRR followed by digits)."
                           expression="^[SED]RR[0-9]+$"/>
            </param>
            <!-- Decides whether one or two FASTQ datasets are produced. -->
            <param name="layout" type="select" label="Read layout"
                   help="Paired-end produces two datasets labelled accession_1 and accession_2. Single-end produces one dataset. Check the SRA record to confirm layout before running.">
                <option value="paired" selected="true">Paired-end (R1 + R2)</option>
                <option value="single">Single-end</option>
            </param>
        </when>
    </conditional>
</inputs>
162
<!-- One dataset per action; the filters keep only those relevant to the selected mode. -->
<outputs>
    <!-- Text listing written in List mode. -->
    <data format="txt" name="output_list" label="SRA S3 listing: ${source.prefix}">
        <filter>action['mode'] == 'list'</filter>
    </data>

    <!-- Object fetched in Download mode. -->
    <data format="auto" name="output_data" label="SRA download: ${action.s3_key}">
        <filter>action['mode'] == 'copy'</filter>
    </data>

    <!-- First (or only) FASTQ file. The _1 label suffix mirrors the
         fasterq-dump file naming, so that "Build List of Dataset Pairs"
         can match it with the _2 dataset. -->
    <data format="fastqsanger.gz" name="output_r1" label="${action.accession}_1">
        <filter>action['mode'] == 'fastq_dump'</filter>
    </data>

    <!-- Second FASTQ file; created only for the paired layout. -->
    <data format="fastqsanger.gz" name="output_r2" label="${action.accession}_2">
        <filter>action['mode'] == 'fastq_dump' and action['layout'] == 'paired'</filter>
    </data>
</outputs>
190
<!-- Functional tests. All of them contact the public S3 buckets, so they
     need network access. -->
<tests>
    <!-- Test 1: list mode. The listing under the prefix must mention the accession. -->
    <test expect_num_outputs="1">
        <section name="source">
            <param name="bucket" value="sra_pub_run_odp"/>
            <param name="prefix" value="sra/SRR000001"/>
        </section>
        <conditional name="action">
            <param name="mode" value="list"/>
            <param name="recursive" value="false"/>
        </conditional>
        <output name="output_list">
            <assert_contents>
                <has_text text="SRR000001"/>
            </assert_contents>
        </output>
    </test>

    <!-- Test 2: fastq_dump, paired layout.
         The outputs are gzip-compressed, so decompress="true" is required
         for has_text to see the FASTQ read headers. -->
    <test expect_num_outputs="2">
        <section name="source">
            <param name="bucket" value="sra_pub_run_odp"/>
        </section>
        <conditional name="action">
            <param name="mode" value="fastq_dump"/>
            <param name="accession" value="SRR000001"/>
            <param name="layout" value="paired"/>
        </conditional>
        <output name="output_r1" decompress="true">
            <assert_contents>
                <has_text text="@SRR000001"/>
            </assert_contents>
        </output>
        <output name="output_r2" decompress="true">
            <assert_contents>
                <has_text text="@SRR000001"/>
            </assert_contents>
        </output>
    </test>

    <!-- Test 3: fastq_dump, single layout.
         NOTE(review): this reuses the accession of the paired test above.
         Confirm that this run really yields single-end output, or replace
         it with a known single-end accession. -->
    <test expect_num_outputs="1">
        <section name="source">
            <param name="bucket" value="sra_pub_run_odp"/>
        </section>
        <conditional name="action">
            <param name="mode" value="fastq_dump"/>
            <param name="accession" value="SRR000001"/>
            <param name="layout" value="single"/>
        </conditional>
        <output name="output_r1" decompress="true">
            <assert_contents>
                <has_text text="@SRR000001"/>
            </assert_contents>
        </output>
    </test>
</tests>
248
<!-- User documentation in reStructuredText. The text is not passed through
     the command template engine, so shell variables are written without a
     backslash. Literal blocks and directive bodies must stay indented. -->
<help><![CDATA[
**NCBI SRA AWS Fetch**

This tool fetches data from the `NCBI Sequence Read Archive (SRA)`_ hosted on Amazon S3
as part of the AWS Open Data program. No AWS account is required.

-----

**Available Buckets**

+------------------------------+------------------------------------------------------------+
| Bucket                       | Contents                                                   |
+==============================+============================================================+
| sra-pub-run-odp              | All open-access SRA runs in SRA Normalized format (.sra).  |
|                              | Supports FASTQ conversion via this tool.                   |
+------------------------------+------------------------------------------------------------+
| sra-pub-src-1                | Source BAM, CRAM, and FASTQ files from PacBio, ONT, 10X.   |
+------------------------------+------------------------------------------------------------+
| sra-pub-src-2                | Same as above (second bucket for source submissions).      |
+------------------------------+------------------------------------------------------------+
| sra-pub-metadata-us-east-1   | SRA metadata in Parquet/CSV format (for Athena / Glue).    |
+------------------------------+------------------------------------------------------------+

-----

**Listing objects**

Select **List objects** to see what files are available under a given prefix, e.g. ``sra/SRR000001``.
Leave the prefix blank to browse the bucket root (may return a very large listing).

-----

**Downloading raw files**

Select **Download raw file(s)** and provide the full S3 key, e.g.::

    sra/SRR000001/SRR000001

Run objects in ``sra-pub-run-odp`` are stored under ``sra/<accession>/<accession>``,
without a file extension. Use **List objects** first if you are unsure of the exact key.

-----

**Download and convert to FASTQ**

Fetches a single SRA run from ``sra-pub-run-odp`` and converts it to gzip-compressed
FASTQ using ``fasterq-dump``.

Outputs are labelled ``<accession>_1`` (R1 / forward) and ``<accession>_2`` (R2 / reverse)
for paired-end runs, matching ``fasterq-dump``'s native ``--split-files`` naming.
Single-end runs produce only ``<accession>_1``.

*Fetching multiple accessions and building a paired collection*

Run this tool **once per accession** — either manually or by using Galaxy's dataset
collection mapping to fan out over a list of accession identifiers. Keeping one job per
accession means a failed download does not affect the others.

Once all jobs are complete your history will contain datasets labelled::

    SRR000001_1    SRR000001_2
    SRR000002_1    SRR000002_2
    ...

Use **Galaxy's "Build List of Dataset Pairs"** tool to assemble these into a
``list:paired`` collection. Galaxy will auto-detect the ``_1`` / ``_2`` suffixes
and propose pairings — confirm and name the collection, then pass it directly to
any downstream tool that accepts a paired collection (aligners, QC tools, etc.).

.. warning::

   This tool cannot auto-detect read layout from the accession. Check the SRA record
   at https://www.ncbi.nlm.nih.gov/sra before running. Selecting the wrong layout will
   produce incorrect output.

-----

**Notes**

- All S3 requests are made without AWS credentials (``--no-sign-request``).
- There is typically a **1–2 day lag** between an accession appearing in SRA Search and
  being available in the S3 buckets.
- Controlled-access dbGaP data (``sra-ca-run-odp``) requires AWS credentials and is
  **not** supported by this tool.
- ``fasterq-dump`` and ``pigz`` both use ``$GALAXY_SLOTS`` threads (4 if the variable is
  unset). Allocate more cores in your job configuration to speed up conversion of
  large runs.

.. _NCBI Sequence Read Archive (SRA): https://www.ncbi.nlm.nih.gov/sra
]]></help>
335
<!-- References users should cite: the data source and the archive itself. -->
<citations>
    <!-- The AWS Open Data registry entry for the SRA buckets. -->
    <citation type="bibtex">
        @misc{ncbi_sra_aws,
            title        = {{NCBI} {SRA} on {AWS} Open Data},
            author       = {{National Center for Biotechnology Information}},
            howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}},
            note         = {Accessed via AWS S3 without credentials}
        }
    </citation>
    <!-- The Sequence Read Archive paper. The title matches the article
         published under the DOI below. -->
    <citation type="bibtex">
        @article{sra_toolkit,
            title   = {The {Sequence} {Read} {Archive}},
            author  = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and
                       {International Nucleotide Sequence Database Collaboration}},
            journal = {Nucleic Acids Research},
            volume  = {39},
            number  = {suppl\_1},
            pages   = {D19--D21},
            year    = {2011},
            doi     = {10.1093/nar/gkq1019}
        }
    </citation>
</citations>
359
360 </tool>