Mercurial > repos > galaxytrakr > aws_sra
comparison aws_sra.xml @ 30:73ee30eb273a draft
planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit d0c08749588099d40db3c23bfd554800ac307a30
| author | galaxytrakr |
|---|---|
| date | Tue, 24 Mar 2026 03:09:41 +0000 |
| parents | 569a598c7e68 |
| children | 90ae4d437133 |
comparison
equal
deleted
inserted
replaced
| 29:569a598c7e68 | 30:73ee30eb273a |
|---|---|
| 1 <tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.4.0+gt_0.8" profile="23.0"> | 1 <tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.4.0+gt_0.50" profile="22.01"> |
| 2 <description>Fetches one or more SRA runs from AWS S3 and converts them to FASTQ</description> | 2 <description>Fetches one or more SRA runs from AWS S3 and converts them to FASTQ</description> |
| 3 | 3 |
| 4 <requirements> | 4 <requirements> |
| 5 <requirement type="package" version="3.1.1">sra-tools</requirement> | |
| 6 <requirement type="package" version="2.8">pigz</requirement> | |
| 5 <requirement type="package" version="2.34.8">awscli</requirement> | 7 <requirement type="package" version="2.34.8">awscli</requirement> |
| 6 <requirement type="package" version="3.2.1">sra-tools</requirement> | |
| 7 <requirement type="package" version="2.8">pigz</requirement> | |
| 8 </requirements> | 8 </requirements> |
| 9 | 9 |
| 10 <version_command>fasterq-dump --version</version_command> | 10 <version_command>fasterq-dump --version</version_command> |
| 11 | 11 |
| 12 <command detect_errors="aggressive"><![CDATA[ | 12 <command detect_errors="exit_code"><![CDATA[ |
| 13 ## Create a clean list of accessions from the user input | 13 #if $input.input_select == "accession_number": |
| 14 echo "$accession" | sed 's/,/\n/g; s/ \+/\n/g' | grep . > accessions.txt && | 14 echo '${input.accession}' | sed -r 's/(\,|\;|__cn__)/\n/g' > accessions && |
| 15 | 15 #else: |
| 16 ## Loop over each clean accession | 16 grep '^[[:space:]]*[ESD]RR[0-9]\{1,\}[[:space:]]*$' '${input.file_list}' > accessions && |
| 17 for acc in $(cat accessions.txt); | 17 #end if |
| 18 do | 18 mkdir -p output && |
| 19 echo "Processing accession: $acc" && | 19 mkdir -p outputOther && |
| 20 | 20 for acc in \$(cat ./accessions); |
| 21 ## 1. Create unique directories for this accession | 21 do ( |
| 22 mkdir -p sra_cache_${acc} fastq_out_${acc} && | 22 echo "Processing accession: \$acc" && |
| 23 | 23 mkdir -p sra_cache_\${acc} && |
| 24 ## 2. Download the file from S3 using aws s3 cp | 24 aws s3 cp --no-sign-request "s3://sra-pub-run-odp/sra/\${acc}/\${acc}" ./sra_cache_\${acc}/\${acc} && |
| 25 aws s3 cp --no-sign-request "s3://sra-pub-run-odp/sra/${acc}/${acc}" ./sra_cache_${acc}/ && | 25 fasterq-dump -e \${GALAXY_SLOTS:-4} -t . --split-3 ./sra_cache_\${acc}/\${acc} && |
| 26 | 26 rm -rf sra_cache_\${acc} && |
| 27 ## 3. Convert with fasterq-dump | 27 count="\$(ls \${acc}*.fastq 2>/dev/null | wc -l)" && |
| 28 fasterq-dump --outdir ./fastq_out_${acc} --temp . --threads \${GALAXY_SLOTS:-4} --split-files ./sra_cache_${acc}/${acc} && | 28 echo "Found \$count fastq file(s) for \$acc" && |
| 29 | 29 data=(\$(ls \${acc}*.fastq 2>/dev/null)) && |
| 30 ## 4. Compress with pigz | 30 if [ "\$count" -eq 1 ]; then |
| 31 pigz -p \${GALAXY_SLOTS:-4} ./fastq_out_${acc}/*.fastq && | 31 pigz -cqp \${GALAXY_SLOTS:-4} "\${data[0]}" > output/"\${acc}".fastqsanger.gz && |
| 32 | 32 rm "\${data[0]}"; |
| 33 ## 5. Move outputs for collection discovery | 33 elif [ -e "\${acc}".fastq ]; then |
| 34 #if str($layout) == 'paired' | 34 pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}".fastq > outputOther/"\${acc}"__single.fastqsanger.gz && |
| 35 mv ./fastq_out_${acc}/${acc}_1.fastq.gz '$output_r1.files_path/${acc}_1.fastq.gz' && | 35 pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}"_1.fastq > output/"\${acc}"_1.fastqsanger.gz && |
| 36 mv ./fastq_out_${acc}/${acc}_2.fastq.gz '$output_r2.files_path/${acc}_2.fastq.gz' | 36 pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}"_2.fastq > output/"\${acc}"_2.fastqsanger.gz && |
| 37 #else | 37 rm "\${acc}"*.fastq; |
| 38 mv ./fastq_out_${acc}/${acc}.fastq.gz '$output_r1.files_path/${acc}.fastq.gz' | 38 elif [ "\$count" -eq 2 ]; then |
| 39 #end if && | 39 pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}"_1.fastq > output/"\${acc}"_1.fastqsanger.gz && |
| 40 | 40 pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}"_2.fastq > output/"\${acc}"_2.fastqsanger.gz && |
| 41 ## 6. Clean up | 41 rm "\${acc}"*.fastq; |
| 42 rm -rf sra_cache_${acc} fastq_out_${acc} | 42 else |
| 43 done | 43 for file in \${data[*]}; do |
| 44 pigz -cqp \${GALAXY_SLOTS:-4} "\$file" > outputOther/"\$file"sanger.gz && | |
| 45 rm "\$file"; | |
| 46 done; | |
| 47 fi | |
| 48 ); done; | |
| 49 echo "Done with all accessions." | |
| 44 ]]></command> | 50 ]]></command> |
| 45 | 51 |
| 46 <inputs> | 52 <inputs> |
| 47 <param name="accession" type="text" label="SRA Accession(s)" help="Provide one or more accession numbers (separated by commas, spaces, or newlines)."/> | 53 <conditional name="input"> |
| 48 <param name="layout" type="select" label="Read layout" help="This setting is applied to all accessions."> | 54 <param name="input_select" type="select" label="Select input type"> |
| 49 <option value="paired" selected="true">Paired-end (R1 + R2)</option> | 55 <option value="accession_number">SRA Accession number(s)</option> |
| 50 <option value="single">Single-end</option> | 56 <option value="file_list">File containing accession list</option> |
| 51 </param> | 57 </param> |
| 58 <when value="accession_number"> | |
| 59 <param name="accession" type="text" label="SRA Accession(s)" | |
| 60 help="One or more SRA run accessions (SRR, ERR, or DRR), separated by commas or spaces."> | |
| 61 <validator type="empty_field" message="At least one SRA accession is required."/> | |
| 62 <sanitizer> | |
| 63 <valid initial="string.printable"> | |
| 64 <remove value="'"/> | |
| 65 </valid> | |
| 66 <mapping initial="none"> | |
| 67 <add source="'" target="'"'"'"/> | |
| 68 </mapping> | |
| 69 </sanitizer> | |
| 70 </param> | |
| 71 </when> | |
| 72 <when value="file_list"> | |
| 73 <param name="file_list" type="data" format="txt,tabular" label="Accession list file" | |
| 74 help="A text file with one SRA accession (SRR, ERR, or DRR) per line."/> | |
| 75 </when> | |
| 76 </conditional> | |
| 52 </inputs> | 77 </inputs> |
| 53 | 78 |
| 54 <outputs> | 79 <outputs> |
| 55 <collection name="output_r1" type="list" label="FASTQ Reads (R1) for ${accession}"> | 80 <collection name="list_paired" type="list:paired" label="Paired-end FASTQ (aws_sra)"> |
| 56 <discover_datasets pattern="(?P<designation>.+)_1\.fastq\.gz" format="fastqsanger.gz" /> | 81 <discover_datasets |
| 82 pattern="(?P<identifier_0>[^_]+)_(?P<identifier_1>[12])\.fastqsanger\.gz" | |
| 83 directory="output" | |
| 84 ext="fastqsanger.gz"/> | |
| 57 </collection> | 85 </collection> |
| 58 <collection name="output_r2" type="list" label="FASTQ Reads (R2) for ${accession}"> | 86 <collection name="output_single" type="list" label="Single-end FASTQ (aws_sra)"> |
| 59 <discover_datasets pattern="(?P<designation>.+)_2\.fastq\.gz" format="fastqsanger.gz" /> | 87 <discover_datasets |
| 60 <filter>layout == 'paired'</filter> | 88 pattern="(?P<designation>.+)\.fastqsanger\.gz" |
| 89 directory="output" | |
| 90 ext="fastqsanger.gz"/> | |
| 91 </collection> | |
| 92 <collection name="output_other" type="list" label="Other FASTQ (aws_sra)"> | |
| 93 <discover_datasets | |
| 94 pattern="(?P<designation>.+)\.fastqsanger\.gz" | |
| 95 directory="outputOther" | |
| 96 format="fastqsanger.gz"/> | |
| 61 </collection> | 97 </collection> |
| 62 </outputs> | 98 </outputs> |
| 63 | 99 |
| 64 <tests> | 100 <tests> |
| 65 <test expect_num_outputs="2"> | 101 <test expect_num_outputs="3"> |
| 66 <param name="accession" value="SRR13333333"/> | 102 <conditional name="input"> |
| 67 <param name="layout" value="paired"/> | 103 <param name="input_select" value="accession_number"/> |
| 68 <output_collection name="output_r1" type="list" count="1"> | 104 <param name="accession" value="SRR13333333"/> |
| 69 <element name="SRR13333333_1" ftype="fastqsanger.gz"> | 105 </conditional> |
| 70 <assert_contents> | 106 <output_collection name="list_paired" type="list:paired" count="1"> |
| 71 <has_text text="@SRR13333333"/> | 107 <element name="SRR13333333"> |
| 72 </assert_contents> | 108 <element name="forward" ftype="fastqsanger.gz"> |
| 73 </element> | 109 <assert_contents> |
| 74 </output_collection> | 110 <has_text text="@SRR13333333"/> |
| 75 <output_collection name="output_r2" type="list" count="1"> | 111 </assert_contents> |
| 76 <element name="SRR13333333_2" ftype="fastqsanger.gz"> | 112 </element> |
| 77 <assert_contents> | 113 <element name="reverse" ftype="fastqsanger.gz"> |
| 78 <has_text text="@SRR13333333"/> | 114 <assert_contents> |
| 79 </assert_contents> | 115 <has_text text="@SRR13333333"/> |
| 116 </assert_contents> | |
| 117 </element> | |
| 80 </element> | 118 </element> |
| 81 </output_collection> | 119 </output_collection> |
| 82 </test> | 120 </test> |
| 121 | |
| 83 </tests> | 122 </tests> |
| 84 | 123 |
| 85 <help><![CDATA[ | 124 <help><![CDATA[ |
| 86 **NCBI SRA AWS Fetch** | 125 **NCBI SRA AWS Fetch** |
| 87 | 126 |
| 88 Fetches one or more SRA runs from the public `sra-pub-run-odp` S3 bucket and converts them to gzip-compressed FASTQ using `fasterq-dump`. This tool uses `aws s3 cp` for direct downloads within the AWS environment. | 127 Fetches one or more SRA runs from the public ``sra-pub-run-odp`` S3 bucket and converts them to |
| 128 gzip-compressed FASTQ using ``fasterq-dump``. Downloads use ``aws s3 cp`` with no credentials | |
| 129 required (public bucket). | |
| 130 | |
| 131 **Inputs** | |
| 132 | |
| 133 - **SRA Accession number(s)**: Type one or more accessions (SRR, ERR, or DRR) directly, comma- or space-separated. | |
| 134 - **File containing accession list**: A text file with one accession per line. This option is connectable as a workflow input. | |
| 135 | |
| 136 **Outputs** | |
| 137 | |
| 138 Three collections are always created (some may be empty depending on the data): | |
| 139 | |
| 140 - **Paired-end FASTQ**: A nested ``list:paired`` collection — each accession contains a forward/reverse pair. | |
| 141 - **Single-end FASTQ**: A flat list for reads confirmed single-end by ``--split-3``. | |
| 142 - **Other FASTQ**: Reads that could not be cleanly classified. | |
| 89 ]]></help> | 143 ]]></help> |
| 90 | 144 |
| 91 <citations> | 145 <citations> |
| 92 <citation type="bibtex"> | 146 <citation type="bibtex"> |
| 93 @misc{ncbi_sra_aws, | 147 @misc{ncbi_sra_aws, |
| 94 title = {{NCBI} {SRA} on {AWS} Open Data}, | 148 title = {{NCBI} {SRA} on {AWS} Open Data}, |
| 95 author = {{National Center for Biotechnology Information}}, | 149 author = {{National Center for Biotechnology Information}}, |
| 96 howpublished = {\\url{https://registry.opendata.aws/ncbi-sra/}}, | 150 howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}}, |
| 97 note = {Accessed via AWS S3 without credentials} | 151 note = {Accessed via AWS S3 without credentials} |
| 98 } | 152 } |
| 99 </citation> | 153 </citation> |
| 100 <citation type="bibtex"> | 154 <citation type="doi">10.1093/nar/gkq1019</citation> |
| 101 @article{sra_toolkit, | |
| 102 title = {The {NCBI} {SRA} and portable data in biology}, | |
| 103 author = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and | |
| 104 {International Nucleotide Sequence Database Collaboration}}, | |
| 105 journal = {Nucleic Acids Research}, | |
| 106 volume = {39}, | |
| 107 number = {suppl\\\_1}, | |
| 108 pages = {D19--D21}, | |
| 109 year = {2011}, | |
| 110 doi = {10.1093/nar/gkq1019} | |
| 111 } | |
| 112 </citation> | |
| 113 </citations> | 155 </citations> |
| 114 </tool> | 156 </tool> |
