view aws_sra.xml @ 14:27569ff426e0 draft

planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit de407afdbdbc98c61500d3f8cfabf31d8b0da5d5
author galaxytrakr
date Mon, 23 Mar 2026 20:09:31 +0000
parents 2897d365dd62
children 25cf81d65cb8
line wrap: on
line source

<tool id="aws_sra"
      name="NCBI SRA AWS Fetch"
      version="0.3.0+gt_0.14"
      profile="23.0">
    <!-- Short summary of what the tool does. -->
    <description>Fetches SRA runs from AWS and converts them to FASTQ</description>

    <!-- Pinned packages that supply the aws, fasterq-dump and pigz executables
         invoked by the command section. -->
    <requirements>
        <requirement type="package" version="2.34.8">awscli</requirement>
        <requirement type="package" version="3.2.1">sra-tools</requirement>
        <requirement type="package" version="2.8">pigz</requirement>
    </requirements>

    <!-- Command whose output identifies the fasterq-dump version in use. -->
    <version_command>fasterq-dump --version</version_command>

    <command detect_errors="aggressive"><![CDATA[
        ## Fetch each requested SRA run from the public S3 bucket, convert it to
        ## FASTQ with fasterq-dump and gzip the result with pigz.
        ##
        ## Every shell command after the first is introduced by a leading '&&'.
        ## That keeps the chain valid across Cheetah branches and loop iterations
        ## (no shell operator ever sits on a directive line) and stops the job at
        ## the first failing step.
        #import re

        ## Build the accession list for both execution modes. split() discards
        ## blank lines and surrounding whitespace.
        #if str($run_type.mode) == 'single'
            #set $accessions = str($run_type.accession).split()
        #else
            #set $accessions = open(str($run_type.accession_list)).read().split()
        #end if

        #if not $accessions
            echo 'ERROR: no SRA run accessions were provided' >&2 && exit 1
        #else
            echo 'Number of SRA runs requested: ${len($accessions)}'
            #for $acc in $accessions
                ## Values read from the list file are inserted into the shell
                ## command line, so only well-formed run accessions are accepted.
                #if re.fullmatch('[DES]RR[0-9]+', $acc)
                    && echo 'Processing accession: ${acc}'

                    ## 1. Create per-accession scratch directories
                    && mkdir -p 'sra_cache_${acc}' 'fastq_out_${acc}'

                    ## 2. Download the run object from S3 (anonymous access)
                    && aws s3 cp --no-sign-request 's3://sra-pub-run-odp/sra/${acc}/${acc}' './sra_cache_${acc}/'

                    ## 3. Convert to FASTQ
                    && fasterq-dump --outdir './fastq_out_${acc}' --temp . --threads "\${GALAXY_SLOTS:-4}" --split-files './sra_cache_${acc}/${acc}'

                    ## 4. Compress with pigz (glob left unquoted on purpose)
                    && pigz -p "\${GALAXY_SLOTS:-4}" ./fastq_out_${acc}/*.fastq

                    ## 5. Move the results into the job working directory. The
                    ##    discover_datasets tags in the outputs section name no
                    ##    directory, so this is where the *_1.fastq.gz and
                    ##    *_2.fastq.gz patterns are matched.
                    #if str($layout) == 'paired'
                        && mv './fastq_out_${acc}/${acc}_1.fastq.gz' './${acc}_1.fastq.gz'
                        && mv './fastq_out_${acc}/${acc}_2.fastq.gz' './${acc}_2.fastq.gz'
                    #else
                        ## Single-end reads go to the R1 collection, so the file
                        ## needs the _1 suffix that the R1 pattern requires.
                        && mv ./fastq_out_${acc}/*.fastq.gz './${acc}_1.fastq.gz'
                    #end if

                    ## 6. Remove the per-accession scratch directories
                    && rm -rf 'sra_cache_${acc}' 'fastq_out_${acc}'
                #else
                    && echo 'ERROR: the input contains a value that is not an SRA run accession (expected SRR, ERR or DRR followed by digits)' >&2
                    && exit 1
                #end if
            #end for
        #end if
    ]]></command>

    <inputs>
        <!-- Selects between one accession typed into the form and a text file of accessions. -->
        <conditional name="run_type">
            <param name="mode" type="select" label="Execution Mode" help="Run on a single accession or a list of accessions from a file.">
                <option value="single" selected="true">Single Accession</option>
                <option value="batch">Batch of Accessions</option>
            </param>
            <when value="single">
                <param name="accession" type="text" label="SRA Accession" help="e.g., SRR13333333">
                    <!-- Reject empty or malformed values in the form instead of letting the job
                         fail later. Several space-separated accessions remain allowed because
                         the command section splits this value on whitespace. -->
                    <validator type="regex" message="Enter an SRA run accession: SRR, ERR or DRR followed by digits (e.g., SRR13333333).">^ *[DES]RR[0-9]+( +[DES]RR[0-9]+)* *$</validator>
                </param>
            </when>
            <when value="batch">
                <param name="accession_list" type="data" format="txt" label="List of SRA Accessions" help="A plain text file with one SRA accession per line."/>
            </when>
        </conditional>

        <!-- Read layout applies to every accession in the run, in both execution modes. -->
        <param name="layout" type="select" label="Read layout" help="Check the SRA record to confirm layout before running.">
            <option value="paired" selected="true">Paired-end (R1 + R2)</option>
            <option value="single">Single-end</option>
        </param>
    </inputs>

    <outputs>
        <!-- List collections filled by filename discovery after the job finishes.
             The named group "designation" becomes the element identifier, i.e. the
             part of the file name before _1.fastq.gz or _2.fastq.gz.
             The angle brackets of the named group are written as entities because a
             literal less-than sign is not allowed inside an XML attribute value.
             Labels are static: run_type.accession exists only in the "single" branch
             of the conditional, and a shared label made R1 and R2 indistinguishable. -->
        <collection name="output_r1" type="list" label="FASTQ Reads (R1)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)_1\.fastq\.gz" format="fastqsanger.gz" />
        </collection>
        <!-- The R2 collection is only created for paired-end layout. -->
        <collection name="output_r2" type="list" label="FASTQ Reads (R2)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)_2\.fastq\.gz" format="fastqsanger.gz" />
            <filter>layout == 'paired'</filter>
        </collection>
    </outputs>

    <tests>
        <!-- Paired-end fetch of one run. The command downloads from S3, so this test
             needs network access.
             Element names are the "designation" captured by the discovery patterns,
             which excludes the _1 / _2 suffix. The outputs are gzip-compressed, so they
             are decompressed before the text assertion is applied. -->
        <test expect_num_outputs="2">
            <conditional name="run_type">
                <param name="mode" value="single"/>
                <param name="accession" value="SRR13333333"/>
            </conditional>
            <param name="layout" value="paired"/>
            <output_collection name="output_r1" type="list" count="1">
                <element name="SRR13333333" ftype="fastqsanger.gz" decompress="true">
                    <assert_contents>
                        <has_text text="@SRR13333333"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="output_r2" type="list" count="1">
                <element name="SRR13333333" ftype="fastqsanger.gz" decompress="true">
                    <assert_contents>
                        <has_text text="@SRR13333333"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
    </tests>

    <help><![CDATA[
**NCBI SRA AWS Fetch**

Fetches SRA runs from the public `sra-pub-run-odp` bucket on Amazon S3 and converts them to gzip-compressed FASTQ using `fasterq-dump`.

This tool can be run on a single SRA accession or a list of accessions provided as a text file (one per line).

Outputs are automatically organized into collections suitable for downstream analysis.
    ]]></help>

    <!-- BibTeX entries are passed through verbatim: XML does not treat the backslash
         as an escape character, so TeX commands are written with a single backslash. -->
    <citations>
        <citation type="bibtex">
@misc{ncbi_sra_aws,
  title        = {{NCBI} {SRA} on {AWS} Open Data},
  author       = {{National Center for Biotechnology Information}},
  howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}},
  note         = {Accessed via AWS S3 without credentials}
}
        </citation>
        <citation type="bibtex">
@article{sra_toolkit,
  title   = {The {Sequence} {Read} {Archive}},
  author  = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and
             {International Nucleotide Sequence Database Collaboration}},
  journal = {Nucleic Acids Research},
  volume  = {39},
  number  = {suppl\_1},
  pages   = {D19--D21},
  year    = {2011},
  doi     = {10.1093/nar/gkq1019}
}
        </citation>
    </citations>
</tool>