view aws_sra.xml @ 18:5680c31cd031 draft

planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit 593c33d4de9fd7663766c8463d56e4defd608b04
author galaxytrakr
date Mon, 23 Mar 2026 20:51:03 +0000
parents 9fb80e0392ce
children a4186132e1c4
line wrap: on
line source

<tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.3.0+gt_0.18" profile="23.0">
    <description>Fetches SRA runs from AWS and converts them to FASTQ</description>
    <!-- awscli downloads the run object, sra-tools provides fasterq-dump, pigz compresses the FASTQ output. -->
    <requirements>
        <requirement type="package" version="2.34.8">awscli</requirement>
        <requirement type="package" version="3.2.1">sra-tools</requirement>
        <requirement type="package" version="2.8">pigz</requirement>
    </requirements>
    <version_command>fasterq-dump --version</version_command>

    <command detect_errors="aggressive"><![CDATA[
        ## Conventions used in this template:
        ##   * Cheetah comments start with a double hash; a single hash followed by
        ##     ordinary text would be emitted verbatim into the shell command.
        ##   * Shell variables are written with an escaped dollar sign so Cheetah
        ##     leaves them for the shell to expand at job run time.
        ##   * Every statement ends in '&&' or ';' so the command stays valid
        ##     whether or not line breaks survive template rendering.

        ## ------------------------------------------------------------------
        ## Single Run Mode: one accession -> one (single-end) or two (paired)
        ## history datasets.
        ## ------------------------------------------------------------------
        #if $run_type.mode == 'single'
            #set $acc = str($run_type.accession).strip()
            echo 'Processing single accession: ${acc}' &&
            mkdir -p sra_cache fastq_out &&
            aws s3 cp --no-sign-request 's3://sra-pub-run-odp/sra/${acc}/${acc}' ./sra_cache/ &&
            fasterq-dump --outdir ./fastq_out --temp . --threads \${GALAXY_SLOTS:-4} --split-files './sra_cache/${acc}' &&
            pigz -p \${GALAXY_SLOTS:-4} ./fastq_out/*.fastq &&
            #if str($layout) == 'paired'
                mv './fastq_out/${acc}_1.fastq.gz' '$output_r1_single' &&
                mv './fastq_out/${acc}_2.fastq.gz' '$output_r2_single'
            #else
                mv ./fastq_out/*.fastq.gz '$output_r1_single'
            #end if

        ## ------------------------------------------------------------------
        ## Batch Run Mode: the accession list is read by the shell at job run
        ## time. Finished files are staged as r1_out/<accession>.fastq.gz and
        ## r2_out/<accession>.fastq.gz, the directories scanned by the
        ## discover_datasets rules of the two output collections.
        ## ------------------------------------------------------------------
        #else
            mkdir -p r1_out r2_out &&
            ## The list is attached to file descriptor 3 so that the commands
            ## inside the loop cannot consume it through their standard input.
            ## The '|| [ -n ... ]' clause keeps a final line without a newline.
            while read -r acc _rest <&3 || [ -n "\${acc}" ]; do
                ## Only the first whitespace-separated field is used; the tr call
                ## drops a trailing carriage return from CRLF-terminated files.
                acc=\$(printf '%s' "\${acc}" | tr -d '[:space:]') ;
                if [ -z "\${acc}" ]; then continue ; fi ;
                ## Refuse anything that is not a run accession before it is used
                ## to build S3 URLs and file paths.
                if ! printf '%s' "\${acc}" | grep -Eq '^[SED]RR[0-9]+\$'; then
                    echo "Invalid SRA run accession in list: \${acc}" >&2 ; exit 1 ;
                fi ;
                echo "Processing batch accession: \${acc}" &&
                mkdir -p "sra_cache_\${acc}" "fastq_out_\${acc}" &&
                aws s3 cp --no-sign-request "s3://sra-pub-run-odp/sra/\${acc}/\${acc}" "./sra_cache_\${acc}/" &&
                fasterq-dump --outdir "./fastq_out_\${acc}" --temp . --threads \${GALAXY_SLOTS:-4} --split-files "./sra_cache_\${acc}/\${acc}" &&
                pigz -p \${GALAXY_SLOTS:-4} "./fastq_out_\${acc}"/*.fastq &&
                #if str($layout) == 'paired'
                    mv "./fastq_out_\${acc}/\${acc}_1.fastq.gz" "./r1_out/\${acc}.fastq.gz" &&
                    mv "./fastq_out_\${acc}/\${acc}_2.fastq.gz" "./r2_out/\${acc}.fastq.gz" &&
                #else
                    mv "./fastq_out_\${acc}"/*.fastq.gz "./r1_out/\${acc}.fastq.gz" &&
                #end if
                ## Free the per-accession scratch space before the next download;
                ## any failure in the chain above aborts the whole job.
                rm -rf "sra_cache_\${acc}" "fastq_out_\${acc}" || exit 1 ;
            done 3< '$run_type.accession_list'
        #end if
    ]]></command>

    <inputs>
        <conditional name="run_type">
            <param name="mode" type="select" label="Execution Mode">
                <option value="single" selected="true">Single Accession</option>
                <option value="batch">Batch of Accessions</option>
            </param>
            <when value="single">
                <param name="accession" type="text" label="SRA Accession" help="A single run accession, for example SRR1234567.">
                    <!-- Surrounding whitespace is tolerated because the command strips it before use. -->
                    <validator type="regex" message="Enter one SRA run accession: SRR, ERR or DRR followed by digits.">^\s*[SED]RR[0-9]+\s*$</validator>
                </param>
            </when>
            <when value="batch">
                <param name="accession_list" type="data" format="txt" label="List of SRA Accessions" help="Text dataset with one run accession per line. Blank lines are skipped."/>
            </when>
        </conditional>
        <param name="layout" type="select" label="Read layout">
            <option value="paired" selected="true">Paired-end (R1 + R2)</option>
            <option value="single">Single-end</option>
        </param>
    </inputs>

    <outputs>
        <!-- Outputs for Single Run Mode: plain history datasets. -->
        <data name="output_r1_single" format="fastqsanger.gz" label="${run_type.accession}_1.fastq.gz">
            <filter>run_type['mode'] == 'single'</filter>
        </data>
        <data name="output_r2_single" format="fastqsanger.gz" label="${run_type.accession}_2.fastq.gz">
            <filter>run_type['mode'] == 'single' and layout == 'paired'</filter>
        </data>

        <!-- Outputs for Batch Mode: one list element per accession, discovered from the
             r1_out and r2_out directories that the command creates in the job working
             directory. The element identifier is the accession itself. -->
        <collection name="output_r1_batch" type="list" label="FASTQ Reads (R1)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fastq\.gz" directory="r1_out" format="fastqsanger.gz" />
            <filter>run_type['mode'] == 'batch'</filter>
        </collection>
        <collection name="output_r2_batch" type="list" label="FASTQ Reads (R2)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fastq\.gz" directory="r2_out" format="fastqsanger.gz" />
            <filter>run_type['mode'] == 'batch' and layout == 'paired'</filter>
        </collection>
    </outputs>

    <help><![CDATA[
**NCBI SRA AWS Fetch**

Fetches SRA runs from the public `sra-pub-run-odp` bucket on Amazon S3 and converts them to gzip-compressed FASTQ using `fasterq-dump`. The bucket is read anonymously, so no AWS credentials are required.

This tool can be run on a single SRA accession or a list of accessions provided as a text file (one per line).

**Execution modes**

- *Single Accession*: enter one run accession (SRR, ERR or DRR followed by digits). The reads are returned as individual history datasets: one file for single-end layout, two files (R1 and R2) for paired-end layout.
- *Batch of Accessions*: supply a text dataset with one run accession per line. Blank lines are skipped and only the first whitespace-separated field of each line is read. The reads are returned as list collections whose element identifiers are the accessions: one collection (R1) for single-end layout, two collections (R1 and R2) for paired-end layout.

**Read layout**

Choose the layout that matches the runs being fetched. With *Paired-end*, the job fails if a run does not produce both an R1 and an R2 file.

In batch mode the job stops at the first accession that is malformed or cannot be downloaded or converted.
    ]]></help>

    <citations>
        <citation type="bibtex">
@misc{ncbi_sra_aws,
  title        = {{NCBI} {SRA} on {AWS} Open Data},
  author       = {{National Center for Biotechnology Information}},
  howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}},
  note         = {Accessed via AWS S3 without credentials}
}
        </citation>
        <citation type="bibtex">
@article{sra_toolkit,
  title   = {The {Sequence} {Read} {Archive}},
  author  = {Leinonen, Rasko and Sugawara, Hideaki and Shumway, Martin and
             {International Nucleotide Sequence Database Collaboration}},
  journal = {Nucleic Acids Research},
  volume  = {39},
  number  = {suppl\_1},
  pages   = {D19--D21},
  year    = {2011},
  doi     = {10.1093/nar/gkq1019}
}
        </citation>
    </citations>
</tool>