<!--
view aws_sra.xml @ 33:4dbb8753a569 draft

planemo upload for repository https://github.com/CFSAN-Biostatistics/galaxytrakr-tools commit 569b20847bf87de3c298edf3d2435d198b8805ba
author galaxytrakr
date Tue, 24 Mar 2026 14:28:21 +0000
parents 8b8a63786853
children 4c1ff0d60937
line wrap: on
line source
-->

<tool id="aws_sra" name="NCBI SRA AWS Fetch" version="0.4.0+gt_1.2" profile="23.0">
    <!-- Short summary of what the tool does. -->
    <description>Fetches one or more SRA runs from AWS S3 and converts them to FASTQ</description>

    <!-- Packages providing the executables the command section calls:
         fasterq-dump (sra-tools), pigz, and aws (awscli). -->
    <requirements>
        <requirement version="3.1.1" type="package">sra-tools</requirement>
        <requirement version="2.8" type="package">pigz</requirement>
        <requirement version="2.34.8" type="package">awscli</requirement>
    </requirements>

    <!-- Reports the version of fasterq-dump, the converter this tool wraps. -->
    <version_command>fasterq-dump --version</version_command>

    <command detect_errors="exit_code"><![CDATA[
    ## Lines starting with a double hash are Cheetah comments; they never reach the shell.
    ##
    ## Step 1: write the requested accessions to ./accessions.
    ## Typed input is split on commas, semicolons and the __cn__ token; any remaining
    ## whitespace is handled by shell word splitting in the for loop below.
    ## File input keeps only lines that consist of a single SRR/ERR/DRR accession.
    #if $input.input_select == "accession_number":
        echo '${input.accession}' | sed -r 's/(\,|\;|__cn__)/\n/g' > accessions &&
    #else:
        grep '^[[:space:]]*[ESD]RR[0-9]\{1,\}[[:space:]]*$' '${input.file_list}' > accessions &&
    #end if
    ## Step 2: one directory per output collection (matched by discover_datasets in outputs).
    mkdir -p output &&
    mkdir -p outputOther &&
    mkdir -p outputSingle &&
    ## Step 3: fetch and convert each accession inside its own subshell so that a
    ## failure for one accession does not stop the remaining ones.
    for acc in \$(cat ./accessions);
    do (
        echo "Processing accession: \$acc" &&
        mkdir -p "sra_cache_\${acc}" &&
        aws s3 cp --no-sign-request "s3://sra-pub-run-odp/sra/\${acc}/\${acc}" "./sra_cache_\${acc}/\${acc}" &&
        fasterq-dump -e \${GALAXY_SLOTS:-4} -t . --split-3 "./sra_cache_\${acc}/\${acc}" &&
        ## The downloaded SRA object is no longer needed once FASTQ files exist.
        rm -rf "sra_cache_\${acc}" &&
        count="\$(ls \${acc}*.fastq 2>/dev/null | wc -l)" &&
        echo "Found \$count fastq file(s) for \$acc" &&
        data=(\$(ls \${acc}*.fastq 2>/dev/null)) &&
        ## Route the FASTQ files by what fasterq-dump produced:
        ##   exactly one file            : single-end, goes to outputSingle
        ##   acc.fastq next to a pair    : pair goes to output, unpaired reads to outputOther
        ##   exactly two files           : pair goes to output
        ##   anything else               : every file goes to outputOther
        ## Paired files are named acc_forward / acc_reverse because the part after the
        ## underscore becomes the inner element identifier of the list:paired collection,
        ## and paired collections use the identifiers forward and reverse.
        if [ "\$count" -eq 1 ]; then
            pigz -cqp \${GALAXY_SLOTS:-4} "\${data[0]}" > outputSingle/"\${acc}".fastqsanger.gz &&
            rm "\${data[0]}";
        elif [ -e "\${acc}".fastq ]; then
            pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}".fastq > outputOther/"\${acc}"__single.fastqsanger.gz &&
            pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}"_1.fastq > output/"\${acc}"_forward.fastqsanger.gz &&
            pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}"_2.fastq > output/"\${acc}"_reverse.fastqsanger.gz &&
            rm "\${acc}"*.fastq;
        elif [ "\$count" -eq 2 ]; then
            pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}"_1.fastq > output/"\${acc}"_forward.fastqsanger.gz &&
            pigz -cqp \${GALAXY_SLOTS:-4} "\${acc}"_2.fastq > output/"\${acc}"_reverse.fastqsanger.gz &&
            rm "\${acc}"*.fastq;
        else
            ## Appending "sanger.gz" to name.fastq yields name.fastqsanger.gz.
            for file in \${data[*]}; do
                pigz -cqp \${GALAXY_SLOTS:-4} "\$file" > outputOther/"\$file"sanger.gz &&
                rm "\$file";
            done;
        fi
    ) || echo "ERROR: failed to fetch or convert accession \$acc" >&2;
    done;
    ## The final echo keeps the exit status at zero even when an accession failed;
    ## failures are reported on stderr by the message above.
    echo "Done with all accessions."
    ]]></command>

    <inputs>
        <!-- Accessions are either typed directly or read from a dataset; the
             command section branches on input.input_select. -->
        <conditional name="input">
            <param name="input_select"
                   type="select"
                   label="Select input type">
                <option value="accession_number">SRA Accession number(s)</option>
                <option value="file_list">File containing accession list</option>
            </param>

            <when value="accession_number">
                <param name="accession"
                       type="text"
                       label="SRA Accession(s)"
                       help="One or more SRA run accessions (SRR, ERR, or DRR), separated by commas or spaces.">
                    <validator type="empty_field" message="At least one SRA accession is required."/>
                    <!-- The value is interpolated inside single quotes in the command.
                         Every printable character is allowed except the single quote,
                         which is rewritten to '"'"' so it cannot end the quoted string. -->
                    <sanitizer>
                        <valid initial="string.printable">
                            <remove value="&apos;"/>
                        </valid>
                        <mapping initial="none">
                            <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;"/>
                        </mapping>
                    </sanitizer>
                </param>
            </when>

            <when value="file_list">
                <param name="file_list"
                       type="data"
                       format="txt,tabular"
                       label="Accession list file"
                       help="A text file with one SRA accession (SRR, ERR, or DRR) per line."/>
            </when>
        </conditional>
    </inputs>

    <outputs>
        <!-- Each collection is filled from the directory of the same purpose that the
             command section creates. XML attribute values do not unescape backslashes,
             so a literal dot in the regular expression is written as a single
             backslash followed by a dot. -->

        <!-- Files named ACCESSION_SUFFIX.fastqsanger.gz in ./output.
             identifier_0 (text before the underscore) is the outer list element;
             identifier_1 (text after it) is the inner paired element.
             NOTE(review): Galaxy paired collections expect the inner identifiers
             "forward" and "reverse"; confirm the command names its files accordingly. -->
        <collection name="list_paired" type="list:paired" label="Paired-end FASTQ (aws_sra)">
            <discover_datasets
                pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger\.gz"
                directory="output"
                format="fastqsanger.gz"/>
        </collection>

        <!-- Files named NAME.fastqsanger.gz in ./outputSingle; NAME is the element identifier. -->
        <collection name="output_single" type="list" label="Single-end FASTQ (aws_sra)">
            <discover_datasets
                pattern="(?P&lt;designation&gt;.+)\.fastqsanger\.gz"
                directory="outputSingle"
                format="fastqsanger.gz"/>
        </collection>

        <!-- Files named NAME.fastqsanger.gz in ./outputOther; NAME is the element identifier. -->
        <collection name="output_other" type="list" label="Other FASTQ (aws_sra)">
            <discover_datasets
                pattern="(?P&lt;designation&gt;.+)\.fastqsanger\.gz"
                directory="outputOther"
                format="fastqsanger.gz"/>
        </collection>
    </outputs>

    <tests>
        <!-- Fetches one run by accession and checks the paired collection.
             NOTE(review): the command downloads from S3, so this test presumably
             needs network access; confirm it is usable in CI.
             decompress="true" makes the has_text assertions run against the
             decompressed FASTQ text rather than the raw gzip bytes. -->
        <test expect_num_outputs="3">
            <conditional name="input">
                <param name="input_select" value="accession_number"/>
                <param name="accession" value="SRR13333333"/>
            </conditional>
            <output_collection name="list_paired" type="list:paired" count="1">
                <element name="SRR13333333">
                    <element name="forward" ftype="fastqsanger.gz" decompress="true">
                        <assert_contents>
                            <has_text text="@SRR13333333"/>
                        </assert_contents>
                    </element>
                    <element name="reverse" ftype="fastqsanger.gz" decompress="true">
                        <assert_contents>
                            <has_text text="@SRR13333333"/>
                        </assert_contents>
                    </element>
                </element>
            </output_collection>
        </test>
    </tests>

    <help><![CDATA[
**NCBI SRA AWS Fetch**

Fetches one or more SRA runs from the public ``sra-pub-run-odp`` S3 bucket and converts them to
gzip-compressed FASTQ using ``fasterq-dump``. Downloads use ``aws s3 cp`` with no credentials
required (public bucket).

**Inputs**

- **SRA Accession number(s)**: Type one or more accessions (SRR, ERR, or DRR) directly, comma- or space-separated.
- **File containing accession list**: A text file with one accession per line. This option is connectable as a workflow input.

**Outputs**

Three collections are always created (some may be empty depending on the data):

- **Paired-end FASTQ**: A nested ``list:paired`` collection — each accession contains a forward/reverse pair.
- **Single-end FASTQ**: A flat list for reads confirmed single-end by ``--split-3``.
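- **Other FASTQ**: Reads that could not be cleanly classified, such as unpaired (orphan) reads written alongside a pair by ``--split-3``, or runs that yield more than two FASTQ files.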
    ]]></help>

    <citations>
        <!-- Data source: the NCBI SRA entry in the AWS Open Data registry. -->
        <citation type="bibtex">
@misc{ncbi_sra_aws,
  title        = {{NCBI} {SRA} on {AWS} Open Data},
  author       = {{National Center for Biotechnology Information}},
  howpublished = {\url{https://registry.opendata.aws/ncbi-sra/}},
  note         = {Accessed via AWS S3 without credentials}
}
        </citation>
        <!-- NOTE(review): presumably the Sequence Read Archive paper in
             Nucleic Acids Research; confirm the DOI resolves to the intended article. -->
        <citation type="doi">10.1093/nar/gkq1019</citation>
    </citations>
</tool>