Mercurial > repos > jpayne > bioproject_to_srr_2
changeset 2:556cac4fb538
"planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/b07378e15ad9"
author | jpayne |
---|---|
date | Mon, 03 Aug 2020 10:39:37 -0400 |
parents | b07378e15ad9 |
children | 80f1001797c7 |
files | bio2srr.py bio2srr.xml test-data/metadata.tsv test-data/test.txt |
diffstat | 4 files changed, 75 insertions(+), 38 deletions(-) [+] |
line wrap: on
line diff
--- a/bio2srr.py Thu Dec 20 11:04:53 2018 -0500 +++ b/bio2srr.py Mon Aug 03 10:39:37 2020 -0400 @@ -4,8 +4,10 @@ import requests import sys +from xml.etree import ElementTree as xml +import csv -sra_exp_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}" +sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}" sample = """{ "hitCount": 2, @@ -22,7 +24,11 @@ "facets": [] }""" -sra_run_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={experiment}" +data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml" + +xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample" + +sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}" sample = """{ "hitCount": 1, @@ -36,23 +42,55 @@ }""" if __name__ == "__main__": - try: - bioproject = sys.argv[1] - b_result = requests.get(sra_exp_query.format(bioproject=bioproject), headers=dict(Accept="application/json")) - b_result.raise_for_status() - if b_result.json()['entries']: - for experiment in [d['id'] for d in b_result.json()['entries']]: - r_result = requests.get(sra_run_query.format(experiment=experiment), headers=dict(Accept="application/json")) - r_result.raise_for_status() - for run in [d['id'] for d in r_result.json()['entries']]: - print(run) - else: - print(f"No results found for '{bioproject}'.", file=sys.stderr) - quit(1) - except IndexError: - raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") - except KeyError as e: - raise ValueError() from e + try: + bioproject = sys.argv[1] + b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json")) + b_result.raise_for_status() + runs = [d['id'] for d in b_result.json()['entries']] + if not runs: + print(f"No results found for '{bioproject}'.", file=sys.stderr) + quit(1) + except IndexError: + raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") + except KeyError as e: + raise ValueError() from e + try: + with open(sys.argv[2], 'r') as f: + rdr = csv.DictReader(f, dialect='excel', delimiter='\t') + rcds = list(rdr) + except IndexError: + rcds = [] + + res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json")) + res.raise_for_status() + bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']] + + for run_id, sam_id in bsams: + record = {} + record['sample'] = run_id + record['biosample_accession'] = sam_id + res = requests.get(data_query.format(accession=sam_id)) + res.raise_for_status() + root = xml.fromstring(res.text) + for attr in root.findall('.//SAMPLE_ATTRIBUTE'): + key, value = iter(attr) + record[key.text] = value.text + rcds.append(record) + print(run_id) + + headers = {} + for record in rcds: + for key in record.keys(): + headers[key] = None # use a dict to preserve header order + + + with open('./metadata.tsv', 'w') as f: + wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys()) + wtr.writeheader() + wtr.writerows(rcds) + + +
--- a/bio2srr.xml Thu Dec 20 11:04:53 2018 -0500 +++ b/bio2srr.xml Mon Aug 03 10:39:37 2020 -0400 @@ -1,26 +1,35 @@ <tool id="bio2srr" name="Bioproject to SRR" version="0.1.0"> <description>Retrieve SRR accessions from BioProject or BioSample.</description> <requirements> - <requirement type="package">package_python_3_4</requirement> - <requirement type="package" version="2.18.4">requests</requirement> + <requirement type="package" version="3.8">python</requirement> + <requirement type="package" version="2.24.0">requests</requirement> </requirements> <command detect_errors="exit_code"><![CDATA[ + #if $src_metadata + $__tool_directory__/bio2srr.py "$input1" $src_metadata > "$output" + #else $__tool_directory__/bio2srr.py "$input1" > "$output" + #end if ]]></command> <inputs> <param type="text" name="input1" label="BioProject or BioSample" /> + <param type="data" format="tabular" name="src_metadata" label="Table of your own metadata" optional="true" help="Optional metadata table to be folded in." /> </inputs> <outputs> - <data format="txt" name="output" /> + <data format="txt" name="output" label="SRR Accession List" /> + <data format="tabular" name="metadata" from_work_dir="metadata.tsv" label="Sample Metadata Table" /> </outputs> <tests> <test> <param name="input1" value="NOTHING" /> <output name="output" file="test.txt" /> + <output name="metadata" file="metadata.tsv" /> </test> </tests> <help><![CDATA[ - Retrieve SRR accessions by BioSample or BioProject. + Retrieve SRR accessions and metadata from a BioProject. + + If you have metadata for your own samples (that aren't from the BioProject) you can provide a TSV table with it and they'll be folded together. (It's a union, not a join.) Put sample ID's under the header 'sample'. ]]></help> <citations> </citations>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/metadata.tsv Mon Aug 03 10:39:37 2020 -0400 @@ -0,0 +1,3 @@ +sample biosample_accession isolation_source collection_date geo_loc_name lat_lon rel_to_oxygen samp_collect_device samp_mat_process samp_size source_material_id BioSampleModel ENA-SPOT-COUNT ENA-BASE-COUNT ENA-FIRST-PUBLIC ENA-LAST-UPDATE +SRR11671300 SAMN14820590 marine sediment Not applicable North Sea: German Bight (Helgoland Mud Area) 54.052300 N 7.580400 E obligate anaerobe anoxic sampling of sediment slurry incubations at defined time points Previously collected sediments from Helgoland mud area, stored at 4ºC (near in-situ temperature) until use as starting material for incubation experiments. For the incubation experiments, anaerobic slurries were prepared in ratio 1:4 and incubated at 30ºC. Sample name identifies the unique information of each sample including the target 16S rRNA gene, incubation timepoint and state of the enrichment. 1 ml slurry in triplicates, Pooled DNA from triplicates were sequenced as 1 sample. sediment incubation at 30ºC without any amendment sampled after 105 days sequenced with bacteria PCR primers Metagenome or environmental 41855 6160440 2020-05-04 2020-05-04 +SRR11671283 SAMN14820597 marine sediment Not applicable North Sea: German Bight (Helgoland Mud Area) 54.052300 N 7.580400 E obligate anaerobe anoxic sampling of sediment slurry incubations at defined time points Previously collected sediments from Helgoland mud area, stored at 4ºC (near in-situ temperature) until use as starting material for incubation experiments. For the incubation experiments, anaerobic slurries were prepared in ratio 1:4 and incubated at 30ºC. Sample name identifies the unique information of each sample including the target 16S rRNA gene, incubation timepoint and state of the enrichment. 1 ml slurry in triplicates, Pooled DNA from triplicates were sequenced as 1 sample. sediment incubation at 30ºC without any amendment sampled after 105 days sequenced with archaea PCR primers Metagenome or environmental 19833 2973256 2020-05-04 2020-05-04
--- a/test-data/test.txt Thu Dec 20 11:04:53 2018 -0500 +++ b/test-data/test.txt Mon Aug 03 10:39:37 2020 -0400 @@ -1,15 +1,2 @@ -ERR2130609 -ERR2130611 -ERR2130613 -ERR2130497 -ERR2130499 -ERR2130502 -ERR2130503 -ERR2130506 -ERR2130508 -ERR2130510 -ERR2130512 -ERR2130517 -ERR2130520 -ERR2130521 -ERR2130523 \ No newline at end of file +SRR11671300 +SRR11671283