Mercurial > repos > jpayne > bioproject_to_srr_2
view bio2srr.py @ 2:556cac4fb538
"planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/b07378e15ad9"
author | jpayne |
---|---|
date | Mon, 03 Aug 2020 10:39:37 -0400 |
parents | 02ac32a00e25 |
children | 80f1001797c7 |
line wrap: on
line source
#! /usr/bin/env python3

"""Grab SRR numbers from BioProjects via the EMBL-ENA REST APIs.

Usage: bio2srr.py ACCESSION [EXISTING_METADATA_TSV]

For the given accession the script looks up the matching SRA run ids,
resolves each run to its sample cross-reference, downloads the sample XML
and flattens every ``SAMPLE_ATTRIBUTE`` into a column.  Run ids are printed
to stdout (one per line) and the combined table is written to
``./metadata.tsv``.  When a second argument is given, the rows of that
tab-separated file are placed ahead of the newly fetched rows.
"""

import requests
import sys
from xml.etree import ElementTree as xml
import csv

sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"

# Example of a JSON response to ``sra_exp_query`` (kept for reference only;
# the name is re-bound further down and is not read by the script).
sample = """{
  "hitCount": 2,
  "entries": [
    {
      "id": "SRX377510",
      "source": "sra-experiment"
    },
    {
      "id": "SRX583279",
      "source": "sra-experiment"
    }
  ],
  "facets": []
}"""

data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"

xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"

sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"

# Example of a JSON response to ``sra_run_query`` (reference only).
sample = """{
  "hitCount": 1,
  "entries": [
    {
      "id": "SRR1029665",
      "source": "sra-run"
    }
  ],
  "facets": []
}"""


def _json_headers():
    """Return the request headers asking the service for a JSON response."""
    return dict(Accept="application/json")


def _fetch_run_ids(bioproject):
    """Return the run ids listed in the ``sra-run`` search result.

    Raises:
        requests.HTTPError: the service answered with an error status.
        ValueError: the JSON payload lacks the expected ``entries``/``id`` keys.
    """
    # NOTE(review): only the entries present in this single response are
    # used; confirm whether the service paginates large result sets.
    response = requests.get(
        sra_run_query.format(bioproject=bioproject),
        headers=_json_headers(),
    )
    response.raise_for_status()
    try:
        return [entry['id'] for entry in response.json()['entries']]
    except KeyError as e:
        raise ValueError(
            f"Unexpected response while searching for '{bioproject}': "
            f"missing key {e}."
        ) from e


def _load_existing_records(path):
    """Read previously collected rows from the tab-separated file at *path*."""
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, 'r', newline='') as f:
        return list(csv.DictReader(f, dialect='excel', delimiter='\t'))


def _fetch_sample_pairs(runs):
    """Return ``(run_id, sample_accession)`` pairs from the xref endpoint."""
    response = requests.get(
        xref_query.format(runs=",".join(runs)),
        headers=_json_headers(),
    )
    response.raise_for_status()
    return [
        (entry['id'], entry['references'][0]['acc'])
        for entry in response.json()['entries']
    ]


def _fetch_sample_record(run_id, sam_id):
    """Build one metadata row for *run_id* from the XML of sample *sam_id*."""
    record = {'sample': run_id, 'biosample_accession': sam_id}
    response = requests.get(data_query.format(accession=sam_id))
    response.raise_for_status()
    root = xml.fromstring(response.text)
    for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
        children = list(attr)
        if not children:
            continue
        # First child is taken as the name and second as the value, as
        # before; a missing value or extra children (presumably units --
        # verify against the SRA schema) no longer abort the whole run.
        name = children[0].text
        value = children[1].text if len(children) > 1 else None
        record[name] = value
    return record


def _write_metadata(records, path='./metadata.tsv'):
    """Write *records* to *path* as TSV, columns in first-seen order."""
    # dict.fromkeys keeps insertion order and drops duplicate column names.
    headers = dict.fromkeys(key for record in records for key in record)
    with open(path, 'w', newline='') as f:
        wtr = csv.DictWriter(
            f, dialect='excel', delimiter='\t', fieldnames=list(headers)
        )
        wtr.writeheader()
        wtr.writerows(records)


def main(argv=None):
    """Run the command-line tool.

    Args:
        argv: argument vector shaped like ``sys.argv``; defaults to
            ``sys.argv`` itself.

    Raises:
        ValueError: no accession was supplied, or the search response had
            an unexpected shape.
        SystemExit: exit status 1 when the search returns no runs.
    """
    argv = sys.argv if argv is None else argv

    try:
        bioproject = argv[1]
    except IndexError:
        raise ValueError(
            "Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession."
        ) from None

    runs = _fetch_run_ids(bioproject)
    if not runs:
        print(f"No results found for '{bioproject}'.", file=sys.stderr)
        # sys.exit, unlike the site-provided quit(), is always available.
        sys.exit(1)

    # The optional second argument is an existing table to extend.
    rcds = _load_existing_records(argv[2]) if len(argv) > 2 else []

    for run_id, sam_id in _fetch_sample_pairs(runs):
        rcds.append(_fetch_sample_record(run_id, sam_id))
        print(run_id)

    _write_metadata(rcds)


if __name__ == "__main__":
    main()