Mercurial > repos > jpayne > bioproject_to_srr_2

#! /usr/bin/env python3

"Grab SRR numbers from BioProjects via the EMBL-ENA REST API's."

import requests
import sys
from xml.etree import ElementTree as xml
import csv

# ---------------------------------------------------------------------------
# REST endpoint templates. Each one is completed with str.format() using the
# placeholder named in braces.
# ---------------------------------------------------------------------------

# EBI Search over the "sra-run" domain; {bioproject} is the query term.
sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"

# EBI Search over the "sra-experiment" domain; {bioproject} is the query term.
# NOTE(review): not referenced by the code visible in this file.
sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"

# Cross-reference lookup from sra-run entries to sra-sample; the script fills
# {runs} with a comma-joined list of run ids.
xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"

# ENA record view rendered as XML; {accession} is the record to display.
data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"

# ---------------------------------------------------------------------------
# Presumably illustrative JSON response bodies kept for reference. Both
# literals are bound to the same name, so after import ``sample`` holds only
# the second (sra-run) one.
# ---------------------------------------------------------------------------

# Shape of an sra-experiment search result.
sample = """{
    "hitCount": 2,
    "entries": [
        {
            "id": "SRX377510",
            "source": "sra-experiment"
        },
        {
            "id": "SRX583279",
            "source": "sra-experiment"
        }
    ],
    "facets": []
}"""

# Shape of an sra-run search result (this binding replaces the one above).
sample = """{
    "hitCount": 1,
    "entries": [
        {
            "id": "SRR1029665",
            "source": "sra-run"
        }
    ],
    "facets": []
}"""

def parse_sample_attributes(xml_text):
    """Return a ``{tag: value}`` dict for every SAMPLE_ATTRIBUTE in *xml_text*.

    Children are looked up by element name (TAG / VALUE) instead of being
    unpacked positionally, so an attribute that carries an extra child such
    as UNITS, or that has no VALUE at all, no longer raises ``ValueError``.
    A missing VALUE is stored as ``None`` (written as an empty cell by the
    csv writer); attributes without a usable TAG are skipped.
    """
    attributes = {}
    root = xml.fromstring(xml_text)
    for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
        tag = attr.findtext('TAG')
        if not tag:
            continue  # nothing to name the column after
        attributes[tag] = attr.findtext('VALUE')
    return attributes


def linked_samples(entries):
    """Return ``(run_id, sample_accession)`` pairs from xref result *entries*.

    Each entry is expected to be a dict with an ``'id'`` and a list under
    ``'references'``; the first reference's ``'acc'`` is used. Entries whose
    reference list is empty are reported on stderr and skipped rather than
    aborting the whole run with an ``IndexError``.
    """
    pairs = []
    for entry in entries:
        references = entry['references']
        if not references:
            print(f"No sample cross-reference found for run '{entry['id']}'; skipping.", file=sys.stderr)
            continue
        pairs.append((entry['id'], references[0]['acc']))
    return pairs


def collect_headers(records):
    """Return the union of all keys in *records*, in first-seen order."""
    headers = {}  # a dict is used to de-duplicate while preserving order
    for record in records:
        headers.update(dict.fromkeys(record))
    return list(headers)


def main(argv=None):
    """Collect sample metadata for every run found for an accession.

    ``argv[1]`` is the accession to search for; the optional ``argv[2]`` is
    an existing tab-separated file whose rows are carried over into the
    output. Each run id is printed to stdout as it is processed, and the
    combined table is written to ``./metadata.tsv``.

    Returns the process exit status: 0 on success, 1 when the search finds
    no runs.

    Raises:
        ValueError: if no accession was given, or the search response lacks
            the expected keys.
        requests.HTTPError: if any of the HTTP requests fails.
    """
    argv = sys.argv if argv is None else argv

    # Only the argument lookup is guarded, so an IndexError raised anywhere
    # else is not misreported as a missing command-line argument.
    try:
        bioproject = argv[1]
    except IndexError:
        raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") from None

    json_headers = dict(Accept="application/json")

    b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=json_headers)
    b_result.raise_for_status()
    try:
        runs = [d['id'] for d in b_result.json()['entries']]
    except KeyError as e:
        raise ValueError(f"Unexpected search response for '{bioproject}': missing key {e}.") from e
    if not runs:
        print(f"No results found for '{bioproject}'.", file=sys.stderr)
        return 1

    # Optional second argument: previously collected records to extend.
    rcds = []
    if len(argv) > 2:
        # newline='' lets the csv module handle line endings itself.
        with open(argv[2], 'r', newline='') as f:
            rcds = list(csv.DictReader(f, dialect='excel', delimiter='\t'))

    res = requests.get(xref_query.format(runs=",".join(runs)), headers=json_headers)
    res.raise_for_status()

    for run_id, sam_id in linked_samples(res.json()['entries']):
        record = {'sample': run_id, 'biosample_accession': sam_id}
        res = requests.get(data_query.format(accession=sam_id))
        res.raise_for_status()
        # Applied after the two fixed columns, so a sample attribute with the
        # same name overrides them (as the original assignment order did).
        record.update(parse_sample_attributes(res.text))
        rcds.append(record)
        print(run_id)

    # newline='' prevents '\r\r\n' line endings on platforms that translate
    # '\n' when writing in text mode.
    with open('./metadata.tsv', 'w', newline='') as f:
        wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=collect_headers(rcds))
        wtr.writeheader()
        wtr.writerows(rcds)
    return 0


if __name__ == "__main__":
    sys.exit(main())
author	jpayne
date	Mon, 03 Aug 2020 10:39:37 -0400
parents	02ac32a00e25
children	80f1001797c7