view bio2srr.py @ 3:80f1001797c7

Uploaded
author jpayne
date Wed, 27 Oct 2021 05:00:45 -0400
parents 556cac4fb538
children 2d4a2159c74b
line wrap: on
line source
#! /usr/bin/env python3

"Grab SRR numbers from BioProjects via the EMBL-ENA REST APIs."

import requests
import sys
from xml.etree import ElementTree as xml
import csv

from time import sleep

# URL template for an EBI Search query against the sra-experiment domain;
# `{bioproject}` is a str.format placeholder. Not referenced by the code
# visible in this file.
sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"

# Example JSON payload (hitCount / entries / facets) holding sra-experiment
# entries, presumably kept as a reference for the response shape.
# NOTE: the name `sample` is rebound further down, so this first value is
# overwritten as soon as the module finishes loading.
sample = """{
    "hitCount": 2,
    "entries": [
        {
            "id": "SRX377510",
            "source": "sra-experiment"
        },
        {
            "id": "SRX583279",
            "source": "sra-experiment"
        }
    ],
    "facets": []
}"""

# Query-string suffix asking for XML display. Not referenced by the code
# visible in this file.
data_query = "?display=xml"

# EBI Search endpoint for the sra-run domain. The script entry point queries
# it (with `query` and `start` parameters) to list matching run accessions.
sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"

# Example JSON payload holding a single sra-run entry. The script entry point
# reads these same `hitCount`, `entries` and `id` keys from live responses.
sample = """{
    "hitCount": 1,
    "entries": [
        {
            "id": "SRR1029665",
            "source": "sra-run"
        }
    ],
    "facets": []
}"""

def get_tag(root, tag):
    """Return the text of the first element under *root* matching *tag*.

    Args:
        root: An ``xml.etree.ElementTree`` element to search.
        tag: Tag name or ElementPath expression handed to ``root.find``.

    Returns:
        The matched element's ``text`` (``None`` for an empty element), or
        ``None`` when no element matches.
    """
    val = root.find(tag)
    # Compare against None explicitly: an Element without children is falsy,
    # so a plain truth test would throw away the text of every leaf element.
    if val is not None:
        return val.text
    return None

if __name__ == "__main__":
    # Usage: bio2srr.py ACCESSION [EXISTING_TSV]
    #
    # 1. List every run that EBI Search returns for ACCESSION.
    # 2. Resolve each run to its sample accession via the sra-sample xref.
    # 3. Download each sample's XML and flatten it into one record per run.
    # 4. Write the records (after any rows read from EXISTING_TSV) to
    #    ./metadata.tsv.

    # Check the command line up front so that an IndexError raised later on
    # (e.g. by an unexpected response) is never reported as a usage error.
    if len(sys.argv) < 2:
        raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
    bioproject = sys.argv[1]

    # Page through the search results; `start` is the offset of the next hit.
    runs = []
    while True:
        response = requests.get(
            sra_run_query,
            params=dict(query=bioproject, start=len(runs)),
            headers=dict(Accept="application/json"),
        )
        response.raise_for_status()
        page = response.json()
        entries = page['entries']
        runs += [entry['id'] for entry in entries]
        # Stop once every hit is collected. Also stop on an empty page, which
        # would otherwise make the loop spin forever on a short result set.
        if not entries or len(runs) >= page['hitCount']:
            break

    if not runs:
        print(f"No results found for '{bioproject}'.", file=sys.stderr)
        # sys.exit rather than quit(): quit is only injected by the site module.
        sys.exit(1)

    # Optional second argument: a tab-separated file whose rows are carried
    # over into the output ahead of the newly fetched records.
    rcds = []
    if len(sys.argv) > 2:
        # newline='' lets the csv module do its own line-ending handling.
        with open(sys.argv[2], 'r', newline='') as f:
            rcds = list(csv.DictReader(f, dialect='excel', delimiter='\t'))

    # Resolve each run to a sample accession, one request per run.
    bsams = []
    for run_id in runs:
        res = requests.get(
            f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{run_id}/xref/sra-sample",
            headers=dict(Accept="application/json")
        )
        res.raise_for_status()
        # NOTE(review): assumes every run has at least one sample reference;
        # a run without one raises IndexError here.
        bsams.append(res.json()['entries'][0]['references'][0]['acc'])
        sleep(.1)  # brief pause between consecutive requests

    # Fetch each sample's XML and turn it into a flat record.
    for run_id, sam_id in zip(runs, bsams):
        print(run_id)  # progress output
        record = {
            'sample': run_id,
            'biosample_accession': sam_id,
        }
        res = requests.get(
            f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}"
        )
        res.raise_for_status()
        root = xml.fromstring(res.text)

        record['submitter_id'] = get_tag(root, './/SUBMITTER_ID')
        record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME')

        for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
            # Look the children up by name instead of unpacking positionally:
            # an attribute may carry an extra UNITS child or omit VALUE, and
            # either case makes a two-name unpack raise ValueError.
            key = attr.findtext('TAG')
            if not key:
                continue  # an attribute without a tag name cannot be a column
            record[key] = attr.findtext('VALUE')
        rcds.append(record)
        sleep(.1)  # brief pause between consecutive requests

    # Union of all column names in first-seen order (dicts keep insertion
    # order, which a set would not).
    headers = {}
    for record in rcds:
        for key in record:
            headers[key] = None

    # newline='' stops the platform from translating the writer's own line
    # terminator a second time.
    with open('./metadata.tsv', 'w', newline='') as f:
        wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=list(headers))
        wtr.writeheader()
        wtr.writerows(rcds)