Mercurial > repos > jpayne > bioproject_to_srr_2
view bio2srr.py @ 3:80f1001797c7
Uploaded
author | jpayne |
---|---|
date | Wed, 27 Oct 2021 05:00:45 -0400 |
parents | 556cac4fb538 |
children | 2d4a2159c74b |
line wrap: on
line source
#! /usr/bin/env python3

"""Grab SRR numbers from BioProjects via the EMBL-ENA REST APIs.

Usage:
    bio2srr.py ACCESSION [EXISTING_METADATA_TSV]

ACCESSION is an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study
accession.  Every matching run id is printed to stdout, and one metadata row
per run is written to ``./metadata.tsv``.  If EXISTING_METADATA_TSV is given,
its rows are read first and the newly fetched rows are appended after them.
"""

import csv
import sys
from time import sleep
from xml.etree import ElementTree as xml

import requests

sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"

# Example response from the sra-experiment search (kept for reference only).
sample = """{
    "hitCount": 2,
    "entries": [
        {
            "id": "SRX377510",
            "source": "sra-experiment"
        },
        {
            "id": "SRX583279",
            "source": "sra-experiment"
        }
    ],
    "facets": []
}"""

data_query = "?display=xml"

sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"

# Example response from the sra-run search (kept for reference only).
sample = """{
    "hitCount": 1,
    "entries": [
        {
            "id": "SRR1029665",
            "source": "sra-run"
        }
    ],
    "facets": []
}"""

_JSON_HEADERS = {"Accept": "application/json"}
_REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled connection hangs forever
_REQUEST_DELAY = .1  # pause between per-run requests


def get_tag(root, tag):
    """Return the text of the first element matching *tag* under *root*.

    Returns None when no element matches.
    """
    val = root.find(tag)
    # Compare against None explicitly: an Element with no children is falsy,
    # so a plain truth test would discard every leaf element.
    if val is not None:
        return val.text
    return None


def _fetch_run_ids(accession):
    """Return every run id the sra-run search reports for *accession*."""
    runs = []
    while True:
        res = requests.get(
            sra_run_query,
            params=dict(query=accession, start=len(runs)),
            headers=_JSON_HEADERS,
            timeout=_REQUEST_TIMEOUT,
        )
        res.raise_for_status()
        page = res.json()
        entries = page['entries']
        runs += [d['id'] for d in entries]
        # Stop on an empty page as well, otherwise a hitCount that is never
        # reached would make this loop spin forever.
        if not entries or len(runs) >= page['hitCount']:
            return runs


def _fetch_biosample(run_id):
    """Return the accession of the first sample cross-referenced by *run_id*."""
    res = requests.get(
        f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{run_id}/xref/sra-sample",
        headers=_JSON_HEADERS,
        timeout=_REQUEST_TIMEOUT,
    )
    res.raise_for_status()
    return res.json()['entries'][0]['references'][0]['acc']


def _build_record(run_id, sam_id, sample_xml):
    """Build one metadata row from the ENA sample XML document *sample_xml*."""
    root = xml.fromstring(sample_xml)
    record = {
        'sample': run_id,
        'biosample_accession': sam_id,
        'submitter_id': get_tag(root, './/SUBMITTER_ID'),
        'scientific_name': get_tag(root, './/SCIENTIFIC_NAME'),
    }
    for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
        # Look the children up by name rather than unpacking positionally, so
        # an attribute with a missing VALUE or an extra child (e.g. UNITS)
        # does not abort the whole run.
        key = get_tag(attr, 'TAG')
        if key is not None:
            record[key] = get_tag(attr, 'VALUE')
    return record


def _read_existing_records(path):
    """Return the rows of the tab-separated file at *path* as a list of dicts."""
    with open(path, 'r', newline='') as f:
        return list(csv.DictReader(f, dialect='excel', delimiter='\t'))


def _write_records(records, path='./metadata.tsv'):
    """Write *records* to *path* as TSV, using the union of all keys as columns."""
    # dict.fromkeys de-duplicates while preserving first-seen column order.
    headers = dict.fromkeys(key for record in records for key in record)
    with open(path, 'w', newline='') as f:
        wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=list(headers))
        wtr.writeheader()
        wtr.writerows(records)


def main(argv=None):
    """Run the tool; *argv* defaults to ``sys.argv``.

    Raises ValueError when no accession is given, and exits with status 1
    when the search returns no runs.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
    bioproject = argv[1]

    runs = _fetch_run_ids(bioproject)
    if not runs:
        print(f"No results found for '{bioproject}'.", file=sys.stderr)
        sys.exit(1)

    # Optional second argument: an existing metadata table to extend.
    rcds = _read_existing_records(argv[2]) if len(argv) > 2 else []

    # First pass: resolve each run to its sample accession.
    bsams = []
    for run_id in runs:
        bsams.append(_fetch_biosample(run_id))
        sleep(_REQUEST_DELAY)

    # Second pass: fetch each sample's XML and flatten it into a row.
    for run_id, sam_id in zip(runs, bsams):
        print(run_id)
        res = requests.get(
            f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}",
            timeout=_REQUEST_TIMEOUT,
        )
        res.raise_for_status()
        rcds.append(_build_record(run_id, sam_id, res.text))
        sleep(_REQUEST_DELAY)

    _write_records(rcds)


if __name__ == "__main__":
    main()