Mercurial > repos > jpayne > bioproject_to_srr_2
diff bio2srr.py @ 3:80f1001797c7
Uploaded
author | jpayne |
---|---|
date | Wed, 27 Oct 2021 05:00:45 -0400 |
parents | 556cac4fb538 |
children | 2d4a2159c74b |
line wrap: on
line diff
--- a/bio2srr.py Mon Aug 03 10:39:37 2020 -0400 +++ b/bio2srr.py Wed Oct 27 05:00:45 2021 -0400 @@ -7,6 +7,8 @@ from xml.etree import ElementTree as xml import csv +from time import sleep + sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}" sample = """{ @@ -24,11 +26,9 @@ "facets": [] }""" -data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml" +data_query = "?display=xml" -xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample" - -sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}" +sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run" sample = """{ "hitCount": 1, @@ -41,19 +41,30 @@ "facets": [] }""" +def get_tag(root, tag): + val = root.find(tag) + if val: + return val.text + if __name__ == "__main__": try: bioproject = sys.argv[1] - b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json")) - b_result.raise_for_status() - runs = [d['id'] for d in b_result.json()['entries']] + + b_result = None + + runs = [] + + while not b_result or len(runs) < b_result['hitCount']: + b_result = requests.get(sra_run_query, params=dict(query=bioproject, start=len(runs)), headers=dict(Accept="application/json")) + b_result.raise_for_status() + b_result = b_result.json() + runs += [d['id'] for d in b_result['entries']] + if not runs: print(f"No results found for '{bioproject}'.", file=sys.stderr) quit(1) except IndexError: raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") - except KeyError as e: - raise ValueError() from e try: with open(sys.argv[2], 'r') as f: @@ -64,22 +75,40 @@ except IndexError: rcds = [] - res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json")) - res.raise_for_status() - bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']] + bsams = [] - for run_id, sam_id in bsams: + for id in runs: + res = requests.get( + f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{id}/xref/sra-sample", + headers=dict(Accept="application/json") + ) + res.raise_for_status() + bsams.append(res.json()['entries'][0]['references'][0]['acc']) + sleep(.1) + + # res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json")) + # res.raise_for_status() + # bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']] + + for run_id, sam_id in zip(runs, bsams): + print(run_id) record = {} record['sample'] = run_id record['biosample_accession'] = sam_id - res = requests.get(data_query.format(accession=sam_id)) + res = requests.get( + f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}" + ) res.raise_for_status() root = xml.fromstring(res.text) + + record['submitter_id'] = get_tag(root, './/SUBMITTER_ID') + record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME') + for attr in root.findall('.//SAMPLE_ATTRIBUTE'): key, value = iter(attr) record[key.text] = value.text rcds.append(record) - print(run_id) + sleep(.1) headers = {} for record in rcds: