#! /usr/bin/env python3

"""Grab SRR numbers from BioProjects via the EMBL-ENA REST APIs.

Usage::

    script ACCESSION [EXISTING_METADATA.tsv]

ACCESSION is an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study
accession.  For every run the search returns, the linked sample record is
downloaded and its attributes are flattened into one row.  Rows from the
optional existing tab-separated file are kept ahead of the new ones, and the
combined table is written to ``./metadata.tsv``.  Each processed run
accession is printed to stdout.
"""

import csv
import sys
from xml.etree import ElementTree as xml

import requests

sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"

# Example response of the sra-experiment search (reference only; this name is
# rebound to the sra-run example further down).
sample = """{
  "hitCount": 2,
  "entries": [
    {
      "id": "SRX377510",
      "source": "sra-experiment"
    },
    {
      "id": "SRX583279",
      "source": "sra-experiment"
    }
  ],
  "facets": []
}"""

data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"

xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"

sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"

# Example response of the sra-run search (reference only).
sample = """{
  "hitCount": 1,
  "entries": [
    {
      "id": "SRR1029665",
      "source": "sra-run"
    }
  ],
  "facets": []
}"""

# Seconds to wait on the server before giving up; without a timeout a stalled
# connection would hang the script forever.
REQUEST_TIMEOUT = 60

_JSON_HEADERS = {"Accept": "application/json"}


def _fetch_run_ids(accession):
    """Return the run ids the sra-run search reports for *accession*.

    Raises:
        requests.HTTPError: the service answered with an error status.
        ValueError: the JSON payload has no usable ``entries`` list.
    """
    response = requests.get(
        sra_run_query.format(bioproject=accession),
        headers=_JSON_HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    payload = response.json()
    try:
        run_ids = [entry['id'] for entry in payload['entries']]
    except KeyError as err:
        raise ValueError(
            f"Unexpected search response for '{accession}': missing key {err}."
        ) from err

    # NOTE(review): the response carries a total "hitCount"; when it exceeds
    # the number of entries returned, the result set was presumably cut short
    # by the service's page size.  Paging is not implemented here, so at least
    # make the truncation visible.
    hit_count = payload.get('hitCount')
    if isinstance(hit_count, int) and hit_count > len(run_ids):
        print(
            f"Warning: '{accession}' matched {hit_count} runs but only "
            f"{len(run_ids)} were returned; results may be incomplete.",
            file=sys.stderr,
        )
    return run_ids


def _fetch_run_sample_pairs(run_ids):
    """Return ``(run_id, sample_accession)`` pairs for *run_ids*.

    Runs for which the cross-reference lookup lists no sample are skipped
    with a warning on stderr instead of aborting the whole export.
    """
    response = requests.get(
        xref_query.format(runs=",".join(run_ids)),
        headers=_JSON_HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()

    pairs = []
    for entry in response.json()['entries']:
        references = entry.get('references') or []
        if not references:
            print(
                f"Warning: no sample cross-reference for run '{entry['id']}'; skipping.",
                file=sys.stderr,
            )
            continue
        # Only the first referenced sample is used for a run.
        pairs.append((entry['id'], references[0]['acc']))
    return pairs


def _parse_sample_attributes(xml_text):
    """Return ``{tag_text: value_text}`` for every SAMPLE_ATTRIBUTE in *xml_text*.

    The first child element of an attribute supplies the key and the second
    the value.  An attribute with a single child maps to ``None``; children
    beyond the second are ignored; an attribute with no children is skipped.
    """
    attributes = {}
    root = xml.fromstring(xml_text)
    for attribute in root.findall('.//SAMPLE_ATTRIBUTE'):
        children = list(attribute)
        if not children:
            continue
        value = children[1].text if len(children) > 1 else None
        attributes[children[0].text] = value
    return attributes


def _fetch_sample_attributes(sample_accession):
    """Download the XML record of *sample_accession* and return its attributes."""
    response = requests.get(
        data_query.format(accession=sample_accession),
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return _parse_sample_attributes(response.text)


def _load_existing_records(path):
    """Read a tab-separated metadata file into a list of row dicts."""
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, 'r', newline='') as handle:
        return list(csv.DictReader(handle, dialect='excel', delimiter='\t'))


def _write_records(records, path):
    """Write *records* to *path* as TSV with the union of all keys as header."""
    # dict.fromkeys de-duplicates while preserving first-seen column order.
    fieldnames = list(dict.fromkeys(key for record in records for key in record))
    with open(path, 'w', newline='') as handle:
        writer = csv.DictWriter(
            handle, dialect='excel', delimiter='\t', fieldnames=fieldnames
        )
        writer.writeheader()
        writer.writerows(records)


def main(argv=None):
    """Run the export.

    Args:
        argv: argument vector shaped like ``sys.argv`` (program name first);
            defaults to ``sys.argv``.

    Raises:
        ValueError: no accession was given, or the search response is malformed.
        SystemExit: with status 1 when the search returns no runs.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
    bioproject = argv[1]

    runs = _fetch_run_ids(bioproject)
    if not runs:
        print(f"No results found for '{bioproject}'.", file=sys.stderr)
        sys.exit(1)

    # Optional second argument: an existing table whose rows are carried over.
    records = _load_existing_records(argv[2]) if len(argv) > 2 else []

    for run_id, sample_accession in _fetch_run_sample_pairs(runs):
        record = {'sample': run_id, 'biosample_accession': sample_accession}
        # Attributes are merged last, so a same-named attribute overrides the
        # two fixed columns.
        record.update(_fetch_sample_attributes(sample_accession))
        records.append(record)
        print(run_id)

    _write_records(records, './metadata.tsv')


if __name__ == "__main__":
    main()