bioproject_to_srr_2: bio2srr.py comparison

comparison bio2srr.py @ 3:80f1001797c7

Uploaded

author	jpayne
date	Wed, 27 Oct 2021 05:00:45 -0400
parents	556cac4fb538
children	2d4a2159c74b

comparison

equal deleted inserted replaced

-:556cac4fb538
+:80f1001797c7
 import requests
 import sys
 from xml.etree import ElementTree as xml
 import csv
+from time import sleep
 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
 sample = """{
 "hitCount": 2,
 }
 ],
 "facets": []
 }"""
-data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"
+data_query = "?display=xml"
-xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"
+sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"
-sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"
 sample = """{
 "hitCount": 1,
 "entries": [
 {
 }
 ],
 "facets": []
 }"""
def get_tag(root, tag):
    """Return the text of the first element under *root* matching *tag*.

    Args:
        root: An ``xml.etree.ElementTree.Element`` to search.
        tag: A tag name or ElementPath expression, e.g. ``'.//SUBMITTER_ID'``.

    Returns:
        The matched element's ``text`` (which may itself be ``None`` for an
        empty element), or ``None`` when no element matches.
    """
    val = root.find(tag)
    # Compare against None explicitly: an Element with no children is falsy,
    # so a plain truthiness test would discard leaf elements that carry text.
    if val is not None:
        return val.text
    return None
 if __name__ == "__main__":
 try:
 bioproject = sys.argv[1]
-b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
-b_result.raise_for_status()
+b_result = None
-runs = [d['id'] for d in b_result.json()['entries']]
+runs = []
+while not b_result or len(runs) < b_result['hitCount']:
+b_result = requests.get(sra_run_query, params=dict(query=bioproject, start=len(runs)), headers=dict(Accept="application/json"))
+b_result.raise_for_status()
+b_result = b_result.json()
+runs += [d['id'] for d in b_result['entries']]
 if not runs:
 print(f"No results found for '{bioproject}'.", file=sys.stderr)
 quit(1)
 except IndexError:
 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
-except KeyError as e:
-raise ValueError() from e
 try:
 with open(sys.argv[2], 'r') as f:
 rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
 rcds = list(rdr)
 except IndexError:
 rcds = []
-res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
+bsams = []
-res.raise_for_status()
-bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
-for run_id, sam_id in bsams:
+for id in runs:
+res = requests.get(
+f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{id}/xref/sra-sample",
+headers=dict(Accept="application/json")
+)
+res.raise_for_status()
+bsams.append(res.json()['entries'][0]['references'][0]['acc'])
+sleep(.1)
+# res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
+# res.raise_for_status()
+# bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
+for run_id, sam_id in zip(runs, bsams):
+print(run_id)
 record = {}
 record['sample'] = run_id
 record['biosample_accession'] = sam_id
-res = requests.get(data_query.format(accession=sam_id))
+res = requests.get(
+f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}"
+)
 res.raise_for_status()
 root = xml.fromstring(res.text)
+record['submitter_id'] = get_tag(root, './/SUBMITTER_ID')
+record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME')
 for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
 key, value = iter(attr)
 record[key.text] = value.text
 rcds.append(record)
-print(run_id)
+sleep(.1)
 headers = {}
 for record in rcds:
 for key in record.keys():
 headers[key] = None # use a dict to preserve header order

Mercurial > repos > jpayne > bioproject_to_srr_2

comparison bio2srr.py @ 3:80f1001797c7