diff bio2srr.py @ 3:80f1001797c7

Uploaded
author jpayne
date Wed, 27 Oct 2021 05:00:45 -0400
parents 556cac4fb538
children 2d4a2159c74b
line wrap: on
line diff
--- a/bio2srr.py	Mon Aug 03 10:39:37 2020 -0400
+++ b/bio2srr.py	Wed Oct 27 05:00:45 2021 -0400
@@ -7,6 +7,8 @@
 from xml.etree import ElementTree as xml
 import csv
 
+from time import sleep
+
 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
 
 sample = """{
@@ -24,11 +26,9 @@
     "facets": []
 }"""
 
-data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"
+data_query = "?display=xml"
 
-xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"
-
-sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"
+sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"
 
 sample = """{
     "hitCount": 1,
@@ -41,19 +41,30 @@
     "facets": []
 }"""
 
+def get_tag(root, tag):  # text of the first element matching `tag` under `root`; None when nothing matches
+    val = root.find(tag)  # Element.find returns None when there is no match
+    if val is not None:  # a childless Element is falsy, so a bare truth test would skip text-only leaf tags
+        return val.text
+
 if __name__ == "__main__":
     try:
         bioproject = sys.argv[1]
-        b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
-        b_result.raise_for_status()
-        runs = [d['id'] for d in b_result.json()['entries']]
+
+        b_result = None
+
+        runs = []
+
+        while not b_result or len(runs) < b_result['hitCount']:
+            b_result = requests.get(sra_run_query, params=dict(query=bioproject, start=len(runs)), headers=dict(Accept="application/json"))
+            b_result.raise_for_status()
+            b_result = b_result.json()
+            runs += [d['id'] for d in b_result['entries']]
+
         if not runs:
             print(f"No results found for '{bioproject}'.", file=sys.stderr)
             quit(1)
     except IndexError:
         raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
-    except KeyError as e:
-        raise ValueError() from e
 
     try:
         with open(sys.argv[2], 'r') as f:
@@ -64,22 +75,40 @@
     except IndexError:
         rcds = []
 
-    res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
-    res.raise_for_status()
-    bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
+    bsams = []
 
-    for run_id, sam_id in bsams:
+    for id in runs:
+        res = requests.get(
+            f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{id}/xref/sra-sample",
+            headers=dict(Accept="application/json")
+        )
+        res.raise_for_status()
+        bsams.append(res.json()['entries'][0]['references'][0]['acc'])
+        sleep(.1)
+
+    # res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
+    # res.raise_for_status()
+    # bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
+
+    for run_id, sam_id in zip(runs, bsams):
+        print(run_id)
         record = {}
         record['sample'] = run_id
         record['biosample_accession'] = sam_id
-        res = requests.get(data_query.format(accession=sam_id))
+        res = requests.get(
+            f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}"
+        )
         res.raise_for_status()
         root = xml.fromstring(res.text)
+        
+        record['submitter_id'] = get_tag(root, './/SUBMITTER_ID')
+        record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME')
+
         for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
             key, value = iter(attr)
             record[key.text] = value.text
         rcds.append(record)
-        print(run_id)
+        sleep(.1)
 
     headers = {}
     for record in rcds: