diff bio2srr.py @ 2:556cac4fb538

"planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/b07378e15ad9"
author jpayne
date Mon, 03 Aug 2020 10:39:37 -0400
parents 02ac32a00e25
children 80f1001797c7
line wrap: on
line diff
--- a/bio2srr.py	Thu Dec 20 11:04:53 2018 -0500
+++ b/bio2srr.py	Mon Aug 03 10:39:37 2020 -0400
@@ -4,8 +4,10 @@
 
 import requests
 import sys
+from xml.etree import ElementTree as xml
+import csv
 
-sra_exp_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
+sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
 
 sample = """{
     "hitCount": 2,
@@ -22,7 +24,11 @@
     "facets": []
 }"""
 
-sra_run_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={experiment}"
+data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"
+
+xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"
+
+sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"
 
 sample = """{
     "hitCount": 1,
@@ -36,23 +42,55 @@
 }"""
 
 if __name__ == "__main__":
-	try:
-		bioproject = sys.argv[1]
-		b_result = requests.get(sra_exp_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
-		b_result.raise_for_status()
-		if b_result.json()['entries']:
-			for experiment in [d['id'] for d in b_result.json()['entries']]:
-				r_result = requests.get(sra_run_query.format(experiment=experiment), headers=dict(Accept="application/json"))
-				r_result.raise_for_status()
-				for run in [d['id'] for d in r_result.json()['entries']]:
-					print(run)
-		else:
-			print(f"No results found for '{bioproject}'.", file=sys.stderr)
-			quit(1)
-	except IndexError:
-		raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
-	except KeyError as e:
-		raise ValueError() from e
+    try:
+        bioproject = sys.argv[1]
+        b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
+        b_result.raise_for_status()
+        runs = [d['id'] for d in b_result.json()['entries']]
+        if not runs:
+            print(f"No results found for '{bioproject}'.", file=sys.stderr)
+            quit(1)
+    except IndexError:
+        raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
+    except KeyError as e:
+        raise ValueError() from e
 
+    try:
+        with open(sys.argv[2], 'r') as f:
+            rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
+            rcds = list(rdr)
 
 
+    except IndexError:
+        rcds = []
+
+    res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
+    res.raise_for_status()
+    bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
+
+    for run_id, sam_id in bsams:
+        record = {}
+        record['sample'] = run_id
+        record['biosample_accession'] = sam_id
+        res = requests.get(data_query.format(accession=sam_id))
+        res.raise_for_status()
+        root = xml.fromstring(res.text)
+        for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
+            key, value = iter(attr)
+            record[key.text] = value.text
+        rcds.append(record)
+        print(run_id)
+
+    headers = {}
+    for record in rcds:
+        for key in record.keys():
+            headers[key] = None # use a dict to preserve header order
+    
+        
+    with open('./metadata.tsv', 'w') as f:
+        wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys())
+        wtr.writeheader()
+        wtr.writerows(rcds)
+
+
+