bioproject_to_srr_2: bio2srr.py comparison

comparison bio2srr.py @ 2:556cac4fb538

"planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/b07378e15ad9"

author	jpayne
date	Mon, 03 Aug 2020 10:39:37 -0400
parents	02ac32a00e25
children	80f1001797c7

comparison

Legend: equal, deleted, inserted, replaced

-:b07378e15ad9
+:556cac4fb538
 "Grab SRR numbers from BioProjects via the EMBL-ENA REST API's."
 import requests
 import sys
+from xml.etree import ElementTree as xml
+import csv
-sra_exp_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
+sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
 sample = """{
 "hitCount": 2,
 "entries": [
 {
 }
 ],
 "facets": []
 }"""
-sra_run_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={experiment}"
+data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"
+xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"
+sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"
 sample = """{
 "hitCount": 1,
 "entries": [
 {
 ],
 "facets": []
 }"""
 if __name__ == "__main__":
-	try:
+try:
-		bioproject = sys.argv[1]
+bioproject = sys.argv[1]
-		b_result = requests.get(sra_exp_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
+b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
-		b_result.raise_for_status()
+b_result.raise_for_status()
-		if b_result.json()['entries']:
+runs = [d['id'] for d in b_result.json()['entries']]
-			for experiment in [d['id'] for d in b_result.json()['entries']]:
+if not runs:
-				r_result = requests.get(sra_run_query.format(experiment=experiment), headers=dict(Accept="application/json"))
+print(f"No results found for '{bioproject}'.", file=sys.stderr)
-				r_result.raise_for_status()
+quit(1)
-				for run in [d['id'] for d in r_result.json()['entries']]:
+except IndexError:
-					print(run)
+raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
-		else:
+except KeyError as e:
-			print(f"No results found for '{bioproject}'.", file=sys.stderr)
+raise ValueError() from e
-			quit(1)
-	except IndexError:
+try:
-		raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
+with open(sys.argv[2], 'r') as f:
-	except KeyError as e:
+rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
-		raise ValueError() from e
+rcds = list(rdr)
+except IndexError:
+rcds = []
+res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
+res.raise_for_status()
+bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
+for run_id, sam_id in bsams:
+record = {}
+record['sample'] = run_id
+record['biosample_accession'] = sam_id
+res = requests.get(data_query.format(accession=sam_id))
+res.raise_for_status()
+root = xml.fromstring(res.text)
+for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
+key, value = iter(attr)
+record[key.text] = value.text
+rcds.append(record)
+print(run_id)
+headers = {}
+for record in rcds:
+for key in record.keys():
+headers[key] = None # use a dict to preserve header order
+with open('./metadata.tsv', 'w') as f:
+wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys())
+wtr.writeheader()
+wtr.writerows(rcds)

Mercurial > repos > jpayne > bioproject_to_srr_2

comparison bio2srr.py @ 2:556cac4fb538