comparison bio2srr.py @ 2:556cac4fb538

"planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/b07378e15ad9"
author jpayne
date Mon, 03 Aug 2020 10:39:37 -0400
parents 02ac32a00e25
children 80f1001797c7
comparison
equal deleted inserted replaced
1:b07378e15ad9 2:556cac4fb538
2 2
3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST API's." 3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST API's."
4 4
5 import requests 5 import requests
6 import sys 6 import sys
7 from xml.etree import ElementTree as xml
8 import csv
7 9
8 sra_exp_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}" 10 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
9 11
10 sample = """{ 12 sample = """{
11 "hitCount": 2, 13 "hitCount": 2,
12 "entries": [ 14 "entries": [
13 { 15 {
20 } 22 }
21 ], 23 ],
22 "facets": [] 24 "facets": []
23 }""" 25 }"""
24 26
25 sra_run_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={experiment}" 27 data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"
28
29 xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"
30
31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"
26 32
27 sample = """{ 33 sample = """{
28 "hitCount": 1, 34 "hitCount": 1,
29 "entries": [ 35 "entries": [
30 { 36 {
34 ], 40 ],
35 "facets": [] 41 "facets": []
36 }""" 42 }"""
37 43
38 if __name__ == "__main__": 44 if __name__ == "__main__":
39 try: 45 try:
40 bioproject = sys.argv[1] 46 bioproject = sys.argv[1]
41 b_result = requests.get(sra_exp_query.format(bioproject=bioproject), headers=dict(Accept="application/json")) 47 b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
42 b_result.raise_for_status() 48 b_result.raise_for_status()
43 if b_result.json()['entries']: 49 runs = [d['id'] for d in b_result.json()['entries']]
44 for experiment in [d['id'] for d in b_result.json()['entries']]: 50 if not runs:
45 r_result = requests.get(sra_run_query.format(experiment=experiment), headers=dict(Accept="application/json")) 51 print(f"No results found for '{bioproject}'.", file=sys.stderr)
46 r_result.raise_for_status() 52 quit(1)
47 for run in [d['id'] for d in r_result.json()['entries']]: 53 except IndexError:
48 print(run) 54 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
49 else: 55 except KeyError as e:
50 print(f"No results found for '{bioproject}'.", file=sys.stderr) 56 raise ValueError() from e
51 quit(1) 57
52 except IndexError: 58 try:
53 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") 59 with open(sys.argv[2], 'r') as f:
54 except KeyError as e: 60 rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
55 raise ValueError() from e 61 rcds = list(rdr)
62
63
64 except IndexError:
65 rcds = []
66
67 res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
68 res.raise_for_status()
69 bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
70
71 for run_id, sam_id in bsams:
72 record = {}
73 record['sample'] = run_id
74 record['biosample_accession'] = sam_id
75 res = requests.get(data_query.format(accession=sam_id))
76 res.raise_for_status()
77 root = xml.fromstring(res.text)
78 for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
79 key, value = iter(attr)
80 record[key.text] = value.text
81 rcds.append(record)
82 print(run_id)
83
84 headers = {}
85 for record in rcds:
86 for key in record.keys():
87 headers[key] = None # use a dict to preserve header order
88
89
90 with open('./metadata.tsv', 'w') as f:
91 wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys())
92 wtr.writeheader()
93 wtr.writerows(rcds)
56 94
57 95
58 96