jpayne@0
|
1 #! /usr/bin/env python3
|
jpayne@0
|
2
|
jpayne@0
|
3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST API's."
|
jpayne@0
|
4
|
jpayne@0
|
5 import requests
|
jpayne@0
|
6 import sys
|
jpayne@2
|
7 from xml.etree import ElementTree as xml
|
jpayne@2
|
8 import csv
|
jpayne@0
|
9
|
jpayne@3
|
10 from time import sleep
|
jpayne@3
|
11
|
jpayne@2
|
12 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
|
jpayne@0
|
13
|
jpayne@0
|
14 sample = """{
|
jpayne@0
|
15 "hitCount": 2,
|
jpayne@0
|
16 "entries": [
|
jpayne@0
|
17 {
|
jpayne@0
|
18 "id": "SRX377510",
|
jpayne@0
|
19 "source": "sra-experiment"
|
jpayne@0
|
20 },
|
jpayne@0
|
21 {
|
jpayne@0
|
22 "id": "SRX583279",
|
jpayne@0
|
23 "source": "sra-experiment"
|
jpayne@0
|
24 }
|
jpayne@0
|
25 ],
|
jpayne@0
|
26 "facets": []
|
jpayne@0
|
27 }"""
|
jpayne@0
|
28
|
jpayne@3
|
29 data_query = "?display=xml"
|
jpayne@2
|
30
|
jpayne@3
|
31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"
|
jpayne@0
|
32
|
jpayne@0
|
33 sample = """{
|
jpayne@0
|
34 "hitCount": 1,
|
jpayne@0
|
35 "entries": [
|
jpayne@0
|
36 {
|
jpayne@0
|
37 "id": "SRR1029665",
|
jpayne@0
|
38 "source": "sra-run"
|
jpayne@0
|
39 }
|
jpayne@0
|
40 ],
|
jpayne@0
|
41 "facets": []
|
jpayne@0
|
42 }"""
|
jpayne@0
|
43
|
jpayne@3
|
44 def get_tag(root, tag):
|
jpayne@3
|
45 val = root.find(tag)
|
jpayne@3
|
46 if val:
|
jpayne@3
|
47 return val.text
|
jpayne@3
|
48
|
jpayne@0
|
49 if __name__ == "__main__":
|
jpayne@2
|
50 try:
|
jpayne@2
|
51 bioproject = sys.argv[1]
|
jpayne@3
|
52
|
jpayne@3
|
53 b_result = None
|
jpayne@3
|
54
|
jpayne@3
|
55 runs = []
|
jpayne@3
|
56
|
jpayne@3
|
57 while not b_result or len(runs) < b_result['hitCount']:
|
jpayne@3
|
58 b_result = requests.get(sra_run_query, params=dict(query=bioproject, start=len(runs)), headers=dict(Accept="application/json"))
|
jpayne@3
|
59 b_result.raise_for_status()
|
jpayne@3
|
60 b_result = b_result.json()
|
jpayne@3
|
61 runs += [d['id'] for d in b_result['entries']]
|
jpayne@3
|
62
|
jpayne@2
|
63 if not runs:
|
jpayne@2
|
64 print(f"No results found for '{bioproject}'.", file=sys.stderr)
|
jpayne@2
|
65 quit(1)
|
jpayne@2
|
66 except IndexError:
|
jpayne@2
|
67 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
|
jpayne@0
|
68
|
jpayne@2
|
69 try:
|
jpayne@2
|
70 with open(sys.argv[2], 'r') as f:
|
jpayne@2
|
71 rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
|
jpayne@2
|
72 rcds = list(rdr)
|
jpayne@0
|
73
|
jpayne@0
|
74
|
jpayne@2
|
75 except IndexError:
|
jpayne@2
|
76 rcds = []
|
jpayne@2
|
77
|
jpayne@3
|
78 bsams = []
|
jpayne@2
|
79
|
jpayne@3
|
80 for id in runs:
|
jpayne@3
|
81 res = requests.get(
|
jpayne@3
|
82 f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{id}/xref/sra-sample",
|
jpayne@3
|
83 headers=dict(Accept="application/json")
|
jpayne@3
|
84 )
|
jpayne@3
|
85 res.raise_for_status()
|
jpayne@3
|
86 bsams.append(res.json()['entries'][0]['references'][0]['acc'])
|
jpayne@3
|
87 sleep(.1)
|
jpayne@3
|
88
|
jpayne@3
|
89 # res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
|
jpayne@3
|
90 # res.raise_for_status()
|
jpayne@3
|
91 # bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
|
jpayne@3
|
92
|
jpayne@3
|
93 for run_id, sam_id in zip(runs, bsams):
|
jpayne@3
|
94 print(run_id)
|
jpayne@2
|
95 record = {}
|
jpayne@2
|
96 record['sample'] = run_id
|
jpayne@2
|
97 record['biosample_accession'] = sam_id
|
jpayne@3
|
98 res = requests.get(
|
jpayne@3
|
99 f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}"
|
jpayne@3
|
100 )
|
jpayne@2
|
101 res.raise_for_status()
|
jpayne@2
|
102 root = xml.fromstring(res.text)
|
jpayne@3
|
103
|
jpayne@3
|
104 record['submitter_id'] = get_tag(root, './/SUBMITTER_ID')
|
jpayne@3
|
105 record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME')
|
jpayne@3
|
106
|
jpayne@2
|
107 for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
|
jpayne@2
|
108 key, value = iter(attr)
|
jpayne@2
|
109 record[key.text] = value.text
|
jpayne@2
|
110 rcds.append(record)
|
jpayne@3
|
111 sleep(.1)
|
jpayne@2
|
112
|
jpayne@2
|
113 headers = {}
|
jpayne@2
|
114 for record in rcds:
|
jpayne@2
|
115 for key in record.keys():
|
jpayne@2
|
116 headers[key] = None # use a dict to preserve header order
|
jpayne@2
|
117
|
jpayne@2
|
118
|
jpayne@2
|
119 with open('./metadata.tsv', 'w') as f:
|
jpayne@2
|
120 wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys())
|
jpayne@2
|
121 wtr.writeheader()
|
jpayne@2
|
122 wtr.writerows(rcds)
|
jpayne@2
|
123
|
jpayne@2
|
124
|
jpayne@2
|
125
|