jpayne@0
|
1 #! /usr/bin/env python3
|
jpayne@0
|
2
|
jpayne@0
|
3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST APIs."
|
jpayne@0
|
4
|
jpayne@0
|
5 import requests
|
jpayne@0
|
6 import sys
|
jpayne@2
|
7 from xml.etree import ElementTree as xml
|
jpayne@2
|
8 import csv
|
jpayne@0
|
9
|
jpayne@2
|
# URL templates for the EBI web services used by this script. Each template
# has a single str.format() placeholder.

# EBI Search over the "sra-experiment" domain; {bioproject} is the query term.
sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"

# Example JSON payload whose entries carry source "sra-experiment"
# (presumably a response to sra_exp_query -- kept for reference only).
sra_experiment_sample = """{
  "hitCount": 2,
  "entries": [
    {
      "id": "SRX377510",
      "source": "sra-experiment"
    },
    {
      "id": "SRX583279",
      "source": "sra-experiment"
    }
  ],
  "facets": []
}"""

# ENA record view; {accession} is the accession to fetch, returned as XML.
data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"

# EBI Search cross-reference lookup from runs to samples; {runs} is a
# comma-separated list of run ids.
xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample"

# EBI Search over the "sra-run" domain; {bioproject} is the query term.
sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"

# Example JSON payload whose entries carry source "sra-run"
# (presumably a response to sra_run_query -- kept for reference only).
sra_run_sample = """{
  "hitCount": 1,
  "entries": [
    {
      "id": "SRR1029665",
      "source": "sra-run"
    }
  ],
  "facets": []
}"""

# Both examples used to be assigned to the single name `sample`, so the first
# one was silently overwritten. `sample` stays bound to the run example (its
# previous final value) for backward compatibility.
sample = sra_run_sample
|
jpayne@0
|
43
|
jpayne@0
|
def parse_sample_attributes(xml_text):
    """Return the SAMPLE_ATTRIBUTE entries of an XML document as a dict.

    For every ``SAMPLE_ATTRIBUTE`` element found anywhere in *xml_text*, the
    text of its first child becomes the key and the text of its second child
    becomes the value.

    Attributes with no children are skipped, a missing second child yields
    ``None``, and any children past the second are ignored, so an attribute
    that does not have exactly two children no longer aborts the run.
    When a key occurs more than once the last occurrence wins.
    """
    attributes = {}
    root = xml.fromstring(xml_text)
    for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
        children = list(attr)
        if not children:
            continue
        value = children[1].text if len(children) > 1 else None
        attributes[children[0].text] = value
    return attributes


def collect_headers(records):
    """Return every key used by *records*, in first-seen order, without duplicates."""
    headers = {}  # use a dict to preserve header order
    for record in records:
        for key in record:
            headers[key] = None
    return list(headers)


def main(argv):
    """Print the run ids for an accession and write their sample metadata.

    argv[1] -- accession to search for (required).
    argv[2] -- optional path to an existing tab-separated metadata file whose
               rows are carried over ahead of the newly fetched ones.

    Each run id is printed to stdout; the combined table is written to
    ``./metadata.tsv``.

    Raises ValueError when no accession is given or the search response has
    an unexpected shape. Exits with status 1 when the search finds no runs.
    HTTP errors propagate from ``raise_for_status()``.
    """
    # Check the argument up front so an IndexError raised anywhere else is
    # not misreported as a missing accession.
    if len(argv) < 2:
        raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
    bioproject = argv[1]
    json_headers = dict(Accept="application/json")

    # NOTE(review): EBI Search appears to page its results (see the `size` and
    # `start` parameters in its REST docs) -- confirm large projects are not
    # silently truncated by this single request.
    run_response = requests.get(sra_run_query.format(bioproject=bioproject), headers=json_headers)
    run_response.raise_for_status()
    try:
        runs = [entry['id'] for entry in run_response.json()['entries']]
    except KeyError as e:
        raise ValueError(f"Unexpected search response for '{bioproject}': missing key {e}.") from e
    if not runs:
        print(f"No results found for '{bioproject}'.", file=sys.stderr)
        sys.exit(1)  # quit() is only defined when the site module is loaded

    # Seed the table with rows from a previous metadata file, if one was given.
    records = []
    if len(argv) > 2:
        # newline='' lets the csv module do its own newline handling.
        with open(argv[2], 'r', newline='') as f:
            records = list(csv.DictReader(f, dialect='excel', delimiter='\t'))

    # Map every run to its sample via the cross-reference endpoint.
    xref_response = requests.get(xref_query.format(runs=",".join(runs)), headers=json_headers)
    xref_response.raise_for_status()
    run_samples = []
    for entry in xref_response.json()['entries']:
        references = entry.get('references') or []
        if not references:
            # A run with no linked sample used to crash the whole script.
            print(f"No sample cross-reference found for '{entry['id']}'; skipping.", file=sys.stderr)
            continue
        run_samples.append((entry['id'], references[0]['acc']))

    for run_id, sam_id in run_samples:
        record = {'sample': run_id, 'biosample_accession': sam_id}
        sample_response = requests.get(data_query.format(accession=sam_id))
        sample_response.raise_for_status()
        record.update(parse_sample_attributes(sample_response.text))
        records.append(record)
        print(run_id)

    with open('./metadata.tsv', 'w', newline='') as f:
        writer = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=collect_headers(records))
        writer.writeheader()
        writer.writerows(records)


if __name__ == "__main__":
    main(sys.argv)
|
jpayne@2
|
94
|
jpayne@2
|
95
|
jpayne@2
|
96
|