comparison bio2srr.py @ 3:80f1001797c7

Uploaded
author jpayne
date Wed, 27 Oct 2021 05:00:45 -0400
parents 556cac4fb538
children 2d4a2159c74b
comparison
equal deleted inserted replaced
2:556cac4fb538 3:80f1001797c7
4 4
5 import requests 5 import requests
6 import sys 6 import sys
7 from xml.etree import ElementTree as xml 7 from xml.etree import ElementTree as xml
8 import csv 8 import csv
9
10 from time import sleep
9 11
10 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}" 12 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
11 13
12 sample = """{ 14 sample = """{
13 "hitCount": 2, 15 "hitCount": 2,
22 } 24 }
23 ], 25 ],
24 "facets": [] 26 "facets": []
25 }""" 27 }"""
26 28
27 data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml" 29 data_query = "?display=xml"
28 30
29 xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample" 31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"
30
31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}"
32 32
33 sample = """{ 33 sample = """{
34 "hitCount": 1, 34 "hitCount": 1,
35 "entries": [ 35 "entries": [
36 { 36 {
39 } 39 }
40 ], 40 ],
41 "facets": [] 41 "facets": []
42 }""" 42 }"""
43 43
def get_tag(root, tag):
    """Return the text of the first element under *root* matching *tag*.

    Args:
        root: An ``xml.etree.ElementTree`` element to search from.
        tag: Tag name or ElementPath expression passed to ``root.find``.

    Returns:
        The matched element's ``text`` (which may itself be ``None`` when
        the element has no text), or ``None`` when nothing matches.
    """
    element = root.find(tag)
    # Compare against None explicitly: an Element with no children is falsy,
    # so a plain truth test would discard leaf nodes that carry only text.
    if element is not None:
        return element.text
    return None
48
44 if __name__ == "__main__": 49 if __name__ == "__main__":
45 try: 50 try:
46 bioproject = sys.argv[1] 51 bioproject = sys.argv[1]
47 b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json")) 52
48 b_result.raise_for_status() 53 b_result = None
49 runs = [d['id'] for d in b_result.json()['entries']] 54
55 runs = []
56
57 while not b_result or len(runs) < b_result['hitCount']:
58 b_result = requests.get(sra_run_query, params=dict(query=bioproject, start=len(runs)), headers=dict(Accept="application/json"))
59 b_result.raise_for_status()
60 b_result = b_result.json()
61 runs += [d['id'] for d in b_result['entries']]
62
50 if not runs: 63 if not runs:
51 print(f"No results found for '{bioproject}'.", file=sys.stderr) 64 print(f"No results found for '{bioproject}'.", file=sys.stderr)
52 quit(1) 65 quit(1)
53 except IndexError: 66 except IndexError:
54 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") 67 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
55 except KeyError as e:
56 raise ValueError() from e
57 68
58 try: 69 try:
59 with open(sys.argv[2], 'r') as f: 70 with open(sys.argv[2], 'r') as f:
60 rdr = csv.DictReader(f, dialect='excel', delimiter='\t') 71 rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
61 rcds = list(rdr) 72 rcds = list(rdr)
62 73
63 74
64 except IndexError: 75 except IndexError:
65 rcds = [] 76 rcds = []
66 77
67 res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json")) 78 bsams = []
68 res.raise_for_status()
69 bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
70 79
71 for run_id, sam_id in bsams: 80 for id in runs:
81 res = requests.get(
82 f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{id}/xref/sra-sample",
83 headers=dict(Accept="application/json")
84 )
85 res.raise_for_status()
86 bsams.append(res.json()['entries'][0]['references'][0]['acc'])
87 sleep(.1)
88
89 # res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
90 # res.raise_for_status()
91 # bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
92
93 for run_id, sam_id in zip(runs, bsams):
94 print(run_id)
72 record = {} 95 record = {}
73 record['sample'] = run_id 96 record['sample'] = run_id
74 record['biosample_accession'] = sam_id 97 record['biosample_accession'] = sam_id
75 res = requests.get(data_query.format(accession=sam_id)) 98 res = requests.get(
99 f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}"
100 )
76 res.raise_for_status() 101 res.raise_for_status()
77 root = xml.fromstring(res.text) 102 root = xml.fromstring(res.text)
103
104 record['submitter_id'] = get_tag(root, './/SUBMITTER_ID')
105 record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME')
106
78 for attr in root.findall('.//SAMPLE_ATTRIBUTE'): 107 for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
79 key, value = iter(attr) 108 key, value = iter(attr)
80 record[key.text] = value.text 109 record[key.text] = value.text
81 rcds.append(record) 110 rcds.append(record)
82 print(run_id) 111 sleep(.1)
83 112
84 headers = {} 113 headers = {}
85 for record in rcds: 114 for record in rcds:
86 for key in record.keys(): 115 for key in record.keys():
87 headers[key] = None # use a dict to preserve header order 116 headers[key] = None # use a dict to preserve header order