Mercurial > repos > jpayne > bioproject_to_srr_2
comparison bio2srr.py @ 3:80f1001797c7
Uploaded
author | jpayne |
---|---|
date | Wed, 27 Oct 2021 05:00:45 -0400 |
parents | 556cac4fb538 |
children | 2d4a2159c74b |
comparison
equal
deleted
inserted
replaced
2:556cac4fb538 | 3:80f1001797c7 |
---|---|
4 | 4 |
5 import requests | 5 import requests |
6 import sys | 6 import sys |
7 from xml.etree import ElementTree as xml | 7 from xml.etree import ElementTree as xml |
8 import csv | 8 import csv |
9 | |
10 from time import sleep | |
9 | 11 |
10 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}" | 12 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}" |
11 | 13 |
12 sample = """{ | 14 sample = """{ |
13 "hitCount": 2, | 15 "hitCount": 2, |
22 } | 24 } |
23 ], | 25 ], |
24 "facets": [] | 26 "facets": [] |
25 }""" | 27 }""" |
26 | 28 |
27 data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml" | 29 data_query = "?display=xml" |
28 | 30 |
29 xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample" | 31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run" |
30 | |
31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}" | |
32 | 32 |
33 sample = """{ | 33 sample = """{ |
34 "hitCount": 1, | 34 "hitCount": 1, |
35 "entries": [ | 35 "entries": [ |
36 { | 36 { |
39 } | 39 } |
40 ], | 40 ], |
41 "facets": [] | 41 "facets": [] |
42 }""" | 42 }""" |
43 | 43 |
44 def get_tag(root, tag): | |
45 val = root.find(tag) | |
46 if val: | |
47 return val.text | |
48 | |
44 if __name__ == "__main__": | 49 if __name__ == "__main__": |
45 try: | 50 try: |
46 bioproject = sys.argv[1] | 51 bioproject = sys.argv[1] |
47 b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json")) | 52 |
48 b_result.raise_for_status() | 53 b_result = None |
49 runs = [d['id'] for d in b_result.json()['entries']] | 54 |
55 runs = [] | |
56 | |
57 while not b_result or len(runs) < b_result['hitCount']: | |
58 b_result = requests.get(sra_run_query, params=dict(query=bioproject, start=len(runs)), headers=dict(Accept="application/json")) | |
59 b_result.raise_for_status() | |
60 b_result = b_result.json() | |
61 runs += [d['id'] for d in b_result['entries']] | |
62 | |
50 if not runs: | 63 if not runs: |
51 print(f"No results found for '{bioproject}'.", file=sys.stderr) | 64 print(f"No results found for '{bioproject}'.", file=sys.stderr) |
52 quit(1) | 65 quit(1) |
53 except IndexError: | 66 except IndexError: |
54 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") | 67 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") |
55 except KeyError as e: | |
56 raise ValueError() from e | |
57 | 68 |
58 try: | 69 try: |
59 with open(sys.argv[2], 'r') as f: | 70 with open(sys.argv[2], 'r') as f: |
60 rdr = csv.DictReader(f, dialect='excel', delimiter='\t') | 71 rdr = csv.DictReader(f, dialect='excel', delimiter='\t') |
61 rcds = list(rdr) | 72 rcds = list(rdr) |
62 | 73 |
63 | 74 |
64 except IndexError: | 75 except IndexError: |
65 rcds = [] | 76 rcds = [] |
66 | 77 |
67 res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json")) | 78 bsams = [] |
68 res.raise_for_status() | |
69 bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']] | |
70 | 79 |
71 for run_id, sam_id in bsams: | 80 for id in runs: |
81 res = requests.get( | |
82 f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{id}/xref/sra-sample", | |
83 headers=dict(Accept="application/json") | |
84 ) | |
85 res.raise_for_status() | |
86 bsams.append(res.json()['entries'][0]['references'][0]['acc']) | |
87 sleep(.1) | |
88 | |
89 # res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json")) | |
90 # res.raise_for_status() | |
91 # bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']] | |
92 | |
93 for run_id, sam_id in zip(runs, bsams): | |
94 print(run_id) | |
72 record = {} | 95 record = {} |
73 record['sample'] = run_id | 96 record['sample'] = run_id |
74 record['biosample_accession'] = sam_id | 97 record['biosample_accession'] = sam_id |
75 res = requests.get(data_query.format(accession=sam_id)) | 98 res = requests.get( |
99 f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}" | |
100 ) | |
76 res.raise_for_status() | 101 res.raise_for_status() |
77 root = xml.fromstring(res.text) | 102 root = xml.fromstring(res.text) |
103 | |
104 record['submitter_id'] = get_tag(root, './/SUBMITTER_ID') | |
105 record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME') | |
106 | |
78 for attr in root.findall('.//SAMPLE_ATTRIBUTE'): | 107 for attr in root.findall('.//SAMPLE_ATTRIBUTE'): |
79 key, value = iter(attr) | 108 key, value = iter(attr) |
80 record[key.text] = value.text | 109 record[key.text] = value.text |
81 rcds.append(record) | 110 rcds.append(record) |
82 print(run_id) | 111 sleep(.1) |
83 | 112 |
84 headers = {} | 113 headers = {} |
85 for record in rcds: | 114 for record in rcds: |
86 for key in record.keys(): | 115 for key in record.keys(): |
87 headers[key] = None # use a dict to preserve header order | 116 headers[key] = None # use a dict to preserve header order |