Mercurial > repos > jpayne > bioproject_to_srr_2
comparison bio2srr.py @ 4:2d4a2159c74b
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author | jpayne |
---|---|
date | Fri, 03 May 2024 01:17:43 -0400 |
parents | 80f1001797c7 |
children | 832f269deeb0 |
comparison
equal
deleted
inserted
replaced
3:80f1001797c7 | 4:2d4a2159c74b |
---|---|
"Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils"

import csv
import logging
import sys

from functools import cmp_to_key
from itertools import batched
from time import sleep
from xml.etree import ElementTree as xml

import requests

# NCBI Entrez E-utilities endpoints:
#   esearch  - accession/term -> UID list
#   esummary - UID(s) -> document summary records
#   elink    - cross-database link resolution
esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger("bio2srr")

def log(msg):
    """Log *msg* at INFO level via the module logger."""
    logger.info(msg) # fix logging later
def get_tag(root, tag):
    """Return the text of the first element in *root* matching *tag*.

    Logs a message and returns None when no matching element exists.
    """
    element = root.find(tag)
    if element is None:
        log(f"No result for {tag}")
        return None
    return element.text
31 | |
32 | |
33 | |
def header_sort_override(a, b):
    """Comparator for output column names.

    The well-known metadata columns sort first, in the fixed order below
    (whichever of a/b appears earlier in the list wins); all other names
    sort alphabetically after them.
    """
    if a == b:
        return 0
    # NOTE: the original wrapped this loop in a bare `try/except: pass`,
    # but string equality checks cannot raise — the handler only masked
    # real errors, so it has been removed (behavior is unchanged).
    for name in ["bioproject", "srr_accession", "biosample_accession", "organism", "taxid", "package",]:
        if a == name:
            return -1
        if b == name:
            return 1
    # Neither is a priority column: plain lexicographic order.
    if a < b:
        return -1
    else:
        return 1

hso = cmp_to_key(header_sort_override)
51 | |
def resolve_bioproject_ids_and_links(bioproject_id_list):
    "Recursively follow bioproject and biosample links, yield biosample UID's and biosample XML"
    # bioproject_id_list is a worklist of (accession, uid) tuples; newly
    # discovered sub-projects are appended while we iterate, so the loop
    # naturally walks the whole tree breadth-first.
    for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
        log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
        # get bioproject to bioproject links
        response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json"))
        response.raise_for_status()
        reply = response.json()
        linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0, 0, {}])
        if len(linksets) >= 3:
            # BUG FIX: the original tested `uid not in bioproject_id_list`, i.e. a
            # UID string against a list of (accession, uid) tuples — always true,
            # so sub-projects were re-queued every time they were linked (duplicate
            # work, and an infinite loop on cyclic links). Compare against the UIDs.
            known_uids = {queued_uid for _, queued_uid in bioproject_id_list}
            for uid in linksets[2].get("links", []):  # third index is the up to down links
                response = requests.get(esummary, params=dict(id=uid, db="bioproject", format="json"))
                response.raise_for_status()
                replyy = response.json()
                biop = replyy["result"][uid]["project_acc"]
                if uid not in known_uids:
                    bioproject_id_list.append((biop, uid))  # recurse over bioproject links
                    known_uids.add(uid)
        # get bioproject to biosample links
        response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json"))
        response.raise_for_status()
        reply = response.json()
        links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
        log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
        for ids in batched(links, 200):  # esummary is queried in batches of 200 UIDs
            response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json"))
            response.raise_for_status()
            replyy = response.json()
            for field, value in replyy.get("result", {}).items():
                if "uids" not in field:  # skip the bookkeeping "uids" entry in the result dict
                    yield bioproject, field, value["sampledata"]  # this is XML, deleriously
            sleep(0.1)  # stay polite w.r.t. the NCBI rate limit
83 | |
84 | |
# Reference only: example of the BioSample XML string returned in the esummary
# "sampledata" field (see resolve_bioproject_ids_and_links / flatten_biosample_xml).
# Never used at runtime.
biosample_example = """
<BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
<Ids>
<Id db="BioSample" is_primary="1">SAMN17131268</Id>
<Id db_label="Sample name">CJP19-D996</Id>
</Ids>
<Description>
<Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title>
<Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni">
<OrganismName>Campylobacter jejuni</OrganismName>
</Organism>
</Description>
<Owner>
<Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name>
</Owner>
<Models>
<Model>Pathogen.env</Model>
</Models>
<Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package>
<Attributes>
<Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute>
<Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute>
<Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute>
<Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute>
<Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute>
<Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute>
<Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute>
<Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute>
<Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute>
</Attributes>
<Links>
<Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link>
</Links>
<Status status="live" when="2020-12-21T15:08:05.693"/>
</BioSample>

"""
122 | |
def flatten_biosample_xml(biosampxml):
    """Flatten one biosample XML document into a flat dict of metadata fields.

    Returns a dict with biosample_accession/organism/taxid/package plus one
    key per <Attribute> (harmonized_name preferred over attribute_name).
    Raises KeyError if an Attribute has neither name attribute — the caller
    (main) catches that, logs the offending XML, and re-raises.
    """
    root = xml.fromstring(biosampxml)
    accession = get_tag(root, r'.//Id[@db="BioSample"]')
    # sample_name = get_tag(root, r'.//Id[@db_label="Sample name"]')
    organism = get_tag(root, r".//OrganismName")
    # Guard: the original did root.find(...).attrib directly, which raised an
    # uncaught AttributeError when the record had no <Organism> element.
    organism_element = root.find(r".//Organism")
    tax_id = organism_element.attrib.get("taxonomy_id") if organism_element is not None else None
    package = get_tag(root, r".//Package")
    sampledict = dict(
        biosample_accession=accession,
        # sample_name=sample_name,
        organism = organism,
        taxid = tax_id,
        package = package
    )
    for attribute in root.findall("Attributes/Attribute"):
        # prefer the standardized ("harmonized") attribute name when present
        sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text

    return sampledict
141 | |
142 | |
def yield_sra_runs_from_sample(biosampleids):
    """Yield (sra_uid, runs_xml) pairs for SRA records linked to the given biosample UIDs.

    biosampleids: iterable of biosample UID strings, joined into a single elink call.
    The yielded "runs" value is whatever the SRA esummary record holds in its
    "runs" field — an XML fragment string (see flatten_runs), or None if absent.
    """
    sleep(0.1)  # stay polite w.r.t. the NCBI rate limit
    response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json"))
    response.raise_for_status()
    reply = response.json()
    # page through the linked SRA UIDs, querying esummary 200 at a time
    for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
        sleep(0.1)
        response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json"))
        response.raise_for_status()
        replyy = response.json()
        for field, value in replyy.get("result", {}).items():
            if "uids" not in field:  # skip the bookkeeping "uids" entry in the result dict
                yield field, value.get("runs")
156 | |
157 | |
# Reference only: example of the "runs" fragment embedded in SRA esummary
# records — sibling <Run/> elements with no single root, which is why
# flatten_runs wraps it in a synthetic <data> element before parsing.
runs_example = """
<Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
<Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
"""
162 | |
def flatten_runs(runxml):
    """Yield one dict (accession, spot count, base count) per <Run> in *runxml*."""
    # The SRA "runs" field is a bare sequence of <Run/> tags with no single
    # root, so wrap it in a synthetic element to make it parseable.
    document = xml.fromstring(f"<data>{runxml}</data>")
    for node in document.findall(".//Run"):
        attributes = node.attrib
        yield {
            "sra_run_accession": attributes["acc"],
            "total_spots": attributes["total_spots"],
            "total_bases": attributes["total_bases"],
        }
171 | |
172 | |
173 | |
def main(starting_bioproject):
    """Resolve *starting_bioproject* (an accession like PRJNA680235) to every
    linked SRA run, then write metadata.tsv and accessions.txt in the CWD.

    Exits with status 1 when the accession yields no bioproject UID.
    """
    rows = []
    response = requests.get(esearch, params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json"))
    response.raise_for_status()
    reply = response.json()
    try:
        bioproject_id = reply["esearchresult"]["idlist"][0]
        log(f"Found UID {bioproject_id} for '{starting_bioproject}'")
    except IndexError:
        # Robustness fix: the original indexed reply['esearchresult']['warninglist']
        # ['outputmessages'] directly, which could raise KeyError while reporting
        # the "no results" error itself.
        messages = reply.get("esearchresult", {}).get("warninglist", {}).get("outputmessages", "unknown")
        logger.error(f"No results found for '{starting_bioproject}'. Error was \"{messages}\"")
        sys.exit(1)
    for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
        try:
            sampledict = flatten_biosample_xml(biosample_xml)
        except KeyError:
            log(biosample_xml)  # dump the offending record before re-raising
            raise
        sampledict["bioproject"] = bioproject
        for sra, runs in yield_sra_runs_from_sample(biosample):
            for run in flatten_runs(runs.strip()):
                run.update(sampledict)
                rows.append(run)

    log(f"Writing {len(rows)} rows to metadata.tsv")

    # Union of every key seen across rows — samples may carry different attributes.
    header = set()
    for row in rows:
        for key in row.keys():
            header.add(key)

    header = sorted(list(header), key=hso)  # well-known columns first (see header_sort_override)
    logger.info(f"Header: {header}")

    rows.sort(key=lambda x: x["biosample_accession"])

    # newline="" per the csv module docs, so the writer controls line endings.
    with open("metadata.tsv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
        writer.writeheader()
        writer.writerows(rows)

    log(f"Writing {len(rows)} accessions to accessions.txt")

    with open("accessions.txt", "w") as f:
        for row in rows:
            f.write(row["sra_run_accession"] + "\n")
219 | |
48 | 220 |
if __name__ == "__main__":
    # Usage: bio2srr.py <bioproject accession>
    # Robustness fix: a missing argument used to surface as a bare IndexError
    # traceback; fail with a clear message and a distinct exit code instead.
    if len(sys.argv) < 2:
        logger.error("Please provide an NCBI BioProject accession, e.g. PRJNA681235")
        sys.exit(2)
    b = sys.argv[1].strip()
    log(f"Starting with {b}")
    try:
        main(b)
    except requests.HTTPError as e:
        # Any eutils request failure ends the run with a logged error.
        logger.error(e)
        sys.exit(1)