comparison bio2srr.py @ 4:2d4a2159c74b

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Fri, 03 May 2024 01:17:43 -0400
parents 80f1001797c7
children 832f269deeb0
comparison
equal deleted inserted replaced
3:80f1001797c7 4:2d4a2159c74b
1 #! /usr/bin/env python3 1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils"
2
3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST API's."
4 2
5 import requests 3 import requests
6 import sys 4 import sys
5 import csv
6
7
8 from itertools import batched
9 from functools import cmp_to_key
10 from time import sleep
7 from xml.etree import ElementTree as xml 11 from xml.etree import ElementTree as xml
8 import csv 12
# NCBI Entrez E-utilities endpoints used throughout this module.
esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"


import logging

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger("bio2srr")


def log(msg):
    """Emit *msg* at INFO level on the module logger."""
    logger.info(msg)  # fix logging later
28
29 data_query = "?display=xml"
30
31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"
32
33 sample = """{
34 "hitCount": 1,
35 "entries": [
36 {
37 "id": "SRR1029665",
38 "source": "sra-run"
39 }
40 ],
41 "facets": []
42 }"""
43 25
def get_tag(root, tag):
    """Return the text of the first element matching *tag* under *root*.

    Logs a miss and returns None when no element matches.
    """
    node = root.find(tag)
    if node is None:
        log(f"No result for {tag}")
        return None
    return node.text
31
32
33
# Columns pinned to the front of metadata.tsv, in this order.  Keeps the
# historical "srr_accession" spelling AND adds "sra_run_accession" — the key
# flatten_runs() actually emits — which the old list missed, so the run
# accession column was never promoted.
_HEADER_PRIORITY = (
    "bioproject",
    "srr_accession",
    "sra_run_accession",
    "biosample_accession",
    "organism",
    "taxid",
    "package",
)


def header_sort_override(a, b):
    """cmp-style comparator: well-known columns first, the rest lexically.

    Returns -1/0/1; whichever of *a*/*b* appears earlier in _HEADER_PRIORITY
    sorts first.  (The old version wrapped the loop in a bare try/except that
    could never fire — string equality doesn't raise — so it was removed.)
    """
    if a == b:
        return 0
    for name in _HEADER_PRIORITY:
        if a == name:
            return -1
        if b == name:
            return 1
    return -1 if a < b else 1


# Key function for sorted()/list.sort() built from the comparator above.
hso = cmp_to_key(header_sort_override)
51
def resolve_bioproject_ids_and_links(bioproject_id_list):
    """Walk bioproject links and yield biosample records.

    Mutates *bioproject_id_list* in place, appending (accession, uid) pairs
    for linked sub-bioprojects so the outer loop visits them too.  For each
    bioproject visited, yields (bioproject_accession, biosample_uid,
    biosample_xml) tuples; the XML comes from esummary's "sampledata" field.
    """
    for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
        log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
        # Bioproject -> bioproject links, to discover sub-projects.
        response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json"))
        response.raise_for_status()
        reply = response.json()
        linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0, 0, {}])
        if len(linksets) >= 3:
            # NOTE(review): assumes the third linksetdb entry holds the
            # "up to down" links — confirm elink guarantees that ordering.
            seen_uids = {uid for _, uid in bioproject_id_list}
            for link_uid in linksets[2].get("links", []):
                response = requests.get(esummary, params=dict(id=link_uid, db="bioproject", format="json"))
                response.raise_for_status()
                replyy = response.json()
                biop = replyy["result"][link_uid]["project_acc"]
                # BUG FIX: the old check `id not in bioproject_id_list` never
                # matched (the list holds 2-tuples, not bare uids), so a cyclic
                # link graph would grow the worklist forever.  Compare uids.
                if link_uid not in seen_uids:
                    seen_uids.add(link_uid)
                    bioproject_id_list.append((biop, link_uid))  # visited later by the outer loop
        # Bioproject -> biosample links.
        response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json"))
        response.raise_for_status()
        reply = response.json()
        links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
        log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
        for ids in batched(links, 200):  # chunk the uid list per esummary request
            response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json"))
            response.raise_for_status()
            replyy = response.json()
            for field, value in replyy.get("result", {}).items():
                if "uids" not in field:  # skip the bookkeeping "uids" key
                    yield bioproject, field, value["sampledata"]  # sampledata is embedded XML
            sleep(0.1)  # stay under NCBI's request-rate limit
83
84
# Reference example of the esummary "sampledata" BioSample XML document that
# flatten_biosample_xml() consumes (kept for documentation/testing purposes).
85 biosample_example = """
86 <BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
87 <Ids>
88 <Id db="BioSample" is_primary="1">SAMN17131268</Id>
89 <Id db_label="Sample name">CJP19-D996</Id>
90 </Ids>
91 <Description>
92 <Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title>
93 <Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni">
94 <OrganismName>Campylobacter jejuni</OrganismName>
95 </Organism>
96 </Description>
97 <Owner>
98 <Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name>
99 </Owner>
100 <Models>
101 <Model>Pathogen.env</Model>
102 </Models>
103 <Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package>
104 <Attributes>
105 <Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute>
106 <Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute>
107 <Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute>
108 <Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute>
109 <Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute>
110 <Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute>
111 <Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute>
112 <Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute>
113 <Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute>
114 </Attributes>
115 <Links>
116 <Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link>
117 </Links>
118 <Status status="live" when="2020-12-21T15:08:05.693"/>
119 </BioSample>
120
121 """
122
def flatten_biosample_xml(biosampxml):
    """Flatten one BioSample XML document into a {column: value} dict.

    Pulls the accession, organism name, taxonomy id, and package, then one
    entry per <Attribute>, keyed by its harmonized_name (falling back to
    attribute_name).  Raises KeyError if an Attribute has neither key.
    """
    root = xml.fromstring(biosampxml)
    accession = get_tag(root, r'.//Id[@db="BioSample"]')
    # sample_name = get_tag(root, r'.//Id[@db_label="Sample name"]')
    organism = get_tag(root, r".//OrganismName")
    # Robustness: some records may lack an <Organism> element entirely; the
    # old chained `.find(...).attrib` raised AttributeError in that case.
    organism_el = root.find(r".//Organism")
    tax_id = organism_el.attrib.get("taxonomy_id") if organism_el is not None else None
    package = get_tag(root, r".//Package")
    sampledict = dict(
        biosample_accession=accession,
        # sample_name=sample_name,
        organism=organism,
        taxid=tax_id,
        package=package,
    )
    for attribute in root.findall("Attributes/Attribute"):
        key = attribute.attrib.get("harmonized_name", attribute.attrib["attribute_name"])
        sampledict[key] = attribute.text
    return sampledict
141
142
def yield_sra_runs_from_sample(biosampleids):
    """Yield (sra_uid, runs_xml) for each SRA record linked to *biosampleids*.

    Accepts a single biosample uid string or an iterable of uid strings.
    runs_xml is esummary's "runs" field (may be None when absent).
    """
    # BUG FIX: main() passes a single uid *string*; ",".join on a bare string
    # interleaves commas between its characters ("123" -> "1,2,3") and would
    # query bogus uids.  Wrap a lone string in a list (iterables unchanged).
    if isinstance(biosampleids, str):
        biosampleids = [biosampleids]
    sleep(0.1)  # rate limit
    response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json"))
    response.raise_for_status()
    reply = response.json()
    links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
    for ids in batched(links, 200):  # chunk uids per esummary request
        sleep(0.1)
        response = requests.get(esummary, params=dict(id=",".join(ids), db="sra", format="json"))
        response.raise_for_status()
        replyy = response.json()
        for field, value in replyy.get("result", {}).items():
            if "uids" not in field:  # skip the bookkeeping "uids" key
                yield field, value.get("runs")
156
157
# Example of the multi-rooted <Run/> XML fragment esummary returns in its
# "runs" field; flatten_runs() wraps such fragments in a synthetic root
# element before parsing, since they are not singly-rooted documents.
158 runs_example = """
159 <Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
160 <Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
161 """
162
def flatten_runs(runxml):
    """Yield one dict per <Run> element found in *runxml*.

    The fragment may contain several sibling <Run/> tags with no common root,
    so it is wrapped in a synthetic <data> element before parsing.
    """
    doc = xml.fromstring(f"<data>{runxml}</data>")
    for node in doc.findall(".//Run"):
        attrs = node.attrib
        yield {
            "sra_run_accession": attrs["acc"],
            "total_spots": attrs["total_spots"],
            "total_bases": attrs["total_bases"],
        }
171
172
173
def main(starting_bioproject):
    """Resolve *starting_bioproject* to every linked SRA run and write outputs.

    Side effects: writes metadata.tsv (one tab-separated row per run) and
    accessions.txt (one SRR accession per line) in the working directory.
    Exits with status 1 when the accession cannot be resolved.
    """
    rows = []
    response = requests.get(esearch, params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json"))
    response.raise_for_status()
    reply = response.json()
    try:
        bioproject_id = reply["esearchresult"]["idlist"][0]
        log(f"Found UID {bioproject_id} for '{starting_bioproject}'")
    except IndexError:
        # Robustness: the warninglist keys may be absent from the reply; don't
        # mask the real problem with a KeyError while reporting it.
        warnings = reply.get("esearchresult", {}).get("warninglist", {}).get("outputmessages", "no details")
        logger.error(f"No results found for '{starting_bioproject}'. Error was \"{warnings}\"")
        sys.exit(1)
    for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
        try:
            sampledict = flatten_biosample_xml(biosample_xml)
        except KeyError:
            log(biosample_xml)  # dump the offending record before re-raising
            raise
        sampledict["bioproject"] = bioproject
        for sra, runs in yield_sra_runs_from_sample(biosample):
            # Robustness: esummary may omit "runs" (value is None); the old
            # code crashed on None.strip().
            if not runs:
                continue
            for run in flatten_runs(runs.strip()):
                run.update(sampledict)
                rows.append(run)

    log(f"Writing {len(rows)} rows to metadata.tsv")

    # Union of every row's keys, with the well-known columns pinned first.
    header = sorted({key for row in rows for key in row}, key=hso)
    logger.info(f"Header: {header}")

    rows.sort(key=lambda row: row["biosample_accession"])

    # newline="" keeps the csv module from writing doubled line endings on Windows.
    with open("metadata.tsv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
        writer.writeheader()
        writer.writerows(rows)

    log(f"Writing {len(rows)} accessions to accessions.txt")

    with open("accessions.txt", "w") as f:
        f.writelines(row["sra_run_accession"] + "\n" for row in rows)
219
48 220
if __name__ == "__main__":
    # Guard the argv access: the old code let a bare IndexError escape with no
    # usage hint when the accession argument was missing.
    if len(sys.argv) < 2:
        logger.error("Usage: bio2srr.py <BioProject accession>")
        sys.exit(1)
    b = sys.argv[1].strip()
    log(f"Starting with {b}")
    try:
        main(b)
    except requests.HTTPError as e:
        # Network/HTTP failures are reported, not tracebacked, for operators.
        logger.error(e)
        sys.exit(1)