jpayne@4
|
1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils"
|
jpayne@4
|
2
|
jpayne@4
|
3 import requests
|
jpayne@4
|
4 import sys
|
jpayne@4
|
5 import csv
|
jpayne@4
|
6
|
jpayne@8
|
7 try:
|
jpayne@8
|
8 from itertools import batched
|
jpayne@8
|
9 except ImportError:
|
jpayne@9
|
10 from itertools import islice
|
jpayne@8
|
11 def batched(iterable, n):
|
jpayne@8
|
12 "Batch data into tuples of length n. The last batch may be shorter."
|
jpayne@8
|
13 # batched('ABCDEFG', 3) --> ABC DEF G
|
jpayne@8
|
14 if n < 1:
|
jpayne@8
|
15 raise ValueError('n must be at least one')
|
jpayne@8
|
16 it = iter(iterable)
|
jpayne@8
|
17 while batch := tuple(islice(it, n)):
|
jpayne@8
|
18 yield batch
|
jpayne@4
|
19 from functools import cmp_to_key
|
jpayne@4
|
20 from time import sleep
|
jpayne@4
|
21 from xml.etree import ElementTree as xml
|
jpayne@4
|
22
|
jpayne@4
|
23 esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
|
jpayne@4
|
24 esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
|
jpayne@4
|
25 elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
|
jpayne@4
|
26
|
jpayne@4
|
27
|
jpayne@4
|
28 import logging
|
jpayne@4
|
29 logging.basicConfig(level=logging.INFO)
|
jpayne@4
|
30
|
jpayne@4
|
31 logger = logging.getLogger("bio2srr")
|
jpayne@4
|
32
|
jpayne@4
|
33 def log(msg):
|
jpayne@4
|
34 logger.info(msg) # fix logging later
|
jpayne@4
|
35
|
jpayne@4
|
36 def get_tag(root, tag):
|
jpayne@4
|
37 val = root.find(tag)
|
jpayne@4
|
38 if val is not None:
|
jpayne@4
|
39 return val.text
|
jpayne@4
|
40 log(f"No result for {tag}")
|
jpayne@4
|
41
|
jpayne@4
|
42
|
jpayne@4
|
43
|
jpayne@4
|
44 def header_sort_override(a, b):
|
jpayne@4
|
45 if a == b:
|
jpayne@4
|
46 return 0
|
jpayne@4
|
47 try:
|
jpayne@4
|
48 for name in ["bioproject", "srr_accession", "biosample_accession", "organism", "taxid", "package",]:
|
jpayne@4
|
49 if a == name:
|
jpayne@4
|
50 return -1
|
jpayne@4
|
51 if b == name:
|
jpayne@4
|
52 return 1
|
jpayne@4
|
53 except:
|
jpayne@4
|
54 pass
|
jpayne@4
|
55 if a < b:
|
jpayne@4
|
56 return -1
|
jpayne@4
|
57 else:
|
jpayne@4
|
58 return 1
|
jpayne@4
|
59
|
jpayne@4
|
60 hso = cmp_to_key(header_sort_override)
|
jpayne@4
|
61
|
jpayne@4
|
62 def resolve_bioproject_ids_and_links(bioproject_id_list):
|
jpayne@4
|
63 "Recursively follow bioproject and biosample links, yield biosample UID's and biosample XML"
|
jpayne@4
|
64 for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
|
jpayne@4
|
65 log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
|
jpayne@4
|
66 #get bioproject to bioproject links
|
jpayne@4
|
67 response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json"))
|
jpayne@4
|
68 response.raise_for_status()
|
jpayne@4
|
69 reply = response.json()
|
jpayne@4
|
70 linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}])
|
jpayne@4
|
71 if len(linksets) >= 3:
|
jpayne@4
|
72 for id in linksets[2].get("links", []): #third index is the up to down links
|
jpayne@4
|
73 response = requests.get(esummary, params=dict(id=id, db="bioproject", format="json"))
|
jpayne@4
|
74 response.raise_for_status()
|
jpayne@4
|
75 replyy = response.json()
|
jpayne@4
|
76 biop = replyy["result"][id]["project_acc"]
|
jpayne@4
|
77 if id not in bioproject_id_list:
|
jpayne@4
|
78 bioproject_id_list.append((biop, id)) # recurse over bioproject links
|
jpayne@4
|
79 # get bioproject to biosample links
|
jpayne@4
|
80 response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json"))
|
jpayne@4
|
81 response.raise_for_status()
|
jpayne@4
|
82 reply = response.json()
|
jpayne@4
|
83 links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
|
jpayne@4
|
84 log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
|
jpayne@4
|
85 for ids in batched(links, 200):
|
jpayne@4
|
86 response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json"))
|
jpayne@4
|
87 response.raise_for_status()
|
jpayne@4
|
88 replyy = response.json()
|
jpayne@4
|
89 for field, value in replyy.get("result", {}).items():
|
jpayne@4
|
90 if "uids" not in field:
|
jpayne@4
|
91 yield bioproject, field, value["sampledata"] # this is XML, deleriously
|
jpayne@10
|
92 sleep(1)
|
jpayne@4
|
93
|
jpayne@4
|
94
|
jpayne@4
|
95 biosample_example = """
|
jpayne@4
|
96 <BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
|
jpayne@4
|
97 <Ids>
|
jpayne@4
|
98 <Id db="BioSample" is_primary="1">SAMN17131268</Id>
|
jpayne@4
|
99 <Id db_label="Sample name">CJP19-D996</Id>
|
jpayne@4
|
100 </Ids>
|
jpayne@4
|
101 <Description>
|
jpayne@4
|
102 <Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title>
|
jpayne@4
|
103 <Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni">
|
jpayne@4
|
104 <OrganismName>Campylobacter jejuni</OrganismName>
|
jpayne@4
|
105 </Organism>
|
jpayne@4
|
106 </Description>
|
jpayne@4
|
107 <Owner>
|
jpayne@4
|
108 <Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name>
|
jpayne@4
|
109 </Owner>
|
jpayne@4
|
110 <Models>
|
jpayne@4
|
111 <Model>Pathogen.env</Model>
|
jpayne@4
|
112 </Models>
|
jpayne@4
|
113 <Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package>
|
jpayne@4
|
114 <Attributes>
|
jpayne@4
|
115 <Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute>
|
jpayne@4
|
116 <Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute>
|
jpayne@4
|
117 <Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute>
|
jpayne@4
|
118 <Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute>
|
jpayne@4
|
119 <Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute>
|
jpayne@4
|
120 <Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute>
|
jpayne@4
|
121 <Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute>
|
jpayne@4
|
122 <Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute>
|
jpayne@4
|
123 <Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute>
|
jpayne@4
|
124 </Attributes>
|
jpayne@4
|
125 <Links>
|
jpayne@4
|
126 <Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link>
|
jpayne@4
|
127 </Links>
|
jpayne@4
|
128 <Status status="live" when="2020-12-21T15:08:05.693"/>
|
jpayne@4
|
129 </BioSample>
|
jpayne@4
|
130
|
jpayne@4
|
131 """
|
jpayne@4
|
132
|
jpayne@4
|
133 def flatten_biosample_xml(biosampxml):
|
jpayne@4
|
134 root = xml.fromstring(biosampxml)
|
jpayne@4
|
135 accession = get_tag(root, r'.//Id[@db="BioSample"]')
|
jpayne@4
|
136 # sample_name = get_tag(root, r'.//Id[@db_label="Sample name"]')
|
jpayne@4
|
137 organism = get_tag(root, r".//OrganismName")
|
jpayne@4
|
138 tax_id = root.find(r".//Organism").attrib.get("taxonomy_id")
|
jpayne@4
|
139 package = get_tag(root, r".//Package")
|
jpayne@4
|
140 sampledict = dict(
|
jpayne@4
|
141 biosample_accession=accession,
|
jpayne@4
|
142 # sample_name=sample_name,
|
jpayne@4
|
143 organism = organism,
|
jpayne@4
|
144 taxid = tax_id,
|
jpayne@4
|
145 package = package
|
jpayne@4
|
146 )
|
jpayne@4
|
147 for attribute in root.findall("Attributes/Attribute"):
|
jpayne@4
|
148 sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text
|
jpayne@4
|
149
|
jpayne@4
|
150 return sampledict
|
jpayne@4
|
151
|
jpayne@4
|
152
|
jpayne@4
|
153 def yield_sra_runs_from_sample(biosampleids):
|
jpayne@4
|
154 sleep(0.1)
|
jpayne@4
|
155 response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json"))
|
jpayne@4
|
156 response.raise_for_status()
|
jpayne@4
|
157 reply = response.json()
|
jpayne@4
|
158 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
|
jpayne@10
|
159 sleep(0.3)
|
jpayne@4
|
160 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json"))
|
jpayne@4
|
161 response.raise_for_status()
|
jpayne@4
|
162 replyy = response.json()
|
jpayne@4
|
163 for field, value in replyy.get("result", {}).items():
|
jpayne@4
|
164 if "uids" not in field:
|
jpayne@4
|
165 yield field, value.get("runs")
|
jpayne@4
|
166
|
jpayne@4
|
167
|
jpayne@4
|
168 runs_example = """
|
jpayne@4
|
169 <Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
|
jpayne@4
|
170 <Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
|
jpayne@4
|
171 """
|
jpayne@4
|
172
|
jpayne@4
|
173 def flatten_runs(runxml):
|
jpayne@4
|
174 root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted
|
jpayne@4
|
175 for run in root.findall(".//Run"):
|
jpayne@4
|
176 yield dict(
|
jpayne@4
|
177 sra_run_accession = run.attrib["acc"],
|
jpayne@4
|
178 total_spots = run.attrib["total_spots"],
|
jpayne@4
|
179 total_bases = run.attrib["total_bases"],
|
jpayne@4
|
180 )
|
jpayne@4
|
181
|
jpayne@4
|
182
|
jpayne@4
|
183
|
jpayne@4
|
184 def main(starting_bioproject):
|
jpayne@4
|
185 rows = []
|
jpayne@4
|
186 response = requests.get(esearch, params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json"))
|
jpayne@4
|
187 response.raise_for_status()
|
jpayne@4
|
188 reply = response.json()
|
jpayne@4
|
189 try:
|
jpayne@4
|
190 bioproject_id = reply["esearchresult"]["idlist"][0]
|
jpayne@4
|
191 log(f"Found UID {bioproject_id} for '{starting_bioproject}'")
|
jpayne@4
|
192 except IndexError:
|
jpayne@4
|
193 logger.error(f"No results found for '{starting_bioproject}'. Error was \"{reply['esearchresult']['warninglist']['outputmessages']}\"")
|
jpayne@4
|
194 sys.exit(1)
|
jpayne@4
|
195 for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
|
jpayne@4
|
196 try:
|
jpayne@4
|
197 sampledict = flatten_biosample_xml(biosample_xml)
|
jpayne@4
|
198 except KeyError:
|
jpayne@4
|
199 log(biosample_xml)
|
jpayne@4
|
200 raise
|
jpayne@4
|
201 sampledict["bioproject"] = bioproject
|
jpayne@4
|
202 for sra, runs in yield_sra_runs_from_sample(biosample):
|
jpayne@4
|
203 for run in flatten_runs(runs.strip()):
|
jpayne@4
|
204 run.update(sampledict)
|
jpayne@4
|
205 rows.append(run)
|
jpayne@4
|
206
|
jpayne@4
|
207 log(f"Writing {len(rows)} rows to metadata.tsv")
|
jpayne@4
|
208
|
jpayne@4
|
209 header = set()
|
jpayne@4
|
210 for row in rows:
|
jpayne@4
|
211 for key in row.keys():
|
jpayne@4
|
212 header.add(key)
|
jpayne@4
|
213
|
jpayne@4
|
214 header = sorted(list(header), key=hso)
|
jpayne@4
|
215 logger.info(f"Header: {header}")
|
jpayne@4
|
216
|
jpayne@4
|
217 rows.sort(key=lambda x: x["biosample_accession"])
|
jpayne@4
|
218
|
jpayne@4
|
219 with open("metadata.tsv", "w") as f:
|
jpayne@4
|
220 writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
|
jpayne@4
|
221 writer.writeheader()
|
jpayne@4
|
222 writer.writerows(rows)
|
jpayne@4
|
223
|
jpayne@4
|
224 log(f"Writing {len(rows)} accessions to accessions.txt")
|
jpayne@4
|
225
|
jpayne@4
|
226 with open("accessions.txt", "w") as f:
|
jpayne@4
|
227 for row in rows:
|
jpayne@4
|
228 f.write(row["sra_run_accession"] + "\n")
|
jpayne@4
|
229
|
jpayne@4
|
230
|
jpayne@4
|
231 if __name__ == "__main__":
|
jpayne@4
|
232 b = sys.argv[1].strip()
|
jpayne@4
|
233 log(f"Starting with {b}")
|
jpayne@4
|
234 try:
|
jpayne@4
|
235 main(b)
|
jpayne@4
|
236 except requests.HTTPError as e:
|
jpayne@4
|
237 logger.error(e)
|
jpayne@4
|
238 sys.exit(1)
|
jpayne@4
|
239
|
jpayne@4
|
240
|
jpayne@4
|
241
|
jpayne@4
|
242
|
jpayne@4
|
243
|