bio2srr.py @ 0:79fa4330f2c9 (draft, default, tip)
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
| author | jpayne |
|---|---|
| date | Mon, 08 Dec 2025 20:18:03 +0000 |
| parents | |
| children | |
| 1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils" | |
| 2 | |
| 3 import requests | |
| 4 import sys | |
| 5 import csv | |
| 6 import os | |
| 7 | |
| 8 try: | |
| 9 from itertools import batched | |
| 10 except ImportError: | |
| 11 from itertools import islice | |
| 12 def batched(iterable, n): | |
| 13 "Batch data into tuples of length n. The last batch may be shorter." | |
| 14 # batched('ABCDEFG', 3) --> ABC DEF G | |
| 15 if n < 1: | |
| 16 raise ValueError('n must be at least one') | |
| 17 it = iter(iterable) | |
| 18 while batch := tuple(islice(it, n)): | |
| 19 yield batch | |
| 20 from functools import cmp_to_key | |
| 21 from time import sleep | |
| 22 from xml.etree import ElementTree as xml | |
| 23 | |
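The fallback reproduces the standard `itertools.batched` recipe, so either import path behaves the same way:

```python
>>> list(batched("ABCDEFG", 3))
[('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]
```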
| 24 esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" | |
| 25 esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" | |
| 26 elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi" | |
| 27 | |
| 28 | |
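The script leans on three E-utilities endpoints: `esearch` resolves an accession to an Entrez UID, `elink` walks links between databases (bioproject to bioproject, bioproject to biosample, biosample to sra), and `esummary` fetches document summaries for batches of UIDs. A minimal sketch of the first step, mirroring the `esearch` call in `main()` below (PRJNA681235 is the project cited in the example BioSample record later in this file):

```python
# Sketch: resolve a BioProject accession to its Entrez UID, using the
# same parameters main() passes.
import requests

resp = requests.get(
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
    params=dict(db="bioproject", term="PRJNA681235", field="PRJA", format="json"),
)
resp.raise_for_status()
print(resp.json()["esearchresult"]["idlist"])  # a one-element UID list, e.g. ['681235']
```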
```python
import logging
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger("bio2srr")

extra_params = {}

api_key = os.environ.get("NCBI_API_KEY")

if api_key:
    logger.info(f"Using NCBI API key {api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}")
    extra_params["api_key"] = api_key

def log(msg):
    "Log at INFO, masking the API key wherever it appears in the message."
    if api_key:
        logger.info(msg.replace(api_key, f"{api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}"))  # fix logging later
    else:
        logger.info(msg)
```
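The masking keeps the first and last four characters and stars out the rest; a quick check with a hypothetical 36-character key:

```python
key = "0123456789abcdef0123456789abcdef0123"  # hypothetical key, not a real one
print(f"{key[:4]}{'*' * (len(key) - 8)}{key[-4:]}")  # '0123', then 28 stars, then '0123'
```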
```python
def get_tag(root, tag):
    "Return the text of the first element matching tag, or None if absent."
    val = root.find(tag)
    if val is not None:
        return val.text
    log(f"No result for {tag}")


def header_sort_override(a, b):
    "Comparator that pins key metadata columns first and sorts the rest alphabetically."
    if a == b:
        return 0
    # "sra_run_accession" matches the key emitted by flatten_runs below
    for name in ["bioproject", "sra_run_accession", "biosample_accession", "organism", "taxid", "package"]:
        if a == name:
            return -1
        if b == name:
            return 1
    if a < b:
        return -1
    else:
        return 1

hso = cmp_to_key(header_sort_override)
```
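For illustration, how the comparator orders a sample of column names:

```python
cols = ["strain", "taxid", "sra_run_accession", "bioproject", "organism"]
print(sorted(cols, key=hso))
# ['bioproject', 'sra_run_accession', 'organism', 'taxid', 'strain']
```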
```python
def resolve_bioproject_ids_and_links(bioproject_id_list):
    "Recursively follow bioproject and biosample links, yield biosample UIDs and biosample XML"
    for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
        log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
        # get bioproject-to-bioproject links
        response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
        response.raise_for_status()
        reply = response.json()
        linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [])
        if len(linksets) >= 3:
            for id in linksets[2].get("links", []):  # the third linksetdb holds the parent-to-child project links
                response = requests.get(esummary, params=dict(id=id, db="bioproject", format="json", **extra_params))
                response.raise_for_status()
                replyy = response.json()
                biop = replyy["result"][id]["project_acc"]
                if id not in (uid for _, uid in bioproject_id_list):  # compare against UIDs, not (accession, UID) tuples
                    bioproject_id_list.append((biop, id))  # appending extends the loop, recursing over sub-projects
        # get bioproject-to-biosample links
        response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
        response.raise_for_status()
        reply = response.json()
        links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
        log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
        for ids in batched(links, 200):
            response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json", **extra_params))
            response.raise_for_status()
            replyy = response.json()
            for field, value in replyy.get("result", {}).items():
                if "uids" not in field:
                    yield bioproject, field, value["sampledata"]  # this is XML, deliriously
            sleep(1 if not api_key else 0.1)  # NCBI allows 3 req/s without an API key, 10 req/s with one
```
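The chained `.get()` calls above dig through elink's JSON envelope. A trimmed illustration of the shape being traversed (field names follow the code's own access pattern; the linkname value is a placeholder):

```python
reply = {
    "linksets": [
        {
            "dbfrom": "bioproject",
            "linksetdbs": [
                {"dbto": "biosample", "linkname": "...", "links": ["17131268", "17131269"]},
            ],
        }
    ]
}
links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
print(links)  # ['17131268', '17131269']
```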
```python
biosample_example = """
<BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
  <Ids>
    <Id db="BioSample" is_primary="1">SAMN17131268</Id>
    <Id db_label="Sample name">CJP19-D996</Id>
  </Ids>
  <Description>
    <Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title>
    <Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni">
      <OrganismName>Campylobacter jejuni</OrganismName>
    </Organism>
  </Description>
  <Owner>
    <Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name>
  </Owner>
  <Models>
    <Model>Pathogen.env</Model>
  </Models>
  <Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package>
  <Attributes>
    <Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute>
    <Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute>
    <Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute>
    <Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute>
    <Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute>
    <Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute>
    <Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute>
    <Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute>
    <Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute>
  </Attributes>
  <Links>
    <Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link>
  </Links>
  <Status status="live" when="2020-12-21T15:08:05.693"/>
</BioSample>
"""

def flatten_biosample_xml(biosampxml):
    "Flatten a BioSample XML record into a dict of metadata fields."
    root = xml.fromstring(biosampxml)
    accession = get_tag(root, r'.//Id[@db="BioSample"]')
    # sample_name = get_tag(root, r'.//Id[@db_label="Sample name"]')
    organism = get_tag(root, r".//OrganismName")
    tax_id = root.find(r".//Organism").attrib.get("taxonomy_id")
    package = get_tag(root, r".//Package")
    sampledict = dict(
        biosample_accession=accession,
        # sample_name=sample_name,
        organism=organism,
        taxid=tax_id,
        package=package,
    )
    # prefer the harmonized attribute name; fall back to the submitter's raw name
    for attribute in root.findall("Attributes/Attribute"):
        sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text

    return sampledict
```
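Running the flattener over the example record above gives a flat dict, harmonized attribute names included:

```python
sample = flatten_biosample_xml(biosample_example)
print(sample["biosample_accession"], sample["organism"], sample["taxid"])
# SAMN17131268 Campylobacter jejuni 197
print(sample["strain"], sample["package"])
# CJP19-D996 Pathogen.env.1.0
```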
```python
def yield_sra_runs_from_sample(biosample):
    "Follow biosample-to-SRA links and yield (SRA UID, runs XML) pairs."
    sleep(1 if not api_key else 0.1)
    response = requests.get(elink, params=dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params))
    response.raise_for_status()
    reply = response.json()
    for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
        sleep(1 if not api_key else 0.1)
        response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params))
        response.raise_for_status()
        replyy = response.json()
        for field, value in replyy.get("result", {}).items():
            if "uids" not in field:
                yield field, value.get("runs")


runs_example = """
<Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
<Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
"""

def flatten_runs(runxml):
    "Yield accession, spot, and base counts for each public run in an SRA runs fragment."
    root = xml.fromstring(f"<data>{runxml}</data>")  # gotta fix their garbage embedded XML since it isn't singly-rooted
    for run in root.findall(".//Run"):
        if run.attrib["is_public"] == "false":
            logger.warning(f"Skipping non-public run {run.attrib['acc']}")
            continue  # without this, the "skipped" run was yielded anyway
        yield dict(
            sra_run_accession = run.attrib["acc"],
            total_spots = run.attrib["total_spots"],
            total_bases = run.attrib["total_bases"],
        )
```
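Applied to the example fragment above:

```python
print([r["sra_run_accession"] for r in flatten_runs(runs_example)])
# ['SRR13167188', 'SRR13167189']
```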
```python
def main(starting_bioproject):
    rows = []
    response = requests.get(esearch, params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json", **extra_params))
    response.raise_for_status()
    reply = response.json()
    try:
        bioproject_id = reply["esearchresult"]["idlist"][0]
        log(f"Found UID {bioproject_id} for '{starting_bioproject}'")
    except IndexError:
        warnings = reply.get("esearchresult", {}).get("warninglist", {}).get("outputmessages", [])
        logger.error(f"No results found for '{starting_bioproject}'. Error was \"{warnings}\"")
        sys.exit(1)
    sleep(1 if not api_key else 0.1)
    for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
        try:
            sampledict = flatten_biosample_xml(biosample_xml)
        except KeyError:
            log(biosample_xml)
            raise
        sampledict["bioproject"] = bioproject
        noruns = True
        for sra, runs in yield_sra_runs_from_sample(biosample):
            for run in flatten_runs(runs.strip()):
                noruns = False
                run.update(sampledict)
                rows.append(run)
        if noruns:
            rows.append(sampledict)  # keep the sample's metadata even if it has no public runs

    log(f"Writing {len(rows)} rows to metadata.tsv")

    header = set()
    for row in rows:
        for key in row.keys():
            header.add(key)

    header = sorted(header, key=hso)
    # logger.info(f"Header: {header}")

    rows.sort(key=lambda x: x["biosample_accession"])

    with open("metadata.tsv", "w", newline="") as f:  # newline="" per the csv module docs
        writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
        writer.writeheader()
        writer.writerows(rows)

    # check for duplicate runs and unreleased samples

    accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")]

    raw_length = len(accessions)

    accessions = sorted(set(accessions))

    if raw_length < len(rows):
        logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt")

    if len(accessions) < raw_length:
        logger.warning("Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not")

    log(f"Writing {len(accessions)} unique accessions to accessions.txt")

    with open("accessions.txt", "w") as f:
        for accession in accessions:
            f.write(accession + "\n")


if __name__ == "__main__":
    b = sys.argv[1].strip()
    log(f"Starting with {b}")
    try:
        main(b)
    except requests.HTTPError as e:
        logger.error(e)
        sys.exit(1)
```
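Usage is a single positional argument, the BioProject accession, e.g. `python bio2srr.py PRJNA681235` (an illustrative invocation, reusing the project from the sample record above). The script writes `metadata.tsv` (one row per SRA run, or per biosample when no runs are public) and `accessions.txt` (deduplicated run accessions, one per line) to the working directory, and exits nonzero on HTTP errors. Setting `NCBI_API_KEY` raises the permitted request rate.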
