bioproject_to_srr_2: bio2srr.py annotate

annotate bio2srr.py @ 14:18e1cb6018fd

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538

author	jpayne
date	Mon, 20 May 2024 02:25:23 -0400
parents	fc77995bc4da
children	0a3943480712

rev	line source
jpayne@4	1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils"
jpayne@4	2
jpayne@4	3 import requests
jpayne@4	4 import sys
jpayne@4	5 import csv
jpayne@11	6 import os
jpayne@4	7
jpayne@8	8 try:
jpayne@8	9 from itertools import batched
jpayne@8	10 except ImportError:
jpayne@9	11 from itertools import islice
jpayne@8	12 def batched(iterable, n):
jpayne@8	13 "Batch data into tuples of length n. The last batch may be shorter."
jpayne@8	14 # batched('ABCDEFG', 3) --> ABC DEF G
jpayne@8	15 if n < 1:
jpayne@8	16 raise ValueError('n must be at least one')
jpayne@8	17 it = iter(iterable)
jpayne@8	18 while batch := tuple(islice(it, n)):
jpayne@8	19 yield batch
jpayne@4	20 from functools import cmp_to_key
jpayne@4	21 from time import sleep
jpayne@4	22 from xml.etree import ElementTree as xml
jpayne@4	23
jpayne@4	24 esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
jpayne@4	25 esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
jpayne@4	26 elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
jpayne@4	27
jpayne@4	28
jpayne@4	29 import logging
jpayne@4	30 logging.basicConfig(level=logging.INFO)
jpayne@4	31
jpayne@4	32 logger = logging.getLogger("bio2srr")
jpayne@4	33
jpayne@11	34 extra_params = {}
jpayne@11	35
jpayne@11	36 api_key = os.environ.get("NCBI_API_KEY")
jpayne@11	37
jpayne@11	38 if api_key:
jpayne@11	39 logger.info(f"Using NCBI API key {api_key[:4]}{'' (len(api_key) - 8)}{api_key[-4:]}")
jpayne@11	40 extra_params["api_key"] = api_key
jpayne@11	41
jpayne@4	42 def log(msg):
jpayne@11	43 if api_key:
jpayne@11	44 logger.info(msg.replace(api_key, f"{api_key[:4]}{'' (len(api_key) - 8)}{api_key[-4:]}")) # fix logging later
jpayne@11	45 else:
jpayne@11	46 logger.info(msg)
jpayne@4	47
jpayne@4	48 def get_tag(root, tag):
jpayne@4	49 val = root.find(tag)
jpayne@4	50 if val is not None:
jpayne@4	51 return val.text
jpayne@4	52 log(f"No result for {tag}")
jpayne@4	53
jpayne@4	54
jpayne@4	55
jpayne@4	56 def header_sort_override(a, b):
jpayne@4	57 if a == b:
jpayne@4	58 return 0
jpayne@4	59 try:
jpayne@4	60 for name in ["bioproject", "srr_accession", "biosample_accession", "organism", "taxid", "package",]:
jpayne@4	61 if a == name:
jpayne@4	62 return -1
jpayne@4	63 if b == name:
jpayne@4	64 return 1
jpayne@4	65 except:
jpayne@4	66 pass
jpayne@4	67 if a < b:
jpayne@4	68 return -1
jpayne@4	69 else:
jpayne@4	70 return 1
jpayne@4	71
jpayne@4	72 hso = cmp_to_key(header_sort_override)
jpayne@4	73
jpayne@4	74 def resolve_bioproject_ids_and_links(bioproject_id_list):
jpayne@4	75 "Recursively follow bioproject and biosample links, yield biosample UID's and biosample XML"
jpayne@4	76 for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
jpayne@4	77 log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
jpayne@4	78 #get bioproject to bioproject links
jpayne@11	79 response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
jpayne@4	80 response.raise_for_status()
jpayne@4	81 reply = response.json()
jpayne@4	82 linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}])
jpayne@4	83 if len(linksets) >= 3:
jpayne@4	84 for id in linksets[2].get("links", []): #third index is the up to down links
jpayne@4	85 response = requests.get(esummary, params=dict(id=id, db="bioproject", format="json"))
jpayne@4	86 response.raise_for_status()
jpayne@4	87 replyy = response.json()
jpayne@4	88 biop = replyy["result"][id]["project_acc"]
jpayne@4	89 if id not in bioproject_id_list:
jpayne@4	90 bioproject_id_list.append((biop, id)) # recurse over bioproject links
jpayne@4	91 # get bioproject to biosample links
jpayne@11	92 response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
jpayne@4	93 response.raise_for_status()
jpayne@4	94 reply = response.json()
jpayne@4	95 links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
jpayne@4	96 log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
jpayne@4	97 for ids in batched(links, 200):
jpayne@4	98 response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json"))
jpayne@4	99 response.raise_for_status()
jpayne@4	100 replyy = response.json()
jpayne@4	101 for field, value in replyy.get("result", {}).items():
jpayne@4	102 if "uids" not in field:
jpayne@4	103 yield bioproject, field, value["sampledata"] # this is XML, deleriously
jpayne@11	104 sleep(1 if not api_key else 0.1)
jpayne@4	105
jpayne@4	106
jpayne@4	107 biosample_example = """
jpayne@4	108 <BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
jpayne@4	109 <Ids>
jpayne@4	110 <Id db="BioSample" is_primary="1">SAMN17131268</Id>
jpayne@4	111 <Id db_label="Sample name">CJP19-D996</Id>
jpayne@4	112 </Ids>
jpayne@4	113 <Description>
jpayne@4	114 <Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title>
jpayne@4	115 <Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni">
jpayne@4	116 <OrganismName>Campylobacter jejuni</OrganismName>
jpayne@4	117 </Organism>
jpayne@4	118 </Description>
jpayne@4	119 <Owner>
jpayne@4	120 <Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name>
jpayne@4	121 </Owner>
jpayne@4	122 <Models>
jpayne@4	123 <Model>Pathogen.env</Model>
jpayne@4	124 </Models>
jpayne@4	125 <Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package>
jpayne@4	126 <Attributes>
jpayne@4	127 <Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute>
jpayne@4	128 <Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute>
jpayne@4	129 <Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute>
jpayne@4	130 <Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute>
jpayne@4	131 <Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute>
jpayne@4	132 <Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute>
jpayne@4	133 <Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute>
jpayne@4	134 <Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute>
jpayne@4	135 <Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute>
jpayne@4	136 </Attributes>
jpayne@4	137 <Links>
jpayne@4	138 <Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link>
jpayne@4	139 </Links>
jpayne@4	140 <Status status="live" when="2020-12-21T15:08:05.693"/>
jpayne@4	141 </BioSample>
jpayne@4	142
jpayne@4	143 """
jpayne@4	144
jpayne@4	145 def flatten_biosample_xml(biosampxml):
jpayne@4	146 root = xml.fromstring(biosampxml)
jpayne@4	147 accession = get_tag(root, r'.//Id[@db="BioSample"]')
jpayne@4	148 # sample_name = get_tag(root, r'.//Id[@db_label="Sample name"]')
jpayne@4	149 organism = get_tag(root, r".//OrganismName")
jpayne@4	150 tax_id = root.find(r".//Organism").attrib.get("taxonomy_id")
jpayne@4	151 package = get_tag(root, r".//Package")
jpayne@4	152 sampledict = dict(
jpayne@4	153 biosample_accession=accession,
jpayne@4	154 # sample_name=sample_name,
jpayne@4	155 organism = organism,
jpayne@4	156 taxid = tax_id,
jpayne@4	157 package = package
jpayne@4	158 )
jpayne@4	159 for attribute in root.findall("Attributes/Attribute"):
jpayne@4	160 sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text
jpayne@4	161
jpayne@4	162 return sampledict
jpayne@4	163
jpayne@4	164
jpayne@12	165 def yield_sra_runs_from_sample(biosample):
jpayne@11	166 sleep(1 if not api_key else 0.1)
jpayne@12	167 response = requests.get(elink, params=dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params))
jpayne@4	168 response.raise_for_status()
jpayne@4	169 reply = response.json()
jpayne@4	170 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
jpayne@11	171 sleep(1 if not api_key else 0.1)
jpayne@11	172 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params))
jpayne@4	173 response.raise_for_status()
jpayne@4	174 replyy = response.json()
jpayne@4	175 for field, value in replyy.get("result", {}).items():
jpayne@4	176 if "uids" not in field:
jpayne@4	177 yield field, value.get("runs")
jpayne@4	178
jpayne@4	179
jpayne@4	180 runs_example = """
jpayne@4	181 <Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
jpayne@4	182 <Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
jpayne@4	183 """
jpayne@4	184
jpayne@4	185 def flatten_runs(runxml):
jpayne@4	186 root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted
jpayne@4	187 for run in root.findall(".//Run"):
jpayne@12	188 if run.attrib["is_public"] == "false":
jpayne@12	189 logger.warning(f"Skipping non-public run {run.attrib['acc']}")
jpayne@4	190 yield dict(
jpayne@4	191 sra_run_accession = run.attrib["acc"],
jpayne@4	192 total_spots = run.attrib["total_spots"],
jpayne@4	193 total_bases = run.attrib["total_bases"],
jpayne@4	194 )
jpayne@4	195
jpayne@4	196
jpayne@4	197
jpayne@4	198 def main(starting_bioproject):
jpayne@4	199 rows = []
jpayne@4	200 response = requests.get(esearch, params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json"))
jpayne@4	201 response.raise_for_status()
jpayne@4	202 reply = response.json()
jpayne@4	203 try:
jpayne@4	204 bioproject_id = reply["esearchresult"]["idlist"][0]
jpayne@4	205 log(f"Found UID {bioproject_id} for '{starting_bioproject}'")
jpayne@4	206 except IndexError:
jpayne@4	207 logger.error(f"No results found for '{starting_bioproject}'. Error was \"{reply['esearchresult']['warninglist']['outputmessages']}\"")
jpayne@4	208 sys.exit(1)
jpayne@11	209 sleep(1 if not api_key else 0.1)
jpayne@4	210 for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
jpayne@4	211 try:
jpayne@4	212 sampledict = flatten_biosample_xml(biosample_xml)
jpayne@4	213 except KeyError:
jpayne@4	214 log(biosample_xml)
jpayne@4	215 raise
jpayne@4	216 sampledict["bioproject"] = bioproject
jpayne@12	217 noruns = True
jpayne@4	218 for sra, runs in yield_sra_runs_from_sample(biosample):
jpayne@4	219 for run in flatten_runs(runs.strip()):
jpayne@12	220 noruns = False
jpayne@4	221 run.update(sampledict)
jpayne@4	222 rows.append(run)
jpayne@12	223 if noruns:
jpayne@12	224 rows.append(sampledict)
jpayne@4	225
jpayne@4	226 log(f"Writing {len(rows)} rows to metadata.tsv")
jpayne@4	227
jpayne@4	228 header = set()
jpayne@4	229 for row in rows:
jpayne@4	230 for key in row.keys():
jpayne@4	231 header.add(key)
jpayne@4	232
jpayne@4	233 header = sorted(list(header), key=hso)
jpayne@12	234 # logger.info(f"Header: {header}")
jpayne@4	235
jpayne@4	236 rows.sort(key=lambda x: x["biosample_accession"])
jpayne@4	237
jpayne@4	238 with open("metadata.tsv", "w") as f:
jpayne@4	239 writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
jpayne@4	240 writer.writeheader()
jpayne@4	241 writer.writerows(rows)
jpayne@4	242
jpayne@12	243 # check for duplicate runs and unreleased samples
jpayne@12	244
jpayne@12	245 accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")]
jpayne@12	246
jpayne@12	247 raw_length = len(accessions)
jpayne@12	248
jpayne@12	249 accessions = sorted(list(set(accessions)))
jpayne@12	250
jpayne@12	251 if raw_length < len(rows):
jpayne@12	252 logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt")
jpayne@12	253
jpayne@12	254 if len(accessions) < raw_length:
jpayne@12	255 logger.warning(f"Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not")
jpayne@12	256
jpayne@12	257 log(f"Writing {len(accessions)} unique accessions to accessions.txt")
jpayne@4	258
jpayne@4	259 with open("accessions.txt", "w") as f:
jpayne@12	260 f.writelines(accessions)
jpayne@4	261
jpayne@4	262
jpayne@4	263 if __name__ == "__main__":
jpayne@4	264 b = sys.argv[1].strip()
jpayne@4	265 log(f"Starting with {b}")
jpayne@4	266 try:
jpayne@4	267 main(b)
jpayne@4	268 except requests.HTTPError as e:
jpayne@4	269 logger.error(e)
jpayne@4	270 sys.exit(1)
jpayne@4	271
jpayne@4	272
jpayne@4	273
jpayne@4	274
jpayne@4	275

Mercurial > repos > jpayne > bioproject_to_srr_2

annotate bio2srr.py @ 14:18e1cb6018fd