diff bio2srr.py @ 11:7fd0ef5842e7
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
| author   | jpayne                          |
|----------|---------------------------------|
| date     | Mon, 06 May 2024 01:42:27 -0400 |
| parents  | ccec96a537b7                    |
| children | fc77995bc4da                    |
```diff
--- a/bio2srr.py	Mon May 06 00:12:39 2024 -0400
+++ b/bio2srr.py	Mon May 06 01:42:27 2024 -0400
@@ -3,6 +3,7 @@
 import requests
 import sys
 import csv
+import os
 
 try:
     from itertools import batched
@@ -30,8 +31,19 @@
 
 logger = logging.getLogger("bio2srr")
 
+extra_params = {}
+
+api_key = os.environ.get("NCBI_API_KEY")
+
+if api_key:
+    logger.info(f"Using NCBI API key {api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}")
+    extra_params["api_key"] = api_key
+
 def log(msg):
-    logger.info(msg) # fix logging later
+    if api_key:
+        logger.info(msg.replace(api_key, f"{api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}")) # fix logging later
+    else:
+        logger.info(msg)
 
 def get_tag(root, tag):
     val = root.find(tag)
@@ -64,7 +76,7 @@
     for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
         log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
         #get bioproject to bioproject links
-        response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json"))
+        response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
         response.raise_for_status()
         reply = response.json()
         linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}])
@@ -77,7 +89,7 @@
             if id not in bioproject_id_list:
                 bioproject_id_list.append((biop, id)) # recurse over bioproject links
         # get bioproject to biosample links
-        response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json"))
+        response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
         response.raise_for_status()
         reply = response.json()
         links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
@@ -89,7 +101,7 @@
             for field, value in replyy.get("result", {}).items():
                 if "uids" not in field:
                     yield bioproject, field, value["sampledata"] # this is XML, deleriously
-        sleep(1)
+        sleep(1 if not api_key else 0.1)
 
 
 biosample_example = """
@@ -151,13 +163,13 @@
 
 
 def yield_sra_runs_from_sample(biosampleids):
-    sleep(0.1)
-    response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json"))
+    sleep(1 if not api_key else 0.1)
+    response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json", **extra_params))
     response.raise_for_status()
     reply = response.json()
     for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
-        sleep(0.3)
-        response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json"))
+        sleep(1 if not api_key else 0.1)
+        response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params))
         response.raise_for_status()
         replyy = response.json()
         for field, value in replyy.get("result", {}).items():
@@ -192,6 +204,7 @@
     except IndexError:
         logger.error(f"No results found for '{starting_bioproject}'. Error was \"{reply['esearchresult']['warninglist']['outputmessages']}\"")
         sys.exit(1)
+    sleep(1 if not api_key else 0.1)
     for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
         try:
             sampledict = flatten_biosample_xml(biosample_xml)
```
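The core pattern this changeset introduces: read an optional `NCBI_API_KEY` from the environment, redact it in log output, merge it into every E-utilities request, and shorten the inter-request sleep when a key is present (NCBI permits a higher request rate with an API key than without one). The standalone sketch below illustrates that pattern under stated assumptions; the `ELINK` URL, the `mask()` helper, and the example BioProject UID are illustrative and not part of the actual script.

```python
"""Minimal sketch (not part of the changeset) of the API-key handling the diff adds."""
import logging
import os
from time import sleep

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("bio2srr")

# E-utilities elink endpoint (assumed here; the real script defines its own URLs).
ELINK = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"

# Optional NCBI API key; with a key E-utilities allow a higher request rate,
# which is why the diff also shortens its sleep calls from 1s to 0.1s.
extra_params = {}
api_key = os.environ.get("NCBI_API_KEY")
if api_key:
    extra_params["api_key"] = api_key


def mask(secret: str) -> str:
    # Hypothetical helper mirroring the masking expression used in the diff:
    # keep the first and last four characters, star out the middle.
    return f"{secret[:4]}{'*' * (len(secret) - 8)}{secret[-4:]}"


def log(msg: str) -> None:
    # Redact the key anywhere it appears in a log message before emitting it.
    logger.info(msg.replace(api_key, mask(api_key)) if api_key else msg)


def elink(**params) -> dict:
    # Merge the optional api_key into the request, then pace requests
    # according to whether a key is available.
    response = requests.get(ELINK, params=dict(format="json", **params, **extra_params))
    response.raise_for_status()
    sleep(0.1 if api_key else 1)
    return response.json()


if __name__ == "__main__":
    log(f"Using NCBI API key {api_key}" if api_key else "No NCBI API key set")
    # Hypothetical BioProject UID, purely for illustration.
    reply = elink(db="biosample", dbfrom="bioproject", id="1234")
    print(reply.get("linksets", [{}])[0].get("linksetdbs", []))
```

Keeping the raw key out of log lines while still tagging every request with it matches what the diff does: the masking happens once inside `log()`, so callers never have to remember to redact it themselves.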