comparison bio2srr.py @ 11:7fd0ef5842e7

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Mon, 06 May 2024 01:42:27 -0400
parents ccec96a537b7
children fc77995bc4da
comparison
equal deleted inserted replaced
10:ccec96a537b7 11:7fd0ef5842e7
1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils" 1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils"
2 2
3 import requests 3 import requests
4 import sys 4 import sys
5 import csv 5 import csv
6 import os
6 7
7 try: 8 try:
8 from itertools import batched 9 from itertools import batched
9 except ImportError: 10 except ImportError:
10 from itertools import islice 11 from itertools import islice
28 import logging 29 import logging
29 logging.basicConfig(level=logging.INFO) 30 logging.basicConfig(level=logging.INFO)
30 31
31 logger = logging.getLogger("bio2srr") 32 logger = logging.getLogger("bio2srr")
32 33
# Extra query parameters merged into every E-utilities request. Stays empty
# unless an API key is configured below.
extra_params = {}

# Optional NCBI API key, read from the environment.
api_key = os.environ.get("NCBI_API_KEY")


def _mask_api_key(key):
    """Return a redacted form of *key* that is safe to write to the log.

    Keys longer than eight characters keep their first and last four
    characters, with everything in between replaced by asterisks. Keys of
    eight characters or fewer are starred out completely: showing four
    characters from each end of such a key would reveal all of it.
    """
    if len(key) <= 8:
        return "*" * len(key)
    return f"{key[:4]}{'*' * (len(key) - 8)}{key[-4:]}"


if api_key:
    logger.info(f"Using NCBI API key {_mask_api_key(api_key)}")
    extra_params["api_key"] = api_key


def log(msg):
    """Log *msg* at INFO level, redacting the API key wherever it appears.

    The message is converted with ``str`` first, so non-string arguments are
    accepted whether or not an API key is configured.
    """
    text = str(msg)
    if api_key:
        # Request URLs and echoed parameters can contain the raw key.
        text = text.replace(api_key, _mask_api_key(api_key))
    logger.info(text)  # fix logging later
35 47
36 def get_tag(root, tag): 48 def get_tag(root, tag):
37 val = root.find(tag) 49 val = root.find(tag)
38 if val is not None: 50 if val is not None:
39 return val.text 51 return val.text
62 def resolve_bioproject_ids_and_links(bioproject_id_list): 74 def resolve_bioproject_ids_and_links(bioproject_id_list):
63 "Recursively follow bioproject and biosample links, yield biosample UID's and biosample XML" 75 "Recursively follow bioproject and biosample links, yield biosample UID's and biosample XML"
64 for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list): 76 for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
65 log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}") 77 log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
66 #get bioproject to bioproject links 78 #get bioproject to bioproject links
67 response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json")) 79 response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
68 response.raise_for_status() 80 response.raise_for_status()
69 reply = response.json() 81 reply = response.json()
70 linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}]) 82 linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}])
71 if len(linksets) >= 3: 83 if len(linksets) >= 3:
72 for id in linksets[2].get("links", []): #third index is the up to down links 84 for id in linksets[2].get("links", []): #third index is the up to down links
75 replyy = response.json() 87 replyy = response.json()
76 biop = replyy["result"][id]["project_acc"] 88 biop = replyy["result"][id]["project_acc"]
77 if id not in bioproject_id_list: 89 if id not in bioproject_id_list:
78 bioproject_id_list.append((biop, id)) # recurse over bioproject links 90 bioproject_id_list.append((biop, id)) # recurse over bioproject links
79 # get bioproject to biosample links 91 # get bioproject to biosample links
80 response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json")) 92 response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
81 response.raise_for_status() 93 response.raise_for_status()
82 reply = response.json() 94 reply = response.json()
83 links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []) 95 links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
84 log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})") 96 log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
85 for ids in batched(links, 200): 97 for ids in batched(links, 200):
87 response.raise_for_status() 99 response.raise_for_status()
88 replyy = response.json() 100 replyy = response.json()
89 for field, value in replyy.get("result", {}).items(): 101 for field, value in replyy.get("result", {}).items():
90 if "uids" not in field: 102 if "uids" not in field:
91 yield bioproject, field, value["sampledata"] # this is XML, deleriously 103 yield bioproject, field, value["sampledata"] # this is XML, deleriously
92 sleep(1) 104 sleep(1 if not api_key else 0.1)
93 105
94 106
95 biosample_example = """ 107 biosample_example = """
96 <BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268"> 108 <BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
97 <Ids> 109 <Ids>
149 161
150 return sampledict 162 return sampledict
151 163
152 164
153 def yield_sra_runs_from_sample(biosampleids): 165 def yield_sra_runs_from_sample(biosampleids):
154 sleep(0.1) 166 sleep(1 if not api_key else 0.1)
155 response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json")) 167 response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json", **extra_params))
156 response.raise_for_status() 168 response.raise_for_status()
157 reply = response.json() 169 reply = response.json()
158 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200): 170 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
159 sleep(0.3) 171 sleep(1 if not api_key else 0.1)
160 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json")) 172 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params))
161 response.raise_for_status() 173 response.raise_for_status()
162 replyy = response.json() 174 replyy = response.json()
163 for field, value in replyy.get("result", {}).items(): 175 for field, value in replyy.get("result", {}).items():
164 if "uids" not in field: 176 if "uids" not in field:
165 yield field, value.get("runs") 177 yield field, value.get("runs")
190 bioproject_id = reply["esearchresult"]["idlist"][0] 202 bioproject_id = reply["esearchresult"]["idlist"][0]
191 log(f"Found UID {bioproject_id} for '{starting_bioproject}'") 203 log(f"Found UID {bioproject_id} for '{starting_bioproject}'")
192 except IndexError: 204 except IndexError:
193 logger.error(f"No results found for '{starting_bioproject}'. Error was \"{reply['esearchresult']['warninglist']['outputmessages']}\"") 205 logger.error(f"No results found for '{starting_bioproject}'. Error was \"{reply['esearchresult']['warninglist']['outputmessages']}\"")
194 sys.exit(1) 206 sys.exit(1)
207 sleep(1 if not api_key else 0.1)
195 for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]): 208 for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
196 try: 209 try:
197 sampledict = flatten_biosample_xml(biosample_xml) 210 sampledict = flatten_biosample_xml(biosample_xml)
198 except KeyError: 211 except KeyError:
199 log(biosample_xml) 212 log(biosample_xml)