Mercurial > repos > jpayne > bioproject_to_srr_2
comparison bio2srr.py @ 11:7fd0ef5842e7
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author | jpayne |
---|---|
date | Mon, 06 May 2024 01:42:27 -0400 |
parents | ccec96a537b7 |
children | fc77995bc4da |
comparison
equal
deleted
inserted
replaced
10:ccec96a537b7 | 11:7fd0ef5842e7 |
---|---|
1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils" | 1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils" |
2 | 2 |
3 import requests | 3 import requests |
4 import sys | 4 import sys |
5 import csv | 5 import csv |
| | 6 import os |
6 | 7 |
7 try: | 8 try: |
8 from itertools import batched | 9 from itertools import batched |
9 except ImportError: | 10 except ImportError: |
10 from itertools import islice | 11 from itertools import islice |
28 import logging | 29 import logging |
29 logging.basicConfig(level=logging.INFO) | 30 logging.basicConfig(level=logging.INFO) |
30 | 31 |
31 logger = logging.getLogger("bio2srr") | 32 logger = logging.getLogger("bio2srr") |
32 | 33 |
| | 34 extra_params = {} |
| | 35 |
| | 36 api_key = os.environ.get("NCBI_API_KEY") |
| | 37 |
| | 38 if api_key: |
| | 39 logger.info(f"Using NCBI API key {api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}") |
| | 40 extra_params["api_key"] = api_key |
| | 41 |
33 def log(msg): | 42 def log(msg): |
34 logger.info(msg) # fix logging later | 43 if api_key: |
| | 44 logger.info(msg.replace(api_key, f"{api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}")) # fix logging later |
| | 45 else: |
| | 46 logger.info(msg) |
35 | 47 |
36 def get_tag(root, tag): | 48 def get_tag(root, tag): |
37 val = root.find(tag) | 49 val = root.find(tag) |
38 if val is not None: | 50 if val is not None: |
39 return val.text | 51 return val.text |
62 def resolve_bioproject_ids_and_links(bioproject_id_list): | 74 def resolve_bioproject_ids_and_links(bioproject_id_list): |
63 "Recursively follow bioproject and biosample links, yield biosample UID's and biosample XML" | 75 "Recursively follow bioproject and biosample links, yield biosample UID's and biosample XML" |
64 for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list): | 76 for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list): |
65 log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}") | 77 log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}") |
66 #get bioproject to bioproject links | 78 #get bioproject to bioproject links |
67 response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json")) | 79 response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params)) |
68 response.raise_for_status() | 80 response.raise_for_status() |
69 reply = response.json() | 81 reply = response.json() |
70 linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}]) | 82 linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}]) |
71 if len(linksets) >= 3: | 83 if len(linksets) >= 3: |
72 for id in linksets[2].get("links", []): #third index is the up to down links | 84 for id in linksets[2].get("links", []): #third index is the up to down links |
75 replyy = response.json() | 87 replyy = response.json() |
76 biop = replyy["result"][id]["project_acc"] | 88 biop = replyy["result"][id]["project_acc"] |
77 if id not in bioproject_id_list: | 89 if id not in bioproject_id_list: |
78 bioproject_id_list.append((biop, id)) # recurse over bioproject links | 90 bioproject_id_list.append((biop, id)) # recurse over bioproject links |
79 # get bioproject to biosample links | 91 # get bioproject to biosample links |
80 response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json")) | 92 response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params)) |
81 response.raise_for_status() | 93 response.raise_for_status() |
82 reply = response.json() | 94 reply = response.json() |
83 links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []) | 95 links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []) |
84 log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})") | 96 log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})") |
85 for ids in batched(links, 200): | 97 for ids in batched(links, 200): |
87 response.raise_for_status() | 99 response.raise_for_status() |
88 replyy = response.json() | 100 replyy = response.json() |
89 for field, value in replyy.get("result", {}).items(): | 101 for field, value in replyy.get("result", {}).items(): |
90 if "uids" not in field: | 102 if "uids" not in field: |
91 yield bioproject, field, value["sampledata"] # this is XML, deleriously | 103 yield bioproject, field, value["sampledata"] # this is XML, deleriously |
92 sleep(1) | 104 sleep(1 if not api_key else 0.1) |
93 | 105 |
94 | 106 |
95 biosample_example = """ | 107 biosample_example = """ |
96 <BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268"> | 108 <BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268"> |
97 <Ids> | 109 <Ids> |
149 | 161 |
150 return sampledict | 162 return sampledict |
151 | 163 |
152 | 164 |
153 def yield_sra_runs_from_sample(biosampleids): | 165 def yield_sra_runs_from_sample(biosampleids): |
154 sleep(0.1) | 166 sleep(1 if not api_key else 0.1) |
155 response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json")) | 167 response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json", **extra_params)) |
156 response.raise_for_status() | 168 response.raise_for_status() |
157 reply = response.json() | 169 reply = response.json() |
158 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200): | 170 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200): |
159 sleep(0.3) | 171 sleep(1 if not api_key else 0.1) |
160 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json")) | 172 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params)) |
161 response.raise_for_status() | 173 response.raise_for_status() |
162 replyy = response.json() | 174 replyy = response.json() |
163 for field, value in replyy.get("result", {}).items(): | 175 for field, value in replyy.get("result", {}).items(): |
164 if "uids" not in field: | 176 if "uids" not in field: |
165 yield field, value.get("runs") | 177 yield field, value.get("runs") |
190 bioproject_id = reply["esearchresult"]["idlist"][0] | 202 bioproject_id = reply["esearchresult"]["idlist"][0] |
191 log(f"Found UID {bioproject_id} for '{starting_bioproject}'") | 203 log(f"Found UID {bioproject_id} for '{starting_bioproject}'") |
192 except IndexError: | 204 except IndexError: |
193 logger.error(f"No results found for '{starting_bioproject}'. Error was \"{reply['esearchresult']['warninglist']['outputmessages']}\"") | 205 logger.error(f"No results found for '{starting_bioproject}'. Error was \"{reply['esearchresult']['warninglist']['outputmessages']}\"") |
194 sys.exit(1) | 206 sys.exit(1) |
| | 207 sleep(1 if not api_key else 0.1) |
195 for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]): | 208 for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]): |
196 try: | 209 try: |
197 sampledict = flatten_biosample_xml(biosample_xml) | 210 sampledict = flatten_biosample_xml(biosample_xml) |
198 except KeyError: | 211 except KeyError: |
199 log(biosample_xml) | 212 log(biosample_xml) |