comparison bio2srr.py @ 12:fc77995bc4da
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author: jpayne
date: Wed, 08 May 2024 00:32:13 -0400
parents: 7fd0ef5842e7
children: 0a3943480712
11:7fd0ef5842e7 (before) | 12:fc77995bc4da (after) |
160 sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text | 160 sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text |
161 | 161 |
162 return sampledict | 162 return sampledict |
163 | 163 |
164 | 164 |
165 def yield_sra_runs_from_sample(biosampleids): | 165 def yield_sra_runs_from_sample(biosample): |
166 sleep(1 if not api_key else 0.1) | 166 sleep(1 if not api_key else 0.1) |
167 response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json", **extra_params)) | 167 response = requests.get(elink, params=dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params)) |
168 response.raise_for_status() | 168 response.raise_for_status() |
169 reply = response.json() | 169 reply = response.json() |
170 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200): | 170 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200): |
171 sleep(1 if not api_key else 0.1) | 171 sleep(1 if not api_key else 0.1) |
172 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params)) | 172 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params)) |
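Both versions chunk the linked SRA IDs into groups of 200 via a `batched` helper before issuing ESummary calls. `itertools.batched` only exists from Python 3.12, so on older interpreters the script presumably carries its own backport; the actual helper is defined outside this comparison, but a minimal sketch would look like this:

```python
from itertools import islice

def batched(iterable, n):
    # Yield successive tuples of at most n items -- used above to keep
    # each ESummary request down to 200 SRA IDs.
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch
```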
183 """ | 183 """ |
184 | 184 |
185 def flatten_runs(runxml): | 185 def flatten_runs(runxml): |
186 root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted | 186 root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted |
187 for run in root.findall(".//Run"): | 187 for run in root.findall(".//Run"): |
| 188 if run.attrib["is_public"] == "false": |
| 189 logger.warning(f"Skipping non-public run {run.attrib['acc']}") |
188 yield dict( | 190 yield dict( |
189 sra_run_accession = run.attrib["acc"], | 191 sra_run_accession = run.attrib["acc"], |
190 total_spots = run.attrib["total_spots"], | 192 total_spots = run.attrib["total_spots"], |
191 total_bases = run.attrib["total_bases"], | 193 total_bases = run.attrib["total_bases"], |
192 ) | 194 ) |
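For context on the `<data>` wrapper: the `Runs` field ESummary hands back is a bare sequence of sibling `<Run>` elements with no single root, so it has to be re-rooted before `xml.fromstring` will parse it. A quick illustration with a made-up fragment (fake accessions and counts), assuming `flatten_runs` as patched above:

```python
runs_fragment = (
    '<Run acc="SRR0000001" total_spots="100000" '
    'total_bases="20000000" is_public="true"/>'
    '<Run acc="SRR0000002" total_spots="50000" '
    'total_bases="10000000" is_public="false"/>'
)

for row in flatten_runs(runs_fragment):
    print(row)
# {'sra_run_accession': 'SRR0000001', 'total_spots': '100000', 'total_bases': '20000000'}
# {'sra_run_accession': 'SRR0000002', 'total_spots': '50000', 'total_bases': '10000000'}
# The second row triggers the "Skipping non-public run SRR0000002" warning,
# but note the new check as written only warns -- the run is still yielded.
```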
210 sampledict = flatten_biosample_xml(biosample_xml) | 212 sampledict = flatten_biosample_xml(biosample_xml) |
211 except KeyError: | 213 except KeyError: |
212 log(biosample_xml) | 214 log(biosample_xml) |
213 raise | 215 raise |
214 sampledict["bioproject"] = bioproject | 216 sampledict["bioproject"] = bioproject |
| 217 noruns = True |
215 for sra, runs in yield_sra_runs_from_sample(biosample): | 218 for sra, runs in yield_sra_runs_from_sample(biosample): |
216 for run in flatten_runs(runs.strip()): | 219 for run in flatten_runs(runs.strip()): |
| 220 noruns = False |
217 run.update(sampledict) | 221 run.update(sampledict) |
218 rows.append(run) | 222 rows.append(run) |
| 223 if noruns: |
| 224 rows.append(sampledict) |
219 | 225 |
220 log(f"Writing {len(rows)} rows to metadata.tsv") | 226 log(f"Writing {len(rows)} rows to metadata.tsv") |
221 | 227 |
222 header = set() | 228 header = set() |
223 for row in rows: | 229 for row in rows: |
224 for key in row.keys(): | 230 for key in row.keys(): |
225 header.add(key) | 231 header.add(key) |
226 | 232 |
227 header = sorted(list(header), key=hso) | 233 header = sorted(list(header), key=hso) |
228 logger.info(f"Header: {header}") | 234 # logger.info(f"Header: {header}") |
229 | 235 |
230 rows.sort(key=lambda x: x["biosample_accession"]) | 236 rows.sort(key=lambda x: x["biosample_accession"]) |
231 | 237 |
232 with open("metadata.tsv", "w") as f: | 238 with open("metadata.tsv", "w") as f: |
233 writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel") | 239 writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel") |
234 writer.writeheader() | 240 writer.writeheader() |
235 writer.writerows(rows) | 241 writer.writerows(rows) |
236 | 242 |
237 log(f"Writing {len(rows)} accessions to accessions.txt") | 243 # check for duplicate runs and unreleased samples |
| 244 |
| 245 accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")] |
| 246 |
| 247 raw_length = len(accessions) |
| 248 |
| 249 accessions = sorted(list(set(accessions))) |
| 250 |
| 251 if raw_length < len(rows): |
| 252 logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt") |
| 253 |
| 254 if len(accessions) < raw_length: |
| 255 logger.warning(f"Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not") |
| 256 |
| 257 log(f"Writing {len(accessions)} unique accessions to accessions.txt") |
238 | 258 |
239 with open("accessions.txt", "w") as f: | 259 with open("accessions.txt", "w") as f: |
240 for row in rows: | 260 f.writelines(a + "\n" for a in accessions) |
241 f.write(row["sra_run_accession"] + "\n") | |
242 | 261 |
243 | 262 |
244 if __name__ == "__main__": | 263 if __name__ == "__main__": |
245 b = sys.argv[1].strip() | 264 b = sys.argv[1].strip() |
246 log(f"Starting with {b}") | 265 log(f"Starting with {b}") |
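The hunks above reference several names defined earlier in bio2srr.py that this comparison doesn't show: `elink`, `esummary`, `api_key`, and `extra_params`, plus the `log` and `hso` helpers. For orientation, a plausible reconstruction of the E-utilities plumbing, assuming the standard NCBI endpoints (the script's actual definitions may differ):

```python
import os

# Standard NCBI E-utilities endpoints (assumed; defined above the first hunk).
elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

# NCBI allows roughly 3 requests/second without an API key and 10/second
# with one, which is why the script sleeps 1s (no key) or 0.1s (key)
# between calls.
api_key = os.environ.get("NCBI_API_KEY")
extra_params = {"api_key": api_key} if api_key else {}
```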