comparison bio2srr.py @ 12:fc77995bc4da

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Wed, 08 May 2024 00:32:13 -0400
parents 7fd0ef5842e7
children 0a3943480712
comparison
equal deleted inserted replaced
11:7fd0ef5842e7 12:fc77995bc4da
160 sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text 160 sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text
161 161
162 return sampledict 162 return sampledict
163 163
164 164
165 def yield_sra_runs_from_sample(biosampleids): 165 def yield_sra_runs_from_sample(biosample):
166 sleep(1 if not api_key else 0.1) 166 sleep(1 if not api_key else 0.1)
167 response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json", **extra_params)) 167 response = requests.get(elink, params=dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params))
168 response.raise_for_status() 168 response.raise_for_status()
169 reply = response.json() 169 reply = response.json()
170 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200): 170 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
171 sleep(1 if not api_key else 0.1) 171 sleep(1 if not api_key else 0.1)
172 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params)) 172 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params))
183 """ 183 """
184 184
185 def flatten_runs(runxml): 185 def flatten_runs(runxml):
186 root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted 186 root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted
187 for run in root.findall(".//Run"): 187 for run in root.findall(".//Run"):
188 if run.attrib["is_public"] == "false":
189 logger.warning(f"Skipping non-public run {run.attrib['acc']}")
188 yield dict( 190 yield dict(
189 sra_run_accession = run.attrib["acc"], 191 sra_run_accession = run.attrib["acc"],
190 total_spots = run.attrib["total_spots"], 192 total_spots = run.attrib["total_spots"],
191 total_bases = run.attrib["total_bases"], 193 total_bases = run.attrib["total_bases"],
192 ) 194 )
210 sampledict = flatten_biosample_xml(biosample_xml) 212 sampledict = flatten_biosample_xml(biosample_xml)
211 except KeyError: 213 except KeyError:
212 log(biosample_xml) 214 log(biosample_xml)
213 raise 215 raise
214 sampledict["bioproject"] = bioproject 216 sampledict["bioproject"] = bioproject
217 noruns = True
215 for sra, runs in yield_sra_runs_from_sample(biosample): 218 for sra, runs in yield_sra_runs_from_sample(biosample):
216 for run in flatten_runs(runs.strip()): 219 for run in flatten_runs(runs.strip()):
220 noruns = False
217 run.update(sampledict) 221 run.update(sampledict)
218 rows.append(run) 222 rows.append(run)
223 if noruns:
224 rows.append(sampledict)
219 225
220 log(f"Writing {len(rows)} rows to metadata.tsv") 226 log(f"Writing {len(rows)} rows to metadata.tsv")
221 227
222 header = set() 228 header = set()
223 for row in rows: 229 for row in rows:
224 for key in row.keys(): 230 for key in row.keys():
225 header.add(key) 231 header.add(key)
226 232
227 header = sorted(list(header), key=hso) 233 header = sorted(list(header), key=hso)
228 logger.info(f"Header: {header}") 234 # logger.info(f"Header: {header}")
229 235
230 rows.sort(key=lambda x: x["biosample_accession"]) 236 rows.sort(key=lambda x: x["biosample_accession"])
231 237
232 with open("metadata.tsv", "w") as f: 238 with open("metadata.tsv", "w") as f:
233 writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel") 239 writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
234 writer.writeheader() 240 writer.writeheader()
235 writer.writerows(rows) 241 writer.writerows(rows)
236 242
237 log(f"Writing {len(rows)} accessions to accessions.txt") 243 # check for duplicate runs and unreleased samples
244
245 accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")]
246
247 raw_length = len(accessions)
248
249 accessions = sorted(list(set(accessions)))
250
251 if raw_length < len(rows):
252 logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt")
253
254 if len(accessions) < raw_length:
255 logger.warning(f"Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not")
256
257 log(f"Writing {len(accessions)} unique accessions to accessions.txt")
238 258
239 with open("accessions.txt", "w") as f: 259 with open("accessions.txt", "w") as f:
240 for row in rows: 260 f.writelines(accessions)
241 f.write(row["sra_run_accession"] + "\n")
242 261
243 262
244 if __name__ == "__main__": 263 if __name__ == "__main__":
245 b = sys.argv[1].strip() 264 b = sys.argv[1].strip()
246 log(f"Starting with {b}") 265 log(f"Starting with {b}")