bioproject_to_srr_2: bio2srr.py comparison

comparison bio2srr.py @ 12:fc77995bc4da

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538

author	jpayne
date	Wed, 08 May 2024 00:32:13 -0400
parents	7fd0ef5842e7
children	0a3943480712

comparison

equal deleted inserted replaced

-:7fd0ef5842e7
+:fc77995bc4da
 sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text
 return sampledict
-def yield_sra_runs_from_sample(biosampleids):
+def yield_sra_runs_from_sample(biosample):
 sleep(1 if not api_key else 0.1)
-response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json", **extra_params))
+response = requests.get(elink, params=dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params))
 response.raise_for_status()
 reply = response.json()
 for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
 sleep(1 if not api_key else 0.1)
 response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params))
 """
 def flatten_runs(runxml):
 root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted
 for run in root.findall(".//Run"):
+if run.attrib["is_public"] == "false":
+logger.warning(f"Skipping non-public run {run.attrib['acc']}")
 yield dict(
 sra_run_accession = run.attrib["acc"],
 total_spots = run.attrib["total_spots"],
 total_bases = run.attrib["total_bases"],
 )
 sampledict = flatten_biosample_xml(biosample_xml)
 except KeyError:
 log(biosample_xml)
 raise
 sampledict["bioproject"] = bioproject
+noruns = True
 for sra, runs in yield_sra_runs_from_sample(biosample):
 for run in flatten_runs(runs.strip()):
+noruns = False
 run.update(sampledict)
 rows.append(run)
+if noruns:
+rows.append(sampledict)
 log(f"Writing {len(rows)} rows to metadata.tsv")
 header = set()
 for row in rows:
 for key in row.keys():
 header.add(key)
 header = sorted(list(header), key=hso)
-logger.info(f"Header: {header}")
+# logger.info(f"Header: {header}")
 rows.sort(key=lambda x: x["biosample_accession"])
 with open("metadata.tsv", "w") as f:
 writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
 writer.writeheader()
 writer.writerows(rows)
-log(f"Writing {len(rows)} accessions to accessions.txt")
+# check for duplicate runs and unreleased samples
+accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")]
+raw_length = len(accessions)
+accessions = sorted(list(set(accessions)))
+if raw_length < len(rows):
+logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt")
+if len(accessions) < raw_length:
+logger.warning(f"Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not")
+log(f"Writing {len(accessions)} unique accessions to accessions.txt")
 with open("accessions.txt", "w") as f:
-for row in rows:
+f.writelines(accessions)
-f.write(row["sra_run_accession"] + "\n")
 if __name__ == "__main__":
 b = sys.argv[1].strip()
 log(f"Starting with {b}")

Mercurial > repos > jpayne > bioproject_to_srr_2

comparison bio2srr.py @ 12:fc77995bc4da