diff bio2srr.py @ 12:fc77995bc4da

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Wed, 08 May 2024 00:32:13 -0400
parents 7fd0ef5842e7
children 0a3943480712
line wrap: on
line diff
--- a/bio2srr.py	Mon May 06 01:42:27 2024 -0400
+++ b/bio2srr.py	Wed May 08 00:32:13 2024 -0400
@@ -162,9 +162,9 @@
     return sampledict
 
 
-def yield_sra_runs_from_sample(biosampleids):
+def yield_sra_runs_from_sample(biosample):
     sleep(1 if not api_key else 0.1)
-    response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json", **extra_params))
+    response = requests.get(elink, params=dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params))
     response.raise_for_status()
     reply = response.json()
     for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
@@ -185,6 +185,8 @@
 def flatten_runs(runxml):
     root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted
     for run in root.findall(".//Run"):
+        if run.attrib["is_public"] == "false":
+            logger.warning(f"Skipping non-public run {run.attrib['acc']}")
         yield dict(
             sra_run_accession = run.attrib["acc"],
             total_spots = run.attrib["total_spots"],
@@ -212,10 +214,14 @@
             log(biosample_xml)
             raise
         sampledict["bioproject"] = bioproject
+        noruns = True
         for sra, runs in yield_sra_runs_from_sample(biosample):
             for run in flatten_runs(runs.strip()):
+                noruns = False
                 run.update(sampledict)
                 rows.append(run)
+        if noruns:
+            rows.append(sampledict)
 
     log(f"Writing {len(rows)} rows to metadata.tsv")
 
@@ -225,7 +231,7 @@
             header.add(key)
 
     header = sorted(list(header), key=hso)
-    logger.info(f"Header: {header}")
+    # logger.info(f"Header: {header}")
 
     rows.sort(key=lambda x: x["biosample_accession"])
 
@@ -234,11 +240,24 @@
         writer.writeheader()
         writer.writerows(rows)
 
-    log(f"Writing {len(rows)} accessions to accessions.txt")
+    # check for duplicate runs and unreleased samples
+
+    accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")]
+
+    raw_length = len(accessions)
+
+    accessions = sorted(list(set(accessions)))
+
+    if raw_length < len(rows):
+        logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt")
+
+    if len(accessions) < raw_length:
+        logger.warning(f"Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not")
+
+    log(f"Writing {len(accessions)} unique accessions to accessions.txt")
 
     with open("accessions.txt", "w") as f:
-        for row in rows:
-            f.write(row["sra_run_accession"] + "\n")
+        f.writelines(accessions)
 
 
 if __name__ == "__main__":