Mercurial > repos > jpayne > bioproject_to_srr_2
diff bio2srr.py @ 12:fc77995bc4da
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author | jpayne |
---|---|
date | Wed, 08 May 2024 00:32:13 -0400 |
parents | 7fd0ef5842e7 |
children | 0a3943480712 |
line wrap: on
line diff
--- a/bio2srr.py Mon May 06 01:42:27 2024 -0400 +++ b/bio2srr.py Wed May 08 00:32:13 2024 -0400 @@ -162,9 +162,9 @@ return sampledict -def yield_sra_runs_from_sample(biosampleids): +def yield_sra_runs_from_sample(biosample): sleep(1 if not api_key else 0.1) - response = requests.get(elink, params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json", **extra_params)) + response = requests.get(elink, params=dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params)) response.raise_for_status() reply = response.json() for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200): @@ -185,6 +185,8 @@ def flatten_runs(runxml): root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted for run in root.findall(".//Run"): + if run.attrib["is_public"] == "false": + logger.warning(f"Skipping non-public run {run.attrib['acc']}") yield dict( sra_run_accession = run.attrib["acc"], total_spots = run.attrib["total_spots"], @@ -212,10 +214,14 @@ log(biosample_xml) raise sampledict["bioproject"] = bioproject + noruns = True for sra, runs in yield_sra_runs_from_sample(biosample): for run in flatten_runs(runs.strip()): + noruns = False run.update(sampledict) rows.append(run) + if noruns: + rows.append(sampledict) log(f"Writing {len(rows)} rows to metadata.tsv") @@ -225,7 +231,7 @@ header.add(key) header = sorted(list(header), key=hso) - logger.info(f"Header: {header}") + # logger.info(f"Header: {header}") rows.sort(key=lambda x: x["biosample_accession"]) @@ -234,11 +240,24 @@ writer.writeheader() writer.writerows(rows) - log(f"Writing {len(rows)} accessions to accessions.txt") + # check for duplicate runs and unreleased samples + + accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")] + + raw_length = len(accessions) + + accessions = sorted(list(set(accessions))) + + if raw_length < len(rows): + logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt") + + if len(accessions) < raw_length: + logger.warning(f"Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not") + + log(f"Writing {len(accessions)} unique accessions to accessions.txt") with open("accessions.txt", "w") as f: - for row in rows: - f.write(row["sra_run_accession"] + "\n") + f.writelines(accessions) if __name__ == "__main__":