annotate bio2srr.py @ 5:1a3038b6d1dd

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Fri, 03 May 2024 03:13:56 -0400
parents 2d4a2159c74b
children 832f269deeb0
rev   line source
jpayne@4 1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils"
jpayne@4 2
jpayne@4 3 import requests
jpayne@4 4 import sys
jpayne@4 5 import csv
jpayne@4 6
jpayne@4 7
jpayne@4 8 from itertools import batched
jpayne@4 9 from functools import cmp_to_key
jpayne@4 10 from time import sleep
jpayne@4 11 from xml.etree import ElementTree as xml
jpayne@4 12
# NCBI E-utilities endpoints queried by the functions in this file.
# esearch: used by main() to turn the starting accession into a UID.
esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
# esummary: used to fetch bioproject, biosample and SRA summary records by UID.
esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
# elink: used to follow bioproject->bioproject, bioproject->biosample and
# biosample->sra links.
elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
jpayne@4 16
jpayne@4 17
import logging
# Configure the root logger at import time so INFO-level messages are emitted.
logging.basicConfig(level=logging.INFO)

# Named logger shared by log() and the error paths in main().
logger = logging.getLogger("bio2srr")
jpayne@4 22
def log(msg):
    """Emit *msg* at INFO level through the module-level logger."""
    logger.log(logging.INFO, msg)  # fix logging later
jpayne@4 25
def get_tag(root, tag):
    """Return the text of the first element under *root* that matches *tag*.

    *tag* is passed straight to ``root.find``. When nothing matches, a
    "No result" message is logged and ``None`` is returned.
    """
    match = root.find(tag)
    if match is None:
        log(f"No result for {tag}")
        return None
    return match.text
jpayne@4 31
jpayne@4 32
jpayne@4 33
# Column names that are pinned to the front of the output header, in this
# order. "sra_run_accession" is the key flatten_runs() actually emits; the
# older spelling "srr_accession" is kept so existing callers are unaffected.
_PRIORITY_HEADERS = (
    "bioproject",
    "srr_accession",
    "sra_run_accession",
    "biosample_accession",
    "organism",
    "taxid",
    "package",
)


def header_sort_override(a, b):
    """Three-way comparator that orders column names for the output header.

    Names listed in ``_PRIORITY_HEADERS`` sort before every other name and
    keep the relative order they have in that tuple. All remaining names are
    compared with the ordinary ``<`` operator.

    Returns ``0`` when ``a == b``, ``-1`` when *a* sorts first and ``1`` when
    *b* sorts first.
    """
    if a == b:
        return 0
    # Walk the priority names in order: whichever argument is hit first wins.
    for name in _PRIORITY_HEADERS:
        if a == name:
            return -1
        if b == name:
            return 1
    return -1 if a < b else 1


# Key-function form of the comparator, for use with sorted()/list.sort().
hso = cmp_to_key(header_sort_override)
jpayne@4 51
def resolve_bioproject_ids_and_links(bioproject_id_list):
    """Follow bioproject and biosample links, yielding each linked BioSample.

    *bioproject_id_list* is a list of ``(accession, uid)`` tuples to start
    from. Child bioprojects discovered through elink are appended to an
    internal work queue and processed in turn; each UID is queued at most
    once, so repeated or cyclic links cannot cause repeated work.

    Yields ``(bioproject_accession, biosample_uid, sampledata_xml)`` tuples,
    where ``sampledata_xml`` is the ``sampledata`` field of the biosample
    esummary record.

    Raises ``requests.HTTPError`` (via ``raise_for_status``) when any request
    returns an error status.
    """
    # Work on a private copy so the caller's list is not mutated.
    queue = list(bioproject_id_list)
    # UIDs already queued. The original membership test compared a bare UID
    # against (accession, uid) tuples and therefore never matched.
    seen_ids = {uid for _, uid in queue}
    # Appending to a list while iterating over it is deliberate here: the
    # list iterator picks up the newly queued projects.
    for i, (bioproject, bioproject_id) in enumerate(queue):
        log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(queue)}")
        # get bioproject to bioproject links
        response = requests.get(
            elink,
            params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json"),
        )
        response.raise_for_status()
        reply = response.json()
        linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0, 0, {}])
        if len(linksets) >= 3:
            for child_id in linksets[2].get("links", []):  # third index is the up to down links
                if child_id in seen_ids:
                    continue  # already queued; skip the esummary round trip
                seen_ids.add(child_id)
                response = requests.get(
                    esummary,
                    params=dict(id=child_id, db="bioproject", format="json"),
                )
                response.raise_for_status()
                summary = response.json()
                child_accession = summary["result"][child_id]["project_acc"]
                queue.append((child_accession, child_id))  # visit child projects later
        # get bioproject to biosample links
        response = requests.get(
            elink,
            params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json"),
        )
        response.raise_for_status()
        reply = response.json()
        links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
        log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
        for ids in batched(links, 200):
            response = requests.get(
                esummary,
                params=dict(id=",".join(ids), db="biosample", format="json"),
            )
            response.raise_for_status()
            summary = response.json()
            for field, value in summary.get("result", {}).items():
                # The "uids" entry is an index of the result, not a record.
                if "uids" not in field:
                    yield bioproject, field, value["sampledata"]  # this is XML, deliriously
            sleep(0.1)  # pause between esummary batches
jpayne@4 83
jpayne@4 84
# Sample BioSample XML record, kept as a reference for the structure that
# flatten_biosample_xml() reads (Ids, Organism, Package, Attributes).
# It is not referenced by any code visible in this file.
biosample_example = """
<BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
<Ids>
<Id db="BioSample" is_primary="1">SAMN17131268</Id>
<Id db_label="Sample name">CJP19-D996</Id>
</Ids>
<Description>
<Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title>
<Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni">
<OrganismName>Campylobacter jejuni</OrganismName>
</Organism>
</Description>
<Owner>
<Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name>
</Owner>
<Models>
<Model>Pathogen.env</Model>
</Models>
<Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package>
<Attributes>
<Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute>
<Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute>
<Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute>
<Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute>
<Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute>
<Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute>
<Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute>
<Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute>
<Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute>
</Attributes>
<Links>
<Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link>
</Links>
<Status status="live" when="2020-12-21T15:08:05.693"/>
</BioSample>

"""
jpayne@4 122
def flatten_biosample_xml(biosampxml):
    """Flatten one BioSample XML document into a single-level dict.

    The result always holds ``biosample_accession``, ``organism``, ``taxid``
    and ``package`` (each ``None`` when the element is absent), followed by
    one entry per ``Attributes/Attribute`` element. An attribute is keyed by
    its ``harmonized_name`` when it has one, otherwise by its
    ``attribute_name``; the value is the element text.

    Raises ``KeyError`` when an attribute carries neither name, and whatever
    ``ElementTree.fromstring`` raises for malformed XML.
    """
    root = xml.fromstring(biosampxml)
    accession = get_tag(root, r'.//Id[@db="BioSample"]')
    organism = get_tag(root, r".//OrganismName")
    # A record without an <Organism> element must not crash the whole run;
    # treat it like the other optional fields and fall back to None.
    organism_node = root.find(r".//Organism")
    if organism_node is None:
        log("No result for .//Organism")
        tax_id = None
    else:
        tax_id = organism_node.attrib.get("taxonomy_id")
    package = get_tag(root, r".//Package")
    sampledict = dict(
        biosample_accession=accession,
        organism=organism,
        taxid=tax_id,
        package=package,
    )
    for attribute in root.findall("Attributes/Attribute"):
        attrs = attribute.attrib
        # Only look up attribute_name when it is actually needed: a dict.get
        # default is evaluated eagerly and would raise KeyError for
        # attributes that carry harmonized_name alone.
        if "harmonized_name" in attrs:
            name = attrs["harmonized_name"]
        else:
            name = attrs["attribute_name"]
        sampledict[name] = attribute.text

    return sampledict
jpayne@4 141
jpayne@4 142
def yield_sra_runs_from_sample(biosampleids):
    """Yield ``(sra_uid, runs)`` for every SRA record linked to the samples.

    *biosampleids* is an iterable of biosample UID strings. A single UID
    passed as a bare string is accepted too and treated as a one-element
    list; joining a bare string would otherwise split it into characters
    ("17131268" -> "1,7,1,3,1,2,6,8") and query unrelated records.

    ``runs`` is the ``runs`` field of the SRA esummary record, or ``None``
    when the record has no such field.

    Raises ``requests.HTTPError`` (via ``raise_for_status``) when a request
    returns an error status.
    """
    if isinstance(biosampleids, str):
        biosampleids = [biosampleids]
    sleep(0.1)
    response = requests.get(
        elink,
        params=dict(id=",".join(biosampleids), dbfrom="biosample", db="sra", format="json"),
    )
    response.raise_for_status()
    reply = response.json()
    sra_ids = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
    for ids in batched(sra_ids, 200):
        sleep(0.1)
        response = requests.get(
            esummary,
            params=dict(id=",".join(ids), db="sra", format="json"),
        )
        response.raise_for_status()
        summary = response.json()
        for field, value in summary.get("result", {}).items():
            # The "uids" entry is an index of the result, not a record.
            if "uids" not in field:
                yield field, value.get("runs")
jpayne@4 156
jpayne@4 157
# Sample of a "runs" payload: sibling <Run/> elements with no enclosing root
# element, which is why flatten_runs() wraps the text before parsing it.
# It is not referenced by any code visible in this file.
runs_example = """
<Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
<Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
"""
jpayne@4 162
def flatten_runs(runxml):
    """Yield one dict per ``<Run>`` element found in *runxml*.

    Each dict has the keys ``sra_run_accession``, ``total_spots`` and
    ``total_bases``, copied from the element's ``acc``, ``total_spots`` and
    ``total_bases`` attributes. A missing attribute raises ``KeyError``.
    """
    # The embedded XML is a bare sequence of elements, so give it a synthetic
    # root before handing it to the parser.
    wrapped = "<data>{}</data>".format(runxml)
    document = xml.fromstring(wrapped)
    for element in document.iter("Run"):
        attrs = element.attrib
        record = {
            "sra_run_accession": attrs["acc"],
            "total_spots": attrs["total_spots"],
            "total_bases": attrs["total_bases"],
        }
        yield record
jpayne@4 171
jpayne@4 172
jpayne@4 173
def main(starting_bioproject):
    """Collect run metadata for a bioproject tree and write the output files.

    Looks up the UID for *starting_bioproject* with esearch, walks the
    project and its linked child projects, flattens every linked BioSample
    and joins it with each of its SRA runs. Two files are written to the
    current directory:

    * ``metadata.tsv`` - one tab-separated row per run, sorted by
      ``biosample_accession``, columns ordered with ``hso``.
    * ``accessions.txt`` - one run accession per line, in the same order.

    Exits with status 1 when esearch returns no UID for the accession.
    ``requests.HTTPError`` from any request propagates to the caller.
    """
    rows = []
    response = requests.get(
        esearch,
        params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json"),
    )
    response.raise_for_status()
    reply = response.json()
    # Read the reply defensively: a missing key must lead to the "no results"
    # exit below rather than an unrelated KeyError traceback.
    search_result = reply.get("esearchresult", {})
    id_list = search_result.get("idlist", [])
    if not id_list:
        messages = search_result.get("warninglist", {}).get("outputmessages")
        logger.error(f"No results found for '{starting_bioproject}'. Error was \"{messages}\"")
        sys.exit(1)
    bioproject_id = id_list[0]
    log(f"Found UID {bioproject_id} for '{starting_bioproject}'")

    start = [(starting_bioproject, bioproject_id)]
    for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links(start):
        try:
            sampledict = flatten_biosample_xml(biosample_xml)
        except KeyError:
            log(biosample_xml)  # dump the offending record before failing
            raise
        sampledict["bioproject"] = bioproject
        # Pass the UID as a one-element list: the callee joins its argument
        # with commas, which would split a bare string into characters.
        for _sra, runs in yield_sra_runs_from_sample([biosample]):
            if not runs:
                continue  # SRA record without run information
            for run in flatten_runs(runs.strip()):
                run.update(sampledict)
                rows.append(run)

    log(f"Writing {len(rows)} rows to metadata.tsv")

    # The header is the union of all keys, since samples carry different
    # attribute sets.
    header = sorted({key for row in rows for key in row}, key=hso)
    logger.info(f"Header: {header}")

    # An accession can be None when the record lacked the element; sort those
    # first instead of failing on a None/str comparison.
    rows.sort(key=lambda x: x["biosample_accession"] or "")

    # newline="" lets the csv module control line endings itself.
    with open("metadata.tsv", "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
        writer.writeheader()
        writer.writerows(rows)

    log(f"Writing {len(rows)} accessions to accessions.txt")

    with open("accessions.txt", "w") as f:
        f.writelines(row["sra_run_accession"] + "\n" for row in rows)
jpayne@4 219
jpayne@4 220
if __name__ == "__main__":
    # Script entry point: the bioproject accession is the first CLI argument.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        logger.error("Usage: bio2srr.py <bioproject accession>")
        sys.exit(2)
    b = sys.argv[1].strip()
    log(f"Starting with {b}")
    try:
        main(b)
    except requests.HTTPError as e:
        # Report request failures as a single log line and a non-zero exit.
        logger.error(e)
        sys.exit(1)
jpayne@4 229
jpayne@4 230
jpayne@4 231
jpayne@4 232
jpayne@4 233