annotate bio2srr.py @ 3:80f1001797c7

Uploaded
author jpayne
date Wed, 27 Oct 2021 05:00:45 -0400
parents 556cac4fb538
children 2d4a2159c74b
rev   line source
jpayne@0 1 #! /usr/bin/env python3
jpayne@0 2
jpayne@0 3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST API's."
jpayne@0 4
jpayne@0 5 import requests
jpayne@0 6 import sys
jpayne@2 7 from xml.etree import ElementTree as xml
jpayne@2 8 import csv
jpayne@0 9
jpayne@3 10 from time import sleep
jpayne@3 11
jpayne@2 12 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
jpayne@0 13
jpayne@0 14 sample = """{
jpayne@0 15 "hitCount": 2,
jpayne@0 16 "entries": [
jpayne@0 17 {
jpayne@0 18 "id": "SRX377510",
jpayne@0 19 "source": "sra-experiment"
jpayne@0 20 },
jpayne@0 21 {
jpayne@0 22 "id": "SRX583279",
jpayne@0 23 "source": "sra-experiment"
jpayne@0 24 }
jpayne@0 25 ],
jpayne@0 26 "facets": []
jpayne@0 27 }"""
jpayne@0 28
jpayne@3 29 data_query = "?display=xml"
jpayne@2 30
jpayne@3 31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"
jpayne@0 32
jpayne@0 33 sample = """{
jpayne@0 34 "hitCount": 1,
jpayne@0 35 "entries": [
jpayne@0 36 {
jpayne@0 37 "id": "SRR1029665",
jpayne@0 38 "source": "sra-run"
jpayne@0 39 }
jpayne@0 40 ],
jpayne@0 41 "facets": []
jpayne@0 42 }"""
jpayne@0 43
jpayne@3 44 def get_tag(root, tag):
jpayne@3 45 val = root.find(tag)
jpayne@3 46 if val:
jpayne@3 47 return val.text
jpayne@3 48
jpayne@0 49 if __name__ == "__main__":
jpayne@2 50 try:
jpayne@2 51 bioproject = sys.argv[1]
jpayne@3 52
jpayne@3 53 b_result = None
jpayne@3 54
jpayne@3 55 runs = []
jpayne@3 56
jpayne@3 57 while not b_result or len(runs) < b_result['hitCount']:
jpayne@3 58 b_result = requests.get(sra_run_query, params=dict(query=bioproject, start=len(runs)), headers=dict(Accept="application/json"))
jpayne@3 59 b_result.raise_for_status()
jpayne@3 60 b_result = b_result.json()
jpayne@3 61 runs += [d['id'] for d in b_result['entries']]
jpayne@3 62
jpayne@2 63 if not runs:
jpayne@2 64 print(f"No results found for '{bioproject}'.", file=sys.stderr)
jpayne@2 65 quit(1)
jpayne@2 66 except IndexError:
jpayne@2 67 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
jpayne@0 68
jpayne@2 69 try:
jpayne@2 70 with open(sys.argv[2], 'r') as f:
jpayne@2 71 rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
jpayne@2 72 rcds = list(rdr)
jpayne@0 73
jpayne@0 74
jpayne@2 75 except IndexError:
jpayne@2 76 rcds = []
jpayne@2 77
jpayne@3 78 bsams = []
jpayne@2 79
jpayne@3 80 for id in runs:
jpayne@3 81 res = requests.get(
jpayne@3 82 f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{id}/xref/sra-sample",
jpayne@3 83 headers=dict(Accept="application/json")
jpayne@3 84 )
jpayne@3 85 res.raise_for_status()
jpayne@3 86 bsams.append(res.json()['entries'][0]['references'][0]['acc'])
jpayne@3 87 sleep(.1)
jpayne@3 88
jpayne@3 89 # res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
jpayne@3 90 # res.raise_for_status()
jpayne@3 91 # bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
jpayne@3 92
jpayne@3 93 for run_id, sam_id in zip(runs, bsams):
jpayne@3 94 print(run_id)
jpayne@2 95 record = {}
jpayne@2 96 record['sample'] = run_id
jpayne@2 97 record['biosample_accession'] = sam_id
jpayne@3 98 res = requests.get(
jpayne@3 99 f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}"
jpayne@3 100 )
jpayne@2 101 res.raise_for_status()
jpayne@2 102 root = xml.fromstring(res.text)
jpayne@3 103
jpayne@3 104 record['submitter_id'] = get_tag(root, './/SUBMITTER_ID')
jpayne@3 105 record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME')
jpayne@3 106
jpayne@2 107 for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
jpayne@2 108 key, value = iter(attr)
jpayne@2 109 record[key.text] = value.text
jpayne@2 110 rcds.append(record)
jpayne@3 111 sleep(.1)
jpayne@2 112
jpayne@2 113 headers = {}
jpayne@2 114 for record in rcds:
jpayne@2 115 for key in record.keys():
jpayne@2 116 headers[key] = None # use a dict to preserve header order
jpayne@2 117
jpayne@2 118
jpayne@2 119 with open('./metadata.tsv', 'w') as f:
jpayne@2 120 wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys())
jpayne@2 121 wtr.writeheader()
jpayne@2 122 wtr.writerows(rcds)
jpayne@2 123
jpayne@2 124
jpayne@2 125