bioproject_to_srr_2: bio2srr.py annotate

annotate bio2srr.py @ 3:80f1001797c7

Uploaded

author	jpayne
date	Wed, 27 Oct 2021 05:00:45 -0400
parents	556cac4fb538
children	2d4a2159c74b

rev	line source
jpayne@0	1 #! /usr/bin/env python3
jpayne@0	2
jpayne@0	3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST API's."
jpayne@0	4
jpayne@0	5 import requests
jpayne@0	6 import sys
jpayne@2	7 from xml.etree import ElementTree as xml
jpayne@2	8 import csv
jpayne@0	9
jpayne@3	10 from time import sleep
jpayne@3	11
jpayne@2	12 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}"
jpayne@0	13
jpayne@0	14 sample = """{
jpayne@0	15 "hitCount": 2,
jpayne@0	16 "entries": [
jpayne@0	17 {
jpayne@0	18 "id": "SRX377510",
jpayne@0	19 "source": "sra-experiment"
jpayne@0	20 },
jpayne@0	21 {
jpayne@0	22 "id": "SRX583279",
jpayne@0	23 "source": "sra-experiment"
jpayne@0	24 }
jpayne@0	25 ],
jpayne@0	26 "facets": []
jpayne@0	27 }"""
jpayne@0	28
jpayne@3	29 data_query = "?display=xml"
jpayne@2	30
jpayne@3	31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run"
jpayne@0	32
jpayne@0	33 sample = """{
jpayne@0	34 "hitCount": 1,
jpayne@0	35 "entries": [
jpayne@0	36 {
jpayne@0	37 "id": "SRR1029665",
jpayne@0	38 "source": "sra-run"
jpayne@0	39 }
jpayne@0	40 ],
jpayne@0	41 "facets": []
jpayne@0	42 }"""
jpayne@0	43
jpayne@3	44 def get_tag(root, tag):
jpayne@3	45 val = root.find(tag)
jpayne@3	46 if val:
jpayne@3	47 return val.text
jpayne@3	48
jpayne@0	49 if __name__ == "__main__":
jpayne@2	50 try:
jpayne@2	51 bioproject = sys.argv[1]
jpayne@3	52
jpayne@3	53 b_result = None
jpayne@3	54
jpayne@3	55 runs = []
jpayne@3	56
jpayne@3	57 while not b_result or len(runs) < b_result['hitCount']:
jpayne@3	58 b_result = requests.get(sra_run_query, params=dict(query=bioproject, start=len(runs)), headers=dict(Accept="application/json"))
jpayne@3	59 b_result.raise_for_status()
jpayne@3	60 b_result = b_result.json()
jpayne@3	61 runs += [d['id'] for d in b_result['entries']]
jpayne@3	62
jpayne@2	63 if not runs:
jpayne@2	64 print(f"No results found for '{bioproject}'.", file=sys.stderr)
jpayne@2	65 quit(1)
jpayne@2	66 except IndexError:
jpayne@2	67 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
jpayne@0	68
jpayne@2	69 try:
jpayne@2	70 with open(sys.argv[2], 'r') as f:
jpayne@2	71 rdr = csv.DictReader(f, dialect='excel', delimiter='\t')
jpayne@2	72 rcds = list(rdr)
jpayne@0	73
jpayne@0	74
jpayne@2	75 except IndexError:
jpayne@2	76 rcds = []
jpayne@2	77
jpayne@3	78 bsams = []
jpayne@2	79
jpayne@3	80 for id in runs:
jpayne@3	81 res = requests.get(
jpayne@3	82 f"https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{id}/xref/sra-sample",
jpayne@3	83 headers=dict(Accept="application/json")
jpayne@3	84 )
jpayne@3	85 res.raise_for_status()
jpayne@3	86 bsams.append(res.json()['entries'][0]['references'][0]['acc'])
jpayne@3	87 sleep(.1)
jpayne@3	88
jpayne@3	89 # res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
jpayne@3	90 # res.raise_for_status()
jpayne@3	91 # bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']]
jpayne@3	92
jpayne@3	93 for run_id, sam_id in zip(runs, bsams):
jpayne@3	94 print(run_id)
jpayne@2	95 record = {}
jpayne@2	96 record['sample'] = run_id
jpayne@2	97 record['biosample_accession'] = sam_id
jpayne@3	98 res = requests.get(
jpayne@3	99 f"https://www.ebi.ac.uk/ena/browser/api/xml/{sam_id}"
jpayne@3	100 )
jpayne@2	101 res.raise_for_status()
jpayne@2	102 root = xml.fromstring(res.text)
jpayne@3	103
jpayne@3	104 record['submitter_id'] = get_tag(root, './/SUBMITTER_ID')
jpayne@3	105 record['scientific_name'] = get_tag(root, './/SCIENTIFIC_NAME')
jpayne@3	106
jpayne@2	107 for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
jpayne@2	108 key, value = iter(attr)
jpayne@2	109 record[key.text] = value.text
jpayne@2	110 rcds.append(record)
jpayne@3	111 sleep(.1)
jpayne@2	112
jpayne@2	113 headers = {}
jpayne@2	114 for record in rcds:
jpayne@2	115 for key in record.keys():
jpayne@2	116 headers[key] = None # use a dict to preserve header order
jpayne@2	117
jpayne@2	118
jpayne@2	119 with open('./metadata.tsv', 'w') as f:
jpayne@2	120 wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys())
jpayne@2	121 wtr.writeheader()
jpayne@2	122 wtr.writerows(rcds)
jpayne@2	123
jpayne@2	124
jpayne@2	125

Mercurial > repos > jpayne > bioproject_to_srr_2

annotate bio2srr.py @ 3:80f1001797c7