annotate bio2srr.py @ 2:556cac4fb538

"planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/b07378e15ad9"
author jpayne
date Mon, 03 Aug 2020 10:39:37 -0400
parents 02ac32a00e25
children 80f1001797c7
rev   line source
jpayne@0 1 #! /usr/bin/env python3
jpayne@0 2
jpayne@0 3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST APIs."
jpayne@0 4
jpayne@0 5 import requests
jpayne@0 6 import sys
jpayne@2 7 from xml.etree import ElementTree as xml
jpayne@2 8 import csv
jpayne@0 9
# Common root of the EBI Search REST endpoints used by the templates below.
_EBI_SEARCH_REST = "https://www.ebi.ac.uk/ebisearch/ws/rest"

# Search of the "sra-experiment" domain; fill in with .format(bioproject=...).
sra_exp_query = _EBI_SEARCH_REST + "/sra-experiment?query={bioproject}"

# Looks like an example JSON payload for the experiment search above.  It is
# not referenced by the rest of the visible script and is rebound further down.
sample = """{
"hitCount": 2,
"entries": [
{
"id": "SRX377510",
"source": "sra-experiment"
},
{
"id": "SRX583279",
"source": "sra-experiment"
}
],
"facets": []
}"""

# Record view requested with display=xml; fill in with .format(accession=...).
data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml"

# Cross-reference lookup from "sra-run" entries to "sra-sample"; {runs} is a
# comma-separated list of run ids.
xref_query = _EBI_SEARCH_REST + "/sra-run/entry/{runs}/xref/sra-sample"

# Search of the "sra-run" domain; fill in with .format(bioproject=...).
sra_run_query = _EBI_SEARCH_REST + "/sra-run?query={bioproject}"

# Looks like an example JSON payload for the run search above.  This second
# assignment replaces the earlier value of `sample`.
sample = """{
"hitCount": 1,
"entries": [
{
"id": "SRR1029665",
"source": "sra-run"
}
],
"facets": []
}"""
jpayne@0 43
def pair_runs_with_samples(entries):
    """Return ``(run_id, sample_accession)`` pairs from xref result entries.

    Each entry is expected to be a dict with an ``'id'`` and a ``'references'``
    list whose first item has an ``'acc'`` key.  Entries with no references
    are reported on stderr and skipped instead of raising ``IndexError``.
    """
    pairs = []
    for entry in entries:
        references = entry.get('references') or []
        if not references:
            print(f"No sample cross-reference found for '{entry['id']}'.", file=sys.stderr)
            continue
        pairs.append((entry['id'], references[0]['acc']))
    return pairs


def parse_sample_attributes(sample_xml):
    """Return a ``{key: value}`` dict built from every SAMPLE_ATTRIBUTE element.

    The first child of each SAMPLE_ATTRIBUTE supplies the key text and the
    second child, when present, supplies the value text.  An attribute with
    only one child maps to ``None``; children beyond the second are ignored
    (presumably extras such as units -- confirm against the ENA schema).
    Attributes with no children or no key text are skipped.
    """
    attributes = {}
    root = xml.fromstring(sample_xml)
    for attr in root.findall('.//SAMPLE_ATTRIBUTE'):
        children = list(attr)
        if not children or children[0].text is None:
            continue
        value = children[1].text if len(children) > 1 else None
        attributes[children[0].text] = value
    return attributes


def collect_headers(records):
    """Return every key seen across *records*, in first-seen order."""
    headers = {}  # a dict keeps insertion order, giving a stable column order
    for record in records:
        headers.update(dict.fromkeys(record))
    return list(headers)


if __name__ == "__main__":
    # Usage: <script> ACCESSION [EXISTING_METADATA_TSV]
    if len(sys.argv) < 2:
        raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.")
    bioproject = sys.argv[1]

    # NOTE(review): the search service may paginate its results -- confirm that
    # projects with many runs are retrieved in full by this single request.
    b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json"))
    b_result.raise_for_status()
    try:
        runs = [d['id'] for d in b_result.json()['entries']]
    except KeyError as e:
        raise ValueError(f"Unexpected search response for '{bioproject}': missing key {e}.") from e
    if not runs:
        print(f"No results found for '{bioproject}'.", file=sys.stderr)
        # sys.exit rather than quit(): quit is only defined when the site
        # module has been loaded.
        sys.exit(1)

    # Optional second argument: an existing tab-separated table whose rows are
    # kept ahead of the newly fetched records.
    rcds = []
    if len(sys.argv) > 2:
        with open(sys.argv[2], 'r', newline='') as f:
            rcds = list(csv.DictReader(f, dialect='excel', delimiter='\t'))

    # Map every run to the accession of its first cross-referenced sample.
    res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json"))
    res.raise_for_status()
    bsams = pair_runs_with_samples(res.json()['entries'])

    for run_id, sam_id in bsams:
        record = {'sample': run_id, 'biosample_accession': sam_id}
        res = requests.get(data_query.format(accession=sam_id))
        res.raise_for_status()
        # Attributes are merged after the two fixed columns, so an attribute
        # with the same name overrides them (as the original loop did).
        record.update(parse_sample_attributes(res.text))
        rcds.append(record)
        print(run_id)

    # newline='' lets the csv module control line endings itself.
    with open('./metadata.tsv', 'w', newline='') as f:
        wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=collect_headers(rcds))
        wtr.writeheader()
        wtr.writerows(rcds)
jpayne@2 94
jpayne@2 95
jpayne@2 96