Mercurial > repos > jpayne > bioproject_to_srr_2
comparison bio2srr.py @ 2:556cac4fb538
"planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/b07378e15ad9"
author | jpayne |
---|---|
date | Mon, 03 Aug 2020 10:39:37 -0400 |
parents | 02ac32a00e25 |
children | 80f1001797c7 |
comparison
equal
deleted
inserted
replaced
1:b07378e15ad9 | 2:556cac4fb538 |
---|---|
2 | 2 |
3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST API's." | 3 "Grab SRR numbers from BioProjects via the EMBL-ENA REST API's." |
4 | 4 |
5 import requests | 5 import requests |
6 import sys | 6 import sys |
7 from xml.etree import ElementTree as xml | |
8 import csv | |
7 | 9 |
8 sra_exp_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}" | 10 sra_exp_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-experiment?query={bioproject}" |
9 | 11 |
10 sample = """{ | 12 sample = """{ |
11 "hitCount": 2, | 13 "hitCount": 2, |
12 "entries": [ | 14 "entries": [ |
13 { | 15 { |
20 } | 22 } |
21 ], | 23 ], |
22 "facets": [] | 24 "facets": [] |
23 }""" | 25 }""" |
24 | 26 |
25 sra_run_query = "http://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={experiment}" | 27 data_query = "https://www.ebi.ac.uk/ena/data/view/{accession}&display=xml" |
28 | |
29 xref_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run/entry/{runs}/xref/sra-sample" | |
30 | |
31 sra_run_query = "https://www.ebi.ac.uk/ebisearch/ws/rest/sra-run?query={bioproject}" | |
26 | 32 |
27 sample = """{ | 33 sample = """{ |
28 "hitCount": 1, | 34 "hitCount": 1, |
29 "entries": [ | 35 "entries": [ |
30 { | 36 { |
34 ], | 40 ], |
35 "facets": [] | 41 "facets": [] |
36 }""" | 42 }""" |
37 | 43 |
38 if __name__ == "__main__": | 44 if __name__ == "__main__": |
39 try: | 45 try: |
40 bioproject = sys.argv[1] | 46 bioproject = sys.argv[1] |
41 b_result = requests.get(sra_exp_query.format(bioproject=bioproject), headers=dict(Accept="application/json")) | 47 b_result = requests.get(sra_run_query.format(bioproject=bioproject), headers=dict(Accept="application/json")) |
42 b_result.raise_for_status() | 48 b_result.raise_for_status() |
43 if b_result.json()['entries']: | 49 runs = [d['id'] for d in b_result.json()['entries']] |
44 for experiment in [d['id'] for d in b_result.json()['entries']]: | 50 if not runs: |
45 r_result = requests.get(sra_run_query.format(experiment=experiment), headers=dict(Accept="application/json")) | 51 print(f"No results found for '{bioproject}'.", file=sys.stderr) |
46 r_result.raise_for_status() | 52 quit(1) |
47 for run in [d['id'] for d in r_result.json()['entries']]: | 53 except IndexError: |
48 print(run) | 54 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") |
49 else: | 55 except KeyError as e: |
50 print(f"No results found for '{bioproject}'.", file=sys.stderr) | 56 raise ValueError() from e |
51 quit(1) | 57 |
52 except IndexError: | 58 try: |
53 raise ValueError("Please provide an NCBI BioProject, NCBI BioSample, EMBL Project, or EMBL Study accession.") | 59 with open(sys.argv[2], 'r') as f: |
54 except KeyError as e: | 60 rdr = csv.DictReader(f, dialect='excel', delimiter='\t') |
55 raise ValueError() from e | 61 rcds = list(rdr) |
62 | |
63 | |
64 except IndexError: | |
65 rcds = [] | |
66 | |
67 res = requests.get(xref_query.format(runs=",".join(runs)), headers=dict(Accept="application/json")) | |
68 res.raise_for_status() | |
69 bsams = [(e['id'], e['references'][0]['acc']) for e in res.json()['entries']] | |
70 | |
71 for run_id, sam_id in bsams: | |
72 record = {} | |
73 record['sample'] = run_id | |
74 record['biosample_accession'] = sam_id | |
75 res = requests.get(data_query.format(accession=sam_id)) | |
76 res.raise_for_status() | |
77 root = xml.fromstring(res.text) | |
78 for attr in root.findall('.//SAMPLE_ATTRIBUTE'): | |
79 key, value = iter(attr) | |
80 record[key.text] = value.text | |
81 rcds.append(record) | |
82 print(run_id) | |
83 | |
84 headers = {} | |
85 for record in rcds: | |
86 for key in record.keys(): | |
87 headers[key] = None # use a dict to preserve header order | |
88 | |
89 | |
90 with open('./metadata.tsv', 'w') as f: | |
91 wtr = csv.DictWriter(f, dialect='excel', delimiter='\t', fieldnames=headers.keys()) | |
92 wtr.writeheader() | |
93 wtr.writerows(rcds) | |
56 | 94 |
57 | 95 |
58 | 96 |