Mercurial > repos > jpayne > bioproject2srr
annotate bio2srr.py @ 0:79fa4330f2c9 draft default tip
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
| author | jpayne |
|---|---|
| date | Mon, 08 Dec 2025 20:18:03 +0000 |
| parents | |
| children |
| rev | line source |
|---|---|
|
0
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils" |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
2 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
3 import requests |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
4 import sys |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
5 import csv |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
6 import os |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
7 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
try:
    from itertools import batched
except ImportError:
    # Older interpreters do not ship itertools.batched; provide an equivalent.
    from itertools import islice

    def batched(iterable, n):
        """Yield successive tuples of at most ``n`` items from ``iterable``.

        The final tuple may hold fewer than ``n`` items, e.g.
        ``batched('ABCDEFG', 3)`` produces ``ABC``, ``DEF``, ``G``.

        Raises ``ValueError`` when ``n`` is smaller than one.
        """
        if n < 1:
            raise ValueError('n must be at least one')
        source = iter(iterable)
        while True:
            chunk = tuple(islice(source, n))
            if not chunk:
                # Source exhausted: an empty chunk ends the generator.
                return
            yield chunk
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
20 from functools import cmp_to_key |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
21 from time import sleep |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
22 from xml.etree import ElementTree as xml |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
23 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
24 esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
25 esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
26 elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi" |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
27 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
28 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
29 import logging |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger("bio2srr")

# Extra query parameters merged into E-utilities requests (the API key, if any).
extra_params = {}

# Optional NCBI API key, read once from the environment at import time.
api_key = os.environ.get("NCBI_API_KEY")


def _mask_api_key(key):
    """Return ``key`` with everything but its first and last four characters starred.

    Keys of eight characters or fewer are starred out completely; slicing
    them the same way would reproduce the whole key in the log output.
    """
    if len(key) <= 8:
        return "*" * len(key)
    return f"{key[:4]}{'*' * (len(key) - 8)}{key[-4:]}"


if api_key:
    logger.info(f"Using NCBI API key {_mask_api_key(api_key)}")
    extra_params["api_key"] = api_key


def log(msg):
    """Log ``msg`` at INFO level, masking the API key wherever it appears in it."""
    if api_key:
        msg = msg.replace(api_key, _mask_api_key(api_key))
    logger.info(msg)
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
47 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
def get_tag(root, tag):
    """Return the text of the first element under ``root`` matching ``tag``.

    ``tag`` is passed straight to ``root.find``. When nothing matches, a
    "No result" message is logged and ``None`` is returned.
    """
    node = root.find(tag)
    if node is None:
        log(f"No result for {tag}")
        return None
    return node.text
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
53 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
54 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
55 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
# Column names that sort ahead of every other header, in this order.
# NOTE(review): flatten_runs emits the key "sra_run_accession", so the
# "srr_accession" entry may never match a real column - confirm against the caller.
_PRIORITY_HEADERS = (
    "bioproject",
    "srr_accession",
    "biosample_accession",
    "organism",
    "taxid",
    "package",
)


def header_sort_override(a, b):
    """Three-way comparison of two header names for use with ``cmp_to_key``.

    Returns 0 when the names are equal. Otherwise the name that appears
    earliest in ``_PRIORITY_HEADERS`` sorts first; names outside that list
    are ordered with the ordinary ``<`` comparison.
    """
    if a == b:
        return 0
    # Walk the priorities in order so the highest-priority name wins.
    for name in _PRIORITY_HEADERS:
        if a == name:
            return -1
        if b == name:
            return 1
    return -1 if a < b else 1


hso = cmp_to_key(header_sort_override)
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
73 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
def resolve_bioproject_ids_and_links(bioproject_id_list):
    """Follow BioProject and BioSample links and yield BioSample summaries.

    ``bioproject_id_list`` is a list of ``(accession, uid)`` tuples that
    doubles as the work queue: every linked BioProject discovered here is
    appended to it while it is being iterated, so linked projects are
    processed in turn. The caller's list is therefore mutated.

    Yields ``(bioproject, biosample_uid, sampledata)`` tuples, where
    ``sampledata`` is the XML string stored under the ``"sampledata"`` key of
    the esummary record.

    Any failed HTTP request raises through ``response.raise_for_status()``.
    """
    # UIDs already queued, so a project reachable through several links (or
    # through a cycle) is only processed once.
    queued_uids = {uid for _, uid in bioproject_id_list}
    for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
        log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")

        # bioproject -> bioproject links
        response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
        response.raise_for_status()
        linksets = response.json().get("linksets") or [{}]
        linksetdbs = linksets[0].get("linksetdbs") or []
        if len(linksetdbs) >= 3:
            # The third entry holds the up-to-down links.
            for linked_uid in linksetdbs[2].get("links", []):
                if linked_uid in queued_uids:
                    continue
                response = requests.get(esummary, params=dict(id=linked_uid, db="bioproject", format="json", **extra_params))
                response.raise_for_status()
                accession = response.json()["result"][linked_uid]["project_acc"]
                queued_uids.add(linked_uid)
                # Appending to the list being iterated is what makes the walk recursive.
                bioproject_id_list.append((accession, linked_uid))

        # bioproject -> biosample links
        response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
        response.raise_for_status()
        linksets = response.json().get("linksets") or [{}]
        linksetdbs = linksets[0].get("linksetdbs") or [{}]
        links = linksetdbs[0].get("links", [])
        log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
        for ids in batched(links, 200):
            response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json", **extra_params))
            response.raise_for_status()
            summaries = response.json().get("result", {})
            for field, value in summaries.items():
                # The "uids" entry is an index of the result, not a sample record.
                if "uids" not in field:
                    yield bioproject, field, value["sampledata"]  # embedded XML string
            # Throttle between batches; a shorter pause is used when an API key is set.
            sleep(1 if not api_key else 0.1)
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
105 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
106 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
107 biosample_example = """ |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
108 <BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268"> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
109 <Ids> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
110 <Id db="BioSample" is_primary="1">SAMN17131268</Id> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
111 <Id db_label="Sample name">CJP19-D996</Id> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
112 </Ids> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
113 <Description> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
114 <Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
115 <Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni"> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
116 <OrganismName>Campylobacter jejuni</OrganismName> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
117 </Organism> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
118 </Description> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
119 <Owner> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
120 <Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
121 </Owner> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
122 <Models> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
123 <Model>Pathogen.env</Model> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
124 </Models> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
125 <Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
126 <Attributes> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
127 <Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
128 <Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
129 <Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
130 <Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
131 <Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
132 <Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
133 <Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
134 <Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
135 <Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
136 </Attributes> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
137 <Links> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
138 <Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
139 </Links> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
140 <Status status="live" when="2020-12-21T15:08:05.693"/> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
141 </BioSample> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
142 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
143 """ |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
144 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
def flatten_biosample_xml(biosampxml):
    """Flatten a BioSample XML document into a single-level dict.

    The result always carries ``biosample_accession``, ``organism``, ``taxid``
    and ``package`` (each ``None`` when the source element is absent), plus
    one entry per ``Attributes/Attribute`` element keyed by its
    ``harmonized_name``, falling back to ``attribute_name``. An attribute
    whose key matches one of the fixed fields overwrites that field.

    Raises ``KeyError`` when an attribute has neither name.
    """
    root = xml.fromstring(biosampxml)
    # A record without an <Organism> element simply has no taxonomy id.
    organism_node = root.find(r".//Organism")
    sampledict = dict(
        biosample_accession=get_tag(root, r'.//Id[@db="BioSample"]'),
        organism=get_tag(root, r".//OrganismName"),
        taxid=organism_node.attrib.get("taxonomy_id") if organism_node is not None else None,
        package=get_tag(root, r".//Package"),
    )
    for attribute in root.findall("Attributes/Attribute"):
        key = attribute.attrib.get("harmonized_name")
        if key is None:
            # Only look up the raw name when no harmonized name exists.
            key = attribute.attrib["attribute_name"]
        sampledict[key] = attribute.text

    return sampledict
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
163 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
164 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
def yield_sra_runs_from_sample(biosample):
    """Yield ``(sra_uid, runs)`` pairs for the SRA records linked to ``biosample``.

    The BioSample's SRA links are fetched with elink, then summarised with
    esummary in batches of 200 UIDs. ``runs`` is the ``"runs"`` value of each
    summary record (``None`` when the record has none); presumably it is the
    XML fragment illustrated by ``runs_example`` - verify against the caller.
    A pause precedes every request, shorter when an API key is configured.
    """
    pause = 0.1 if api_key else 1

    sleep(pause)
    link_query = dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params)
    link_response = requests.get(elink, params=link_query)
    link_response.raise_for_status()
    linksets = link_response.json().get("linksets", [{}])
    sra_uids = linksets[0].get("linksetdbs", [{}])[0].get("links", [])

    for uid_batch in batched(sra_uids, 200):
        sleep(pause)
        summary_query = dict(id=','.join(uid_batch), db="sra", format="json", **extra_params)
        summary_response = requests.get(esummary, params=summary_query)
        summary_response.raise_for_status()
        records = summary_response.json().get("result", {})
        for uid, record in records.items():
            # Skip the "uids" index entry; every other key is a record.
            if "uids" in uid:
                continue
            yield uid, record.get("runs")
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
178 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
179 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
180 runs_example = """ |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
181 <Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
182 <Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/> |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
183 """ |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
184 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
def flatten_runs(runxml):
    """Yield one dict per public ``<Run>`` element found in ``runxml``.

    ``runxml`` is a fragment of sibling ``<Run .../>`` elements with no
    single root, so it is wrapped in a synthetic ``<data>`` element before
    parsing. Each dict holds ``sra_run_accession``, ``total_spots`` and
    ``total_bases`` taken from the element's attributes.

    Runs whose ``is_public`` attribute is ``"false"`` are logged and skipped.
    Raises ``KeyError`` when a public run lacks one of the three attributes.
    """
    root = xml.fromstring(f"<data>{runxml}</data>")
    for run in root.findall(".//Run"):
        if run.attrib.get("is_public") == "false":
            logger.warning(f"Skipping non-public run {run.attrib.get('acc')}")
            # Actually skip it, as the message says.
            continue
        yield dict(
            sra_run_accession=run.attrib["acc"],
            total_spots=run.attrib["total_spots"],
            total_bases=run.attrib["total_bases"],
        )
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
195 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
196 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
197 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
def main(starting_bioproject):
    """Collect sample and run metadata for a BioProject and write it to disk.

    Resolves *starting_bioproject* to a UID with an esearch request, walks
    the samples produced by ``resolve_bioproject_ids_and_links``, and joins
    each sample's flattened metadata onto each of its runs. A sample with
    no runs still contributes one row holding only the sample metadata.

    Two files are written to the current working directory:

    * ``metadata.tsv`` - one tab-separated row per run (or per run-less
      sample), sorted by ``biosample_accession``.
    * ``accessions.txt`` - the sorted, de-duplicated run accessions, one
      per line.

    Args:
        starting_bioproject: The BioProject term to search for.

    Raises:
        requests.HTTPError: If the esearch request returns an error status.
        SystemExit: With status 1 if the search returns no UID.
        KeyError: Re-raised from ``flatten_biosample_xml`` after the
            offending XML has been logged.
    """
    response = requests.get(
        esearch,
        params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json"),
    )
    response.raise_for_status()
    reply = response.json()

    # Look the id list up defensively: a reply without "idlist" or without
    # "warninglist" used to escape as an unrelated KeyError instead of
    # producing the intended error message and exit status.
    search_result = reply.get("esearchresult", {})
    id_list = search_result.get("idlist") or []
    if not id_list:
        # NOTE(review): the "ERROR" key is how esearch appears to report
        # malformed queries in JSON mode - confirm against the API docs.
        detail = (
            search_result.get("warninglist", {}).get("outputmessages")
            or search_result.get("ERROR")
            or "no detail returned by esearch"
        )
        logger.error(f"No results found for '{starting_bioproject}'. Error was \"{detail}\"")
        sys.exit(1)
    bioproject_id = id_list[0]
    log(f"Found UID {bioproject_id} for '{starting_bioproject}'")

    # Pause between requests; the pause is shorter when an API key is set
    # (presumably to respect the service's rate limits - confirm).
    sleep(1 if not api_key else 0.1)

    rows = []
    for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
        try:
            sampledict = flatten_biosample_xml(biosample_xml)
        except KeyError:
            # Log the XML that could not be flattened before propagating.
            log(biosample_xml)
            raise
        sampledict["bioproject"] = bioproject
        noruns = True
        for _sra, runs in yield_sra_runs_from_sample(biosample):
            for run in flatten_runs(runs.strip()):
                noruns = False
                run.update(sampledict)
                rows.append(run)
        if noruns:
            # Keep the sample visible in the metadata table even though it
            # has no run to attach to.
            rows.append(sampledict)

    log(f"Writing {len(rows)} rows to metadata.tsv")

    # Rows do not all share the same keys, so the header is the union of
    # every key seen, ordered by the file's `hso` sort key.
    header = sorted({key for row in rows for key in row}, key=hso)
    # logger.info(f"Header: {header}")

    rows.sort(key=lambda x: x["biosample_accession"])

    # newline="" is required by the csv module so the writer's own line
    # terminator is not translated a second time by the text layer.
    with open("metadata.tsv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
        writer.writeheader()
        writer.writerows(rows)

    # check for duplicate runs and unreleased samples

    accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")]
    raw_length = len(accessions)
    accessions = sorted(set(accessions))

    # Rows without a run accession are the run-less sample rows added above.
    if raw_length < len(rows):
        logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt")

    if len(accessions) < raw_length:
        logger.warning("Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not")

    log(f"Writing {len(accessions)} unique accessions to accessions.txt")

    with open("accessions.txt", "w") as f:
        f.writelines(accession + "\n" for accession in accessions)
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
262 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
263 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the BioProject
    # to start from.
    if len(sys.argv) < 2 or not sys.argv[1].strip():
        # Fail with a readable message instead of an IndexError traceback.
        logger.error(f"Usage: {sys.argv[0]} <bioproject accession>")
        sys.exit(1)
    b = sys.argv[1].strip()
    log(f"Starting with {b}")
    try:
        main(b)
    except requests.RequestException as e:
        # RequestException is the base class of HTTPError, so HTTP errors
        # are still handled here; connection failures and timeouts now
        # take the same logged-error exit path.
        logger.error(e)
        sys.exit(1)
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
272 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
273 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
274 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
275 |
|
79fa4330f2c9
planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff
changeset
|
276 |
