annotate bio2srr.py @ 0:79fa4330f2c9 draft default tip

planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
author jpayne
date Mon, 08 Dec 2025 20:18:03 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
1 "Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils"
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
2
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
3 import requests
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
4 import sys
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
5 import csv
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
6 import os
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
7
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
try:
    from itertools import batched
except ImportError:  # itertools.batched only exists on Python 3.12+
    from itertools import islice

    def batched(iterable, n):
        """Yield successive tuples of up to ``n`` items taken from ``iterable``.

        Every tuple has exactly ``n`` items except possibly the last one,
        e.g. ``batched('ABCDEFG', 3)`` gives ``ABC``, ``DEF``, ``G``.
        Raises ``ValueError`` when ``n`` is smaller than one.
        """
        if n < 1:
            raise ValueError('n must be at least one')
        source = iter(iterable)
        chunk = tuple(islice(source, n))
        # An empty tuple means the underlying iterator is exhausted.
        while chunk:
            yield chunk
            chunk = tuple(islice(source, n))
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
20 from functools import cmp_to_key
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
21 from time import sleep
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
22 from xml.etree import ElementTree as xml
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
23
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
# NCBI E-utilities endpoints used by the request helpers in this module.
esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"


import logging
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger("bio2srr")

# Extra query parameters merged into E-utilities requests; holds the API key
# when one is configured, otherwise stays empty.
extra_params = {}

# Optional NCBI API key, taken from the environment.
api_key = os.environ.get("NCBI_API_KEY")

if api_key:
    # Show the first and last four characters only when something is left to
    # hide in between; a key of eight characters or fewer would otherwise be
    # printed in full, so it is masked completely instead.
    if len(api_key) > 8:
        _masked_key = f"{api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}"
    else:
        _masked_key = "*" * len(api_key)
    logger.info(f"Using NCBI API key {_masked_key}")
    extra_params["api_key"] = api_key
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
41
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
def log(msg):
    """Log ``msg`` at INFO level on the module logger, redacting the API key.

    When an NCBI API key is configured, every occurrence of it in ``msg`` is
    replaced by a masked form before logging. Keys longer than eight
    characters keep their first and last four characters as a hint; shorter
    keys are replaced entirely by asterisks, because keeping eight characters
    of them would reveal the whole secret.
    """
    if api_key:
        if len(api_key) > 8:
            masked = f"{api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}"
        else:
            masked = "*" * len(api_key)
        msg = msg.replace(api_key, masked)
    logger.info(msg)
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
47
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
def get_tag(root, tag):
    """Return the text of the first element under ``root`` matching ``tag``.

    ``tag`` is handed to ``root.find`` unchanged, so it may be a plain tag
    name or an ElementTree path expression. When nothing matches, a
    "No result" message is logged and ``None`` is returned.
    """
    element = root.find(tag)
    if element is None:
        log(f"No result for {tag}")
        return None
    return element.text
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
53
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
54
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
55
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
# Column names that are always sorted first, in exactly this order.
_PRIORITY_HEADERS = (
    "bioproject",
    "srr_accession",
    "biosample_accession",
    "organism",
    "taxid",
    "package",
)


def header_sort_override(a, b):
    """Compare two header names, ranking the priority headers first.

    Old-style comparison function: returns ``0`` when ``a == b``, ``-1`` when
    ``a`` sorts before ``b`` and ``1`` otherwise. Names listed in
    ``_PRIORITY_HEADERS`` sort before every other name and keep the order of
    that tuple among themselves; all remaining names are ordered with ``<``.
    """
    if a == b:
        return 0
    # Whichever of the two names shows up first in the priority list wins.
    for name in _PRIORITY_HEADERS:
        if a == name:
            return -1
        if b == name:
            return 1
    return -1 if a < b else 1


# Sort key built from header_sort_override, usable as ``sorted(..., key=hso)``.
hso = cmp_to_key(header_sort_override)
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
73
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
def resolve_bioproject_ids_and_links(bioproject_id_list):
    """Follow bioproject and biosample links, yielding biosample records.

    ``bioproject_id_list`` is a list of ``(accession, uid)`` tuples that is
    used as a work queue: linked bioprojects found through elink are appended
    to it while it is being iterated, so the caller's list grows in place and
    the newly found projects are processed by the same loop.

    Yields ``(bioproject_accession, biosample_uid, sampledata)`` tuples, one
    per biosample esummary record; ``sampledata`` is an XML string.

    Raises ``requests.HTTPError`` when an E-utilities request fails.
    """
    pause = 1 if not api_key else 0.1
    # UIDs that are already queued. The queue holds (accession, uid) tuples,
    # so a bare UID can never be found in it with ``in``; tracking UIDs
    # separately keeps a project from being queued twice and stops the walk
    # from looping forever on cyclic links.
    queued_uids = {str(uid) for _, uid in bioproject_id_list}
    for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
        log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
        # get bioproject to bioproject links
        response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
        response.raise_for_status()
        reply = response.json()
        linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}])
        if len(linksets) >= 3:
            for child_uid in linksets[2].get("links", []):  # third index is the up to down links
                if str(child_uid) in queued_uids:
                    continue
                # Send the API key here as well, so this lookup gets the same
                # rate limit as the other requests.
                response = requests.get(esummary, params=dict(id=child_uid, db="bioproject", format="json", **extra_params))
                response.raise_for_status()
                child_accession = response.json()["result"][child_uid]["project_acc"]
                queued_uids.add(str(child_uid))
                bioproject_id_list.append((child_accession, child_uid))  # picked up later by the enclosing loop
                sleep(pause)  # one lookup per child project; pace them like the other requests
        # get bioproject to biosample links
        response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
        response.raise_for_status()
        reply = response.json()
        links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
        log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
        for ids in batched(links, 200):
            response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json", **extra_params))
            response.raise_for_status()
            summaries = response.json()
            for field, value in summaries.get("result", {}).items():
                # Skip the "uids" index entry; every other key is a record.
                if "uids" not in field:
                    yield bioproject, field, value["sampledata"]  # this is XML, deliriously
            sleep(pause)
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
105
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
106
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
# Sample BioSample XML record containing the elements that
# flatten_biosample_xml reads (Ids/Id, Organism, Package, Attributes).
# It is not referenced by the code visible in this part of the file.
biosample_example = """
<BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
<Ids>
<Id db="BioSample" is_primary="1">SAMN17131268</Id>
<Id db_label="Sample name">CJP19-D996</Id>
</Ids>
<Description>
<Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title>
<Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni">
<OrganismName>Campylobacter jejuni</OrganismName>
</Organism>
</Description>
<Owner>
<Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name>
</Owner>
<Models>
<Model>Pathogen.env</Model>
</Models>
<Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package>
<Attributes>
<Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute>
<Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute>
<Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute>
<Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute>
<Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute>
<Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute>
<Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute>
<Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute>
<Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute>
</Attributes>
<Links>
<Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link>
</Links>
<Status status="live" when="2020-12-21T15:08:05.693"/>
</BioSample>

"""
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
144
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
def flatten_biosample_xml(biosampxml):
    """Flatten one BioSample XML document into a flat dict.

    The result always contains the keys ``biosample_accession``,
    ``organism``, ``taxid`` and ``package`` (each ``None`` when the matching
    element or attribute is absent), plus one entry per
    ``Attributes/Attribute`` element, keyed by its ``harmonized_name`` or,
    when that is missing, by its ``attribute_name``.

    Raises ``xml.etree.ElementTree.ParseError`` for malformed XML and
    ``KeyError`` when an attribute element carries neither name.
    """
    root = xml.fromstring(biosampxml)
    accession = get_tag(root, r'.//Id[@db="BioSample"]')
    # The sample name (Id[@db_label="Sample name"]) is currently not exported.
    organism = get_tag(root, r".//OrganismName")
    # A record without an <Organism> element has no taxid; report None
    # instead of failing with AttributeError on the missing element.
    organism_node = root.find(r".//Organism")
    tax_id = None if organism_node is None else organism_node.attrib.get("taxonomy_id")
    package = get_tag(root, r".//Package")
    sampledict = dict(
        biosample_accession=accession,
        organism=organism,
        taxid=tax_id,
        package=package,
    )
    for attribute in root.findall("Attributes/Attribute"):
        # Look up attribute_name only when harmonized_name is absent, so a
        # missing attribute_name cannot fail an otherwise well-named entry.
        name = attribute.attrib.get("harmonized_name")
        if name is None:
            name = attribute.attrib['attribute_name']
        sampledict[name] = attribute.text

    return sampledict
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
163
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
164
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
def yield_sra_runs_from_sample(biosample):
    """Yield ``(key, runs)`` pairs for the SRA records linked to a biosample.

    Asks elink for the SRA links of ``biosample``, then fetches their
    esummary records in batches of 200. For every entry of the summary
    ``result`` whose key does not contain ``"uids"``, the key and the entry's
    ``runs`` value (``None`` when absent) are yielded. The function sleeps
    before each request, for a shorter time when an API key is configured.

    Raises ``requests.HTTPError`` when a request fails.
    """
    pause = 1 if not api_key else 0.1

    sleep(pause)
    link_query = dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params)
    link_response = requests.get(elink, params=link_query)
    link_response.raise_for_status()
    link_reply = link_response.json()
    sra_uids = link_reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])

    for uid_batch in batched(sra_uids, 200):
        sleep(pause)
        summary_query = dict(id=','.join(uid_batch), db="sra", format="json", **extra_params)
        summary_response = requests.get(esummary, params=summary_query)
        summary_response.raise_for_status()
        records = summary_response.json().get("result", {})
        for key, record in records.items():
            if "uids" in key:
                continue  # index entry, not a record
            yield key, record.get("runs")
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
178
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
179
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
# Sample of the <Run> fragment handled by flatten_runs: several sibling
# elements with no single root element. It is not referenced by the code
# visible in this part of the file.
runs_example = """
<Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
<Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>
"""
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
184
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
def flatten_runs(runxml):
    """Yield one dict per public ``<Run>`` element found in ``runxml``.

    ``runxml`` is a fragment of sibling ``<Run .../>`` elements. Each yielded
    dict has the keys ``sra_run_accession``, ``total_spots`` and
    ``total_bases``, copied from the element's ``acc``, ``total_spots`` and
    ``total_bases`` attributes. Runs whose ``is_public`` attribute is
    ``"false"`` are logged as a warning and skipped.

    Raises ``KeyError`` when a run lacks one of the attributes read here.
    """
    # The fragment has no single root element, so wrap it before parsing.
    root = xml.fromstring(f"<data>{runxml}</data>")
    for run in root.findall(".//Run"):
        if run.attrib["is_public"] == "false":
            logger.warning(f"Skipping non-public run {run.attrib['acc']}")
            continue  # actually skip it, as the warning says
        yield dict(
            sra_run_accession=run.attrib["acc"],
            total_spots=run.attrib["total_spots"],
            total_bases=run.attrib["total_bases"],
        )
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
195
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
196
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
197
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
def main(starting_bioproject):
    """Collect BioSample/SRA run metadata for a BioProject and write it to disk.

    Looks up the Entrez UID of ``starting_bioproject`` with ESearch, walks the
    (bioproject, biosample, biosample XML) triples produced by
    ``resolve_bioproject_ids_and_links``, and builds one row per SRA run
    (or one row per sample when the sample yields no runs).

    Two files are written to the current working directory:

    * ``metadata.tsv`` -- every row, tab-delimited, columns ordered by ``hso``.
    * ``accessions.txt`` -- the sorted, de-duplicated ``sra_run_accession``
      values, one per line.

    Args:
        starting_bioproject: BioProject accession used as the ESearch term.

    Raises:
        requests.HTTPError: if the ESearch request returns an error status.
        SystemExit: with status 1 when ESearch returns no UID for the term.
        KeyError: re-raised from ``flatten_biosample_xml`` after the offending
            XML has been logged.
    """
    rows = []
    response = requests.get(
        esearch,
        params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json"),
    )
    response.raise_for_status()
    reply = response.json()

    # Read the reply defensively: an unexpected payload must still end in the
    # "no results" report rather than a KeyError raised while reporting it.
    search_result = reply.get("esearchresult", {})
    uid_list = search_result.get("idlist") or []
    if not uid_list:
        messages = search_result.get("warninglist", {}).get("outputmessages")
        logger.error(f"No results found for '{starting_bioproject}'. Error was \"{messages}\"")
        sys.exit(1)
    bioproject_id = uid_list[0]
    log(f"Found UID {bioproject_id} for '{starting_bioproject}'")

    # Pause before the next request; shorter when an API key is configured
    # (presumably to stay inside the Entrez rate limits -- confirm).
    sleep(1 if not api_key else 0.1)

    for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links(
        [(starting_bioproject, bioproject_id)]
    ):
        try:
            sampledict = flatten_biosample_xml(biosample_xml)
        except KeyError:
            # Dump the XML that could not be flattened before propagating.
            log(biosample_xml)
            raise
        sampledict["bioproject"] = bioproject

        sample_rows = []
        for _sra, runs in yield_sra_runs_from_sample(biosample):
            for run in flatten_runs(runs.strip()):
                # Sample-level fields are copied onto every run row.
                run.update(sampledict)
                sample_rows.append(run)
        # A sample without any run still gets a single row of its own.
        rows.extend(sample_rows or [sampledict])

    log(f"Writing {len(rows)} rows to metadata.tsv")

    # Rows may carry different keys, so the header is the union of all of them.
    header = sorted({key for row in rows for key in row}, key=hso)
    # logger.info(f"Header: {header}")

    rows.sort(key=lambda x: x["biosample_accession"])

    # newline="" is what the csv module requires of files it writes to;
    # explicit UTF-8 keeps non-ASCII metadata from failing on other locales.
    with open("metadata.tsv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
        writer.writeheader()
        writer.writerows(rows)

    # check for duplicate runs and unreleased samples

    accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")]
    raw_length = len(accessions)
    accessions = sorted(set(accessions))

    # Rows lacking a run accession are the samples that produced no runs.
    if raw_length < len(rows):
        logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt")

    if len(accessions) < raw_length:
        logger.warning("Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not")

    log(f"Writing {len(accessions)} unique accessions to accessions.txt")

    with open("accessions.txt", "w", encoding="utf-8") as f:
        f.writelines(accession + "\n" for accession in accessions)
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
262
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
263
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
if __name__ == "__main__":
    # Script entry point: the BioProject accession is the one required
    # positional argument. Report a missing argument as a usage error
    # instead of letting sys.argv[1] raise IndexError.
    if len(sys.argv) < 2:
        logger.error(f"Usage: {sys.argv[0]} <bioproject accession>")
        sys.exit(2)
    b = sys.argv[1].strip()
    log(f"Starting with {b}")
    try:
        main(b)
    except requests.HTTPError as e:
        # HTTP error statuses are logged and become a non-zero exit status
        # rather than a traceback.
        logger.error(e)
        sys.exit(1)
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
272
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
273
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
274
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
275
79fa4330f2c9 planemo upload commit 27af5cba48986d508a67a5024a3bd35dd47bee15-dirty
jpayne
parents:
diff changeset
276