jpayne@69
|
1 # Copyright (c) 2018 The Pooch Developers.
|
jpayne@69
|
2 # Distributed under the terms of the BSD 3-Clause License.
|
jpayne@69
|
3 # SPDX-License-Identifier: BSD-3-Clause
|
jpayne@69
|
4 #
|
jpayne@69
|
5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
jpayne@69
|
6 #
|
jpayne@69
|
7 """
|
jpayne@69
|
8 The classes that actually handle the downloads.
|
jpayne@69
|
9 """
|
jpayne@69
|
10 import os
|
jpayne@69
|
11 import sys
|
jpayne@69
|
12 import ftplib
|
jpayne@69
|
13
|
jpayne@69
|
14 import warnings
|
jpayne@69
|
15
|
jpayne@69
|
16 from .utils import parse_url
|
jpayne@69
|
17
|
jpayne@69
|
18 try:
|
jpayne@69
|
19 from tqdm import tqdm
|
jpayne@69
|
20 except ImportError:
|
jpayne@69
|
21 tqdm = None
|
jpayne@69
|
22
|
jpayne@69
|
23 try:
|
jpayne@69
|
24 import paramiko
|
jpayne@69
|
25 except ImportError:
|
jpayne@69
|
26 paramiko = None
|
jpayne@69
|
27
|
jpayne@69
|
28
|
jpayne@69
|
29 # Set the default timeout in seconds so it can be configured in a pinch for the
|
jpayne@69
|
30 # methods that don't or can't expose a way to set it at runtime.
|
jpayne@69
|
31 # See https://github.com/fatiando/pooch/issues/409
|
jpayne@69
|
32 DEFAULT_TIMEOUT = 30
|
jpayne@69
|
33
|
jpayne@69
|
34
|
jpayne@69
|
def choose_downloader(url, progressbar=False):
    """
    Choose the appropriate downloader for the given URL based on the protocol.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`.

    Raises
    ------
    ValueError
        If the protocol of the URL is not one of the known protocols.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    known_downloaders = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }

    protocol = parse_url(url)["protocol"]
    if protocol not in known_downloaders:
        # List the protocol names explicitly so the message does not leak the
        # "dict_keys([...])" repr of the dictionary view.
        raise ValueError(
            f"Unrecognized URL protocol '{protocol}' in '{url}'. "
            f"Must be one of {list(known_downloaders)}."
        )
    # Every downloader class accepts "progressbar" as a keyword argument.
    downloader = known_downloaders[protocol](progressbar=progressbar)
    return downloader
|
jpayne@69
|
88
|
jpayne@69
|
89
|
jpayne@69
|
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # Only the built-in bar (progressbar=True) needs tqdm; custom progress
        # bar objects are used as given.
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            # NOTE(review): only the timeout is forwarded to the HEAD request;
            # the other keyword arguments (e.g. auth) are not used here.
            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        kwargs.setdefault("stream", True)
        # Anything without a "write" method is treated as a path to open.
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        response = None
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            response.raise_for_status()
            content = response.iter_content(chunk_size=self.chunk_size)
            total = int(response.headers.get("content-length", 0))
            if self.progressbar is True:
                # Need to use ascii characters on Windows because there isn't
                # always full unicode support
                # (see https://github.com/tqdm/tqdm/issues/454)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=total,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            elif self.progressbar:
                progress = self.progressbar
                progress.total = total
            for chunk in content:
                if chunk:
                    output_file.write(chunk)
                    output_file.flush()
                    if self.progressbar:
                        # Use the chunk size here because chunk may be much
                        # larger if the data are decompressed by requests after
                        # reading (happens with text files).
                        progress.update(self.chunk_size)
            # Make sure the progress bar gets filled even if the actual number
            # of chunks is smaller than expected. This happens when streaming
            # text files that are compressed by the server when sending (gzip).
            # Binary files don't experience this.
            if self.progressbar:
                progress.reset()
                progress.update(total)
                progress.close()
        finally:
            # A streamed response holds on to its connection until it is fully
            # read or closed, so close it explicitly (matters when an error
            # interrupts the download). The nested "finally" guarantees that
            # the output file is closed even if closing the response fails.
            try:
                if response is not None:
                    response.close()
            finally:
                if ispath:
                    output_file.close()
        return None
|
jpayne@69
|
261
|
jpayne@69
|
262
|
jpayne@69
|
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. **Custom progress bars are not yet supported.**
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            # The file is considered available if its name shows up in the
            # listing of its parent directory.
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        # Anything without a "write" method is treated as a path to open.
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        try:
            ftp.login(user=self.username, passwd=self.password, acct=self.account)
            command = f"RETR {parsed_url['path']}"
            if self.progressbar:
                # Make sure the file is set to binary mode, otherwise we can't
                # get the file size. See: https://stackoverflow.com/a/22093848
                ftp.voidcmd("TYPE I")
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=int(ftp.size(parsed_url["path"])),
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(data):
                        "Update the progress bar and write to output"
                        progress.update(len(data))
                        output_file.write(data)

                    ftp.retrbinary(command, callback, blocksize=self.chunk_size)
            else:
                ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size)
        finally:
            # quit() talks to the server and can raise (for example, when the
            # connection was already lost). The nested "finally" guarantees
            # that a file opened here is still closed in that case.
            try:
                ftp.quit()
            finally:
                if ispath:
                    output_file.close()
        return None
|
jpayne@69
|
393
|
jpayne@69
|
394
|
jpayne@69
|
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance but not used when connecting.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard
        error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
        be installed. Any other truthy value also results in a ``tqdm``
        progress bar.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # settimeout is a method of the channel: it has to be called.
            # Assigning to it would replace the method and leave the timeout
            # unset.
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Update the progress bar and write to output"
                        progress.total = int(total)
                        # "current" is the running total, so advance the bar
                        # only by what was transferred since the last call.
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            # Close the SFTP session before the transport that carries it and
            # make sure the transport is closed even if that fails.
            try:
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
|
jpayne@69
|
504
|
jpayne@69
|
505
|
jpayne@69
|
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. Allowing "URL"s to be specified with the DOI instead of the actual
    HTTP download link. Uses the :mod:`requests` library to manage downloads
    and interact with the APIs.

    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories **will not work** since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        Uses the repository's API to determine the actual HTTP download URL
        from the given DOI.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """

        parsed_url = parse_url(url)
        data_repository = doi_to_repository(parsed_url["netloc"])

        # Resolve the URL
        file_name = parsed_url["path"]
        # Remove the leading slash in the path. startswith (instead of
        # indexing) keeps an empty path from raising an IndexError here.
        if file_name.startswith("/"):
            file_name = file_name[1:]
        download_url = data_repository.download_url(file_name)

        # The actual transfer is a regular HTTP download of the resolved URL,
        # configured with the same options given to this downloader.
        downloader = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        downloader(download_url, output_file, pooch)
|
jpayne@69
|
628
|
jpayne@69
|
629
|
jpayne@69
|
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of its archive by following the doi.org link.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    Raises
    ------
    ValueError
        If the resolved page responds with a 4xx or 5xx status code.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # doi.org redirects to the repository website; the final address of the
    # response is the archive URL.
    resolver_link = f"https://doi.org/{doi}"
    response = requests.get(resolver_link, timeout=DEFAULT_TIMEOUT)
    url = response.url
    status = response.status_code
    if status < 400 or status >= 600:
        return url
    raise ValueError(
        f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
    )
|
jpayne@69
|
656
|
jpayne@69
|
657
|
jpayne@69
|
def doi_to_repository(doi):
    """
    Build the data repository object that can handle a given DOI.

    The DOI is resolved to the URL of its archive and each known repository
    class is asked in turn (chain of responsibility) to initialize itself
    from it. The first one that accepts is returned.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the known repository classes can handle the archive.
    """

    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    if doi[-1] == "/":
        doi = doi[:-1]

    # Candidates are tried in this order.
    candidates = (
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    )

    # Find out where the DOI points to.
    archive_url = doi_to_url(doi)

    # A candidate returns None when it cannot handle the archive, so stop at
    # the first one that returns an actual repository object.
    for candidate in candidates:
        data_repository = candidate.initialize(
            archive_url=archive_url,
            doi=doi,
        )
        if data_repository is not None:
            return data_repository

    repository = parse_url(archive_url)["netloc"]
    raise ValueError(
        f"Invalid data repository '{repository}'. "
        "To request or contribute support for this repository, "
        "please open an issue at https://github.com/fatiando/pooch/issues"
    )
|
jpayne@69
|
709
|
jpayne@69
|
710
|
jpayne@69
|
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Build a repository object if the URL belongs to this repository.

        This is one link of a chain of responsibility: a class that cannot
        handle the given repository URL returns `None`, otherwise it
        returns a `DataRepository` instance. This base implementation
        always returns `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Get the download URL for a file in the archive through the
        repository API.

        This base implementation is a placeholder and always raises.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill in the registry of a pooch using the data repository's API.

        This base implementation is a placeholder and always raises.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """

        raise NotImplementedError  # pragma: no cover
|
jpayne@69
|
762
|
jpayne@69
|
763
|
jpayne@69
|
class ZenodoRepository(DataRepository):  # pylint: disable=missing-class-docstring
    base_api_url = "https://zenodo.org/api/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Both values are fetched/derived lazily and cached on first access
        self._api_response = None
        self._api_version = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        # Check whether this is a Zenodo URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "zenodo.org":
            return None

        return cls(doi, archive_url)

    @property
    def api_response(self):
        """
        Cached API response from Zenodo

        The record is requested on first access only. An HTTP error status
        raises the ``requests`` HTTP error right away instead of letting
        the error payload be mistaken for a record later on.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # The record id is the last path segment of the archive URL.
            # Strip trailing slashes first so it is never empty because of
            # them.
            article_id = self.archive_url.rstrip("/").split("/")[-1]
            response = requests.get(
                f"{self.base_api_url}/{article_id}",
                timeout=DEFAULT_TIMEOUT,
            )
            response.raise_for_status()
            self._api_response = response.json()

        return self._api_response

    @property
    def api_version(self):
        """
        Version of the Zenodo API we are interacting with

        The versions can either be :

        - ``"legacy"``: corresponds to the Zenodo API that was supported until
          2023-10-12 (before the migration to InvenioRDM).
        - ``"new"``: corresponds to the new API that went online on 2023-10-13
          after the migration to InvenioRDM.

        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
        one and could probably be replaced by an updated version that restores
        the behaviour of the ``"legacy"`` one.

        The version is guessed from the keys of the file entries in the API
        response: ``"key"`` in every entry means legacy, ``"filename"`` in
        every entry means new.

        Returns
        -------
        str

        Raises
        ------
        ValueError
            If the file entries match neither layout.
        """
        if self._api_version is None:
            files = self.api_response["files"]
            if all("key" in file for file in files):
                self._api_version = "legacy"
            elif all("filename" in file for file in files):
                self._api_version = "new"
            else:
                raise ValueError(
                    "Couldn't determine the version of the Zenodo API for "
                    f"{self.archive_url} (doi:{self.doi})."
                )
        return self._api_version

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the file is not listed in the API response.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        link to the desired files that appears in the API response leads to 404
        errors (by 2023-10-17). The files are available in the following url:
        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

        This method supports both the legacy and the new API.
        """
        legacy = self.api_version == "legacy"
        # Create list of files in the repository
        if legacy:
            files = {item["key"]: item for item in self.api_response["files"]}
        else:
            files = [item["filename"] for item in self.api_response["files"]]
        # Check if file exists in the repository
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Build download url
        if legacy:
            return files[file_name]["links"]["self"]
        article_id = self.api_response["id"]
        return f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        checksums for each file listed in the API reference is now an md5 sum.

        This method supports both the legacy and the new API.
        """
        for filedata in self.api_response["files"]:
            checksum = filedata["checksum"]
            if self.api_version == "legacy":
                key = "key"
            else:
                key = "filename"
                # The new API gives the bare hash, so add the algorithm
                # prefix here
                checksum = f"md5:{checksum}"
            pooch.registry[filedata[key]] = checksum
|
jpayne@69
|
915
|
jpayne@69
|
916
|
jpayne@69
|
class FigshareRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Fetched lazily and cached on first access of ``api_response``
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        # Check whether this is a Figshare URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        The version is read from the last dot-separated part of the DOI
        suffix (everything after the first ``/``) when it has the form
        ``v<integer>``.

        Return None if version is not available in the doi.
        """
        # Get suffix of the doi. Only split on the first slash so that
        # suffixes which contain slashes themselves don't raise.
        _, _, suffix = self.doi.partition("/")
        # Split the suffix by dots and keep the last part
        last_part = suffix.split(".")[-1]
        # Parse the version from the last part
        if not last_part.startswith("v"):
            return None
        try:
            return int(last_part[1:])
        except ValueError:
            # Starts with "v" but isn't a version number
            return None

    @property
    def api_response(self):
        """
        Cached API response from Figshare

        The list of files of the article is requested on first access
        only.

        Raises
        ------
        ValueError
            If the Figshare API returns no article for the DOI.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            search = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Fail on HTTP errors here, like for the request below, instead
            # of trying to index into an error payload.
            search.raise_for_status()
            articles = search.json()
            if not articles:
                raise ValueError(
                    f"No Figshare article found for doi:{self.doi} "
                    f"(see {self.archive_url}). Is the DOI correct?"
                )
            article_id = articles[0]["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and return the files in the figshare repository
            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the file is not listed in the API response.
        """
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
            )
        return files[file_name]["download_url"]

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Each file is registered under its name with the md5 checksum
        reported by the API.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
|
jpayne@69
|
1043
|
jpayne@69
|
1044
|
jpayne@69
|
class DataverseRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Build a repository object if the URL belongs to a DataVerse
        instance.

        This is one link of a chain of responsibility: `None` is returned
        when this class cannot handle the given repository URL, otherwise
        a `DataRepository` instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """
        # Query the host as if it was a DataVerse instance
        probe = cls._get_api_response(doi, archive_url)

        # An error status most likely means this isn't a DataVerse instance
        is_error = 400 <= probe.status_code < 600
        if not is_error:
            # Keep the response we already have as the cached API response
            instance = cls(doi, archive_url)
            instance.api_response = probe
            return instance
        return None

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Send the request to the datasets API of the host

        Kept as a ``classmethod`` of its own because it is needed both
        before and after an instance exists.
        """
        # Import requests lazily so that importing this module stays fast
        import requests  # pylint: disable=C0415

        url_parts = parse_url(archive_url)
        endpoint = (
            f"{url_parts['protocol']}://{url_parts['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}"
        )
        return requests.get(endpoint, timeout=DEFAULT_TIMEOUT)

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""

        if self._api_response is not None:
            return self._api_response
        self._api_response = self._get_api_response(
            self.doi, self.archive_url
        )  # pragma: no cover
        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Replace the cached API response"""

        self._api_response = response

    def download_url(self, file_name):
        """
        Get the download URL for a file in the archive through the
        repository API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the file is not listed in the API response.
        """
        url_parts = parse_url(self.archive_url)
        payload = self.api_response.json()
        # Index the file records of the latest version by file name
        data_files = {}
        for entry in payload["data"]["latestVersion"]["files"]:
            data_files[entry["dataFile"]["filename"]] = entry["dataFile"]
        if file_name not in data_files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # The download URL is built from the id of the file
        host = f"{url_parts['protocol']}://{url_parts['netloc']}"
        return f"{host}/api/access/datafile/{data_files[file_name]['id']}"

    def populate_registry(self, pooch):
        """
        Fill in the registry of a pooch using the data repository's API.

        Each file of the latest version is registered under its file name
        with the md5 checksum reported by the API.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        listing = self.api_response.json()["data"]["latestVersion"]["files"]
        for entry in listing:
            md5_entry = f"md5:{entry['dataFile']['md5']}"
            pooch.registry[entry["dataFile"]["filename"]] = md5_entry
|