Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 # Copyright (c) 2018 The Pooch Developers. | |
2 # Distributed under the terms of the BSD 3-Clause License. | |
3 # SPDX-License-Identifier: BSD-3-Clause | |
4 # | |
5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org) | |
6 # | |
7 """ | |
8 The classes that actually handle the downloads. | |
9 """ | |
10 import os | |
11 import sys | |
12 import ftplib | |
13 | |
14 import warnings | |
15 | |
16 from .utils import parse_url | |
17 | |
18 try: | |
19 from tqdm import tqdm | |
20 except ImportError: | |
21 tqdm = None | |
22 | |
23 try: | |
24 import paramiko | |
25 except ImportError: | |
26 paramiko = None | |
27 | |
28 | |
29 # Set the default timeout in seconds so it can be configured in a pinch for the | |
30 # methods that don't or can't expose a way to set it at runtime. | |
31 # See https://github.com/fatiando/pooch/issues/409 | |
32 DEFAULT_TIMEOUT = 30 | |
33 | |
34 | |
def choose_downloader(url, progressbar=False):
    """
    Pick the downloader that matches the protocol of the given URL.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`,
        created with the given *progressbar*.

    Raises
    ------
    ValueError
        If the protocol of the URL is not one of the known protocols.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    # Protocol name -> downloader class used to fetch URLs of that protocol
    known_downloaders = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }

    parsed_url = parse_url(url)
    downloader_class = known_downloaders.get(parsed_url["protocol"])
    if downloader_class is None:
        raise ValueError(
            f"Unrecognized URL protocol '{parsed_url['protocol']}' in '{url}'. "
            f"Must be one of {known_downloaders.keys()}."
        )
    return downloader_class(progressbar=progressbar)
88 | |
89 | |
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Raises
    ------
    ValueError
        If ``progressbar`` is True but tqdm is not installed.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to fake a login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # Only the built-in (tqdm) progress bar needs tqdm; custom progress
        # bar objects are used as given.
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        kwargs.setdefault("stream", True)
        # Anything without a "write" method is treated as a path that we have
        # to open (and later close) ourselves.
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        response = None
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            response.raise_for_status()
            content = response.iter_content(chunk_size=self.chunk_size)
            total = int(response.headers.get("content-length", 0))
            if self.progressbar is True:
                # Need to use ascii characters on Windows because there isn't
                # always full unicode support
                # (see https://github.com/tqdm/tqdm/issues/454)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=total,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            elif self.progressbar:
                progress = self.progressbar
                progress.total = total
            for chunk in content:
                if chunk:
                    output_file.write(chunk)
                    output_file.flush()
                    if self.progressbar:
                        # Use the chunk size here because chunk may be much
                        # larger if the data are decompressed by requests after
                        # reading (happens with text files).
                        progress.update(self.chunk_size)
            # Make sure the progress bar gets filled even if the actual number
            # of chunks is smaller than expected. This happens when streaming
            # text files that are compressed by the server when sending (gzip).
            # Binary files don't experience this.
            if self.progressbar:
                progress.reset()
                progress.update(total)
                progress.close()
        finally:
            # A streamed response keeps its connection open until the body is
            # fully read or the response is closed, so close it explicitly to
            # avoid leaking the connection when the download fails midway.
            try:
                if response is not None:
                    response.close()
            finally:
                if ispath:
                    output_file.close()
        return None
261 | |
262 | |
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. **Custom progress bars are not yet supported.**
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    Raises
    ------
    ValueError
        If ``progressbar`` is True but tqdm is not installed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        # Anything without a "write" method is treated as a path that we have
        # to open (and later close) ourselves.
        ispath = not hasattr(output_file, "write")
        opened_file = None
        try:
            if ispath:
                # Open inside the try block so that the FTP session is still
                # shut down if the local file cannot be created.
                # pylint: disable=consider-using-with
                opened_file = open(output_file, "w+b")
                # pylint: enable=consider-using-with
                output_file = opened_file
            ftp.login(user=self.username, passwd=self.password, acct=self.account)
            command = f"RETR {parsed_url['path']}"
            if self.progressbar:
                # Make sure the file is set to binary mode, otherwise we can't
                # get the file size. See: https://stackoverflow.com/a/22093848
                ftp.voidcmd("TYPE I")
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=int(ftp.size(parsed_url["path"])),
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(data):
                        "Update the progress bar and write to output"
                        progress.update(len(data))
                        output_file.write(data)

                    ftp.retrbinary(command, callback, blocksize=self.chunk_size)
            else:
                ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size)
        finally:
            # quit() talks to the server and may itself raise (for example if
            # the connection already dropped); nest the cleanup so the local
            # file handle is released no matter what.
            try:
                ftp.quit()
            finally:
                if opened_file is not None:
                    opened_file.close()
        return None
393 | |
394 | |
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance but not used when connecting.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard
        error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
        be installed.

    Raises
    ------
    ValueError
        If paramiko is not installed, or if a progress bar is requested and
        tqdm is not installed.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # settimeout is a method of the channel: it has to be called.
            # Assigning to it would only shadow the method and leave the
            # channel without the requested timeout.
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Update the progress bar with the bytes transferred"
                        progress.total = int(total)
                        # paramiko reports the cumulative number of bytes, so
                        # advance the bar only by what is new.
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            # Close the SFTP client before the transport it runs on, and make
            # sure the transport is closed even if closing the client fails.
            try:
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
504 | |
505 | |
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. This allows "URL"s to be specified with the DOI instead of the
    actual HTTP download link. Uses the :mod:`requests` library to manage
    downloads and interact with the APIs.

    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories **will not work** since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        # Everything is stored untouched and handed to HTTPDownloader on call
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        Uses the repository's API to determine the actual HTTP download URL
        from the given DOI.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """
        parts = parse_url(url)
        repository = doi_to_repository(parts["netloc"])

        # The parsed path carries a leading slash that is not part of the
        # file name known to the repository.
        name_in_archive = parts["path"]
        name_in_archive = (
            name_in_archive[1:] if name_in_archive[0] == "/" else name_in_archive
        )
        resolved_url = repository.download_url(name_in_archive)

        # Delegate the actual transfer to the HTTP downloader
        http_downloader = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        http_downloader(resolved_url, output_file, pooch)
628 | |
629 | |
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of the archive it points to.

    The DOI is requested through https://doi.org and the final URL of the
    response is taken as the archive URL.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    Raises
    ------
    ValueError
        If the response has a 4xx or 5xx status code.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # Use doi.org to resolve the DOI to the repository website.
    resolved = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
    url = resolved.url
    request_failed = 400 <= resolved.status_code < 600
    if not request_failed:
        return url
    raise ValueError(
        f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
    )
656 | |
657 | |
def doi_to_repository(doi):
    """
    Build the data repository object that can handle a given DOI.

    The DOI is resolved to its archive URL and each supported repository
    class is asked in turn (chain of responsibility) whether it can handle
    that URL. The first one that can is returned.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the supported repositories can handle the archive URL.
    """

    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    doi = doi[:-1] if doi[-1] == "/" else doi

    # Candidates, in the order in which they are tried
    repositories = (
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    )

    # Extract the DOI and the repository information
    archive_url = doi_to_url(doi)

    # Ask the candidates one by one and stop at the first that accepts
    for candidate in repositories:
        data_repository = candidate.initialize(
            archive_url=archive_url,
            doi=doi,
        )
        if data_repository is not None:
            return data_repository

    repository = parse_url(archive_url)["netloc"]
    raise ValueError(
        f"Invalid data repository '{repository}'. "
        "To request or contribute support for this repository, "
        "please open an issue at https://github.com/fatiando/pooch/issues"
    )
709 | |
710 | |
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    # Base class of the repository handlers: ``initialize`` declines every
    # URL and the remaining methods must be provided by subclasses.

    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Create the repository object if the URL belongs to this repository.

        This is one link in a chain of responsibility: a class that cannot
        handle the given repository URL returns `None`, otherwise it returns
        a `DataRepository` instance. This base implementation always
        returns `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Get the download URL of a file in the archive from the repository
        API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill the registry of a Pooch using the data repository's API.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """

        raise NotImplementedError  # pragma: no cover
762 | |
763 | |
764 class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring | |
765 base_api_url = "https://zenodo.org/api/records" | |
766 | |
767 def __init__(self, doi, archive_url): | |
768 self.archive_url = archive_url | |
769 self.doi = doi | |
770 self._api_response = None | |
771 self._api_version = None | |
772 | |
773 @classmethod | |
774 def initialize(cls, doi, archive_url): | |
775 """ | |
776 Initialize the data repository if the given URL points to a | |
777 corresponding repository. | |
778 | |
779 Initializes a data repository object. This is done as part of | |
780 a chain of responsibility. If the class cannot handle the given | |
781 repository URL, it returns `None`. Otherwise a `DataRepository` | |
782 instance is returned. | |
783 | |
784 Parameters | |
785 ---------- | |
786 doi : str | |
787 The DOI that identifies the repository | |
788 archive_url : str | |
789 The resolved URL for the DOI | |
790 """ | |
791 | |
792 # Check whether this is a Zenodo URL | |
793 parsed_archive_url = parse_url(archive_url) | |
794 if parsed_archive_url["netloc"] != "zenodo.org": | |
795 return None | |
796 | |
797 return cls(doi, archive_url) | |
798 | |
799 @property | |
800 def api_response(self): | |
801 """Cached API response from Zenodo""" | |
802 if self._api_response is None: | |
803 # Lazy import requests to speed up import time | |
804 import requests # pylint: disable=C0415 | |
805 | |
806 article_id = self.archive_url.split("/")[-1] | |
807 self._api_response = requests.get( | |
808 f"{self.base_api_url}/{article_id}", | |
809 timeout=DEFAULT_TIMEOUT, | |
810 ).json() | |
811 | |
812 return self._api_response | |
813 | |
814 @property | |
815 def api_version(self): | |
816 """ | |
817 Version of the Zenodo API we are interacting with | |
818 | |
819 The versions can either be : | |
820 | |
821 - ``"legacy"``: corresponds to the Zenodo API that was supported until | |
822 2023-10-12 (before the migration to InvenioRDM). | |
823 - ``"new"``: corresponds to the new API that went online on 2023-10-13 | |
824 after the migration to InvenioRDM. | |
825 | |
826 The ``"new"`` API breaks backward compatibility with the ``"legacy"`` | |
827 one and could probably be replaced by an updated version that restores | |
828 the behaviour of the ``"legacy"`` one. | |
829 | |
830 Returns | |
831 ------- | |
832 str | |
833 """ | |
834 if self._api_version is None: | |
835 if all("key" in file for file in self.api_response["files"]): | |
836 self._api_version = "legacy" | |
837 elif all("filename" in file for file in self.api_response["files"]): | |
838 self._api_version = "new" | |
839 else: | |
840 raise ValueError( | |
841 "Couldn't determine the version of the Zenodo API for " | |
842 f"{self.archive_url} (doi:{self.doi})." | |
843 ) | |
844 return self._api_version | |
845 | |
def download_url(self, file_name):
    """
    Use the repository API to get the download URL for a file given
    the archive URL.

    Parameters
    ----------
    file_name : str
        The name of the file in the archive that will be downloaded.

    Returns
    -------
    download_url : str
        The HTTP URL that can be used to download the file.

    Raises
    ------
    ValueError
        If ``file_name`` is not among the files listed in the API response.

    Notes
    -----
    After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
    link to the desired files that appears in the API response leads to 404
    errors (by 2023-10-17). The files are available in the following url:
    ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

    The file name is percent-encoded before being inserted in that URL, so
    names containing spaces, ``#`` or ``?`` are not misread as URL
    delimiters. Forward slashes are left untouched.

    This method supports both the legacy and the new API.
    """
    # Imported locally, like the other lazy imports in this module
    from urllib.parse import quote  # pylint: disable=C0415

    is_legacy = self.api_version == "legacy"
    # The legacy API stores the file name under "key", the new one under
    # "filename". Index the entries by name for both lookup and retrieval.
    name_field = "key" if is_legacy else "filename"
    files = {item[name_field]: item for item in self.api_response["files"]}
    # Check if file exists in the repository
    if file_name not in files:
        raise ValueError(
            f"File '{file_name}' not found in data archive "
            f"{self.archive_url} (doi:{self.doi})."
        )
    # Build download url
    if is_legacy:
        # The legacy API response carries a working direct link
        return files[file_name]["links"]["self"]
    article_id = self.api_response["id"]
    return (
        f"https://zenodo.org/records/{article_id}/files/"
        f"{quote(file_name)}?download=1"
    )
890 | |
def populate_registry(self, pooch):
    """
    Fill the registry of a Pooch with the files listed by the repository API.

    Parameters
    ----------
    pooch : Pooch
        The pooch instance that the registry will be added to.

    Notes
    -----
    After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
    checksums for each file listed in the API reference is now an md5 sum.

    Entries coming from the legacy API are stored under their ``"key"``
    with the checksum exactly as reported. Entries coming from the new API
    are stored under their ``"filename"`` with the checksum prefixed by
    ``md5:``.
    """
    for entry in self.api_response["files"]:
        digest = entry["checksum"]
        if self.api_version == "legacy":
            name = entry["key"]
        else:
            name = entry["filename"]
            # The new API reports a bare md5 digest: add the algorithm tag
            digest = f"md5:{digest}"
        pooch.registry[name] = digest
915 | |
916 | |
class FigshareRepository(DataRepository):
    """
    Data repository handler for DOIs that resolve to ``figshare.com``.

    The list of files is fetched lazily from the Figshare API
    (``api.figshare.com``) the first time :attr:`api_response` is read and
    is cached on the instance afterwards.

    Parameters
    ----------
    doi : str
        The DOI that identifies the repository.
    archive_url : str
        The resolved URL for the DOI.
    """

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Filled on first access of the ``api_response`` property
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        FigshareRepository or None
            A new instance when the network location of ``archive_url`` is
            ``figshare.com``, ``None`` otherwise.
        """
        # Check whether this is a Figshare URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        The version is read from the last dot-separated component of the
        DOI suffix when that component has the form ``v<digits>``.

        Returns
        -------
        int or None
            The version number, or None if version is not available in the
            doi (including when the last component is empty or is not
            ``v`` followed only by digits).
        """
        # Everything after the first "/" is the DOI suffix. ``partition``
        # tolerates suffixes that contain further slashes, and DOIs with no
        # slash at all, without raising.
        suffix = self.doi.partition("/")[2]
        # Only the last dot-separated component can carry the version tag
        last_part = suffix.rpartition(".")[2]
        digits = last_part[1:]
        if not last_part.startswith("v") or not digits.isdecimal():
            return None
        return int(digits)

    @property
    def api_response(self):
        """
        Cached API response from Figshare

        Two requests are made on first access: one to find the article ID
        from the DOI, and one to list the files of that article (for the
        version given in the DOI, if any).

        Returns
        -------
        list
            The ``"files"`` entry of the article JSON returned by Figshare.

        Raises
        ------
        requests.HTTPError
            If either request returns an error status.
        ValueError
            If the DOI search returns no article.

        Warns
        -----
        UserWarning
            If the DOI doesn't specify a version.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            search = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Fail with the HTTP error itself instead of an obscure lookup
            # error on the body (same handling as the second request below)
            search.raise_for_status()
            articles = search.json()
            if not articles:
                raise ValueError(
                    f"No Figshare article found for doi:{self.doi} "
                    f"({self.archive_url})."
                )
            article_id = articles[0]["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and keep the files in the figshare repository
            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not among the files listed by the API.
        """
        # Index the file entries by name
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
            )
        return files[file_name]["download_url"]

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Every file listed by the API is registered under its name with
        ``md5:`` followed by its ``computed_md5`` value.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
1043 | |
1044 | |
class DataverseRepository(DataRepository):
    """
    Data repository handler for DOIs hosted on a DataVerse instance.

    Whether a URL belongs to a DataVerse instance is decided by querying
    the ``/api/datasets/:persistentId`` endpoint on the host of the archive
    URL (see :meth:`initialize`). The response to that request is cached on
    the instance and reused to list files.

    Parameters
    ----------
    doi : str
        The DOI that identifies the repository.
    archive_url : str
        The resolved URL for the DOI.
    """

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Filled by ``initialize`` or on first access of ``api_response``
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        DataverseRepository or None
            ``None`` when the API request returns a 4xx or 5xx status,
            otherwise a new instance that already holds the response.
        """
        # Access the DOI as if this was a DataVerse instance
        response = cls._get_api_response(doi, archive_url)

        # If we failed, this is probably not a DataVerse instance
        if 400 <= response.status_code < 600:
            return None

        # Initialize the repository and overwrite the api response so the
        # request is not repeated later
        repository = cls(doi, archive_url)
        repository.api_response = response
        return repository

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Perform the actual API request

        This has been separated into a separate ``classmethod``, as it can be
        used prior and after the initialization.

        Returns
        -------
        requests.Response
            The raw response. Its status is not checked here.
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        parsed = parse_url(archive_url)
        response = requests.get(
            f"{parsed['protocol']}://{parsed['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}",
            timeout=DEFAULT_TIMEOUT,
        )
        return response

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""

        if self._api_response is None:
            self._api_response = self._get_api_response(
                self.doi, self.archive_url
            )  # pragma: no cover

        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Update the cached API response"""

        self._api_response = response

    def _latest_files(self):
        """Return the file entries of the latest version in the API response."""
        return self.api_response.json()["data"]["latestVersion"]["files"]

    @staticmethod
    def _registry_hash(data_file):
        """
        Build the ``<algorithm>:<digest>`` registry entry for a file.

        The ``md5`` field is used when present. Otherwise the generic
        ``checksum`` field (``{"type": ..., "value": ...}``) is used, with
        its type lower-cased and stripped of dashes (e.g. ``SHA-1`` becomes
        ``sha1``).

        NOTE(review): the ``checksum`` fallback relies on the DataVerse API
        reporting non-MD5 digests only under ``checksum``; confirm against
        the DataVerse native API documentation.
        """
        md5 = data_file.get("md5")
        if md5 is not None:
            return f"md5:{md5}"
        checksum = data_file["checksum"]
        algorithm = checksum["type"].lower().replace("-", "")
        return f"{algorithm}:{checksum['value']}"

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not among the files of the latest version.
        """
        parsed = parse_url(self.archive_url)
        # Index the "dataFile" records by file name
        files = {
            entry["dataFile"]["filename"]: entry["dataFile"]
            for entry in self._latest_files()
        }
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Generate download_url using the file id
        download_url = (
            f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/"
            f"{files[file_name]['id']}"
        )
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Every file of the latest version is registered under its file name
        with its checksum, tagged with the algorithm that produced it.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        for filedata in self._latest_files():
            data_file = filedata["dataFile"]
            pooch.registry[data_file["filename"]] = self._registry_hash(data_file)