comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 # Copyright (c) 2018 The Pooch Developers.
2 # Distributed under the terms of the BSD 3-Clause License.
3 # SPDX-License-Identifier: BSD-3-Clause
4 #
5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
6 #
7 """
8 The classes that actually handle the downloads.
9 """
10 import os
11 import sys
12 import ftplib
13
14 import warnings
15
16 from .utils import parse_url
17
18 try:
19 from tqdm import tqdm
20 except ImportError:
21 tqdm = None
22
23 try:
24 import paramiko
25 except ImportError:
26 paramiko = None
27
28
29 # Set the default timeout in seconds so it can be configured in a pinch for the
30 # methods that don't or can't expose a way to set it at runtime.
31 # See https://github.com/fatiando/pooch/issues/409
32 DEFAULT_TIMEOUT = 30
33
34
def choose_downloader(url, progressbar=False):
    """
    Pick and instantiate the downloader that matches the protocol of a URL.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`.

    Raises
    ------
    ValueError
        If the protocol of *url* is not one of the supported ones.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    # Dispatch table: URL protocol -> downloader class
    known_downloaders = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }

    protocol = parse_url(url)["protocol"]
    downloader_class = known_downloaders.get(protocol)
    if downloader_class is None:
        raise ValueError(
            f"Unrecognized URL protocol '{protocol}' in '{url}'. "
            f"Must be one of {known_downloaders.keys()}."
        )
    return downloader_class(progressbar=progressbar)
88
89
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    Instances are callables: calling one downloads the given file URL into
    the specified local file. The :mod:`requests` library does the actual
    transfer.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # tqdm is only needed for the built-in bar (progressbar=True); custom
        # progress bar objects are used as they are.
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get` for the download and :func:`requests.head`
        for the availability check.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            # Only the timeout is taken from the stored keyword arguments for
            # the HEAD request.
            head_timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            head = requests.head(url, timeout=head_timeout, allow_redirects=True)
            return bool(head.status_code == 200)

        # Work on a copy so the options stored on the instance stay untouched
        request_options = dict(self.kwargs)
        timeout = request_options.pop("timeout", DEFAULT_TIMEOUT)
        if "stream" not in request_options:
            request_options["stream"] = True

        # Anything without a "write" attribute is treated as a path that we
        # have to open (and later close) ourselves.
        opened_here = not hasattr(output_file, "write")
        if opened_here:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        try:
            response = requests.get(url, timeout=timeout, **request_options)
            response.raise_for_status()
            chunks = response.iter_content(chunk_size=self.chunk_size)
            total = int(response.headers.get("content-length", 0))
            if self.progressbar is True:
                # Need to use ascii characters on Windows because there isn't
                # always full unicode support
                # (see https://github.com/tqdm/tqdm/issues/454)
                progress = tqdm(
                    total=total,
                    ncols=79,
                    ascii=bool(sys.platform == "win32"),
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            elif self.progressbar:
                progress = self.progressbar
                progress.total = total
            for chunk in chunks:
                if not chunk:
                    continue
                output_file.write(chunk)
                output_file.flush()
                if self.progressbar:
                    # Advance by the chunk size rather than len(chunk): the
                    # chunk may be much larger if requests decompressed the
                    # data after reading (happens with text files).
                    progress.update(self.chunk_size)
            # Fill the bar completely even if fewer chunks than expected were
            # received. This happens when streaming text files that the
            # server compresses when sending (gzip). Binary files don't
            # experience this.
            if self.progressbar:
                progress.reset()
                progress.update(total)
                progress.close()
        finally:
            if opened_here:
                output_file.close()
        return None
261
262
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. **Custom progress bars are not yet supported.**
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        # Anything without a "write" attribute is treated as a path that we
        # have to open (and later close) ourselves.
        ispath = not hasattr(output_file, "write")
        try:
            if ispath:
                # pylint: disable=consider-using-with
                output_file = open(output_file, "w+b")
                # pylint: enable=consider-using-with
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                command = f"RETR {parsed_url['path']}"
                if self.progressbar:
                    # Make sure the file is set to binary mode, otherwise we
                    # can't get the file size.
                    # See: https://stackoverflow.com/a/22093848
                    ftp.voidcmd("TYPE I")
                    use_ascii = bool(sys.platform == "win32")
                    progress = tqdm(
                        total=int(ftp.size(parsed_url["path"])),
                        ncols=79,
                        ascii=use_ascii,
                        unit="B",
                        unit_scale=True,
                        leave=True,
                    )
                    with progress:

                        def callback(data):
                            "Update the progress bar and write to output"
                            progress.update(len(data))
                            output_file.write(data)

                        ftp.retrbinary(command, callback, blocksize=self.chunk_size)
                else:
                    ftp.retrbinary(
                        command, output_file.write, blocksize=self.chunk_size
                    )
            finally:
                # Close the file before talking to the server again so that a
                # failing QUIT can't leave the file handle open.
                if ispath:
                    output_file.close()
        finally:
            # quit() only closes the socket when the server answers the QUIT
            # command without an error, so always follow up with close()
            # (which is safe to call on an already closed connection).
            try:
                ftp.quit()
            finally:
                ftp.close()
        return None
393
394
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance but not used by this downloader.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True (or any other truthy value), will print a
        `tqdm <https://github.com/tqdm/tqdm>`__ progress bar of the download
        to standard error (stderr). Requires tqdm to be installed.

    Raises
    ------
    ValueError
        If paramiko is not installed, or if a progress bar is requested and
        tqdm is not installed.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # settimeout is a method of the channel: it has to be called.
            # Assigning to it would replace the method and never apply the
            # timeout.
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Update the progress bar with the bytes transferred"
                        progress.total = int(total)
                        # paramiko reports the cumulative byte count, tqdm
                        # expects increments.
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            # Close the SFTP session before the transport it runs on, and
            # make sure the transport is closed even if that fails.
            try:
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
504
505
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. This allows "URL"s to be specified with the DOI instead of the
    actual HTTP download link. Uses the :mod:`requests` library to manage
    downloads and interact with the APIs.

    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories **will not work** since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        # Everything is stored as given and handed over to HTTPDownloader at
        # call time.
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        Uses the repository's API to determine the actual HTTP download URL
        from the given DOI.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """
        url_parts = parse_url(url)
        # The "netloc" part of the parsed URL is what identifies the archive
        repository = doi_to_repository(url_parts["netloc"])

        # The path is the file name, minus its leading slash (if any)
        path = url_parts["path"]
        file_name = path[1:] if path[0] == "/" else path
        http_url = repository.download_url(file_name)

        # Delegate the actual transfer to the HTTP downloader
        http_downloader = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        http_downloader(http_url, output_file, pooch)
628
629
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of the archive by following the doi.org link.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    Raises
    ------
    ValueError
        If the final response has a 4xx or 5xx HTTP status code.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # doi.org redirects to the repository website; the final URL of the
    # response is the one we are after.
    resolved = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
    url = resolved.url
    if not 400 <= resolved.status_code < 600:
        return url
    raise ValueError(
        f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
    )
656
657
def doi_to_repository(doi):
    """
    Build the data repository object that can handle a given DOI.

    The DOI is resolved to the URL of the archive and each known repository
    class gets a chance to claim it (chain of responsibility). The first
    class that returns an instance wins.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the known repository classes can handle the archive URL.
    """

    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    if doi[-1] == "/":
        doi = doi[:-1]

    # Candidates, in the order in which they are asked
    candidates = [
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    ]

    # Extract the DOI and the repository information
    archive_url = doi_to_url(doi)

    # Ask the candidates one by one and stop at the first that accepts
    for candidate in candidates:
        data_repository = candidate.initialize(
            archive_url=archive_url,
            doi=doi,
        )
        if data_repository is not None:
            return data_repository

    repository = parse_url(archive_url)["netloc"]
    raise ValueError(
        f"Invalid data repository '{repository}'. "
        "To request or contribute support for this repository, "
        "please open an issue at https://github.com/fatiando/pooch/issues"
    )
709
710
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    # Base class of the data repositories used to resolve DOI downloads.
    # It handles no URL at all and leaves the actual work to its subclasses.

    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Create the repository object if the archive URL belongs to it.

        This is one link of a chain of responsibility: a class that cannot
        handle the given repository URL returns `None`, otherwise it returns
        a `DataRepository` instance. This base implementation always returns
        `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Find the download URL of a file in the archive through the
        repository API.

        Not implemented in the base class.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch using the data repository's API.

        Not implemented in the base class.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        raise NotImplementedError  # pragma: no cover
762
763
764 class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring
765 base_api_url = "https://zenodo.org/api/records"
766
767 def __init__(self, doi, archive_url):
768 self.archive_url = archive_url
769 self.doi = doi
770 self._api_response = None
771 self._api_version = None
772
773 @classmethod
774 def initialize(cls, doi, archive_url):
775 """
776 Initialize the data repository if the given URL points to a
777 corresponding repository.
778
779 Initializes a data repository object. This is done as part of
780 a chain of responsibility. If the class cannot handle the given
781 repository URL, it returns `None`. Otherwise a `DataRepository`
782 instance is returned.
783
784 Parameters
785 ----------
786 doi : str
787 The DOI that identifies the repository
788 archive_url : str
789 The resolved URL for the DOI
790 """
791
792 # Check whether this is a Zenodo URL
793 parsed_archive_url = parse_url(archive_url)
794 if parsed_archive_url["netloc"] != "zenodo.org":
795 return None
796
797 return cls(doi, archive_url)
798
799 @property
800 def api_response(self):
801 """Cached API response from Zenodo"""
802 if self._api_response is None:
803 # Lazy import requests to speed up import time
804 import requests # pylint: disable=C0415
805
806 article_id = self.archive_url.split("/")[-1]
807 self._api_response = requests.get(
808 f"{self.base_api_url}/{article_id}",
809 timeout=DEFAULT_TIMEOUT,
810 ).json()
811
812 return self._api_response
813
814 @property
815 def api_version(self):
816 """
817 Version of the Zenodo API we are interacting with
818
819 The versions can either be :
820
821 - ``"legacy"``: corresponds to the Zenodo API that was supported until
822 2023-10-12 (before the migration to InvenioRDM).
823 - ``"new"``: corresponds to the new API that went online on 2023-10-13
824 after the migration to InvenioRDM.
825
826 The ``"new"`` API breaks backward compatibility with the ``"legacy"``
827 one and could probably be replaced by an updated version that restores
828 the behaviour of the ``"legacy"`` one.
829
830 Returns
831 -------
832 str
833 """
834 if self._api_version is None:
835 if all("key" in file for file in self.api_response["files"]):
836 self._api_version = "legacy"
837 elif all("filename" in file for file in self.api_response["files"]):
838 self._api_version = "new"
839 else:
840 raise ValueError(
841 "Couldn't determine the version of the Zenodo API for "
842 f"{self.archive_url} (doi:{self.doi})."
843 )
844 return self._api_version
845
846 def download_url(self, file_name):
847 """
848 Use the repository API to get the download URL for a file given
849 the archive URL.
850
851 Parameters
852 ----------
853 file_name : str
854 The name of the file in the archive that will be downloaded.
855
856 Returns
857 -------
858 download_url : str
859 The HTTP URL that can be used to download the file.
860
861 Notes
862 -----
863 After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
864 link to the desired files that appears in the API response leads to 404
865 errors (by 2023-10-17). The files are available in the following url:
866 ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.
867
868 This method supports both the legacy and the new API.
869 """
870 # Create list of files in the repository
871 if self.api_version == "legacy":
872 files = {item["key"]: item for item in self.api_response["files"]}
873 else:
874 files = [item["filename"] for item in self.api_response["files"]]
875 # Check if file exists in the repository
876 if file_name not in files:
877 raise ValueError(
878 f"File '{file_name}' not found in data archive "
879 f"{self.archive_url} (doi:{self.doi})."
880 )
881 # Build download url
882 if self.api_version == "legacy":
883 download_url = files[file_name]["links"]["self"]
884 else:
885 article_id = self.api_response["id"]
886 download_url = (
887 f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"
888 )
889 return download_url
890
891 def populate_registry(self, pooch):
892 """
893 Populate the registry using the data repository's API
894
895 Parameters
896 ----------
897 pooch : Pooch
898 The pooch instance that the registry will be added to.
899
900 Notes
901 -----
902 After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
903 checksums for each file listed in the API reference is now an md5 sum.
904
905 This method supports both the legacy and the new API.
906 """
907 for filedata in self.api_response["files"]:
908 checksum = filedata["checksum"]
909 if self.api_version == "legacy":
910 key = "key"
911 else:
912 key = "filename"
913 checksum = f"md5:{checksum}"
914 pooch.registry[filedata[key]] = checksum
915
916
917 class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring
918 def __init__(self, doi, archive_url):
919 self.archive_url = archive_url
920 self.doi = doi
921 self._api_response = None
922
923 @classmethod
924 def initialize(cls, doi, archive_url):
925 """
926 Initialize the data repository if the given URL points to a
927 corresponding repository.
928
929 Initializes a data repository object. This is done as part of
930 a chain of responsibility. If the class cannot handle the given
931 repository URL, it returns `None`. Otherwise a `DataRepository`
932 instance is returned.
933
934 Parameters
935 ----------
936 doi : str
937 The DOI that identifies the repository
938 archive_url : str
939 The resolved URL for the DOI
940 """
941
942 # Check whether this is a Figshare URL
943 parsed_archive_url = parse_url(archive_url)
944 if parsed_archive_url["netloc"] != "figshare.com":
945 return None
946
947 return cls(doi, archive_url)
948
949 def _parse_version_from_doi(self):
950 """
951 Parse version from the doi
952
953 Return None if version is not available in the doi.
954 """
955 # Get suffix of the doi
956 _, suffix = self.doi.split("/")
957 # Split the suffix by dots and keep the last part
958 last_part = suffix.split(".")[-1]
959 # Parse the version from the last part
960 if last_part[0] != "v":
961 return None
962 version = int(last_part[1:])
963 return version
964
@property
def api_response(self):
    """
    Cached list of file entries from the Figshare API.

    On first access the article ID is looked up from the DOI, then the
    article (or the specific version named in the DOI) is requested and
    the ``"files"`` entry of that response is stored. Later accesses
    return the stored value without any further requests.

    Returns
    -------
    files
        The ``"files"`` entry of the Figshare article response.

    Raises
    ------
    requests.HTTPError
        If either request to the Figshare API returns an error status.
    ValueError
        If the Figshare API returns no article for the DOI.

    Warns
    -----
    UserWarning
        If the DOI does not specify a version, in which case the latest
        version is requested.
    """
    if self._api_response is None:
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        # Use the figshare API to find the article ID from the DOI
        lookup = requests.get(
            f"https://api.figshare.com/v2/articles?doi={self.doi}",
            timeout=DEFAULT_TIMEOUT,
        )
        # Surface HTTP errors as such instead of failing later while
        # indexing an unexpected JSON payload.
        lookup.raise_for_status()
        articles = lookup.json()
        if not articles:
            raise ValueError(f"No Figshare article found for doi:{self.doi}.")
        article_id = articles[0]["id"]
        # Parse desired version from the doi
        version = self._parse_version_from_doi()
        # With the ID and version, we can get a list of files and their
        # download links
        if version is None:
            # Figshare returns the latest version available when no version
            # is specified through the DOI.
            warnings.warn(
                f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                "the repository should be used. "
                "Figshare will point to the latest version available.",
                UserWarning,
            )
            # Define API url using only the article id
            # (figshare will resolve the latest version)
            api_url = f"https://api.figshare.com/v2/articles/{article_id}"
        else:
            # Define API url using article id and the desired version
            api_url = (
                "https://api.figshare.com/v2/articles/"
                f"{article_id}/versions/{version}"
            )
        # Make the request and keep the files in the figshare repository
        response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
        response.raise_for_status()
        self._api_response = response.json()["files"]

    return self._api_response
1007
def download_url(self, file_name):
    """
    Look up the download URL of a file through the repository API.

    Parameters
    ----------
    file_name : str
        The name of the file in the archive that will be downloaded.

    Returns
    -------
    download_url : str
        The HTTP URL that can be used to download the file.

    Raises
    ------
    ValueError
        If no file entry in the API response is named ``file_name``.
    """
    # Index every file entry by its name (a later duplicate name wins)
    entries = {}
    for entry in self.api_response:
        entries[entry["name"]] = entry

    if file_name in entries:
        return entries[file_name]["download_url"]

    raise ValueError(
        f"File '{file_name}' not found in data archive "
        f"{self.archive_url} (doi:{self.doi})."
    )
1030
def populate_registry(self, pooch):
    """
    Fill a registry with the files listed by the repository's API.

    Each file entry of the API response is stored in ``pooch.registry``
    under its name, with its ``computed_md5`` value prefixed by ``md5:``.

    Parameters
    ----------
    pooch : Pooch
        The pooch instance that the registry will be added to.
    """
    for entry in self.api_response:
        checksum = "md5:{}".format(entry["computed_md5"])
        pooch.registry[entry["name"]] = checksum
1043
1044
class DataverseRepository(DataRepository):
    """
    Data repository served by a DataVerse instance.

    The dataset description is requested from the ``/api/datasets``
    endpoint on the host of the archive URL and kept as a cached
    response object.
    """

    def __init__(self, doi, archive_url):
        self.doi = doi
        self.archive_url = archive_url
        # Response object of the dataset request; filled lazily or by
        # ``initialize`` through the ``api_response`` setter.
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Build a repository object when the archive URL answers like a
        DataVerse instance.

        This is one link in a chain of responsibility: the dataset API
        is queried on the host of ``archive_url`` and the class only
        claims the DOI when that request does not end in a 4xx/5xx
        status.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        repository : instance of ``cls`` or None
            A new instance holding the already fetched response, or
            ``None`` when the request failed.
        """
        # Query the host as if it was a DataVerse instance
        probe = cls._get_api_response(doi, archive_url)

        if not 400 <= probe.status_code < 600:
            # Reuse the response we already have instead of fetching again
            instance = cls(doi, archive_url)
            instance.api_response = probe
            return instance

        # A client or server error means this is probably not DataVerse
        return None

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Request the dataset description from the DataVerse API.

        Kept as a ``classmethod`` so it can be called both before an
        instance exists and from an existing instance.
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        url_parts = parse_url(archive_url)
        endpoint = (
            f"{url_parts['protocol']}://{url_parts['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}"
        )
        return requests.get(endpoint, timeout=DEFAULT_TIMEOUT)

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""
        if self._api_response is not None:
            return self._api_response

        self._api_response = self._get_api_response(
            self.doi, self.archive_url
        )  # pragma: no cover
        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Replace the cached API response"""
        self._api_response = response

    def download_url(self, file_name):
        """
        Look up the download URL of a file through the repository API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the latest version of the dataset has no file named
            ``file_name``.
        """
        url_parts = parse_url(self.archive_url)
        payload = self.api_response.json()

        # Index the file descriptions of the latest version by file name
        data_files = {}
        for entry in payload["data"]["latestVersion"]["files"]:
            data_file = entry["dataFile"]
            data_files[data_file["filename"]] = data_file

        if file_name not in data_files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )

        # The access endpoint addresses files by their id
        file_id = data_files[file_name]["id"]
        return (
            f"{url_parts['protocol']}://{url_parts['netloc']}"
            f"/api/access/datafile/{file_id}"
        )

    def populate_registry(self, pooch):
        """
        Fill a registry with the files listed by the repository's API.

        Each file of the latest dataset version is stored in
        ``pooch.registry`` under its file name, with its ``md5`` value
        prefixed by ``md5:``.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        payload = self.api_response.json()
        for entry in payload["data"]["latestVersion"]["files"]:
            data_file = entry["dataFile"]
            checksum = "md5:{}".format(data_file["md5"])
            pooch.registry[data_file["filename"]] = checksum