Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 16:23:26 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py Tue Mar 18 16:23:26 2025 -0400 @@ -0,0 +1,1163 @@ +# Copyright (c) 2018 The Pooch Developers. +# Distributed under the terms of the BSD 3-Clause License. +# SPDX-License-Identifier: BSD-3-Clause +# +# This code is part of the Fatiando a Terra project (https://www.fatiando.org) +# +""" +The classes that actually handle the downloads. +""" +import os +import sys +import ftplib + +import warnings + +from .utils import parse_url + +try: + from tqdm import tqdm +except ImportError: + tqdm = None + +try: + import paramiko +except ImportError: + paramiko = None + + +# Set the default timeout in seconds so it can be configured in a pinch for the +# methods that don't or can't expose a way set it at runtime. +# See https://github.com/fatiando/pooch/issues/409 +DEFAULT_TIMEOUT = 30 + + +def choose_downloader(url, progressbar=False): + """ + Choose the appropriate downloader for the given URL based on the protocol. + + Parameters + ---------- + url : str + A URL (including protocol). + progressbar : bool or an arbitrary progress bar object + If True, will print a progress bar of the download to standard error + (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be + installed. Alternatively, an arbitrary progress bar object can be + passed. See :ref:`custom-progressbar` for details. + + Returns + ------- + downloader + A downloader class, like :class:`pooch.HTTPDownloader`, + :class:`pooch.FTPDownloader`, or :class: `pooch.SFTPDownloader`. + + Examples + -------- + + >>> downloader = choose_downloader("http://something.com") + >>> print(downloader.__class__.__name__) + HTTPDownloader + >>> downloader = choose_downloader("https://something.com") + >>> print(downloader.__class__.__name__) + HTTPDownloader + >>> downloader = choose_downloader("ftp://something.com") + >>> print(downloader.__class__.__name__) + FTPDownloader + >>> downloader = choose_downloader("doi:DOI/filename.csv") + >>> print(downloader.__class__.__name__) + DOIDownloader + + """ + known_downloaders = { + "ftp": FTPDownloader, + "https": HTTPDownloader, + "http": HTTPDownloader, + "sftp": SFTPDownloader, + "doi": DOIDownloader, + } + + parsed_url = parse_url(url) + if parsed_url["protocol"] not in known_downloaders: + raise ValueError( + f"Unrecognized URL protocol '{parsed_url['protocol']}' in '{url}'. " + f"Must be one of {known_downloaders.keys()}." + ) + downloader = known_downloaders[parsed_url["protocol"]](progressbar=progressbar) + return downloader + + +class HTTPDownloader: # pylint: disable=too-few-public-methods + """ + Download manager for fetching files over HTTP/HTTPS. + + When called, downloads the given file URL into the specified local file. + Uses the :mod:`requests` library to manage downloads. + + Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize + the download of files (for example, to use authentication or print a + progress bar). + + Parameters + ---------- + progressbar : bool or an arbitrary progress bar object + If True, will print a progress bar of the download to standard error + (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be + installed. Alternatively, an arbitrary progress bar object can be + passed. See :ref:`custom-progressbar` for details. + chunk_size : int + Files are streamed *chunk_size* bytes at a time instead of loading + everything into memory at one. Usually doesn't need to be changed. + **kwargs + All keyword arguments given when creating an instance of this class + will be passed to :func:`requests.get`. + + Examples + -------- + + Download one of the data files from the Pooch repository: + + >>> import os + >>> from pooch import __version__, check_version + >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt" + >>> url = url.format(check_version(__version__, fallback="main")) + >>> downloader = HTTPDownloader() + >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch + >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) + >>> os.path.exists("tiny-data.txt") + True + >>> with open("tiny-data.txt") as f: + ... print(f.read().strip()) + # A tiny data file for test purposes only + 1 2 3 4 5 6 + >>> os.remove("tiny-data.txt") + + Authentication can be handled by passing a user name and password to + :func:`requests.get`. All arguments provided when creating an instance of + the class are forwarded to :func:`requests.get`. We'll use + ``auth=(username, password)`` to use basic HTTPS authentication. The + https://httpbin.org website allows us to make a fake a login request using + whatever username and password we provide to it: + + >>> user = "doggo" + >>> password = "goodboy" + >>> # httpbin will ask for the user and password we provide in the URL + >>> url = f"https://httpbin.org/basic-auth/{user}/{password}" + >>> # Trying without the login credentials causes an error + >>> downloader = HTTPDownloader() + >>> try: + ... downloader(url=url, output_file="tiny-data.txt", pooch=None) + ... except Exception: + ... print("There was an error!") + There was an error! + >>> # Pass in the credentials to HTTPDownloader + >>> downloader = HTTPDownloader(auth=(user, password)) + >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) + >>> with open("tiny-data.txt") as f: + ... for line in f: + ... print(line.rstrip()) + { + "authenticated": true, + "user": "doggo" + } + >>> os.remove("tiny-data.txt") + + """ + + def __init__(self, progressbar=False, chunk_size=1024, **kwargs): + self.kwargs = kwargs + self.progressbar = progressbar + self.chunk_size = chunk_size + if self.progressbar is True and tqdm is None: + raise ValueError("Missing package 'tqdm' required for progress bars.") + + def __call__( + self, url, output_file, pooch, check_only=False + ): # pylint: disable=R0914 + """ + Download the given URL over HTTP to the given output file. + + Uses :func:`requests.get`. + + Parameters + ---------- + url : str + The URL to the file you want to download. + output_file : str or file-like object + Path (and file name) to which the file will be downloaded. + pooch : :class:`~pooch.Pooch` + The instance of :class:`~pooch.Pooch` that is calling this method. + check_only : bool + If True, will only check if a file exists on the server and + **without downloading the file**. Will return ``True`` if the file + exists and ``False`` otherwise. + + Returns + ------- + availability : bool or None + If ``check_only==True``, returns a boolean indicating if the file + is available on the server. Otherwise, returns ``None``. + + """ + # Lazy import requests to speed up import time + import requests # pylint: disable=C0415 + + if check_only: + timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT) + response = requests.head(url, timeout=timeout, allow_redirects=True) + available = bool(response.status_code == 200) + return available + + kwargs = self.kwargs.copy() + timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT) + kwargs.setdefault("stream", True) + ispath = not hasattr(output_file, "write") + if ispath: + # pylint: disable=consider-using-with + output_file = open(output_file, "w+b") + # pylint: enable=consider-using-with + try: + response = requests.get(url, timeout=timeout, **kwargs) + response.raise_for_status() + content = response.iter_content(chunk_size=self.chunk_size) + total = int(response.headers.get("content-length", 0)) + if self.progressbar is True: + # Need to use ascii characters on Windows because there isn't + # always full unicode support + # (see https://github.com/tqdm/tqdm/issues/454) + use_ascii = bool(sys.platform == "win32") + progress = tqdm( + total=total, + ncols=79, + ascii=use_ascii, + unit="B", + unit_scale=True, + leave=True, + ) + elif self.progressbar: + progress = self.progressbar + progress.total = total + for chunk in content: + if chunk: + output_file.write(chunk) + output_file.flush() + if self.progressbar: + # Use the chunk size here because chunk may be much + # larger if the data are decompressed by requests after + # reading (happens with text files). + progress.update(self.chunk_size) + # Make sure the progress bar gets filled even if the actual number + # is chunks is smaller than expected. This happens when streaming + # text files that are compressed by the server when sending (gzip). + # Binary files don't experience this. + if self.progressbar: + progress.reset() + progress.update(total) + progress.close() + finally: + if ispath: + output_file.close() + return None + + +class FTPDownloader: # pylint: disable=too-few-public-methods + """ + Download manager for fetching files over FTP. + + When called, downloads the given file URL into the specified local file. + Uses the :mod:`ftplib` module to manage downloads. + + Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize + the download of files (for example, to use authentication or print a + progress bar). + + Parameters + ---------- + port : int + Port used for the FTP connection. + username : str + User name used to login to the server. Only needed if the server + requires authentication (i.e., no anonymous FTP). + password : str + Password used to login to the server. Only needed if the server + requires authentication (i.e., no anonymous FTP). Use the empty string + to indicate no password is required. + account : str + Some servers also require an "account" name for authentication. + timeout : int + Timeout in seconds for ftp socket operations, use None to mean no + timeout. + progressbar : bool + If True, will print a progress bar of the download to standard error + (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be + installed. **Custom progress bars are not yet supported.** + chunk_size : int + Files are streamed *chunk_size* bytes at a time instead of loading + everything into memory at one. Usually doesn't need to be changed. + + """ + + def __init__( + self, + port=21, + username="anonymous", + password="", + account="", + timeout=None, + progressbar=False, + chunk_size=1024, + ): + self.port = port + self.username = username + self.password = password + self.account = account + self.timeout = timeout + self.progressbar = progressbar + self.chunk_size = chunk_size + if self.progressbar is True and tqdm is None: + raise ValueError("Missing package 'tqdm' required for progress bars.") + + def __call__(self, url, output_file, pooch, check_only=False): + """ + Download the given URL over FTP to the given output file. + + Parameters + ---------- + url : str + The URL to the file you want to download. + output_file : str or file-like object + Path (and file name) to which the file will be downloaded. + pooch : :class:`~pooch.Pooch` + The instance of :class:`~pooch.Pooch` that is calling this method. + check_only : bool + If True, will only check if a file exists on the server and + **without downloading the file**. Will return ``True`` if the file + exists and ``False`` otherwise. + + Returns + ------- + availability : bool or None + If ``check_only==True``, returns a boolean indicating if the file + is available on the server. Otherwise, returns ``None``. + + """ + parsed_url = parse_url(url) + ftp = ftplib.FTP(timeout=self.timeout) + ftp.connect(host=parsed_url["netloc"], port=self.port) + + if check_only: + directory, file_name = os.path.split(parsed_url["path"]) + try: + ftp.login(user=self.username, passwd=self.password, acct=self.account) + available = file_name in ftp.nlst(directory) + finally: + ftp.close() + return available + + ispath = not hasattr(output_file, "write") + if ispath: + # pylint: disable=consider-using-with + output_file = open(output_file, "w+b") + # pylint: enable=consider-using-with + try: + ftp.login(user=self.username, passwd=self.password, acct=self.account) + command = f"RETR {parsed_url['path']}" + if self.progressbar: + # Make sure the file is set to binary mode, otherwise we can't + # get the file size. See: https://stackoverflow.com/a/22093848 + ftp.voidcmd("TYPE I") + use_ascii = bool(sys.platform == "win32") + progress = tqdm( + total=int(ftp.size(parsed_url["path"])), + ncols=79, + ascii=use_ascii, + unit="B", + unit_scale=True, + leave=True, + ) + with progress: + + def callback(data): + "Update the progress bar and write to output" + progress.update(len(data)) + output_file.write(data) + + ftp.retrbinary(command, callback, blocksize=self.chunk_size) + else: + ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size) + finally: + ftp.quit() + if ispath: + output_file.close() + return None + + +class SFTPDownloader: # pylint: disable=too-few-public-methods + """ + Download manager for fetching files over SFTP. + + When called, downloads the given file URL into the specified local file. + Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be + installed. + + Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize + the download of files (for example, to use authentication or print a + progress bar). + + Parameters + ---------- + port : int + Port used for the SFTP connection. + username : str + User name used to login to the server. Only needed if the server + requires authentication (i.e., no anonymous SFTP). + password : str + Password used to login to the server. Only needed if the server + requires authentication (i.e., no anonymous SFTP). Use the empty + string to indicate no password is required. + timeout : int + Timeout in seconds for sftp socket operations, use None to mean no + timeout. + progressbar : bool or an arbitrary progress bar object + If True, will print a progress bar of the download to standard + error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to + be installed. + + """ + + def __init__( + self, + port=22, + username="anonymous", + password="", + account="", + timeout=None, + progressbar=False, + ): + self.port = port + self.username = username + self.password = password + self.account = account + self.timeout = timeout + self.progressbar = progressbar + # Collect errors and raise only once so that both missing packages are + # captured. Otherwise, the user is only warned of one of them at a + # time (and we can't test properly when they are both missing). + errors = [] + if self.progressbar and tqdm is None: + errors.append("Missing package 'tqdm' required for progress bars.") + if paramiko is None: + errors.append("Missing package 'paramiko' required for SFTP downloads.") + if errors: + raise ValueError(" ".join(errors)) + + def __call__(self, url, output_file, pooch): + """ + Download the given URL over SFTP to the given output file. + + The output file must be given as a string (file name/path) and not an + open file object! Otherwise, paramiko cannot save to that file. + + Parameters + ---------- + url : str + The URL to the file you want to download. + output_file : str + Path (and file name) to which the file will be downloaded. **Cannot + be a file object**. + pooch : :class:`~pooch.Pooch` + The instance of :class:`~pooch.Pooch` that is calling this method. + """ + parsed_url = parse_url(url) + connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port)) + sftp = None + try: + connection.connect(username=self.username, password=self.password) + sftp = paramiko.SFTPClient.from_transport(connection) + sftp.get_channel().settimeout = self.timeout + if self.progressbar: + size = int(sftp.stat(parsed_url["path"]).st_size) + use_ascii = bool(sys.platform == "win32") + progress = tqdm( + total=size, + ncols=79, + ascii=use_ascii, + unit="B", + unit_scale=True, + leave=True, + ) + if self.progressbar: + with progress: + + def callback(current, total): + "Update the progress bar and write to output" + progress.total = int(total) + progress.update(int(current - progress.n)) + + sftp.get(parsed_url["path"], output_file, callback=callback) + else: + sftp.get(parsed_url["path"], output_file) + finally: + connection.close() + if sftp is not None: + sftp.close() + + +class DOIDownloader: # pylint: disable=too-few-public-methods + """ + Download manager for fetching files from Digital Object Identifiers (DOIs). + + Open-access data repositories often issue Digital Object Identifiers (DOIs) + for data which provide a stable link and citation point. The trick is + finding out the download URL for a file given the DOI. + + When called, this downloader uses the repository's public API to find out + the download URL from the DOI and file name. It then uses + :class:`pooch.HTTPDownloader` to download the URL into the specified local + file. Allowing "URL"s to be specified with the DOI instead of the actual + HTTP download link. Uses the :mod:`requests` library to manage downloads + and interact with the APIs. + + The **format of the "URL"** is: ``doi:{DOI}/{file name}``. + + Notice that there are no ``//`` like in HTTP/FTP and you must specify a + file name after the DOI (separated by a ``/``). + + Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to + download files given the DOI instead of an HTTP link. + + Supported repositories: + + * `figshare <https://www.figshare.com>`__ + * `Zenodo <https://www.zenodo.org>`__ + * `Dataverse <https://dataverse.org/>`__ instances + + .. attention:: + + DOIs from other repositories **will not work** since we need to access + their particular APIs to find the download links. We welcome + suggestions and contributions adding new repositories. + + Parameters + ---------- + progressbar : bool or an arbitrary progress bar object + If True, will print a progress bar of the download to standard error + (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be + installed. Alternatively, an arbitrary progress bar object can be + passed. See :ref:`custom-progressbar` for details. + chunk_size : int + Files are streamed *chunk_size* bytes at a time instead of loading + everything into memory at one. Usually doesn't need to be changed. + **kwargs + All keyword arguments given when creating an instance of this class + will be passed to :func:`requests.get`. + + Examples + -------- + + Download one of the data files from the figshare archive of Pooch test + data: + + >>> import os + >>> downloader = DOIDownloader() + >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt" + >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch + >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) + >>> os.path.exists("tiny-data.txt") + True + >>> with open("tiny-data.txt") as f: + ... print(f.read().strip()) + # A tiny data file for test purposes only + 1 2 3 4 5 6 + >>> os.remove("tiny-data.txt") + + Same thing but for our Zenodo archive: + + >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt" + >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) + >>> os.path.exists("tiny-data.txt") + True + >>> with open("tiny-data.txt") as f: + ... print(f.read().strip()) + # A tiny data file for test purposes only + 1 2 3 4 5 6 + >>> os.remove("tiny-data.txt") + + """ + + def __init__(self, progressbar=False, chunk_size=1024, **kwargs): + self.kwargs = kwargs + self.progressbar = progressbar + self.chunk_size = chunk_size + + def __call__(self, url, output_file, pooch): + """ + Download the given DOI URL over HTTP to the given output file. + + Uses the repository's API to determine the actual HTTP download URL + from the given DOI. + + Uses :func:`requests.get`. + + Parameters + ---------- + url : str + The URL to the file you want to download. + output_file : str or file-like object + Path (and file name) to which the file will be downloaded. + pooch : :class:`~pooch.Pooch` + The instance of :class:`~pooch.Pooch` that is calling this method. + + """ + + parsed_url = parse_url(url) + data_repository = doi_to_repository(parsed_url["netloc"]) + + # Resolve the URL + file_name = parsed_url["path"] + # remove the leading slash in the path + if file_name[0] == "/": + file_name = file_name[1:] + download_url = data_repository.download_url(file_name) + + # Instantiate the downloader object + downloader = HTTPDownloader( + progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs + ) + downloader(download_url, output_file, pooch) + + +def doi_to_url(doi): + """ + Follow a DOI link to resolve the URL of the archive. + + Parameters + ---------- + doi : str + The DOI of the archive. + + Returns + ------- + url : str + The URL of the archive in the data repository. + + """ + # Lazy import requests to speed up import time + import requests # pylint: disable=C0415 + + # Use doi.org to resolve the DOI to the repository website. + response = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT) + url = response.url + if 400 <= response.status_code < 600: + raise ValueError( + f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?" + ) + return url + + +def doi_to_repository(doi): + """ + Instantiate a data repository instance from a given DOI. + + This function implements the chain of responsibility dispatch + to the correct data repository class. + + Parameters + ---------- + doi : str + The DOI of the archive. + + Returns + ------- + data_repository : DataRepository + The data repository object + """ + + # This should go away in a separate issue: DOI handling should + # not rely on the (non-)existence of trailing slashes. The issue + # is documented in https://github.com/fatiando/pooch/issues/324 + if doi[-1] == "/": + doi = doi[:-1] + + repositories = [ + FigshareRepository, + ZenodoRepository, + DataverseRepository, + ] + + # Extract the DOI and the repository information + archive_url = doi_to_url(doi) + + # Try the converters one by one until one of them returned a URL + data_repository = None + for repo in repositories: + if data_repository is None: + data_repository = repo.initialize( + archive_url=archive_url, + doi=doi, + ) + + if data_repository is None: + repository = parse_url(archive_url)["netloc"] + raise ValueError( + f"Invalid data repository '{repository}'. " + "To request or contribute support for this repository, " + "please open an issue at https://github.com/fatiando/pooch/issues" + ) + + return data_repository + + +class DataRepository: # pylint: disable=too-few-public-methods, missing-class-docstring + @classmethod + def initialize(cls, doi, archive_url): # pylint: disable=unused-argument + """ + Initialize the data repository if the given URL points to a + corresponding repository. + + Initializes a data repository object. This is done as part of + a chain of responsibility. If the class cannot handle the given + repository URL, it returns `None`. Otherwise a `DataRepository` + instance is returned. + + Parameters + ---------- + doi : str + The DOI that identifies the repository + archive_url : str + The resolved URL for the DOI + """ + + return None # pragma: no cover + + def download_url(self, file_name): + """ + Use the repository API to get the download URL for a file given + the archive URL. + + Parameters + ---------- + file_name : str + The name of the file in the archive that will be downloaded. + + Returns + ------- + download_url : str + The HTTP URL that can be used to download the file. + """ + + raise NotImplementedError # pragma: no cover + + def populate_registry(self, pooch): + """ + Populate the registry using the data repository's API + + Parameters + ---------- + pooch : Pooch + The pooch instance that the registry will be added to. + """ + + raise NotImplementedError # pragma: no cover + + +class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring + base_api_url = "https://zenodo.org/api/records" + + def __init__(self, doi, archive_url): + self.archive_url = archive_url + self.doi = doi + self._api_response = None + self._api_version = None + + @classmethod + def initialize(cls, doi, archive_url): + """ + Initialize the data repository if the given URL points to a + corresponding repository. + + Initializes a data repository object. This is done as part of + a chain of responsibility. If the class cannot handle the given + repository URL, it returns `None`. Otherwise a `DataRepository` + instance is returned. + + Parameters + ---------- + doi : str + The DOI that identifies the repository + archive_url : str + The resolved URL for the DOI + """ + + # Check whether this is a Zenodo URL + parsed_archive_url = parse_url(archive_url) + if parsed_archive_url["netloc"] != "zenodo.org": + return None + + return cls(doi, archive_url) + + @property + def api_response(self): + """Cached API response from Zenodo""" + if self._api_response is None: + # Lazy import requests to speed up import time + import requests # pylint: disable=C0415 + + article_id = self.archive_url.split("/")[-1] + self._api_response = requests.get( + f"{self.base_api_url}/{article_id}", + timeout=DEFAULT_TIMEOUT, + ).json() + + return self._api_response + + @property + def api_version(self): + """ + Version of the Zenodo API we are interacting with + + The versions can either be : + + - ``"legacy"``: corresponds to the Zenodo API that was supported until + 2023-10-12 (before the migration to InvenioRDM). + - ``"new"``: corresponds to the new API that went online on 2023-10-13 + after the migration to InvenioRDM. + + The ``"new"`` API breaks backward compatibility with the ``"legacy"`` + one and could probably be replaced by an updated version that restores + the behaviour of the ``"legacy"`` one. + + Returns + ------- + str + """ + if self._api_version is None: + if all("key" in file for file in self.api_response["files"]): + self._api_version = "legacy" + elif all("filename" in file for file in self.api_response["files"]): + self._api_version = "new" + else: + raise ValueError( + "Couldn't determine the version of the Zenodo API for " + f"{self.archive_url} (doi:{self.doi})." + ) + return self._api_version + + def download_url(self, file_name): + """ + Use the repository API to get the download URL for a file given + the archive URL. + + Parameters + ---------- + file_name : str + The name of the file in the archive that will be downloaded. + + Returns + ------- + download_url : str + The HTTP URL that can be used to download the file. + + Notes + ----- + After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The + link to the desired files that appears in the API response leads to 404 + errors (by 2023-10-17). The files are available in the following url: + ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``. + + This method supports both the legacy and the new API. + """ + # Create list of files in the repository + if self.api_version == "legacy": + files = {item["key"]: item for item in self.api_response["files"]} + else: + files = [item["filename"] for item in self.api_response["files"]] + # Check if file exists in the repository + if file_name not in files: + raise ValueError( + f"File '{file_name}' not found in data archive " + f"{self.archive_url} (doi:{self.doi})." + ) + # Build download url + if self.api_version == "legacy": + download_url = files[file_name]["links"]["self"] + else: + article_id = self.api_response["id"] + download_url = ( + f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1" + ) + return download_url + + def populate_registry(self, pooch): + """ + Populate the registry using the data repository's API + + Parameters + ---------- + pooch : Pooch + The pooch instance that the registry will be added to. + + Notes + ----- + After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The + checksums for each file listed in the API reference is now an md5 sum. + + This method supports both the legacy and the new API. + """ + for filedata in self.api_response["files"]: + checksum = filedata["checksum"] + if self.api_version == "legacy": + key = "key" + else: + key = "filename" + checksum = f"md5:{checksum}" + pooch.registry[filedata[key]] = checksum + + +class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring + def __init__(self, doi, archive_url): + self.archive_url = archive_url + self.doi = doi + self._api_response = None + + @classmethod + def initialize(cls, doi, archive_url): + """ + Initialize the data repository if the given URL points to a + corresponding repository. + + Initializes a data repository object. This is done as part of + a chain of responsibility. If the class cannot handle the given + repository URL, it returns `None`. Otherwise a `DataRepository` + instance is returned. + + Parameters + ---------- + doi : str + The DOI that identifies the repository + archive_url : str + The resolved URL for the DOI + """ + + # Check whether this is a Figshare URL + parsed_archive_url = parse_url(archive_url) + if parsed_archive_url["netloc"] != "figshare.com": + return None + + return cls(doi, archive_url) + + def _parse_version_from_doi(self): + """ + Parse version from the doi + + Return None if version is not available in the doi. + """ + # Get suffix of the doi + _, suffix = self.doi.split("/") + # Split the suffix by dots and keep the last part + last_part = suffix.split(".")[-1] + # Parse the version from the last part + if last_part[0] != "v": + return None + version = int(last_part[1:]) + return version + + @property + def api_response(self): + """Cached API response from Figshare""" + if self._api_response is None: + # Lazy import requests to speed up import time + import requests # pylint: disable=C0415 + + # Use the figshare API to find the article ID from the DOI + article = requests.get( + f"https://api.figshare.com/v2/articles?doi={self.doi}", + timeout=DEFAULT_TIMEOUT, + ).json()[0] + article_id = article["id"] + # Parse desired version from the doi + version = self._parse_version_from_doi() + # With the ID and version, we can get a list of files and their + # download links + if version is None: + # Figshare returns the latest version available when no version + # is specified through the DOI. + warnings.warn( + f"The Figshare DOI '{self.doi}' doesn't specify which version of " + "the repository should be used. " + "Figshare will point to the latest version available.", + UserWarning, + ) + # Define API url using only the article id + # (figshare will resolve the latest version) + api_url = f"https://api.figshare.com/v2/articles/{article_id}" + else: + # Define API url using article id and the desired version + # Get list of files using article id and the version + api_url = ( + "https://api.figshare.com/v2/articles/" + f"{article_id}/versions/{version}" + ) + # Make the request and return the files in the figshare repository + response = requests.get(api_url, timeout=DEFAULT_TIMEOUT) + response.raise_for_status() + self._api_response = response.json()["files"] + + return self._api_response + + def download_url(self, file_name): + """ + Use the repository API to get the download URL for a file given + the archive URL. + + Parameters + ---------- + file_name : str + The name of the file in the archive that will be downloaded. + + Returns + ------- + download_url : str + The HTTP URL that can be used to download the file. + """ + files = {item["name"]: item for item in self.api_response} + if file_name not in files: + raise ValueError( + f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." + ) + download_url = files[file_name]["download_url"] + return download_url + + def populate_registry(self, pooch): + """ + Populate the registry using the data repository's API + + Parameters + ---------- + pooch : Pooch + The pooch instance that the registry will be added to. + """ + + for filedata in self.api_response: + pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}" + + +class DataverseRepository(DataRepository): # pylint: disable=missing-class-docstring + def __init__(self, doi, archive_url): + self.archive_url = archive_url + self.doi = doi + self._api_response = None + + @classmethod + def initialize(cls, doi, archive_url): + """ + Initialize the data repository if the given URL points to a + corresponding repository. + + Initializes a data repository object. This is done as part of + a chain of responsibility. If the class cannot handle the given + repository URL, it returns `None`. Otherwise a `DataRepository` + instance is returned. + + Parameters + ---------- + doi : str + The DOI that identifies the repository + archive_url : str + The resolved URL for the DOI + """ + # Access the DOI as if this was a DataVerse instance + response = cls._get_api_response(doi, archive_url) + + # If we failed, this is probably not a DataVerse instance + if 400 <= response.status_code < 600: + return None + + # Initialize the repository and overwrite the api response + repository = cls(doi, archive_url) + repository.api_response = response + return repository + + @classmethod + def _get_api_response(cls, doi, archive_url): + """ + Perform the actual API request + + This has been separated into a separate ``classmethod``, as it can be + used prior and after the initialization. + """ + # Lazy import requests to speed up import time + import requests # pylint: disable=C0415 + + parsed = parse_url(archive_url) + response = requests.get( + f"{parsed['protocol']}://{parsed['netloc']}/api/datasets/" + f":persistentId?persistentId=doi:{doi}", + timeout=DEFAULT_TIMEOUT, + ) + return response + + @property + def api_response(self): + """Cached API response from a DataVerse instance""" + + if self._api_response is None: + self._api_response = self._get_api_response( + self.doi, self.archive_url + ) # pragma: no cover + + return self._api_response + + @api_response.setter + def api_response(self, response): + """Update the cached API response""" + + self._api_response = response + + def download_url(self, file_name): + """ + Use the repository API to get the download URL for a file given + the archive URL. + + Parameters + ---------- + file_name : str + The name of the file in the archive that will be downloaded. + + Returns + ------- + download_url : str + The HTTP URL that can be used to download the file. + """ + parsed = parse_url(self.archive_url) + response = self.api_response.json() + files = { + file["dataFile"]["filename"]: file["dataFile"] + for file in response["data"]["latestVersion"]["files"] + } + if file_name not in files: + raise ValueError( + f"File '{file_name}' not found in data archive " + f"{self.archive_url} (doi:{self.doi})." + ) + # Generate download_url using the file id + download_url = ( + f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" + f"{files[file_name]['id']}" + ) + return download_url + + def populate_registry(self, pooch): + """ + Populate the registry using the data repository's API + + Parameters + ---------- + pooch : Pooch + The pooch instance that the registry will be added to. + """ + + for filedata in self.api_response.json()["data"]["latestVersion"]["files"]: + pooch.registry[filedata["dataFile"]["filename"]] = ( + f"md5:{filedata['dataFile']['md5']}" + )