Mercurial > repos > rliterman > csp2

diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 68:5028fdace37b
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author: jpayne
date: Tue, 18 Mar 2025 16:23:26 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py	Tue Mar 18 16:23:26 2025 -0400
@@ -0,0 +1,1163 @@
+# Copyright (c) 2018 The Pooch Developers.
+# Distributed under the terms of the BSD 3-Clause License.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
+#
+"""
+The classes that actually handle the downloads.
+"""
+import os
+import sys
+import ftplib
+
+import warnings
+
+from .utils import parse_url
+
+try:
+    from tqdm import tqdm
+except ImportError:
+    tqdm = None
+
+try:
+    import paramiko
+except ImportError:
+    paramiko = None
+
+
+# Set the default timeout in seconds so it can be configured in a pinch for the
+# methods that don't or can't expose a way to set it at runtime.
+# See https://github.com/fatiando/pooch/issues/409
+DEFAULT_TIMEOUT = 30
+
+
+def choose_downloader(url, progressbar=False):
+    """
+    Choose the appropriate downloader for the given URL based on the protocol.
+
+    Parameters
+    ----------
+    url : str
+        A URL (including protocol).
+    progressbar : bool or an arbitrary progress bar object
+        If True, will print a progress bar of the download to standard error
+        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
+        installed. Alternatively, an arbitrary progress bar object can be
+        passed. See :ref:`custom-progressbar` for details.
+
+    Returns
+    -------
+    downloader
+        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
+        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`,
+        created with the given *progressbar*.
+
+    Raises
+    ------
+    ValueError
+        If the protocol of *url* is not one of the known protocols.
+
+    Examples
+    --------
+
+    >>> downloader = choose_downloader("http://something.com")
+    >>> print(downloader.__class__.__name__)
+    HTTPDownloader
+    >>> downloader = choose_downloader("https://something.com")
+    >>> print(downloader.__class__.__name__)
+    HTTPDownloader
+    >>> downloader = choose_downloader("ftp://something.com")
+    >>> print(downloader.__class__.__name__)
+    FTPDownloader
+    >>> downloader = choose_downloader("doi:DOI/filename.csv")
+    >>> print(downloader.__class__.__name__)
+    DOIDownloader
+
+    """
+    known_downloaders = {
+        "ftp": FTPDownloader,
+        "https": HTTPDownloader,
+        "http": HTTPDownloader,
+        "sftp": SFTPDownloader,
+        "doi": DOIDownloader,
+    }
+
+    protocol = parse_url(url)["protocol"]
+    if protocol not in known_downloaders:
+        # Spell out the protocol names: formatting ``dict.keys()`` directly
+        # would leak the ``dict_keys([...])`` repr into the error message.
+        supported = ", ".join(f"'{name}'" for name in known_downloaders)
+        raise ValueError(
+            f"Unrecognized URL protocol '{protocol}' in '{url}'. "
+            f"Must be one of {supported}."
+        )
+    return known_downloaders[protocol](progressbar=progressbar)
+
+
+class HTTPDownloader:  # pylint: disable=too-few-public-methods
+    """
+    Download manager for fetching files over HTTP/HTTPS.
+
+    When called, downloads the given file URL into the specified local file.
+    Uses the :mod:`requests` library to manage downloads.
+
+    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
+    the download of files (for example, to use authentication or print a
+    progress bar).
+
+    Parameters
+    ----------
+    progressbar : bool or an arbitrary progress bar object
+        If True, will print a progress bar of the download to standard error
+        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
+        installed. Alternatively, an arbitrary progress bar object can be
+        passed. See :ref:`custom-progressbar` for details.
+    chunk_size : int
+        Files are streamed *chunk_size* bytes at a time instead of loading
+        everything into memory at once. Usually doesn't need to be changed.
+    **kwargs
+        All keyword arguments given when creating an instance of this class
+        will be passed to :func:`requests.get`.
+
+    Raises
+    ------
+    ValueError
+        If ``progressbar=True`` but ``tqdm`` is not installed.
+
+    Examples
+    --------
+
+    Download one of the data files from the Pooch repository:
+
+    >>> import os
+    >>> from pooch import __version__, check_version
+    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
+    >>> url = url.format(check_version(__version__, fallback="main"))
+    >>> downloader = HTTPDownloader()
+    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
+    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
+    >>> os.path.exists("tiny-data.txt")
+    True
+    >>> with open("tiny-data.txt") as f:
+    ...     print(f.read().strip())
+    # A tiny data file for test purposes only
+    1  2  3  4  5  6
+    >>> os.remove("tiny-data.txt")
+
+    Authentication can be handled by passing a user name and password to
+    :func:`requests.get`. All arguments provided when creating an instance of
+    the class are forwarded to :func:`requests.get`. We'll use
+    ``auth=(username, password)`` to use basic HTTPS authentication. The
+    https://httpbin.org website allows us to fake a login request using
+    whatever username and password we provide to it:
+
+    >>> user = "doggo"
+    >>> password = "goodboy"
+    >>> # httpbin will ask for the user and password we provide in the URL
+    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
+    >>> # Trying without the login credentials causes an error
+    >>> downloader = HTTPDownloader()
+    >>> try:
+    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
+    ... except Exception:
+    ...     print("There was an error!")
+    There was an error!
+    >>> # Pass in the credentials to HTTPDownloader
+    >>> downloader = HTTPDownloader(auth=(user, password))
+    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
+    >>> with open("tiny-data.txt") as f:
+    ...     for line in f:
+    ...         print(line.rstrip())
+    {
+      "authenticated": true,
+      "user": "doggo"
+    }
+    >>> os.remove("tiny-data.txt")
+
+    """
+
+    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
+        self.kwargs = kwargs
+        self.progressbar = progressbar
+        self.chunk_size = chunk_size
+        # Only the built-in bar (progressbar=True) depends on tqdm; a custom
+        # progress bar object is used as given.
+        if self.progressbar is True and tqdm is None:
+            raise ValueError("Missing package 'tqdm' required for progress bars.")
+
+    def __call__(
+        self, url, output_file, pooch, check_only=False
+    ):  # pylint: disable=R0914
+        """
+        Download the given URL over HTTP to the given output file.
+
+        Uses :func:`requests.get`.
+
+        Parameters
+        ----------
+        url : str
+            The URL to the file you want to download.
+        output_file : str or file-like object
+            Path (and file name) to which the file will be downloaded.
+        pooch : :class:`~pooch.Pooch`
+            The instance of :class:`~pooch.Pooch` that is calling this method.
+        check_only : bool
+            If True, will only check if a file exists on the server and
+            **without downloading the file**. Will return ``True`` if the file
+            exists and ``False`` otherwise.
+
+        Returns
+        -------
+        availability : bool or None
+            If ``check_only==True``, returns a boolean indicating if the file
+            is available on the server. Otherwise, returns ``None``.
+
+        """
+        # Lazy imports to speed up import time
+        import contextlib  # pylint: disable=C0415
+        import requests  # pylint: disable=C0415
+
+        if check_only:
+            # Only the timeout is taken from the stored keyword arguments for
+            # the HEAD request; the file counts as available on a 200 status.
+            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
+            response = requests.head(url, timeout=timeout, allow_redirects=True)
+            return bool(response.status_code == 200)
+
+        kwargs = self.kwargs.copy()
+        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
+        kwargs.setdefault("stream", True)
+        ispath = not hasattr(output_file, "write")
+        # The stack releases, in reverse order of registration, everything
+        # acquired below (progress bar, HTTP response, file opened by us),
+        # whether the download succeeds or fails midway.
+        with contextlib.ExitStack() as cleanup:
+            if ispath:
+                output_file = cleanup.enter_context(open(output_file, "w+b"))
+            response = requests.get(url, timeout=timeout, **kwargs)
+            cleanup.callback(response.close)
+            response.raise_for_status()
+            content = response.iter_content(chunk_size=self.chunk_size)
+            total = int(response.headers.get("content-length", 0))
+            progress = None
+            if self.progressbar is True:
+                # Need to use ascii characters on Windows because there isn't
+                # always full unicode support
+                # (see https://github.com/tqdm/tqdm/issues/454)
+                use_ascii = bool(sys.platform == "win32")
+                progress = tqdm(
+                    total=total,
+                    ncols=79,
+                    ascii=use_ascii,
+                    unit="B",
+                    unit_scale=True,
+                    leave=True,
+                )
+            elif self.progressbar:
+                progress = self.progressbar
+                progress.total = total
+            if progress is not None:
+                cleanup.callback(progress.close)
+            for chunk in content:
+                if chunk:
+                    output_file.write(chunk)
+                    output_file.flush()
+                    if self.progressbar:
+                        # Use the chunk size here because chunk may be much
+                        # larger if the data are decompressed by requests after
+                        # reading (happens with text files).
+                        progress.update(self.chunk_size)
+            # Make sure the progress bar gets filled even if the actual number
+            # of chunks is smaller than expected. This happens when streaming
+            # text files that are compressed by the server when sending (gzip).
+            # Binary files don't experience this.
+            if self.progressbar:
+                progress.reset()
+                progress.update(total)
+        return None
+
+
+class FTPDownloader:  # pylint: disable=too-few-public-methods
+    """
+    Download manager for fetching files over FTP.
+
+    When called, downloads the given file URL into the specified local file.
+    Uses the :mod:`ftplib` module to manage downloads.
+
+    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
+    the download of files (for example, to use authentication or print a
+    progress bar).
+
+    Parameters
+    ----------
+    port : int
+        Port used for the FTP connection.
+    username : str
+        User name used to login to the server. Only needed if the server
+        requires authentication (i.e., no anonymous FTP).
+    password : str
+        Password used to login to the server. Only needed if the server
+        requires authentication (i.e., no anonymous FTP). Use the empty string
+        to indicate no password is required.
+    account : str
+        Some servers also require an "account" name for authentication.
+    timeout : int
+        Timeout in seconds for ftp socket operations, use None to mean no
+        timeout.
+    progressbar : bool
+        If True, will print a progress bar of the download to standard error
+        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
+        installed. **Custom progress bars are not yet supported.**
+    chunk_size : int
+        Files are streamed *chunk_size* bytes at a time instead of loading
+        everything into memory at once. Usually doesn't need to be changed.
+
+    Raises
+    ------
+    ValueError
+        If ``progressbar=True`` but ``tqdm`` is not installed.
+
+    """
+
+    def __init__(
+        self,
+        port=21,
+        username="anonymous",
+        password="",
+        account="",
+        timeout=None,
+        progressbar=False,
+        chunk_size=1024,
+    ):
+        self.port = port
+        self.username = username
+        self.password = password
+        self.account = account
+        self.timeout = timeout
+        self.progressbar = progressbar
+        self.chunk_size = chunk_size
+        if self.progressbar is True and tqdm is None:
+            raise ValueError("Missing package 'tqdm' required for progress bars.")
+
+    def __call__(self, url, output_file, pooch, check_only=False):
+        """
+        Download the given URL over FTP to the given output file.
+
+        Parameters
+        ----------
+        url : str
+            The URL to the file you want to download.
+        output_file : str or file-like object
+            Path (and file name) to which the file will be downloaded.
+        pooch : :class:`~pooch.Pooch`
+            The instance of :class:`~pooch.Pooch` that is calling this method.
+        check_only : bool
+            If True, will only check if a file exists on the server and
+            **without downloading the file**. Will return ``True`` if the file
+            exists and ``False`` otherwise.
+
+        Returns
+        -------
+        availability : bool or None
+            If ``check_only==True``, returns a boolean indicating if the file
+            is available on the server. Otherwise, returns ``None``.
+
+        """
+        parsed_url = parse_url(url)
+        ftp = ftplib.FTP(timeout=self.timeout)
+        ftp.connect(host=parsed_url["netloc"], port=self.port)
+
+        if check_only:
+            # The file is considered available if its name shows up in the
+            # listing of the directory that contains it.
+            directory, file_name = os.path.split(parsed_url["path"])
+            try:
+                ftp.login(user=self.username, passwd=self.password, acct=self.account)
+                available = file_name in ftp.nlst(directory)
+            finally:
+                ftp.close()
+            return available
+
+        ispath = not hasattr(output_file, "write")
+        # The outer ``finally`` ends the FTP session even when opening the
+        # output file fails; the inner one closes a file opened here even
+        # when ``ftp.quit()`` itself raises afterwards.
+        try:
+            if ispath:
+                # pylint: disable=consider-using-with
+                output_file = open(output_file, "w+b")
+                # pylint: enable=consider-using-with
+            try:
+                ftp.login(user=self.username, passwd=self.password, acct=self.account)
+                command = f"RETR {parsed_url['path']}"
+                if self.progressbar:
+                    # Make sure the file is set to binary mode, otherwise we
+                    # can't get the file size.
+                    # See: https://stackoverflow.com/a/22093848
+                    ftp.voidcmd("TYPE I")
+                    use_ascii = bool(sys.platform == "win32")
+                    progress = tqdm(
+                        total=int(ftp.size(parsed_url["path"])),
+                        ncols=79,
+                        ascii=use_ascii,
+                        unit="B",
+                        unit_scale=True,
+                        leave=True,
+                    )
+                    with progress:
+
+                        def callback(data):
+                            "Update the progress bar and write to output"
+                            progress.update(len(data))
+                            output_file.write(data)
+
+                        ftp.retrbinary(command, callback, blocksize=self.chunk_size)
+                else:
+                    ftp.retrbinary(
+                        command, output_file.write, blocksize=self.chunk_size
+                    )
+            finally:
+                # Only close files that were opened by this method.
+                if ispath:
+                    output_file.close()
+        finally:
+            ftp.quit()
+        return None
+
+
+class SFTPDownloader:  # pylint: disable=too-few-public-methods
+    """
+    Download manager for fetching files over SFTP.
+
+    When called, downloads the given file URL into the specified local file.
+    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
+    installed.
+
+    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
+    the download of files (for example, to use authentication or print a
+    progress bar).
+
+    Parameters
+    ----------
+    port : int
+        Port used for the SFTP connection.
+    username : str
+        User name used to login to the server. Only needed if the server
+        requires authentication (i.e., no anonymous SFTP).
+    password : str
+        Password used to login to the server. Only needed if the server
+        requires authentication (i.e., no anonymous SFTP). Use the empty
+        string to indicate no password is required.
+    account : str
+        Stored on the instance but not used when connecting.
+    timeout : int
+        Timeout in seconds for sftp socket operations, use None to mean no
+        timeout.
+    progressbar : bool
+        If True, will print a progress bar of the download to standard
+        error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
+        be installed. Any truthy value results in a ``tqdm`` progress bar
+        being created; a custom progress bar object is not used as the bar.
+
+    Raises
+    ------
+    ValueError
+        If ``paramiko`` is not installed, or if a progress bar is requested
+        and ``tqdm`` is not installed.
+
+    """
+
+    def __init__(
+        self,
+        port=22,
+        username="anonymous",
+        password="",
+        account="",
+        timeout=None,
+        progressbar=False,
+    ):
+        self.port = port
+        self.username = username
+        self.password = password
+        self.account = account
+        self.timeout = timeout
+        self.progressbar = progressbar
+        # Collect errors and raise only once so that both missing packages are
+        # captured. Otherwise, the user is only warned of one of them at a
+        # time (and we can't test properly when they are both missing).
+        errors = []
+        if self.progressbar and tqdm is None:
+            errors.append("Missing package 'tqdm' required for progress bars.")
+        if paramiko is None:
+            errors.append("Missing package 'paramiko' required for SFTP downloads.")
+        if errors:
+            raise ValueError(" ".join(errors))
+
+    def __call__(self, url, output_file, pooch):
+        """
+        Download the given URL over SFTP to the given output file.
+
+        The output file must be given as a string (file name/path) and not an
+        open file object! Otherwise, paramiko cannot save to that file.
+
+        Parameters
+        ----------
+        url : str
+            The URL to the file you want to download.
+        output_file : str
+            Path (and file name) to which the file will be downloaded. **Cannot
+            be a file object**.
+        pooch : :class:`~pooch.Pooch`
+            The instance of :class:`~pooch.Pooch` that is calling this method.
+        """
+        parsed_url = parse_url(url)
+        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
+        sftp = None
+        try:
+            connection.connect(username=self.username, password=self.password)
+            sftp = paramiko.SFTPClient.from_transport(connection)
+            # ``settimeout`` is a method of the channel and has to be called.
+            # Assigning to it would only overwrite the attribute and leave
+            # the channel without the requested timeout.
+            sftp.get_channel().settimeout(self.timeout)
+            if self.progressbar:
+                size = int(sftp.stat(parsed_url["path"]).st_size)
+                use_ascii = bool(sys.platform == "win32")
+                progress = tqdm(
+                    total=size,
+                    ncols=79,
+                    ascii=use_ascii,
+                    unit="B",
+                    unit_scale=True,
+                    leave=True,
+                )
+                with progress:
+
+                    def callback(current, total):
+                        "Update the progress bar with the bytes received so far"
+                        progress.total = int(total)
+                        # ``current`` is cumulative, so only advance the bar
+                        # by what was received since the previous call.
+                        progress.update(int(current - progress.n))
+
+                    sftp.get(parsed_url["path"], output_file, callback=callback)
+            else:
+                sftp.get(parsed_url["path"], output_file)
+        finally:
+            # Close the SFTP session first and then the transport carrying
+            # it; the transport is closed even if closing the session fails.
+            try:
+                if sftp is not None:
+                    sftp.close()
+            finally:
+                connection.close()
+
+
+class DOIDownloader:  # pylint: disable=too-few-public-methods
+    """
+    Download manager for fetching files from Digital Object Identifiers (DOIs).
+
+    Open-access data repositories commonly assign a Digital Object Identifier
+    (DOI) to their data, which works as a stable link and citation point. What
+    is missing is the actual download URL of a file, given only the DOI.
+
+    When called, this downloader queries the repository's public API to work
+    out the download URL from the DOI and file name, and then hands that URL
+    to :class:`pooch.HTTPDownloader` to save it into the specified local file.
+    This allows "URL"s to be written with the DOI instead of the actual HTTP
+    download link. Uses the :mod:`requests` library to manage downloads and
+    interact with the APIs.
+
+    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.
+
+    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
+    file name after the DOI (separated by a ``/``).
+
+    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
+    download files given the DOI instead of an HTTP link.
+
+    Supported repositories:
+
+    * `figshare <https://www.figshare.com>`__
+    * `Zenodo <https://www.zenodo.org>`__
+    * `Dataverse <https://dataverse.org/>`__ instances
+
+    .. attention::
+
+        DOIs from other repositories **will not work** since we need to access
+        their particular APIs to find the download links. We welcome
+        suggestions and contributions adding new repositories.
+
+    Parameters
+    ----------
+    progressbar : bool or an arbitrary progress bar object
+        If True, will print a progress bar of the download to standard error
+        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
+        installed. Alternatively, an arbitrary progress bar object can be
+        passed. See :ref:`custom-progressbar` for details.
+    chunk_size : int
+        Files are streamed *chunk_size* bytes at a time instead of loading
+        everything into memory at once. Usually doesn't need to be changed.
+    **kwargs
+        All keyword arguments given when creating an instance of this class
+        will be passed to :func:`requests.get`.
+
+    Examples
+    --------
+
+    Download one of the data files from the figshare archive of Pooch test
+    data:
+
+    >>> import os
+    >>> downloader = DOIDownloader()
+    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
+    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
+    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
+    >>> os.path.exists("tiny-data.txt")
+    True
+    >>> with open("tiny-data.txt") as f:
+    ...     print(f.read().strip())
+    # A tiny data file for test purposes only
+    1  2  3  4  5  6
+    >>> os.remove("tiny-data.txt")
+
+    Same thing but for our Zenodo archive:
+
+    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
+    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
+    >>> os.path.exists("tiny-data.txt")
+    True
+    >>> with open("tiny-data.txt") as f:
+    ...     print(f.read().strip())
+    # A tiny data file for test purposes only
+    1  2  3  4  5  6
+    >>> os.remove("tiny-data.txt")
+
+    """
+
+    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
+        # Everything is stored untouched and only handed over to the
+        # HTTPDownloader that is created when the instance is called.
+        self.progressbar = progressbar
+        self.chunk_size = chunk_size
+        self.kwargs = kwargs
+
+    def __call__(self, url, output_file, pooch):
+        """
+        Download the given DOI URL over HTTP to the given output file.
+
+        The repository's API is used to translate the DOI and file name into
+        the actual HTTP download URL, which is then fetched with
+        :class:`pooch.HTTPDownloader`.
+
+        Uses :func:`requests.get`.
+
+        Parameters
+        ----------
+        url : str
+            The URL to the file you want to download.
+        output_file : str or file-like object
+            Path (and file name) to which the file will be downloaded.
+        pooch : :class:`~pooch.Pooch`
+            The instance of :class:`~pooch.Pooch` that is calling this method.
+
+        """
+        url_parts = parse_url(url)
+        repository = doi_to_repository(url_parts["netloc"])
+
+        # The file name is the path of the "URL" without its leading slash
+        name = url_parts["path"]
+        name = name[1:] if name[0] == "/" else name
+        target_url = repository.download_url(name)
+
+        # Delegate the transfer itself to the HTTP downloader
+        http_downloader = HTTPDownloader(
+            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
+        )
+        http_downloader(target_url, output_file, pooch)
+
+
+def doi_to_url(doi):
+    """
+    Resolve a DOI to the URL of its archive by following the doi.org link.
+
+    Parameters
+    ----------
+    doi : str
+        The DOI of the archive.
+
+    Returns
+    -------
+    url : str
+        The URL of the archive in the data repository.
+
+    Raises
+    ------
+    ValueError
+        If the request ends with a 4xx or 5xx HTTP status code.
+
+    """
+    # Lazy import requests to speed up import time
+    import requests  # pylint: disable=C0415
+
+    # doi.org redirects to the repository website; the final URL of the
+    # request is the address of the archive.
+    reply = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
+    resolved = reply.url
+    status = reply.status_code
+    if status < 400 or status >= 600:
+        return resolved
+    raise ValueError(
+        f"Archive with doi:{doi} not found (see {resolved}). Is the DOI correct?"
+    )
+
+
+def doi_to_repository(doi):
+    """
+    Build the data repository object that corresponds to a given DOI.
+
+    The DOI is resolved to the URL of its archive and each known repository
+    class is offered that URL in turn (chain of responsibility). The first
+    class that accepts it provides the returned instance.
+
+    Parameters
+    ----------
+    doi : str
+        The DOI of the archive.
+
+    Returns
+    -------
+    data_repository : DataRepository
+        The data repository object
+
+    Raises
+    ------
+    ValueError
+        If none of the known repository classes accepts the archive URL.
+    """
+
+    # This should go away in a separate issue: DOI handling should
+    # not rely on the (non-)existence of trailing slashes. The issue
+    # is documented in https://github.com/fatiando/pooch/issues/324
+    if doi[-1] == "/":
+        doi = doi[:-1]
+
+    # Resolve the DOI into the URL of the archive
+    archive_url = doi_to_url(doi)
+
+    # Offer the URL to each repository class and stop at the first taker
+    for candidate in (FigshareRepository, ZenodoRepository, DataverseRepository):
+        data_repository = candidate.initialize(archive_url=archive_url, doi=doi)
+        if data_repository is not None:
+            return data_repository
+
+    # Nobody recognized the archive URL
+    repository = parse_url(archive_url)["netloc"]
+    raise ValueError(
+        f"Invalid data repository '{repository}'. "
+        "To request or contribute support for this repository, "
+        "please open an issue at https://github.com/fatiando/pooch/issues"
+    )
+
+
+class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
+    # Base class for the data repositories that can be reached through a DOI.
+    # It only lays out the interface: ``initialize`` answers ``None`` and the
+    # other methods raise ``NotImplementedError``.
+    @classmethod
+    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
+        """
+        Initialize the data repository if the given URL points to a
+        corresponding repository.
+
+        Initializes a data repository object. This is done as part of
+        a chain of responsibility. If the class cannot handle the given
+        repository URL, it returns `None`. Otherwise a `DataRepository`
+        instance is returned.
+
+        This base implementation ignores both arguments and always returns
+        `None`.
+
+        Parameters
+        ----------
+        doi : str
+            The DOI that identifies the repository
+        archive_url : str
+            The resolved URL for the DOI
+
+        Returns
+        -------
+        data_repository : DataRepository or None
+            The repository object, or `None` if the URL is not handled.
+        """
+
+        return None  # pragma: no cover
+
+    def download_url(self, file_name):
+        """
+        Use the repository API to get the download URL for a file given
+        the archive URL.
+
+        Not implemented in this base class.
+
+        Parameters
+        ----------
+        file_name : str
+            The name of the file in the archive that will be downloaded.
+
+        Returns
+        -------
+        download_url : str
+            The HTTP URL that can be used to download the file.
+
+        Raises
+        ------
+        NotImplementedError
+            Always, in this base class.
+        """
+
+        raise NotImplementedError  # pragma: no cover
+
+    def populate_registry(self, pooch):
+        """
+        Populate the registry using the data repository's API
+
+        Not implemented in this base class.
+
+        Parameters
+        ----------
+        pooch : Pooch
+            The pooch instance that the registry will be added to.
+
+        Raises
+        ------
+        NotImplementedError
+            Always, in this base class.
+        """
+
+        raise NotImplementedError  # pragma: no cover
+
+
+class ZenodoRepository(DataRepository):  # pylint: disable=missing-class-docstring
+    base_api_url = "https://zenodo.org/api/records"
+
+    def __init__(self, doi, archive_url):
+        self.archive_url = archive_url
+        self.doi = doi
+        self._api_response = None
+        self._api_version = None
+
+    @classmethod
+    def initialize(cls, doi, archive_url):
+        """
+        Initialize the data repository if the given URL points to a
+        corresponding repository.
+
+        Initializes a data repository object. This is done as part of
+        a chain of responsibility. If the class cannot handle the given
+        repository URL, it returns `None`. Otherwise a `DataRepository`
+        instance is returned.
+
+        Parameters
+        ----------
+        doi : str
+            The DOI that identifies the repository
+        archive_url : str
+            The resolved URL for the DOI
+        """
+
+        # Check whether this is a Zenodo URL
+        parsed_archive_url = parse_url(archive_url)
+        if parsed_archive_url["netloc"] != "zenodo.org":
+            return None
+
+        return cls(doi, archive_url)
+
+    @property
+    def api_response(self):
+        """Cached API response from Zenodo"""
+        if self._api_response is None:
+            # Lazy import requests to speed up import time
+            import requests  # pylint: disable=C0415
+
+            article_id = self.archive_url.split("/")[-1]
+            self._api_response = requests.get(
+                f"{self.base_api_url}/{article_id}",
+                timeout=DEFAULT_TIMEOUT,
+            ).json()
+
+        return self._api_response
+
+    @property
+    def api_version(self):
+        """
+        Version of the Zenodo API we are interacting with
+
+        The versions can either be :
+
+        - ``"legacy"``: corresponds to the Zenodo API that was supported until
+          2023-10-12 (before the migration to InvenioRDM).
+        - ``"new"``: corresponds to the new API that went online on 2023-10-13
+          after the migration to InvenioRDM.
+
+        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
+        one and could probably be replaced by an updated version that restores
+        the behaviour of the ``"legacy"`` one.
+
+        Returns
+        -------
+        str
+        """
+        if self._api_version is None:
+            if all("key" in file for file in self.api_response["files"]):
+                self._api_version = "legacy"
+            elif all("filename" in file for file in self.api_response["files"]):
+                self._api_version = "new"
+            else:
+                raise ValueError(
+                    "Couldn't determine the version of the Zenodo API for "
+                    f"{self.archive_url} (doi:{self.doi})."
+                )
+        return self._api_version
+
+    def download_url(self, file_name):
+        """
+        Use the repository API to get the download URL for a file given
+        the archive URL.
+
+        Parameters
+        ----------
+        file_name : str
+            The name of the file in the archive that will be downloaded.
+
+        Returns
+        -------
+        download_url : str
+            The HTTP URL that can be used to download the file.
+
+        Raises
+        ------
+        ValueError
+            If ``file_name`` is not among the files listed in the API
+            response.
+
+        Notes
+        -----
+        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
+        link to the desired files that appears in the API response leads to 404
+        errors (by 2023-10-17). The files are available in the following url:
+        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.
+
+        This method supports both the legacy and the new API.
+        """
+        legacy = self.api_version == "legacy"
+        # The field that stores the file name depends on the API version
+        name_field = "key" if legacy else "filename"
+        entries = {item[name_field]: item for item in self.api_response["files"]}
+        # Bail out early if the archive doesn't list the requested file
+        if file_name not in entries:
+            raise ValueError(
+                f"File '{file_name}' not found in data archive "
+                f"{self.archive_url} (doi:{self.doi})."
+            )
+        if legacy:
+            # The legacy response carries a usable link for every file
+            return entries[file_name]["links"]["self"]
+        # For the new API the url is assembled from the id of the record
+        article_id = self.api_response["id"]
+        return (
+            f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"
+        )
+
+    def populate_registry(self, pooch):
+        """
+        Populate the registry using the data repository's API
+
+        Every file listed in the API response is added to ``pooch.registry``,
+        using the file name as key and its checksum as value.
+
+        Parameters
+        ----------
+        pooch : Pooch
+            The pooch instance that the registry will be added to.
+
+        Notes
+        -----
+        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
+        checksums for each file listed in the API reference is now an md5 sum.
+
+        This method supports both the legacy and the new API.
+        """
+        for entry in self.api_response["files"]:
+            digest = entry["checksum"]
+            is_legacy = self.api_version == "legacy"
+            name_field = "key" if is_legacy else "filename"
+            if not is_legacy:
+                # The new API reports a bare md5 sum, so label the algorithm;
+                # legacy checksums are stored exactly as reported
+                digest = f"md5:{digest}"
+            pooch.registry[entry[name_field]] = digest
+
+
+class FigshareRepository(DataRepository):  # pylint: disable=missing-class-docstring
+    """
+    Data repository hosted on ``figshare.com`` and identified by a DOI
+
+    The list of files is fetched from the Figshare API the first time it is
+    needed and cached afterwards.
+    """
+
+    def __init__(self, doi, archive_url):
+        self.archive_url = archive_url
+        self.doi = doi
+        # Filled in lazily by the ``api_response`` property
+        self._api_response = None
+
+    @classmethod
+    def initialize(cls, doi, archive_url):
+        """
+        Initialize the data repository if the given URL points to a
+        corresponding repository.
+
+        Initializes a data repository object. This is done as part of
+        a chain of responsibility. If the class cannot handle the given
+        repository URL, it returns `None`. Otherwise a `DataRepository`
+        instance is returned.
+
+        Parameters
+        ----------
+        doi : str
+            The DOI that identifies the repository
+        archive_url : str
+            The resolved URL for the DOI
+
+        Returns
+        -------
+        repository : FigshareRepository or None
+            ``None`` when the network location of ``archive_url`` is not
+            ``figshare.com``.
+        """
+
+        # Check whether this is a Figshare URL
+        parsed_archive_url = parse_url(archive_url)
+        if parsed_archive_url["netloc"] != "figshare.com":
+            return None
+
+        return cls(doi, archive_url)
+
+    def _parse_version_from_doi(self):
+        """
+        Parse version from the doi
+
+        The version is read from the last dot-separated part of the DOI suffix
+        (everything after the first ``/``) when it has the form
+        ``v<integer>``.
+
+        Returns
+        -------
+        version : int or None
+            ``None`` if no version is available in the doi.
+        """
+        # Take everything after the first "/" as the suffix. The suffix may
+        # contain further "/" characters (or the DOI may have none at all), so
+        # don't unpack a plain split into exactly two parts.
+        _, _, suffix = self.doi.partition("/")
+        # Split the suffix by dots and keep the last part
+        last_part = suffix.rsplit(".", 1)[-1]
+        # Also covers an empty last part (e.g. a DOI that ends with a dot)
+        if not last_part.startswith("v"):
+            return None
+        try:
+            return int(last_part[1:])
+        except ValueError:
+            # A "v" that isn't followed by a number is not a version tag
+            return None
+
+    @property
+    def api_response(self):
+        """
+        Cached API response from Figshare
+
+        The first access performs two requests: one to find the article that
+        matches the DOI and one to get the files of that article (for the
+        version given in the DOI, or for the latest version if the DOI doesn't
+        specify one, in which case a ``UserWarning`` is issued).
+
+        Returns
+        -------
+        files : list
+            The ``"files"`` entry of the article returned by the Figshare API.
+
+        Raises
+        ------
+        requests.HTTPError
+            If any of the two requests returns an error status code.
+        ValueError
+            If Figshare doesn't return any article for the DOI.
+        """
+        if self._api_response is None:
+            # Lazy import requests to speed up import time
+            import requests  # pylint: disable=C0415
+
+            # Use the figshare API to find the article ID from the DOI
+            search = requests.get(
+                f"https://api.figshare.com/v2/articles?doi={self.doi}",
+                timeout=DEFAULT_TIMEOUT,
+            )
+            # Fail on HTTP errors here instead of on a confusing lookup error
+            # while reading the body of an error response
+            search.raise_for_status()
+            articles = search.json()
+            if not articles:
+                raise ValueError(
+                    f"No article found in Figshare for {self.archive_url} "
+                    f"(doi:{self.doi})."
+                )
+            article_id = articles[0]["id"]
+            # Parse desired version from the doi
+            version = self._parse_version_from_doi()
+            # With the ID and version, we can get a list of files and their
+            # download links
+            if version is None:
+                # Figshare returns the latest version available when no version
+                # is specified through the DOI.
+                warnings.warn(
+                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
+                    "the repository should be used. "
+                    "Figshare will point to the latest version available.",
+                    UserWarning,
+                )
+                # Define API url using only the article id
+                # (figshare will resolve the latest version)
+                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
+            else:
+                # Define API url using article id and the desired version
+                # Get list of files using article id and the version
+                api_url = (
+                    "https://api.figshare.com/v2/articles/"
+                    f"{article_id}/versions/{version}"
+                )
+            # Make the request and return the files in the figshare repository
+            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
+            response.raise_for_status()
+            self._api_response = response.json()["files"]
+
+        return self._api_response
+
+    def download_url(self, file_name):
+        """
+        Use the repository API to get the download URL for a file given
+        the archive URL.
+
+        Parameters
+        ----------
+        file_name : str
+            The name of the file in the archive that will be downloaded.
+
+        Returns
+        -------
+        download_url : str
+            The HTTP URL that can be used to download the file.
+
+        Raises
+        ------
+        ValueError
+            If ``file_name`` is not among the files listed in the API
+            response.
+        """
+        files = {item["name"]: item for item in self.api_response}
+        if file_name not in files:
+            raise ValueError(
+                f"File '{file_name}' not found in data archive "
+                f"{self.archive_url} (doi:{self.doi})."
+            )
+        download_url = files[file_name]["download_url"]
+        return download_url
+
+    def populate_registry(self, pooch):
+        """
+        Populate the registry using the data repository's API
+
+        Every file listed in the API response is added to ``pooch.registry``
+        with its md5 sum as checksum.
+
+        Parameters
+        ----------
+        pooch : Pooch
+            The pooch instance that the registry will be added to.
+        """
+
+        for filedata in self.api_response:
+            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
+
+
+class DataverseRepository(DataRepository):  # pylint: disable=missing-class-docstring
+    """
+    Data repository hosted on a DataVerse instance and identified by a DOI
+
+    The response of the datasets API of the instance is cached on the object.
+    """
+
+    def __init__(self, doi, archive_url):
+        self.doi = doi
+        self.archive_url = archive_url
+        # Set through the ``api_response`` property (setter or lazy request)
+        self._api_response = None
+
+    @classmethod
+    def initialize(cls, doi, archive_url):
+        """
+        Initialize the data repository if the given URL points to a
+        corresponding repository.
+
+        Initializes a data repository object. This is done as part of
+        a chain of responsibility. If the class cannot handle the given
+        repository URL, it returns `None`. Otherwise a `DataRepository`
+        instance is returned.
+
+        Parameters
+        ----------
+        doi : str
+            The DOI that identifies the repository
+        archive_url : str
+            The resolved URL for the DOI
+
+        Returns
+        -------
+        repository : DataverseRepository or None
+            ``None`` when the API request answers with a 4xx or 5xx status
+            code.
+        """
+        # Query the host as if it was a DataVerse instance
+        response = cls._get_api_response(doi, archive_url)
+        status = response.status_code
+
+        # An error status most likely means this is not a DataVerse instance
+        if status >= 400 and status < 600:
+            return None
+
+        # Keep the response we already got so it isn't requested again
+        instance = cls(doi, archive_url)
+        instance.api_response = response
+        return instance
+
+    @classmethod
+    def _get_api_response(cls, doi, archive_url):
+        """
+        Perform the actual API request
+
+        This has been separated into a separate ``classmethod``, as it can be
+        used prior and after the initialization.
+
+        Returns the response object of the request without checking its
+        status code.
+        """
+        # Lazy import requests to speed up import time
+        import requests  # pylint: disable=C0415
+
+        url_parts = parse_url(archive_url)
+        endpoint = (
+            f"{url_parts['protocol']}://{url_parts['netloc']}/api/datasets/"
+            f":persistentId?persistentId=doi:{doi}"
+        )
+        return requests.get(endpoint, timeout=DEFAULT_TIMEOUT)
+
+    @property
+    def api_response(self):
+        """Cached API response from a DataVerse instance"""
+
+        if self._api_response is None:
+            # Nothing cached yet: perform the request now
+            self._api_response = self._get_api_response(
+                self.doi, self.archive_url
+            )  # pragma: no cover
+
+        return self._api_response
+
+    @api_response.setter
+    def api_response(self, response):
+        """Update the cached API response"""
+
+        self._api_response = response
+
+    def download_url(self, file_name):
+        """
+        Use the repository API to get the download URL for a file given
+        the archive URL.
+
+        Parameters
+        ----------
+        file_name : str
+            The name of the file in the archive that will be downloaded.
+
+        Returns
+        -------
+        download_url : str
+            The HTTP URL that can be used to download the file.
+
+        Raises
+        ------
+        ValueError
+            If ``file_name`` is not among the files of the latest version
+            listed in the API response.
+        """
+        url_parts = parse_url(self.archive_url)
+        payload = self.api_response.json()
+        # Index the data files of the latest version by their file name
+        datafiles = {}
+        for entry in payload["data"]["latestVersion"]["files"]:
+            datafile = entry["dataFile"]
+            datafiles[datafile["filename"]] = datafile
+        if file_name not in datafiles:
+            raise ValueError(
+                f"File '{file_name}' not found in data archive "
+                f"{self.archive_url} (doi:{self.doi})."
+            )
+        # The file is served by the access API under its file id
+        host = f"{url_parts['protocol']}://{url_parts['netloc']}"
+        return f"{host}/api/access/datafile/{datafiles[file_name]['id']}"
+
+    def populate_registry(self, pooch):
+        """
+        Populate the registry using the data repository's API
+
+        Every file of the latest version listed in the API response is added
+        to ``pooch.registry`` with its md5 sum as checksum.
+
+        Parameters
+        ----------
+        pooch : Pooch
+            The pooch instance that the registry will be added to.
+        """
+
+        payload = self.api_response.json()
+        for entry in payload["data"]["latestVersion"]["files"]:
+            datafile = entry["dataFile"]
+            pooch.registry[datafile["filename"]] = f"md5:{datafile['md5']}"
author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children