Mercurial > repos > rliterman > csp2

# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
The classes that actually handle the downloads.
"""
import os
import sys
import ftplib

import warnings

from .utils import parse_url

try:
    from tqdm import tqdm
except ImportError:
    tqdm = None

try:
    import paramiko
except ImportError:
    paramiko = None


# Set the default timeout in seconds so it can be configured in a pinch for the
# methods that don't or can't expose a way set it at runtime.
# See https://github.com/fatiando/pooch/issues/409
DEFAULT_TIMEOUT = 30


def choose_downloader(url, progressbar=False):
    """
    Choose the appropriate downloader for the given URL based on the protocol.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        A downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class: `pooch.SFTPDownloader`.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    known_downloaders = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }

    parsed_url = parse_url(url)
    if parsed_url["protocol"] not in known_downloaders:
        raise ValueError(
            f"Unrecognized URL protocol '{parsed_url['protocol']}' in '{url}'. "
            f"Must be one of {known_downloaders.keys()}."
        )
    downloader = known_downloaders[parsed_url["protocol"]](progressbar=progressbar)
    return downloader


class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at one. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake a login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        kwargs.setdefault("stream", True)
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            response.raise_for_status()
            content = response.iter_content(chunk_size=self.chunk_size)
            total = int(response.headers.get("content-length", 0))
            if self.progressbar is True:
                # Need to use ascii characters on Windows because there isn't
                # always full unicode support
                # (see https://github.com/tqdm/tqdm/issues/454)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=total,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            elif self.progressbar:
                progress = self.progressbar
                progress.total = total
            for chunk in content:
                if chunk:
                    output_file.write(chunk)
                    output_file.flush()
                    if self.progressbar:
                        # Use the chunk size here because chunk may be much
                        # larger if the data are decompressed by requests after
                        # reading (happens with text files).
                        progress.update(self.chunk_size)
            # Make sure the progress bar gets filled even if the actual number
            # is chunks is smaller than expected. This happens when streaming
            # text files that are compressed by the server when sending (gzip).
            # Binary files don't experience this.
            if self.progressbar:
                progress.reset()
                progress.update(total)
                progress.close()
        finally:
            if ispath:
                output_file.close()
        return None


class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. **Custom progress bars are not yet supported.**
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at one. Usually doesn't need to be changed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        try:
            ftp.login(user=self.username, passwd=self.password, acct=self.account)
            command = f"RETR {parsed_url['path']}"
            if self.progressbar:
                # Make sure the file is set to binary mode, otherwise we can't
                # get the file size. See: https://stackoverflow.com/a/22093848
                ftp.voidcmd("TYPE I")
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=int(ftp.size(parsed_url["path"])),
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(data):
                        "Update the progress bar and write to output"
                        progress.update(len(data))
                        output_file.write(data)

                    ftp.retrbinary(command, callback, blocksize=self.chunk_size)
            else:
                ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size)
        finally:
            ftp.quit()
            if ispath:
                output_file.close()
        return None


class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard
        error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
        be installed.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            sftp.get_channel().settimeout = self.timeout
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            if self.progressbar:
                with progress:

                    def callback(current, total):
                        "Update the progress bar and write to output"
                        progress.total = int(total)
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            connection.close()
            if sftp is not None:
                sftp.close()


class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. Allowing "URL"s  to be specified with the DOI instead of the actual
    HTTP download link. Uses the :mod:`requests` library to manage downloads
    and interact with the APIs.

    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories **will not work** since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at one. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        Uses the repository's API to determine the actual HTTP download URL
        from the given DOI.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """

        parsed_url = parse_url(url)
        data_repository = doi_to_repository(parsed_url["netloc"])

        # Resolve the URL
        file_name = parsed_url["path"]
        # remove the leading slash in the path
        if file_name[0] == "/":
            file_name = file_name[1:]
        download_url = data_repository.download_url(file_name)

        # Instantiate the downloader object
        downloader = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        downloader(download_url, output_file, pooch)


def doi_to_url(doi):
    """
    Follow a DOI link to resolve the URL of the archive.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # Use doi.org to resolve the DOI to the repository website.
    response = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
    url = response.url
    if 400 <= response.status_code < 600:
        raise ValueError(
            f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
        )
    return url


def doi_to_repository(doi):
    """
    Instantiate a data repository instance from a given DOI.

    This function implements the chain of responsibility dispatch
    to the correct data repository class.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object
    """

    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    if doi[-1] == "/":
        doi = doi[:-1]

    repositories = [
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    ]

    # Extract the DOI and the repository information
    archive_url = doi_to_url(doi)

    # Try the converters one by one until one of them returned a URL
    data_repository = None
    for repo in repositories:
        if data_repository is None:
            data_repository = repo.initialize(
                archive_url=archive_url,
                doi=doi,
            )

    if data_repository is None:
        repository = parse_url(archive_url)["netloc"]
        raise ValueError(
            f"Invalid data repository '{repository}'. "
            "To request or contribute support for this repository, "
            "please open an issue at https://github.com/fatiando/pooch/issues"
        )

    return data_repository


class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        raise NotImplementedError  # pragma: no cover


class ZenodoRepository(DataRepository):  # pylint: disable=missing-class-docstring
    base_api_url = "https://zenodo.org/api/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        self._api_response = None
        self._api_version = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        # Check whether this is a Zenodo URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "zenodo.org":
            return None

        return cls(doi, archive_url)

    @property
    def api_response(self):
        """Cached API response from Zenodo"""
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            article_id = self.archive_url.split("/")[-1]
            self._api_response = requests.get(
                f"{self.base_api_url}/{article_id}",
                timeout=DEFAULT_TIMEOUT,
            ).json()

        return self._api_response

    @property
    def api_version(self):
        """
        Version of the Zenodo API we are interacting with

        The versions can either be :

        - ``"legacy"``: corresponds to the Zenodo API that was supported until
          2023-10-12 (before the migration to InvenioRDM).
        - ``"new"``: corresponds to the new API that went online on 2023-10-13
          after the migration to InvenioRDM.

        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
        one and could probably be replaced by an updated version that restores
        the behaviour of the ``"legacy"`` one.

        Returns
        -------
        str
        """
        if self._api_version is None:
            if all("key" in file for file in self.api_response["files"]):
                self._api_version = "legacy"
            elif all("filename" in file for file in self.api_response["files"]):
                self._api_version = "new"
            else:
                raise ValueError(
                    "Couldn't determine the version of the Zenodo API for "
                    f"{self.archive_url} (doi:{self.doi})."
                )
        return self._api_version

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        link to the desired files that appears in the API response leads to 404
        errors (by 2023-10-17). The files are available in the following url:
        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

        This method supports both the legacy and the new API.
        """
        # Create list of files in the repository
        if self.api_version == "legacy":
            files = {item["key"]: item for item in self.api_response["files"]}
        else:
            files = [item["filename"] for item in self.api_response["files"]]
        # Check if file exists in the repository
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Build download url
        if self.api_version == "legacy":
            download_url = files[file_name]["links"]["self"]
        else:
            article_id = self.api_response["id"]
            download_url = (
                f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"
            )
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        checksums for each file listed in the API reference is now an md5 sum.

        This method supports both the legacy and the new API.
        """
        for filedata in self.api_response["files"]:
            checksum = filedata["checksum"]
            if self.api_version == "legacy":
                key = "key"
            else:
                key = "filename"
                checksum = f"md5:{checksum}"
            pooch.registry[filedata[key]] = checksum


class FigshareRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        # Check whether this is a Figshare URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        Return None if version is not available in the doi.
        """
        # Get suffix of the doi
        _, suffix = self.doi.split("/")
        # Split the suffix by dots and keep the last part
        last_part = suffix.split(".")[-1]
        # Parse the version from the last part
        if last_part[0] != "v":
            return None
        version = int(last_part[1:])
        return version

    @property
    def api_response(self):
        """Cached API response from Figshare"""
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            article = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=DEFAULT_TIMEOUT,
            ).json()[0]
            article_id = article["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                # Get list of files using article id and the version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and return the files in the figshare repository
            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.
        """
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
            )
        download_url = files[file_name]["download_url"]
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"


class DataverseRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """
        # Access the DOI as if this was a DataVerse instance
        response = cls._get_api_response(doi, archive_url)

        # If we failed, this is probably not a DataVerse instance
        if 400 <= response.status_code < 600:
            return None

        # Initialize the repository and overwrite the api response
        repository = cls(doi, archive_url)
        repository.api_response = response
        return repository

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Perform the actual API request

        This has been separated into a separate ``classmethod``, as it can be
        used prior and after the initialization.
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        parsed = parse_url(archive_url)
        response = requests.get(
            f"{parsed['protocol']}://{parsed['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}",
            timeout=DEFAULT_TIMEOUT,
        )
        return response

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""

        if self._api_response is None:
            self._api_response = self._get_api_response(
                self.doi, self.archive_url
            )  # pragma: no cover

        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Update the cached API response"""

        self._api_response = response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.
        """
        parsed = parse_url(self.archive_url)
        response = self.api_response.json()
        files = {
            file["dataFile"]["filename"]: file["dataFile"]
            for file in response["data"]["latestVersion"]["files"]
        }
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Generate download_url using the file id
        download_url = (
            f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/"
            f"{files[file_name]['id']}"
        )
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        for filedata in self.api_response.json()["data"]["latestVersion"]["files"]:
            pooch.registry[filedata["dataFile"]["filename"]] = (
                f"md5:{filedata['dataFile']['md5']}"
            )
author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children