jpayne@68: # Copyright (c) 2018 The Pooch Developers.
jpayne@68: # Distributed under the terms of the BSD 3-Clause License.
jpayne@68: # SPDX-License-Identifier: BSD-3-Clause
jpayne@68: #
jpayne@68: # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
jpayne@68: #
jpayne@68: """
jpayne@68: The classes that actually handle the downloads.
jpayne@68: """
jpayne@68: import os
jpayne@68: import sys
jpayne@68: import ftplib
jpayne@68: 
jpayne@68: import warnings
jpayne@68: 
jpayne@68: from .utils import parse_url
jpayne@68: 
jpayne@68: try:
jpayne@68:     from tqdm import tqdm
jpayne@68: except ImportError:
jpayne@68:     tqdm = None
jpayne@68: 
jpayne@68: try:
jpayne@68:     import paramiko
jpayne@68: except ImportError:
jpayne@68:     paramiko = None
jpayne@68: 
jpayne@68: 
jpayne@68: # Set the default timeout in seconds so it can be configured in a pinch for the
jpayne@68: # methods that don't or can't expose a way to set it at runtime.
jpayne@68: # See https://github.com/fatiando/pooch/issues/409
jpayne@68: DEFAULT_TIMEOUT = 30
jpayne@68: 
jpayne@68: 
def choose_downloader(url, progressbar=False):
    """
    Pick and instantiate the downloader that matches the protocol of a URL.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`,
        created with the given *progressbar*.

    Raises
    ------
    ValueError
        If the protocol of *url* is not one of the known protocols.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    # Protocol name -> downloader class. The order is kept because the keys
    # are printed in the error message below.
    known_downloaders = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }

    parsed_url = parse_url(url)
    downloader_class = known_downloaders.get(parsed_url["protocol"])
    if downloader_class is None:
        raise ValueError(
            f"Unrecognized URL protocol '{parsed_url['protocol']}' in '{url}'. "
            f"Must be one of {known_downloaders.keys()}."
        )
    return downloader_class(progressbar=progressbar)
jpayne@68: 
jpayne@68: 
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Raises
    ------
    ValueError
        If ``progressbar=True`` and ``tqdm`` is not installed.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # Only the built-in bar (progressbar=True) needs tqdm; a custom
        # progress bar object is used as given.
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded. Any
            object with a ``write`` attribute is treated as an open file and
            is not closed by this method.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise. Only the ``timeout`` keyword
            argument given at construction is used for this request.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        # Work on a copy so the keyword arguments stored on the instance are
        # not modified between calls.
        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        kwargs.setdefault("stream", True)
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        response = None
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            response.raise_for_status()
            content = response.iter_content(chunk_size=self.chunk_size)
            total = int(response.headers.get("content-length", 0))
            if self.progressbar is True:
                # Need to use ascii characters on Windows because there isn't
                # always full unicode support
                # (see https://github.com/tqdm/tqdm/issues/454)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=total,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            elif self.progressbar:
                progress = self.progressbar
                progress.total = total
            for chunk in content:
                if chunk:
                    output_file.write(chunk)
                    output_file.flush()
                    if self.progressbar:
                        # Use the chunk size here because chunk may be much
                        # larger if the data are decompressed by requests after
                        # reading (happens with text files).
                        progress.update(self.chunk_size)
            # Make sure the progress bar gets filled even if the actual number
            # of chunks is smaller than expected. This happens when streaming
            # text files that are compressed by the server when sending (gzip).
            # Binary files don't experience this.
            if self.progressbar:
                progress.reset()
                progress.update(total)
                progress.close()
        finally:
            try:
                # A streamed response keeps its connection open until the body
                # is fully read or the response is closed, so always close it
                # (this matters when an error interrupts the download).
                if response is not None:
                    response.close()
            finally:
                # Only close files that were opened here.
                if ispath:
                    output_file.close()
        return None
jpayne@68: 
jpayne@68: 
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. **Custom progress bars are not yet supported.**
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    Raises
    ------
    ValueError
        If ``progressbar=True`` and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded. Any
            object with a ``write`` attribute is treated as an open file and
            is not closed by this method.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                # The file is available if it is listed in its directory.
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        ispath = not hasattr(output_file, "write")
        # Two nested cleanups: the outer one always ends the FTP session (even
        # if opening the output file fails), the inner one always closes a
        # file opened here (even if ending the FTP session later fails).
        try:
            if ispath:
                # pylint: disable=consider-using-with
                output_file = open(output_file, "w+b")
                # pylint: enable=consider-using-with
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                command = f"RETR {parsed_url['path']}"
                if self.progressbar:
                    # Make sure the file is set to binary mode, otherwise we
                    # can't get the file size.
                    # See: https://stackoverflow.com/a/22093848
                    ftp.voidcmd("TYPE I")
                    use_ascii = bool(sys.platform == "win32")
                    progress = tqdm(
                        total=int(ftp.size(parsed_url["path"])),
                        ncols=79,
                        ascii=use_ascii,
                        unit="B",
                        unit_scale=True,
                        leave=True,
                    )
                    with progress:

                        def callback(data):
                            "Update the progress bar and write to output"
                            progress.update(len(data))
                            output_file.write(data)

                        ftp.retrbinary(command, callback, blocksize=self.chunk_size)
                else:
                    ftp.retrbinary(
                        command, output_file.write, blocksize=self.chunk_size
                    )
            finally:
                if ispath:
                    output_file.close()
        finally:
            ftp.quit()
        return None
jpayne@68: 
jpayne@68: 
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance but not used when connecting (only *username*
        and *password* are passed to the server).
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True (or any other truthy value), will print a
        `tqdm <https://github.com/tqdm/tqdm>`__ progress bar of the download
        to standard error (stderr). Requires tqdm to be installed.

    Raises
    ------
    ValueError
        If ``paramiko`` is not installed, or if a progress bar is requested
        and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # settimeout is a method of the channel: it has to be called for
            # the timeout to take effect (None disables the timeout).
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Update the progress bar with the bytes transferred"
                        progress.total = int(total)
                        # The callback gets the running total, tqdm wants the
                        # increment since the last update.
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            # Close the SFTP client before the transport it runs on and make
            # sure the transport is closed even if closing the client fails.
            try:
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
jpayne@68: 
jpayne@68: 
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. This allows "URL"s to be specified with the DOI instead of the
    actual HTTP download link. Uses the :mod:`requests` library to manage
    downloads and interact with the APIs.

    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories **will not work** since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        are forwarded to :class:`pooch.HTTPDownloader`, which passes them to
        :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        # Everything is stored as given and only used when the downloader is
        # called.
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        self.kwargs = kwargs

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        The repository's API is used to work out the actual HTTP download URL
        from the given DOI. The download itself is delegated to
        :class:`pooch.HTTPDownloader` (which uses :func:`requests.get`).

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """
        url_parts = parse_url(url)
        repository = doi_to_repository(url_parts["netloc"])

        # The file name is the path without its leading slash.
        path = url_parts["path"]
        file_name = path[1:] if path[0] == "/" else path
        resolved_url = repository.download_url(file_name)

        # Hand the resolved HTTP link over to a regular HTTP downloader that
        # is configured like this one.
        http_downloader = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        http_downloader(resolved_url, output_file, pooch)
jpayne@68: 
jpayne@68: 
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of its archive by following the doi.org link.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    Raises
    ------
    ValueError
        If the request to doi.org ends with a 4xx or 5xx status code.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # doi.org redirects to the repository website, so the final URL of the
    # response is the archive URL.
    response = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
    url = response.url
    resolved = not 400 <= response.status_code < 600
    if resolved:
        return url
    raise ValueError(
        f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
    )
jpayne@68: 
jpayne@68: 
def doi_to_repository(doi):
    """
    Create the data repository object that can handle a given DOI.

    The DOI is resolved to the URL of its archive and each known repository
    class is asked in turn (chain of responsibility) whether it can handle
    that archive. The first one that can is returned.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the known repository classes can handle the archive.
    """

    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    doi = doi[:-1] if doi[-1] == "/" else doi

    # Candidates, in the order in which they are tried
    repositories = [
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    ]

    # Find out where the DOI points to
    archive_url = doi_to_url(doi)

    # A repository class returns None when it can't handle the archive, so
    # stop at the first one that returns an actual object.
    for repo in repositories:
        data_repository = repo.initialize(
            archive_url=archive_url,
            doi=doi,
        )
        if data_repository is not None:
            return data_repository

    # Nobody could handle the archive
    repository = parse_url(archive_url)["netloc"]
    raise ValueError(
        f"Invalid data repository '{repository}'. "
        "To request or contribute support for this repository, "
        "please open an issue at https://github.com/fatiando/pooch/issues"
    )
jpayne@68: 
jpayne@68: 
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    """
    Common interface of the DOI-based data repositories.

    This base class accepts no URL (``initialize`` always returns ``None``)
    and leaves ``download_url`` and ``populate_registry`` unimplemented.
    """

    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Create the repository object if the URL belongs to this repository.

        This is one link in a chain of responsibility: a class that cannot
        handle the given repository URL answers with `None`, otherwise it
        answers with a `DataRepository` instance. The base implementation
        always answers with `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Get the download URL of a file in the archive from the repository API.

        Not implemented in the base class.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.
        """

        raise NotImplementedError()  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill a registry with the files listed by the data repository's API.

        Not implemented in the base class.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        raise NotImplementedError()  # pragma: no cover
jpayne@68: 
jpayne@68: 
class ZenodoRepository(DataRepository):  # pylint: disable=missing-class-docstring
    """
    Data repository that talks to the Zenodo records API.

    The API response and the detected API flavour are fetched lazily and
    cached on the instance.
    """

    # Endpoint that the record id is appended to when querying the API
    base_api_url = "https://zenodo.org/api/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Both caches are filled on first access of the matching property
        self._api_response = None
        self._api_version = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        # Check whether this is a Zenodo URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "zenodo.org":
            return None

        return cls(doi, archive_url)

    @property
    def api_response(self):
        """
        Cached API response from Zenodo

        The record is requested on first access only. The response is cached
        only if the request succeeded, so a failed request is retried on the
        next access.

        Raises
        ------
        requests.HTTPError
            If Zenodo answers with a 4xx or 5xx status code.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # The record id is the last path component of the archive URL
            article_id = self.archive_url.split("/")[-1]
            response = requests.get(
                f"{self.base_api_url}/{article_id}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Fail here on HTTP errors instead of caching the error payload,
            # which would only surface later as a KeyError on "files".
            response.raise_for_status()
            self._api_response = response.json()

        return self._api_response

    @property
    def api_version(self):
        """
        Version of the Zenodo API we are interacting with

        The versions can either be :

        - ``"legacy"``: corresponds to the Zenodo API that was supported until
          2023-10-12 (before the migration to InvenioRDM).
        - ``"new"``: corresponds to the new API that went online on 2023-10-13
          after the migration to InvenioRDM.

        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
        one and could probably be replaced by an updated version that restores
        the behaviour of the ``"legacy"`` one.

        The version is detected from the keys of the file entries in the API
        response: ``"key"`` in every entry means legacy, ``"filename"`` in
        every entry means new.

        Returns
        -------
        str

        Raises
        ------
        ValueError
            If the file entries match neither of the two layouts.
        """
        if self._api_version is None:
            files = self.api_response["files"]
            if all("key" in file for file in files):
                self._api_version = "legacy"
            elif all("filename" in file for file in files):
                self._api_version = "new"
            else:
                raise ValueError(
                    "Couldn't determine the version of the Zenodo API for "
                    f"{self.archive_url} (doi:{self.doi})."
                )
        return self._api_version

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the file is not listed in the API response.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        link to the desired files that appears in the API response leads to 404
        errors (by 2023-10-17). The files are available in the following url:
        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

        This method supports both the legacy and the new API.
        """
        # Create list of files in the repository
        if self.api_version == "legacy":
            files = {item["key"]: item for item in self.api_response["files"]}
        else:
            files = [item["filename"] for item in self.api_response["files"]]
        # Check if file exists in the repository
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Build download url
        if self.api_version == "legacy":
            # The legacy API hands out a working link for each file
            download_url = files[file_name]["links"]["self"]
        else:
            # The new API needs the link assembled from the record id
            article_id = self.api_response["id"]
            download_url = (
                f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"
            )
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        checksums for each file listed in the API reference is now an md5 sum.

        This method supports both the legacy and the new API.
        """
        for filedata in self.api_response["files"]:
            checksum = filedata["checksum"]
            if self.api_version == "legacy":
                # Legacy checksums are stored as they come from the API
                key = "key"
            else:
                # New checksums get the "md5:" algorithm prefix added here
                key = "filename"
                checksum = f"md5:{checksum}"
            pooch.registry[filedata[key]] = checksum
jpayne@68: 
jpayne@68: 
class FigshareRepository(DataRepository):  # pylint: disable=missing-class-docstring
    """
    Data repository that talks to the Figshare API.

    The list of files of the article is fetched lazily and cached on the
    instance.
    """

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Filled on first access of ``api_response``
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        # Check whether this is a Figshare URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        The version is read from the last dot-separated component of the DOI
        suffix when that component is a ``v`` followed only by decimal
        digits (e.g. ``10.6084/m9.figshare.14763051.v1`` gives ``1``).

        Return None if version is not available in the doi.
        """
        # Get suffix of the doi: everything after the first slash. Splitting
        # only once keeps suffixes that contain slashes in one piece.
        suffix = self.doi.split("/", 1)[-1]
        # Split the suffix by dots and keep the last part
        last_part = suffix.split(".")[-1]
        # Parse the version from the last part. Anything that isn't "v"
        # plus decimal digits is not a version, so report None instead of
        # letting int() or an index lookup raise.
        number = last_part[1:]
        if not last_part.startswith("v") or not number.isdecimal():
            return None
        return int(number)

    @property
    def api_response(self):
        """
        Cached API response from Figshare

        Two requests are made on first access: one to find the article from
        the DOI and one to list the files of that article (for the version
        given in the DOI, if any). The list of files is what gets cached.

        Raises
        ------
        requests.HTTPError
            If either request is answered with a 4xx or 5xx status code.
        ValueError
            If the DOI lookup returns no article.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            lookup = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Same status check as for the files request below
            lookup.raise_for_status()
            articles = lookup.json()
            # Indexing an empty result would raise a bare IndexError
            if not articles:
                raise ValueError(
                    f"No Figshare article found for doi:{self.doi} "
                    f"(see {self.archive_url}). Is the DOI correct?"
                )
            article_id = articles[0]["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                # Get list of files using article id and the version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and return the files in the figshare repository
            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the file is not listed in the API response.
        """
        # Index the file entries by name
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        download_url = files[file_name]["download_url"]
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Every file is registered under its name with its ``computed_md5``
        value prefixed by ``md5:``.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
jpayne@68: 
jpayne@68: 
class DataverseRepository(DataRepository):  # pylint: disable=missing-class-docstring
    """
    Data repository that talks to the API of a DataVerse instance.

    The host of the instance is taken from the archive URL. The raw API
    response object is cached on the instance.
    """

    def __init__(self, doi, archive_url):
        self.doi = doi
        self.archive_url = archive_url
        # Raw response object; set by ``initialize`` or fetched on demand
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Create the repository object if the URL belongs to a DataVerse
        instance.

        This is one link in a chain of responsibility: the dataset API is
        queried on the host of the archive URL and, if the response has a
        4xx or 5xx status code, `None` is returned. Otherwise a
        `DataRepository` instance is returned with that response already
        stored as its cached API response.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """
        # Query the host as if it was a DataVerse instance
        probe = cls._get_api_response(doi, archive_url)

        # An error status means this is probably not a DataVerse instance
        request_failed = 400 <= probe.status_code < 600
        if request_failed:
            return None

        # Reuse the response we already have as the cached API response
        instance = cls(doi, archive_url)
        instance.api_response = probe
        return instance

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Send the dataset request to the API and return the response object

        Kept as its own ``classmethod`` so that it can be used both before
        and after an instance exists.
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        url_parts = parse_url(archive_url)
        host = f"{url_parts['protocol']}://{url_parts['netloc']}"
        endpoint = f"{host}/api/datasets/:persistentId?persistentId=doi:{doi}"
        return requests.get(endpoint, timeout=DEFAULT_TIMEOUT)

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""

        if self._api_response is None:
            self._api_response = self._get_api_response(
                self.doi, self.archive_url
            )  # pragma: no cover

        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Replace the cached API response"""

        self._api_response = response

    def download_url(self, file_name):
        """
        Get the download URL of a file in the archive from the repository API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the file is not listed in the latest version of the dataset.
        """
        url_parts = parse_url(self.archive_url)
        listing = self.api_response.json()["data"]["latestVersion"]["files"]

        # Index the data file records by file name
        datafiles = {}
        for entry in listing:
            datafiles[entry["dataFile"]["filename"]] = entry["dataFile"]

        if file_name not in datafiles:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )

        # The access endpoint addresses files by their id
        host = f"{url_parts['protocol']}://{url_parts['netloc']}"
        file_id = datafiles[file_name]["id"]
        return f"{host}/api/access/datafile/{file_id}"

    def populate_registry(self, pooch):
        """
        Fill a registry with the files listed by the data repository's API.

        Every file of the latest version is registered under its file name
        with its ``md5`` value prefixed by ``md5:``.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        listing = self.api_response.json()["data"]["latestVersion"]["files"]
        for entry in listing:
            datafile = entry["dataFile"]
            pooch.registry[datafile["filename"]] = f"md5:{datafile['md5']}"