jpayne@68: # Copyright (c) 2018 The Pooch Developers. jpayne@68: # Distributed under the terms of the BSD 3-Clause License. jpayne@68: # SPDX-License-Identifier: BSD-3-Clause jpayne@68: # jpayne@68: # This code is part of the Fatiando a Terra project (https://www.fatiando.org) jpayne@68: # jpayne@68: """ jpayne@68: The classes that actually handle the downloads. jpayne@68: """ jpayne@68: import os jpayne@68: import sys jpayne@68: import ftplib jpayne@68: jpayne@68: import warnings jpayne@68: jpayne@68: from .utils import parse_url jpayne@68: jpayne@68: try: jpayne@68: from tqdm import tqdm jpayne@68: except ImportError: jpayne@68: tqdm = None jpayne@68: jpayne@68: try: jpayne@68: import paramiko jpayne@68: except ImportError: jpayne@68: paramiko = None jpayne@68: jpayne@68: jpayne@68: # Set the default timeout in seconds so it can be configured in a pinch for the jpayne@68: # methods that don't or can't expose a way set it at runtime. jpayne@68: # See https://github.com/fatiando/pooch/issues/409 jpayne@68: DEFAULT_TIMEOUT = 30 jpayne@68: jpayne@68: jpayne@68: def choose_downloader(url, progressbar=False): jpayne@68: """ jpayne@68: Choose the appropriate downloader for the given URL based on the protocol. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: url : str jpayne@68: A URL (including protocol). jpayne@68: progressbar : bool or an arbitrary progress bar object jpayne@68: If True, will print a progress bar of the download to standard error jpayne@68: (stderr). Requires `tqdm `__ to be jpayne@68: installed. Alternatively, an arbitrary progress bar object can be jpayne@68: passed. See :ref:`custom-progressbar` for details. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: downloader jpayne@68: A downloader class, like :class:`pooch.HTTPDownloader`, jpayne@68: :class:`pooch.FTPDownloader`, or :class: `pooch.SFTPDownloader`. jpayne@68: jpayne@68: Examples jpayne@68: -------- jpayne@68: jpayne@68: >>> downloader = choose_downloader("http://something.com") jpayne@68: >>> print(downloader.__class__.__name__) jpayne@68: HTTPDownloader jpayne@68: >>> downloader = choose_downloader("https://something.com") jpayne@68: >>> print(downloader.__class__.__name__) jpayne@68: HTTPDownloader jpayne@68: >>> downloader = choose_downloader("ftp://something.com") jpayne@68: >>> print(downloader.__class__.__name__) jpayne@68: FTPDownloader jpayne@68: >>> downloader = choose_downloader("doi:DOI/filename.csv") jpayne@68: >>> print(downloader.__class__.__name__) jpayne@68: DOIDownloader jpayne@68: jpayne@68: """ jpayne@68: known_downloaders = { jpayne@68: "ftp": FTPDownloader, jpayne@68: "https": HTTPDownloader, jpayne@68: "http": HTTPDownloader, jpayne@68: "sftp": SFTPDownloader, jpayne@68: "doi": DOIDownloader, jpayne@68: } jpayne@68: jpayne@68: parsed_url = parse_url(url) jpayne@68: if parsed_url["protocol"] not in known_downloaders: jpayne@68: raise ValueError( jpayne@68: f"Unrecognized URL protocol '{parsed_url['protocol']}' in '{url}'. " jpayne@68: f"Must be one of {known_downloaders.keys()}." jpayne@68: ) jpayne@68: downloader = known_downloaders[parsed_url["protocol"]](progressbar=progressbar) jpayne@68: return downloader jpayne@68: jpayne@68: jpayne@68: class HTTPDownloader: # pylint: disable=too-few-public-methods jpayne@68: """ jpayne@68: Download manager for fetching files over HTTP/HTTPS. jpayne@68: jpayne@68: When called, downloads the given file URL into the specified local file. jpayne@68: Uses the :mod:`requests` library to manage downloads. jpayne@68: jpayne@68: Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize jpayne@68: the download of files (for example, to use authentication or print a jpayne@68: progress bar). jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: progressbar : bool or an arbitrary progress bar object jpayne@68: If True, will print a progress bar of the download to standard error jpayne@68: (stderr). Requires `tqdm `__ to be jpayne@68: installed. Alternatively, an arbitrary progress bar object can be jpayne@68: passed. See :ref:`custom-progressbar` for details. jpayne@68: chunk_size : int jpayne@68: Files are streamed *chunk_size* bytes at a time instead of loading jpayne@68: everything into memory at one. Usually doesn't need to be changed. jpayne@68: **kwargs jpayne@68: All keyword arguments given when creating an instance of this class jpayne@68: will be passed to :func:`requests.get`. jpayne@68: jpayne@68: Examples jpayne@68: -------- jpayne@68: jpayne@68: Download one of the data files from the Pooch repository: jpayne@68: jpayne@68: >>> import os jpayne@68: >>> from pooch import __version__, check_version jpayne@68: >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt" jpayne@68: >>> url = url.format(check_version(__version__, fallback="main")) jpayne@68: >>> downloader = HTTPDownloader() jpayne@68: >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch jpayne@68: >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) jpayne@68: >>> os.path.exists("tiny-data.txt") jpayne@68: True jpayne@68: >>> with open("tiny-data.txt") as f: jpayne@68: ... print(f.read().strip()) jpayne@68: # A tiny data file for test purposes only jpayne@68: 1 2 3 4 5 6 jpayne@68: >>> os.remove("tiny-data.txt") jpayne@68: jpayne@68: Authentication can be handled by passing a user name and password to jpayne@68: :func:`requests.get`. All arguments provided when creating an instance of jpayne@68: the class are forwarded to :func:`requests.get`. We'll use jpayne@68: ``auth=(username, password)`` to use basic HTTPS authentication. The jpayne@68: https://httpbin.org website allows us to make a fake a login request using jpayne@68: whatever username and password we provide to it: jpayne@68: jpayne@68: >>> user = "doggo" jpayne@68: >>> password = "goodboy" jpayne@68: >>> # httpbin will ask for the user and password we provide in the URL jpayne@68: >>> url = f"https://httpbin.org/basic-auth/{user}/{password}" jpayne@68: >>> # Trying without the login credentials causes an error jpayne@68: >>> downloader = HTTPDownloader() jpayne@68: >>> try: jpayne@68: ... downloader(url=url, output_file="tiny-data.txt", pooch=None) jpayne@68: ... except Exception: jpayne@68: ... print("There was an error!") jpayne@68: There was an error! jpayne@68: >>> # Pass in the credentials to HTTPDownloader jpayne@68: >>> downloader = HTTPDownloader(auth=(user, password)) jpayne@68: >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) jpayne@68: >>> with open("tiny-data.txt") as f: jpayne@68: ... for line in f: jpayne@68: ... print(line.rstrip()) jpayne@68: { jpayne@68: "authenticated": true, jpayne@68: "user": "doggo" jpayne@68: } jpayne@68: >>> os.remove("tiny-data.txt") jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, progressbar=False, chunk_size=1024, **kwargs): jpayne@68: self.kwargs = kwargs jpayne@68: self.progressbar = progressbar jpayne@68: self.chunk_size = chunk_size jpayne@68: if self.progressbar is True and tqdm is None: jpayne@68: raise ValueError("Missing package 'tqdm' required for progress bars.") jpayne@68: jpayne@68: def __call__( jpayne@68: self, url, output_file, pooch, check_only=False jpayne@68: ): # pylint: disable=R0914 jpayne@68: """ jpayne@68: Download the given URL over HTTP to the given output file. jpayne@68: jpayne@68: Uses :func:`requests.get`. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: url : str jpayne@68: The URL to the file you want to download. jpayne@68: output_file : str or file-like object jpayne@68: Path (and file name) to which the file will be downloaded. jpayne@68: pooch : :class:`~pooch.Pooch` jpayne@68: The instance of :class:`~pooch.Pooch` that is calling this method. jpayne@68: check_only : bool jpayne@68: If True, will only check if a file exists on the server and jpayne@68: **without downloading the file**. Will return ``True`` if the file jpayne@68: exists and ``False`` otherwise. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: availability : bool or None jpayne@68: If ``check_only==True``, returns a boolean indicating if the file jpayne@68: is available on the server. Otherwise, returns ``None``. jpayne@68: jpayne@68: """ jpayne@68: # Lazy import requests to speed up import time jpayne@68: import requests # pylint: disable=C0415 jpayne@68: jpayne@68: if check_only: jpayne@68: timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT) jpayne@68: response = requests.head(url, timeout=timeout, allow_redirects=True) jpayne@68: available = bool(response.status_code == 200) jpayne@68: return available jpayne@68: jpayne@68: kwargs = self.kwargs.copy() jpayne@68: timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT) jpayne@68: kwargs.setdefault("stream", True) jpayne@68: ispath = not hasattr(output_file, "write") jpayne@68: if ispath: jpayne@68: # pylint: disable=consider-using-with jpayne@68: output_file = open(output_file, "w+b") jpayne@68: # pylint: enable=consider-using-with jpayne@68: try: jpayne@68: response = requests.get(url, timeout=timeout, **kwargs) jpayne@68: response.raise_for_status() jpayne@68: content = response.iter_content(chunk_size=self.chunk_size) jpayne@68: total = int(response.headers.get("content-length", 0)) jpayne@68: if self.progressbar is True: jpayne@68: # Need to use ascii characters on Windows because there isn't jpayne@68: # always full unicode support jpayne@68: # (see https://github.com/tqdm/tqdm/issues/454) jpayne@68: use_ascii = bool(sys.platform == "win32") jpayne@68: progress = tqdm( jpayne@68: total=total, jpayne@68: ncols=79, jpayne@68: ascii=use_ascii, jpayne@68: unit="B", jpayne@68: unit_scale=True, jpayne@68: leave=True, jpayne@68: ) jpayne@68: elif self.progressbar: jpayne@68: progress = self.progressbar jpayne@68: progress.total = total jpayne@68: for chunk in content: jpayne@68: if chunk: jpayne@68: output_file.write(chunk) jpayne@68: output_file.flush() jpayne@68: if self.progressbar: jpayne@68: # Use the chunk size here because chunk may be much jpayne@68: # larger if the data are decompressed by requests after jpayne@68: # reading (happens with text files). jpayne@68: progress.update(self.chunk_size) jpayne@68: # Make sure the progress bar gets filled even if the actual number jpayne@68: # is chunks is smaller than expected. This happens when streaming jpayne@68: # text files that are compressed by the server when sending (gzip). jpayne@68: # Binary files don't experience this. jpayne@68: if self.progressbar: jpayne@68: progress.reset() jpayne@68: progress.update(total) jpayne@68: progress.close() jpayne@68: finally: jpayne@68: if ispath: jpayne@68: output_file.close() jpayne@68: return None jpayne@68: jpayne@68: jpayne@68: class FTPDownloader: # pylint: disable=too-few-public-methods jpayne@68: """ jpayne@68: Download manager for fetching files over FTP. jpayne@68: jpayne@68: When called, downloads the given file URL into the specified local file. jpayne@68: Uses the :mod:`ftplib` module to manage downloads. jpayne@68: jpayne@68: Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize jpayne@68: the download of files (for example, to use authentication or print a jpayne@68: progress bar). jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: port : int jpayne@68: Port used for the FTP connection. jpayne@68: username : str jpayne@68: User name used to login to the server. Only needed if the server jpayne@68: requires authentication (i.e., no anonymous FTP). jpayne@68: password : str jpayne@68: Password used to login to the server. Only needed if the server jpayne@68: requires authentication (i.e., no anonymous FTP). Use the empty string jpayne@68: to indicate no password is required. jpayne@68: account : str jpayne@68: Some servers also require an "account" name for authentication. jpayne@68: timeout : int jpayne@68: Timeout in seconds for ftp socket operations, use None to mean no jpayne@68: timeout. jpayne@68: progressbar : bool jpayne@68: If True, will print a progress bar of the download to standard error jpayne@68: (stderr). Requires `tqdm `__ to be jpayne@68: installed. **Custom progress bars are not yet supported.** jpayne@68: chunk_size : int jpayne@68: Files are streamed *chunk_size* bytes at a time instead of loading jpayne@68: everything into memory at one. Usually doesn't need to be changed. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__( jpayne@68: self, jpayne@68: port=21, jpayne@68: username="anonymous", jpayne@68: password="", jpayne@68: account="", jpayne@68: timeout=None, jpayne@68: progressbar=False, jpayne@68: chunk_size=1024, jpayne@68: ): jpayne@68: self.port = port jpayne@68: self.username = username jpayne@68: self.password = password jpayne@68: self.account = account jpayne@68: self.timeout = timeout jpayne@68: self.progressbar = progressbar jpayne@68: self.chunk_size = chunk_size jpayne@68: if self.progressbar is True and tqdm is None: jpayne@68: raise ValueError("Missing package 'tqdm' required for progress bars.") jpayne@68: jpayne@68: def __call__(self, url, output_file, pooch, check_only=False): jpayne@68: """ jpayne@68: Download the given URL over FTP to the given output file. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: url : str jpayne@68: The URL to the file you want to download. jpayne@68: output_file : str or file-like object jpayne@68: Path (and file name) to which the file will be downloaded. jpayne@68: pooch : :class:`~pooch.Pooch` jpayne@68: The instance of :class:`~pooch.Pooch` that is calling this method. jpayne@68: check_only : bool jpayne@68: If True, will only check if a file exists on the server and jpayne@68: **without downloading the file**. Will return ``True`` if the file jpayne@68: exists and ``False`` otherwise. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: availability : bool or None jpayne@68: If ``check_only==True``, returns a boolean indicating if the file jpayne@68: is available on the server. Otherwise, returns ``None``. jpayne@68: jpayne@68: """ jpayne@68: parsed_url = parse_url(url) jpayne@68: ftp = ftplib.FTP(timeout=self.timeout) jpayne@68: ftp.connect(host=parsed_url["netloc"], port=self.port) jpayne@68: jpayne@68: if check_only: jpayne@68: directory, file_name = os.path.split(parsed_url["path"]) jpayne@68: try: jpayne@68: ftp.login(user=self.username, passwd=self.password, acct=self.account) jpayne@68: available = file_name in ftp.nlst(directory) jpayne@68: finally: jpayne@68: ftp.close() jpayne@68: return available jpayne@68: jpayne@68: ispath = not hasattr(output_file, "write") jpayne@68: if ispath: jpayne@68: # pylint: disable=consider-using-with jpayne@68: output_file = open(output_file, "w+b") jpayne@68: # pylint: enable=consider-using-with jpayne@68: try: jpayne@68: ftp.login(user=self.username, passwd=self.password, acct=self.account) jpayne@68: command = f"RETR {parsed_url['path']}" jpayne@68: if self.progressbar: jpayne@68: # Make sure the file is set to binary mode, otherwise we can't jpayne@68: # get the file size. See: https://stackoverflow.com/a/22093848 jpayne@68: ftp.voidcmd("TYPE I") jpayne@68: use_ascii = bool(sys.platform == "win32") jpayne@68: progress = tqdm( jpayne@68: total=int(ftp.size(parsed_url["path"])), jpayne@68: ncols=79, jpayne@68: ascii=use_ascii, jpayne@68: unit="B", jpayne@68: unit_scale=True, jpayne@68: leave=True, jpayne@68: ) jpayne@68: with progress: jpayne@68: jpayne@68: def callback(data): jpayne@68: "Update the progress bar and write to output" jpayne@68: progress.update(len(data)) jpayne@68: output_file.write(data) jpayne@68: jpayne@68: ftp.retrbinary(command, callback, blocksize=self.chunk_size) jpayne@68: else: jpayne@68: ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size) jpayne@68: finally: jpayne@68: ftp.quit() jpayne@68: if ispath: jpayne@68: output_file.close() jpayne@68: return None jpayne@68: jpayne@68: jpayne@68: class SFTPDownloader: # pylint: disable=too-few-public-methods jpayne@68: """ jpayne@68: Download manager for fetching files over SFTP. jpayne@68: jpayne@68: When called, downloads the given file URL into the specified local file. jpayne@68: Requires `paramiko `__ to be jpayne@68: installed. jpayne@68: jpayne@68: Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize jpayne@68: the download of files (for example, to use authentication or print a jpayne@68: progress bar). jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: port : int jpayne@68: Port used for the SFTP connection. jpayne@68: username : str jpayne@68: User name used to login to the server. Only needed if the server jpayne@68: requires authentication (i.e., no anonymous SFTP). jpayne@68: password : str jpayne@68: Password used to login to the server. Only needed if the server jpayne@68: requires authentication (i.e., no anonymous SFTP). Use the empty jpayne@68: string to indicate no password is required. jpayne@68: timeout : int jpayne@68: Timeout in seconds for sftp socket operations, use None to mean no jpayne@68: timeout. jpayne@68: progressbar : bool or an arbitrary progress bar object jpayne@68: If True, will print a progress bar of the download to standard jpayne@68: error (stderr). Requires `tqdm `__ to jpayne@68: be installed. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__( jpayne@68: self, jpayne@68: port=22, jpayne@68: username="anonymous", jpayne@68: password="", jpayne@68: account="", jpayne@68: timeout=None, jpayne@68: progressbar=False, jpayne@68: ): jpayne@68: self.port = port jpayne@68: self.username = username jpayne@68: self.password = password jpayne@68: self.account = account jpayne@68: self.timeout = timeout jpayne@68: self.progressbar = progressbar jpayne@68: # Collect errors and raise only once so that both missing packages are jpayne@68: # captured. Otherwise, the user is only warned of one of them at a jpayne@68: # time (and we can't test properly when they are both missing). jpayne@68: errors = [] jpayne@68: if self.progressbar and tqdm is None: jpayne@68: errors.append("Missing package 'tqdm' required for progress bars.") jpayne@68: if paramiko is None: jpayne@68: errors.append("Missing package 'paramiko' required for SFTP downloads.") jpayne@68: if errors: jpayne@68: raise ValueError(" ".join(errors)) jpayne@68: jpayne@68: def __call__(self, url, output_file, pooch): jpayne@68: """ jpayne@68: Download the given URL over SFTP to the given output file. jpayne@68: jpayne@68: The output file must be given as a string (file name/path) and not an jpayne@68: open file object! Otherwise, paramiko cannot save to that file. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: url : str jpayne@68: The URL to the file you want to download. jpayne@68: output_file : str jpayne@68: Path (and file name) to which the file will be downloaded. **Cannot jpayne@68: be a file object**. jpayne@68: pooch : :class:`~pooch.Pooch` jpayne@68: The instance of :class:`~pooch.Pooch` that is calling this method. jpayne@68: """ jpayne@68: parsed_url = parse_url(url) jpayne@68: connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port)) jpayne@68: sftp = None jpayne@68: try: jpayne@68: connection.connect(username=self.username, password=self.password) jpayne@68: sftp = paramiko.SFTPClient.from_transport(connection) jpayne@68: sftp.get_channel().settimeout = self.timeout jpayne@68: if self.progressbar: jpayne@68: size = int(sftp.stat(parsed_url["path"]).st_size) jpayne@68: use_ascii = bool(sys.platform == "win32") jpayne@68: progress = tqdm( jpayne@68: total=size, jpayne@68: ncols=79, jpayne@68: ascii=use_ascii, jpayne@68: unit="B", jpayne@68: unit_scale=True, jpayne@68: leave=True, jpayne@68: ) jpayne@68: if self.progressbar: jpayne@68: with progress: jpayne@68: jpayne@68: def callback(current, total): jpayne@68: "Update the progress bar and write to output" jpayne@68: progress.total = int(total) jpayne@68: progress.update(int(current - progress.n)) jpayne@68: jpayne@68: sftp.get(parsed_url["path"], output_file, callback=callback) jpayne@68: else: jpayne@68: sftp.get(parsed_url["path"], output_file) jpayne@68: finally: jpayne@68: connection.close() jpayne@68: if sftp is not None: jpayne@68: sftp.close() jpayne@68: jpayne@68: jpayne@68: class DOIDownloader: # pylint: disable=too-few-public-methods jpayne@68: """ jpayne@68: Download manager for fetching files from Digital Object Identifiers (DOIs). jpayne@68: jpayne@68: Open-access data repositories often issue Digital Object Identifiers (DOIs) jpayne@68: for data which provide a stable link and citation point. The trick is jpayne@68: finding out the download URL for a file given the DOI. jpayne@68: jpayne@68: When called, this downloader uses the repository's public API to find out jpayne@68: the download URL from the DOI and file name. It then uses jpayne@68: :class:`pooch.HTTPDownloader` to download the URL into the specified local jpayne@68: file. Allowing "URL"s to be specified with the DOI instead of the actual jpayne@68: HTTP download link. Uses the :mod:`requests` library to manage downloads jpayne@68: and interact with the APIs. jpayne@68: jpayne@68: The **format of the "URL"** is: ``doi:{DOI}/{file name}``. jpayne@68: jpayne@68: Notice that there are no ``//`` like in HTTP/FTP and you must specify a jpayne@68: file name after the DOI (separated by a ``/``). jpayne@68: jpayne@68: Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to jpayne@68: download files given the DOI instead of an HTTP link. jpayne@68: jpayne@68: Supported repositories: jpayne@68: jpayne@68: * `figshare `__ jpayne@68: * `Zenodo `__ jpayne@68: * `Dataverse `__ instances jpayne@68: jpayne@68: .. attention:: jpayne@68: jpayne@68: DOIs from other repositories **will not work** since we need to access jpayne@68: their particular APIs to find the download links. We welcome jpayne@68: suggestions and contributions adding new repositories. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: progressbar : bool or an arbitrary progress bar object jpayne@68: If True, will print a progress bar of the download to standard error jpayne@68: (stderr). Requires `tqdm `__ to be jpayne@68: installed. Alternatively, an arbitrary progress bar object can be jpayne@68: passed. See :ref:`custom-progressbar` for details. jpayne@68: chunk_size : int jpayne@68: Files are streamed *chunk_size* bytes at a time instead of loading jpayne@68: everything into memory at one. Usually doesn't need to be changed. jpayne@68: **kwargs jpayne@68: All keyword arguments given when creating an instance of this class jpayne@68: will be passed to :func:`requests.get`. jpayne@68: jpayne@68: Examples jpayne@68: -------- jpayne@68: jpayne@68: Download one of the data files from the figshare archive of Pooch test jpayne@68: data: jpayne@68: jpayne@68: >>> import os jpayne@68: >>> downloader = DOIDownloader() jpayne@68: >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt" jpayne@68: >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch jpayne@68: >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) jpayne@68: >>> os.path.exists("tiny-data.txt") jpayne@68: True jpayne@68: >>> with open("tiny-data.txt") as f: jpayne@68: ... print(f.read().strip()) jpayne@68: # A tiny data file for test purposes only jpayne@68: 1 2 3 4 5 6 jpayne@68: >>> os.remove("tiny-data.txt") jpayne@68: jpayne@68: Same thing but for our Zenodo archive: jpayne@68: jpayne@68: >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt" jpayne@68: >>> downloader(url=url, output_file="tiny-data.txt", pooch=None) jpayne@68: >>> os.path.exists("tiny-data.txt") jpayne@68: True jpayne@68: >>> with open("tiny-data.txt") as f: jpayne@68: ... print(f.read().strip()) jpayne@68: # A tiny data file for test purposes only jpayne@68: 1 2 3 4 5 6 jpayne@68: >>> os.remove("tiny-data.txt") jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: def __init__(self, progressbar=False, chunk_size=1024, **kwargs): jpayne@68: self.kwargs = kwargs jpayne@68: self.progressbar = progressbar jpayne@68: self.chunk_size = chunk_size jpayne@68: jpayne@68: def __call__(self, url, output_file, pooch): jpayne@68: """ jpayne@68: Download the given DOI URL over HTTP to the given output file. jpayne@68: jpayne@68: Uses the repository's API to determine the actual HTTP download URL jpayne@68: from the given DOI. jpayne@68: jpayne@68: Uses :func:`requests.get`. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: url : str jpayne@68: The URL to the file you want to download. jpayne@68: output_file : str or file-like object jpayne@68: Path (and file name) to which the file will be downloaded. jpayne@68: pooch : :class:`~pooch.Pooch` jpayne@68: The instance of :class:`~pooch.Pooch` that is calling this method. jpayne@68: jpayne@68: """ jpayne@68: jpayne@68: parsed_url = parse_url(url) jpayne@68: data_repository = doi_to_repository(parsed_url["netloc"]) jpayne@68: jpayne@68: # Resolve the URL jpayne@68: file_name = parsed_url["path"] jpayne@68: # remove the leading slash in the path jpayne@68: if file_name[0] == "/": jpayne@68: file_name = file_name[1:] jpayne@68: download_url = data_repository.download_url(file_name) jpayne@68: jpayne@68: # Instantiate the downloader object jpayne@68: downloader = HTTPDownloader( jpayne@68: progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs jpayne@68: ) jpayne@68: downloader(download_url, output_file, pooch) jpayne@68: jpayne@68: jpayne@68: def doi_to_url(doi): jpayne@68: """ jpayne@68: Follow a DOI link to resolve the URL of the archive. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: doi : str jpayne@68: The DOI of the archive. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: url : str jpayne@68: The URL of the archive in the data repository. jpayne@68: jpayne@68: """ jpayne@68: # Lazy import requests to speed up import time jpayne@68: import requests # pylint: disable=C0415 jpayne@68: jpayne@68: # Use doi.org to resolve the DOI to the repository website. jpayne@68: response = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT) jpayne@68: url = response.url jpayne@68: if 400 <= response.status_code < 600: jpayne@68: raise ValueError( jpayne@68: f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?" jpayne@68: ) jpayne@68: return url jpayne@68: jpayne@68: jpayne@68: def doi_to_repository(doi): jpayne@68: """ jpayne@68: Instantiate a data repository instance from a given DOI. jpayne@68: jpayne@68: This function implements the chain of responsibility dispatch jpayne@68: to the correct data repository class. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: doi : str jpayne@68: The DOI of the archive. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: data_repository : DataRepository jpayne@68: The data repository object jpayne@68: """ jpayne@68: jpayne@68: # This should go away in a separate issue: DOI handling should jpayne@68: # not rely on the (non-)existence of trailing slashes. The issue jpayne@68: # is documented in https://github.com/fatiando/pooch/issues/324 jpayne@68: if doi[-1] == "/": jpayne@68: doi = doi[:-1] jpayne@68: jpayne@68: repositories = [ jpayne@68: FigshareRepository, jpayne@68: ZenodoRepository, jpayne@68: DataverseRepository, jpayne@68: ] jpayne@68: jpayne@68: # Extract the DOI and the repository information jpayne@68: archive_url = doi_to_url(doi) jpayne@68: jpayne@68: # Try the converters one by one until one of them returned a URL jpayne@68: data_repository = None jpayne@68: for repo in repositories: jpayne@68: if data_repository is None: jpayne@68: data_repository = repo.initialize( jpayne@68: archive_url=archive_url, jpayne@68: doi=doi, jpayne@68: ) jpayne@68: jpayne@68: if data_repository is None: jpayne@68: repository = parse_url(archive_url)["netloc"] jpayne@68: raise ValueError( jpayne@68: f"Invalid data repository '{repository}'. " jpayne@68: "To request or contribute support for this repository, " jpayne@68: "please open an issue at https://github.com/fatiando/pooch/issues" jpayne@68: ) jpayne@68: jpayne@68: return data_repository jpayne@68: jpayne@68: jpayne@68: class DataRepository: # pylint: disable=too-few-public-methods, missing-class-docstring jpayne@68: @classmethod jpayne@68: def initialize(cls, doi, archive_url): # pylint: disable=unused-argument jpayne@68: """ jpayne@68: Initialize the data repository if the given URL points to a jpayne@68: corresponding repository. jpayne@68: jpayne@68: Initializes a data repository object. This is done as part of jpayne@68: a chain of responsibility. If the class cannot handle the given jpayne@68: repository URL, it returns `None`. Otherwise a `DataRepository` jpayne@68: instance is returned. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: doi : str jpayne@68: The DOI that identifies the repository jpayne@68: archive_url : str jpayne@68: The resolved URL for the DOI jpayne@68: """ jpayne@68: jpayne@68: return None # pragma: no cover jpayne@68: jpayne@68: def download_url(self, file_name): jpayne@68: """ jpayne@68: Use the repository API to get the download URL for a file given jpayne@68: the archive URL. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: file_name : str jpayne@68: The name of the file in the archive that will be downloaded. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: download_url : str jpayne@68: The HTTP URL that can be used to download the file. jpayne@68: """ jpayne@68: jpayne@68: raise NotImplementedError # pragma: no cover jpayne@68: jpayne@68: def populate_registry(self, pooch): jpayne@68: """ jpayne@68: Populate the registry using the data repository's API jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: pooch : Pooch jpayne@68: The pooch instance that the registry will be added to. jpayne@68: """ jpayne@68: jpayne@68: raise NotImplementedError # pragma: no cover jpayne@68: jpayne@68: jpayne@68: class ZenodoRepository(DataRepository): # pylint: disable=missing-class-docstring jpayne@68: base_api_url = "https://zenodo.org/api/records" jpayne@68: jpayne@68: def __init__(self, doi, archive_url): jpayne@68: self.archive_url = archive_url jpayne@68: self.doi = doi jpayne@68: self._api_response = None jpayne@68: self._api_version = None jpayne@68: jpayne@68: @classmethod jpayne@68: def initialize(cls, doi, archive_url): jpayne@68: """ jpayne@68: Initialize the data repository if the given URL points to a jpayne@68: corresponding repository. jpayne@68: jpayne@68: Initializes a data repository object. This is done as part of jpayne@68: a chain of responsibility. If the class cannot handle the given jpayne@68: repository URL, it returns `None`. Otherwise a `DataRepository` jpayne@68: instance is returned. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: doi : str jpayne@68: The DOI that identifies the repository jpayne@68: archive_url : str jpayne@68: The resolved URL for the DOI jpayne@68: """ jpayne@68: jpayne@68: # Check whether this is a Zenodo URL jpayne@68: parsed_archive_url = parse_url(archive_url) jpayne@68: if parsed_archive_url["netloc"] != "zenodo.org": jpayne@68: return None jpayne@68: jpayne@68: return cls(doi, archive_url) jpayne@68: jpayne@68: @property jpayne@68: def api_response(self): jpayne@68: """Cached API response from Zenodo""" jpayne@68: if self._api_response is None: jpayne@68: # Lazy import requests to speed up import time jpayne@68: import requests # pylint: disable=C0415 jpayne@68: jpayne@68: article_id = self.archive_url.split("/")[-1] jpayne@68: self._api_response = requests.get( jpayne@68: f"{self.base_api_url}/{article_id}", jpayne@68: timeout=DEFAULT_TIMEOUT, jpayne@68: ).json() jpayne@68: jpayne@68: return self._api_response jpayne@68: jpayne@68: @property jpayne@68: def api_version(self): jpayne@68: """ jpayne@68: Version of the Zenodo API we are interacting with jpayne@68: jpayne@68: The versions can either be : jpayne@68: jpayne@68: - ``"legacy"``: corresponds to the Zenodo API that was supported until jpayne@68: 2023-10-12 (before the migration to InvenioRDM). jpayne@68: - ``"new"``: corresponds to the new API that went online on 2023-10-13 jpayne@68: after the migration to InvenioRDM. jpayne@68: jpayne@68: The ``"new"`` API breaks backward compatibility with the ``"legacy"`` jpayne@68: one and could probably be replaced by an updated version that restores jpayne@68: the behaviour of the ``"legacy"`` one. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: str jpayne@68: """ jpayne@68: if self._api_version is None: jpayne@68: if all("key" in file for file in self.api_response["files"]): jpayne@68: self._api_version = "legacy" jpayne@68: elif all("filename" in file for file in self.api_response["files"]): jpayne@68: self._api_version = "new" jpayne@68: else: jpayne@68: raise ValueError( jpayne@68: "Couldn't determine the version of the Zenodo API for " jpayne@68: f"{self.archive_url} (doi:{self.doi})." jpayne@68: ) jpayne@68: return self._api_version jpayne@68: jpayne@68: def download_url(self, file_name): jpayne@68: """ jpayne@68: Use the repository API to get the download URL for a file given jpayne@68: the archive URL. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: file_name : str jpayne@68: The name of the file in the archive that will be downloaded. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: download_url : str jpayne@68: The HTTP URL that can be used to download the file. jpayne@68: jpayne@68: Notes jpayne@68: ----- jpayne@68: After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The jpayne@68: link to the desired files that appears in the API response leads to 404 jpayne@68: errors (by 2023-10-17). The files are available in the following url: jpayne@68: ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``. jpayne@68: jpayne@68: This method supports both the legacy and the new API. jpayne@68: """ jpayne@68: # Create list of files in the repository jpayne@68: if self.api_version == "legacy": jpayne@68: files = {item["key"]: item for item in self.api_response["files"]} jpayne@68: else: jpayne@68: files = [item["filename"] for item in self.api_response["files"]] jpayne@68: # Check if file exists in the repository jpayne@68: if file_name not in files: jpayne@68: raise ValueError( jpayne@68: f"File '{file_name}' not found in data archive " jpayne@68: f"{self.archive_url} (doi:{self.doi})." jpayne@68: ) jpayne@68: # Build download url jpayne@68: if self.api_version == "legacy": jpayne@68: download_url = files[file_name]["links"]["self"] jpayne@68: else: jpayne@68: article_id = self.api_response["id"] jpayne@68: download_url = ( jpayne@68: f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1" jpayne@68: ) jpayne@68: return download_url jpayne@68: jpayne@68: def populate_registry(self, pooch): jpayne@68: """ jpayne@68: Populate the registry using the data repository's API jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: pooch : Pooch jpayne@68: The pooch instance that the registry will be added to. jpayne@68: jpayne@68: Notes jpayne@68: ----- jpayne@68: After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The jpayne@68: checksums for each file listed in the API reference is now an md5 sum. jpayne@68: jpayne@68: This method supports both the legacy and the new API. jpayne@68: """ jpayne@68: for filedata in self.api_response["files"]: jpayne@68: checksum = filedata["checksum"] jpayne@68: if self.api_version == "legacy": jpayne@68: key = "key" jpayne@68: else: jpayne@68: key = "filename" jpayne@68: checksum = f"md5:{checksum}" jpayne@68: pooch.registry[filedata[key]] = checksum jpayne@68: jpayne@68: jpayne@68: class FigshareRepository(DataRepository): # pylint: disable=missing-class-docstring jpayne@68: def __init__(self, doi, archive_url): jpayne@68: self.archive_url = archive_url jpayne@68: self.doi = doi jpayne@68: self._api_response = None jpayne@68: jpayne@68: @classmethod jpayne@68: def initialize(cls, doi, archive_url): jpayne@68: """ jpayne@68: Initialize the data repository if the given URL points to a jpayne@68: corresponding repository. jpayne@68: jpayne@68: Initializes a data repository object. This is done as part of jpayne@68: a chain of responsibility. If the class cannot handle the given jpayne@68: repository URL, it returns `None`. Otherwise a `DataRepository` jpayne@68: instance is returned. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: doi : str jpayne@68: The DOI that identifies the repository jpayne@68: archive_url : str jpayne@68: The resolved URL for the DOI jpayne@68: """ jpayne@68: jpayne@68: # Check whether this is a Figshare URL jpayne@68: parsed_archive_url = parse_url(archive_url) jpayne@68: if parsed_archive_url["netloc"] != "figshare.com": jpayne@68: return None jpayne@68: jpayne@68: return cls(doi, archive_url) jpayne@68: jpayne@68: def _parse_version_from_doi(self): jpayne@68: """ jpayne@68: Parse version from the doi jpayne@68: jpayne@68: Return None if version is not available in the doi. jpayne@68: """ jpayne@68: # Get suffix of the doi jpayne@68: _, suffix = self.doi.split("/") jpayne@68: # Split the suffix by dots and keep the last part jpayne@68: last_part = suffix.split(".")[-1] jpayne@68: # Parse the version from the last part jpayne@68: if last_part[0] != "v": jpayne@68: return None jpayne@68: version = int(last_part[1:]) jpayne@68: return version jpayne@68: jpayne@68: @property jpayne@68: def api_response(self): jpayne@68: """Cached API response from Figshare""" jpayne@68: if self._api_response is None: jpayne@68: # Lazy import requests to speed up import time jpayne@68: import requests # pylint: disable=C0415 jpayne@68: jpayne@68: # Use the figshare API to find the article ID from the DOI jpayne@68: article = requests.get( jpayne@68: f"https://api.figshare.com/v2/articles?doi={self.doi}", jpayne@68: timeout=DEFAULT_TIMEOUT, jpayne@68: ).json()[0] jpayne@68: article_id = article["id"] jpayne@68: # Parse desired version from the doi jpayne@68: version = self._parse_version_from_doi() jpayne@68: # With the ID and version, we can get a list of files and their jpayne@68: # download links jpayne@68: if version is None: jpayne@68: # Figshare returns the latest version available when no version jpayne@68: # is specified through the DOI. jpayne@68: warnings.warn( jpayne@68: f"The Figshare DOI '{self.doi}' doesn't specify which version of " jpayne@68: "the repository should be used. " jpayne@68: "Figshare will point to the latest version available.", jpayne@68: UserWarning, jpayne@68: ) jpayne@68: # Define API url using only the article id jpayne@68: # (figshare will resolve the latest version) jpayne@68: api_url = f"https://api.figshare.com/v2/articles/{article_id}" jpayne@68: else: jpayne@68: # Define API url using article id and the desired version jpayne@68: # Get list of files using article id and the version jpayne@68: api_url = ( jpayne@68: "https://api.figshare.com/v2/articles/" jpayne@68: f"{article_id}/versions/{version}" jpayne@68: ) jpayne@68: # Make the request and return the files in the figshare repository jpayne@68: response = requests.get(api_url, timeout=DEFAULT_TIMEOUT) jpayne@68: response.raise_for_status() jpayne@68: self._api_response = response.json()["files"] jpayne@68: jpayne@68: return self._api_response jpayne@68: jpayne@68: def download_url(self, file_name): jpayne@68: """ jpayne@68: Use the repository API to get the download URL for a file given jpayne@68: the archive URL. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: file_name : str jpayne@68: The name of the file in the archive that will be downloaded. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: download_url : str jpayne@68: The HTTP URL that can be used to download the file. jpayne@68: """ jpayne@68: files = {item["name"]: item for item in self.api_response} jpayne@68: if file_name not in files: jpayne@68: raise ValueError( jpayne@68: f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})." jpayne@68: ) jpayne@68: download_url = files[file_name]["download_url"] jpayne@68: return download_url jpayne@68: jpayne@68: def populate_registry(self, pooch): jpayne@68: """ jpayne@68: Populate the registry using the data repository's API jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: pooch : Pooch jpayne@68: The pooch instance that the registry will be added to. jpayne@68: """ jpayne@68: jpayne@68: for filedata in self.api_response: jpayne@68: pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}" jpayne@68: jpayne@68: jpayne@68: class DataverseRepository(DataRepository): # pylint: disable=missing-class-docstring jpayne@68: def __init__(self, doi, archive_url): jpayne@68: self.archive_url = archive_url jpayne@68: self.doi = doi jpayne@68: self._api_response = None jpayne@68: jpayne@68: @classmethod jpayne@68: def initialize(cls, doi, archive_url): jpayne@68: """ jpayne@68: Initialize the data repository if the given URL points to a jpayne@68: corresponding repository. jpayne@68: jpayne@68: Initializes a data repository object. This is done as part of jpayne@68: a chain of responsibility. If the class cannot handle the given jpayne@68: repository URL, it returns `None`. Otherwise a `DataRepository` jpayne@68: instance is returned. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: doi : str jpayne@68: The DOI that identifies the repository jpayne@68: archive_url : str jpayne@68: The resolved URL for the DOI jpayne@68: """ jpayne@68: # Access the DOI as if this was a DataVerse instance jpayne@68: response = cls._get_api_response(doi, archive_url) jpayne@68: jpayne@68: # If we failed, this is probably not a DataVerse instance jpayne@68: if 400 <= response.status_code < 600: jpayne@68: return None jpayne@68: jpayne@68: # Initialize the repository and overwrite the api response jpayne@68: repository = cls(doi, archive_url) jpayne@68: repository.api_response = response jpayne@68: return repository jpayne@68: jpayne@68: @classmethod jpayne@68: def _get_api_response(cls, doi, archive_url): jpayne@68: """ jpayne@68: Perform the actual API request jpayne@68: jpayne@68: This has been separated into a separate ``classmethod``, as it can be jpayne@68: used prior and after the initialization. jpayne@68: """ jpayne@68: # Lazy import requests to speed up import time jpayne@68: import requests # pylint: disable=C0415 jpayne@68: jpayne@68: parsed = parse_url(archive_url) jpayne@68: response = requests.get( jpayne@68: f"{parsed['protocol']}://{parsed['netloc']}/api/datasets/" jpayne@68: f":persistentId?persistentId=doi:{doi}", jpayne@68: timeout=DEFAULT_TIMEOUT, jpayne@68: ) jpayne@68: return response jpayne@68: jpayne@68: @property jpayne@68: def api_response(self): jpayne@68: """Cached API response from a DataVerse instance""" jpayne@68: jpayne@68: if self._api_response is None: jpayne@68: self._api_response = self._get_api_response( jpayne@68: self.doi, self.archive_url jpayne@68: ) # pragma: no cover jpayne@68: jpayne@68: return self._api_response jpayne@68: jpayne@68: @api_response.setter jpayne@68: def api_response(self, response): jpayne@68: """Update the cached API response""" jpayne@68: jpayne@68: self._api_response = response jpayne@68: jpayne@68: def download_url(self, file_name): jpayne@68: """ jpayne@68: Use the repository API to get the download URL for a file given jpayne@68: the archive URL. jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: file_name : str jpayne@68: The name of the file in the archive that will be downloaded. jpayne@68: jpayne@68: Returns jpayne@68: ------- jpayne@68: download_url : str jpayne@68: The HTTP URL that can be used to download the file. jpayne@68: """ jpayne@68: parsed = parse_url(self.archive_url) jpayne@68: response = self.api_response.json() jpayne@68: files = { jpayne@68: file["dataFile"]["filename"]: file["dataFile"] jpayne@68: for file in response["data"]["latestVersion"]["files"] jpayne@68: } jpayne@68: if file_name not in files: jpayne@68: raise ValueError( jpayne@68: f"File '{file_name}' not found in data archive " jpayne@68: f"{self.archive_url} (doi:{self.doi})." jpayne@68: ) jpayne@68: # Generate download_url using the file id jpayne@68: download_url = ( jpayne@68: f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/" jpayne@68: f"{files[file_name]['id']}" jpayne@68: ) jpayne@68: return download_url jpayne@68: jpayne@68: def populate_registry(self, pooch): jpayne@68: """ jpayne@68: Populate the registry using the data repository's API jpayne@68: jpayne@68: Parameters jpayne@68: ---------- jpayne@68: pooch : Pooch jpayne@68: The pooch instance that the registry will be added to. jpayne@68: """ jpayne@68: jpayne@68: for filedata in self.api_response.json()["data"]["latestVersion"]["files"]: jpayne@68: pooch.registry[filedata["dataFile"]["filename"]] = ( jpayne@68: f"md5:{filedata['dataFile']['md5']}" jpayne@68: )