csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py comparison

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

comparison

equal deleted inserted replaced

-:0e9998148a16
+:33d812a61356
+# Copyright (c) 2018 The Pooch Developers.
+# Distributed under the terms of the BSD 3-Clause License.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
+#
+"""
+The classes that actually handle the downloads.
+"""
+import os
+import sys
+import ftplib
+import warnings
+from .utils import parse_url
+try:
+from tqdm import tqdm
+except ImportError:
+tqdm = None
+try:
+import paramiko
+except ImportError:
+paramiko = None
+# Set the default timeout in seconds so it can be configured in a pinch for the
+# methods that don't or can't expose a way to set it at runtime.
+# See https://github.com/fatiando/pooch/issues/409
+DEFAULT_TIMEOUT = 30
def choose_downloader(url, progressbar=False):
    """
    Pick the downloader that matches the protocol of a URL.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`,
        created with the given *progressbar*.

    Raises
    ------
    ValueError
        If the protocol of *url* is not one of the known protocols.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    # Dispatch table: protocol name -> downloader class
    protocol_to_class = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }
    protocol = parse_url(url)["protocol"]
    downloader_class = protocol_to_class.get(protocol)
    if downloader_class is None:
        raise ValueError(
            f"Unrecognized URL protocol '{protocol}' in '{url}'. "
            f"Must be one of {protocol_to_class.keys()}."
        )
    return downloader_class(progressbar=progressbar)
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Raises
    ------
    ValueError
        If ``progressbar=True`` and ``tqdm`` is not installed.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # Fail early: only the built-in (True) progress bar needs tqdm
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        kwargs.setdefault("stream", True)
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        response = None
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            response.raise_for_status()
            content = response.iter_content(chunk_size=self.chunk_size)
            total = int(response.headers.get("content-length", 0))
            if self.progressbar is True:
                # Need to use ascii characters on Windows because there isn't
                # always full unicode support
                # (see https://github.com/tqdm/tqdm/issues/454)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=total,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            elif self.progressbar:
                progress = self.progressbar
                progress.total = total
            for chunk in content:
                if chunk:
                    output_file.write(chunk)
                    output_file.flush()
                    if self.progressbar:
                        # Use the chunk size here because chunk may be much
                        # larger if the data are decompressed by requests after
                        # reading (happens with text files).
                        progress.update(self.chunk_size)
            # Make sure the progress bar gets filled even if the actual number
            # of chunks is smaller than expected. This happens when streaming
            # text files that are compressed by the server when sending (gzip).
            # Binary files don't experience this.
            if self.progressbar:
                progress.reset()
                progress.update(total)
                progress.close()
        finally:
            try:
                # A streamed response keeps its connection checked out until
                # the body is consumed or the response is closed, so always
                # release it (also when the download failed half way).
                if response is not None:
                    response.close()
            finally:
                if ispath:
                    output_file.close()
        return None
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. **Custom progress bars are not yet supported.**
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    Raises
    ------
    ValueError
        If ``progressbar=True`` and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        # Only a file that was opened here is closed here; a file-like object
        # handed in by the caller stays open.
        opened_here = None
        try:
            if not hasattr(output_file, "write"):
                # Open inside the ``try`` so that a failure to create the
                # local file still shuts down the FTP connection made above.
                # pylint: disable=consider-using-with
                opened_here = open(output_file, "w+b")
                # pylint: enable=consider-using-with
                output_file = opened_here
            ftp.login(user=self.username, passwd=self.password, acct=self.account)
            command = f"RETR {parsed_url['path']}"
            if self.progressbar:
                # Make sure the file is set to binary mode, otherwise we can't
                # get the file size. See: https://stackoverflow.com/a/22093848
                ftp.voidcmd("TYPE I")
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=int(ftp.size(parsed_url["path"])),
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(data):
                        "Update the progress bar and write to output"
                        progress.update(len(data))
                        output_file.write(data)

                    ftp.retrbinary(command, callback, blocksize=self.chunk_size)
            else:
                ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size)
        finally:
            try:
                ftp.quit()
            finally:
                # Close the local file even if the QUIT command itself fails
                if opened_here is not None:
                    opened_here.close()
        return None
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance but not used when connecting.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard
        error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
        be installed.

    Raises
    ------
    ValueError
        If ``paramiko`` is not installed, or if a progress bar is requested
        and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # ``settimeout`` is a method: it has to be called. Assigning to it
            # only rebinds the attribute and leaves the channel without a
            # timeout.
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Update the progress bar and write to output"
                        progress.total = int(total)
                        # paramiko reports the cumulative number of bytes, the
                        # progress bar expects the increment.
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            # Tear down in reverse order of creation: the SFTP client first,
            # then the transport it runs on (always, even if the former fails).
            try:
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. This allows "URL"s to be specified with the DOI instead of the actual
    HTTP download link. Uses the :mod:`requests` library to manage downloads
    and interact with the APIs.

    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories **will not work** since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        Uses the repository's API to determine the actual HTTP download URL
        from the given DOI.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """
        parsed_url = parse_url(url)
        data_repository = doi_to_repository(parsed_url["netloc"])

        # Resolve the URL
        file_name = parsed_url["path"]
        # Remove the leading slash in the path. ``startswith`` (rather than
        # indexing the first character) also copes with an empty path, which
        # then gets reported by the repository as a file that was not found.
        if file_name.startswith("/"):
            file_name = file_name[1:]
        download_url = data_repository.download_url(file_name)

        # Instantiate the downloader object
        downloader = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        downloader(download_url, output_file, pooch)
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of the archive it points to.

    The DOI is looked up through https://doi.org and the final URL of the
    response (after any redirects) is returned.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    Raises
    ------
    ValueError
        If the lookup ends with an HTTP status code in the 4xx or 5xx range.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # Let doi.org take us to the repository website
    resolved = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
    url = resolved.url
    lookup_failed = 400 <= resolved.status_code < 600
    if not lookup_failed:
        return url
    raise ValueError(
        f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
    )
def doi_to_repository(doi):
    """
    Build the data repository object that can handle a given DOI.

    Each known repository class is asked in turn (chain of responsibility)
    whether it can handle the archive URL the DOI resolves to. The first one
    that accepts it is returned.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the known repository classes accepts the archive URL.

    """
    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    if doi[-1] == "/":
        doi = doi[:-1]

    # Order matters: candidates are tried first to last
    candidates = (
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    )

    # Follow the DOI to the website of the repository
    archive_url = doi_to_url(doi)

    # Stop at the first candidate that recognizes the archive
    for candidate in candidates:
        data_repository = candidate.initialize(
            archive_url=archive_url,
            doi=doi,
        )
        if data_repository is not None:
            return data_repository

    repository = parse_url(archive_url)["netloc"]
    raise ValueError(
        f"Invalid data repository '{repository}'. "
        "To request or contribute support for this repository, "
        "please open an issue at https://github.com/fatiando/pooch/issues"
    )
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    """
    Base interface for the data repositories that can be reached from a DOI.

    Subclasses override :meth:`initialize`, :meth:`download_url` and
    :meth:`populate_registry`.
    """

    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Create the repository object if the URL belongs to this repository.

        This is one link in a chain of responsibility: a class that cannot
        handle the given repository URL returns `None`, otherwise it returns
        a `DataRepository` instance. This base implementation always returns
        `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Get the download URL of a file in the archive from the repository
        API.

        Not implemented in the base class.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch using the repository API.

        Not implemented in the base class.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        raise NotImplementedError  # pragma: no cover
class ZenodoRepository(DataRepository):  # pylint: disable=missing-class-docstring
    base_api_url = "https://zenodo.org/api/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Both are filled in lazily by the properties below
        self._api_response = None
        self._api_version = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Create the repository object if the URL points to Zenodo.

        This is one link in a chain of responsibility: `None` is returned when
        the host of the archive URL is not ``zenodo.org``, otherwise an
        instance of this class is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """
        netloc = parse_url(archive_url)["netloc"]
        return cls(doi, archive_url) if netloc == "zenodo.org" else None

    @property
    def api_response(self):
        """Cached API response from Zenodo"""
        if self._api_response is not None:
            return self._api_response

        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        # The record ID is the last component of the archive URL
        article_id = self.archive_url.rsplit("/", 1)[-1]
        reply = requests.get(
            f"{self.base_api_url}/{article_id}",
            timeout=DEFAULT_TIMEOUT,
        )
        self._api_response = reply.json()
        return self._api_response

    @property
    def api_version(self):
        """
        Version of the Zenodo API we are interacting with

        The versions can either be :

        - ``"legacy"``: corresponds to the Zenodo API that was supported until
          2023-10-12 (before the migration to InvenioRDM).
        - ``"new"``: corresponds to the new API that went online on 2023-10-13
          after the migration to InvenioRDM.

        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
        one and could probably be replaced by an updated version that restores
        the behaviour of the ``"legacy"`` one.

        The version is detected from the field names of the file entries in
        the API response and cached.

        Returns
        -------
        str

        Raises
        ------
        ValueError
            If the file entries match neither version.
        """
        if self._api_version is not None:
            return self._api_version

        entries = self.api_response["files"]
        if all("key" in entry for entry in entries):
            self._api_version = "legacy"
        elif all("filename" in entry for entry in entries):
            self._api_version = "new"
        else:
            raise ValueError(
                "Couldn't determine the version of the Zenodo API for "
                f"{self.archive_url} (doi:{self.doi})."
            )
        return self._api_version

    def download_url(self, file_name):
        """
        Get the download URL of a file in the archive from the repository
        API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the archive has no file called *file_name*.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        link to the desired files that appears in the API response leads to 404
        errors (by 2023-10-17). The files are available in the following url:
        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

        This method supports both the legacy and the new API.
        """
        is_legacy = self.api_version == "legacy"
        entries = self.api_response["files"]

        # Names of the files in the archive (legacy: name -> file entry)
        if is_legacy:
            known_files = {entry["key"]: entry for entry in entries}
        else:
            known_files = [entry["filename"] for entry in entries]

        if file_name not in known_files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )

        # The legacy API hands out a working link, the new one doesn't
        if is_legacy:
            return known_files[file_name]["links"]["self"]
        article_id = self.api_response["id"]
        return f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch using the repository API.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        checksums for each file listed in the API reference is now an md5 sum.

        This method supports both the legacy and the new API.
        """
        for entry in self.api_response["files"]:
            digest = entry["checksum"]
            if self.api_version == "legacy":
                # Legacy checksums are stored in the registry unchanged
                name_field = "key"
            else:
                name_field = "filename"
                digest = f"md5:{digest}"
            pooch.registry[entry[name_field]] = digest
class FigshareRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Filled in lazily by the ``api_response`` property
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        # Check whether this is a Figshare URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        The version is the number in a trailing ``.v<number>`` component of
        the DOI suffix (e.g. ``1`` for ``10.6084/m9.figshare.14763051.v1``).

        Return None if version is not available in the doi.
        """
        # Everything after the first slash is the suffix of the doi. Limiting
        # the split keeps DOIs with additional slashes from raising.
        suffix = self.doi.split("/", 1)[-1]
        # Split the suffix by dots and keep the last part
        last_part = suffix.split(".")[-1]
        # Only a "v" followed by digits is a version. Anything else (an empty
        # part, or a word that merely starts with "v") means "no version"
        # instead of an IndexError/ValueError.
        digits = last_part[1:]
        if not last_part.startswith("v") or not digits.isdecimal():
            return None
        return int(digits)

    @property
    def api_response(self):
        """Cached API response from Figshare"""
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            lookup = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Report HTTP errors here, like for the request further down,
            # rather than failing later while picking apart an error payload.
            lookup.raise_for_status()
            article = lookup.json()[0]
            article_id = article["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                # Get list of files using article id and the version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and return the files in the figshare repository
            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the archive has no file called *file_name*.
        """
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
            )
        download_url = files[file_name]["download_url"]
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
class DataverseRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Set through the ``api_response`` property
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Create the repository object if the URL points to a DataVerse
        instance.

        This is one link in a chain of responsibility. The DataVerse API is
        queried on the host of the archive URL: `None` is returned when the
        request ends with a 4xx or 5xx status code, otherwise an instance of
        this class is returned with that response already cached.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """
        # Query the host as if it was a DataVerse instance
        probe = cls._get_api_response(doi, archive_url)

        # A failed request most likely means it isn't one
        request_failed = 400 <= probe.status_code < 600
        if request_failed:
            return None

        # Keep the response so that the API isn't queried a second time
        instance = cls(doi, archive_url)
        instance.api_response = probe
        return instance

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Perform the actual API request

        This is a separate ``classmethod`` because it is needed both before
        and after an instance exists.
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        parsed = parse_url(archive_url)
        endpoint = (
            f"{parsed['protocol']}://{parsed['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}"
        )
        return requests.get(endpoint, timeout=DEFAULT_TIMEOUT)

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""
        if self._api_response is None:
            self._api_response = self._get_api_response(
                self.doi, self.archive_url
            )  # pragma: no cover
        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Update the cached API response"""
        self._api_response = response

    def download_url(self, file_name):
        """
        Get the download URL of a file in the archive from the repository
        API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If the latest version of the dataset has no file called
            *file_name*.
        """
        parsed = parse_url(self.archive_url)
        payload = self.api_response.json()
        # File name -> description of the file in the latest dataset version
        data_files = {
            entry["dataFile"]["filename"]: entry["dataFile"]
            for entry in payload["data"]["latestVersion"]["files"]
        }
        if file_name not in data_files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Files are served by ID from the same host as the archive
        file_id = data_files[file_name]["id"]
        return f"{parsed['protocol']}://{parsed['netloc']}/api/access/datafile/{file_id}"

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch using the repository API.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        listing = self.api_response.json()["data"]["latestVersion"]["files"]
        for entry in listing:
            data_file = entry["dataFile"]
            pooch.registry[data_file["filename"]] = f"md5:{data_file['md5']}"

Mercurial > repos > rliterman > csp2

comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 69:33d812a61356