annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # Copyright (c) 2018 The Pooch Developers.
jpayne@68 2 # Distributed under the terms of the BSD 3-Clause License.
jpayne@68 3 # SPDX-License-Identifier: BSD-3-Clause
jpayne@68 4 #
jpayne@68 5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
jpayne@68 6 #
jpayne@68 7 """
jpayne@68 8 The classes that actually handle the downloads.
jpayne@68 9 """
jpayne@68 10 import os
jpayne@68 11 import sys
jpayne@68 12 import ftplib
jpayne@68 13
jpayne@68 14 import warnings
jpayne@68 15
jpayne@68 16 from .utils import parse_url
jpayne@68 17
jpayne@68 18 try:
jpayne@68 19 from tqdm import tqdm
jpayne@68 20 except ImportError:
jpayne@68 21 tqdm = None
jpayne@68 22
jpayne@68 23 try:
jpayne@68 24 import paramiko
jpayne@68 25 except ImportError:
jpayne@68 26 paramiko = None
jpayne@68 27
jpayne@68 28
jpayne@68 29 # Set the default timeout in seconds so it can be configured in a pinch for the
jpayne@68 30 # methods that don't or can't expose a way to set it at runtime.
jpayne@68 31 # See https://github.com/fatiando/pooch/issues/409
jpayne@68 32 DEFAULT_TIMEOUT = 30
jpayne@68 33
jpayne@68 34
def choose_downloader(url, progressbar=False):
    """
    Pick the downloader that matches the protocol of the given URL.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`,
        created with the given *progressbar* setting.

    Raises
    ------
    ValueError
        If the protocol of the URL has no associated downloader.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    # Protocol name -> downloader class. The insertion order is visible in
    # the error message below, so keep it stable.
    known_downloaders = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }

    protocol = parse_url(url)["protocol"]
    downloader_class = known_downloaders.get(protocol)
    if downloader_class is None:
        raise ValueError(
            f"Unrecognized URL protocol '{protocol}' in '{url}'. "
            f"Must be one of {known_downloaders.keys()}."
        )
    return downloader_class(progressbar=progressbar)
jpayne@68 88
jpayne@68 89
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # Only the built-in bar (progressbar=True) depends on tqdm; custom
        # progress bar objects are used as given.
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            # A HEAD request (following redirects) is enough to know whether
            # the file is there; only a final 200 counts as available.
            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        # Work on a copy so that the instance can be reused for other calls.
        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        kwargs.setdefault("stream", True)
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        # Track what was acquired so the finally clause can release exactly
        # that, including when the download fails half way through.
        response = None
        progress = None
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            response.raise_for_status()
            content = response.iter_content(chunk_size=self.chunk_size)
            total = int(response.headers.get("content-length", 0))
            if self.progressbar is True:
                # Need to use ascii characters on Windows because there isn't
                # always full unicode support
                # (see https://github.com/tqdm/tqdm/issues/454)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=total,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            elif self.progressbar:
                progress = self.progressbar
                progress.total = total
            for chunk in content:
                if chunk:
                    output_file.write(chunk)
                    output_file.flush()
                    if progress is not None:
                        # Use the chunk size here because chunk may be much
                        # larger if the data are decompressed by requests after
                        # reading (happens with text files).
                        progress.update(self.chunk_size)
            # Make sure the progress bar gets filled even if the actual number
            # of chunks is smaller than expected. This happens when streaming
            # text files that are compressed by the server when sending (gzip).
            # Binary files don't experience this.
            if progress is not None:
                progress.reset()
                progress.update(total)
        finally:
            try:
                # Close the bar and release the streamed connection on both
                # success and failure so an interrupted download does not
                # leave them dangling.
                if progress is not None:
                    progress.close()
                if response is not None:
                    response.close()
            finally:
                # Only close files that were opened here; file objects passed
                # in by the caller stay under the caller's control.
                if ispath:
                    output_file.close()
        return None
jpayne@68 261
jpayne@68 262
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. **Custom progress bars are not yet supported.**
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            # Look for the file name in the listing of its parent directory.
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        ispath = not hasattr(output_file, "write")
        if ispath:
            try:
                # pylint: disable=consider-using-with
                output_file = open(output_file, "w+b")
                # pylint: enable=consider-using-with
            except OSError:
                # Don't leave the already established connection open when
                # the output file can't be created.
                ftp.close()
                raise
        try:
            ftp.login(user=self.username, passwd=self.password, acct=self.account)
            command = f"RETR {parsed_url['path']}"
            if self.progressbar:
                # Make sure the file is set to binary mode, otherwise we can't
                # get the file size. See: https://stackoverflow.com/a/22093848
                ftp.voidcmd("TYPE I")
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=int(ftp.size(parsed_url["path"])),
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(data):
                        "Update the progress bar and write to output"
                        progress.update(len(data))
                        output_file.write(data)

                    ftp.retrbinary(command, callback, blocksize=self.chunk_size)
            else:
                ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size)
        finally:
            try:
                ftp.quit()
            finally:
                # quit() can raise (e.g., after a failed login or a dropped
                # connection); the file opened above must be closed anyway.
                if ispath:
                    output_file.close()
        return None
jpayne@68 393
jpayne@68 394
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance but not used when connecting.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard
        error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
        be installed.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # settimeout is a method of the channel: it has to be called.
            # Assigning to it would only overwrite the attribute and leave the
            # channel without any timeout.
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Update the progress bar with the bytes transferred"
                        # paramiko reports cumulative byte counts while tqdm
                        # expects increments.
                        progress.total = int(total)
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            # Close the SFTP session before the transport that carries it and
            # make sure the transport is closed even if that fails.
            try:
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
jpayne@68 504
jpayne@68 505
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. Allowing "URL"s to be specified with the DOI instead of the actual
    HTTP download link. Uses the :mod:`requests` library to manage downloads
    and interact with the APIs.

    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories **will not work** since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        # Everything is stored as given and handed over to HTTPDownloader when
        # the instance is called.
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        Uses the repository's API to determine the actual HTTP download URL
        from the given DOI.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """

        parsed_url = parse_url(url)
        data_repository = doi_to_repository(parsed_url["netloc"])

        # Resolve the URL
        file_name = parsed_url["path"]
        # Remove the leading slash in the path. startswith is used instead of
        # indexing so that an empty path doesn't fail with an IndexError
        # before the repository gets the chance to report the missing file.
        if file_name.startswith("/"):
            file_name = file_name[1:]
        download_url = data_repository.download_url(file_name)

        # Instantiate the downloader object
        downloader = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        downloader(download_url, output_file, pooch)
jpayne@68 628
jpayne@68 629
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of the archive it points to.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    Raises
    ------
    ValueError
        If the final response has a 4xx or 5xx status code.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # Let doi.org redirect us to the repository website; the URL of the
    # final response is the address of the archive.
    response = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
    url = response.url
    if not 400 <= response.status_code < 600:
        return url
    raise ValueError(
        f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
    )
jpayne@68 656
jpayne@68 657
def doi_to_repository(doi):
    """
    Instantiate a data repository instance from a given DOI.

    This function implements the chain of responsibility dispatch
    to the correct data repository class.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the known repository classes can handle the archive that
        the DOI resolves to.
    """

    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    # (endswith is used instead of indexing so that an empty DOI doesn't
    # fail here with an IndexError.)
    if doi.endswith("/"):
        doi = doi[:-1]

    repositories = [
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    ]

    # Extract the DOI and the repository information
    archive_url = doi_to_url(doi)

    # Try the repositories one by one and stop at the first one that accepts
    # the archive URL, so that the remaining ones are never consulted.
    for repo in repositories:
        data_repository = repo.initialize(
            archive_url=archive_url,
            doi=doi,
        )
        if data_repository is not None:
            return data_repository

    repository = parse_url(archive_url)["netloc"]
    raise ValueError(
        f"Invalid data repository '{repository}'. "
        "To request or contribute support for this repository, "
        "please open an issue at https://github.com/fatiando/pooch/issues"
    )
jpayne@68 709
jpayne@68 710
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    """
    Base class defining the interface of the data repository classes.

    ``initialize`` returns ``None`` here and the remaining methods raise
    :class:`NotImplementedError`; subclasses are expected to override them.
    """

    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Create a repository object if this class can handle the given URL.

        This is one step of a chain of responsibility: a class that does
        not recognize the repository URL answers with `None`, otherwise it
        hands back a `DataRepository` instance. This base implementation
        always answers with `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Ask the repository API for the download URL of a file in the
        archive.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill in the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        raise NotImplementedError  # pragma: no cover
jpayne@68 762
jpayne@68 763
class ZenodoRepository(DataRepository):  # pylint: disable=missing-class-docstring
    # Handler for archives hosted on zenodo.org. The API response and the
    # detected API flavour ("legacy" or "new") are fetched lazily and cached
    # on the instance.
    base_api_url = "https://zenodo.org/api/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        self._api_response = None
        self._api_version = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        ZenodoRepository or None
            A new instance when the URL's network location is
            ``zenodo.org``, otherwise `None`.
        """

        # Check whether this is a Zenodo URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "zenodo.org":
            return None

        return cls(doi, archive_url)

    @property
    def api_response(self):
        """
        Cached API response from Zenodo

        The request is made on first access only. HTTP error statuses raise
        (via ``raise_for_status``) instead of being cached, so a failed
        request does not later show up as a missing ``"files"`` key.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # The record id is the last path component of the archive URL.
            # Strip trailing slashes first so the id is never empty.
            article_id = self.archive_url.rstrip("/").split("/")[-1]
            response = requests.get(
                f"{self.base_api_url}/{article_id}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Fail loudly on 4xx/5xx before parsing and caching the body
            response.raise_for_status()
            self._api_response = response.json()

        return self._api_response

    @property
    def api_version(self):
        """
        Version of the Zenodo API we are interacting with

        The versions can either be :

        - ``"legacy"``: corresponds to the Zenodo API that was supported until
          2023-10-12 (before the migration to InvenioRDM).
        - ``"new"``: corresponds to the new API that went online on 2023-10-13
          after the migration to InvenioRDM.

        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
        one and could probably be replaced by an updated version that restores
        the behaviour of the ``"legacy"`` one.

        Returns
        -------
        str

        Raises
        ------
        ValueError
            If the file entries carry neither a ``"key"`` nor a
            ``"filename"`` field consistently.
        """
        if self._api_version is None:
            files = self.api_response["files"]
            # The two API flavours are told apart by the field that holds
            # the file name in every file entry.
            if all("key" in file for file in files):
                self._api_version = "legacy"
            elif all("filename" in file for file in files):
                self._api_version = "new"
            else:
                raise ValueError(
                    "Couldn't determine the version of the Zenodo API for "
                    f"{self.archive_url} (doi:{self.doi})."
                )
        return self._api_version

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the archive.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        link to the desired files that appears in the API response leads to 404
        errors (by 2023-10-17). The files are available in the following url:
        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

        This method supports both the legacy and the new API.
        """
        is_legacy = self.api_version == "legacy"
        # Index the files in the repository by name
        if is_legacy:
            files = {item["key"]: item for item in self.api_response["files"]}
        else:
            files = {item["filename"] for item in self.api_response["files"]}
        # Check if file exists in the repository
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Build download url
        if is_legacy:
            return files[file_name]["links"]["self"]
        article_id = self.api_response["id"]
        return (
            f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"
        )

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        checksums for each file listed in the API reference is now an md5 sum.

        This method supports both the legacy and the new API.
        """
        files = self.api_response["files"]
        if not files:
            return
        # The API flavour is the same for every file, so resolve it once
        is_legacy = self.api_version == "legacy"
        for filedata in files:
            checksum = filedata["checksum"]
            if is_legacy:
                # Legacy checksums are stored exactly as returned by the API
                pooch.registry[filedata["key"]] = checksum
            else:
                # New API checksums are bare md5 sums: add the algorithm prefix
                pooch.registry[filedata["filename"]] = f"md5:{checksum}"
jpayne@68 915
jpayne@68 916
class FigshareRepository(DataRepository):  # pylint: disable=missing-class-docstring
    # Handler for archives hosted on figshare.com. The list of files returned
    # by the Figshare API is fetched lazily and cached on the instance.
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        FigshareRepository or None
            A new instance when the URL's network location is
            ``figshare.com``, otherwise `None`.
        """

        # Check whether this is a Figshare URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        The version is the last dot-separated part of the DOI suffix when it
        has the form ``v<digits>`` (e.g. ``.v2``).

        Returns
        -------
        int or None
            The version number, or None if version is not available in the
            doi.
        """
        # Get suffix of the doi: everything after the first "/". Splitting
        # only once keeps suffixes that contain "/" themselves from breaking
        # the parsing.
        suffix = self.doi.split("/", 1)[-1]
        # Split the suffix by dots and keep the last part
        last_part = suffix.split(".")[-1]
        # Only "v" followed by decimal digits is a version. Anything else
        # (including an empty last part) means no version was given.
        if not last_part.startswith("v") or not last_part[1:].isdecimal():
            return None
        return int(last_part[1:])

    @property
    def api_response(self):
        """
        Cached API response from Figshare

        Holds the list of file entries of the article (the ``"files"`` field
        of the article API response). Requests are made on first access only.

        Raises
        ------
        ValueError
            If Figshare doesn't know any article for the DOI.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            lookup = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=DEFAULT_TIMEOUT,
            )
            lookup.raise_for_status()
            articles = lookup.json()
            # An unknown DOI yields an empty list: report it explicitly
            # instead of failing with a bare IndexError.
            if not articles:
                raise ValueError(
                    f"No Figshare article found for doi '{self.doi}' "
                    f"({self.archive_url})."
                )
            article_id = articles[0]["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                # Get list of files using article id and the version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and return the files in the figshare repository
            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the archive.
        """
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
            )
        download_url = files[file_name]["download_url"]
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Every file is registered under its name with the md5 checksum
        reported by Figshare (``computed_md5``).

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
jpayne@68 1043
jpayne@68 1044
class DataverseRepository(DataRepository):  # pylint: disable=missing-class-docstring
    # Handler for archives hosted on a DataVerse instance. Unlike handlers
    # that look at the host name, this one probes the DataVerse API of the
    # archive's host and keeps the response for later use.
    def __init__(self, doi, archive_url):
        self.doi = doi
        self.archive_url = archive_url
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Build a repository object if the host answers like a DataVerse.

        This is one link in a chain of responsibility. The DataVerse API of
        the archive's host is queried for the DOI: an HTTP status in the
        400-599 range means the class cannot handle the URL and `None` is
        returned. Otherwise an instance is returned with the response
        already stored as its cached API response.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        DataverseRepository or None
        """
        # Probe the host as if it was a DataVerse instance
        probe = cls._get_api_response(doi, archive_url)

        # A client or server error means this is probably not a DataVerse
        status = probe.status_code
        if status >= 400 and status < 600:
            return None

        # Reuse the probe response so the instance doesn't query again
        instance = cls(doi, archive_url)
        instance.api_response = probe
        return instance

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Send the dataset query to the DataVerse API and return the response.

        Kept as a ``classmethod`` so it can be called both before an
        instance exists and from an instance.
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        url_parts = parse_url(archive_url)
        endpoint = (
            f"{url_parts['protocol']}://{url_parts['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}"
        )
        return requests.get(endpoint, timeout=DEFAULT_TIMEOUT)

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""

        if self._api_response is not None:
            return self._api_response

        # Nothing cached yet: query the API and remember the response
        self._api_response = self._get_api_response(
            self.doi, self.archive_url
        )  # pragma: no cover
        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Replace the cached API response"""

        self._api_response = response

    def download_url(self, file_name):
        """
        Build the download URL of a file from its id in the API response.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the latest version of the
            dataset.
        """
        url_parts = parse_url(self.archive_url)
        payload = self.api_response.json()
        # Index the data file records of the latest version by file name
        datafiles = {}
        for entry in payload["data"]["latestVersion"]["files"]:
            datafile = entry["dataFile"]
            datafiles[datafile["filename"]] = datafile
        if file_name not in datafiles:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # The access endpoint addresses files by their id
        file_id = datafiles[file_name]["id"]
        return (
            f"{url_parts['protocol']}://{url_parts['netloc']}/api/access/datafile/"
            f"{file_id}"
        )

    def populate_registry(self, pooch):
        """
        Register every file of the dataset's latest version with its md5.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        entries = self.api_response.json()["data"]["latestVersion"]["files"]
        for entry in entries:
            datafile = entry["dataFile"]
            md5_hash = f"md5:{datafile['md5']}"
            pooch.registry[datafile["filename"]] = md5_hash