annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # Copyright (c) 2018 The Pooch Developers.
jpayne@69 2 # Distributed under the terms of the BSD 3-Clause License.
jpayne@69 3 # SPDX-License-Identifier: BSD-3-Clause
jpayne@69 4 #
jpayne@69 5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
jpayne@69 6 #
jpayne@69 7 """
jpayne@69 8 The classes that actually handle the downloads.
jpayne@69 9 """
jpayne@69 10 import os
jpayne@69 11 import sys
jpayne@69 12 import ftplib
jpayne@69 13
jpayne@69 14 import warnings
jpayne@69 15
jpayne@69 16 from .utils import parse_url
jpayne@69 17
jpayne@69 18 try:
jpayne@69 19 from tqdm import tqdm
jpayne@69 20 except ImportError:
jpayne@69 21 tqdm = None
jpayne@69 22
jpayne@69 23 try:
jpayne@69 24 import paramiko
jpayne@69 25 except ImportError:
jpayne@69 26 paramiko = None
jpayne@69 27
jpayne@69 28
jpayne@69 29 # Set the default timeout in seconds so it can be configured in a pinch for the
jpayne@69 30 # methods that don't or can't expose a way to set it at runtime.
jpayne@69 31 # See https://github.com/fatiando/pooch/issues/409
jpayne@69 32 DEFAULT_TIMEOUT = 30
jpayne@69 33
jpayne@69 34
def choose_downloader(url, progressbar=False):
    """
    Pick and instantiate the downloader that matches the protocol of a URL.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`.

    Raises
    ------
    ValueError
        If the protocol of *url* has no associated downloader.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    # Protocol name -> downloader class. The insertion order is kept because
    # it shows up in the error message below.
    by_protocol = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }

    protocol = parse_url(url)["protocol"]
    if protocol in by_protocol:
        return by_protocol[protocol](progressbar=progressbar)
    raise ValueError(
        f"Unrecognized URL protocol '{protocol}' in '{url}'. "
        f"Must be one of {by_protocol.keys()}."
    )
jpayne@69 88
jpayne@69 89
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Raises
    ------
    ValueError
        If ``progressbar=True`` and ``tqdm`` is not installed.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # Only the built-in (tqdm) progress bar needs the optional dependency;
        # a user-supplied progress bar object does not.
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        kwargs.setdefault("stream", True)
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        response = None
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            response.raise_for_status()
            content = response.iter_content(chunk_size=self.chunk_size)
            total = int(response.headers.get("content-length", 0))
            if self.progressbar is True:
                # Need to use ascii characters on Windows because there isn't
                # always full unicode support
                # (see https://github.com/tqdm/tqdm/issues/454)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=total,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            elif self.progressbar:
                progress = self.progressbar
                progress.total = total
            for chunk in content:
                if chunk:
                    output_file.write(chunk)
                    output_file.flush()
                    if self.progressbar:
                        # Use the chunk size here because chunk may be much
                        # larger if the data are decompressed by requests after
                        # reading (happens with text files).
                        progress.update(self.chunk_size)
            # Make sure the progress bar gets filled even if the actual number
            # of chunks is smaller than expected. This happens when streaming
            # text files that are compressed by the server when sending (gzip).
            # Binary files don't experience this.
            if self.progressbar:
                progress.reset()
                progress.update(total)
                progress.close()
        finally:
            try:
                # A streamed response keeps its connection checked out until
                # the body is fully consumed or the response is closed, so
                # close it explicitly (matters when the download fails midway).
                if response is not None:
                    response.close()
            finally:
                if ispath:
                    output_file.close()
        return None
jpayne@69 261
jpayne@69 262
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. **Custom progress bars are not yet supported.**
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    Raises
    ------
    ValueError
        If ``progressbar=True`` and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            **without downloading the file**. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        # Handle to a file opened by this method (None when the caller gave us
        # a file-like object, which the caller remains responsible for).
        opened_here = None
        try:
            if not hasattr(output_file, "write"):
                # Opened inside the try so that the FTP connection is still
                # shut down if the local file cannot be created.
                # pylint: disable=consider-using-with
                opened_here = open(output_file, "w+b")
                # pylint: enable=consider-using-with
                output_file = opened_here
            ftp.login(user=self.username, passwd=self.password, acct=self.account)
            command = f"RETR {parsed_url['path']}"
            if self.progressbar:
                # Make sure the file is set to binary mode, otherwise we can't
                # get the file size. See: https://stackoverflow.com/a/22093848
                ftp.voidcmd("TYPE I")
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=int(ftp.size(parsed_url["path"])),
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(data):
                        "Update the progress bar and write to output"
                        progress.update(len(data))
                        output_file.write(data)

                    ftp.retrbinary(command, callback, blocksize=self.chunk_size)
            else:
                ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size)
        finally:
            try:
                # QUIT may itself raise (e.g. after a failed login or a broken
                # connection); the local file must be closed regardless.
                ftp.quit()
            finally:
                if opened_here is not None:
                    opened_here.close()
        return None
jpayne@69 393
jpayne@69 394
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance; not used by this class when connecting.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard
        error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
        be installed.

    Raises
    ------
    ValueError
        If ``paramiko`` is not installed, or if a progress bar is requested
        and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # settimeout is a method: it has to be called. Assigning to it
            # would replace the method and leave the timeout unset.
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Advance the progress bar to the bytes transferred so far"
                        progress.total = int(total)
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            try:
                # Close the SFTP client before the transport it runs on.
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
jpayne@69 504
jpayne@69 505
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. This allows "URL"s to be specified with the DOI instead of the
    actual HTTP download link. Uses the :mod:`requests` library to manage
    downloads and interact with the APIs.

    The **format of the "URL"** is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories **will not work** since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        # Everything is stored as given and handed to HTTPDownloader at call
        # time.
        self.chunk_size = chunk_size
        self.progressbar = progressbar
        self.kwargs = kwargs

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        Uses the repository's API to determine the actual HTTP download URL
        from the given DOI, then delegates the transfer to
        :class:`pooch.HTTPDownloader`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """
        doi_parts = parse_url(url)
        repository = doi_to_repository(doi_parts["netloc"])

        # The file name is the path component without its leading slash.
        target = doi_parts["path"]
        target = target[1:] if target[0] == "/" else target
        resolved_url = repository.download_url(target)

        # Hand the resolved HTTP link over to a regular HTTP download.
        http_download = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        http_download(resolved_url, output_file, pooch)
jpayne@69 628
jpayne@69 629
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of the archive it points to.

    Sends a request to ``https://doi.org/{doi}`` and reports the URL of the
    response obtained.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    Raises
    ------
    ValueError
        If the response has a 4xx or 5xx status code.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # Ask doi.org where the DOI leads; the response URL is the archive page.
    resolved = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
    url = resolved.url
    status = resolved.status_code
    if not (status < 400 or status >= 600):
        raise ValueError(
            f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
        )
    return url
jpayne@69 656
jpayne@69 657
def doi_to_repository(doi):
    """
    Build the data repository object that can handle a given DOI.

    Each known repository class is offered the resolved archive URL in turn
    (chain of responsibility); the first one that accepts it wins.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the known repository classes accepts the archive URL.
    """

    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    if doi[-1] == "/":
        doi = doi[:-1]

    candidates = (
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    )

    # Find out which website the DOI actually leads to
    archive_url = doi_to_url(doi)

    # Offer the archive to each repository class, stopping at the first
    # one that recognizes it
    for candidate in candidates:
        data_repository = candidate.initialize(
            archive_url=archive_url,
            doi=doi,
        )
        if data_repository is not None:
            return data_repository

    # Nobody claimed the archive
    repository = parse_url(archive_url)["netloc"]
    raise ValueError(
        f"Invalid data repository '{repository}'. "
        "To request or contribute support for this repository, "
        "please open an issue at https://github.com/fatiando/pooch/issues"
    )
jpayne@69 709
jpayne@69 710
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    # Base interface for data repositories: ``initialize`` declines every URL
    # and the remaining methods must be provided by subclasses.

    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Create a repository object if the URL belongs to this repository.

        This is one link in a chain of responsibility: a class that cannot
        handle the given repository URL answers with `None`, otherwise it
        answers with a `DataRepository` instance. This base implementation
        always answers `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Look up the download URL of a file in the archive through the
        repository API.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch through the data repository's API.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """

        raise NotImplementedError  # pragma: no cover
jpayne@69 762
jpayne@69 763
class ZenodoRepository(DataRepository):
    """
    Data repository handler for archives hosted on ``zenodo.org``.

    The record metadata is fetched lazily from the Zenodo records API the
    first time :attr:`api_response` is read and is cached afterwards. Both
    the legacy and the post-InvenioRDM ("new") response formats are
    supported; see :attr:`api_version`.
    """

    base_api_url = "https://zenodo.org/api/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Both caches are filled on first access of the matching property
        self._api_response = None
        self._api_version = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        repository : ZenodoRepository or None
            ``None`` when the network location of ``archive_url`` is not
            ``zenodo.org``.
        """

        # Check whether this is a Zenodo URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "zenodo.org":
            return None

        return cls(doi, archive_url)

    @property
    def api_response(self):
        """Cached API response from Zenodo"""
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # The record ID is the last path segment of the archive URL.
            # Drop trailing slashes first: without that, a URL ending in "/"
            # would give an empty ID and the request would hit the record
            # listing endpoint instead of the record itself.
            article_id = self.archive_url.rstrip("/").split("/")[-1]
            self._api_response = requests.get(
                f"{self.base_api_url}/{article_id}",
                timeout=DEFAULT_TIMEOUT,
            ).json()

        return self._api_response

    @property
    def api_version(self):
        """
        Version of the Zenodo API we are interacting with

        The versions can either be :

        - ``"legacy"``: corresponds to the Zenodo API that was supported until
          2023-10-12 (before the migration to InvenioRDM).
        - ``"new"``: corresponds to the new API that went online on 2023-10-13
          after the migration to InvenioRDM.

        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
        one and could probably be replaced by an updated version that restores
        the behaviour of the ``"legacy"`` one.

        Returns
        -------
        str

        Raises
        ------
        ValueError
            If the file entries of the response carry neither a ``"key"``
            nor a ``"filename"`` field consistently.
        """
        if self._api_version is None:
            files = self.api_response["files"]
            # The two formats are told apart by the name of the field that
            # holds the file name in every file entry
            if all("key" in file for file in files):
                self._api_version = "legacy"
            elif all("filename" in file for file in files):
                self._api_version = "new"
            else:
                raise ValueError(
                    "Couldn't determine the version of the Zenodo API for "
                    f"{self.archive_url} (doi:{self.doi})."
                )
        return self._api_version

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the API response.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        link to the desired files that appears in the API response leads to 404
        errors (by 2023-10-17). The files are available in the following url:
        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

        Because that URL is assembled by hand, the file name is
        percent-encoded before being inserted, so that characters such as
        ``#``, ``?`` or ``%`` in a file name are not mistaken for URL syntax.

        This method supports both the legacy and the new API.
        """
        # Lazy import, following the convention used for requests in this file
        from urllib.parse import quote  # pylint: disable=C0415

        legacy = self.api_version == "legacy"
        # Index the file entries by name (the field name depends on the API)
        name_field = "key" if legacy else "filename"
        files = {item[name_field]: item for item in self.api_response["files"]}
        # Check if file exists in the repository
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Build download url
        if legacy:
            # The legacy API hands out a working link for every file
            return files[file_name]["links"]["self"]
        article_id = self.api_response["id"]
        return (
            f"https://zenodo.org/records/{article_id}"
            f"/files/{quote(file_name)}?download=1"
        )

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        checksums for each file listed in the API reference is now an md5 sum.

        This method supports both the legacy and the new API.
        """
        # The API version is the same for every file: resolve it only once
        legacy = self.api_version == "legacy"
        for filedata in self.api_response["files"]:
            checksum = filedata["checksum"]
            if legacy:
                # Legacy checksums are stored as they come from the API
                name = filedata["key"]
            else:
                # The new API reports a bare md5 sum: add the algorithm prefix
                name = filedata["filename"]
                checksum = f"md5:{checksum}"
            pooch.registry[name] = checksum
jpayne@69 915
jpayne@69 916
class FigshareRepository(DataRepository):
    """
    Data repository handler for archives hosted on ``figshare.com``.

    The list of files of the article is fetched lazily from the Figshare
    API the first time :attr:`api_response` is read and is cached
    afterwards.
    """

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Filled on first access of the ``api_response`` property
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        repository : FigshareRepository or None
            ``None`` when the network location of ``archive_url`` is not
            ``figshare.com``.
        """

        # Check whether this is a Figshare URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        The version is taken from the last dot-separated component of the
        DOI suffix, which must look like ``v<digits>`` (e.g. ``v2``).

        Returns
        -------
        version : int or None
            ``None`` if version is not available in the doi.
        """
        # Get suffix of the doi. Split on the first slash only: the suffix
        # is allowed to contain slashes itself.
        suffix = self.doi.split("/", 1)[-1]
        # Split the suffix by dots and keep the last part
        last_part = suffix.split(".")[-1]
        # Parse the version from the last part. Anything that is not a "v"
        # followed by digits (including an empty last part) is not a version.
        digits = last_part[1:]
        if not last_part.startswith("v") or not digits.isdecimal():
            return None
        return int(digits)

    @property
    def api_response(self):
        """Cached API response from Figshare"""
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            article = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=DEFAULT_TIMEOUT,
            ).json()[0]
            article_id = article["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                # Get list of files using article id and the version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and return the files in the figshare repository
            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the API response.
        """
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        download_url = files[file_name]["download_url"]
        return download_url

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Each file is registered under its name with the ``computed_md5``
        value of the API response, prefixed with ``md5:``.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
jpayne@69 1043
jpayne@69 1044
class DataverseRepository(DataRepository):
    """
    Data repository handler for DataVerse instances.

    Whether a host is a DataVerse instance is decided by querying its
    dataset API for the DOI; the response of that probe is kept as the
    cached API response of the resulting object.
    """

    def __init__(self, doi, archive_url):
        self.doi = doi
        self.archive_url = archive_url
        # Holds the response object of the dataset API once it is known
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Build a repository object if the archive URL belongs to a DataVerse.

        This is one link of a chain of responsibility: ``None`` is returned
        when this class cannot handle the repository, otherwise an instance
        of it is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        repository : DataverseRepository or None
            ``None`` when the dataset API answers with a status code in the
            400-599 range.
        """
        # Query the host as if it was a DataVerse instance
        probe = cls._get_api_response(doi, archive_url)

        # Anything outside the 4xx/5xx range counts as a successful probe
        if probe.status_code < 400 or probe.status_code >= 600:
            # Reuse the probe so the same request is not made a second time
            instance = cls(doi, archive_url)
            instance.api_response = probe
            return instance

        # An error status means this is probably not a DataVerse instance
        return None

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Send the dataset request to the host of the archive URL.

        Kept as a ``classmethod`` of its own because it is needed both
        before an instance exists (while probing) and afterwards.
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        url_parts = parse_url(archive_url)
        endpoint = (
            f"{url_parts['protocol']}://{url_parts['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}"
        )
        return requests.get(endpoint, timeout=DEFAULT_TIMEOUT)

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""
        if self._api_response is not None:
            return self._api_response

        self._api_response = self._get_api_response(
            self.doi, self.archive_url
        )  # pragma: no cover
        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Replace the cached API response"""
        self._api_response = response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not among the files of the latest version
            listed in the API response.
        """
        url_parts = parse_url(self.archive_url)
        payload = self.api_response.json()

        # Index the data files of the latest version by their file name
        data_files = {}
        for entry in payload["data"]["latestVersion"]["files"]:
            data_files[entry["dataFile"]["filename"]] = entry["dataFile"]

        if file_name in data_files:
            # The download endpoint is addressed through the file id
            file_id = data_files[file_name]["id"]
            return (
                f"{url_parts['protocol']}://{url_parts['netloc']}"
                f"/api/access/datafile/{file_id}"
            )

        raise ValueError(
            f"File '{file_name}' not found in data archive "
            f"{self.archive_url} (doi:{self.doi})."
        )

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Each file of the latest version is registered under its file name
        with its ``md5`` value, prefixed with ``md5:``.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        listing = self.api_response.json()["data"]["latestVersion"]["files"]
        for entry in listing:
            checksum = f"md5:{entry['dataFile']['md5']}"
            pooch.registry[entry["dataFile"]["filename"]] = checksum