csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

rev	line source
jpayne@69	1 # Copyright (c) 2018 The Pooch Developers.
jpayne@69	2 # Distributed under the terms of the BSD 3-Clause License.
jpayne@69	3 # SPDX-License-Identifier: BSD-3-Clause
jpayne@69	4 #
jpayne@69	5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
jpayne@69	6 #
jpayne@69	7 """
jpayne@69	8 The classes that actually handle the downloads.
jpayne@69	9 """
jpayne@69	10 import os
jpayne@69	11 import sys
jpayne@69	12 import ftplib
jpayne@69	13
jpayne@69	14 import warnings
jpayne@69	15
jpayne@69	16 from .utils import parse_url
jpayne@69	17
jpayne@69	18 try:
jpayne@69	19 from tqdm import tqdm
jpayne@69	20 except ImportError:
jpayne@69	21 tqdm = None
jpayne@69	22
jpayne@69	23 try:
jpayne@69	24 import paramiko
jpayne@69	25 except ImportError:
jpayne@69	26 paramiko = None
jpayne@69	27
jpayne@69	28
jpayne@69	29 # Set the default timeout in seconds so it can be configured in a pinch for the
jpayne@69	30 # methods that don't or can't expose a way to set it at runtime.
jpayne@69	31 # See https://github.com/fatiando/pooch/issues/409
jpayne@69	32 DEFAULT_TIMEOUT = 30
jpayne@69	33
jpayne@69	34
def choose_downloader(url, progressbar=False):
    """
    Pick and instantiate the downloader matching the protocol of a URL.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`,
        created with the given *progressbar*.

    Raises
    ------
    ValueError
        If the protocol of the URL is not one of the known protocols.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    # Maps each supported protocol to the class that knows how to fetch it.
    by_protocol = {
        "ftp": FTPDownloader,
        "https": HTTPDownloader,
        "http": HTTPDownloader,
        "sftp": SFTPDownloader,
        "doi": DOIDownloader,
    }

    protocol = parse_url(url)["protocol"]
    if protocol not in by_protocol:
        raise ValueError(
            f"Unrecognized URL protocol '{protocol}' in '{url}'. "
            f"Must be one of {by_protocol.keys()}."
        )
    downloader_class = by_protocol[protocol]
    return downloader_class(progressbar=progressbar)
jpayne@69	88
jpayne@69	89
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Raises
    ------
    ValueError
        If ``progressbar`` is ``True`` but ``tqdm`` is not installed.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # Only the built-in bar (progressbar=True) needs tqdm; a custom
        # progress bar object is used as given.
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded. Any
            object with a ``write`` attribute is treated as an already open
            file and is not closed by this method.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server
            without downloading the file. Will return ``True`` if the file
            exists and ``False`` otherwise. The check is a ``HEAD`` request
            that follows redirects; of the stored keyword arguments, only
            ``timeout`` is used for it.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        # Work on a copy so the stored kwargs are not mutated between calls.
        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        kwargs.setdefault("stream", True)
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        response = None
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            response.raise_for_status()
            content = response.iter_content(chunk_size=self.chunk_size)
            total = int(response.headers.get("content-length", 0))
            if self.progressbar is True:
                # Need to use ascii characters on Windows because there isn't
                # always full unicode support
                # (see https://github.com/tqdm/tqdm/issues/454)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=total,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
            elif self.progressbar:
                progress = self.progressbar
                progress.total = total
            for chunk in content:
                if chunk:
                    output_file.write(chunk)
                    output_file.flush()
                    if self.progressbar:
                        # Use the chunk size here because chunk may be much
                        # larger if the data are decompressed by requests after
                        # reading (happens with text files).
                        progress.update(self.chunk_size)
            # Make sure the progress bar gets filled even if the actual number
            # of chunks is smaller than expected. This happens when streaming
            # text files that are compressed by the server when sending (gzip).
            # Binary files don't experience this.
            if self.progressbar:
                progress.reset()
                progress.update(total)
                progress.close()
        finally:
            # A streamed response keeps its connection open until the body is
            # fully read or the response is closed, so close it explicitly.
            # This matters when the status check or a write fails midway.
            try:
                if response is not None:
                    response.close()
            finally:
                if ispath:
                    output_file.close()
        return None
jpayne@69	261
jpayne@69	262
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Custom progress bars are not yet supported.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    Raises
    ------
    ValueError
        If ``progressbar`` is ``True`` but ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded. Any
            object with a ``write`` attribute is treated as an already open
            file and is not closed by this method.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server
            without downloading the file. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        ftp = ftplib.FTP(timeout=self.timeout)
        ftp.connect(host=parsed_url["netloc"], port=self.port)

        if check_only:
            # The file is considered available when its name shows up in the
            # listing of its parent directory.
            directory, file_name = os.path.split(parsed_url["path"])
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
            return available

        ispath = not hasattr(output_file, "write")
        if ispath:
            try:
                # pylint: disable=consider-using-with
                output_file = open(output_file, "w+b")
                # pylint: enable=consider-using-with
            except Exception:
                # The connection is already open at this point; don't leak it
                # when the local file cannot be created.
                ftp.close()
                raise
        try:
            ftp.login(user=self.username, passwd=self.password, acct=self.account)
            command = f"RETR {parsed_url['path']}"
            if self.progressbar:
                # Make sure the file is set to binary mode, otherwise we can't
                # get the file size. See: https://stackoverflow.com/a/22093848
                ftp.voidcmd("TYPE I")
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=int(ftp.size(parsed_url["path"])),
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(data):
                        "Update the progress bar and write to output"
                        progress.update(len(data))
                        output_file.write(data)

                    ftp.retrbinary(command, callback, blocksize=self.chunk_size)
            else:
                ftp.retrbinary(command, output_file.write, blocksize=self.chunk_size)
        finally:
            # ftp.quit() talks to the server and can itself raise; make sure
            # the local file is still closed in that case.
            try:
                ftp.quit()
            finally:
                if ispath:
                    output_file.close()
        return None
jpayne@69	393
jpayne@69	394
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance but not used when connecting.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard
        error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
        be installed.

    Raises
    ------
    ValueError
        If ``paramiko`` is not installed, or if a progress bar is requested
        and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # settimeout is a method of the channel: it has to be called.
            # Assigning to it would silently discard the configured timeout.
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Update the progress bar with the bytes received so far"
                        progress.total = int(total)
                        # paramiko reports the cumulative byte count, tqdm
                        # wants the increment since the last update.
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            # Close the SFTP session before the transport that carries it, and
            # always close the transport even if closing the session fails.
            try:
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
jpayne@69	504
jpayne@69	505
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. This allows "URL"s to be specified with the DOI instead of the
    actual HTTP download link. Uses the :mod:`requests` library to manage
    downloads and interact with the APIs.

    The format of the "URL" is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories will not work since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed *chunk_size* bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        are handed to the :class:`pooch.HTTPDownloader` that performs the
        download, which passes them to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.chunk_size = chunk_size
        self.progressbar = progressbar
        self.kwargs = kwargs

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        The repository's API is asked for the actual HTTP download URL of the
        file named in the DOI "URL"; the download itself is delegated to a
        :class:`pooch.HTTPDownloader` configured like this instance.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """
        url_parts = parse_url(url)
        repository = doi_to_repository(url_parts["netloc"])

        # The file name is the URL path without its leading slash.
        name_in_archive = url_parts["path"]
        name_in_archive = (
            name_in_archive[1:] if name_in_archive[0] == "/" else name_in_archive
        )
        resolved_url = repository.download_url(name_in_archive)

        # Hand the real download over to the plain HTTP downloader.
        http_downloader = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        http_downloader(resolved_url, output_file, pooch)
jpayne@69	628
jpayne@69	629
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of its archive by following the doi.org link.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    Raises
    ------
    ValueError
        If the request ends with a 4xx or 5xx HTTP status code.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # doi.org redirects to the landing page on the repository website; the
    # final URL of the response is what we are after.
    resolved = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
    url = resolved.url
    status = resolved.status_code
    request_failed = 400 <= status < 600
    if request_failed:
        raise ValueError(
            f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
        )
    return url
jpayne@69	656
jpayne@69	657
def doi_to_repository(doi):
    """
    Build the data repository object that can handle a given DOI.

    The DOI is resolved to its archive URL and each known repository class
    is asked in turn (chain of responsibility) whether it can handle it. The
    first one that returns an instance wins.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the known repository classes accepts the archive URL.
    """

    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    doi = doi[:-1] if doi[-1] == "/" else doi

    # Candidates, in the order in which they are tried.
    candidates = (
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    )

    # Find out where the DOI actually points to
    archive_url = doi_to_url(doi)

    # Ask each candidate until one of them accepts the archive
    for candidate in candidates:
        data_repository = candidate.initialize(
            archive_url=archive_url,
            doi=doi,
        )
        if data_repository is not None:
            return data_repository

    repository = parse_url(archive_url)["netloc"]
    raise ValueError(
        f"Invalid data repository '{repository}'. "
        "To request or contribute support for this repository, "
        "please open an issue at https://github.com/fatiando/pooch/issues"
    )
jpayne@69	709
jpayne@69	710
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    # Base interface for the repository-specific classes: ``initialize``
    # declines by returning None and the remaining methods must be
    # overridden by subclasses.

    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Create the repository object if the URL belongs to this repository.

        This is one link in a chain of responsibility: a class that cannot
        handle the given repository URL returns `None`, otherwise it returns
        a `DataRepository` instance. This base implementation always
        returns `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Ask the repository API for the download URL of a file in the archive.

        Must be implemented by subclasses.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch using the data repository's API.

        Must be implemented by subclasses.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        raise NotImplementedError  # pragma: no cover
jpayne@69	762
jpayne@69	763
class ZenodoRepository(DataRepository):  # pylint: disable=missing-class-docstring
    # Endpoint of the Zenodo records API; the record id is appended to it.
    base_api_url = "https://zenodo.org/api/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Caches filled on first access of the matching properties
        self._api_response = None
        self._api_version = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        ZenodoRepository or None
            A new instance when the host of ``archive_url`` is
            ``zenodo.org``, `None` otherwise.
        """
        # Check whether this is a Zenodo URL
        if parse_url(archive_url)["netloc"] != "zenodo.org":
            return None

        return cls(doi, archive_url)

    @property
    def api_response(self):
        """
        Cached API response from Zenodo

        The request is made only on first access. The record id is taken
        from the last path component of the archive URL.

        Raises
        ------
        requests.HTTPError
            If Zenodo answers with a 4xx/5xx status code.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            article_id = self.archive_url.split("/")[-1]
            response = requests.get(
                f"{self.base_api_url}/{article_id}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Surface HTTP errors here instead of caching an error payload
            # that would only fail later with an unrelated KeyError.
            response.raise_for_status()
            self._api_response = response.json()

        return self._api_response

    @property
    def api_version(self):
        """
        Version of the Zenodo API we are interacting with

        The versions can either be :

        - ``"legacy"``: corresponds to the Zenodo API that was supported until
          2023-10-12 (before the migration to InvenioRDM).
        - ``"new"``: corresponds to the new API that went online on 2023-10-13
          after the migration to InvenioRDM.

        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
        one and could probably be replaced by an updated version that restores
        the behaviour of the ``"legacy"`` one.

        The version is detected from the keys of the file entries in the API
        response: ``"key"`` in every entry means legacy, ``"filename"`` in
        every entry means new.

        Returns
        -------
        str

        Raises
        ------
        ValueError
            If the file entries match neither layout.
        """
        if self._api_version is None:
            files = self.api_response["files"]
            if all("key" in file for file in files):
                self._api_version = "legacy"
            elif all("filename" in file for file in files):
                self._api_version = "new"
            else:
                raise ValueError(
                    "Couldn't determine the version of the Zenodo API for "
                    f"{self.archive_url} (doi:{self.doi})."
                )
        return self._api_version

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the API response.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        link to the desired files that appears in the API response leads to 404
        errors (by 2023-10-17). The files are available in the following url:
        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

        The file name is percent-encoded when building that URL so that
        characters such as ``#`` or ``?`` are not read as URL delimiters.

        This method supports both the legacy and the new API.
        """
        # Lazy import, only needed when building the URL for the new API
        from urllib.parse import quote  # pylint: disable=C0415

        legacy = self.api_version == "legacy"
        # Create list of files in the repository
        if legacy:
            files = {item["key"]: item for item in self.api_response["files"]}
        else:
            files = [item["filename"] for item in self.api_response["files"]]
        # Check if file exists in the repository
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Build download url
        if legacy:
            return files[file_name]["links"]["self"]
        article_id = self.api_response["id"]
        return (
            f"https://zenodo.org/records/{article_id}/files/"
            f"{quote(file_name)}?download=1"
        )

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        checksums for each file listed in the API reference is now an md5 sum.
        For the legacy API the checksum is stored exactly as returned; for the
        new API it is stored with an ``md5:`` prefix.

        This method supports both the legacy and the new API.
        """
        legacy = self.api_version == "legacy"
        for filedata in self.api_response["files"]:
            checksum = filedata["checksum"]
            if legacy:
                pooch.registry[filedata["key"]] = checksum
            else:
                pooch.registry[filedata["filename"]] = f"md5:{checksum}"
jpayne@69	915
jpayne@69	916
class FigshareRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Cache filled on first access of the ``api_response`` property
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        FigshareRepository or None
            A new instance when the host of ``archive_url`` is
            ``figshare.com``, `None` otherwise.
        """
        # Check whether this is a Figshare URL
        if parse_url(archive_url)["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        The version is read from the last dot-separated part of the DOI
        suffix when that part has the form ``v<integer>``.

        Return None if version is not available in the doi.
        """
        # Get suffix of the doi (everything after the first slash, so that
        # suffixes which themselves contain a slash don't break the parsing)
        suffix = self.doi.split("/", 1)[-1]
        # Split the suffix by dots and keep the last part
        last_part = suffix.split(".")[-1]
        # Parse the version from the last part
        if not last_part.startswith("v"):
            return None
        try:
            return int(last_part[1:])
        except ValueError:
            # Starts with "v" but isn't a version number (e.g. "video")
            return None

    @property
    def api_response(self):
        """
        Cached API response from Figshare

        The value is the list of file entries (the ``"files"`` field) of the
        article. Two requests are made on first access: one to find the
        article ID from the DOI and one to list the files of the article.

        Raises
        ------
        requests.HTTPError
            If either request is answered with a 4xx/5xx status code.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            lookup = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Check the status like the second request below does, so an
            # error payload isn't indexed as if it were the search result.
            lookup.raise_for_status()
            article_id = lookup.json()[0]["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and return the files in the figshare repository
            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the API response.
        """
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive {self.archive_url} (doi:{self.doi})."
            )
        return files[file_name]["download_url"]

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Each file is registered under its name with the ``computed_md5``
        value of the API response, prefixed by ``md5:``.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
jpayne@69	1043
jpayne@69	1044
class DataverseRepository(DataRepository):  # pylint: disable=missing-class-docstring
    def __init__(self, doi, archive_url):
        self.doi = doi
        self.archive_url = archive_url
        # Cached ``requests`` response of the dataset API call
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Create the repository object if the URL answers like a DataVerse.

        This is one link of a chain of responsibility: the dataset API of
        the host behind ``archive_url`` is queried, and when that request is
        answered with a 4xx/5xx status code the class declines by returning
        `None`. Otherwise a `DataRepository` instance is returned, with the
        response already stored as its cached API response.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """
        # Query the host as if it was a DataVerse instance
        response = cls._get_api_response(doi, archive_url)

        # A client or server error most likely means it isn't one
        request_failed = 400 <= response.status_code < 600
        if request_failed:
            return None

        # Reuse the response we already have instead of requesting it again
        instance = cls(doi, archive_url)
        instance.api_response = response
        return instance

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Perform the actual API request

        Kept as its own ``classmethod`` because it is needed both before an
        instance exists (in ``initialize``) and afterwards (in the
        ``api_response`` property).
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        url_parts = parse_url(archive_url)
        endpoint = (
            f"{url_parts['protocol']}://{url_parts['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}"
        )
        return requests.get(endpoint, timeout=DEFAULT_TIMEOUT)

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""
        if self._api_response is None:
            self._api_response = self._get_api_response(
                self.doi, self.archive_url
            )  # pragma: no cover

        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Update the cached API response"""
        self._api_response = response

    def download_url(self, file_name):
        """
        Build the download URL of a file from its id in the API response.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not among the files of the latest version.
        """
        url_parts = parse_url(self.archive_url)
        payload = self.api_response.json()
        # Index the file records of the latest version by file name
        files = {}
        for entry in payload["data"]["latestVersion"]["files"]:
            data_file = entry["dataFile"]
            files[data_file["filename"]] = data_file
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Generate download_url using the file id
        host = f"{url_parts['protocol']}://{url_parts['netloc']}"
        return f"{host}/api/access/datafile/{files[file_name]['id']}"

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Each file of the latest version is registered under its file name
        with its ``md5`` value, prefixed by ``md5:``.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        payload = self.api_response.json()
        for entry in payload["data"]["latestVersion"]["files"]:
            data_file = entry["dataFile"]
            pooch.registry[data_file["filename"]] = f"md5:{data_file['md5']}"

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 69:33d812a61356