csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 16:23:26 -0400
parents
children

rev	line source
jpayne@68	1 # Copyright (c) 2018 The Pooch Developers.
jpayne@68	2 # Distributed under the terms of the BSD 3-Clause License.
jpayne@68	3 # SPDX-License-Identifier: BSD-3-Clause
jpayne@68	4 #
jpayne@68	5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
jpayne@68	6 #
jpayne@68	7 """
jpayne@68	8 The classes that actually handle the downloads.
jpayne@68	9 """
jpayne@68	10 import os
jpayne@68	11 import sys
jpayne@68	12 import ftplib
jpayne@68	13
jpayne@68	14 import warnings
jpayne@68	15
jpayne@68	16 from .utils import parse_url
jpayne@68	17
jpayne@68	18 try:
jpayne@68	19 from tqdm import tqdm
jpayne@68	20 except ImportError:
jpayne@68	21 tqdm = None
jpayne@68	22
jpayne@68	23 try:
jpayne@68	24 import paramiko
jpayne@68	25 except ImportError:
jpayne@68	26 paramiko = None
jpayne@68	27
jpayne@68	28
jpayne@68	29 # Set the default timeout in seconds so it can be configured in a pinch for the
jpayne@68	30 # methods that don't or can't expose a way to set it at runtime.
jpayne@68	31 # See https://github.com/fatiando/pooch/issues/409
jpayne@68	32 DEFAULT_TIMEOUT = 30
jpayne@68	33
jpayne@68	34
def choose_downloader(url, progressbar=False):
    """
    Build the downloader that matches the protocol of the given URL.

    The protocol is extracted from the URL with ``parse_url`` and looked up in
    a table of known protocols. The matching downloader class is instantiated
    with the given ``progressbar`` argument.

    Parameters
    ----------
    url : str
        A URL (including protocol).
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    downloader
        An instance of a downloader class, like :class:`pooch.HTTPDownloader`,
        :class:`pooch.FTPDownloader`, or :class:`pooch.SFTPDownloader`.

    Raises
    ------
    ValueError
        If the protocol of the URL is not one of the known protocols.

    Examples
    --------

    >>> downloader = choose_downloader("http://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("https://something.com")
    >>> print(downloader.__class__.__name__)
    HTTPDownloader
    >>> downloader = choose_downloader("ftp://something.com")
    >>> print(downloader.__class__.__name__)
    FTPDownloader
    >>> downloader = choose_downloader("doi:DOI/filename.csv")
    >>> print(downloader.__class__.__name__)
    DOIDownloader

    """
    # Insertion order matters: it is shown verbatim in the error message.
    known_downloaders = dict(
        ftp=FTPDownloader,
        https=HTTPDownloader,
        http=HTTPDownloader,
        sftp=SFTPDownloader,
        doi=DOIDownloader,
    )

    protocol = parse_url(url)["protocol"]
    if protocol in known_downloaders:
        downloader_class = known_downloaders[protocol]
        return downloader_class(progressbar=progressbar)

    raise ValueError(
        f"Unrecognized URL protocol '{protocol}' in '{url}'. "
        f"Must be one of {known_downloaders.keys()}."
    )
jpayne@68	88
jpayne@68	89
class HTTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over HTTP/HTTPS.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`requests` library to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed chunk_size bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Raises
    ------
    ValueError
        If ``progressbar`` is True and ``tqdm`` is not installed.

    Examples
    --------

    Download one of the data files from the Pooch repository:

    >>> import os
    >>> from pooch import __version__, check_version
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> downloader = HTTPDownloader()
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Authentication can be handled by passing a user name and password to
    :func:`requests.get`. All arguments provided when creating an instance of
    the class are forwarded to :func:`requests.get`. We'll use
    ``auth=(username, password)`` to use basic HTTPS authentication. The
    https://httpbin.org website allows us to make a fake login request using
    whatever username and password we provide to it:

    >>> user = "doggo"
    >>> password = "goodboy"
    >>> # httpbin will ask for the user and password we provide in the URL
    >>> url = f"https://httpbin.org/basic-auth/{user}/{password}"
    >>> # Trying without the login credentials causes an error
    >>> downloader = HTTPDownloader()
    >>> try:
    ...     downloader(url=url, output_file="tiny-data.txt", pooch=None)
    ... except Exception:
    ...     print("There was an error!")
    There was an error!
    >>> # Pass in the credentials to HTTPDownloader
    >>> downloader = HTTPDownloader(auth=(user, password))
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> with open("tiny-data.txt") as f:
    ...     for line in f:
    ...         print(line.rstrip())
    {
      "authenticated": true,
      "user": "doggo"
    }
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        # Only the built-in bar (progressbar=True) needs tqdm; custom progress
        # bar objects are used as given.
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(
        self, url, output_file, pooch, check_only=False
    ):  # pylint: disable=R0914
        """
        Download the given URL over HTTP to the given output file.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            without downloading the file. Will return ``True`` if the file
            exists and ``False`` otherwise. The check is a HEAD request that
            uses only the configured ``timeout``; the other keyword arguments
            given to the constructor are not forwarded to it.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        if check_only:
            timeout = self.kwargs.get("timeout", DEFAULT_TIMEOUT)
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            available = bool(response.status_code == 200)
            return available

        # Work on a copy so the stored kwargs are not modified between calls.
        kwargs = self.kwargs.copy()
        timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        kwargs.setdefault("stream", True)
        ispath = not hasattr(output_file, "write")
        if ispath:
            # pylint: disable=consider-using-with
            output_file = open(output_file, "w+b")
            # pylint: enable=consider-using-with
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            try:
                response.raise_for_status()
                content = response.iter_content(chunk_size=self.chunk_size)
                total = int(response.headers.get("content-length", 0))
                if self.progressbar is True:
                    # Need to use ascii characters on Windows because there
                    # isn't always full unicode support
                    # (see https://github.com/tqdm/tqdm/issues/454)
                    use_ascii = bool(sys.platform == "win32")
                    progress = tqdm(
                        total=total,
                        ncols=79,
                        ascii=use_ascii,
                        unit="B",
                        unit_scale=True,
                        leave=True,
                    )
                elif self.progressbar:
                    progress = self.progressbar
                    progress.total = total
                for chunk in content:
                    if chunk:
                        output_file.write(chunk)
                        output_file.flush()
                        if self.progressbar:
                            # Use the chunk size here because chunk may be
                            # much larger if the data are decompressed by
                            # requests after reading (happens with text
                            # files).
                            progress.update(self.chunk_size)
                # Make sure the progress bar gets filled even if the actual
                # number of chunks is smaller than expected. This happens when
                # streaming text files that are compressed by the server when
                # sending (gzip). Binary files don't experience this.
                if self.progressbar:
                    progress.reset()
                    progress.update(total)
                    progress.close()
            finally:
                # A streamed response keeps its connection until it is fully
                # read or closed, so always close it (also when the status
                # check or a write fails midway).
                response.close()
        finally:
            if ispath:
                output_file.close()
        return None
jpayne@68	261
jpayne@68	262
class FTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over FTP.

    When called, downloads the given file URL into the specified local file.
    Uses the :mod:`ftplib` module to manage downloads.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the FTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous FTP). Use the empty string
        to indicate no password is required.
    account : str
        Some servers also require an "account" name for authentication.
    timeout : int
        Timeout in seconds for ftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Custom progress bars are not yet supported.
    chunk_size : int
        Files are streamed chunk_size bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.

    Raises
    ------
    ValueError
        If ``progressbar`` is True and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=21,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
        chunk_size=1024,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        self.chunk_size = chunk_size
        if self.progressbar is True and tqdm is None:
            raise ValueError("Missing package 'tqdm' required for progress bars.")

    def __call__(self, url, output_file, pooch, check_only=False):
        """
        Download the given URL over FTP to the given output file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        check_only : bool
            If True, will only check if a file exists on the server and
            without downloading the file. Will return ``True`` if the file
            exists and ``False`` otherwise.

        Returns
        -------
        availability : bool or None
            If ``check_only==True``, returns a boolean indicating if the file
            is available on the server. Otherwise, returns ``None``.

        """
        parsed_url = parse_url(url)
        # The FTP context manager sends QUIT on exit, ignores errors from an
        # already broken connection (so they can't mask the real failure) and
        # always closes the socket, including when opening the output file
        # fails.
        with ftplib.FTP(timeout=self.timeout) as ftp:
            ftp.connect(host=parsed_url["netloc"], port=self.port)

            if check_only:
                directory, file_name = os.path.split(parsed_url["path"])
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                return file_name in ftp.nlst(directory)

            ispath = not hasattr(output_file, "write")
            if ispath:
                # pylint: disable=consider-using-with
                output_file = open(output_file, "w+b")
                # pylint: enable=consider-using-with
            try:
                ftp.login(user=self.username, passwd=self.password, acct=self.account)
                command = f"RETR {parsed_url['path']}"
                if self.progressbar:
                    # Make sure the file is set to binary mode, otherwise we
                    # can't get the file size. See:
                    # https://stackoverflow.com/a/22093848
                    ftp.voidcmd("TYPE I")
                    use_ascii = bool(sys.platform == "win32")
                    progress = tqdm(
                        total=int(ftp.size(parsed_url["path"])),
                        ncols=79,
                        ascii=use_ascii,
                        unit="B",
                        unit_scale=True,
                        leave=True,
                    )
                    with progress:

                        def callback(data):
                            "Update the progress bar and write to output"
                            progress.update(len(data))
                            output_file.write(data)

                        ftp.retrbinary(command, callback, blocksize=self.chunk_size)
                else:
                    ftp.retrbinary(
                        command, output_file.write, blocksize=self.chunk_size
                    )
            finally:
                # Only close files that were opened here; file objects given
                # by the caller are left open.
                if ispath:
                    output_file.close()
        return None
jpayne@68	393
jpayne@68	394
class SFTPDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files over SFTP.

    When called, downloads the given file URL into the specified local file.
    Requires `paramiko <https://github.com/paramiko/paramiko>`__ to be
    installed.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to customize
    the download of files (for example, to use authentication or print a
    progress bar).

    Parameters
    ----------
    port : int
        Port used for the SFTP connection.
    username : str
        User name used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP).
    password : str
        Password used to login to the server. Only needed if the server
        requires authentication (i.e., no anonymous SFTP). Use the empty
        string to indicate no password is required.
    account : str
        Stored on the instance but not used when connecting.
    timeout : int
        Timeout in seconds for sftp socket operations, use None to mean no
        timeout.
    progressbar : bool
        If True (or any other truthy value), will print a progress bar of the
        download to standard error (stderr). Requires
        `tqdm <https://github.com/tqdm/tqdm>`__ to be installed.

    Raises
    ------
    ValueError
        If ``paramiko`` is not installed, or if a progress bar is requested
        and ``tqdm`` is not installed.

    """

    def __init__(
        self,
        port=22,
        username="anonymous",
        password="",
        account="",
        timeout=None,
        progressbar=False,
    ):
        self.port = port
        self.username = username
        self.password = password
        self.account = account
        self.timeout = timeout
        self.progressbar = progressbar
        # Collect errors and raise only once so that both missing packages are
        # captured. Otherwise, the user is only warned of one of them at a
        # time (and we can't test properly when they are both missing).
        errors = []
        if self.progressbar and tqdm is None:
            errors.append("Missing package 'tqdm' required for progress bars.")
        if paramiko is None:
            errors.append("Missing package 'paramiko' required for SFTP downloads.")
        if errors:
            raise ValueError(" ".join(errors))

    def __call__(self, url, output_file, pooch):
        """
        Download the given URL over SFTP to the given output file.

        The output file must be given as a string (file name/path) and not an
        open file object! Otherwise, paramiko cannot save to that file.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str
            Path (and file name) to which the file will be downloaded. **Cannot
            be a file object**.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.
        """
        parsed_url = parse_url(url)
        connection = paramiko.Transport(sock=(parsed_url["netloc"], self.port))
        sftp = None
        try:
            connection.connect(username=self.username, password=self.password)
            sftp = paramiko.SFTPClient.from_transport(connection)
            # settimeout is a method: it has to be called for the timeout to
            # take effect (assigning to it would only shadow the method).
            sftp.get_channel().settimeout(self.timeout)
            if self.progressbar:
                size = int(sftp.stat(parsed_url["path"]).st_size)
                use_ascii = bool(sys.platform == "win32")
                progress = tqdm(
                    total=size,
                    ncols=79,
                    ascii=use_ascii,
                    unit="B",
                    unit_scale=True,
                    leave=True,
                )
                with progress:

                    def callback(current, total):
                        "Update the progress bar with the bytes transferred"
                        progress.total = int(total)
                        # paramiko reports the cumulative number of bytes, so
                        # advance the bar only by the difference.
                        progress.update(int(current - progress.n))

                    sftp.get(parsed_url["path"], output_file, callback=callback)
            else:
                sftp.get(parsed_url["path"], output_file)
        finally:
            # Close the SFTP session before the transport it runs on, and make
            # sure the transport is closed even if closing the session fails.
            try:
                if sftp is not None:
                    sftp.close()
            finally:
                connection.close()
jpayne@68	505
class DOIDownloader:  # pylint: disable=too-few-public-methods
    """
    Download manager for fetching files from Digital Object Identifiers (DOIs).

    Open-access data repositories often issue Digital Object Identifiers (DOIs)
    for data which provide a stable link and citation point. The trick is
    finding out the download URL for a file given the DOI.

    When called, this downloader uses the repository's public API to find out
    the download URL from the DOI and file name. It then uses
    :class:`pooch.HTTPDownloader` to download the URL into the specified local
    file. This allows "URL"s to be specified with the DOI instead of the
    actual HTTP download link. Uses the :mod:`requests` library to manage
    downloads and interact with the APIs.

    The format of the "URL" is: ``doi:{DOI}/{file name}``.

    Notice that there are no ``//`` like in HTTP/FTP and you must specify a
    file name after the DOI (separated by a ``/``).

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to be able to
    download files given the DOI instead of an HTTP link.

    Supported repositories:

    * `figshare <https://www.figshare.com>`__
    * `Zenodo <https://www.zenodo.org>`__
    * `Dataverse <https://dataverse.org/>`__ instances

    .. attention::

        DOIs from other repositories will not work since we need to access
        their particular APIs to find the download links. We welcome
        suggestions and contributions adding new repositories.

    Parameters
    ----------
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.
    chunk_size : int
        Files are streamed chunk_size bytes at a time instead of loading
        everything into memory at once. Usually doesn't need to be changed.
    **kwargs
        All keyword arguments given when creating an instance of this class
        will be passed to :func:`requests.get`.

    Examples
    --------

    Download one of the data files from the figshare archive of Pooch test
    data:

    >>> import os
    >>> downloader = DOIDownloader()
    >>> url = "doi:10.6084/m9.figshare.14763051.v1/tiny-data.txt"
    >>> # Not using with Pooch.fetch so no need to pass an instance of Pooch
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    Same thing but for our Zenodo archive:

    >>> url = "doi:10.5281/zenodo.4924875/tiny-data.txt"
    >>> downloader(url=url, output_file="tiny-data.txt", pooch=None)
    >>> os.path.exists("tiny-data.txt")
    True
    >>> with open("tiny-data.txt") as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1  2  3  4  5  6
    >>> os.remove("tiny-data.txt")

    """

    def __init__(self, progressbar=False, chunk_size=1024, **kwargs):
        # The options are only stored here; they are handed to the
        # HTTPDownloader that is created on every call.
        self.kwargs = kwargs
        self.progressbar = progressbar
        self.chunk_size = chunk_size

    def __call__(self, url, output_file, pooch):
        """
        Download the given DOI URL over HTTP to the given output file.

        Uses the repository's API to determine the actual HTTP download URL
        from the given DOI.

        Uses :func:`requests.get`.

        Parameters
        ----------
        url : str
            The URL to the file you want to download.
        output_file : str or file-like object
            Path (and file name) to which the file will be downloaded.
        pooch : :class:`~pooch.Pooch`
            The instance of :class:`~pooch.Pooch` that is calling this method.

        """

        parsed_url = parse_url(url)
        data_repository = doi_to_repository(parsed_url["netloc"])

        # Resolve the URL
        file_name = parsed_url["path"]
        # Remove the leading slash in the path. startswith (rather than
        # indexing) keeps an empty path from raising IndexError.
        if file_name.startswith("/"):
            file_name = file_name[1:]
        download_url = data_repository.download_url(file_name)

        # Instantiate the downloader object
        downloader = HTTPDownloader(
            progressbar=self.progressbar, chunk_size=self.chunk_size, **self.kwargs
        )
        downloader(download_url, output_file, pooch)
jpayne@68	628
jpayne@68	629
def doi_to_url(doi):
    """
    Resolve a DOI to the URL of the archive it points to.

    Requests ``https://doi.org/{doi}`` and reports the URL of the final
    response.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    url : str
        The URL of the archive in the data repository.

    Raises
    ------
    ValueError
        If the response has a 4xx or 5xx status code.

    """
    # Lazy import requests to speed up import time
    import requests  # pylint: disable=C0415

    # Use doi.org to resolve the DOI to the repository website.
    response = requests.get(f"https://doi.org/{doi}", timeout=DEFAULT_TIMEOUT)
    url = response.url
    is_error = 400 <= response.status_code < 600
    if not is_error:
        return url
    raise ValueError(
        f"Archive with doi:{doi} not found (see {url}). Is the DOI correct?"
    )
jpayne@68	656
jpayne@68	657
def doi_to_repository(doi):
    """
    Instantiate a data repository instance from a given DOI.

    This function implements the chain of responsibility dispatch
    to the correct data repository class.

    Parameters
    ----------
    doi : str
        The DOI of the archive.

    Returns
    -------
    data_repository : DataRepository
        The data repository object

    Raises
    ------
    ValueError
        If none of the known repository classes can handle the URL that the
        DOI resolves to.
    """

    # This should go away in a separate issue: DOI handling should
    # not rely on the (non-)existence of trailing slashes. The issue
    # is documented in https://github.com/fatiando/pooch/issues/324
    # endswith (rather than indexing) also copes with an empty string.
    if doi.endswith("/"):
        doi = doi[:-1]

    repositories = [
        FigshareRepository,
        ZenodoRepository,
        DataverseRepository,
    ]

    # Extract the DOI and the repository information
    archive_url = doi_to_url(doi)

    # Try the converters one by one and stop at the first one that accepts
    # the archive URL.
    data_repository = None
    for repo in repositories:
        data_repository = repo.initialize(
            archive_url=archive_url,
            doi=doi,
        )
        if data_repository is not None:
            break

    if data_repository is None:
        repository = parse_url(archive_url)["netloc"]
        raise ValueError(
            f"Invalid data repository '{repository}'. "
            "To request or contribute support for this repository, "
            "please open an issue at https://github.com/fatiando/pooch/issues"
        )

    return data_repository
jpayne@68	709
jpayne@68	710
class DataRepository:  # pylint: disable=too-few-public-methods, missing-class-docstring
    # Base class for the DOI data repositories. It defines the methods that
    # the concrete repository classes override.

    @classmethod
    def initialize(cls, doi, archive_url):  # pylint: disable=unused-argument
        """
        Create the repository object if this class can handle the given URL.

        This is one link in a chain of responsibility: a class that cannot
        handle the repository URL returns `None`, otherwise it returns a
        `DataRepository` instance. This base implementation always returns
        `None`.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """

        return None  # pragma: no cover

    def download_url(self, file_name):
        """
        Get the download URL for a file of the archive from the repository
        API.

        Not implemented in this base class.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.
        """

        raise NotImplementedError  # pragma: no cover

    def populate_registry(self, pooch):
        """
        Fill in the registry of a Pooch using the data repository's API.

        Not implemented in this base class.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """

        raise NotImplementedError  # pragma: no cover
jpayne@68	762
jpayne@68	763
class ZenodoRepository(DataRepository):
    """
    Data repository for archives whose resolved DOI URL lives on zenodo.org.

    File listings are read from the records API at ``base_api_url`` and are
    cached on the instance after the first request.
    """

    base_api_url = "https://zenodo.org/api/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Both values are filled lazily on first access of the properties
        self._api_response = None
        self._api_version = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        ZenodoRepository or None
            ``None`` when the host of ``archive_url`` is not ``zenodo.org``.
        """
        # Check whether this is a Zenodo URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "zenodo.org":
            return None

        return cls(doi, archive_url)

    @property
    def api_response(self):
        """
        Cached API response from Zenodo

        The record id is taken from the last path segment of the archive URL.

        Raises
        ------
        requests.HTTPError
            If Zenodo answers the API request with a 4xx or 5xx status.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            article_id = self.archive_url.split("/")[-1]
            response = requests.get(
                f"{self.base_api_url}/{article_id}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Fail here instead of caching an error payload that would only
            # surface later as a confusing KeyError on "files".
            response.raise_for_status()
            self._api_response = response.json()

        return self._api_response

    @property
    def api_version(self):
        """
        Version of the Zenodo API we are interacting with

        The versions can either be :

        - ``"legacy"``: corresponds to the Zenodo API that was supported until
          2023-10-12 (before the migration to InvenioRDM).
        - ``"new"``: corresponds to the new API that went online on 2023-10-13
          after the migration to InvenioRDM.

        The ``"new"`` API breaks backward compatibility with the ``"legacy"``
        one and could probably be replaced by an updated version that restores
        the behaviour of the ``"legacy"`` one.

        The version is detected from the keys of the file entries in the API
        response: every entry having ``"key"`` means legacy, every entry
        having ``"filename"`` means new.

        Returns
        -------
        str

        Raises
        ------
        ValueError
            If the file entries match neither layout.
        """
        if self._api_version is None:
            files = self.api_response["files"]
            if all("key" in file for file in files):
                self._api_version = "legacy"
            elif all("filename" in file for file in files):
                self._api_version = "new"
            else:
                raise ValueError(
                    "Couldn't determine the version of the Zenodo API for "
                    f"{self.archive_url} (doi:{self.doi})."
                )
        return self._api_version

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the archive.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        link to the desired files that appears in the API response leads to 404
        errors (by 2023-10-17). The files are available in the following url:
        ``https://zenodo.org/records/{article_id}/files/{file_name}?download=1``.

        This method supports both the legacy and the new API.
        """
        legacy = self.api_version == "legacy"
        # The file name is stored under a different key in each API version
        name_key = "key" if legacy else "filename"
        files = {item[name_key]: item for item in self.api_response["files"]}
        # Check if file exists in the repository
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Build download url
        if legacy:
            return files[file_name]["links"]["self"]
        article_id = self.api_response["id"]
        return f"https://zenodo.org/records/{article_id}/files/{file_name}?download=1"

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        After Zenodo migrated to InvenioRDM on Oct 2023, their API changed. The
        checksums for each file listed in the API reference is now an md5 sum.

        This method supports both the legacy and the new API.
        """
        legacy = self.api_version == "legacy"
        name_key = "key" if legacy else "filename"
        for filedata in self.api_response["files"]:
            checksum = filedata["checksum"]
            if not legacy:
                # Only the new API needs the algorithm prefix added here;
                # legacy checksums are stored as returned.
                checksum = f"md5:{checksum}"
            pooch.registry[filedata[name_key]] = checksum
jpayne@68	915
jpayne@68	916
class FigshareRepository(DataRepository):
    """
    Data repository for archives whose resolved DOI URL lives on figshare.com.

    The list of files is read from the Figshare v2 API and cached on the
    instance after the first request.
    """

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Filled lazily on first access of ``api_response``
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        FigshareRepository or None
            ``None`` when the host of ``archive_url`` is not ``figshare.com``.
        """
        # Check whether this is a Figshare URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "figshare.com":
            return None

        return cls(doi, archive_url)

    def _parse_version_from_doi(self):
        """
        Parse version from the doi

        The version is expected as a trailing ``.v<number>`` part of the DOI
        suffix. Return None if version is not available in the doi, which
        includes a last part that starts with ``v`` but is not a number.
        """
        # Only the first slash separates prefix from suffix; the suffix itself
        # may contain more slashes, so don't unpack a plain split("/").
        _, _, suffix = self.doi.partition("/")
        # Split the suffix by dots and keep the last part
        last_part = suffix.split(".")[-1]
        # startswith() is also safe when the last part is empty
        if not last_part.startswith("v"):
            return None
        try:
            return int(last_part[1:])
        except ValueError:
            # e.g. "...something.video": starts with "v" but isn't a version
            return None

    @property
    def api_response(self):
        """
        Cached API response from Figshare

        Holds the list of file entries of the article, for the version given
        in the DOI or for the latest version when the DOI carries none.

        Raises
        ------
        requests.HTTPError
            If any of the API requests is answered with a 4xx or 5xx status.
        ValueError
            If the Figshare API lists no article for the DOI.
        """
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # Use the figshare API to find the article ID from the DOI
            search = requests.get(
                f"https://api.figshare.com/v2/articles?doi={self.doi}",
                timeout=DEFAULT_TIMEOUT,
            )
            # Check the status here too, like for the request further below
            search.raise_for_status()
            articles = search.json()
            if not articles:
                raise ValueError(
                    f"No Figshare article found for doi:{self.doi} "
                    f"({self.archive_url})."
                )
            article_id = articles[0]["id"]
            # Parse desired version from the doi
            version = self._parse_version_from_doi()
            # With the ID and version, we can get a list of files and their
            # download links
            if version is None:
                # Figshare returns the latest version available when no version
                # is specified through the DOI.
                warnings.warn(
                    f"The Figshare DOI '{self.doi}' doesn't specify which version of "
                    "the repository should be used. "
                    "Figshare will point to the latest version available.",
                    UserWarning,
                )
                # Define API url using only the article id
                # (figshare will resolve the latest version)
                api_url = f"https://api.figshare.com/v2/articles/{article_id}"
            else:
                # Define API url using article id and the desired version
                api_url = (
                    "https://api.figshare.com/v2/articles/"
                    f"{article_id}/versions/{version}"
                )
            # Make the request and return the files in the figshare repository
            response = requests.get(api_url, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            self._api_response = response.json()["files"]

        return self._api_response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the archive.
        """
        files = {item["name"]: item for item in self.api_response}
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        return files[file_name]["download_url"]

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Every file is registered with the md5 checksum computed by Figshare.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        for filedata in self.api_response:
            pooch.registry[filedata["name"]] = f"md5:{filedata['computed_md5']}"
jpayne@68	1043
jpayne@68	1044
class DataverseRepository(DataRepository):
    """
    Data repository for archives hosted on a DataVerse instance.

    A DataVerse instance is recognized by querying the dataset API on the
    host of the archive URL rather than by a fixed host name.
    """

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Either set through the ``api_response`` setter or fetched lazily
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Build the repository object if the URL answers like a DataVerse host.

        This is one link of a chain of responsibility: the dataset API is
        queried on the host of ``archive_url`` and, when the answer carries
        an error status (4xx or 5xx), ``None`` is returned so that another
        repository class can be tried. Otherwise a ``DataRepository``
        instance that already holds the response is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI
        """
        # Probe the host as if it was a DataVerse instance
        probe = cls._get_api_response(doi, archive_url)

        # An error status most likely means this is not a DataVerse instance
        if 400 <= probe.status_code < 600:
            return None

        # Reuse the response of the probe so it isn't requested twice
        instance = cls(doi, archive_url)
        instance.api_response = probe
        return instance

    @classmethod
    def _get_api_response(cls, doi, archive_url):
        """
        Perform the actual API request

        Kept as its own ``classmethod`` because it is needed both before an
        instance exists and afterwards.
        """
        # Lazy import requests to speed up import time
        import requests  # pylint: disable=C0415

        url_parts = parse_url(archive_url)
        endpoint = (
            f"{url_parts['protocol']}://{url_parts['netloc']}/api/datasets/"
            f":persistentId?persistentId=doi:{doi}"
        )
        return requests.get(endpoint, timeout=DEFAULT_TIMEOUT)

    @property
    def api_response(self):
        """Cached API response from a DataVerse instance"""
        if self._api_response is None:
            self._api_response = self._get_api_response(
                self.doi, self.archive_url
            )  # pragma: no cover

        return self._api_response

    @api_response.setter
    def api_response(self, response):
        """Update the cached API response"""
        self._api_response = response

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        The URL is built from the host of the archive URL and the id of the
        file as listed in the latest version of the dataset.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If ``file_name`` is not listed in the archive.
        """
        url_parts = parse_url(self.archive_url)
        dataset = self.api_response.json()
        # Index the file descriptions by file name
        data_files = {}
        for entry in dataset["data"]["latestVersion"]["files"]:
            data_file = entry["dataFile"]
            data_files[data_file["filename"]] = data_file
        if file_name not in data_files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # Generate download_url using the file id
        file_id = data_files[file_name]["id"]
        return (
            f"{url_parts['protocol']}://{url_parts['netloc']}/api/access/datafile/"
            f"{file_id}"
        )

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Every file of the latest version of the dataset is registered with
        its md5 checksum.

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.
        """
        entries = self.api_response.json()["data"]["latestVersion"]["files"]
        for entry in entries:
            data_file = entry["dataFile"]
            pooch.registry[data_file["filename"]] = f"md5:{data_file['md5']}"

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/downloaders.py @ 68:5028fdace37b