# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Misc utilities
"""
import logging
import os
import tempfile
import hashlib
from pathlib import Path
from urllib.parse import urlsplit
from contextlib import contextmanager
import warnings

import platformdirs
from packaging.version import Version


# NOTE: the logger is instantiated directly instead of through
# ``logging.getLogger``, so it is not registered in the global logger
# hierarchy and relies only on the handler attached below. Switching to
# ``getLogger`` would change where Pooch messages end up, so keep this as is.
LOGGER = logging.Logger("pooch")
LOGGER.addHandler(logging.StreamHandler())


def file_hash(*args, **kwargs):
    """
    WARNING: Importing this function from pooch.utils is DEPRECATED.
    Please import from the top-level namespace (`from pooch import file_hash`)
    instead, which is fully backwards compatible with pooch >= 0.1.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    # Imported here (not at module level) so the deprecated wrapper only pulls
    # in the real implementation when it is actually called.
    # pylint: disable=import-outside-toplevel
    from .hashes import file_hash as new_file_hash

    message = """
    Importing file_hash from pooch.utils is DEPRECATED. Please import from the
    top-level namespace (`from pooch import file_hash`) instead, which is fully
    backwards compatible with pooch >= 0.1.
    """
    # stacklevel=2 makes the warning point at the caller, not at this wrapper.
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return new_file_hash(*args, **kwargs)


def get_logger():
    r"""
    Get the default event logger.

    The logger records events like downloading files, unzipping archives, etc.
    Use the method :meth:`logging.Logger.setLevel` of this object to adjust the
    verbosity level from Pooch.

    Returns
    -------
    logger : :class:`logging.Logger`
        The logger object for Pooch
    """
    return LOGGER


def os_cache(project):
    r"""
    Default cache location based on the operating system.

    The folder locations are defined by the ``platformdirs`` package
    using the ``user_cache_dir`` function.
    Usually, the locations will be following (see the
    `platformdirs documentation <https://platformdirs.readthedocs.io>`__):

    * Mac: ``~/Library/Caches/<AppName>``
    * Unix: ``~/.cache/<AppName>`` or the value of the ``XDG_CACHE_HOME``
      environment variable, if defined.
    * Windows: ``C:\Users\<user>\AppData\Local\<AppAuthor>\<AppName>\Cache``

    Parameters
    ----------
    project : str
        The project name.

    Returns
    -------
    cache_path : :class:`pathlib.Path`
        The default location for the data cache. User directories (``'~'``) are
        not expanded.

    """
    return Path(platformdirs.user_cache_dir(project))


def check_version(version, fallback="master"):
    """
    Check if a version is PEP440 compliant and there are no unreleased changes.

    For example, ``version = "0.1"`` will be returned as is but ``version =
    "0.1+10.8dl8dh9"`` will return the fallback. This is the convention used by
    `versioneer <https://github.com/python-versioneer/python-versioneer>`__ to
    mark that this version is 10 commits ahead of the last release.

    Parameters
    ----------
    version : str
        A version string.
    fallback : str
        What to return if the version string has unreleased changes.

    Returns
    -------
    version : str
        If *version* is PEP440 compliant and there are no unreleased changes,
        then return *version*. Otherwise (the version carries a local segment,
        i.e. the part after ``+``), return *fallback*.

    Raises
    ------
    InvalidVersion
        If *version* is not PEP440 compliant.

    Examples
    --------

    >>> check_version("0.1")
    '0.1'
    >>> check_version("0.1a10")
    '0.1a10'
    >>> check_version("0.1+111.9hdg36")
    'master'
    >>> check_version("0.1+111.9hdg36", fallback="dev")
    'dev'

    """
    parse = Version(version)
    # A local version segment ("+...") marks commits on top of a release.
    if parse.local is not None:
        return fallback
    return version


def parse_url(url):
    """
    Parse a URL into 3 components:

    <protocol>://<netloc>/<path>

    Example URLs:

    * http://127.0.0.1:8080/test.nc
    * ftp://127.0.0.1:8080/test.nc
    * doi:10.6084/m9.figshare.923450.v1/test.nc

    The DOI is a special case. The protocol will be "doi", the netloc will be
    the DOI, and the path is what comes after the last "/".
    The only exception are Zenodo dois: the protocol will be "doi", the netloc
    will be composed by the "prefix/suffix" and the path is what comes after
    the second "/". This allows to support special cases of Zenodo dois where
    the path contains forward slashes "/", created by the GitHub-Zenodo
    integration service.

    URLs without a scheme are reported with the protocol "file".

    Parameters
    ----------
    url : str
        The URL.

    Returns
    -------
    parsed_url : dict
        Three components of a URL (e.g.,
        ``{'protocol':'http', 'netloc':'127.0.0.1:8080','path': '/test.nc'}``).

    Raises
    ------
    ValueError
        If the URL starts with ``doi://`` or if a ``doi:`` link does not
        contain at least one "/" separating the DOI prefix from its suffix.

    """
    if url.startswith("doi://"):
        raise ValueError(
            f"Invalid DOI link '{url}'. You must not use '//' after 'doi:'."
        )
    if url.startswith("doi:"):
        protocol = "doi"
        parts = url[4:].split("/")
        # A DOI always has a "prefix/suffix" shape. Without this check the
        # Zenodo test below would fail with an unhelpful IndexError.
        if len(parts) < 2:
            raise ValueError(
                f"Invalid DOI link '{url}'. "
                "Expected the form 'doi:<prefix>/<suffix>/<file name>'."
            )
        if "zenodo" in parts[1].lower():
            # Zenodo: everything after "prefix/suffix" is the path, which may
            # itself contain "/" characters.
            netloc = "/".join(parts[:2])
            path = "/" + "/".join(parts[2:])
        else:
            netloc = "/".join(parts[:-1])
            path = "/" + parts[-1]
    else:
        parsed_url = urlsplit(url)
        protocol = parsed_url.scheme or "file"
        netloc = parsed_url.netloc
        path = parsed_url.path
    return {"protocol": protocol, "netloc": netloc, "path": path}


def cache_location(path, env=None, version=None):
    """
    Location of the cache given a base path and optional configuration.

    Checks for the environment variable to overwrite the path of the local
    cache. Optionally add *version* to the path if given.

    Parameters
    ----------
    path : str, PathLike, list or tuple
        The path to the local data storage folder. If this is a list or tuple,
        we'll join the parts with the appropriate separator. Use
        :func:`pooch.os_cache` for a sensible default.
    version : str or None
        The version string for your project. Will be appended to given path if
        not None.
    env : str or None
        An environment variable that can be used to overwrite *path*. This
        allows users to control where they want the data to be stored. We'll
        append *version* to the end of this value as well.

    Returns
    -------
    local_path : PathLike
        The path to the local directory.

    """
    # An environment variable that is set but empty does not override *path*.
    if env is not None and env in os.environ and os.environ[env]:
        path = os.environ[env]
    if isinstance(path, (list, tuple)):
        path = os.path.join(*path)
    if version is not None:
        path = os.path.join(str(path), version)
    path = os.path.expanduser(str(path))
    return Path(path)


def make_local_storage(path, env=None):
    """
    Create the local cache directory and make sure it's writable.

    Parameters
    ----------
    path : str or PathLike
        The path to the local data storage folder.
    env : str or None
        An environment variable that can be used to overwrite *path*. Only used
        in the error message in case the folder is not writable.

    Raises
    ------
    PermissionError
        If the folder cannot be created or written to. The message includes
        the original error and, when *env* is given, a hint about the
        environment variable that can be used to change the location.
    """
    path = str(path)
    # Check that the data directory is writable
    if not os.path.exists(path):
        action = "create"
    else:
        action = "write to"

    try:
        if action == "create":
            # When running in parallel, it's possible that multiple jobs will
            # try to create the path at the same time. Use exist_ok to avoid
            # raising an error.
            os.makedirs(path, exist_ok=True)
        else:
            # Creating (and immediately discarding) a temporary file is the
            # actual write-permission probe.
            with tempfile.NamedTemporaryFile(dir=path):
                pass
    except PermissionError as error:
        message = [
            str(error),
            f"| Pooch could not {action} data cache folder '{path}'.",
            "Will not be able to download data files.",
        ]
        if env is not None:
            message.append(
                f"Use environment variable '{env}' to specify a different location."
            )
        raise PermissionError(" ".join(message)) from error


@contextmanager
def temporary_file(path=None):
    """
    Create a closed and named temporary file and make sure it's cleaned up.

    Using :class:`tempfile.NamedTemporaryFile` will fail on Windows if trying
    to open the file a second time (when passing its name to Pooch function,
    for example). This context manager creates the file, closes it, yields the
    file path, and makes sure it's deleted in the end.

    Parameters
    ----------
    path : str or PathLike
        The directory in which the temporary file will be created.

    Yields
    ------
    fname : str
        The path to the temporary file.

    """
    tmp = tempfile.NamedTemporaryFile(delete=False, dir=path)
    # Close the temp file so that it can be opened elsewhere
    tmp.close()
    try:
        yield tmp.name
    finally:
        # The file may already have been moved or removed by the caller.
        if os.path.exists(tmp.name):
            os.remove(tmp.name)


def unique_file_name(url):
    """
    Create a unique file name based on the given URL.

    The file name will be unique to the URL by prepending the name with the MD5
    hash (hex digest) of the URL. The name will also include the last portion
    of the URL.

    The format will be: ``{md5}-{filename}.{ext}``

    The file name will be cropped so that the entire name (including the hash)
    is at most 255 characters long (the limit on most file systems).

    Parameters
    ----------
    url : str
        The URL with a file name at the end.

    Returns
    -------
    fname : str
        The file name, unique to this URL.

    Examples
    --------

    >>> print(unique_file_name("https://www.some-server.org/2020/data.txt"))
    02ddee027ce5ebb3d7059fb23d210604-data.txt
    >>> print(unique_file_name("https://www.some-server.org/2019/data.txt"))
    9780092867b497fca6fc87d8308f1025-data.txt
    >>> print(unique_file_name("https://www.some-server.org/2020/data.txt.gz"))
    181a9d52e908219c2076f55145d6a344-data.txt.gz

    """
    # The MD5 digest only serves to build a distinct file name per URL.
    md5 = hashlib.md5(url.encode()).hexdigest()
    fname = parse_url(url)["path"].split("/")[-1]
    # Crop the start of the file name to fit 255 characters including the hash
    # and the "-" separator (the end is kept so the extension survives).
    fname = fname[-(255 - len(md5) - 1) :]
    unique_name = f"{md5}-{fname}"
    return unique_name