annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/hashes.py @ 68:5028fdace37b

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 16:23:26 -0400
parents
children
rev   line source
jpayne@68 1 # Copyright (c) 2018 The Pooch Developers.
jpayne@68 2 # Distributed under the terms of the BSD 3-Clause License.
jpayne@68 3 # SPDX-License-Identifier: BSD-3-Clause
jpayne@68 4 #
jpayne@68 5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
jpayne@68 6 #
jpayne@68 7 """
jpayne@68 8 Calculating and checking file hashes.
jpayne@68 9 """
jpayne@68 10 import hashlib
jpayne@68 11 import functools
jpayne@68 12 from pathlib import Path
jpayne@68 13
# The hashlib documentation
# (https://docs.python.org/3/library/hashlib.html#hashlib.new) notes that the
# named constructors are much faster than new() and should be preferred.
# Look each algorithm up by name on the module and only fall back on a
# pre-bound new() when hashlib exposes no attribute with that name.
ALGORITHMS_AVAILABLE = dict(
    (name, getattr(hashlib, name, functools.partial(hashlib.new, name)))
    for name in hashlib.algorithms_available
)

try:
    import xxhash
except ImportError:
    # xxhash is optional: without it only the hashlib algorithms are offered.
    pass
else:
    # xxhash doesn't publish a list of available algorithms yet
    # (https://github.com/ifduyue/python-xxhash/issues/48), so probe a fixed
    # set of names and record None for the ones that are missing.
    ALGORITHMS_AVAILABLE.update(
        (name, getattr(xxhash, name, None))
        for name in ("xxh128", "xxh64", "xxh32", "xxh3_128", "xxh3_64")
    )
    # The xxh3 algorithms only exist for xxhash version>=2.0. Drop every entry
    # that resolved to None so older versions keep working.
    ALGORITHMS_AVAILABLE = dict(
        (name, constructor)
        for name, constructor in ALGORITHMS_AVAILABLE.items()
        if constructor is not None
    )
jpayne@68 41
jpayne@68 42
def file_hash(fname, alg="sha256"):
    """
    Compute the hexadecimal digest of the contents of a file.

    Handy for detecting whether a file was modified or got corrupted.

    Parameters
    ----------
    fname : str
        The name of the file.
    alg : str
        Name of the hashing algorithm. Must be one of the keys of
        ``ALGORITHMS_AVAILABLE``.

    Returns
    -------
    hash : str
        The hash of the file.

    Raises
    ------
    ValueError
        If *alg* is not one of the available algorithms.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    if alg not in ALGORITHMS_AVAILABLE:
        message = (
            f"Algorithm '{alg}' not available to the pooch library. "
            "Only the following algorithms are available "
            f"{list(ALGORITHMS_AVAILABLE.keys())}."
        )
        raise ValueError(message)
    # Feed the file to the hasher piece by piece so that large files never
    # have to be held in memory all at once.
    block_size = 65536
    digest = ALGORITHMS_AVAILABLE[alg]()
    with open(fname, "rb") as stream:
        # The two-argument form of iter() keeps calling read() until it
        # returns the empty bytes sentinel, i.e. until the end of the file.
        for block in iter(functools.partial(stream.read, block_size), b""):
            digest.update(block)
    return digest.hexdigest()
jpayne@68 88
jpayne@68 89
def hash_algorithm(hash_string):
    """
    Extract the name of the hashing algorithm from a hash string.

    Hash strings are expected in the form ``algorithm:hash``, where algorithm
    can be the name of any algorithm known to :mod:`hashlib`.

    When there is no ``algorithm:`` prefix, or when the hash string is None,
    ``"sha256"`` is assumed.

    Parameters
    ----------
    hash_string : str
        The hash string with optional algorithm prepended.

    Returns
    -------
    hash_algorithm : str
        The name of the algorithm, in lowercase.

    Examples
    --------

    >>> print(hash_algorithm("qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("md5:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    md5
    >>> print(hash_algorithm("sha256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("SHA256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("xxh3_64:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    xxh3_64
    >>> print(hash_algorithm(None))
    sha256

    """
    # No hash at all, or a bare hash without a prefix: use the default.
    if hash_string is None or ":" not in hash_string:
        return "sha256"
    # Everything before the first colon names the algorithm.
    prefix, _, _ = hash_string.partition(":")
    return prefix.lower()
jpayne@68 135
jpayne@68 136
def hash_matches(fname, known_hash, strict=False, source=None):
    """
    Compare the hash of a file against a known hash.

    A *known_hash* of None always counts as a match and the file is not read.

    Both hashes are converted to lowercase before the comparison to avoid
    system specific mismatches between hashes in the registry and computed
    hashes.

    Parameters
    ----------
    fname : str or PathLike
        The path to the file.
    known_hash : str
        The known hash. Optionally, prepend ``alg:`` to the hash to specify the
        hashing algorithm. Default is SHA256.
    strict : bool
        If True, will raise a :class:`ValueError` if the hash does not match
        informing the user that the file may be corrupted.
    source : str
        The source of the downloaded file (name or URL, for example). It is
        only used in the error message raised when *strict* is True, to tell
        the user where the file came from. If None, will default to *fname*.

    Returns
    -------
    is_same : bool
        True if the hash matches, False otherwise.

    Raises
    ------
    ValueError
        If the hashes differ and *strict* is True.

    """
    if known_hash is None:
        return True
    algorithm = hash_algorithm(known_hash)
    computed = file_hash(fname, alg=algorithm)
    # Whatever follows the last colon is the hash itself.
    expected = known_hash.rpartition(":")[2]
    if computed.lower() == expected.lower():
        return True
    if not strict:
        return False
    origin = str(fname) if source is None else source
    raise ValueError(
        f"{algorithm.upper()} hash of downloaded file ({origin}) does not match"
        f" the known hash: expected {known_hash} but got {computed}. Deleted"
        " download for safety. The downloaded file may have been corrupted or"
        " the known hash may be outdated."
    )
jpayne@68 183
jpayne@68 184
def make_registry(directory, output, recursive=True):
    """
    Write a registry file listing the files in a directory and their hashes.

    Saves you from updating the registry by hand when the test dataset
    contains many files. Each line of the registry holds a file name and the
    hash of that file, separated by a single space.

    Parameters
    ----------
    directory : str
        Directory of the test data to put in the registry. All file names in
        the registry will be relative to this directory.
    output : str
        Name of the output registry file.
    recursive : bool
        If True, will recursively look for files in subdirectories of
        *directory*.

    """
    root = Path(directory)
    glob_pattern = "**/*" if recursive else "*"

    # Collect the relative names of regular files only, in sorted order.
    relative_names = [
        str(entry.relative_to(root))
        for entry in root.glob(glob_pattern)
        if entry.is_file()
    ]
    relative_names.sort()

    # Every file is hashed before the output file is opened for writing.
    digests = [file_hash(str(root / name)) for name in relative_names]

    with open(output, "w", encoding="utf-8") as registry:
        for name, digest in zip(relative_names, digests):
            # Only use Unix separators for the registry so that we don't go
            # insane dealing with file paths.
            unix_name = name.replace("\\", "/")
            registry.write("{} {}\n".format(unix_name, digest))