annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/hashes.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 # Copyright (c) 2018 The Pooch Developers.
jpayne@69 2 # Distributed under the terms of the BSD 3-Clause License.
jpayne@69 3 # SPDX-License-Identifier: BSD-3-Clause
jpayne@69 4 #
jpayne@69 5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
jpayne@69 6 #
jpayne@69 7 """
jpayne@69 8 Calculating and checking file hashes.
jpayne@69 9 """
jpayne@69 10 import hashlib
jpayne@69 11 import functools
jpayne@69 12 from pathlib import Path
jpayne@69 13
# Map every algorithm name to a zero-argument-callable hasher constructor.
# The hashlib docs (https://docs.python.org/3/library/hashlib.html#hashlib.new)
# state that the named constructors are much faster than new() and should be
# preferred, so new() is only used for names without a dedicated constructor.
ALGORITHMS_AVAILABLE = {
    name: getattr(hashlib, name, functools.partial(hashlib.new, name))
    for name in hashlib.algorithms_available
}

try:
    import xxhash
except ImportError:
    # xxhash is an optional extra; only the hashlib algorithms are offered.
    pass
else:
    # xxhash doesn't have a list of available algorithms yet
    # (https://github.com/ifduyue/python-xxhash/issues/48), so probe a fixed
    # set of names. The xxh3 algorithms are only available for version>=2.0;
    # names that resolve to None are left out to ensure backwards
    # compatibility.
    ALGORITHMS_AVAILABLE.update(
        {
            name: constructor
            for name, constructor in (
                (name, getattr(xxhash, name, None))
                for name in ("xxh128", "xxh64", "xxh32", "xxh3_128", "xxh3_64")
            )
            if constructor is not None
        }
    )
jpayne@69 41
jpayne@69 42
def file_hash(fname, alg="sha256"):
    """
    Compute the hexadecimal digest of the contents of a file.

    Handy for detecting whether a file has changed or been corrupted.

    Parameters
    ----------
    fname : str
        The name of the file.
    alg : str
        The name of the hashing algorithm. Must be one of the keys of
        ``ALGORITHMS_AVAILABLE``.

    Returns
    -------
    hash : str
        The hash of the file.

    Raises
    ------
    ValueError
        If *alg* is not one of the available algorithms.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    constructor = ALGORITHMS_AVAILABLE.get(alg)
    if constructor is None:
        raise ValueError(
            f"Algorithm '{alg}' not available to the pooch library. "
            "Only the following algorithms are available "
            f"{list(ALGORITHMS_AVAILABLE.keys())}."
        )
    digest = constructor()
    # Feed the file to the hasher block by block so that large files never
    # have to fit in memory; read() returns b"" at end of file, which stops
    # the iteration.
    block_size = 65536
    with open(fname, "rb") as stream:
        for block in iter(lambda: stream.read(block_size), b""):
            digest.update(block)
    return digest.hexdigest()
jpayne@69 88
jpayne@69 89
def hash_algorithm(hash_string):
    """
    Extract the algorithm name from a hash string.

    A hash string looks like ``algorithm:hash``. Everything before the first
    colon is taken as the algorithm name and returned in lowercase.

    When there is no colon in the string, or the hash string is None, the
    result is ``"sha256"``.

    Parameters
    ----------
    hash_string : str
        The hash string with optional algorithm prepended.

    Returns
    -------
    hash_algorithm : str
        The name of the algorithm.

    Examples
    --------

    >>> print(hash_algorithm("qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("md5:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    md5
    >>> print(hash_algorithm("sha256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("SHA256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("xxh3_64:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    xxh3_64
    >>> print(hash_algorithm(None))
    sha256

    """
    # No string at all, or no "algorithm:" prefix: use the default.
    if hash_string is None or ":" not in hash_string:
        return "sha256"
    prefix, _, _ = hash_string.partition(":")
    return prefix.lower()
jpayne@69 135
jpayne@69 136
def hash_matches(fname, known_hash, strict=False, source=None):
    """
    Compare the hash of a file against a known hash.

    A *known_hash* of None always counts as a match (returns True without
    reading the file).

    Converts both hashes to lowercase before comparison to avoid system
    specific mismatches between hashes in the registry and computed hashes.

    Parameters
    ----------
    fname : str or PathLike
        The path to the file.
    known_hash : str
        The known hash. Optionally, prepend ``alg:`` to the hash to specify the
        hashing algorithm. Default is SHA256.
    strict : bool
        If True, will raise a :class:`ValueError` if the hash does not match
        informing the user that the file may be corrupted.
    source : str
        The source of the downloaded file (name or URL, for example). Only
        used in the error message raised when *strict* is True, to tell the
        user where the file came from. If None, will default to *fname*.

    Returns
    -------
    is_same : bool
        True if the hash matches, False otherwise.

    Raises
    ------
    ValueError
        If *strict* is True and the hashes differ.

    """
    if known_hash is None:
        return True

    algorithm = hash_algorithm(known_hash)
    actual = file_hash(fname, alg=algorithm)
    # The expected digest is whatever follows the last colon (the whole
    # string when no algorithm prefix is present).
    expected = known_hash.rpartition(":")[2]

    if actual.lower() == expected.lower():
        return True
    if not strict:
        return False

    origin = str(fname) if source is None else source
    raise ValueError(
        f"{algorithm.upper()} hash of downloaded file ({origin}) does not match"
        f" the known hash: expected {known_hash} but got {actual}. Deleted"
        " download for safety. The downloaded file may have been corrupted or"
        " the known hash may be outdated."
    )
jpayne@69 183
jpayne@69 184
def make_registry(directory, output, recursive=True):
    """
    Write a registry file listing the files in a directory and their hashes.

    Saves you from updating the registry by hand when the test dataset
    contains many files. Each line of the output holds a file name and its
    hash separated by a single space, with entries sorted by file name.

    Parameters
    ----------
    directory : str
        Directory of the test data to put in the registry. All file names in
        the registry will be relative to this directory.
    output : str
        Name of the output registry file.
    recursive : bool
        If True, will recursively look for files in subdirectories of
        *directory*.

    """
    root = Path(directory)
    pattern = "**/*" if recursive else "*"

    relative_names = [
        str(entry.relative_to(root))
        for entry in root.glob(pattern)
        if entry.is_file()
    ]
    relative_names.sort()

    # Every file is hashed before the output file is opened for writing.
    digests = [file_hash(str(root / name)) for name in relative_names]

    registry_lines = []
    for name, digest in zip(relative_names, digests):
        # Only use Unix separators for the registry so that we don't go
        # insane dealing with file paths.
        unix_name = name.replace("\\", "/")
        registry_lines.append(f"{unix_name} {digest}\n")

    with open(output, "w", encoding="utf-8") as outfile:
        outfile.writelines(registry_lines)