comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/hashes.py @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 # Copyright (c) 2018 The Pooch Developers.
2 # Distributed under the terms of the BSD 3-Clause License.
3 # SPDX-License-Identifier: BSD-3-Clause
4 #
5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
6 #
7 """
8 Calculating and checking file hashes.
9 """
10 import hashlib
11 import functools
12 from pathlib import Path
13
# Map every algorithm name known to hashlib to a zero-argument-callable
# constructor. The hashlib docs
# (https://docs.python.org/3/library/hashlib.html#hashlib.new) note that the
# named constructors are much faster than new() and should be preferred, so
# new() is only used as a fallback for algorithms without a named constructor.
ALGORITHMS_AVAILABLE = dict(
    (name, getattr(hashlib, name, functools.partial(hashlib.new, name)))
    for name in hashlib.algorithms_available
)

try:
    import xxhash

    # xxhash doesn't expose a list of available algorithms yet
    # (https://github.com/ifduyue/python-xxhash/issues/48), so probe for each
    # known name. The xxh3 algorithms are only available for version>=2.0:
    # names missing from the installed version are left out to ensure
    # backwards compatibility.
    ALGORITHMS_AVAILABLE.update(
        {
            name: getattr(xxhash, name)
            for name in ("xxh128", "xxh64", "xxh32", "xxh3_128", "xxh3_64")
            if getattr(xxhash, name, None) is not None
        }
    )
except ImportError:
    # xxhash is optional: without it only the hashlib algorithms are offered.
    pass
41
42
def file_hash(fname, alg="sha256"):
    """
    Compute the hexadecimal digest of the contents of a file.

    Handy for detecting whether a file has been modified or corrupted.

    Parameters
    ----------
    fname : str
        The name of the file.
    alg : str
        Name of the hashing algorithm. Must be one of the keys of
        ``ALGORITHMS_AVAILABLE``.

    Returns
    -------
    hash : str
        The hash of the file.

    Raises
    ------
    ValueError
        If *alg* is not one of the available algorithms.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    if alg not in ALGORITHMS_AVAILABLE:
        message = (
            f"Algorithm '{alg}' not available to the pooch library. "
            "Only the following algorithms are available "
            f"{list(ALGORITHMS_AVAILABLE.keys())}."
        )
        raise ValueError(message)
    digest = ALGORITHMS_AVAILABLE[alg]()
    # Feed the file to the hasher piece by piece so that large files never
    # have to fit in memory all at once.
    block_size = 65536
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
88
89
def hash_algorithm(hash_string):
    """
    Extract the name of the hashing algorithm from a hash string.

    Hash strings are expected to look like ``algorithm:hash``, where
    algorithm can be the name of any algorithm known to :mod:`hashlib`.

    When there is no ``algorithm:`` prefix, or when the hash string is None,
    the result is ``"sha256"``.

    Parameters
    ----------
    hash_string : str
        The hash string with optional algorithm prepended.

    Returns
    -------
    hash_algorithm : str
        The name of the algorithm, always in lowercase.

    Examples
    --------

    >>> print(hash_algorithm("qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("md5:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    md5
    >>> print(hash_algorithm("sha256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("SHA256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("xxh3_64:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    xxh3_64
    >>> print(hash_algorithm(None))
    sha256

    """
    if hash_string is not None and ":" in hash_string:
        # Everything before the first colon names the algorithm.
        prefix, _ = hash_string.split(":", 1)
        return prefix.lower()
    return "sha256"
135
136
def hash_matches(fname, known_hash, strict=False, source=None):
    """
    Compare the hash of a file against a known hash.

    A *known_hash* of None always counts as a match.

    Both hashes are converted to lowercase before the comparison to avoid
    system specific mismatches between hashes in the registry and computed
    hashes.

    Parameters
    ----------
    fname : str or PathLike
        The path to the file.
    known_hash : str
        The known hash. Optionally, prepend ``alg:`` to the hash to specify the
        hashing algorithm. Default is SHA256.
    strict : bool
        If True, will raise a :class:`ValueError` if the hash does not match
        informing the user that the file may be corrupted.
    source : str
        The source of the downloaded file (name or URL, for example). Only
        used in the error message raised when *strict* is True, to tell the
        user where the file came from. If None, will default to *fname*.

    Returns
    -------
    is_same : bool
        True if the hash matches, False otherwise.

    Raises
    ------
    ValueError
        If *strict* is True and the hashes do not match.

    """
    if known_hash is None:
        return True
    algorithm = hash_algorithm(known_hash)
    new_hash = file_hash(fname, alg=algorithm)
    # The digest is whatever follows the last colon (the whole string when no
    # algorithm prefix is present).
    expected_digest = known_hash.split(":")[-1]
    matches = new_hash.lower() == expected_digest.lower()
    if matches or not strict:
        return matches
    source = str(fname) if source is None else source
    raise ValueError(
        f"{algorithm.upper()} hash of downloaded file ({source}) does not match"
        f" the known hash: expected {known_hash} but got {new_hash}. Deleted"
        " download for safety. The downloaded file may have been corrupted or"
        " the known hash may be outdated."
    )
183
184
def make_registry(directory, output, recursive=True):
    """
    Write a registry file listing the files in a directory and their hashes.

    Saves you from updating the registry by hand when your test dataset
    contains many files.

    Parameters
    ----------
    directory : str
        Directory of the test data to put in the registry. All file names in
        the registry will be relative to this directory.
    output : str
        Name of the output registry file.
    recursive : bool
        If True, will recursively look for files in subdirectories of
        *directory*.

    """
    root = Path(directory)
    pattern = "**/*" if recursive else "*"

    relative_names = [
        str(entry.relative_to(root))
        for entry in root.glob(pattern)
        if entry.is_file()
    ]
    relative_names.sort()

    # Hash everything before the output file is opened for writing.
    digests = [file_hash(str(root / name)) for name in relative_names]

    with open(output, "w", encoding="utf-8") as registry:
        for name, digest in zip(relative_names, digests):
            # Registry entries always use Unix separators, regardless of the
            # platform, to keep file paths simple to deal with.
            unix_name = name.replace("\\", "/")
            registry.write("{} {}\n".format(unix_name, digest))