Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/lib/python3.8/site-packages/pooch/hashes.py @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 # Copyright (c) 2018 The Pooch Developers. | |
2 # Distributed under the terms of the BSD 3-Clause License. | |
3 # SPDX-License-Identifier: BSD-3-Clause | |
4 # | |
5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org) | |
6 # | |
7 """ | |
8 Calculating and checking file hashes. | |
9 """ | |
10 import hashlib | |
11 import functools | |
12 from pathlib import Path | |
13 | |
# Lookup table from algorithm name to a callable that creates a new hasher.
# From the docs (https://docs.python.org/3/library/hashlib.html#hashlib.new):
# the named constructors are much faster than new() and should be preferred,
# so new() is only the fallback for algorithms without a named constructor.
ALGORITHMS_AVAILABLE = {
    name: getattr(hashlib, name, functools.partial(hashlib.new, name))
    for name in hashlib.algorithms_available
}

try:
    import xxhash

    # xxhash doesn't have a list of available algorithms yet
    # (https://github.com/ifduyue/python-xxhash/issues/48), so a fixed list of
    # names is probed instead. The xxh3 algorithms are only available for
    # version>=2.0; names the installed xxhash does not provide are left out
    # of the table to ensure backwards compatibility.
    ALGORITHMS_AVAILABLE.update(
        (name, constructor)
        for name, constructor in (
            (candidate, getattr(xxhash, candidate, None))
            for candidate in ("xxh128", "xxh64", "xxh32", "xxh3_128", "xxh3_64")
        )
        if constructor is not None
    )
except ImportError:
    # xxhash is not installed: only the hashlib algorithms are offered.
    pass
41 | |
42 | |
def file_hash(fname, alg="sha256"):
    """
    Compute the hexadecimal digest of the contents of a file.

    Handy for detecting whether a file has been modified or corrupted.

    Parameters
    ----------
    fname : str
        The name of the file.
    alg : str
        Name of the hashing algorithm. Must be one of the keys of
        ``ALGORITHMS_AVAILABLE``.

    Returns
    -------
    hash : str
        The hash of the file, as a hexadecimal string.

    Raises
    ------
    ValueError
        If *alg* is not one of the available algorithms.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    if alg not in ALGORITHMS_AVAILABLE:
        raise ValueError(
            f"Algorithm '{alg}' not available to the pooch library. "
            "Only the following algorithms are available "
            f"{list(ALGORITHMS_AVAILABLE.keys())}."
        )
    digest = ALGORITHMS_AVAILABLE[alg]()
    # Feed the file to the hasher in fixed-size pieces so that large files are
    # never loaded into memory in one go.
    block_size = 65536
    with open(fname, "rb") as stream:
        # read() returns an empty bytes object at end of file, which stops
        # the iteration.
        for block in iter(lambda: stream.read(block_size), b""):
            digest.update(block)
    return digest.hexdigest()
88 | |
89 | |
def hash_algorithm(hash_string):
    """
    Extract the name of the hashing algorithm from a hash string.

    A hash string has the form ``algorithm:hash``, where ``algorithm`` is the
    text before the first colon. When there is no colon, or when the hash
    string is None, the algorithm is taken to be ``"sha256"``.

    Parameters
    ----------
    hash_string : str
        The hash string with optional algorithm prepended.

    Returns
    -------
    hash_algorithm : str
        The name of the algorithm, in lowercase.

    Examples
    --------

    >>> print(hash_algorithm("qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("md5:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    md5
    >>> print(hash_algorithm("sha256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("SHA256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("xxh3_64:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    xxh3_64
    >>> print(hash_algorithm(None))
    sha256

    """
    fallback = "sha256"
    if hash_string is None:
        return fallback
    prefix, colon, _ = hash_string.partition(":")
    # An empty separator means the string had no "algorithm:" prefix at all.
    name = prefix if colon else fallback
    return name.lower()
135 | |
136 | |
def hash_matches(fname, known_hash, strict=False, source=None):
    """
    Compare the hash of a file against a known hash.

    When *known_hash* is None the check is skipped and the result is always
    True.

    Both hashes are converted to lowercase before being compared, to avoid
    system specific mismatches between hashes in the registry and computed
    hashes.

    Parameters
    ----------
    fname : str or PathLike
        The path to the file.
    known_hash : str
        The known hash. Optionally, prepend ``alg:`` to the hash to specify the
        hashing algorithm. Default is SHA256.
    strict : bool
        If True, a mismatch raises a :class:`ValueError` telling the user that
        the file may be corrupted, instead of returning False.
    source : str
        The source of the downloaded file (name or URL, for example). Only
        used in the error message raised when *strict* is True, to tell the
        user where the file came from. If None, will default to *fname*.

    Returns
    -------
    is_same : bool
        True if the hash matches, False otherwise.

    Raises
    ------
    ValueError
        If *strict* is True and the hashes do not match.

    """
    if known_hash is None:
        return True
    algorithm = hash_algorithm(known_hash)
    new_hash = file_hash(fname, alg=algorithm)
    # The digest is whatever follows the last colon (the whole string when
    # there is no "alg:" prefix).
    expected = known_hash.rsplit(":", 1)[-1]
    is_same = new_hash.lower() == expected.lower()
    if is_same or not strict:
        return is_same
    if source is None:
        source = str(fname)
    raise ValueError(
        f"{algorithm.upper()} hash of downloaded file ({source}) does not match"
        f" the known hash: expected {known_hash} but got {new_hash}. Deleted"
        " download for safety. The downloaded file may have been corrupted or"
        " the known hash may be outdated."
    )
183 | |
184 | |
def make_registry(directory, output, recursive=True):
    """
    Make a registry of files and hashes for the given directory.

    This is helpful if you have many files in your test dataset as it keeps you
    from needing to manually update the registry.

    Each line of the registry has the form ``<file name> <hash>``. File names
    are relative to *directory*, always use ``/`` as the path separator, and
    are listed in sorted order. Hashes are computed with the default algorithm
    of :func:`file_hash`.

    Parameters
    ----------
    directory : str
        Directory of the test data to put in the registry. All file names in
        the registry will be relative to this directory.
    output : str
        Name of the output registry file.
    recursive : bool
        If True, will recursively look for files in subdirectories of
        *directory*.

    """
    directory = Path(directory)
    pattern = "**/*" if recursive else "*"

    # Sort on the native string form of each relative path so the order of the
    # entries is the same as sorting the plain path strings.
    relative_paths = sorted(
        (
            path.relative_to(directory)
            for path in directory.glob(pattern)
            if path.is_file()
        ),
        key=str,
    )

    # Hash every file before the output is opened: opening in "w" mode
    # truncates the output, which matters if it lives inside *directory*.
    #
    # Only use Unix separators for the registry so that we don't go insane
    # dealing with file paths. as_posix() converts path separators only, so a
    # backslash that is part of a file name on POSIX systems is kept intact
    # (a blanket replace of "\\" with "/" would corrupt such names).
    entries = [
        (relative.as_posix(), file_hash(str(directory / relative)))
        for relative in relative_paths
    ]

    with open(output, "w", encoding="utf-8") as outfile:
        outfile.writelines(f"{name} {digest}\n" for name, digest in entries)