jpayne@69
|
1 # Copyright (c) 2018 The Pooch Developers.
|
jpayne@69
|
2 # Distributed under the terms of the BSD 3-Clause License.
|
jpayne@69
|
3 # SPDX-License-Identifier: BSD-3-Clause
|
jpayne@69
|
4 #
|
jpayne@69
|
5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
jpayne@69
|
6 #
|
jpayne@69
|
7 """
|
jpayne@69
|
8 Calculating and checking file hashes.
|
jpayne@69
|
9 """
|
jpayne@69
|
10 import hashlib
|
jpayne@69
|
11 import functools
|
jpayne@69
|
12 from pathlib import Path
|
jpayne@69
|
13
|
jpayne@69
|
# The hashlib documentation
# (https://docs.python.org/3/library/hashlib.html#hashlib.new) states that the
# named constructors are much faster than new() and should be preferred, so
# each algorithm name maps to its named constructor when hashlib exposes one
# and to a new() call pre-bound to that name otherwise.
ALGORITHMS_AVAILABLE = dict(
    (name, getattr(hashlib, name, functools.partial(hashlib.new, name)))
    for name in hashlib.algorithms_available
)

try:
    import xxhash

    # xxhash doesn't publish a list of its algorithms yet
    # (https://github.com/ifduyue/python-xxhash/issues/48), so each known name
    # is looked up individually; a constructor that is missing becomes None.
    ALGORITHMS_AVAILABLE.update(
        (name, getattr(xxhash, name, None))
        for name in ("xxh128", "xxh64", "xxh32", "xxh3_128", "xxh3_64")
    )
    # The xxh3 algorithms are only available for xxhash>=2.0. Drop every entry
    # that resolved to None to stay backwards compatible with older versions.
    ALGORITHMS_AVAILABLE = dict(
        item for item in ALGORITHMS_AVAILABLE.items() if item[1] is not None
    )
except ImportError:
    # xxhash is optional: without it only the hashlib algorithms are offered.
    pass
|
jpayne@69
|
41
|
jpayne@69
|
42
|
jpayne@69
|
def file_hash(fname, alg="sha256"):
    """
    Compute the hexadecimal digest of a file's contents.

    Handy for detecting whether a file has changed or been corrupted.

    Parameters
    ----------
    fname : str
        The name of the file.
    alg : str
        The name of the hashing algorithm. Must be one of the keys of
        ``ALGORITHMS_AVAILABLE``.

    Returns
    -------
    hash : str
        The hash of the file, as a hexadecimal string.

    Raises
    ------
    ValueError
        If *alg* is not one of the available algorithms.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    constructor = ALGORITHMS_AVAILABLE.get(alg)
    if constructor is None:
        raise ValueError(
            f"Algorithm '{alg}' not available to the pooch library. "
            "Only the following algorithms are available "
            f"{list(ALGORITHMS_AVAILABLE.keys())}."
        )
    # Build the hasher before touching the file, then feed it fixed-size
    # chunks so the whole file never has to be held in memory at once.
    hasher = constructor()
    chunksize = 65536
    with open(fname, "rb") as fin:
        # The two-argument form of iter() keeps calling read() until it
        # returns the empty bytes object, which marks the end of the file.
        for chunk in iter(lambda: fin.read(chunksize), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
|
jpayne@69
|
88
|
jpayne@69
|
89
|
jpayne@69
|
def hash_algorithm(hash_string):
    """
    Extract the name of the hashing algorithm from a hash string.

    A hash string has the form ``algorithm:hash``, where the ``algorithm:``
    prefix is optional. Everything before the first colon is taken as the
    algorithm name and returned in lowercase.

    When there is no prefix, or the hash string is None, the result is
    ``"sha256"``.

    Parameters
    ----------
    hash_string : str
        The hash string with optional algorithm prepended.

    Returns
    -------
    hash_algorithm : str
        The name of the algorithm, in lowercase.

    Examples
    --------

    >>> print(hash_algorithm("qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("md5:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    md5
    >>> print(hash_algorithm("sha256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("SHA256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("xxh3_64:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    xxh3_64
    >>> print(hash_algorithm(None))
    sha256

    """
    # No string at all, or no "algorithm:" prefix: use the default.
    if hash_string is None or ":" not in hash_string:
        return "sha256"
    # Only the text before the first colon names the algorithm.
    prefix = hash_string.split(":", 1)[0]
    return prefix.lower()
|
jpayne@69
|
135
|
jpayne@69
|
136
|
jpayne@69
|
def hash_matches(fname, known_hash, strict=False, source=None):
    """
    Check whether the hash of a file equals a known hash.

    If the *known_hash* is None, will always return True.

    Both hashes are converted to lowercase before being compared to avoid
    system specific mismatches between hashes in the registry and computed
    hashes.

    Parameters
    ----------
    fname : str or PathLike
        The path to the file.
    known_hash : str
        The known hash. Optionally, prepend ``alg:`` to the hash to specify the
        hashing algorithm. Default is SHA256.
    strict : bool
        If True, will raise a :class:`ValueError` if the hash does not match
        informing the user that the file may be corrupted.
    source : str
        The source of the downloaded file (name or URL, for example). Only
        used in the error message raised when *strict* is True, to tell the
        user where the file came from. If None, will default to *fname*.

    Returns
    -------
    is_same : bool
        True if the hash matches, False otherwise.

    Raises
    ------
    ValueError
        If *strict* is True and the hashes do not match.

    """
    # Nothing to compare against, so the file is accepted as is.
    if known_hash is None:
        return True
    algorithm = hash_algorithm(known_hash)
    new_hash = file_hash(fname, alg=algorithm)
    # The expected digest is whatever follows the last colon (or the whole
    # string when there is no "alg:" prefix).
    expected = known_hash.rpartition(":")[2]
    if new_hash.lower() == expected.lower():
        return True
    if not strict:
        return False
    if source is None:
        source = str(fname)
    raise ValueError(
        f"{algorithm.upper()} hash of downloaded file ({source}) does not match"
        f" the known hash: expected {known_hash} but got {new_hash}. Deleted"
        " download for safety. The downloaded file may have been corrupted or"
        " the known hash may be outdated."
    )
|
jpayne@69
|
183
|
jpayne@69
|
184
|
jpayne@69
|
def make_registry(directory, output, recursive=True):
    """
    Write a registry file listing the files in a directory and their hashes.

    This is helpful if you have many files in your test dataset as it keeps you
    from needing to manually update the registry. Each line of the output
    holds a file name and its hash separated by a single space, with entries
    sorted by file name.

    Parameters
    ----------
    directory : str
        Directory of the test data to put in the registry. All file names in
        the registry will be relative to this directory.
    output : str
        Name of the output registry file.
    recursive : bool
        If True, will recursively look for files in subdirectories of
        *directory*.

    """
    root = Path(directory)
    glob_pattern = "**/*" if recursive else "*"

    # Collect the regular files only, as paths relative to the root.
    relative_names = [
        str(entry.relative_to(root))
        for entry in root.glob(glob_pattern)
        if entry.is_file()
    ]
    relative_names.sort()

    # Hash every file before the output is opened, so the registry file is
    # only created (or truncated) once all the hashes are known.
    entries = [(name, file_hash(str(root / name))) for name in relative_names]

    with open(output, "w", encoding="utf-8") as registry:
        for name, digest in entries:
            # Only use Unix separators for the registry so that we don't go
            # insane dealing with file paths.
            registry.write("{} {}\n".format(name.replace("\\", "/"), digest))
|