jpayne@69
|
1 # Copyright (c) 2018 The Pooch Developers.
|
jpayne@69
|
2 # Distributed under the terms of the BSD 3-Clause License.
|
jpayne@69
|
3 # SPDX-License-Identifier: BSD-3-Clause
|
jpayne@69
|
4 #
|
jpayne@69
|
5 # This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
jpayne@69
|
6 #
|
jpayne@69
|
7 """
|
jpayne@69
|
8 Misc utilities
|
jpayne@69
|
9 """
|
jpayne@69
|
10 import logging
|
jpayne@69
|
11 import os
|
jpayne@69
|
12 import tempfile
|
jpayne@69
|
13 import hashlib
|
jpayne@69
|
14 from pathlib import Path
|
jpayne@69
|
15 from urllib.parse import urlsplit
|
jpayne@69
|
16 from contextlib import contextmanager
|
jpayne@69
|
17 import warnings
|
jpayne@69
|
18
|
jpayne@69
|
19 import platformdirs
|
jpayne@69
|
20 from packaging.version import Version
|
jpayne@69
|
21
|
jpayne@69
|
22
|
jpayne@69
|
# Module-level logger handed out by get_logger(). It is instantiated directly
# (not obtained through logging.getLogger) and gets its own stream handler.
LOGGER = logging.Logger("pooch")
LOGGER.addHandler(logging.StreamHandler())
|
jpayne@69
|
25
|
jpayne@69
|
26
|
jpayne@69
|
def file_hash(*args, **kwargs):
    """
    Deprecated pass-through to the ``file_hash`` function in ``.hashes``.

    WARNING: Importing this function from pooch.utils is DEPRECATED.
    Please import from the top-level namespace (`from pooch import file_hash`)
    instead, which is fully backwards compatible with pooch >= 0.1.

    All positional and keyword arguments are forwarded unchanged and the
    result of the forwarded call is returned. A :class:`DeprecationWarning`
    is issued on every call.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    # Imported inside the function rather than at module level.
    # pylint: disable=import-outside-toplevel
    from .hashes import file_hash as _current_file_hash

    deprecation_notice = """
    Importing file_hash from pooch.utils is DEPRECATED. Please import from the
    top-level namespace (`from pooch import file_hash`) instead, which is fully
    backwards compatible with pooch >= 0.1.
    """
    # stacklevel=2 attributes the warning to the caller of this wrapper.
    warnings.warn(deprecation_notice, DeprecationWarning, stacklevel=2)
    return _current_file_hash(*args, **kwargs)
|
jpayne@69
|
55
|
jpayne@69
|
56
|
jpayne@69
|
def get_logger():
    """
    Return the logger object used by Pooch.

    Pooch reports events such as file downloads and archive extraction
    through this logger. Call :meth:`logging.Logger.setLevel` on the returned
    object to make Pooch more or less verbose.

    Returns
    -------
    logger : :class:`logging.Logger`
        The module-level Pooch logger (the same object on every call).
    """
    return LOGGER
|
jpayne@69
|
71
|
jpayne@69
|
72
|
jpayne@69
|
def os_cache(project):
    r"""
    Return the operating system's default cache folder for *project*.

    The location is whatever ``platformdirs.user_cache_dir`` reports for the
    given project name. Typical values are (see the
    `platformdirs documentation <https://platformdirs.readthedocs.io>`__):

    * Mac: ``~/Library/Caches/<AppName>``
    * Unix: ``~/.cache/<AppName>`` or the value of the ``XDG_CACHE_HOME``
      environment variable, if defined.
    * Windows: ``C:\Users\<user>\AppData\Local\<AppAuthor>\<AppName>\Cache``

    Parameters
    ----------
    project : str
        The project name.

    Returns
    -------
    cache_path : :class:`pathlib.Path`
        The default location for the data cache. User directories (``'~'``)
        are not expanded.

    """
    cache_dir = platformdirs.user_cache_dir(project)
    return Path(cache_dir)
|
jpayne@69
|
100
|
jpayne@69
|
101
|
jpayne@69
|
def check_version(version, fallback="master"):
    """
    Check if a version is PEP440 compliant and there are no unreleased changes.

    For example, ``version = "0.1"`` will be returned as is but ``version =
    "0.1+10.8dl8dh9"`` will return the fallback. This is the convention used by
    `versioneer <https://github.com/warner/python-versioneer>`__ to mark that
    this version is 10 commits ahead of the last release.

    Parameters
    ----------
    version : str
        A version string.
    fallback : str
        What to return if the version string has unreleased changes.

    Returns
    -------
    version : str
        *version* itself if it is PEP440 compliant and has **no** unreleased
        changes (i.e., no local version segment after a ``+``). If it does
        carry a local version segment, *fallback* is returned instead.

    Raises
    ------
    InvalidVersion
        If *version* is not PEP440 compliant.

    Examples
    --------

    >>> check_version("0.1")
    '0.1'
    >>> check_version("0.1a10")
    '0.1a10'
    >>> check_version("0.1+111.9hdg36")
    'master'
    >>> check_version("0.1+111.9hdg36", fallback="dev")
    'dev'

    """
    # Parsing validates PEP440 compliance (raises InvalidVersion otherwise).
    parsed = Version(version)
    # The local segment is the part after "+", used to flag unreleased changes.
    if parsed.local is not None:
        return fallback
    return version
|
jpayne@69
|
146
|
jpayne@69
|
147
|
jpayne@69
|
def parse_url(url):
    """
    Parse a URL into 3 components:

    <protocol>://<netloc>/<path>

    Example URLs:

    * http://127.0.0.1:8080/test.nc
    * ftp://127.0.0.1:8080/test.nc
    * doi:10.6084/m9.figshare.923450.v1/test.nc

    The DOI is a special case. The protocol will be "doi", the netloc will be
    the DOI, and the path is what comes after the last "/".
    The only exception are Zenodo dois: the protocol will be "doi", the netloc
    will be composed by the "prefix/suffix" and the path is what comes after
    the second "/". This allows to support special cases of Zenodo dois where
    the path contains forward slashes "/", created by the GitHub-Zenodo
    integration service.

    URLs without a scheme are reported with the protocol "file".

    Parameters
    ----------
    url : str
        The URL.

    Returns
    -------
    parsed_url : dict
        Three components of a URL (e.g.,
        ``{'protocol':'http', 'netloc':'127.0.0.1:8080','path': '/test.nc'}``).

    Raises
    ------
    ValueError
        If a DOI link uses ``doi://`` or does not contain at least one "/"
        separating the DOI prefix from what follows it.

    """
    if url.startswith("doi://"):
        raise ValueError(
            f"Invalid DOI link '{url}'. You must not use '//' after 'doi:'."
        )
    if url.startswith("doi:"):
        protocol = "doi"
        parts = url[4:].split("/")
        # Without at least one "/" there is no second component to inspect;
        # fail with a clear message instead of an IndexError on parts[1].
        if len(parts) < 2:
            raise ValueError(
                f"Invalid DOI link '{url}'. "
                "Expected the form 'doi:<prefix>/<suffix>[/<file name>]'."
            )
        if "zenodo" in parts[1].lower():
            # Zenodo: the DOI is always the first two components, and the
            # remainder (which may itself contain "/") is the file path.
            netloc = "/".join(parts[:2])
            path = "/" + "/".join(parts[2:])
        else:
            # Other repositories: only the last component is the file path.
            netloc = "/".join(parts[:-1])
            path = "/" + parts[-1]
    else:
        parsed_url = urlsplit(url)
        protocol = parsed_url.scheme or "file"
        netloc = parsed_url.netloc
        path = parsed_url.path
    return {"protocol": protocol, "netloc": netloc, "path": path}
|
jpayne@69
|
199
|
jpayne@69
|
200
|
jpayne@69
|
def cache_location(path, env=None, version=None):
    """
    Work out where the local cache lives.

    Starts from *path*, lets a non-empty environment variable named *env*
    replace it, and appends *version* as a final path component when given.
    A leading ``~`` in the result is expanded to the user's home directory.

    Parameters
    ----------
    path : str, PathLike, list or tuple
        The path to the local data storage folder. If this is a list or tuple,
        we'll join the parts with the appropriate separator. Use
        :func:`pooch.os_cache` for a sensible default.
    version : str or None
        The version string for your project. Will be appended to given path if
        not None.
    env : str or None
        An environment variable that can be used to overwrite *path*. This
        allows users to control where they want the data to be stored. We'll
        append *version* to the end of this value as well.

    Returns
    -------
    local_path : PathLike
        The path to the local directory.

    """
    # An unset or empty environment variable leaves *path* untouched.
    override = os.environ.get(env) if env is not None else None
    if override:
        path = override
    if isinstance(path, (list, tuple)):
        path = os.path.join(*path)
    location = str(path)
    if version is not None:
        location = os.path.join(location, version)
    return Path(os.path.expanduser(location))
|
jpayne@69
|
236
|
jpayne@69
|
237
|
jpayne@69
|
def make_local_storage(path, env=None):
    """
    Ensure the local cache folder exists and can be written to.

    If *path* does not exist it is created (including parents). If it already
    exists, a throw-away temporary file is created inside it to confirm that
    the folder is writable.

    Parameters
    ----------
    path : str or PathLike
        The path to the local data storage folder.
    env : str or None
        An environment variable that can be used to overwrite *path*. Only used
        in the error message in case the folder is not writable.

    Raises
    ------
    PermissionError
        If the folder cannot be created or written to. The message names the
        folder and, when *env* is given, the environment variable to use.
    """
    path = str(path)
    already_there = os.path.exists(path)
    # Verb used in the error message if the operation below is denied.
    action = "write to" if already_there else "create"

    try:
        if already_there:
            # Creating (and immediately discarding) a file proves writability.
            with tempfile.NamedTemporaryFile(dir=path):
                pass
        else:
            # Parallel jobs may race to create the folder; exist_ok keeps the
            # losers from raising.
            os.makedirs(path, exist_ok=True)
    except PermissionError as error:
        pieces = [
            str(error),
            f"| Pooch could not {action} data cache folder '{path}'.",
            "Will not be able to download data files.",
        ]
        if env is not None:
            pieces.append(
                f"Use environment variable '{env}' to specify a different location."
            )
        raise PermissionError(" ".join(pieces)) from error
|
jpayne@69
|
277
|
jpayne@69
|
278
|
jpayne@69
|
@contextmanager
def temporary_file(path=None):
    """
    Context manager yielding the name of a closed temporary file.

    :class:`tempfile.NamedTemporaryFile` cannot be opened a second time on
    Windows while it is still open (for example, when its name is passed to a
    Pooch function). To work around that, the file is created and closed
    right away, its path is yielded, and the file is removed on exit if it
    still exists.

    Parameters
    ----------
    path : str or PathLike
        The directory in which the temporary file will be created.

    Yields
    ------
    fname : str
        The path to the temporary file.

    """
    # delete=False keeps the file on disk after the handle is closed.
    with tempfile.NamedTemporaryFile(delete=False, dir=path) as handle:
        fname = handle.name
    try:
        yield fname
    finally:
        if os.path.exists(fname):
            os.remove(fname)
|
jpayne@69
|
308
|
jpayne@69
|
309
|
jpayne@69
|
def unique_file_name(url):
    """
    Build a file name that is unique to the given URL.

    The name is the MD5 hex digest of the full URL, a ``-`` separator, and the
    last portion of the URL path.

    The format will be: ``{md5}-{filename}.{ext}``

    The file-name portion is cropped from the start so that the complete name
    (digest, separator and file name) is at most 255 characters long (the
    limit on most file systems).

    Parameters
    ----------
    url : str
        The URL with a file name at the end.

    Returns
    -------
    fname : str
        The file name, unique to this URL.

    Examples
    --------

    >>> print(unique_file_name("https://www.some-server.org/2020/data.txt"))
    02ddee027ce5ebb3d7059fb23d210604-data.txt
    >>> print(unique_file_name("https://www.some-server.org/2019/data.txt"))
    9780092867b497fca6fc87d8308f1025-data.txt
    >>> print(unique_file_name("https://www.some-server.org/2020/data.txt.gz"))
    181a9d52e908219c2076f55145d6a344-data.txt.gz

    """
    digest = hashlib.md5(url.encode()).hexdigest()
    basename = parse_url(url)["path"].split("/")[-1]
    # Room left for the file name once the digest and the "-" are accounted
    # for; keep the tail of the name so the extension survives cropping.
    max_name_length = 255 - len(digest) - 1
    return f"{digest}-{basename[-max_name_length:]}"
|