Mercurial > repos > jpayne > bioproject_to_srr_2
comparison charset_normalizer/legacy.py @ 7:5eb2d5e3bf22
planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author | jpayne |
---|---|
date | Sun, 05 May 2024 23:32:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
6:b2745907b1eb | 7:5eb2d5e3bf22 |
---|---|
1 from typing import Any, Dict, Optional, Union | |
2 from warnings import warn | |
3 | |
4 from .api import from_bytes | |
5 from .constant import CHARDET_CORRESPONDENCE | |
6 | |
7 | |
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding names match chardet's own spelling whenever possible (not for encoding names
    chardet does not support).
    This function is deprecated and exists to help migrate your project easily; consult the
    documentation for further information. Not planned for removal.

    :param byte_str: The byte sequence to examine (bytes or bytearray).
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents?
    :param kwargs: Accepted but ignored; a warning naming the ignored
                   arguments is emitted when any are given.
    :return: A dict with the keys "encoding", "language" and "confidence".
             "encoding" and "confidence" are None when no match was found;
             "language" is "" when no match was found or the language is "Unknown".
    :raises TypeError: If byte_str is neither bytes nor bytearray.
    """
    if kwargs:
        # stacklevel=2 attributes the warning to the caller that passed the
        # ignored arguments rather than to this line.
        warn(
            f"charset-normalizer disregard arguments '{','.join(kwargs)}' in legacy function detect()",
            stacklevel=2,
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    if r is None:
        # No match: report an empty result.
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = r.encoding
        language = r.language if r.language != "Unknown" else ""
        confidence = 1.0 - r.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
        # but chardet does return 'utf-8-sig' and it is a valid codec name.
        if encoding == "utf_8" and r.bom:
            encoding += "_sig"

    # The identity check against False is deliberate: only an explicit False
    # triggers the lookup in CHARDET_CORRESPONDENCE.
    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }