jpayne@7
|
1 from typing import Any, Dict, Optional, Union
|
jpayne@7
|
2 from warnings import warn
|
jpayne@7
|
3
|
jpayne@7
|
4 from .api import from_bytes
|
jpayne@7
|
5 from .constant import CHARDET_CORRESPONDENCE
|
jpayne@7
|
6
|
jpayne@7
|
7
|
jpayne@7
|
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    Encoding names match chardet's own spelling whenever possible (not for
    encoding names that chardet does not support).
    This function is deprecated and exists to make migrating a project easy;
    consult the documentation for further information. Not planned for removal.

    :param byte_str: The byte sequence to examine.
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents?
    :param kwargs: Accepted but ignored; a warning naming the ignored
                   arguments is emitted when any are given.
    :return: A dict with the keys ``"encoding"``, ``"language"`` and
             ``"confidence"``. When no best match is found, ``"encoding"`` and
             ``"confidence"`` are ``None`` and ``"language"`` is ``""``.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        # stacklevel=2 attributes the warning to the caller's line rather than
        # to this module, so the offending call site can actually be located.
        warn(
            f"charset-normalizer disregard arguments '{','.join(kwargs)}' in legacy function detect()",
            stacklevel=2,
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    encoding = r.encoding if r is not None else None
    # "Unknown" is reported to the caller as an empty string.
    language = r.language if r is not None and r.language != "Unknown" else ""
    # Confidence is expressed as the complement of the measured chaos.
    confidence = 1.0 - r.chaos if r is not None else None

    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig gets stripped in the
    # detection/normalization process, but chardet does return 'utf-8-sig' and it is a
    # valid codec name.
    if r is not None and encoding == "utf_8" and r.bom:
        encoding += "_sig"

    # Note the identity check: only the literal ``False`` enables the mapping
    # of the encoding name through CHARDET_CORRESPONDENCE.
    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
|