comparison charset_normalizer/legacy.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:32:17 -0400
parents
children
comparison
equal deleted inserted replaced
6:b2745907b1eb 7:5eb2d5e3bf22
1 from typing import Any, Dict, Optional, Union
2 from warnings import warn
3
4 from .api import from_bytes
5 from .constant import CHARDET_CORRESPONDENCE
6
7
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet-compatible entry point (legacy API).

    Runs the detection on ``byte_str`` and reports the best match as a
    chardet-style dictionary with the keys ``encoding``, ``language`` and
    ``confidence``. Any extra keyword argument is ignored; a warning listing
    the ignored names is emitted.

    :param byte_str: The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents? When left to
                                 ``False``, an encoding name that is a key of
                                 ``CHARDET_CORRESPONDENCE`` is replaced by the
                                 value mapped to it.
    :return: A dictionary where ``encoding`` is the detected encoding name (or
             ``None`` without a match), ``language`` is the detected language
             (empty string when it is "Unknown" or without a match) and
             ``confidence`` is ``1.0 - chaos`` of the best match (or ``None``
             without a match).
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if len(kwargs):
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    # Normalise the payload to an immutable ``bytes`` object, rejecting anything else.
    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)
    elif not isinstance(byte_str, bytes):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )

    best_match = from_bytes(byte_str).best()

    if best_match is None:
        # Nothing plausible was found: report the "empty" result.
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_match.encoding

        detected_language = best_match.language
        language = "" if detected_language == "Unknown" else detected_language

        confidence = 1.0 - best_match.chaos

        # Note: the signature (BOM) is stripped during detection/normalization, so
        # 'utf_8' is what gets reported here; chardet answers 'utf-8-sig' in that
        # case, which is a valid codec name, so the suffix is restored.
        if encoding == "utf_8" and best_match.bom:
            encoding += "_sig"

    if should_rename_legacy is False:
        if encoding in CHARDET_CORRESPONDENCE:
            encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }