annotate charset_normalizer/legacy.py @ 8:832f269deeb0

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:47:10 -0400
parents 5eb2d5e3bf22
children
rev   line source
jpayne@7 1 from typing import Any, Dict, Optional, Union
jpayne@7 2 from warnings import warn
jpayne@7 3
jpayne@7 4 from .api import from_bytes
jpayne@7 5 from .constant import CHARDET_CORRESPONDENCE
jpayne@7 6
jpayne@7 7
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet-compatible entry point kept for legacy callers.

    Runs the charset detection on ``byte_str`` and reports the best match in
    the dictionary layout chardet users expect. Encoding names follow
    chardet's own spelling whenever a correspondence is known.
    This function is deprecated and exists to ease migration; it is not
    planned for removal.

    :param byte_str: The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy: Should we rename legacy encodings
        to their more modern equivalents? When left to ``False`` the
        detected name is translated through ``CHARDET_CORRESPONDENCE``.
    :param kwargs: Accepted for signature compatibility only; any value
        passed here is ignored and triggers a warning.
    :return: A dict with the keys ``"encoding"``, ``"language"`` and
        ``"confidence"``. ``"encoding"`` and ``"confidence"`` are ``None``
        when no match was found; ``"language"`` is ``""`` when there is no
        match or the language is reported as ``"Unknown"``.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    # Extra keyword arguments are tolerated but never used: tell the caller.
    if kwargs:
        warn(
            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            "Expected object of type bytes or bytearray, got: "
            "{0}".format(type(byte_str))
        )

    # The detection is always fed an immutable bytes object.
    payload = bytes(byte_str) if isinstance(byte_str, bytearray) else byte_str
    best_match = from_bytes(payload).best()

    encoding: Optional[str]
    confidence: Optional[float]

    if best_match is None:
        # Nothing plausible was found: report an empty result.
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_match.encoding
        detected_language = best_match.language
        language = "" if detected_language == "Unknown" else detected_language
        # Confidence is expressed as the complement of the measured chaos.
        confidence = 1.0 - best_match.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped
        # in the detection/normalization process, but chardet does return 'utf-8-sig'
        # and it is a valid codec name.
        if encoding == "utf_8" and best_match.bom:
            encoding += "_sig"

    # Unless modern names were requested, fall back to chardet's spelling.
    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }