from typing import Any, Dict, Optional, Union
from warnings import warn

from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE


def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    Legacy, chardet-style entry point: guess the encoding of a byte string.

    Kept so that projects written against chardet can migrate easily; it aims
    to be mostly backward-compatible and reports encoding names the way chardet
    writes them whenever a correspondence exists. The function is deprecated
    (consult the documentation for further information) but is not planned
    for removal.

    :param byte_str: The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy: Should we rename legacy encodings to their
        more modern equivalents? While this is left as ``False``, an encoding
        name present in ``CHARDET_CORRESPONDENCE`` is replaced by the value
        mapped there.
    :param kwargs: Accepted for call compatibility only; any name passed here
        is ignored and reported through a warning.
    :return: A dict with the keys ``"encoding"``, ``"language"`` and
        ``"confidence"``. When no match is found, ``encoding`` and
        ``confidence`` are ``None`` and ``language`` is an empty string.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        ignored_names = ",".join(kwargs)
        warn(
            f"charset-normalizer disregard arguments '{ignored_names}' in legacy function detect()"
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    # Hand the detector an immutable bytes object.
    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    best_match = from_bytes(byte_str).best()

    if best_match is None:
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = best_match.encoding
        # An "Unknown" language is reported as an empty string.
        language = "" if best_match.language == "Unknown" else best_match.language
        confidence = 1.0 - best_match.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' because the signature is stripped during the
        # detection/normalization process, but chardet does return 'utf-8-sig' and it is a valid codec name.
        if encoding == "utf_8" and best_match.bom:
            encoding += "_sig"

    # Identity check on purpose: only a literal False selects the chardet spelling.
    if should_rename_legacy is False:
        if encoding in CHARDET_CORRESPONDENCE:
            encoding = CHARDET_CORRESPONDENCE[encoding]

    result = {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }
    return result