bioproject_to_srr_2: charset_normalizer/legacy.py annotate

annotate charset_normalizer/legacy.py @ 8:832f269deeb0

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538

author	jpayne
date	Sun, 05 May 2024 23:47:10 -0400
parents	5eb2d5e3bf22
children

rev	line source
jpayne@7	1 from typing import Any, Dict, Optional, Union
jpayne@7	2 from warnings import warn
jpayne@7	3
jpayne@7	4 from .api import from_bytes
jpayne@7	5 from .constant import CHARDET_CORRESPONDENCE
jpayne@7	6
jpayne@7	7
def detect(
    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
    """
    chardet legacy method
    Detect the encoding of the given byte string. It should be mostly backward-compatible.
    The encoding name matches chardet's own spelling whenever possible (not for encoding
    names that chardet does not support).
    This function is deprecated and exists to ease the migration of your project; consult the
    documentation for further information. Not planned for removal.

    :param byte_str: The byte sequence to examine (``bytes`` or ``bytearray``).
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents? When falsy, an encoding name
                                 found in ``CHARDET_CORRESPONDENCE`` is replaced by its mapped value.
    :param kwargs: Ignored. Accepted only so legacy call sites keep working; a warning is
                   emitted when any are supplied.
    :return: A dict with the keys ``"encoding"``, ``"language"`` and ``"confidence"``.
             When no match is found, ``encoding`` and ``confidence`` are ``None`` and
             ``language`` is an empty string.
    :raises TypeError: If ``byte_str`` is neither ``bytes`` nor ``bytearray``.
    """
    if kwargs:
        # stacklevel=2 attributes the warning to the caller that passed the
        # ignored arguments rather than to this line inside the library.
        warn(
            f"charset-normalizer disregard arguments '{','.join(kwargs)}' in legacy function detect()",
            stacklevel=2,
        )

    if not isinstance(byte_str, (bytearray, bytes)):
        raise TypeError(  # pragma: nocover
            f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
        )

    if isinstance(byte_str, bytearray):
        byte_str = bytes(byte_str)

    r = from_bytes(byte_str).best()

    if r is None:
        # No plausible match: mirror the "nothing detected" result shape.
        encoding = None
        language = ""
        confidence = None
    else:
        encoding = r.encoding
        language = r.language if r.language != "Unknown" else ""
        # Confidence is reported as the complement of the match's chaos value.
        confidence = 1.0 - r.chaos

        # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the
        # detection/normalization process but chardet does return 'utf-8-sig' and it is a
        # valid codec name.
        if encoding == "utf_8" and r.bom:
            encoding += "_sig"

    # Truthiness test (rather than ``is False``) so falsy non-bool values such
    # as 0 behave like False instead of silently acting like True.
    if not should_rename_legacy and encoding in CHARDET_CORRESPONDENCE:
        encoding = CHARDET_CORRESPONDENCE[encoding]

    return {
        "encoding": encoding,
        "language": language,
        "confidence": confidence,
    }

Mercurial > repos > jpayne > bioproject_to_srr_2

annotate charset_normalizer/legacy.py @ 8:832f269deeb0