jpayne@7: import importlib
jpayne@7: import logging
jpayne@7: import unicodedata
jpayne@7: from codecs import IncrementalDecoder
jpayne@7: from encodings.aliases import aliases
jpayne@7: from functools import lru_cache
jpayne@7: from re import findall
jpayne@7: from typing import Generator, List, Optional, Set, Tuple, Union
jpayne@7: 
jpayne@7: from _multibytecodec import MultibyteIncrementalDecoder
jpayne@7: 
jpayne@7: from .constant import (
jpayne@7:     ENCODING_MARKS,
jpayne@7:     IANA_SUPPORTED_SIMILAR,
jpayne@7:     RE_POSSIBLE_ENCODING_INDICATION,
jpayne@7:     UNICODE_RANGES_COMBINED,
jpayne@7:     UNICODE_SECONDARY_RANGE_KEYWORD,
jpayne@7:     UTF8_MAXIMAL_ALLOCATION,
jpayne@7: )
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """
    Tell whether the Unicode name of the character mentions one of the accent marks listed below.
    A character that has no Unicode name is reported as not accentuated.
    """
    try:
        unicode_name: str = unicodedata.name(character)
    except ValueError:
        # unicodedata.name raises ValueError when no name is defined for the character.
        return False

    accent_markers: Tuple[str, ...] = (
        "WITH GRAVE",
        "WITH ACUTE",
        "WITH CEDILLA",
        "WITH DIAERESIS",
        "WITH CIRCUMFLEX",
        "WITH TILDE",
        "WITH MACRON",
        "WITH RING ABOVE",
    )

    return any(marker in unicode_name for marker in accent_markers)
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """
    Return the first code point of the character's Unicode decomposition mapping.

    :param character: A single character.
    :return: The character built from the first code point of the decomposition,
             or the given character unchanged when it has no usable decomposition.
    """
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    # The mapping may start with a formatting tag such as "<compat>" or "<font>";
    # a tag is not a hexadecimal code point and used to make int() raise ValueError.
    codes: List[str] = [
        code for code in decomposed.split(" ") if code and not code.startswith("<")
    ]

    if not codes:
        return character

    return chr(int(codes[0], 16))
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Look up the name of the first entry of UNICODE_RANGES_COMBINED whose range holds the
    code point of the given single character. None is returned when no entry matches.
    """
    code_point: int = ord(character)

    return next(
        (
            name
            for name, span in UNICODE_RANGES_COMBINED.items()
            if code_point in span
        ),
        None,
    )
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "LATIN".
    A character without a Unicode name yields False.
    """
    try:
        return "LATIN" in unicodedata.name(character)
    except ValueError:
        # No Unicode name is defined for this character.
        return False
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """
    Tell whether the character is punctuation: either its Unicode general category
    contains "P", or the name of its Unicode range contains "Punctuation".
    """
    if "P" in unicodedata.category(character):
        return True

    block_name: Optional[str] = unicode_range(character)

    return block_name is not None and "Punctuation" in block_name
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """
    Tell whether the character counts as a symbol: its Unicode general category contains
    "S" or "N", or it sits in a range whose name contains "Forms" without being of
    category "Lo".
    """
    category: str = unicodedata.category(character)

    if any(flag in category for flag in ("S", "N")):
        return True

    block_name: Optional[str] = unicode_range(character)

    if block_name is None:
        return False

    return category != "Lo" and "Forms" in block_name
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """
    Tell whether the name of the character's Unicode range contains "Emoticons" or
    "Pictographs". A character outside every known range yields False.
    """
    block_name: Optional[str] = unicode_range(character)

    if block_name is None:
        return False

    return any(keyword in block_name for keyword in ("Emoticons", "Pictographs"))
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    Tell whether the character acts as a separator: whitespace, one of a few hard-coded
    characters, any Unicode category containing "Z", or one of the categories
    "Po", "Pd" and "Pc".
    """
    if character.isspace():
        return True

    if character in {"｜", "+", "<", ">"}:
        return True

    category: str = unicodedata.category(character)

    if "Z" in category:
        return True

    return category in {"Po", "Pd", "Pc"}
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """
    Tell whether exactly one of str.islower() / str.isupper() holds for the character.
    """
    lowercase: bool = character.islower()
    uppercase: bool = character.isupper()

    # Exclusive or of two booleans: True only when the two answers differ.
    return lowercase ^ uppercase
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "CJK".
    A character without a Unicode name yields False.
    """
    try:
        return "CJK" in unicodedata.name(character)
    except ValueError:
        # No Unicode name is defined for this character.
        return False
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "HIRAGANA".
    A character without a Unicode name yields False.
    """
    try:
        return "HIRAGANA" in unicodedata.name(character)
    except ValueError:
        # No Unicode name is defined for this character.
        return False
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "KATAKANA".
    A character without a Unicode name yields False.
    """
    try:
        return "KATAKANA" in unicodedata.name(character)
    except ValueError:
        # No Unicode name is defined for this character.
        return False
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "HANGUL".
    A character without a Unicode name yields False.
    """
    try:
        return "HANGUL" in unicodedata.name(character)
    except ValueError:
        # No Unicode name is defined for this character.
        return False
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "THAI".
    A character without a Unicode name yields False.
    """
    try:
        return "THAI" in unicodedata.name(character)
    except ValueError:
        # No Unicode name is defined for this character.
        return False
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "ARABIC".
    A character without a Unicode name yields False.
    """
    try:
        return "ARABIC" in unicodedata.name(character)
    except ValueError:
        # No Unicode name is defined for this character.
        return False
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains both "ARABIC" and
    "ISOLATED FORM". A character without a Unicode name yields False.
    """
    try:
        unicode_name: str = unicodedata.name(character)
    except ValueError:
        # No Unicode name is defined for this character.
        return False

    return all(token in unicode_name for token in ("ARABIC", "ISOLATED FORM"))
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """
    Tell whether the range name contains at least one of the keywords listed in
    UNICODE_SECONDARY_RANGE_KEYWORD.
    """
    for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
        if keyword in range_name:
            return True

    return False
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """
    Tell whether the character is neither whitespace nor printable, leaving aside two
    explicitly tolerated characters (see the comments below).
    """
    # str.isspace() already covers \n \t \r \v, so those are never reported.
    if character.isspace() or character.isprintable():
        return False

    # "\x1A" is the ASCII substitute character.
    # "\ufeff" is the Zero Width No-Break Space; per the original author's note it sits in
    # Arabic Presentation Forms-B (Unicode 1.1) and Python does not acknowledge it as a space.
    return character not in {"\x1A", "\ufeff"}
jpayne@7: 
jpayne@7: 
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Search the first `search_zone` bytes, decoded as ASCII with undecodable bytes dropped,
    for an encoding declaration matched by RE_POSSIBLE_ENCODING_INDICATION.

    The first match that corresponds to a key or a value of `encodings.aliases.aliases`
    wins, and the value side of that entry is returned. None is returned when nothing
    matches. A TypeError is raised when `sequence` is not a bytes instance.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    head: bytes = sequence[: min(len(sequence), search_zone)]

    candidates: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        head.decode("ascii", errors="ignore"),
    )

    for candidate in candidates:
        # Same normalisation as the keys of the aliases table: lower case, underscores.
        normalized: str = candidate.lower().replace("-", "_")

        for alias, iana in aliases.items():
            if normalized in (alias, iana):
                return iana

    return None
jpayne@7: 
jpayne@7: 
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Tell whether an encoding, given by its Python codec module name, is a multi-byte one.

    The UTF family listed below is answered directly; for any other name the module
    `encodings.<name>` is imported and its IncrementalDecoder is checked against
    MultibyteIncrementalDecoder.
    """
    unicode_transformation_formats = {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }

    if name in unicode_transformation_formats:
        return True

    codec_module = importlib.import_module("encodings.{}".format(name))

    return issubclass(codec_module.IncrementalDecoder, MultibyteIncrementalDecoder)
jpayne@7: 
jpayne@7: 
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Find which signature / byte order mark from ENCODING_MARKS, if any, the sequence starts with.

    Returns the matching encoding name together with the mark found, or (None, b"")
    when the sequence starts with none of the known marks.
    """
    for iana_encoding in ENCODING_MARKS:
        known_marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        # An entry is either a single mark or a list of alternative marks.
        candidates: List[bytes] = (
            [known_marks] if isinstance(known_marks, bytes) else known_marks
        )

        for candidate in candidates:
            if sequence.startswith(candidate):
                return iana_encoding, candidate

    return None, b""
jpayne@7: 
jpayne@7: 
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """
    Tell whether the SIG/BOM should be stripped for the given encoding:
    True for every encoding except "utf_16" and "utf_32".
    """
    if iana_encoding in {"utf_16", "utf_32"}:
        return False

    return True
jpayne@7: 
jpayne@7: 
def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Resolve a code page name through `encodings.aliases.aliases`.

    The name is lower-cased and its dashes are turned into underscores, then compared
    with both sides of every alias entry; the value side of the first matching entry is
    returned. Without a match, a ValueError is raised when `strict` is true, otherwise
    the normalised name is returned as is.
    """
    normalized: str = cp_name.lower().replace("-", "_")

    resolved: Optional[str] = next(
        (
            iana
            for alias, iana in aliases.items()
            if normalized == alias or normalized == iana
        ),
        None,
    )

    if resolved is not None:
        return resolved

    if not strict:
        return normalized

    raise ValueError("Unable to retrieve IANA for '{}'".format(normalized))
jpayne@7: 
jpayne@7: 
def range_scan(decoded_sequence: str) -> List[str]:
    """
    Collect the distinct Unicode range names of the characters of a decoded sequence.
    Characters for which unicode_range() gives None are left out.
    """
    seen_ranges: Set[str] = {
        block_name
        for block_name in map(unicode_range, decoded_sequence)
        if block_name is not None
    }

    return list(seen_ranges)
jpayne@7: 
jpayne@7: 
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Measure how alike two single-byte code pages are.

    Every possible byte value (0x00 to 0xFF) is decoded with the incremental decoder of
    both encodings, undecodable bytes being ignored, and the results are compared.

    :param iana_name_a: Codec module name of the first encoding (as in `encodings.<name>`).
    :param iana_name_b: Codec module name of the second encoding.
    :return: The share of byte values decoded identically, between 0.0 and 1.0.
             0.0 is returned straight away when either encoding is a multi-byte one.
    """
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    # A byte can take 256 values. The sample size and the divisor must be the same
    # number, otherwise two identical code pages would score above 1.0.
    byte_value_count: int = 256
    character_match_count: int = 0

    for i in range(byte_value_count):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / byte_value_count
jpayne@7: 
jpayne@7: 
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Tell whether two code pages are registered as similar, i.e. whether `iana_name_b`
    is listed under `iana_name_a` in IANA_SUPPORTED_SIMILAR.

    Per the original note, that table was generated with cp_similarity() and holds the
    code pages that are at least 80% similar.
    """
    if iana_name_a not in IANA_SUPPORTED_SIMILAR:
        return False

    return iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
jpayne@7: 
jpayne@7: 
def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Set the level of the named logger and attach a new stream handler to it.

    A fresh StreamHandler using `format_string` is added on every call.
    """
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(format_string))

    target_logger = logging.getLogger(name)
    target_logger.setLevel(level)
    target_logger.addHandler(stream_handler)
jpayne@7: 
jpayne@7: 
def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Yield decoded chunks of a payload, one per offset.

    :param sequences: The raw bytes to cut.
    :param encoding_iana: Name of the encoding used to decode each cut.
    :param offsets: Start positions of the chunks.
    :param chunk_size: Length of a chunk (characters when slicing `decoded_payload`,
                       bytes when slicing `sequences`).
    :param bom_or_sig_available: Whether a SIG/BOM was found for the payload.
    :param strip_sig_or_bom: Whether that SIG/BOM is to be stripped; when it is not,
                             `sig_payload` is put in front of every cut before decoding.
    :param sig_payload: The SIG/BOM bytes.
    :param is_multi_byte_decoder: Whether the encoding is a multi-byte one.
    :param decoded_payload: The already decoded payload, when available.
    :return: A generator of decoded chunks.
    :raises UnicodeDecodeError: When a cut cannot be decoded with a non multi-byte
                                encoding (those are decoded in strict mode).
    """
    if decoded_payload and is_multi_byte_decoder is False:
        # Single-byte encoding already decoded: slicing the text is enough.
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
        return

    for i in offsets:
        chunk_end = i + chunk_size
        # Skip cuts reaching more than 8 bytes past the end of the data.
        if chunk_end > len(sequences) + 8:
            continue

        cut_sequence = sequences[i:chunk_end]

        if bom_or_sig_available and strip_sig_or_bom is False:
            cut_sequence = sig_payload + cut_sequence

        chunk = cut_sequence.decode(
            encoding_iana,
            errors="ignore" if is_multi_byte_decoder else "strict",
        )

        # multi-byte bad cutting detector and adjustment:
        # when the start of the chunk cannot be found in the decoded payload, the cut
        # most likely began in the middle of a character, so it is retried from up to
        # three bytes earlier.
        if is_multi_byte_decoder and i > 0:
            chunk_partial_size_chk: int = min(chunk_size, 16)

            if (
                decoded_payload
                and chunk[:chunk_partial_size_chk] not in decoded_payload
            ):
                # Never step before the first byte: a negative start would make the
                # slice count from the END of the data and give an unrelated chunk.
                for j in range(i, max(i - 4, -1), -1):
                    cut_sequence = sequences[j:chunk_end]

                    if bom_or_sig_available and strip_sig_or_bom is False:
                        cut_sequence = sig_payload + cut_sequence

                    chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                    if chunk[:chunk_partial_size_chk] in decoded_payload:
                        break

        yield chunk