import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    """
    Tell whether the Unicode name of the character mentions one of the
    recognised accent markers (grave, acute, cedilla, diaeresis, circumflex,
    tilde, macron, ring above). Characters without a Unicode name yield False.
    """
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
        or "WITH MACRON" in description
        or "WITH RING ABOVE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    """
    Return the first code point of the character decomposition mapping, or the
    character itself when it has no decomposition.
    """
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    # Compatibility decompositions start with a formatting tag such as
    # "<compat>" or "<wide>"; that tag is not a hexadecimal code point and
    # must be skipped before conversion, otherwise int() raises ValueError.
    codes: List[str] = [
        code for code in decomposed.split(" ") if code and not code.startswith("<")
    ]

    if not codes:
        return character

    return chr(int(codes[0], 16))


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the Unicode range official name from a single character.
    Return None when no entry of UNICODE_RANGES_COMBINED contains its code point.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "LATIN".
    Characters without a Unicode name yield False.
    """
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    """
    Tell whether the character is punctuation: either its general category
    contains "P" or the name of its Unicode range contains "Punctuation".
    """
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    """
    Tell whether the character is considered a symbol: its general category
    contains "S" or "N", or it sits in a Unicode range whose name contains
    "Forms" while not being categorised "Lo".
    """
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    """
    Tell whether the name of the character Unicode range contains
    "Emoticons" or "Pictographs".
    """
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    """
    Tell whether the character separates words: whitespace, one of | + < >,
    a general category containing "Z", or one of the categories Po, Pd, Pc.
    """
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    """
    Tell whether the character is cased, i.e. exactly one of islower() and
    isupper() holds.
    """
    return character.islower() != character.isupper()


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "CJK".
    Characters without a Unicode name yield False.
    """
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "HIRAGANA".
    Characters without a Unicode name yield False.
    """
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "KATAKANA".
    Characters without a Unicode name yield False.
    """
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "HANGUL".
    Characters without a Unicode name yield False.
    """
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "THAI".
    Characters without a Unicode name yield False.
    """
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains "ARABIC".
    Characters without a Unicode name yield False.
    """
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    """
    Tell whether the Unicode name of the character contains both "ARABIC" and
    "ISOLATED FORM". Characters without a Unicode name yield False.
    """
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name and "ISOLATED FORM" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    """
    Tell whether the range name contains any keyword listed in
    UNICODE_SECONDARY_RANGE_KEYWORD.
    """
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    """
    Tell whether the character is neither whitespace nor printable, with two
    explicit exceptions: the ASCII substitute character and U+FEFF.
    """
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python,
        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
    )


def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.

    The first `search_zone` bytes are decoded as ASCII (undecodable bytes are
    dropped) and searched with RE_POSSIBLE_ENCODING_INDICATION. Each candidate
    is lower-cased, has "-" replaced by "_", and is looked up among the keys
    and values of encodings.aliases. Return the matching codec name, or None
    when nothing matches. Raise TypeError when `sequence` is not bytes.
    """
    if not isinstance(sequence, bytes):
        raise TypeError(
            "Expected object of type bytes, got: {}".format(type(sequence).__name__)
        )

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if not results:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None


@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify if a specific encoding is a multi-byte one based on its IANA name.

    The UTF family is listed explicitly; any other name is resolved by
    importing `encodings.<name>` and checking whether its IncrementalDecoder
    derives from MultibyteIncrementalDecoder.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract SIG/BOM in given sequence.

    Return a tuple (encoding name, matched mark) for the first mark of
    ENCODING_MARKS the sequence starts with, or (None, b"") when none matches.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        # An encoding may declare a single mark or a list of marks.
        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""


def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    """
    Tell whether the SIG/BOM should be stripped for the given encoding.
    It is kept only for "utf_16" and "utf_32".
    """
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Resolve a code page name to the codec name registered in encodings.aliases.

    The name is lower-cased and "-" is replaced by "_" before the lookup.
    When nothing matches, raise ValueError if `strict` is True, otherwise
    return the normalised input name.
    """
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name


def range_scan(decoded_sequence: str) -> List[str]:
    """
    List the distinct Unicode range names found in the decoded sequence.
    Characters without a known range are skipped; order is not guaranteed.
    """
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)


def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    """
    Compute the ratio of single byte values that both code pages decode to the
    same output. Return 0.0 as soon as one of them is a multi-byte encoding.
    """
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    # Compare every possible byte value (0x00-0xFF) and divide by that same
    # count, so the ratio stays within [0.0, 1.0] and 0xFF is not left out.
    byte_value_count: int = 256

    for i in range(byte_value_count):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / byte_value_count


def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )


def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    """
    Set the level of the named logger and attach a new StreamHandler using the
    given format string. Each call adds one more handler to the logger.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)


def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Yield decoded chunks of `chunk_size` taken at each position of `offsets`.

    When a decoded payload is given and the decoder is not multi-byte, chunks
    are sliced straight from the decoded payload and iteration stops at the
    first empty chunk. Otherwise chunks are cut from the raw bytes, prefixed
    with `sig_payload` when a SIG/BOM is available and must not be stripped,
    then decoded with `encoding_iana` (errors ignored for multi-byte decoders,
    strict otherwise).
    """
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            # Skip offsets whose chunk would end more than 8 bytes past the data.
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    # Retry with the cut moved back by up to three bytes. The
                    # start is clamped at 0: a negative start index would make
                    # the slice wrap around to the end of the buffer.
                    for j in range(i, max(i - 4, -1), -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk