comparison charset_normalizer/utils.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:32:17 -0400
import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
        or "WITH MACRON" in description
        or "WITH RING ABOVE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))

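# A minimal usage sketch (illustrative only, not part of the module):
# the first code point of the canonical decomposition is kept, which
# strips a combining accent from precomposed characters.
#
#     >>> remove_accent("é")
#     'e'
#     >>> remove_accent("e")
#     'e'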

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None

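# A minimal usage sketch (illustrative only, not part of the module),
# assuming UNICODE_RANGES_COMBINED uses the official block names:
#
#     >>> unicode_range("a")
#     'Basic Latin'
#     >>> unicode_range("é")
#     'Latin-1 Supplement'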

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range and character_category != "Lo"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range or "Pictographs" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "ARABIC" in character_name and "ISOLATED FORM" in character_name


@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: the Zero Width No-Break Space
        # (located in Arabic Presentation Forms-B, Unicode 1.1) is not acknowledged as a space.
    )


def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
    """
    Extract any specified encoding from the first n bytes (search_zone), using an ASCII-only decoder.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None

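# A minimal usage sketch (illustrative only, not part of the module):
# the helper scans the head of a byte payload for a declared charset,
# e.g. in an XML prolog, and normalizes it to a Python codec name.
#
#     >>> any_specified_encoding(b'<?xml version="1.0" encoding="UTF-8"?>')
#     'utf_8'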

@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is multi-byte, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )


def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract SIG/BOM in given sequence.
    """

    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""

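# A minimal usage sketch (illustrative only, not part of the module):
# a payload opening with the UTF-8 byte order mark is reported along
# with the matched mark, while an unmarked payload yields (None, b"").
#
#     >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")
#     ('utf_8', b'\xef\xbb\xbf')
#     >>> identify_sig_or_bom(b"hello")
#     (None, b'')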

def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name

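# A minimal usage sketch (illustrative only, not part of the module):
# spellings are lowered, dashes become underscores, and known aliases
# from encodings.aliases resolve to their canonical codec name.
#
#     >>> iana_name("latin-1")
#     'latin_1'
#     >>> iana_name("UTF-8")
#     'utf_8'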

def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)

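# A minimal usage sketch (illustrative only, not part of the module):
# each distinct Unicode range seen in the text is reported once
# (the result order is unspecified, since it comes from a set).
#
#     >>> range_scan("abc")
#     ['Basic Latin']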

def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    # Compare how bytes 0x00-0xFE decode under each single-byte codec.
    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254

def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was
    generated using the cp_similarity function.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )

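# A minimal usage sketch (illustrative only, not part of the module):
# cp1252 and latin_1 decode most single bytes identically (they differ
# only in the 0x80-0x9F window), so their similarity ratio is high.
#
#     >>> cp_similarity("cp1252", "latin_1") > 0.8
#     True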

def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)

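# A minimal usage sketch (illustrative only, not part of the module):
# attach a stream handler with the default format at DEBUG verbosity.
#
#     >>> set_logging_handler(level=logging.DEBUG)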

def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # Detect and adjust a bad multi-byte cut; not the cleanest way
            # to perform that fix, but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
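# A minimal usage sketch (illustrative only, not part of the module):
# decode a UTF-8 payload in fixed-size windows; with no BOM/SIG and no
# pre-decoded payload, the generator simply yields each decoded slice.
#
#     >>> payload = "hello world".encode("utf_8")
#     >>> list(cut_sequence_chunks(
#     ...     payload, "utf_8", range(0, len(payload), 5), 5,
#     ...     bom_or_sig_available=False, strip_sig_or_bom=False,
#     ...     sig_payload=b"", is_multi_byte_decoder=True,
#     ... ))
#     ['hello', ' worl', 'd']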