from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

from .constant import TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range


class CharsetMatch:
    """
    One candidate encoding for a bytes payload, together with the scores
    (mess/chaos ratio, language coherence) used to rank it against others.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Encoding name used to decode the payload.
        :param mean_mess_ratio: Mess ratio, exposed through 'chaos'.
        :param has_sig_or_bom: Exposed through 'bom' / 'byte_order_mark'.
        :param languages: Sequence of (language, ratio) pairs; the first entry
            is the one reported by 'language' and 'coherence'.
        :param decoded_payload: Already decoded text, if available. When None,
            the payload is decoded lazily on the first str() call.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property.
        self._unicode_ranges: Optional[List[str]] = None

        # Submatches registered through add_submatch().
        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last target encoding and the re-encoded bytes.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when both their encoding and their fingerprint
        are identical.

        :raises TypeError: when 'other' is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        :raises ValueError: when 'other' is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage! multi_byte_usage needs the decoded string,
            # so it is skipped for payloads at or above TOO_BIG_SEQUENCE.
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio between the decoded string length and the raw
        payload length. Returns 0.0 for an empty payload.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            # Nothing to measure; also avoids a ZeroDivisionError.
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Return the decoded payload, decoding it strictly on first use.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        """
        Short debugging representation carrying the encoding and fingerprint.
        """
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register 'other' as a submatch of this match.

        :raises ValueError: when 'other' is not a CharsetMatch or is equal to
            this instance.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """Encoding name given at construction."""
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # The stdlib table maps alias -> name; collect matches in both directions.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Value of 'has_sig_or_bom' given at construction."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as the 'bom' property."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """Mess ratio given at construction."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio of the first language entry, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """'chaos' multiplied by 100 and rounded to 3 digits."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """'coherence' multiplied by 100 and rounded to 3 digits."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """Submatches registered through add_submatch()."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """True when at least one submatch has been registered."""
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated list of the unicode range names returned by
        unicode_range() for each character of the decoded payload.
        Computed once, then cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that cannot be represented in the target encoding are replaced by the encoder
        (error handler "replace"), they are NOT silently dropped.
        The result is cached until a different target encoding is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()


class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        """
        :param results: Initial matches; they are stored sorted. None or an
            empty list gives an empty container.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon out-of-range position.
        Raise KeyError when the encoding is not present in results or when 'item' is neither int nor str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: when 'item' is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same re-encoded output and same chaos: attach to the existing match.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()


# (language, ratio) pair and a list of such pairs.
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]


class CliDetectionResult:
    """
    Plain record of one detection result, serializable through to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Mapping of every field, in the key order used by to_json().
        """
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
            "alphabets": self.alphabets,
            "has_sig_or_bom": self.has_sig_or_bom,
            "chaos": self.chaos,
            "coherence": self.coherence,
            "unicode_path": self.unicode_path,
            "is_preferred": self.is_preferred,
        }

    def to_json(self) -> str:
        """
        Serialize the result as ASCII-only JSON indented by 4 spaces.
        """
        return dumps(self.__dict__, ensure_ascii=True, indent=4)