jpayne@7: from encodings.aliases import aliases
jpayne@7: from hashlib import sha256
jpayne@7: from json import dumps
jpayne@7: from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
jpayne@7: 
jpayne@7: from .constant import TOO_BIG_SEQUENCE
jpayne@7: from .utils import iana_name, is_multi_byte_encoding, unicode_range
jpayne@7: 
jpayne@7: 
class CharsetMatch:
    """
    A single candidate encoding for a bytes payload, together with the scores used to rank it.

    Instances are ordered with ``<`` (see ``__lt__``) so that a sorted list starts with the
    preferred candidate.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original bytes, exposed unchanged through ``raw``.
        :param guessed_encoding: Codec name used to decode ``payload``.
        :param mean_mess_ratio: Mess score, exposed through ``chaos``.
        :param has_sig_or_bom: Flag exposed through ``bom`` and ``byte_order_mark``.
        :param languages: (language, coherence) pairs; the first pair drives ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text if available; otherwise ``__str__`` decodes lazily.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the ``alphabets`` property.
        self._unicode_ranges: Optional[List[str]] = None

        # Matches registered through add_submatch().
        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last re-encoded payload and the encoding that produced it.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when they share both encoding and fingerprint.

        :raises TypeError: If ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        "Less than" means "sorts first". Lower chaos wins, except when both chaos values are
        within 0.01 of each other; then coherence, and finally multi-byte usage, break the tie.

        :raises ValueError: If ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # multi_byte_usage needs the decoded string, so it is skipped for big payloads
            # to preserve RAM usage.
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Return ``1 - (decoded character count / raw byte count)``.

        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Return the payload decoded with the guessed encoding (strict error handling).
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register ``other`` as a submatch of this match.

        :raises ValueError: If ``other`` is not a CharsetMatch or compares equal to this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """
        The guessed encoding name given at construction.
        """
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # ``aliases`` maps alias -> codec name; collect the other side of every pair
        # in which this encoding appears.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """
        The ``has_sig_or_bom`` flag given at construction.
        """
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """
        Same value as ``bom``.
        """
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """
        The mean mess ratio given at construction.
        """
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Coherence ratio of the first language entry, or 0.0 when no language was provided.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """
        ``chaos`` as a percentage, rounded to 3 digits.
        """
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """
        ``coherence`` as a percentage, rounded to 3 digits.
        """
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """
        Matches registered through add_submatch() (the internal list, not a copy).
        """
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """
        True when at least one submatch has been registered.
        """
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated unicode range names found in the decoded payload.
        Computed on first access, then cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges (unicode_range may yield a falsy value, which is dropped below)
        detected_ranges = {unicode_range(char) for char in str(self)}
        # filter and sort
        self._unicode_ranges = sorted(r for r in detected_ranges if r)
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the encoder
        ("replace" error handler); they are not dropped.

        The last result is cached and reused while the same encoding is requested.
        """
        if self._output_payload is None or self._output_encoding != encoding:
            # Encode first: if this raises (e.g. unknown codec name) the cache must stay
            # untouched, otherwise a later call with the same name would return None.
            self._output_payload = str(self).encode(encoding, "replace")
            self._output_encoding = encoding

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
jpayne@7: 
jpayne@7: 
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches; they are sorted on the way in. None or empty gives an empty container.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        :raises IndexError: If ``item`` is an int outside the stored results.
        :raises KeyError: If ``item`` is an encoding name not present in results, or is neither int nor str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalise the requested name before comparing it to the stored encoding names.
            wanted = iana_name(item, False)
            for result in self._results:
                if wanted in result.could_be_from_charset:
                    return result
        # Carry the requested key so the failure is self-explanatory.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: If ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same fingerprint and same chaos: fold the newcomer into the existing
                # match instead of listing it separately.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
jpayne@7: 
jpayne@7: 
# One (language name, coherence ratio) pair, as read by CharsetMatch.languages / .coherence.
CoherenceMatch = Tuple[str, float]
# List of such pairs; CharsetMatch reads its first entry for the `language` and `coherence` properties.
CoherenceMatches = List[CoherenceMatch]
jpayne@7: 
jpayne@7: 
class CliDetectionResult:
    """
    Plain record describing one detection outcome, serialisable with to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """
        Store every argument under an attribute of the same name.
        """
        self.path: str = path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.unicode_path: Optional[str] = unicode_path
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Build a fresh mapping of the exported fields; the key order below is the serialisation order.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported_fields}

    def to_json(self) -> str:
        """
        Return the exported fields as an ASCII-only JSON document indented by 4 spaces.
        """
        serialisable = self.__dict__
        return dumps(serialisable, indent=4, ensure_ascii=True)