annotate charset_normalizer/models.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:32:17 -0400
parents
children
rev   line source
jpayne@7 1 from encodings.aliases import aliases
jpayne@7 2 from hashlib import sha256
jpayne@7 3 from json import dumps
jpayne@7 4 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
jpayne@7 5
jpayne@7 6 from .constant import TOO_BIG_SEQUENCE
jpayne@7 7 from .utils import iana_name, is_multi_byte_encoding, unicode_range
jpayne@7 8
jpayne@7 9
class CharsetMatch:
    """
    One candidate decoding of a bytes payload: the guessed encoding together with the scores used to rank it
    against other candidates. The decoded text is only produced on demand, through str(match).
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        Store the raw payload and the detection scores.

        payload is kept untouched and exposed through 'raw'. guessed_encoding is the codec name used to decode it.
        mean_mess_ratio is exposed as 'chaos'. languages is a list of (language, ratio) pairs; its first entry
        drives 'language' and 'coherence'. decoded_payload, when given, is used as the decoded text so that
        str(match) does not have to decode the payload again.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property, filled on first access.
        self._unicode_ranges: Optional[List[str]] = None

        # Matches registered through add_submatch().
        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last re-encoded payload and the encoding it was produced with.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when they share both the encoding and the fingerprint.
        Raise TypeError when compared with anything that is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        A match that compares "less than" another one is sorted before it.
        Raise ValueError when compared with anything that is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage! multi_byte_usage needs the decoded text, so it is skipped for big payloads.
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio between the decoded text length and the raw payload length.
        An empty payload yields 0.0 instead of dividing by zero.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Decoded payload. Decoding is strict and happens only once, on first use.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register another match as a submatch of this one.
        Raise ValueError when other is not a CharsetMatch or is equal to this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Ratio attached to the first language entry, or 0.0 when no language was recorded.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated names of the unicode ranges found in the decoded text. Computed once, then cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that the target encoding cannot represent are substituted by the encoder ("replace" error
        handler), they are not dropped.
        The last result is cached and reused as long as the same target encoding is requested.
        """
        if self._output_payload is None or self._output_encoding != encoding:
            # Encode first and update the cache afterwards: if the encoder raises (unknown codec for instance),
            # the cache must not claim to hold a payload for that encoding.
            encoded: bytes = str(self).encode(encoding, "replace")
            self._output_payload = encoded
            self._output_encoding = encoding

        return self._output_payload

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
jpayne@7 223
jpayne@7 224
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        Build the container from an optional list of matches; the given matches are stored sorted.
        """
        self._results: List["CharsetMatch"] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon invalid position.
        Raise KeyError (carrying the requested key) when the encoding is not present in results or when the
        key is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalise the requested name so that aliases can be used for the lookup.
            normalized = iana_name(item, False)
            for result in self._results:
                if normalized in result.could_be_from_charset:
                    return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        Raise ValueError when item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same re-encoded content and same chaos: keep it as a submatch of the existing entry.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        # Rebind to a freshly sorted list rather than sorting in place, so an iteration already
        # running over the previous list is left undisturbed.
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
jpayne@7 290
jpayne@7 291
# A single language guess: (language name, ratio). CharsetMatch reads index 0 as the language
# and index 1 as the coherence value.
CoherenceMatch = Tuple[str, float]
# The list of language guesses attached to a CharsetMatch; its first entry is the one reported.
CoherenceMatches = List[Tuple[str, float]]
jpayne@7 294
jpayne@7 295
class CliDetectionResult:
    """
    Plain record holding the detection outcome for one path, serialisable to JSON through to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        # Where the payload came from.
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path
        self.is_preferred: bool = is_preferred

        # What was detected.
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.has_sig_or_bom: bool = has_sig_or_bom

        self.language: str = language
        self.alphabets: List[str] = alphabets

        # Scores.
        self.chaos: float = chaos
        self.coherence: float = coherence

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Mapping of every exported field, in the order used for the JSON output.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported_fields}

    def to_json(self) -> str:
        """
        Serialise the exported fields as an ASCII-only JSON document indented by four spaces.
        """
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)