jpayne@7
|
1 from encodings.aliases import aliases
|
jpayne@7
|
2 from hashlib import sha256
|
jpayne@7
|
3 from json import dumps
|
jpayne@7
|
4 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
jpayne@7
|
5
|
jpayne@7
|
6 from .constant import TOO_BIG_SEQUENCE
|
jpayne@7
|
7 from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
jpayne@7
|
8
|
jpayne@7
|
9
|
jpayne@7
|
class CharsetMatch:
    """
    One candidate encoding for a bytes payload, together with the scores
    (chaos / coherence) used to rank it against other candidates.

    The decoded text, the Unicode ranges and the re-encoded output are all
    computed lazily and cached on the instance.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original bytes, exposed untouched through ``raw``.
        :param guessed_encoding: Encoding name used to decode ``payload``.
        :param mean_mess_ratio: Value exposed through ``chaos``.
        :param has_sig_or_bom: Value exposed through ``bom`` / ``byte_order_mark``.
        :param languages: Sequence of ``(language, ratio)`` tuples; the first
            entry feeds ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text, if available. When omitted
            the payload is decoded on first use of ``str(match)``.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when they share the same encoding and fingerprint.

        :raises TypeError: when ``other`` is not a CharsetMatch.
        """
        # NOTE: defining __eq__ without __hash__ makes instances unhashable.
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        A match sorts first when it has the lower chaos; when both chaos values
        are within 1% of each other, coherence (then multi-byte usage) decides.

        :raises ValueError: when ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage! multi_byte_usage needs the fully decoded string.
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        ``1 - len(decoded text) / len(raw bytes)``.

        Returns 0.0 for an empty payload instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """Return the payload decoded (strictly) with the guessed encoding."""
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register ``other`` as a sub-match (leaf) of this match.

        :raises ValueError: when ``other`` is not a CharsetMatch or is equal to
            this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """Encoding name this match was built with."""
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # `aliases` maps an alias to its target name; collect the counterpart
        # whichever side our encoding appears on.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Value of ``has_sig_or_bom`` given at construction."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as ``bom``."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if not languages or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """Mean mess ratio given at construction."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio of the first language entry, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """``chaos`` as a percentage rounded to 3 digits."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """``coherence`` as a percentage rounded to 3 digits."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """Matches registered through ``add_submatch`` (the live list, not a copy)."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """True when at least one sub-match was registered."""
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """Sorted, de-duplicated Unicode range names of the decoded characters (cached)."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges (unicode_range may yield a falsy value, dropped below)
        detected_ranges = {unicode_range(char) for char in str(self)}
        # filter and sort
        self._unicode_ranges = sorted(r for r in detected_ranges if r)
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the
        encoder's replacement marker (``errors="replace"``); they are not dropped.
        The result is cached for the last requested encoding.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
|
jpayne@7
|
223
|
jpayne@7
|
224
|
jpayne@7
|
class CharsetMatches:
    """
    Sorted collection of CharsetMatch objects, most probable first.
    Offers a subset of the list protocol: iteration, len(), truthiness and
    lookup by position or by encoding name.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        """Store ``results`` in sorted order; no/empty input gives an empty container."""
        ordered: List[CharsetMatch] = []
        if results:
            ordered = sorted(results)
        self._results: List[CharsetMatch] = ordered

    def __iter__(self) -> Iterator[CharsetMatch]:
        for match in self._results:
            yield match

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Look a match up by position (int) or by encoding name (str).

        A string is passed through ``iana_name`` first and compared against
        each match's ``could_be_from_charset``. An out-of-range integer
        propagates the list's IndexError; an unknown encoding name, or a key
        of any other type, raises KeyError.
        """
        if isinstance(item, int):
            return self._results[item]
        if not isinstance(item, str):
            raise KeyError

        wanted = iana_name(item, False)
        for candidate in self._results:
            if wanted in candidate.could_be_from_charset:
                return candidate
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Add one match and keep the container sorted.
        When an existing match has the same fingerprint and chaos, ``item`` is
        attached to it as a submatch instead of being stored on its own.

        :raises ValueError: when ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            message = "Cannot append instance '{}' to CharsetMatches".format(
                str(item.__class__)
            )
            raise ValueError(message)

        # Submatch factoring is skipped for heavy payloads (conserve RAM usage).
        factoring_allowed = len(item.raw) <= TOO_BIG_SEQUENCE
        if factoring_allowed:
            for known in self._results:
                if known.fingerprint == item.fingerprint and known.chaos == item.chaos:
                    known.add_submatch(item)
                    return

        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Return the first (most probable) match, or None when the container is empty.
        """
        return self._results[0] if self._results else None

    def first(self) -> Optional["CharsetMatch"]:
        """
        Alias of best(), kept for backward compatibility.
        """
        return self.best()
|
jpayne@7
|
290
|
jpayne@7
|
291
|
jpayne@7
|
# One language guess: (language name, ratio). CharsetMatch reads index 0 as the
# language and index 1 as the coherence value.
CoherenceMatch = Tuple[str, float]
# All language guesses attached to a CharsetMatch; its first entry is the one
# reported by CharsetMatch.language / CharsetMatch.coherence.
CoherenceMatches = List[CoherenceMatch]
|
jpayne@7
|
294
|
jpayne@7
|
295
|
jpayne@7
|
class CliDetectionResult:
    """
    Plain record of one detection result, serializable to JSON via ``to_json``.
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """Store every argument on the instance under the same name."""
        # Paths
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path
        # Encoding information
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        # Content description
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        # Scores
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """Return a fresh dict of the public fields, in serialization order."""
        ordered_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in ordered_fields}

    def to_json(self) -> str:
        """Serialize the fields as ASCII-only JSON indented by 4 spaces."""
        serializable = self.__dict__
        return dumps(serializable, ensure_ascii=True, indent=4)
|