bioproject_to_srr_2: charset_normalizer/models.py annotate

annotate charset_normalizer/models.py @ 7:5eb2d5e3bf22

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538

author	jpayne
date	Sun, 05 May 2024 23:32:17 -0400
parents
children

rev	line source
jpayne@7	1 from encodings.aliases import aliases
jpayne@7	2 from hashlib import sha256
jpayne@7	3 from json import dumps
jpayne@7	4 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
jpayne@7	5
jpayne@7	6 from .constant import TOO_BIG_SEQUENCE
jpayne@7	7 from .utils import iana_name, is_multi_byte_encoding, unicode_range
jpayne@7	8
jpayne@7	9
class CharsetMatch:
    """
    One candidate encoding for a byte payload, together with the scores used
    to rank it against other candidates.

    "chaos" is the mean mess ratio given at construction; "coherence" is the
    ratio attached to the first entry of the language list. Instances compare
    with ``<`` so that ``sorted()`` puts the most plausible candidate first.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original bytes, kept untouched (see ``raw``).
        :param guessed_encoding: Codec name used to decode ``payload``.
        :param mean_mess_ratio: Value exposed as ``chaos``.
        :param has_sig_or_bom: Value exposed as ``bom`` / ``byte_order_mark``.
        :param languages: List of (language, ratio) tuples; the first entry is
            the one reported by ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text, if available. When None
            the payload is decoded lazily on the first ``str(self)``.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the ``alphabets`` property.
        self._unicode_ranges: Optional[List[str]] = None

        # Other matches that were registered through add_submatch().
        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last target encoding and its encoded bytes.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when they share the same encoding and the same
        fingerprint.

        :raises TypeError: if ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        :raises ValueError: if ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage! (multi_byte_usage needs the decoded string)
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio of decoded characters to raw bytes.

        Returns 0.0 for an empty payload instead of dividing by zero.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """Return the decoded payload, decoding it (strictly) on first use."""
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register ``other`` as a submatch of this match.

        :raises ValueError: if ``other`` is not a CharsetMatch or is equal to
            this match.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """The encoding name given at construction."""
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # The stdlib alias table maps alias -> codec name; collect both directions.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """The ``has_sig_or_bom`` flag given at construction."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as ``bom``."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """The mean mess ratio given at construction."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio of the first language entry, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """``chaos`` as a percentage rounded to 3 digits."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """``coherence`` as a percentage rounded to 3 digits."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        """The list of matches registered through ``add_submatch``."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """True when at least one submatch was registered."""
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated unicode range names of the decoded characters.
        The result is computed once and cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter out falsy entries, de-duplicate and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the
        encoder (error handler "replace"); they are not dropped.
        The last result is cached and reused while the same encoding is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()
jpayne@7	223
jpayne@7	224
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List[CharsetMatch]] = None):
        """
        :param results: Initial matches; they are sorted on construction.
            None or an empty list yields an empty container.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        :raises IndexError: when an integer position is out of range.
        :raises KeyError: when the encoding is not present in results, or when
            ``item`` is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            # Normalise the requested name before comparing it with stored names.
            encoding = iana_name(item, False)
            for result in self._results:
                if encoding in result.could_be_from_charset:
                    return result
        # Report what was asked for so the failure is diagnosable.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: if ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same re-encoded content and same chaos: fold into the existing match.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()
jpayne@7	290
jpayne@7	291
# One language guess: (language name, coherence ratio).
jpayne@7	292 CoherenceMatch = Tuple[str, float]
# List of language guesses; CharsetMatch reads entry 0 as the most probable one.
jpayne@7	293 CoherenceMatches = List[CoherenceMatch]
jpayne@7	294
jpayne@7	295
class CliDetectionResult:
    """Plain record of one detection result, serialisable to JSON."""

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """Store every argument unchanged on the attribute of the same name."""
        self.path: str = path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.unicode_path: Optional[str] = unicode_path
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """Mapping of the stored fields, in the order used for JSON output."""
        field_order = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in field_order}

    def to_json(self) -> str:
        """Serialise the field mapping as ASCII-only JSON indented by 4 spaces."""
        serialisable = self.__dict__
        return dumps(serialisable, ensure_ascii=True, indent=4)

Mercurial > repos > jpayne > bioproject_to_srr_2

annotate charset_normalizer/models.py @ 7:5eb2d5e3bf22