annotate charset_normalizer/cd.py @ 13:f550715358f1

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Mon, 20 May 2024 00:56:52 -0400
parents 5eb2d5e3bf22
children
rev   line source
jpayne@7 1 import importlib
jpayne@7 2 from codecs import IncrementalDecoder
jpayne@7 3 from collections import Counter
jpayne@7 4 from functools import lru_cache
jpayne@7 5 from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
jpayne@7 6
jpayne@7 7 from .constant import (
jpayne@7 8 FREQUENCIES,
jpayne@7 9 KO_NAMES,
jpayne@7 10 LANGUAGE_SUPPORTED_COUNT,
jpayne@7 11 TOO_SMALL_SEQUENCE,
jpayne@7 12 ZH_NAMES,
jpayne@7 13 )
jpayne@7 14 from .md import is_suspiciously_successive_range
jpayne@7 15 from .models import CoherenceMatches
jpayne@7 16 from .utils import (
jpayne@7 17 is_accentuated,
jpayne@7 18 is_latin,
jpayne@7 19 is_multi_byte_encoding,
jpayne@7 20 is_unicode_range_secondary,
jpayne@7 21 unicode_range,
jpayne@7 22 )
jpayne@7 23
jpayne@7 24
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List the main unicode ranges produced by a single-byte code page.

    Every byte value in range(0x40, 0xFF) is decoded on its own with the
    codec's incremental decoder (decoding errors ignored). Each resulting
    character is mapped to its unicode range, and ranges flagged as
    secondary are left out of the tally.

    :param iana_name: Codec name, used to import "encodings.<iana_name>".
    :return: Sorted names of the ranges that account for at least 15% of
             the tallied characters.
    :raises IOError: If the code page is a multi-byte one.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module("encodings.{}".format(iana_name))
    incremental: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    occurrences: Dict[str, int] = {}
    tallied: int = 0

    for byte_value in range(0x40, 0xFF):
        decoded: str = incremental.decode(bytes([byte_value]))

        # Undecodable bytes yield an empty string because errors are ignored.
        if not decoded:
            continue

        range_name: Optional[str] = unicode_range(decoded)

        if range_name is None:
            continue

        # Only ranges explicitly reported as "not secondary" are counted.
        if is_unicode_range_secondary(range_name) is not False:
            continue

        occurrences[range_name] = occurrences.get(range_name, 0) + 1
        tallied += 1

    return sorted(
        range_name
        for range_name, hits in occurrences.items()
        if hits / tallied >= 0.15
    )
jpayne@7 62
jpayne@7 63
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    List the languages that can be written with a given unicode range.

    A language is retained as soon as one of its characters in FREQUENCIES
    belongs to ``primary_range``. Languages are returned in FREQUENCIES
    iteration order.
    """
    return [
        language
        for language, alphabet in FREQUENCIES.items()
        if any(unicode_range(letter) == primary_range for letter in alphabet)
    ]
jpayne@7 77
jpayne@7 78
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a single-byte encoding with the language(s) it is tied to.

    The first unicode range of the code page whose name does not contain
    "Latin" is treated as the primary range, and the languages inferred
    from it are returned. When every range is a Latin one, the single
    placeholder "Latin Based" is returned instead.
    """
    dominant_range: Optional[str] = next(
        (
            candidate
            for candidate in encoding_unicode_range(iana_name)
            if "Latin" not in candidate
        ),
        None,
    )

    if dominant_range is None:
        return ["Latin Based"]

    return unicode_range_languages(dominant_range)
jpayne@7 97
jpayne@7 98
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a multi-byte encoding with the language it is tied to.

    The decision is made purely from the codec name: a known prefix or
    exact name selects Japanese, Chinese or Korean, in that order. An
    empty list is returned when nothing matches.
    """
    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")

    if iana_name.startswith(japanese_prefixes) or iana_name == "cp932":
        return ["Japanese"]

    # The prefix test comes first so the name table is only consulted if needed.
    if iana_name.startswith("gb") or iana_name in ZH_NAMES:
        return ["Chinese"]

    if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
        return ["Korean"]

    return []
jpayne@7 118
jpayne@7 119
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Describe two traits of a supported language's alphabet.

    :param language: Key of FREQUENCIES (a KeyError is raised otherwise).
    :return: ``(has_accents, is_pure_latin)`` where the first flag is True
             when at least one character is accentuated, and the second is
             True unless some character is reported as non-Latin.
    """
    alphabet = FREQUENCIES[language]

    has_accents: bool = any(is_accentuated(letter) for letter in alphabet)
    only_latin: bool = not any(is_latin(letter) is False for letter in alphabet)

    return has_accents, only_latin
jpayne@7 135
jpayne@7 136
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages whose alphabet overlaps the given characters.

    :param characters: Characters observed in the analysed text.
    :param ignore_non_latin: When True, languages that are not purely
                             Latin are skipped.
    :return: Language names whose FREQUENCIES alphabet is covered at 20% or
             more by ``characters``, best coverage first.
    """
    languages: List[Tuple[str, float]] = []

    # Built once: membership is tested for every character of every language,
    # and a set lookup avoids rescanning the whole input list each time.
    source_characters = set(characters)

    source_have_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        # A language without accents cannot explain accentuated input.
        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        character_match_count: int = sum(
            1 for c in language_characters if c in source_characters
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    # Stable sort: languages with equal coverage keep FREQUENCIES order.
    languages.sort(key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
jpayne@7 170
jpayne@7 171
def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if an ordered characters list (by occurrence, from most frequent to rarest) matches a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that this function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: Key of FREQUENCIES to compare against.
    :param ordered_characters: Characters sorted from most to least frequent.
    :return: Share of ``ordered_characters`` whose rank is judged compatible
             with the language; 0.0 when ``ordered_characters`` is empty.
    :raises ValueError: If ``language`` is not a key of FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: report "no correspondence" instead of dividing by zero.
    if ordered_characters_count == 0:
        return 0.0

    language_characters: List[str] = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    large_alphabet: bool = target_language_characters_count > 26

    # Rank of each character in the language table, computed once.
    # setdefault keeps the first occurrence, which is what list.index() returns.
    rank_in_language: Dict[str, int] = {}
    for rank, known_character in enumerate(language_characters):
        rank_in_language.setdefault(known_character, rank)

    # Scale factor projecting a rank in the input onto the language table.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        character_rank_in_language: Optional[int] = rank_in_language.get(character)

        if character_rank_in_language is None:
            continue

        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_distance: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: a character too far from its expected rank is rejected.
        if large_alphabet is False and rank_distance > 4:
            continue

        # Large alphabets: being within a third of the table is good enough.
        if (
            large_alphabet is True
            and rank_distance < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Otherwise compare the neighbourhoods: which characters come before
        # and after this one in the language table versus in the input.
        characters_before_source: List[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        # These two guards also protect the divisions below from an empty slice.
        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
jpayne@7 250
jpayne@7 251
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Split a decoded text into one lower-cased string per alphabet / unicode range.

    Non-alphabetic characters, and characters without a known unicode
    range, are dropped. E.g. English text with a bit of Hebrew gives two
    items: one with the Latin letters and one with the Hebrew ones.

    :return: The layers, in the order their range was first encountered.
    """
    buckets: Dict[str, List[str]] = {}

    for symbol in decoded_sequence:
        if not symbol.isalpha():
            continue

        symbol_range: Optional[str] = unicode_range(symbol)

        if symbol_range is None:
            continue

        # Reuse the first existing layer whose range is not a suspicious
        # neighbour of this character's range; otherwise use its own range.
        destination: str = next(
            (
                known_range
                for known_range in buckets
                if is_suspiciously_successive_range(known_range, symbol_range)
                is False
            ),
            symbol_range,
        )

        buckets.setdefault(destination, []).append(symbol.lower())

    return ["".join(pieces) for pieces in buckets.values()]
jpayne@7 289
jpayne@7 290
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several outputs of coherence_ratio into a single one.

    Ratios reported for the same language are averaged and rounded to four
    decimals. The merged list has the same shape as a coherence_ratio
    result and is ordered from the highest ratio to the lowest.
    """
    grouped: Dict[str, List[float]] = {}

    for matches in results:
        for language, ratio in matches:
            grouped.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in grouped.items()
    ]
    averaged.sort(key=lambda pair: pair[1], reverse=True)

    return averaged
jpayne@7 317
jpayne@7 318
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    Collapse alternative language entries such as "English—" into "English".

    Names are compared with the em-dash removed. When at least two entries
    share a name, a new list is built holding the best ratio per name
    (without em-dash). Otherwise ``results`` is returned untouched.
    """
    ratios_by_language: Dict[str, List[float]] = {}

    for language, ratio in results:
        ratios_by_language.setdefault(language.replace("—", ""), []).append(ratio)

    # No alternative collided with its base language: nothing to filter.
    if all(len(ratios) <= 1 for ratios in ratios_by_language.values()):
        return results

    return [
        (language, max(ratios)) for language, ratios in ratios_by_language.items()
    ]
jpayne@7 344
jpayne@7 345
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect any language that can be identified in the given sequence.

    The sequence is analysed layer by layer, a layer being the characters
    extracted for one alphabet / unicode range.

    :param decoded_sequence: Text to analyse.
    :param threshold: Minimum ratio for a language to be reported.
    :param lg_inclusion: Optional comma-separated language names to test
                         instead of guessing candidates from the alphabet.
                         The special entry "Latin Based" is not a language:
                         it restricts guessed candidates to Latin ones.
    :return: ``(language, ratio)`` pairs, ratio rounded to four decimals,
             sorted from the best ratio to the worst.
    """
    matches: List[Tuple[str, float]] = []
    strong_match_count: int = 0

    forced_languages: List[str] = (
        [] if lg_inclusion is None else lg_inclusion.split(",")
    )
    latin_only: bool = "Latin Based" in forced_languages

    if latin_only:
        forced_languages.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        ranked = Counter(layer).most_common()

        # Layers with too few characters are not statistically meaningful.
        if sum(occurrences for _, occurrences in ranked) <= TOO_SMALL_SEQUENCE:
            continue

        ranked_characters: List[str] = [symbol for symbol, _ in ranked]

        candidates = forced_languages or alphabet_languages(
            ranked_characters, latin_only
        )

        for language in candidates:
            score: float = characters_popularity_compare(language, ranked_characters)

            if score < threshold:
                continue

            if score >= 0.8:
                strong_match_count += 1

            matches.append((language, round(score, 4)))

            # Enough confident matches overall: stop probing this layer.
            if strong_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(matches), key=lambda x: x[1], reverse=True
    )