annotate charset_normalizer/cd.py @ 9:f9f1d0a0599a

planemo upload for repository https://toolrepo.galaxytrakr.org/view/jpayne/bioproject_to_srr_2/556cac4fb538
author jpayne
date Sun, 05 May 2024 23:58:19 -0400
parents 5eb2d5e3bf22
children
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple

from .constant import (
    FREQUENCIES,
    KO_NAMES,
    LANGUAGE_SUPPORTED_COUNT,
    TOO_SMALL_SEQUENCE,
    ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
    is_accentuated,
    is_latin,
    is_multi_byte_encoding,
    is_unicode_range_secondary,
    unicode_range,
)


def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    Return the Unicode ranges associated with a single-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    decoder = importlib.import_module(
        "encodings.{}".format(iana_name)
    ).IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: Dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if chunk:
            character_range: Optional[str] = unicode_range(chunk)

            if character_range is None:
                continue

            if is_unicode_range_secondary(character_range) is False:
                if character_range not in seen_ranges:
                    seen_ranges[character_range] = 0
                seen_ranges[character_range] += 1
                character_count += 1

    return sorted(
        [
            character_range
            for character_range in seen_ranges
            if seen_ranges[character_range] / character_count >= 0.15
        ]
    )
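
# Example (indicative only; the exact ranges depend on the stdlib code page
# tables): a Cyrillic single-byte code page exposes both an ASCII block and a
# Cyrillic block, each covering well over 15% of its decodable characters.
#
#   >>> encoding_unicode_range("cp1251")
#   ['Basic Latin', 'Cyrillic']
#   >>> encoding_unicode_range("utf_8")
#   # raises IOError: multi-byte code pages are rejected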


def unicode_range_languages(primary_range: str) -> List[str]:
    """
    Return the languages inferred to be used with a given Unicode range.
    """
    languages: List[str] = []

    for language, characters in FREQUENCIES.items():
        for character in characters:
            if unicode_range(character) == primary_range:
                languages.append(language)
                break

    return languages
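
# Example (indicative; the result reflects the bundled FREQUENCIES table):
# every language whose frequency table contains at least one character in the
# queried range is reported.
#
#   >>> unicode_range_languages("Cyrillic")
#   # -> e.g. ['Russian', 'Ukrainian', 'Bulgarian', ...]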


@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Single-byte encoding language association. Some code pages are heavily
    linked to particular language(s). This function performs that mapping.
    """
    unicode_ranges: List[str] = encoding_unicode_range(iana_name)
    primary_range: Optional[str] = None

    for specified_range in unicode_ranges:
        if "Latin" not in specified_range:
            primary_range = specified_range
            break

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
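
# Example (indicative): the first non-Latin range drives the association;
# purely Latin code pages fall back to the "Latin Based" marker.
#
#   >>> encoding_languages("cp1251")   # Cyrillic primary range
#   # -> e.g. ['Russian', 'Ukrainian', ...]
#   >>> encoding_languages("cp1252")   # Latin ranges only
#   ['Latin Based']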
jpayne@7 97
jpayne@7 98
jpayne@7 99 @lru_cache()
jpayne@7 100 def mb_encoding_languages(iana_name: str) -> List[str]:
jpayne@7 101 """
jpayne@7 102 Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
jpayne@7 103 This function does the correspondence.
jpayne@7 104 """
jpayne@7 105 if (
jpayne@7 106 iana_name.startswith("shift_")
jpayne@7 107 or iana_name.startswith("iso2022_jp")
jpayne@7 108 or iana_name.startswith("euc_j")
jpayne@7 109 or iana_name == "cp932"
jpayne@7 110 ):
jpayne@7 111 return ["Japanese"]
jpayne@7 112 if iana_name.startswith("gb") or iana_name in ZH_NAMES:
jpayne@7 113 return ["Chinese"]
jpayne@7 114 if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
jpayne@7 115 return ["Korean"]
jpayne@7 116
jpayne@7 117 return []
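
# Example: these associations follow directly from the IANA name prefixes
# matched above.
#
#   >>> mb_encoding_languages("cp932")
#   ['Japanese']
#   >>> mb_encoding_languages("gb18030")
#   ['Chinese']
#   >>> mb_encoding_languages("utf_8")
#   []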
jpayne@7 118
jpayne@7 119
jpayne@7 120 @lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
jpayne@7 121 def get_target_features(language: str) -> Tuple[bool, bool]:
jpayne@7 122 """
jpayne@7 123 Determine main aspects from a supported language if it contains accents and if is pure Latin.
jpayne@7 124 """
jpayne@7 125 target_have_accents: bool = False
jpayne@7 126 target_pure_latin: bool = True
jpayne@7 127
jpayne@7 128 for character in FREQUENCIES[language]:
jpayne@7 129 if not target_have_accents and is_accentuated(character):
jpayne@7 130 target_have_accents = True
jpayne@7 131 if target_pure_latin and is_latin(character) is False:
jpayne@7 132 target_pure_latin = False
jpayne@7 133
jpayne@7 134 return target_have_accents, target_pure_latin
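
# Example (indicative; results depend on the bundled FREQUENCIES table):
# English is expected to be accent-free and purely Latin, whereas French is
# accentuated yet still Latin-based.
#
#   >>> get_target_features("English")
#   (False, True)
#   >>> get_target_features("French")
#   (True, True)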
jpayne@7 135
jpayne@7 136
jpayne@7 137 def alphabet_languages(
jpayne@7 138 characters: List[str], ignore_non_latin: bool = False
jpayne@7 139 ) -> List[str]:
jpayne@7 140 """
jpayne@7 141 Return associated languages associated to given characters.
jpayne@7 142 """
jpayne@7 143 languages: List[Tuple[str, float]] = []
jpayne@7 144
jpayne@7 145 source_have_accents = any(is_accentuated(character) for character in characters)
jpayne@7 146
jpayne@7 147 for language, language_characters in FREQUENCIES.items():
jpayne@7 148 target_have_accents, target_pure_latin = get_target_features(language)
jpayne@7 149
jpayne@7 150 if ignore_non_latin and target_pure_latin is False:
jpayne@7 151 continue
jpayne@7 152
jpayne@7 153 if target_have_accents is False and source_have_accents:
jpayne@7 154 continue
jpayne@7 155
jpayne@7 156 character_count: int = len(language_characters)
jpayne@7 157
jpayne@7 158 character_match_count: int = len(
jpayne@7 159 [c for c in language_characters if c in characters]
jpayne@7 160 )
jpayne@7 161
jpayne@7 162 ratio: float = character_match_count / character_count
jpayne@7 163
jpayne@7 164 if ratio >= 0.2:
jpayne@7 165 languages.append((language, ratio))
jpayne@7 166
jpayne@7 167 languages = sorted(languages, key=lambda x: x[1], reverse=True)
jpayne@7 168
jpayne@7 169 return [compatible_language[0] for compatible_language in languages]
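
# Example (indicative): feeding the most common English letters keeps every
# language whose frequency table overlaps the input by at least 20%, sorted
# best overlap first.
#
#   >>> "English" in alphabet_languages(list("etaoinshrdlu"))
#   True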
jpayne@7 170
jpayne@7 171
jpayne@7 172 def characters_popularity_compare(
jpayne@7 173 language: str, ordered_characters: List[str]
jpayne@7 174 ) -> float:
jpayne@7 175 """
jpayne@7 176 Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
jpayne@7 177 The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
jpayne@7 178 Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
jpayne@7 179 """
jpayne@7 180 if language not in FREQUENCIES:
jpayne@7 181 raise ValueError("{} not available".format(language))
jpayne@7 182
jpayne@7 183 character_approved_count: int = 0
jpayne@7 184 FREQUENCIES_language_set = set(FREQUENCIES[language])
jpayne@7 185
jpayne@7 186 ordered_characters_count: int = len(ordered_characters)
jpayne@7 187 target_language_characters_count: int = len(FREQUENCIES[language])
jpayne@7 188
jpayne@7 189 large_alphabet: bool = target_language_characters_count > 26
jpayne@7 190
jpayne@7 191 for character, character_rank in zip(
jpayne@7 192 ordered_characters, range(0, ordered_characters_count)
jpayne@7 193 ):
jpayne@7 194 if character not in FREQUENCIES_language_set:
jpayne@7 195 continue
jpayne@7 196
jpayne@7 197 character_rank_in_language: int = FREQUENCIES[language].index(character)
jpayne@7 198 expected_projection_ratio: float = (
jpayne@7 199 target_language_characters_count / ordered_characters_count
jpayne@7 200 )
jpayne@7 201 character_rank_projection: int = int(character_rank * expected_projection_ratio)
jpayne@7 202
jpayne@7 203 if (
jpayne@7 204 large_alphabet is False
jpayne@7 205 and abs(character_rank_projection - character_rank_in_language) > 4
jpayne@7 206 ):
jpayne@7 207 continue
jpayne@7 208
jpayne@7 209 if (
jpayne@7 210 large_alphabet is True
jpayne@7 211 and abs(character_rank_projection - character_rank_in_language)
jpayne@7 212 < target_language_characters_count / 3
jpayne@7 213 ):
jpayne@7 214 character_approved_count += 1
jpayne@7 215 continue
jpayne@7 216
jpayne@7 217 characters_before_source: List[str] = FREQUENCIES[language][
jpayne@7 218 0:character_rank_in_language
jpayne@7 219 ]
jpayne@7 220 characters_after_source: List[str] = FREQUENCIES[language][
jpayne@7 221 character_rank_in_language:
jpayne@7 222 ]
jpayne@7 223 characters_before: List[str] = ordered_characters[0:character_rank]
jpayne@7 224 characters_after: List[str] = ordered_characters[character_rank:]
jpayne@7 225
jpayne@7 226 before_match_count: int = len(
jpayne@7 227 set(characters_before) & set(characters_before_source)
jpayne@7 228 )
jpayne@7 229
jpayne@7 230 after_match_count: int = len(
jpayne@7 231 set(characters_after) & set(characters_after_source)
jpayne@7 232 )
jpayne@7 233
jpayne@7 234 if len(characters_before_source) == 0 and before_match_count <= 4:
jpayne@7 235 character_approved_count += 1
jpayne@7 236 continue
jpayne@7 237
jpayne@7 238 if len(characters_after_source) == 0 and after_match_count <= 4:
jpayne@7 239 character_approved_count += 1
jpayne@7 240 continue
jpayne@7 241
jpayne@7 242 if (
jpayne@7 243 before_match_count / len(characters_before_source) >= 0.4
jpayne@7 244 or after_match_count / len(characters_after_source) >= 0.4
jpayne@7 245 ):
jpayne@7 246 character_approved_count += 1
jpayne@7 247 continue
jpayne@7 248
jpayne@7 249 return character_approved_count / len(ordered_characters)
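
# Worked sketch of the projection test above: with 7 observed characters
# against a 26-letter table, expected_projection_ratio is 26/7 ≈ 3.71, so an
# observed rank of 1 projects to rank int(1 * 3.71) = 3 in the language table.
# A small alphabet tolerates a drift of at most 4 positions before falling
# back to the before/after overlap comparison.
#
#   >>> characters_popularity_compare("English", ["e", "t", "a", "o", "i", "n", "s"])
#   # -> close to 1.0, since these are top English letters in rough frequency
#   # order (indicative; the value depends on the bundled FREQUENCIES table)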
jpayne@7 250
jpayne@7 251
jpayne@7 252 def alpha_unicode_split(decoded_sequence: str) -> List[str]:
jpayne@7 253 """
jpayne@7 254 Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
jpayne@7 255 Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
jpayne@7 256 One containing the latin letters and the other hebrew.
jpayne@7 257 """
jpayne@7 258 layers: Dict[str, str] = {}
jpayne@7 259
jpayne@7 260 for character in decoded_sequence:
jpayne@7 261 if character.isalpha() is False:
jpayne@7 262 continue
jpayne@7 263
jpayne@7 264 character_range: Optional[str] = unicode_range(character)
jpayne@7 265
jpayne@7 266 if character_range is None:
jpayne@7 267 continue
jpayne@7 268
jpayne@7 269 layer_target_range: Optional[str] = None
jpayne@7 270
jpayne@7 271 for discovered_range in layers:
jpayne@7 272 if (
jpayne@7 273 is_suspiciously_successive_range(discovered_range, character_range)
jpayne@7 274 is False
jpayne@7 275 ):
jpayne@7 276 layer_target_range = discovered_range
jpayne@7 277 break
jpayne@7 278
jpayne@7 279 if layer_target_range is None:
jpayne@7 280 layer_target_range = character_range
jpayne@7 281
jpayne@7 282 if layer_target_range not in layers:
jpayne@7 283 layers[layer_target_range] = character.lower()
jpayne@7 284 continue
jpayne@7 285
jpayne@7 286 layers[layer_target_range] += character.lower()
jpayne@7 287
jpayne@7 288 return list(layers.values())
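
# Example: mixed Latin/Hebrew input splits into one lowercase layer per
# alphabet; non-alphabetic characters are dropped.
#
#   >>> alpha_unicode_split("Hello שלום")
#   ['hello', 'שלום']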
jpayne@7 289
jpayne@7 290
jpayne@7 291 def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
jpayne@7 292 """
jpayne@7 293 This function merge results previously given by the function coherence_ratio.
jpayne@7 294 The return type is the same as coherence_ratio.
jpayne@7 295 """
jpayne@7 296 per_language_ratios: Dict[str, List[float]] = {}
jpayne@7 297 for result in results:
jpayne@7 298 for sub_result in result:
jpayne@7 299 language, ratio = sub_result
jpayne@7 300 if language not in per_language_ratios:
jpayne@7 301 per_language_ratios[language] = [ratio]
jpayne@7 302 continue
jpayne@7 303 per_language_ratios[language].append(ratio)
jpayne@7 304
jpayne@7 305 merge = [
jpayne@7 306 (
jpayne@7 307 language,
jpayne@7 308 round(
jpayne@7 309 sum(per_language_ratios[language]) / len(per_language_ratios[language]),
jpayne@7 310 4,
jpayne@7 311 ),
jpayne@7 312 )
jpayne@7 313 for language in per_language_ratios
jpayne@7 314 ]
jpayne@7 315
jpayne@7 316 return sorted(merge, key=lambda x: x[1], reverse=True)
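
# Example: ratios for the same language are averaged (rounded to 4 digits)
# and the merged list is sorted best-first.
#
#   >>> merge_coherence_ratios([[("English", 0.9)], [("English", 0.7), ("French", 0.5)]])
#   [('English', 0.8), ('French', 0.5)]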
jpayne@7 317
jpayne@7 318
jpayne@7 319 def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
jpayne@7 320 """
jpayne@7 321 We shall NOT return "English—" in CoherenceMatches because it is an alternative
jpayne@7 322 of "English". This function only keeps the best match and remove the em-dash in it.
jpayne@7 323 """
jpayne@7 324 index_results: Dict[str, List[float]] = dict()
jpayne@7 325
jpayne@7 326 for result in results:
jpayne@7 327 language, ratio = result
jpayne@7 328 no_em_name: str = language.replace("—", "")
jpayne@7 329
jpayne@7 330 if no_em_name not in index_results:
jpayne@7 331 index_results[no_em_name] = []
jpayne@7 332
jpayne@7 333 index_results[no_em_name].append(ratio)
jpayne@7 334
jpayne@7 335 if any(len(index_results[e]) > 1 for e in index_results):
jpayne@7 336 filtered_results: CoherenceMatches = []
jpayne@7 337
jpayne@7 338 for language in index_results:
jpayne@7 339 filtered_results.append((language, max(index_results[language])))
jpayne@7 340
jpayne@7 341 return filtered_results
jpayne@7 342
jpayne@7 343 return results
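
# Example: "English—" collapses into "English" and only the best ratio
# survives; result lists without such duplicates pass through untouched.
#
#   >>> filter_alt_coherence_matches([("English", 0.8), ("English—", 0.9)])
#   [('English', 0.9)]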


@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence. The
    sequence is analysed in layers.
    A layer = character extraction by alphabets/Unicode ranges.
    """

    results: List[Tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
    if "Latin Based" in lg_inclusion_list:
        ignore_non_latin = True
        lg_inclusion_list.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        sequence_frequencies: TypeCounter[str] = Counter(layer)
        most_common = sequence_frequencies.most_common()

        character_count: int = sum(o for c, o in most_common)

        if character_count <= TOO_SMALL_SEQUENCE:
            continue

        popular_character_ordered: List[str] = [c for c, o in most_common]

        for language in lg_inclusion_list or alphabet_languages(
            popular_character_ordered, ignore_non_latin
        ):
            ratio: float = characters_popularity_compare(
                language, popular_character_ordered
            )

            if ratio < threshold:
                continue
            elif ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )
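
# Usage sketch (indicative; exact ratios depend on the bundled FREQUENCIES
# table): a layer must exceed TOO_SMALL_SEQUENCE characters to be scored at
# all, so very short inputs come back empty.
#
#   >>> coherence_ratio("Le français est une langue qui se distingue par ses accents.")
#   # -> best match first, e.g. [('French', ...), ...]
#   >>> coherence_ratio("abc")   # too small to judge
#   []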