import logging
from os import PathLike
from typing import BinaryIO, List, Optional, Set, Union

from .cd import (
    coherence_ratio,
    encoding_languages,
    mb_encoding_languages,
    merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
    any_specified_encoding,
    cut_sequence_chunks,
    iana_name,
    identify_sig_or_bom,
    is_cp_similar,
    is_multi_byte_encoding,
    should_strip_sig_or_bom,
)

# Will most likely be controversial
# logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)


def from_bytes(
    sequences: Union[bytes, bytearray],
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.2,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
    If there are no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence
    of a given sequence, and will give up on a particular code page after 20% of measured mess.
    Those criteria are customizable at will.

    The preemptive behaviour DOES NOT replace the traditional detection workflow; it prioritizes a
    particular code page but never takes it for granted. It can improve performance.

    You may want to focus your attention on some code pages and/or exclude others; use cp_isolation
    and cp_exclusion for that purpose.

    This function will strip the SIG from the payload/sequence every time except for UTF-16 and UTF-32.
    By default the library does not set up any handler other than the NullHandler. If you set the
    'explain' toggle to True, the logger configuration is altered to add a StreamHandler suitable
    for debugging. A custom logging format and handler can be set manually.
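
    Illustrative usage (a minimal sketch; ``best()`` is the same accessor this
    module uses further down to pick the top candidate)::

        matches = from_bytes("héllo wörld".encode("utf_8"))
        best = matches.best()  # a CharsetMatch, or None when nothing fits
        if best is not None:
            print(best.encoding)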
jpayne@7: """ jpayne@7: jpayne@7: if not isinstance(sequences, (bytearray, bytes)): jpayne@7: raise TypeError( jpayne@7: "Expected object of type bytes or bytearray, got: {0}".format( jpayne@7: type(sequences) jpayne@7: ) jpayne@7: ) jpayne@7: jpayne@7: if explain: jpayne@7: previous_logger_level: int = logger.level jpayne@7: logger.addHandler(explain_handler) jpayne@7: logger.setLevel(TRACE) jpayne@7: jpayne@7: length: int = len(sequences) jpayne@7: jpayne@7: if length == 0: jpayne@7: logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.") jpayne@7: if explain: jpayne@7: logger.removeHandler(explain_handler) jpayne@7: logger.setLevel(previous_logger_level or logging.WARNING) jpayne@7: return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")]) jpayne@7: jpayne@7: if cp_isolation is not None: jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "cp_isolation is set. use this flag for debugging purpose. " jpayne@7: "limited list of encoding allowed : %s.", jpayne@7: ", ".join(cp_isolation), jpayne@7: ) jpayne@7: cp_isolation = [iana_name(cp, False) for cp in cp_isolation] jpayne@7: else: jpayne@7: cp_isolation = [] jpayne@7: jpayne@7: if cp_exclusion is not None: jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "cp_exclusion is set. use this flag for debugging purpose. " jpayne@7: "limited list of encoding excluded : %s.", jpayne@7: ", ".join(cp_exclusion), jpayne@7: ) jpayne@7: cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion] jpayne@7: else: jpayne@7: cp_exclusion = [] jpayne@7: jpayne@7: if length <= (chunk_size * steps): jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.", jpayne@7: steps, jpayne@7: chunk_size, jpayne@7: length, jpayne@7: ) jpayne@7: steps = 1 jpayne@7: chunk_size = length jpayne@7: jpayne@7: if steps > 1 and length / steps < chunk_size: jpayne@7: chunk_size = int(length / steps) jpayne@7: jpayne@7: is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE jpayne@7: is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE jpayne@7: jpayne@7: if is_too_small_sequence: jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "Trying to detect encoding from a tiny portion of ({}) byte(s).".format( jpayne@7: length jpayne@7: ), jpayne@7: ) jpayne@7: elif is_too_large_sequence: jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format( jpayne@7: length jpayne@7: ), jpayne@7: ) jpayne@7: jpayne@7: prioritized_encodings: List[str] = [] jpayne@7: jpayne@7: specified_encoding: Optional[str] = ( jpayne@7: any_specified_encoding(sequences) if preemptive_behaviour else None jpayne@7: ) jpayne@7: jpayne@7: if specified_encoding is not None: jpayne@7: prioritized_encodings.append(specified_encoding) jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "Detected declarative mark in sequence. 
    tested: Set[str] = set()
    tested_but_hard_failure: List[str] = []
    tested_but_soft_failure: List[str] = []

    fallback_ascii: Optional[CharsetMatch] = None
    fallback_u8: Optional[CharsetMatch] = None
    fallback_specified: Optional[CharsetMatch] = None

    results: CharsetMatches = CharsetMatches()

    sig_encoding, sig_payload = identify_sig_or_bom(sequences)

    if sig_encoding is not None:
        prioritized_encodings.append(sig_encoding)
        logger.log(
            TRACE,
            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
            len(sig_payload),
            sig_encoding,
        )

    prioritized_encodings.append("ascii")

    if "utf_8" not in prioritized_encodings:
        prioritized_encodings.append("utf_8")

    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
        if cp_isolation and encoding_iana not in cp_isolation:
            continue

        if cp_exclusion and encoding_iana in cp_exclusion:
            continue

        if encoding_iana in tested:
            continue

        tested.add(encoding_iana)

        decoded_payload: Optional[str] = None
        bom_or_sig_available: bool = sig_encoding == encoding_iana
        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
            encoding_iana
        )

        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because it requires a BOM. Will try some sub-encoders (LE/BE).",
                encoding_iana,
            )
            continue
        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
            logger.log(
                TRACE,
                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
                encoding_iana,
            )
            continue

        try:
            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
        except (ModuleNotFoundError, ImportError):
            logger.log(
                TRACE,
                "Encoding %s does not provide an IncrementalDecoder",
                encoding_iana,
            )
            continue

        try:
            if is_too_large_sequence and is_multi_byte_decoder is False:
                str(
                    sequences[: int(50e4)]
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) : int(50e4)],
                    encoding=encoding_iana,
                )
            else:
                decoded_payload = str(
                    sequences
                    if strip_sig_or_bom is False
                    else sequences[len(sig_payload) :],
                    encoding=encoding_iana,
                )
        except (UnicodeDecodeError, LookupError) as e:
            if not isinstance(e, LookupError):
                logger.log(
                    TRACE,
                    "Code page %s does not fit given bytes sequence at ALL. %s",
                    encoding_iana,
                    str(e),
                )
            tested_but_hard_failure.append(encoding_iana)
            continue
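        # Illustration (assumption about code-page kinship): cp1252 and
        # iso8859_1 decode most byte values identically, so once one of them
        # soft-fails the probing below, re-testing the other rarely pays off.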
%s", jpayne@7: encoding_iana, jpayne@7: str(e), jpayne@7: ) jpayne@7: tested_but_hard_failure.append(encoding_iana) jpayne@7: continue jpayne@7: jpayne@7: similar_soft_failure_test: bool = False jpayne@7: jpayne@7: for encoding_soft_failed in tested_but_soft_failure: jpayne@7: if is_cp_similar(encoding_iana, encoding_soft_failed): jpayne@7: similar_soft_failure_test = True jpayne@7: break jpayne@7: jpayne@7: if similar_soft_failure_test: jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!", jpayne@7: encoding_iana, jpayne@7: encoding_soft_failed, jpayne@7: ) jpayne@7: continue jpayne@7: jpayne@7: r_ = range( jpayne@7: 0 if not bom_or_sig_available else len(sig_payload), jpayne@7: length, jpayne@7: int(length / steps), jpayne@7: ) jpayne@7: jpayne@7: multi_byte_bonus: bool = ( jpayne@7: is_multi_byte_decoder jpayne@7: and decoded_payload is not None jpayne@7: and len(decoded_payload) < length jpayne@7: ) jpayne@7: jpayne@7: if multi_byte_bonus: jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "Code page %s is a multi byte encoding table and it appear that at least one character " jpayne@7: "was encoded using n-bytes.", jpayne@7: encoding_iana, jpayne@7: ) jpayne@7: jpayne@7: max_chunk_gave_up: int = int(len(r_) / 4) jpayne@7: jpayne@7: max_chunk_gave_up = max(max_chunk_gave_up, 2) jpayne@7: early_stop_count: int = 0 jpayne@7: lazy_str_hard_failure = False jpayne@7: jpayne@7: md_chunks: List[str] = [] jpayne@7: md_ratios = [] jpayne@7: jpayne@7: try: jpayne@7: for chunk in cut_sequence_chunks( jpayne@7: sequences, jpayne@7: encoding_iana, jpayne@7: r_, jpayne@7: chunk_size, jpayne@7: bom_or_sig_available, jpayne@7: strip_sig_or_bom, jpayne@7: sig_payload, jpayne@7: is_multi_byte_decoder, jpayne@7: decoded_payload, jpayne@7: ): jpayne@7: md_chunks.append(chunk) jpayne@7: jpayne@7: md_ratios.append( jpayne@7: mess_ratio( jpayne@7: chunk, jpayne@7: threshold, jpayne@7: explain is True and 1 <= len(cp_isolation) <= 2, jpayne@7: ) jpayne@7: ) jpayne@7: jpayne@7: if md_ratios[-1] >= threshold: jpayne@7: early_stop_count += 1 jpayne@7: jpayne@7: if (early_stop_count >= max_chunk_gave_up) or ( jpayne@7: bom_or_sig_available and strip_sig_or_bom is False jpayne@7: ): jpayne@7: break jpayne@7: except ( jpayne@7: UnicodeDecodeError jpayne@7: ) as e: # Lazy str loading may have missed something there jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s", jpayne@7: encoding_iana, jpayne@7: str(e), jpayne@7: ) jpayne@7: early_stop_count = max_chunk_gave_up jpayne@7: lazy_str_hard_failure = True jpayne@7: jpayne@7: # We might want to check the sequence again with the whole content jpayne@7: # Only if initial MD tests passes jpayne@7: if ( jpayne@7: not lazy_str_hard_failure jpayne@7: and is_too_large_sequence jpayne@7: and not is_multi_byte_decoder jpayne@7: ): jpayne@7: try: jpayne@7: sequences[int(50e3) :].decode(encoding_iana, errors="strict") jpayne@7: except UnicodeDecodeError as e: jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. 
%s", jpayne@7: encoding_iana, jpayne@7: str(e), jpayne@7: ) jpayne@7: tested_but_hard_failure.append(encoding_iana) jpayne@7: continue jpayne@7: jpayne@7: mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0 jpayne@7: if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up: jpayne@7: tested_but_soft_failure.append(encoding_iana) jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "%s was excluded because of initial chaos probing. Gave up %i time(s). " jpayne@7: "Computed mean chaos is %f %%.", jpayne@7: encoding_iana, jpayne@7: early_stop_count, jpayne@7: round(mean_mess_ratio * 100, ndigits=3), jpayne@7: ) jpayne@7: # Preparing those fallbacks in case we got nothing. jpayne@7: if ( jpayne@7: enable_fallback jpayne@7: and encoding_iana in ["ascii", "utf_8", specified_encoding] jpayne@7: and not lazy_str_hard_failure jpayne@7: ): jpayne@7: fallback_entry = CharsetMatch( jpayne@7: sequences, encoding_iana, threshold, False, [], decoded_payload jpayne@7: ) jpayne@7: if encoding_iana == specified_encoding: jpayne@7: fallback_specified = fallback_entry jpayne@7: elif encoding_iana == "ascii": jpayne@7: fallback_ascii = fallback_entry jpayne@7: else: jpayne@7: fallback_u8 = fallback_entry jpayne@7: continue jpayne@7: jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "%s passed initial chaos probing. Mean measured chaos is %f %%", jpayne@7: encoding_iana, jpayne@7: round(mean_mess_ratio * 100, ndigits=3), jpayne@7: ) jpayne@7: jpayne@7: if not is_multi_byte_decoder: jpayne@7: target_languages: List[str] = encoding_languages(encoding_iana) jpayne@7: else: jpayne@7: target_languages = mb_encoding_languages(encoding_iana) jpayne@7: jpayne@7: if target_languages: jpayne@7: logger.log( jpayne@7: TRACE, jpayne@7: "{} should target any language(s) of {}".format( jpayne@7: encoding_iana, str(target_languages) jpayne@7: ), jpayne@7: ) jpayne@7: jpayne@7: cd_ratios = [] jpayne@7: jpayne@7: # We shall skip the CD when its about ASCII jpayne@7: # Most of the time its not relevant to run "language-detection" on it. 
        if encoding_iana != "ascii":
            for chunk in md_chunks:
                chunk_languages = coherence_ratio(
                    chunk,
                    language_threshold,
                    ",".join(target_languages) if target_languages else None,
                )

                cd_ratios.append(chunk_languages)

        cd_ratios_merged = merge_coherence_ratios(cd_ratios)

        if cd_ratios_merged:
            logger.log(
                TRACE,
                "We detected language {} using {}".format(
                    cd_ratios_merged, encoding_iana
                ),
            )

        results.append(
            CharsetMatch(
                sequences,
                encoding_iana,
                mean_mess_ratio,
                bom_or_sig_available,
                cd_ratios_merged,
                decoded_payload,
            )
        )

        if (
            encoding_iana in [specified_encoding, "ascii", "utf_8"]
            and mean_mess_ratio < 0.1
        ):
            logger.debug(
                "Encoding detection: %s is most likely the one.", encoding_iana
            )
            if explain:
                logger.removeHandler(explain_handler)
                logger.setLevel(previous_logger_level)
            return CharsetMatches([results[encoding_iana]])

        if encoding_iana == sig_encoding:
            logger.debug(
                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
                "the beginning of the sequence.",
                encoding_iana,
            )
            if explain:
                logger.removeHandler(explain_handler)
                logger.setLevel(previous_logger_level)
            return CharsetMatches([results[encoding_iana]])

    if len(results) == 0:
        if fallback_u8 or fallback_ascii or fallback_specified:
            logger.log(
                TRACE,
                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
            )

        if fallback_specified:
            logger.debug(
                "Encoding detection: %s will be used as a fallback match",
                fallback_specified.encoding,
            )
            results.append(fallback_specified)
        elif fallback_u8 is not None:
            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
            results.append(fallback_u8)
        elif fallback_ascii:
            logger.debug("Encoding detection: ascii will be used as a fallback match")
            results.append(fallback_ascii)
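    # Fallback precedence above: a payload-declared encoding wins, then utf_8,
    # then ascii. Any valid ASCII payload is also valid UTF-8, so preferring
    # the utf_8 entry never discards a readable interpretation.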
    if results:
        logger.debug(
            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
            results.best().encoding,  # type: ignore
            len(results) - 1,
        )
    else:
        logger.debug("Encoding detection: Unable to determine any suitable charset.")

    if explain:
        logger.removeHandler(explain_handler)
        logger.setLevel(previous_logger_level)

    return results


def from_fp(
    fp: BinaryIO,
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Same as from_bytes but uses a file pointer that is already opened and readable.
    Will not close the file pointer.
    """
    return from_bytes(
        fp.read(),
        steps,
        chunk_size,
        threshold,
        cp_isolation,
        cp_exclusion,
        preemptive_behaviour,
        explain,
        language_threshold,
        enable_fallback,
    )


def from_path(
    path: Union[str, bytes, PathLike],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = True,
) -> CharsetMatches:
    """
    Same as from_bytes but with one extra step: opening and reading the given file path in binary mode.
    Can raise IOError.
    """
    with open(path, "rb") as fp:
        return from_fp(
            fp,
            steps,
            chunk_size,
            threshold,
            cp_isolation,
            cp_exclusion,
            preemptive_behaviour,
            explain,
            language_threshold,
            enable_fallback,
        )


def is_binary(
    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Detect whether the given input (file, bytes, or path) points to binary content, i.e. not text.
    Based on the same main heuristic algorithms and default kwargs, with the sole exception that
    fallback matches are disabled, so detection is stricter on payloads that are ASCII-compatible
    but unlikely to be text.
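
    Illustrative usage (hypothetical inputs; the verdict is heuristic)::

        is_binary(bytes(range(256)))   # likely True: chaotic, undecodable payload
        is_binary(b"plain old text")   # expected False: trivially decodable text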
jpayne@7: """ jpayne@7: if isinstance(fp_or_path_or_payload, (str, PathLike)): jpayne@7: guesses = from_path( jpayne@7: fp_or_path_or_payload, jpayne@7: steps=steps, jpayne@7: chunk_size=chunk_size, jpayne@7: threshold=threshold, jpayne@7: cp_isolation=cp_isolation, jpayne@7: cp_exclusion=cp_exclusion, jpayne@7: preemptive_behaviour=preemptive_behaviour, jpayne@7: explain=explain, jpayne@7: language_threshold=language_threshold, jpayne@7: enable_fallback=enable_fallback, jpayne@7: ) jpayne@7: elif isinstance( jpayne@7: fp_or_path_or_payload, jpayne@7: ( jpayne@7: bytes, jpayne@7: bytearray, jpayne@7: ), jpayne@7: ): jpayne@7: guesses = from_bytes( jpayne@7: fp_or_path_or_payload, jpayne@7: steps=steps, jpayne@7: chunk_size=chunk_size, jpayne@7: threshold=threshold, jpayne@7: cp_isolation=cp_isolation, jpayne@7: cp_exclusion=cp_exclusion, jpayne@7: preemptive_behaviour=preemptive_behaviour, jpayne@7: explain=explain, jpayne@7: language_threshold=language_threshold, jpayne@7: enable_fallback=enable_fallback, jpayne@7: ) jpayne@7: else: jpayne@7: guesses = from_fp( jpayne@7: fp_or_path_or_payload, jpayne@7: steps=steps, jpayne@7: chunk_size=chunk_size, jpayne@7: threshold=threshold, jpayne@7: cp_isolation=cp_isolation, jpayne@7: cp_exclusion=cp_exclusion, jpayne@7: preemptive_behaviour=preemptive_behaviour, jpayne@7: explain=explain, jpayne@7: language_threshold=language_threshold, jpayne@7: enable_fallback=enable_fallback, jpayne@7: ) jpayne@7: jpayne@7: return not guesses