Source code for hbutils.encoding.decode

"""
Automatic binary decoding utilities.

This module provides helpers for decoding binary data into text by attempting
multiple encodings. The primary entry point is :func:`auto_decode`, which
tries an explicit encoding if provided, then a list of preferred encodings,
and finally uses system defaults and the :mod:`chardet` detector.

The module contains the following main components:

* :func:`auto_decode` - Automatically decode bytes into text using multiple strategies

.. note::
   Decoding results depend on the provided byte stream and the available
   encodings in the runtime environment.

Example::

    >>> from hbutils.encoding.decode import auto_decode
    >>> auto_decode(b'kdsfjldsjflkdsmgds')
    'kdsfjldsjflkdsmgds'
    >>> auto_decode(b'\\xd0\\x94\\xd0\\xbe\\xd0\\xb1\\xd1\\x80\\xd1\\x8b\\xd0\\xb9')
    'Добрый'

"""
import sys
from typing import Optional, List

import chardet

from ..collection import unique

_DEFAULT_ENCODING = 'utf-8'
_DEFAULT_PREFERRED_ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5']  # common encodings for Chinese text

__all__ = [
    'auto_decode'
]


def _decode(data: bytes, encoding: str) -> str:
    """
    Decode binary data using the specified encoding.

    :param data: Binary data to decode.
    :type data: bytes
    :param encoding: Encoding to use for decoding.
    :type encoding: str
    :return: Decoded string.
    :rtype: str
    :raises UnicodeDecodeError: If decoding fails with the specified encoding.
    :raises LookupError: If the specified encoding is unknown.

    Example::

        >>> _decode(b'hello', 'utf-8')
        'hello'
    """
    return data.decode(encoding)


[docs] def auto_decode(data: bytes, encoding: Optional[str] = None, prefers: Optional[List[str]] = None) -> str: r""" Automatically decode binary data into text. This function attempts to decode binary data using multiple strategies: 1. If an ``encoding`` is explicitly specified, it is used directly. 2. Otherwise, preferred encodings are tried in order. 3. The system default encoding is tried next. 4. Finally, the :mod:`chardet` library is used to detect a likely encoding. The function tries each encoding until one succeeds. If all attempts fail, the :class:`UnicodeDecodeError` with the longest successful decode position is raised. :param data: Original binary data to be decoded. :type data: bytes :param encoding: Encoding to use explicitly. If ``None``, the encoding will be automatically detected using the described strategy. :type encoding: Optional[str] :param prefers: Preferred encodings to try first. If ``None``, the default preferred encodings (``utf-8``, ``gbk``, ``gb2312``, ``gb18030``, ``big5``) are used. :type prefers: Optional[List[str]] :return: Decoded string. :rtype: str :raises UnicodeDecodeError: If all decoding attempts fail. :raises LookupError: If any attempted encoding is unknown. .. note:: The detection step uses :func:`chardet.detect`, which may return ``None`` as the detected encoding; such values are ignored. Examples:: >>> auto_decode(b'kdsfjldsjflkdsmgds') 'kdsfjldsjflkdsmgds' >>> auto_decode(b'\\xd0\\x94\\xd0\\xbe\\xd0\\xb1\\xd1\\x80\\xd1\\x8b\\xd0\\xb9 \\xd0' ... b'\\xb2\\xd0\\xb5\\xd1\\x87\\xd0\\xb5\\xd1\\x80') 'Добрый вечер' >>> auto_decode(b'\\xa4\\xb3\\xa4\\xf3\\xa4\\xd0\\xa4\\xf3\\xa4\\xcf') 'こんばんは' >>> auto_decode(b'\\xcd\\xed\\xc9\\xcf\\xba\\xc3') '晚上好' """ if encoding: return _decode(data, encoding) else: if prefers is None: prefers = _DEFAULT_PREFERRED_ENCODINGS _elist = filter(bool, unique([ *prefers, sys.getdefaultencoding(), chardet.detect(data)['encoding'] ])) last_err: Optional[UnicodeDecodeError] = None for enc in _elist: try: return _decode(data, enc) except UnicodeDecodeError as err: if last_err is None or err.start > last_err.start: last_err = err raise last_err