# Source code for hbutils.encoding.decode

"""
Overview:
    Functions to deal with encoding binary data easily.
    This module provides utilities for automatically decoding binary data to strings
    by detecting the appropriate encoding, with support for preferred encodings and
    fallback mechanisms.
"""
import sys
from typing import Optional, List

import chardet

from ..collection import unique

# Name of the UTF-8 codec.
# NOTE(review): not referenced anywhere in the visible part of this module — confirm it is still needed.
_DEFAULT_ENCODING = 'utf-8'
# Encodings that ``auto_decode`` tries, in this order, when the caller does not pass ``prefers``.
_DEFAULT_PREFERRED_ENCODINGS = ['utf-8', 'gbk', 'gb2312', 'gb18030', 'big5']  # common encodings for chinese

# Public API of this module: only ``auto_decode`` is exported by ``from ... import *``.
__all__ = [
    'auto_decode'
]


def _decode(data: bytes, encoding: str) -> str:
    """
    Decode binary data using the specified encoding.

    :param data: Binary data to decode.
    :type data: bytes
    :param encoding: Encoding to use for decoding.
    :type encoding: str
    :return: Decoded string.
    :rtype: str
    :raises UnicodeDecodeError: If decoding fails with the specified encoding.
    """
    return data.decode(encoding)



# [docs]
def auto_decode(data: bytes, encoding: Optional[str] = None, prefers: Optional[List[str]] = None) -> str:
    r"""
    Auto decode binary data to string, the encoding mode will be automatically detected.

    This function attempts to decode binary data using multiple strategies:
    1. If an encoding is explicitly specified, use it directly
    2. Otherwise, try preferred encodings in order
    3. Fall back to system default encoding
    4. Only if all of the above fail, use the chardet library to detect the encoding

    The function will try each encoding until one succeeds, keeping track of the
    best partial match in case all attempts fail. Encoding detection with chardet
    is only performed when it is actually needed, so data that decodes with one of
    the preferred encodings never pays the detection cost.

    :param data: Original binary data to be decoded.
    :type data: bytes
    :param encoding: Encoding mode to be used, default is ``None`` which means this function
        needs to automatically detect the encoding.
    :type encoding: Optional[str]
    :param prefers: Preferred encodings to try first. If ``None``, uses default preferred
        encodings (utf-8, gbk, gb2312, gb18030, big5).
    :type prefers: Optional[List[str]]
    :return: Decoded string.
    :rtype: str
    :raises UnicodeDecodeError: If all decoding attempts fail, raises the error with the
        longest successful decode position.
    :raises LookupError: If ``encoding`` or an entry of ``prefers`` is not a codec name
        known to Python. An unknown name reported by chardet is skipped instead.

    Examples::

        >>> auto_decode(b'kdsfjldsjflkdsmgds')
        'kdsfjldsjflkdsmgds'
        >>> auto_decode(b'\xd0\x94\xd0\xbe\xd0\xb1\xd1\x80\xd1\x8b\xd0\xb9 \xd0'
        ...             b'\xb2\xd0\xb5\xd1\x87\xd0\xb5\xd1\x80')
        'Добрый вечер'
        >>> auto_decode(b'\xa4\xb3\xa4\xf3\xa4\xd0\xa4\xf3\xa4\xcf')
        'こんばんは'
        >>> auto_decode(b'\xcd\xed\xc9\xcf\xba\xc3')
        '晚上好'
    """
    if encoding:
        return _decode(data, encoding)

    if prefers is None:
        prefers = _DEFAULT_PREFERRED_ENCODINGS

    # Cheap candidates first: caller preferences, then the interpreter default.
    # Duplicates are dropped (first occurrence wins) and empty names are ignored.
    candidates = [enc for enc in unique([*prefers, sys.getdefaultencoding()]) if enc]

    # Remember the failure that got furthest into the data; it is the most
    # informative error to report if nothing works.
    last_err = None
    for enc in candidates:
        try:
            return _decode(data, enc)
        except UnicodeDecodeError as err:
            if last_err is None or err.start > last_err.start:
                last_err = err

    # Last resort: statistical detection. This scans the payload, so it is only
    # run once every cheap candidate has failed.
    detected = chardet.detect(data)['encoding']
    if detected and detected not in candidates:
        try:
            return _decode(data, detected)
        except UnicodeDecodeError as err:
            if last_err is None or err.start > last_err.start:
                last_err = err
        except LookupError:
            # chardet can report a charset name for which Python ships no codec;
            # treat that as "no usable guess" and report the best decode error.
            pass

    # ``candidates`` always contains the interpreter default encoding, so at least
    # one UnicodeDecodeError has been recorded by the time execution gets here.
    raise last_err