Source code for hbutils.string.inflection

"""
Overview:
    Useful utilities for word inflections.

    Extended based on `jpvanhal/inflection <https://github.com/jpvanhal/inflection>`_.
    
    This module provides a comprehensive set of functions for manipulating English words,
    including pluralization, singularization, case conversion, and string formatting utilities.
    It handles various irregular forms and special cases in English grammar.
"""
import re
import unicodedata

# Public API of this module. The rule tables (PLURALS, SINGULARS,
# UNCOUNTABLES) and the ``_irregular`` helper are intentionally not exported.
__all__ = [
    'camelize',
    'dasherize',
    'humanize',
    'ordinal',
    'ordinalize',
    'parameterize',
    'pluralize',
    'singularize',
    'tableize',
    'titleize',
    'transliterate',
    'underscore',
]

# Ordered ``(regex, replacement)`` rules consumed by ``pluralize``. The list is
# scanned from the top and only the first rule whose pattern is found in the
# word is applied, so specific rules must come before general ones.
# ``_irregular`` prepends additional rules at index 0, which therefore take
# precedence over everything listed here.
PLURALS = [
    (r"(?i)(quiz)$", r'\1zes'),
    (r"(?i)^(oxen)$", r'\1'),
    (r"(?i)^(ox)$", r'\1en'),
    (r"(?i)(m|l)ice$", r'\1ice'),
    (r"(?i)(m|l)ouse$", r'\1ice'),
    (r"(?i)(passer)s?by$", r'\1sby'),
    (r"(?i)(matr|vert|ind)(?:ix|ex)$", r'\1ices'),
    (r"(?i)(x|ch|ss|sh)$", r'\1es'),
    (r"(?i)([^aeiouy]|qu)y$", r'\1ies'),
    (r"(?i)(hive)$", r'\1s'),
    (r"(?i)([lr])f$", r'\1ves'),
    (r"(?i)([^f])fe$", r'\1ves'),
    (r"(?i)sis$", 'ses'),
    (r"(?i)([ti])a$", r'\1a'),
    (r"(?i)([ti])um$", r'\1a'),
    (r"(?i)(buffal|potat|tomat)o$", r'\1oes'),
    (r"(?i)(bu)s$", r'\1ses'),
    (r"(?i)(alias|status)$", r'\1es'),
    (r"(?i)(octop|vir)i$", r'\1i'),
    (r"(?i)(octop|vir)us$", r'\1i'),
    (r"(?i)^(ax|test)is$", r'\1es'),
    # A word already ending in "s" gets no extra suffix (the trailing letter
    # is rewritten as a lowercase "s").
    (r"(?i)s$", 's'),
    # Fallback: the empty match at the end of the word is replaced by "s".
    (r"$", 's'),
]

# Ordered ``(regex, replacement)`` rules consumed by ``singularize``. As with
# PLURALS, the list is scanned from the top and only the first matching rule
# is applied; ``_irregular`` prepends further rules at index 0.
SINGULARS = [
    (r"(?i)(database)s$", r'\1'),
    (r"(?i)(quiz)zes$", r'\1'),
    (r"(?i)(matr)ices$", r'\1ix'),
    (r"(?i)(vert|ind)ices$", r'\1ex'),
    (r"(?i)(passer)sby$", r'\1by'),
    (r"(?i)^(ox)en", r'\1'),
    (r"(?i)(alias|status)(es)?$", r'\1'),
    (r"(?i)(octop|vir)(us|i)$", r'\1us'),
    (r"(?i)^(a)x[ie]s$", r'\1xis'),
    (r"(?i)(cris|test)(is|es)$", r'\1is'),
    (r"(?i)(shoe)s$", r'\1'),
    (r"(?i)(o)es$", r'\1'),
    (r"(?i)(bus)(es)?$", r'\1'),
    (r"(?i)(m|l)ice$", r'\1ouse'),
    (r"(?i)(x|ch|ss|sh)es$", r'\1'),
    (r"(?i)(m)ovies$", r'\1ovie'),
    (r"(?i)(s)eries$", r'\1eries'),
    (r"(?i)([^aeiouy]|qu)ies$", r'\1y'),
    (r"(?i)([lr])ves$", r'\1f'),
    (r"(?i)(tive)s$", r'\1'),
    (r"(?i)(hive)s$", r'\1'),
    (r"(?i)([^f])ves$", r'\1fe'),
    (r"(?i)(t)he(sis|ses)$", r"\1hesis"),
    (r"(?i)(s)ynop(sis|ses)$", r"\1ynopsis"),
    (r"(?i)(p)rogno(sis|ses)$", r"\1rognosis"),
    (r"(?i)(p)arenthe(sis|ses)$", r"\1arenthesis"),
    (r"(?i)(d)iagno(sis|ses)$", r"\1iagnosis"),
    (r"(?i)(b)a(sis|ses)$", r"\1asis"),
    (r"(?i)(a)naly(sis|ses)$", r"\1nalysis"),
    (r"(?i)([ti])a$", r'\1um'),
    (r"(?i)(n)ews$", r'\1ews'),
    # Words ending in "ss" keep both letters; this rule must precede the
    # generic trailing-"s" rule below.
    (r"(?i)(ss)$", r'\1'),
    # Fallback: drop a single trailing "s".
    (r"(?i)s$", ''),
]

# Lowercase words that have no distinct plural/singular form.
# ``pluralize`` returns a word unchanged when ``word.lower()`` is in this set;
# ``singularize`` returns it unchanged when the word ends with one of these
# entries (case-insensitively, starting at a regex word boundary).
UNCOUNTABLES = {
    'equipment',
    'fish',
    'information',
    'jeans',
    'money',
    'rice',
    'series',
    'sheep',
    'species'
}


def _irregular(singular: str, plural: str, *plurals: str) -> None:
    """
    Register pluralization and singularization rules for an irregular word.

    Every generated rule is inserted at index 0 of :data:`PLURALS` or
    :data:`SINGULARS`, so it is tried before all rules registered earlier.
    The generated patterns are anchored only at the end of the word.

    When both forms start with the same letter (ignoring case), a single
    case-insensitive rule is generated that captures the first letter and
    re-emits it, which keeps the capitalization of the input. Otherwise two
    rules are generated, one for an uppercase and one for a lowercase first
    letter, each producing a replacement with the matching capitalization.

    :param singular: Irregular word in singular form (such as ``it``).
    :type singular: str
    :param plural: Irregular word in plural form (such as ``they``). It is used
        for both the pluralization and the singularization rules.
    :type plural: str
    :param plurals: Extra plural forms (such as ``them``) that should also
        singularize to ``singular``.
    :type plurals: str

    Example::
        >>> _irregular('person', 'people')
        >>> pluralize('person')
        'people'
        >>> singularize('people')
        'person'
    """

    def _either_case(text: str) -> str:
        """
        Build a regex fragment matching ``text`` with each character accepted
        both as given and in uppercase.

        :param text: The literal text to match.
        :type text: str

        :return: One two-character class per character of ``text``.
        :rtype: str
        """
        return ''.join('[{}{}]'.format(letter, letter.upper()) for letter in text)

    def _shares_initial(first: str, second: str) -> bool:
        """
        Tell whether two words start with the same letter, ignoring case.
        """
        return first[0].upper() == second[0].upper()

    def _captured_rule(source: str, target: str) -> tuple:
        """
        Build one case-insensitive rule that captures the first letter of
        ``source`` and reuses it in front of the tail of ``target``.
        """
        return (
            r"(?i)({}){}$".format(source[0], source[1:]),
            r'\1' + target[1:],
        )

    def _cased_rules(source: str, target: str) -> list:
        """
        Build the rule pair used when the initials differ: first the rule for
        an uppercase initial, then the rule for a lowercase initial.
        """
        tail = _either_case(source[1:])
        return [
            (r"{}{}$".format(source[0].upper(), tail), target[0].upper() + target[1:]),
            (r"{}{}$".format(source[0].lower(), tail), target[0].lower() + target[1:]),
        ]

    def _plural_rules(singular_: str, plural_: str) -> list:
        """
        List the pluralization rules in insertion order: the rule(s) mapping
        the singular to the plural, then the rule(s) mapping the plural to
        itself.
        """
        if _shares_initial(singular_, plural_):
            return [
                _captured_rule(singular_, plural_),
                _captured_rule(plural_, plural_),
            ]
        return _cased_rules(singular_, plural_) + _cased_rules(plural_, plural_)

    def _singular_rules(singular_: str, plural_: str) -> list:
        """
        List the singularization rules, in insertion order, mapping one plural
        form back to the singular.
        """
        if _shares_initial(singular_, plural_):
            return [_captured_rule(plural_, singular_)]
        return _cased_rules(plural_, singular_)

    # Each rule goes to the front, so the rule inserted last ends up first.
    for rule in _plural_rules(singular, plural):
        PLURALS.insert(0, rule)
    for plural_form in (plural,) + plurals:
        for rule in _singular_rules(singular, plural_form):
            SINGULARS.insert(0, rule)



[docs]
def camelize(string: str, uppercase_first_letter: bool = True) -> str:
    """
    Convert strings to CamelCase.

    Every character that starts the string or follows an underscore is
    uppercased, and the underscore itself is removed. Depending on
    ``uppercase_first_letter`` the result is UpperCamelCase or lowerCamelCase.

    :param string: Original string to convert.
    :type string: str
    :param uppercase_first_letter: If set to ``True``, :func:`camelize` converts
        strings to UpperCamelCase. If set to ``False``, :func:`camelize` produces
        lowerCamelCase. Defaults to ``True``.
    :type uppercase_first_letter: bool

    :return: The camelized string. An empty input yields an empty string in
        both modes.
    :rtype: str

    Examples::
        >>> camelize("device_type")
        'DeviceType'
        >>> camelize("device_type", False)
        'deviceType'
        >>> camelize("", False)
        ''

    .. note::
        :func:`camelize` can be thought of as an inverse of :func:`underscore`,
        although there are some cases where that does not hold::

            >>> camelize(underscore("IOError"))
            'IoError'
    """
    camelized = re.sub(r"(?:^|_)(.)", lambda m: m.group(1).upper(), string)
    if uppercase_first_letter:
        return camelized
    if not string:
        # Nothing to lowercase; indexing string[0] below would raise IndexError.
        return string
    # lowerCamelCase: lowercased original first character + rest of the
    # UpperCamelCase result.
    return string[0].lower() + camelized[1:]




[docs]
def dasherize(word: str) -> str:
    """
    Turn every underscore in the string into a dash.

    All other characters are left untouched.

    :param word: Original word to dasherize.
    :type word: str

    :return: The string with each ``_`` replaced by ``-``.
    :rtype: str

    Example::
        >>> dasherize("puni_puni")
        'puni-puni'
    """
    pieces = word.split('_')
    return '-'.join(pieces)




[docs]
def humanize(word: str) -> str:
    """
    Produce a human-readable phrase from an identifier-like string.

    The steps, in order: strip a trailing ``"_id"``, turn underscores into
    spaces, lowercase the letters, and finally uppercase the first character
    if it is a word character.

    :param word: Original word to humanize.
    :type word: str

    :return: The humanized string.
    :rtype: str

    Examples::
        >>> humanize("employee_salary")
        'Employee salary'
        >>> humanize("author_id")
        'Author'
    """

    def _to_lower(found):
        return found.group(1).lower()

    def _to_upper(found):
        return found.group(0).upper()

    without_id = re.sub(r"_id$", "", word)
    spaced = ' '.join(without_id.split('_'))
    lowered = re.sub(r"(?i)([a-z\d]*)", _to_lower, spaced)
    return re.sub(r"^\w", _to_upper, lowered)




[docs]
def ordinal(number: int) -> str:
    """
    Return the ordinal suffix (``st``, ``nd``, ``rd`` or ``th``) for a number.

    Only the suffix is returned; use :func:`ordinalize` for the full ordinal
    string. The sign of the number is ignored, and the argument is passed
    through :class:`int` first.

    :param number: Int format number.
    :type number: int

    :return: The ordinal suffix for the number.
    :rtype: str

    Examples::
        >>> ordinal(1)
        'st'
        >>> ordinal(2)
        'nd'
        >>> ordinal(1002)
        'nd'
        >>> ordinal(1003)
        'rd'
        >>> ordinal(-11)
        'th'
        >>> ordinal(-1021)
        'st'
    """
    magnitude = abs(int(number))

    # 11, 12 and 13 (and x11, x12, x13) take "th" despite their last digit.
    if 11 <= magnitude % 100 <= 13:
        return "th"

    last_digit = magnitude % 10
    if last_digit == 1:
        return "st"
    if last_digit == 2:
        return "nd"
    if last_digit == 3:
        return "rd"
    return "th"




[docs]
def ordinalize(number: int) -> str:
    """
    Format a number as a full ordinal string such as 1st, 2nd, 3rd, 4th.

    The result is the number as given (including any sign) followed by the
    suffix computed by :func:`ordinal`.

    :param number: Int format number.
    :type number: int

    :return: The complete ordinal string.
    :rtype: str

    Examples::
        >>> ordinalize(1)
        '1st'
        >>> ordinalize(2)
        '2nd'
        >>> ordinalize(1002)
        '1002nd'
        >>> ordinalize(1003)
        '1003rd'
        >>> ordinalize(-11)
        '-11th'
        >>> ordinalize(-1021)
        '-1021st'
    """
    # format() with no spec renders the number exactly as a "{}" field would.
    return format(number) + ordinal(number)




[docs]
def parameterize(string: str, separator: str = '-') -> str:
    """
    Replace special characters in a string so that it may be used as part of a
    'pretty' URL.

    The string is first passed through :func:`transliterate`. Every run of
    characters other than ASCII letters, digits, ``-`` and ``_`` is then
    replaced by ``separator``, repeated separators are collapsed into one,
    a leading or trailing separator is removed, and the result is lowercased.

    :param string: Original string to parameterize.
    :type string: str
    :param separator: Separator of parameter words. Defaults to ``'-'``. It is
        inserted literally and may consist of several characters. With an
        empty separator the unwanted characters are simply removed.
    :type separator: str

    :return: The parameterized string.
    :rtype: str

    Example::
        >>> parameterize(u"Donald E. Knuth")
        'donald-e-knuth'
    """
    def _literal_separator(_match):
        # A callable replacement keeps re.sub from interpreting backslashes
        # in the separator as escape sequences or group references.
        return separator

    string = transliterate(string)
    # Turn unwanted chars into the separator
    string = re.sub(r"(?i)[^a-z0-9\-_]+", _literal_separator, string)
    if separator:
        re_sep = re.escape(separator)
        # No more than one of the separator in a row. The group makes the
        # quantifier apply to the whole separator, not just its last character.
        string = re.sub(r'(?:%s){2,}' % re_sep, _literal_separator, string)
        # Remove leading/trailing separator.
        string = re.sub(r"(?i)^{sep}|{sep}$".format(sep=re_sep), '', string)

    return string.lower()




[docs]
def pluralize(word: str) -> str:
    """
    Return the plural form of a word.

    An empty word, or a word whose lowercase form is listed in
    :data:`UNCOUNTABLES`, is returned unchanged. Otherwise the rules in
    :data:`PLURALS` are tried in order and the first one whose pattern is
    found in the word is applied.

    :param word: Original word to pluralize.
    :type word: str

    :return: The plural form of the word.
    :rtype: str

    Examples::
        >>> pluralize("posts")
        'posts'
        >>> pluralize("octopus")
        'octopi'
        >>> pluralize("sheep")
        'sheep'
        >>> pluralize("CamelOctopus")
        'CamelOctopi'
    """
    if word and word.lower() not in UNCOUNTABLES:
        for pattern, substitute in PLURALS:
            if re.search(pattern, word) is not None:
                # Only the first matching rule is ever applied.
                return re.sub(pattern, substitute, word)
    return word




[docs]
def singularize(word: str) -> str:
    """
    Return the singular form of a word, the reverse of :func:`pluralize`.

    A word that ends with an entry of :data:`UNCOUNTABLES` (compared
    case-insensitively, starting at a word boundary) is returned unchanged.
    Otherwise the rules in :data:`SINGULARS` are tried in order and the first
    one whose pattern is found in the word is applied; if none matches, the
    word is returned as is.

    :param word: Original word to singularize.
    :type word: str

    :return: The singular form of the word.
    :rtype: str

    Examples::
        >>> singularize("posts")
        'post'
        >>> singularize("octopi")
        'octopus'
        >>> singularize("sheep")
        'sheep'
        >>> singularize("word")
        'word'
        >>> singularize("CamelOctopi")
        'CamelOctopus'
    """
    if any(re.search(r'(?i)\b(%s)\Z' % noun, word) for noun in UNCOUNTABLES):
        return word

    # Pick the first rule whose pattern occurs in the word, if any.
    first_match = next(
        (pair for pair in SINGULARS if re.search(pair[0], word)),
        None,
    )
    if first_match is None:
        return word
    pattern, substitute = first_match
    return re.sub(pattern, substitute, word)




[docs]
def tableize(word: str) -> str:
    """
    Create the name of a table like Rails does for models to table names.

    The word is converted with :func:`underscore` and the result is passed to
    :func:`pluralize`; because the pluralization rules match at the end of the
    string, this pluralizes the last word of the name.

    :param word: Original word to tableize.
    :type word: str

    :return: The tableized string.
    :rtype: str

    Examples::
        >>> tableize('RawScaledScorer')
        'raw_scaled_scorers'
        >>> tableize('egg_and_ham')
        'egg_and_hams'
        >>> tableize('fancyCategory')
        'fancy_categories'
    """
    snake_cased = underscore(word)
    return pluralize(snake_cased)




[docs]
def titleize(word: str) -> str:
    """
    Capitalize all the words and replace some characters in the string to
    create a nicer looking title.

    The word is run through :func:`underscore` and :func:`humanize`, then
    title-cased with :meth:`str.title`; finally each word start (optionally
    preceded by an apostrophe) is passed through :meth:`str.capitalize`.

    :param word: Original word to titleize.
    :type word: str

    :return: The titleized string.
    :rtype: str

    Examples::
        >>> titleize("man from the boondocks")
        'Man From The Boondocks'
        >>> titleize("x-men: the last stand")
        'X Men: The Last Stand'
        >>> titleize("TheManWithoutAPast")
        'The Man Without A Past'
        >>> titleize("raiders_of_the_lost_ark")
        'Raiders Of The Lost Ark'
    """

    def _capitalize_start(found):
        return found.group(1).capitalize()

    rough_title = humanize(underscore(word)).title()
    return re.sub(r"\b('?\w)", _capitalize_start, rough_title)




[docs]
def transliterate(string: str) -> str:
    """
    Replace non-ASCII characters with an ASCII approximation.

    The string is decomposed with Unicode NFKD normalization, after which every
    character outside the ASCII range is dropped. Characters that have no
    ASCII-based decomposition therefore disappear from the result.

    :param string: Original string to transliterate.
    :type string: str

    :return: The transliterated ASCII string.
    :rtype: str

    Examples::
        >>> transliterate('älämölö')
        'alamolo'
        >>> transliterate('Ærøskøbing')
        'rskbing'
    """
    decomposed = unicodedata.normalize('NFKD', string)
    # Keep only code points that fit in ASCII (0-127).
    return ''.join(symbol for symbol in decomposed if ord(symbol) < 128)




[docs]
def underscore(word: str) -> str:
    """
    Make an underscored, lowercase form from the expression in the string.

    An underscore is inserted between a run of capitals and a following
    capitalized word, and between a lowercase letter or digit and a following
    capital. Dashes are then turned into underscores and the whole string is
    lowercased.

    :param word: Original word to underscore.
    :type word: str

    :return: The underscored string.
    :rtype: str

    Example::
        >>> underscore("DeviceType")
        'device_type'

    .. note::
        As a rule of thumb you can think of :func:`underscore` as the inverse of
        :func:`camelize`, though there are cases where that does not hold::

            >>> camelize(underscore("IOError"))
            'IoError'
    """
    # Applied in this order: acronym boundary first, then lower/digit-to-upper.
    boundary_rules = (
        (r"([A-Z]+)([A-Z][a-z])", r'\1_\2'),
        (r"([a-z\d])([A-Z])", r'\1_\2'),
    )
    separated = word
    for pattern, substitute in boundary_rules:
        separated = re.sub(pattern, substitute, separated)
    return separated.replace("-", "_").lower()



# Irregular nouns. ``_irregular`` inserts its rules at the front of PLURALS
# and SINGULARS, so a word registered later is matched before earlier ones.
# The generated patterns are anchored only at the end of the word, which is
# why e.g. 'human' is registered after 'man': "human" must hit its own rule
# before the 'man' -> 'men' rule can turn it into "humen".
_irregular('person', 'people')
_irregular('man', 'men')
_irregular('human', 'humans')
_irregular('child', 'children')
_irregular('sex', 'sexes')
_irregular('move', 'moves')
_irregular('cow', 'kine')
_irregular('zombie', 'zombies')

# self added patterns
# The pronoun rules for 'it' / 'they' / 'them' are written out here instead of
# calling ``_irregular('it', 'they', 'them')``. That call generates patterns
# anchored only at the end of the word, so every noun ending in "it" or "them"
# was rewritten (pluralize("unit") -> "unthey", singularize("anthem") ->
# "anit"). The leading \b restricts each rule to a whole word while keeping
# the same rule order and replacements that ``_irregular`` would produce.
PLURALS[:0] = [
    (r"\bt[hH][eE][yY]$", 'they'),
    (r"\bT[hH][eE][yY]$", 'They'),
    (r"\bi[tT]$", 'they'),
    (r"\bI[tT]$", 'They'),
]
SINGULARS[:0] = [
    (r"\bt[hH][eE][mM]$", 'it'),
    (r"\bT[hH][eE][mM]$", 'It'),
    (r"\bt[hH][eE][yY]$", 'it'),
    (r"\bT[hH][eE][yY]$", 'It'),
]
_irregular('this', 'these')
_irregular('that', 'those')