# Source code for sign_language_translator.languages.text.text_language

"""
text_language.py
----------------
This module defines the Base NLP class for text format of a spoken language.
It defines the interface for text processing functions needed by the rule-based translator.
"""

import re
from abc import ABC, abstractmethod
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union



class TextLanguage(ABC):
    """Base NLP class for a language.

    Subclass it and provide the functionality to tokenize text and classify & disambiguate tokens.
    Each token should correspond to a sign language clip.

    Every method except :meth:`romanize` is abstract and must be implemented by the subclass.
    """

    @staticmethod
    @abstractmethod
    def name() -> str:
        """Returns the name of the language used everywhere else in datasets."""

    @classmethod
    @abstractmethod
    def token_regex(cls) -> str:
        """Returns a regular expression that matches words in this language."""

    @classmethod
    @abstractmethod
    def allowed_characters(cls) -> Set[str]:
        """Returns a set of all allowed characters in the language."""

    @abstractmethod
    def preprocess(self, text: str) -> str:
        """Preprocesses text before tokenization.

        Make sure no different unicode characters are used for the same word.
        Remove unnecessary symbols, spaces, etc.
        """

    @abstractmethod
    def tokenize(self, text: str) -> List[str]:
        """Break apart text into words or phrases."""

    @abstractmethod
    def sentence_tokenize(self, text: str) -> List[str]:
        """Break text into sentences."""

    @abstractmethod
    def detokenize(self, tokens: Iterable[str]) -> str:
        """Joins tokens back into text."""

    @abstractmethod
    def tag(self, tokens: Union[str, Iterable[str]]) -> List[Tuple[str, Any]]:
        """Classify the tokens and mark them with appropriate tags."""

    @abstractmethod
    def get_tags(self, tokens: Union[str, Iterable[str]]) -> List[Any]:
        """Get the classifications of all tokens in the form of a sequence of tags."""

    @abstractmethod
    def get_word_senses(self, tokens: Union[str, Iterable[str]]) -> List[List[str]]:
        """Get all known meanings of the ambiguous words."""

    # embed/similar
    # all_tags

    @staticmethod
    def romanize(
        text: str,
        *args: Any,
        add_diacritics: bool = True,
        character_translation_table: Optional[Dict[int, str]] = None,
        n_gram_map: Optional[Dict[str, str]] = None,
        **kwargs: Any
    ) -> str:
        """Map characters to phonetically similar characters of the English language.
        Transliteration is useful for readability & simple text-to-speech.
        First maps (n>1)-grams, then unigrams.

        ALA-LC Standardized Romanization Tables (70 languages): https://www.loc.gov/catdir/cpso/roman.html

        Args:
            text (str): Non-English text to be mapped to Latin script.
            *args: Accepted but not used by this base implementation.
            add_diacritics (bool, optional): Whether to use diacritics over English characters to help pronunciation. (Rules: 1. The under-dot ' ̣' indicates alternate soft/hard pronunciation of the letter. 2. The over-bar/macron ' ̄' means long pronunciation). When False, the combining marks listed in the final substitution below are stripped from the result. Defaults to True.
            character_translation_table (Optional[Dict[int, str]], optional): A dictionary mapping unicode of single characters to their latin equivalent (passed to ``str.translate``). Defaults to None.
            n_gram_map (Optional[Dict[str, str]], optional): A dictionary mapping bigrams, trigrams or more to their latin equivalent. Keys are expected to be regular expressions. It is only applied when it is a ``dict`` instance. Defaults to None.
            **kwargs: Accepted but not used by this base implementation.

        Returns:
            str: The text after the n-gram substitutions, the single-character translation
            and (optionally) the removal of diacritics.
        """

        # map (n>1)-grams
        if isinstance(n_gram_map, dict):
            # Strips regex operators and a few escape classes so that the sort key
            # approximates the number of literal characters in each pattern.
            re_operators = re.compile(r"[\+\*\?\|\[\]\{\}\^\$<=\!\(\)]|(\\[bdwWs])")
            # Longest patterns are substituted first so that a shorter n-gram
            # does not consume part of a longer one.
            for ngram in sorted(  # ToDo: optimize
                n_gram_map,
                key=lambda x: len(re_operators.sub("", x)),
                reverse=True,
            ):
                text = re.sub(ngram, n_gram_map[ngram], text)

        # map unigrams
        text = text.translate(character_translation_table or {})

        if not add_diacritics:
            # The literal is written with a space before each combining mark for
            # legibility; the spaces are removed so that real spaces in `text` survive.
            text = re.sub("[ ̄ ̣ ̂ ̇ ̲ ̆ ̤ ̃ ́]".replace(" ", ""), "", text)

        return text