# Source code for sign_language_translator.languages.text.text_language

"""
text_language.py
----------------
This module defines the base NLP class for the text form of a spoken language.
It declares the interface of text-processing functions required by the rule-based translator.
"""

import re
from abc import ABC, abstractmethod
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union


class TextLanguage(ABC):
    """Abstract NLP interface for the text form of one language.

    Concrete subclasses supply tokenization plus token classification and
    disambiguation. Every token is expected to correspond to a sign language
    clip.
    """

    @staticmethod
    @abstractmethod
    def name() -> str:
        """Return the language name, as it is used everywhere else in datasets."""

    @classmethod
    @abstractmethod
    def token_regex(cls) -> str:
        """Return a regular expression matching the words of this language."""

    @classmethod
    @abstractmethod
    def allowed_characters(cls) -> Set[str]:
        """Return the set of every character allowed in this language."""

    @abstractmethod
    def preprocess(self, text: str) -> str:
        """Clean up text prior to tokenization.

        Implementations should ensure the same word is never spelled with
        different unicode characters, and should drop unnecessary symbols,
        spaces, etc.
        """

    @abstractmethod
    def tokenize(self, text: str) -> List[str]:
        """Split text into words or phrases."""

    @abstractmethod
    def sentence_tokenize(self, text: str) -> List[str]:
        """Split text into sentences."""

    @abstractmethod
    def detokenize(self, tokens: Iterable[str]) -> str:
        """Join tokens back together into text."""

    @abstractmethod
    def tag(self, tokens: Union[str, Iterable[str]]) -> List[Tuple[str, Any]]:
        """Classify the tokens and mark each of them with an appropriate tag."""

    @abstractmethod
    def get_tags(self, tokens: Union[str, Iterable[str]]) -> List[Any]:
        """Return the classifications of all tokens as a sequence of tags."""

    @abstractmethod
    def get_word_senses(self, tokens: Union[str, Iterable[str]]) -> List[List[str]]:
        """Return every known meaning of the ambiguous words."""

    # embed/similar # all_tags
    # NOTE(review): the line above is carried over from the original source;
    # presumably a list of methods still to be added — confirm with the author.

    @staticmethod
    def romanize(
        text: str,
        *args,
        add_diacritics=True,
        character_translation_table: Optional[Dict[int, str]] = None,
        n_gram_map: Optional[Dict[str, str]] = None,
        **kwargs
    ) -> str:
        """Transliterate text into phonetically similar English (Latin) characters.

        Handy for readability and for simple text-to-speech. Multi-character
        sequences (n>1 grams) are substituted first, single characters second.

        ALA-LC Standardized Romanization Tables (70 languages):
        https://www.loc.gov/catdir/cpso/roman.html

        Args:
            text (str): Non-English text to be mapped to Latin script.
            add_diacritics (bool, optional): Whether diacritics placed on
                English characters as pronunciation aids are kept. (Rules:
                1. the under-dot marks the alternate soft/hard pronunciation
                of a letter. 2. the over-bar/macron marks a long
                pronunciation.) When false, a fixed set of combining marks is
                removed from the result. Defaults to True.
            character_translation_table (Optional[Dict[int, str]], optional):
                Maps the unicode code point of a single character to its
                Latin equivalent. Defaults to None.
            n_gram_map (Optional[Dict[str, str]], optional): Maps bigrams,
                trigrams or longer sequences to their Latin equivalent. Keys
                are expected to be regular expressions; both key and value
                are handed to ``re.sub``. Ignored unless it is a ``dict``.
                Defaults to None.

        Returns:
            str: The transliterated text.
        """

        # Stage 1: multi-character sequences.
        if isinstance(n_gram_map, dict):
            operator_pattern = re.compile(r"[\+\*\?\|\[\]\{\}\^\$<=\!\(\)]|(\\[bdwWs])")

            def literal_length(pattern: str) -> int:
                # Length of the pattern once regex operator syntax is removed.
                return len(operator_pattern.sub("", pattern))

            # Longest patterns go first so that a shorter pattern cannot
            # consume part of a longer match.  # ToDo: optimize
            ordered_patterns = list(n_gram_map.keys())
            ordered_patterns.sort(key=literal_length, reverse=True)
            for pattern in ordered_patterns:
                text = re.sub(pattern, n_gram_map[pattern], text)

        # Stage 2: single characters.
        unigram_table = (
            character_translation_table if character_translation_table else {}
        )
        romanized = text.translate(unigram_table)

        if add_diacritics:
            return romanized

        # The literal is written with spaces between the combining marks so it
        # stays legible; the spaces are dropped before it is used as a regex.
        combining_marks = "[ ̄ ̣ ̂ ̇ ̲ ̆ ̤ ̃ ́]".replace(" ", "")
        return re.sub(combining_marks, "", romanized)