Source code for sign_language_translator.languages.text.hindi

__all__ = [
    "Hindi",
]

import re
from string import ascii_uppercase, digits
from typing import Any, Dict, Iterable, List, Set, Tuple, Union

from sign_language_translator.config.assets import Assets
from sign_language_translator.config.enums import TextLanguages
from sign_language_translator.languages.text.text_language import TextLanguage
from sign_language_translator.languages.vocab import Vocab
from sign_language_translator.text.preprocess import remove_space_before_punctuation
from sign_language_translator.text.tagger import Rule, Tagger, Tags
from sign_language_translator.text.tokenizer import SignTokenizer


[docs] class Hindi(TextLanguage): """NLP class for Hindi text. Extends `slt.languages.text.TextLanguage` class. Hindi is an Indo-Aryan language spoken mostly in India. Hindi uses the Devanagari script, which consists of 11 vowels and 33 consonants and is written from left to right. See unicode details at: https://unicode.org/charts/PDF/U0900.pdf """
[docs] @staticmethod def name() -> str: return TextLanguages.HINDI.value
[docs] @classmethod def token_regex(cls) -> str: return f"({cls.NUMBER_REGEX}|{cls.WORD_REGEX})"
[docs] @classmethod def allowed_characters(cls) -> Set[str]: return cls.ALLOWED_CHARACTERS
def __init__(self) -> None: # TODO: args to filter dataset self.vocab = self.__get_vocab() self.tokenizer = self.__get_tokenizer() self.tagging_rules = self.__get_tagging_rules() self.tagger = Tagger(rules=self.tagging_rules, default=Tags.DEFAULT) self.omitted_tokens = {"", " ", "\t"}
[docs] def preprocess(self, text: str) -> str: text = self.normalize_characters(text) text = self.delete_unallowed_characters(text) text = re.sub(r"[^\S\n]{2,}", " ", text) text = remove_space_before_punctuation(text, self.PUNCTUATION) text = text.strip() return text
[docs] def tokenize(self, text: str) -> List[str]: tokens = self.tokenizer.tokenize( text, join_compound_words=True, join_word_sense=True ) tokens = [token for token in tokens if token not in self.omitted_tokens] return tokens
[docs] def sentence_tokenize(self, text: str) -> List[str]: sentences = self.tokenizer.sentence_tokenize(text) sentences = [ sentence.strip() for sentence in sentences if sentence not in self.omitted_tokens ] return sentences
[docs] def detokenize(self, tokens: Iterable[str]) -> str: text = " ".join(tokens) text = remove_space_before_punctuation(text, self.PUNCTUATION) return text
[docs] def tag(self, tokens: Union[str, Iterable[str]]) -> List[Tuple[str, Any]]: if isinstance(tokens, str): tokens = [tokens] tagged = self.tagger.tag(tokens) return tagged
[docs] def get_tags(self, tokens: Union[str, Iterable[str]]) -> List[Any]: if isinstance(tokens, str): tokens = [tokens] tags = self.tagger.get_tags(tokens) return tags
[docs] def get_word_senses(self, tokens: Union[str, Iterable[str]]) -> List[List[str]]: if isinstance(tokens, str): tokens = [tokens] word_senses = [ self.vocab.ambiguous_to_unambiguous.get(token.lower(), []) for token in tokens ] return word_senses
[docs] def romanize(self, text: str, *args, add_diacritics=True, **kwargs) -> str: """Map Hindi characters to phonetically similar characters of the English language. Transliteration is useful for readability. ALA-LC Romanization Table: https://www.loc.gov/catdir/cpso/romanization/hindi.pdf Args: text (str): Hindi text to be mapped to Latin script. add_diacritics (bool, optional): Whether to use diacritics over English characters to help pronunciation. Defaults to True. Examples: .. code-block:: python import sign_language_translator as slt nlp = slt.languages.text.Hindi() text = "मैंने किताब खरीदी है।" romanized_text = nlp.romanize(text) print(romanized_text) # 'mainne kitab khrīdī hai.' text = "ईशांत शर्मा को उनकी शानदार गेंदबाजी के लिए १ प्लेयर ऑफ द मैच का अवॉर्ड दिया गया।" text = nlp.preprocess(text) romanized_text = nlp.romanize(text) print(romanized_text) # 'īshant shrma ko unkī shandar gendbajī ke lie 1 pleyr ôph d maich ka avôrḍ diya gya.' """ text = super().romanize( text, *args, add_diacritics=add_diacritics, character_translation_table=self.ROMANIZATION_CHARACTER_TRANSLATOR, n_gram_map=self.NGRAM_ROMANIZATION_MAP, **kwargs, ) return text
# ================== # # Characters # # ================== # UNICODE_RANGE: Tuple[int, int] = (2304, 2431) # 0x0900 - 0x097F FULL_STOPS: List[str] = [".", "।", "॥"] QUESTION_MARKS: List[str] = ["?"] ACRONYM_PERIODS: List[str] = ["॰"] END_OF_SENTENCE_MARKS: List[str] = FULL_STOPS + QUESTION_MARKS + ["!"] PUNCTUATION: List[str] = END_OF_SENTENCE_MARKS + ACRONYM_PERIODS + list(",;:") BRACKETS: List[str] = ["(", ")", "[", "]", "{", "}"] SYMBOLS: List[str] = PUNCTUATION + BRACKETS + list("-_/") CHARACTERS: List[str] = str( """ ऀ ँ ं ः ऄ अ आ इ ई उ ऊ ऋ ऌ ऍ ऎ ए ऐ ऑ ऒ ओ औ क ख ग घ ङ च छ ज झ ञ ट ठ ड ढ ण त थ द ध न ऩ प फ ब भ म य र ऱ ल ळ ऴ व श ष स ह ऺ ऻ ़ ऽ ा ि ी ु ू ृ ॄ ॅ ॆ े ै ॉ ॊ ो ौ ् ॎ ॏ ॐ ॑ ॒ ॓ ॔ ॕ ॖ ॗ क़ ख़ ग़ ज़ ड़ ढ़ फ़ य़ ॠ ॡ ॢ ॣ । ॥ ० १ २ ३ ४ ५ ६ ७ ८ ९ ॰ ॱ ॲ ॳ ॴ ॵ ॶ ॷ ॸ ॹ ॺ ॻ ॼ ॽ ॾ ॿ """ ).split() DIACRITICS = str("ऀ ँ ं ः ॄ ॅ ़ ा ि ी ु ू ृ े ै ॉ ो ौ ्").split() ALLOWED_CHARACTERS: Set[str] = ( set(CHARACTERS) | set(DIACRITICS) | set(SYMBOLS) | set(ascii_uppercase) | set(digits) | set("()!.,?/[]{}<> \n") ) CHARACTER_TO_DECOMPOSED: Dict[str, str] = { "क़": "क़", "ख़": "ख़", "ग़": "ग़", "ज़": "ज़", "ड़": "ड़", "ढ़": "ढ़", "फ़": "फ़", "य़": "य़", } CHARACTER_TRANSLATOR = {ord(c): d for c, d in CHARACTER_TO_DECOMPOSED.items()} # ========== # # Regex # # ========== # NUMBER_REGEX = r"\d+(?:[\.:]\d+)*" WORD_REGEX = r"[^\W_\d]([^\W_\d]|[" + "".join(DIACRITICS) + r"])*" UNALLOWED_CHARACTERS_REGEX = ( "[^" + "".join(map(re.escape, ALLOWED_CHARACTERS)) + "]" ) # ====================== # # Helper Functions # # ====================== #
[docs] def delete_unallowed_characters(self, text: str) -> str: text = re.sub(self.UNALLOWED_CHARACTERS_REGEX, " ", text) return text
[docs] def normalize_characters(self, text: str) -> str: text = text.translate(self.CHARACTER_TRANSLATOR) return text
# ================ # # initialize # # ================ # def __get_vocab(self): vocab = Vocab( language=f"^{self.name()}$", # r"^hi$" country=r"[a-z]+", organization=r"[a-z]+", part_number=r"[0-9]+", data_root_dir=Assets.ROOT_DIR, arg_is_regex=True, ) return vocab def __get_tokenizer(self): tokenizer = SignTokenizer( word_regex=self.token_regex(), compound_words=( self.vocab.supported_tokens | set(self.vocab.words_to_numbers.keys()) | set(self.vocab.person_names) ), # TODO: | one-hundred twenty-three (\d[ \d]*): ["100", "23"] --> ["123"] end_of_sentence_tokens=self.END_OF_SENTENCE_MARKS, acronym_periods=self.ACRONYM_PERIODS+["."], # spelled out english letters (acronyms) non_sentence_end_words=[ "बी", # B "सी", # C "एफ", # F "एच", # H "जे", # J "एल", # L "एम", # M "एन", # N "एस", # S "डब्ल्यू", # W "एक्स", # X ], tokenized_word_sense_pattern=[self.WORD_REGEX, r"\(", [r"नाम"], r"\)"], ) return tokenizer def __get_tagging_rules(self): punctuation_set = set(self.PUNCTUATION) tagging_rules = [ # e.g. " " Rule.from_pattern(r"^\s+$", Tags.SPACE, 5), # e.g. "," "." Rule(lambda token: token in punctuation_set, Tags.PUNCTUATION, 5), # e.g. "word" Rule.from_pattern("^" + self.WORD_REGEX + "$", Tags.WORD, 5), # e.g. COVID Rule.from_pattern(r"^[A-Z]{2,8}$", Tags.ACRONYM, 4), # e.g. 2002-02-20 Rule.from_pattern(r"^\d{4}-\d{2}-\d{2}$", Tags.DATE, 4), # e.g. 09:30:25.333 Rule.from_pattern(r"^\d+(?::\d+)?(?::\d+(?:\.\d+)?)$", Tags.TIME, 4), # e.g. John, Doe(name) Rule( lambda token: token in self.vocab.person_names or token.endswith("(नाम)"), Tags.NAME, 2, ), # e.g. Cow, airplane, 1 Rule( lambda token: (token.lower() in self.vocab.supported_tokens), Tags.SUPPORTED_WORD, 3, ), # e.g. forty-five, 45 Rule( lambda token: ( bool(re.match(r"^\d+(?:\.\d+)?$", token)) or token in self.vocab.words_to_numbers ), Tags.NUMBER, 4, ), # e.g. "सोना" -> ["सोना(gold)", "सोना(sleep)"] Rule( lambda token: token.lower() in self.vocab.ambiguous_to_unambiguous, Tags.AMBIGUOUS, 2, ), ] return tagging_rules # ================== # # Romanization # # ================== # # https://www.loc.gov/catdir/cpso/romanization/hindi.pdf # https://www.loc.gov/catdir/cpso/romanization/hindi-1997.pdf ROMANIZATION_MAP_VOWELS_AND_DIPHTHONGS = { "अ": "a", "आ": "ā", "ा": "a", # अा "इ": "i", "ि": "i", # अि "ई": "ī", "ी": "ī", # अी "उ": "u", "ु": "u", # अु "ऊ": "ū", "ू": "ū", # अू "ऋ": "r", "ृ": "r", # अृ "ॠ": "r̄", "ॄ": "r̄", # अॄ "ऌ": "l", "ऄ": "ĕ", "ॆ": "ĕ", # अॆ "ए": "e", "े": "e", # अे "ॲ": "ê", "ॅ": "ê", # अॅ # "": "ăi", # "": "ăi", "ऐ": "ai", "ै": "ai", # अै "ऒ": "ŏ", "ॊ": "ŏ", # ऒ "ओ": "o", # ! "आे": "o", "ो": "o", # ओ "ऑ": "ô", "ॉ": "ô", # ऑ "औ": "au", "ौ": "au", # औ "ऎ": "ĕ", # ! not in PDF } ROMANIZATION_MAP_CONSONANTS_GUTTURALS = { "क": "k", "क़": "q", "ख": "kh", "ख़": "k̲h̲", "ग": "g", "ग़": "g̲h̲", "घ": "gh", "घ़": "g̲̲h̲̲", "ङ": "ngh", # ? different from PDF } ROMANIZATION_MAP_CONSONANTS_PALATAS = { "च": "ch", # ? different from PDF "छ": "chh", # ? different from PDF "ज": "j", "ज़": "z", "झ": "jh", "ञ": "ñ", } ROMANIZATION_MAP_CONSONANTS_CEREBRALS = { "ट": "ṭ", "ट़": "t̤", "ठ": "ṭh", "ड": "ḍ", "ड़": "ṛ", "ढ": "ḍh", "ढ़": "ṛh", "ण": "ṇ", } ROMANIZATION_MAP_CONSONANTS_DENTALS = { "त": "t", "थ": "th", "द": "d", "ध": "dh", "न": "n", } ROMANIZATION_MAP_CONSONANTS_LABIALS = { "प": "p", "फ": "ph", "फ़": "f", "ब": "b", "भ": "bh", "म": "m", } ROMANIZATION_MAP_CONSONANTS_SEMIVOWELS = { "य": "y", "र": "r", "ल": "l", "व": "v", } ROMANIZATION_MAP_CONSONANTS_SIBILANTS = { "श": "sh", # ? different from PDF "ष": "s", # ? different from PDF "स": "s", "स़": "s̤", } ROMANIZATION_MAP_CONSONANTS_ASPIRATE = { "ह": "h", "ह़": "h̤", } ROMANIZATION_MAP = { **ROMANIZATION_MAP_VOWELS_AND_DIPHTHONGS, **ROMANIZATION_MAP_CONSONANTS_GUTTURALS, **ROMANIZATION_MAP_CONSONANTS_PALATAS, **ROMANIZATION_MAP_CONSONANTS_CEREBRALS, **ROMANIZATION_MAP_CONSONANTS_DENTALS, **ROMANIZATION_MAP_CONSONANTS_LABIALS, **ROMANIZATION_MAP_CONSONANTS_SEMIVOWELS, **ROMANIZATION_MAP_CONSONANTS_SIBILANTS, **ROMANIZATION_MAP_CONSONANTS_ASPIRATE, # === Diacritics === # "ं": "n", "ँ": "m̐", "ः": "ḥ", "्": "", "ऽ": "'", # === Numbers === # "०": "0", "१": "1", "२": "2", "३": "3", "४": "4", "५": "5", "६": "6", "७": "7", "८": "8", "९": "9", # === Punctuation === # "।": ".", "॥": ".", "॰": ".", } ROMANIZATION_CHARACTER_TRANSLATOR = { ord(h): r for h, r in ROMANIZATION_MAP.items() if len(h) == 1 } NGRAM_ROMANIZATION_MAP = { **{ng: r for ng, r in ROMANIZATION_MAP.items() if len(ng) > 1}, r"(?<=" + "|".join(ROMANIZATION_MAP_CONSONANTS_LABIALS) + ")ं": "m", r"(?<=" + "|".join( (2 - len(c)) * "." + c for c in sorted( list(ROMANIZATION_MAP_CONSONANTS_GUTTURALS) + list(ROMANIZATION_MAP_CONSONANTS_PALATAS) + list(ROMANIZATION_MAP_CONSONANTS_CEREBRALS) + list(ROMANIZATION_MAP_CONSONANTS_DENTALS) ) ) + ")ँ": "n", }