# Source code for sign_language_translator.languages.vocab

"""load word datasets to create word maps etc."""

import json
import os
import re
from typing import Any, Dict, Iterable, List, Set, TypedDict

from sign_language_translator.config.assets import Assets
from sign_language_translator.config.settings import Settings

# Names exported by `from ... import *`.
__all__ = ["Vocab", "MappingDataset"]


# =================================== #
#    Mapping Dataset Static Typing    #
# =================================== #


class Mapping(TypedDict):
    """Static type of a single entry in the ``mapping`` list of a mapping dataset."""

    components: List[str]
    """Sign videos which, put together, make up the video identified by ``label``."""

    label: str
    """Filename of the sign video."""

    token: Dict[str, List[str]]
    """Language code -> tokens that correspond to ``label``."""

    gloss: Dict[str, List[str]]
    """Language code -> glosses that correspond to the video.

    A gloss is a word-for-word translation of the signs in the video.
    """

    translation: Dict[str, List[str]]
    """Language code -> translations that correspond to the video.

    A translation is a grammatically correct text that has the same meaning as
    the sign sequence in the video.
    """


class MappingDataset(TypedDict):
    """Static type of one dataset object stored in a dictionary-mapping JSON file."""

    # compared against the `country` filter when word maps are built
    country: str
    # NOTE(review): presumably a free-text summary of the dataset - confirm
    description: str
    # entries linking sign labels / components to text tokens
    mapping: List[Mapping]
    # compared against the `organization` filter when word maps are built
    organization: str
    # NOTE(review): presumably the web address of the data source - confirm
    url: str
# ================= #
#    Data Reader    #
# ================= #
class Vocab:
    """Loads text datasets for a specific language, country and organization.

    On construction, the mapping datasets and the text preprocessing data found
    in `data_root_dir` are read and converted into the lookup attributes listed
    below.

    Note:
        Our mapping datasets will only be downloaded automatically if
        `Settings.AUTO_DOWNLOAD` is enabled and the `data_root_dir` arg is the
        same as `Assets.ROOT_DIR`.

    Args:
        language (str, optional): Language code, or a regex matching it, whose
            tokens and preprocessing data are loaded. The default pattern
            matches nothing.
        country (str, optional): Country code, or a regex matching it, used to
            filter the mapping datasets. It is also used (always as a regex,
            with trailing "$" removed) to select the mapping files by name.
            The default pattern matches nothing.
        organization (str, optional): Organization code, or a regex matching
            it, used to filter the mapping datasets. The default pattern
            matches nothing.
        part_number (str, optional): Part number, or a regex matching it, used
            to filter the sign labels. The default pattern matches nothing.
        data_root_dir (str, optional): Directory containing the dataset JSON
            files. Defaults to `Assets.ROOT_DIR`.
        arg_is_regex (bool, optional): If True, `language`, `country`,
            `organization` and `part_number` are regular expressions applied
            with `re.match` (anchored at the start only). If False, they are
            compared for equality. Defaults to True.
        word_sense_regex (str, optional): Pattern removed from text by
            `remove_word_sense`. The default matches a parenthesised group that
            contains no nested parentheses.

    Attributes:
        word_to_labels (Dict[str, List[List[str]]]): Maps each text token to
            the sign label sequences that can represent it.
        supported_tokens (Set[str]): The keys of `word_to_labels`.
        ambiguous_to_unambiguous (Dict[str, List[str]]): Maps a token stripped
            of its word sense to the supported tokens that carry a word sense.
        labels (Set[str]): Every label occurring in `word_to_labels`.
        person_names, words_to_numbers, misspelled_to_correct,
        number_suffix_to_zeros, joint_word_to_split_words: Values read from the
            matching-language section of "text-preprocessing.json".
        numeric_keys (Set[str]): Keys of `word_to_labels` for which
            `str.isnumeric()` is True.
    """

    def __init__(
        self,
        language: str = r".^",
        country: str = r".^",
        organization: str = r".^",
        part_number: str = r".^",
        data_root_dir: str = Assets.ROOT_DIR,
        arg_is_regex: bool = True,
        word_sense_regex: str = r"\([^\(\)]*\)",
    ) -> None:
        # save arguments
        self.language = language
        self.country = country
        self.organization = organization
        self.part_number = part_number
        self.data_root_dir = os.path.abspath(data_root_dir)
        self.arg_is_regex = arg_is_regex
        self.word_sense_regex = word_sense_regex

        # initialize properties with defaults
        self.word_to_labels: Dict[str, List[List[str]]] = {}
        self.supported_tokens: Set[str] = set()
        self.ambiguous_to_unambiguous: Dict[str, List[str]] = {}
        self.labels: Set[str] = set()
        self.person_names: List[str] = []
        self.words_to_numbers: Dict[str, int] = {}
        self.misspelled_to_correct: Dict[str, str] = {}
        self.number_suffix_to_zeros: Dict[str, str] = {}
        self.joint_word_to_split_words: Dict[str, str] = {}
        self.numeric_keys: Set[str] = set()

        # load data (the preprocessing step reads `word_to_labels`, so the
        # mapping datasets must be loaded first)
        self.__load_mapping_datasets()
        self.__load_preprocessing()

    def remove_word_sense(self, text: str) -> str:
        """Remove the word sense or disambiguation information from given text.

        Every match of `self.word_sense_regex` is deleted from the text.

        Args:
            text (str): The text from which the word sense needs to be removed.

        Returns:
            str: The text without the word sense or disambiguation information.

        Example:

        .. code-block:: python

            text = "this is a spring(metal-coil). those are glasses(water-containers)."
            without_word_sense = vocab.remove_word_sense(text)
            print(without_word_sense)
            # Output: "this is a spring. those are glasses."
        """

        return re.sub(self.word_sense_regex, "", text)

    def __load_preprocessing(self) -> None:
        """Read "text-preprocessing.json" and fill the preprocessing attributes."""

        self.__download_resource(fname := "text-preprocessing.json")
        with open(os.path.join(self.data_root_dir, fname), "r", encoding="utf-8") as f:
            raw_data: Dict[str, Dict[str, Any]] = json.load(f)

        # For every top-level key keep the data of the language matching
        # `self.language`. If several languages match, the last one wins;
        # if none matches, the key is absent and the defaults below apply.
        data: Dict[str, Any] = {
            key: lang_to_data.get(lang, self.__default_value(lang_to_data))
            for key, lang_to_data in raw_data.items()
            for lang in lang_to_data
            if self.__match(self.language, lang, self.arg_is_regex)
        }

        self.person_names = data.get("person_names", [])
        self.words_to_numbers = data.get("words_to_numbers", {})
        self.misspelled_to_correct = data.get("misspelled_to_correct", {})
        self.number_suffix_to_zeros = data.get("number_suffixes_to_zeros", {})
        self.joint_word_to_split_words = data.get("joint_word_to_split_words", {})

        # TODO: improve coverage of key.isnumeric()
        self.numeric_keys = {key for key in self.word_to_labels if key.isnumeric()}

    def __load_mapping_datasets(self) -> None:
        """Read the dictionary-mapping files and build the token/label attributes."""

        # download conditionally
        self.__download_resource(
            filename := f"{self.country.rstrip('$')}-dictionary-mapping.json"
        )

        # load existing files; sorted so that the result does not depend on the
        # arbitrary order in which the file system lists the directory
        mapping_filepaths = [
            os.path.join(self.data_root_dir, file)
            for file in sorted(os.listdir(self.data_root_dir))
            if re.match(filename, file)
        ]
        for filepath in mapping_filepaths:
            with open(filepath, "r", encoding="utf-8") as f:
                mapping_datasets = json.load(f)

            # tokens already present are overwritten by those of later files
            self.word_to_labels.update(
                self._make_word_to_labels(
                    self.language,
                    self.country,
                    self.organization,
                    self.part_number,
                    mapping_datasets,
                    # honour the constructor flag, as __load_preprocessing does
                    is_regex=self.arg_is_regex,
                )
            )

        self.supported_tokens = set(self.word_to_labels)
        self.ambiguous_to_unambiguous = self._make_disambiguation_map(
            self.supported_tokens
        )
        self.labels = {
            label
            for label_sequences in self.word_to_labels.values()
            for label_sequence in label_sequences
            for label in label_sequence
        }

    def _make_word_to_labels(
        self,
        language: str,
        country: str,
        organization: str,
        part_number: str,
        mapping_datasets: List[MappingDataset],
        is_regex: bool = True,
    ) -> Dict[str, List[List[str]]]:
        """Takes JSON word mapping datasets and creates a dictionary mapping text tokens to sign labels.

        Args:
            language (str): Language code used in JSON or a regex matching it whose data should be extracted.
            country (str): Country code used to filter the mapping datasets.
            organization (str): Organization code used to filter the mapping datasets.
            part_number (str): Part number used to filter the mapping datasets.
            mapping_datasets (List[MappingDataset]): List of mapping datasets.
            is_regex (bool, optional): Treat the provided language code, country, organization and part_number as regex when comparing against JSON keys. Defaults to True.

        Returns:
            Dict[str, List[List[str]]]: A dictionary that maps each text token to a list containing sequences of signs. Duplicate sequences are removed and the remaining ones keep the order in which they were first encountered.
        """

        word_to_labels: Dict[str, List[List[str]]] = {}
        for dataset in mapping_datasets:
            # different country or organization
            if not (
                self.__match(country, dataset["country"], is_regex)
                and self.__match(organization, dataset["organization"], is_regex)
            ):
                continue

            for mapping in dataset["mapping"]:
                # different part number
                # NOTE(review): the part number is taken as the text after the
                # last "-" in the part of the name before the first "_";
                # presumably these equal Settings.FILENAME_CONNECTOR and
                # Settings.FILENAME_SEPARATOR - confirm.
                names = ([mapping["label"]] if "label" in mapping else []) + (
                    mapping.get("components", [])
                )
                if not all(
                    self.__match(
                        part_number, name.split("_")[0].split("-")[-1], is_regex
                    )
                    for name in names
                ):
                    continue

                for lang, token_list in mapping.get("token", {}).items():
                    # different language
                    if not self.__match(language, lang, is_regex):
                        continue

                    for token in token_list:
                        # the token is registered even if no sequence is added
                        sequences = word_to_labels.setdefault(token, [])

                        if "label" in mapping:
                            sequences.append([mapping["label"]])

                        # NOTE(review): components are only used when all of
                        # them match the country & organization filters; the
                        # original author questioned this filter as well.
                        if "components" in mapping and self.__components_match(
                            mapping["components"], country, organization, is_regex
                        ):
                            sequences.append(mapping["components"])

        # Drop duplicates (when language = r".*", digits, etc. are repeated).
        # dict.fromkeys keeps first-seen order, whereas a set of string tuples
        # would yield a different order on every interpreter run.
        return {
            word: [list(seq) for seq in dict.fromkeys(map(tuple, label_seqs))]
            for word, label_seqs in word_to_labels.items()
        }

    def __components_match(
        self,
        components: List[str],
        country: str,
        organization: str,
        is_regex: bool,
    ) -> bool:
        """Tell whether every component name starts with a matching country and organization."""

        for component in components:
            # the text before the first separator, split on the connector, is
            # read as [country, organization, ...]
            parts = component.split(Settings.FILENAME_SEPARATOR)[0].split(
                Settings.FILENAME_CONNECTOR
            )
            if not (
                self.__match(country, parts[0], is_regex)
                and self.__match(organization, parts[1], is_regex)
            ):
                return False
        return True

    def _make_disambiguation_map(self, words: Iterable[str]) -> Dict[str, List[str]]:
        """create a mapping from ambiguous words to possible unambiguous words

        Args:
            words (Iterable[str]): A list of disambiguated words. A disambiguated word is a word that has a word-sense in it. e.g. ["spring(season)", "spring(bouncy-coil)"].

        Returns:
            Dict[str, List[str]]: A dictionary mapping ambiguous words to possible unambiguous words. e.g. {"spring": ["spring(season)", "spring(bouncy-coil)"]}.
        """

        ambiguous_2_unambiguous: Dict[str, List[str]] = {}
        for word in words:
            without_word_sense = self.remove_word_sense(word)
            # words that carry no word sense are not ambiguous; skip them
            if without_word_sense != word:
                ambiguous_2_unambiguous.setdefault(without_word_sense, []).append(word)

        return ambiguous_2_unambiguous

    def __match(self, text_1: str, text_2: str, text_1_is_regex: bool) -> bool:
        """Compare `text_2` with `text_1`, using `re.match` if `text_1` is a regex, else equality."""

        return bool(re.match(text_1, text_2)) if text_1_is_regex else text_1 == text_2

    def __default_value(self, dictionary: Dict) -> Any:
        """get an empty instance of dict's first value's class"""

        # an empty dictionary yields None because type(None)() is None
        first_value = next(iter(dictionary.values()), None)
        value_class = type(first_value)
        default_obj = value_class()

        return default_obj

    def __download_resource(self, file_name: str):
        """Download `file_name` into the assets directory if it is missing and auto-download applies."""

        if (
            Settings.AUTO_DOWNLOAD
            and not os.path.exists(os.path.join(self.data_root_dir, file_name))
            # self.data_root_dir was normalised with abspath in __init__, so
            # normalise the other side too before comparing
            and self.data_root_dir == os.path.abspath(Assets.ROOT_DIR)
        ):
            Assets.download(file_name, overwrite=False)