Source code for sign_language_translator.models.language_models.ngram_language_model

"""This module provides a simple n-gram-based statistical language model implementation.

Classes:
- NgramLanguageModel: A simple n-gram-based statistical language model.
"""

from __future__ import annotations

import json
from collections import Counter
from copy import deepcopy
from os.path import exists
from typing import Any, Dict, Iterable, List, Tuple

from sign_language_translator.models.language_models.abstract_language_model import (
    LanguageModel,
)
from sign_language_translator.text.utils import make_ngrams
from sign_language_translator.utils import sample_one_index


class NgramLanguageModel(LanguageModel):
    """NgramLanguageModel is a statistical language model based on n-grams.

    It provides functionality for training the model on a given training
    corpus, generating the next token based on a context, and saving/loading
    the model.

    Attributes:
        - window_size (int): The size of the context window for predicting the
          next token.
        - unknown_token (str): The token representation used for unknown or
          out-of-vocabulary tokens.
        - sampling_temperature (float): A temperature parameter controlling the
          sampling probabilities during token generation.
        - name (str): The name of the language model object (optional).
        - n_parameters (int): Number of stored weights, refreshed by fit(),
          finetune() and load().
        - model (Dict[Tuple, Dict[str, List]]): Maps a context tuple to two
          parallel lists: the possible next tokens and their weights.

    Methods:
        - train(self, training_corpus): Alias for the fit() method.
        - fit(self, training_corpus): Trains the language model on the given
          training corpus.
        - finetune(self, training_corpus, weightage: float): Fine-tunes the
          language model on an additional training corpus with a specified
          weightage.
        - next(self, context: Iterable) -> Tuple[Any, float]: Samples the next
          token from the learned distribution based on the given context.
        - next_all(self, context: Iterable) -> Tuple[List[Any], List[float]]:
          Returns a list of possible next tokens and their associated
          probabilities based on the given context.
        - load(model_path: str) -> NgramLanguageModel: Deserializes the model
          from a JSON file.
        - save(self, model_path: str, indent=None, ensure_ascii=False,
          overwrite=False): Serializes the model to a JSON file.
        - __str__(self) -> str: Returns a string representation of the
          NgramLanguageModel instance.

    Private Methods:
        - _to_key_datatype(self, item: Iterable) -> Tuple: Converts an iterable
          item to the appropriate datatype for use as a key in the model
          dictionary.
        - _count_ngrams(self, training_corpus, n: int) -> Dict[Tuple, int]:
          Counts the occurrences of n-grams in the training corpus.
        - _group_by_context(self, counts: Dict[Tuple, int]): Groups the n-grams
          by context and collects the weights for each next token.
        - _count_parameters(self): Counts the total number of
          weights/probabilities in the model.
    """

    def __init__(
        self,
        window_size: int = 1,
        unknown_token: str = "<unk>",
        sampling_temperature: float = 1.0,
        name=None,
    ) -> None:
        """Create an untrained model.

        Args:
            window_size (int): Number of preceding tokens used as context.
            unknown_token (str): Token returned when a context was never seen.
            sampling_temperature (float): Temperature forwarded to the sampler
                in next().
            name (Optional[str]): Optional name, forwarded to the base class.
        """
        super().__init__(unknown_token=unknown_token, name=name)
        self.window_size = window_size
        self.n_parameters = 0
        self.sampling_temperature = sampling_temperature
        self.model: Dict[Tuple, Dict[str, List]] = {}

        # Keys of the per-context dictionaries and the separator used to turn
        # a context tuple into a single JSON object key in save()/load().
        self._NEXT_TOKEN = "NEXT_TOKEN"
        self._WEIGHTS = "WEIGHTS"
        self._sep = "|||"

    def train(self, training_corpus):
        """Alias for fit(). Trains the language model on the given training corpus.

        Args:
            training_corpus (Iterable[Iterable]): The training corpus, an
                iterable of sequences representing the text data.

        Returns:
            None
        """
        self.fit(training_corpus)

    def fit(self, training_corpus) -> None:
        """Trains the language model on the given training corpus.

        Any previously learned model is replaced.

        Args:
            training_corpus (Iterable[Iterable]): The training corpus, an
                iterable of sequences representing the text data.

        Returns:
            None
        """
        ngram_counts = self._count_ngrams(training_corpus, self.window_size + 1)
        model = self._group_by_context(ngram_counts)

        # normalize frequencies into probabilities (per context)
        for entry in model.values():
            total = sum(entry[self._WEIGHTS])
            entry[self._WEIGHTS] = [freq / total for freq in entry[self._WEIGHTS]]

        self.model = model
        self.n_parameters = self._count_parameters()

    def finetune(self, training_corpus, weightage: float) -> None:
        """Fine-tunes the language model on an additional training corpus.

        A fresh model is fitted on ``training_corpus`` and merged with the
        existing one. For a context present in both models, each next-token
        weight becomes ``new * weightage + old * (1 - weightage)``. A context
        present in only one of the two models keeps its distribution unchanged.

        Args:
            training_corpus (Iterable[Iterable]): The additional training
                corpus, an iterable of sequences representing the text data.
            weightage (float): The weightage for the additional training
                corpus, a value between 0.0 and 1.0 (inclusive). For shared
                contexts, 0.0 keeps the old weights and 1.0 keeps only the
                weights learned from the additional corpus.

        Returns:
            None

        Raises:
            AssertionError: If the weightage is outside the valid range [0.0, 1.0].
        """
        assert 0.0 <= weightage <= 1.0, "provide 0.0 <= weightage <= 1.0"

        old_model = deepcopy(self.model)
        self.fit(training_corpus)

        for context, old_entry in old_model.items():
            if context not in self.model:
                # context unseen in the new corpus: carry it over untouched
                self.model[context] = old_entry
                continue

            new_entry = self.model[context]
            old_weights = dict(
                zip(old_entry[self._NEXT_TOKEN], old_entry[self._WEIGHTS])
            )

            # integrate weights of existing next_tokens
            for i, next_token in enumerate(new_entry[self._NEXT_TOKEN]):
                old_w = old_weights.pop(next_token, 0.0)
                new_entry[self._WEIGHTS][i] *= weightage
                new_entry[self._WEIGHTS][i] += old_w * (1 - weightage)

            # append weights of remaining next_tokens (only in the old model)
            for next_token, old_w in old_weights.items():
                new_entry[self._NEXT_TOKEN].append(next_token)
                new_entry[self._WEIGHTS].append(old_w * (1 - weightage))

        self.n_parameters = self._count_parameters()

    def next(self, context: Iterable) -> Tuple[Any, float]:
        """Samples the next token from the learned distribution for the context.

        Args:
            context (Iterable): The preceding tokens; see next_all() for how
                they are trimmed to the window.

        Returns:
            Tuple[Any, float]: The chosen token and the probability stored for
                it. The index is picked by ``sample_one_index`` using
                ``self.sampling_temperature``.
        """
        next_tokens, probabilities = self.next_all(context)
        index = sample_one_index(probabilities, temperature=self.sampling_temperature)
        return (next_tokens[index], probabilities[index])

    def next_all(self, context: Iterable) -> Tuple[List[Any], List[float]]:
        """Returns all possible next tokens and their probabilities.

        Args:
            context (Iterable): The preceding tokens. It must support ``len()``
                and slicing; only the trailing ``window_size`` items are used.

        Returns:
            Tuple[List[Any], List[float]]: Parallel lists of next tokens and
                their weights. For a context that is not in the model this is
                ``([unknown_token], [1.0])``. For a known context the model's
                own lists are returned (not copies).
        """
        # `len(context) - window_size` rather than `-window_size` so that a
        # window of 0 yields an empty context instead of the whole sequence.
        context = self._to_key_datatype(context[len(context) - self.window_size :])  # type: ignore

        if context not in self.model:
            next_tokens, probabilities = [self.unknown_token], [1.0]
        else:
            next_tokens: List = self.model[context][self._NEXT_TOKEN]
            probabilities: List[float] = self.model[context][self._WEIGHTS]

        return next_tokens, probabilities

    def _to_key_datatype(self, item: Iterable) -> Tuple:
        """Converts an iterable item to the appropriate datatype for use as a key in the model dictionary."""
        return tuple(item)

    def _count_ngrams(
        self, training_corpus: Iterable[Iterable], n: int
    ) -> Dict[Tuple, int]:
        """Counts the occurrences of n-grams in the training corpus."""
        # Feed the Counter lazily; no intermediate list of every n-gram.
        return dict(
            Counter(
                self._to_key_datatype(ngram)
                for sequence in training_corpus
                for ngram in make_ngrams(sequence, n)
            )
        )

    def _group_by_context(self, counts: Dict[Tuple, int]):
        """Collects all the n-grams that are the same except the ending part.

        Returns a dict mapping each context (the n-gram without its last item)
        to parallel lists of next tokens and their frequencies.
        """
        grouped = {}
        for ngram, freq in counts.items():
            context, next_token = ngram[:-1], ngram[-1]
            entry = grouped.setdefault(
                context, {self._NEXT_TOKEN: [], self._WEIGHTS: []}
            )
            entry[self._NEXT_TOKEN].append(next_token)
            entry[self._WEIGHTS].append(freq)
        return grouped

    def _count_parameters(self):
        """Counts the total number of weights/probabilities in the model."""
        return sum(len(v[self._WEIGHTS]) for v in self.model.values())

    @staticmethod
    def load(model_path: str) -> NgramLanguageModel:
        """Deserializes the model (from JSON).

        Args:
            model_path (str): The source file path.

        Returns:
            NgramLanguageModel: The deserialized NgramLanguageModel instance.
        """
        with open(model_path, "r", encoding="utf-8") as f:
            model_data: Dict[str, Any] = json.load(f)

        window_size = int(model_data["window_size"])
        _sep = str(model_data["sep"])

        # A model saved with name=None is stored as JSON null; keep it None
        # instead of turning it into the literal string "None".
        raw_name = model_data["name"]
        name = None if raw_name is None else str(raw_name)

        slm = NgramLanguageModel(
            window_size=window_size,
            unknown_token=str(model_data["unknown_token"]),
            name=name,
            sampling_temperature=float(model_data["sampling_temperature"]),
        )
        slm._sep = _sep
        slm._NEXT_TOKEN = str(model_data["NEXT_TOKEN"])
        slm._WEIGHTS = str(model_data["WEIGHTS"])

        # With window_size == 0 the empty context () is saved as "", but
        # "".split(sep) gives [""]; map it back to the empty tuple.
        slm.model = {
            slm._to_key_datatype(
                context.split(_sep) if (context or window_size) else ()
            ): next_token_and_weights
            for context, next_token_and_weights in model_data["model"].items()
        }

        # Keep the reported size in sync with the loaded weights.
        slm.n_parameters = slm._count_parameters()
        return slm

    def save(
        self, model_path: str, indent=None, ensure_ascii=False, overwrite=False
    ) -> None:
        """Serializes the model (as JSON).

        Context tuples are joined with the internal separator to form JSON
        object keys, so context tokens must be strings.

        Args:
            model_path (str): The target file path.
            indent (Optional[int]): The indentation level for formatting the
                JSON data (optional).
            ensure_ascii (bool): Controls whether non-ASCII characters are
                escaped (optional).
            overwrite (bool): If False, raises FileExistsError if a file
                already exists at model_path. If True, that file is replaced.
                Defaults to False.

        Raises:
            FileExistsError: If model_path exists and overwrite is False.
        """
        if exists(model_path) and not overwrite:
            raise FileExistsError(f"there is already a file at {model_path = }")

        model_data = {
            "window_size": self.window_size,
            "unknown_token": self.unknown_token,
            "name": self.name,
            "NEXT_TOKEN": self._NEXT_TOKEN,
            "WEIGHTS": self._WEIGHTS,
            "sep": self._sep,
            "sampling_temperature": self.sampling_temperature,
            "model": {self._sep.join(k): v for k, v in self.model.items()},
        }

        with open(model_path, "w", encoding="utf-8") as f:
            json.dump(model_data, f, indent=indent, ensure_ascii=ensure_ascii)

    def __str__(self) -> str:
        """Returns a short description including window size and parameter count."""
        return f"Ngram LM: {super().__str__()}, window={self.window_size}, params={self.n_parameters}"