# Source code for sign_language_translator.text.metrics

"""text evaluation metrics
"""

from math import log2
from typing import Any, Iterable, Set



[docs]
class Perplexity:
    """
    A class for calculating the perplexity of sequences based on token frequencies in a corpus.

    Perplexity of a sequence measures how well a language model assigns probabilities to each token in the sequence.
    A lower perplexity indicates that the language model assigns higher probabilities to the tokens in the sequence,
    and therefore, the sequence is more likely to be generated by the language model.
    Conversely, a higher perplexity indicates that the language model assigns lower probabilities to the tokens in the sequence,
    and therefore, the sequence is less likely to be generated by the language model.

    The model used here is a unigram model: the probability of a token is its
    (regularized) frequency divided by the total frequency of all tokens.

    Args:
        all_tokens (set): A set of all tokens in the corpus.
        regularizing_constant (float, optional): initial frequency assigned to every token
            before any corpus is counted. Defaults to 1.0.
    """

    def __init__(self, all_tokens: Set[Any], regularizing_constant=1.0) -> None:
        # Every known token starts with the same pseudo-count.
        self.token_to_frequency = {token: regularizing_constant for token in all_tokens}
        # Use the size of the dict (not of `all_tokens`) so duplicates are counted once.
        self.total_frequency = regularizing_constant * len(self.token_to_frequency)

    def update_frequencies(self, corpus: Iterable[Iterable[Any]]):
        """Update the token frequencies based on the given corpus.

        Each occurrence of a token adds 1 to that token's frequency and to the total frequency.

        Args:
            corpus (Iterable[Iterable[Any]]): An iterable containing sequences of tokens.

        Raises:
            KeyError: If a token was not part of ``all_tokens`` given at construction.
                Tokens counted before the unknown one remain counted.
        """

        for sequence in corpus:
            for token in sequence:
                self.token_to_frequency[token] += 1
                self.total_frequency += 1

    def evaluate(self, sequence: Iterable[Any]) -> float:
        """Calculate the perplexity of a given sequence.

        The perplexity is ``2 ** (-mean(log2(p(token))))`` where ``p(token)`` is the
        token's frequency divided by the total frequency.

        Args:
            sequence (iterable): The sequence of tokens for which perplexity needs to be calculated.
                Any iterable is accepted, including generators and other one-shot iterators.

        Returns:
            float: The perplexity value for the given sequence.

        Raises:
            KeyError: If a token was not part of ``all_tokens`` given at construction.
            ZeroDivisionError: If the sequence contains no tokens.
            ValueError: If a token has zero frequency (e.g. ``regularizing_constant`` was 0
                and the token was never counted), because ``log2(0)`` is undefined.
        """

        log_probabilities = [
            log2(self.token_to_frequency[token] / self.total_frequency)
            for token in sequence
        ]
        # Divide by the number of tokens actually consumed rather than
        # `len(sequence)`, so iterables without `__len__` are supported.
        perplexity = 2 ** (-1 / len(log_probabilities) * sum(log_probabilities))

        return perplexity




# def cosine_similiarity(text1, text2):
#     # word_embedding = word2embd[word]
#     # similarities   = cosine_similarity( word_embedding.reshape(1,-1),
#     #                                     supported_words_embeddings    ).flatten()
#     # similarities.argsort()[-top_n:]  # get indexes of the top_n values in array
#     # top_n_thresh_indexes = to

#     return 1.0


# def word_error_rate(reference, candidate):
#     # pip install jiwer
#     return 1.0


# def rouge(reference, candidate):
#     # use counters
#     ref = set(reference)
#     return len(set(candidate) & ref) / len(ref)


# def bleu(reference, candidate):
#     # use counters
#     can = set(candidate)
#     return len(set(reference) & can) / len(can)


# def f1_score(reference, candidate):
#     bleu_score = bleu(reference, candidate)
#     rouge_score = rouge(reference, candidate)

#     return 2 * bleu_score * rouge_score / ((bleu_score + rouge_score) or 1)


# def normpdf(x: float, mean: float, std: float):
#     denominator = std * ((2.0 * numpy.pi) ** 0.5)
#     numerator = numpy.exp(-0.5 * ((x - mean) / std) ** 2)
#     return numerator / denominator


# Public API of this module.
__all__ = ["Perplexity"]