# Source code for sign_language_translator.text.utils

"""
Utility Functions for Text Processing

This module contains utility functions for text processing tasks.

Functions:
    make_ngrams: Creates n-grams from a given sequence.
    extract_supported_subsequences_indexes: Extracts the indexes of subsequences based on provided tags and skipped items.
    extract_supported_subsequences: Extracts subsequences from a given sequence based on provided tags and skipped items.
    concatenate_sentence_terminals: Concatenates start and end of sentence tokens to a list of sentences.

Classes:
    ListRegex: A utility class for finding sub-lists within a list of strings that match specified patterns.
"""

import re
from typing import Any, Iterable, List, Optional, Set, Tuple, Union



[docs]
def make_ngrams(sequence: Iterable, n: int) -> List[Iterable]:
    """Return every contiguous window of length ``n`` taken from ``sequence``.

    For instance, ``make_ngrams("1234", 2)`` yields ``["12", "23", "34"]``.

    Args:
        sequence (Iterable): The source of the n-grams. It must support ``len()``
            and slicing (e.g. ``str``, ``list``, ``tuple``).
        n (int): The length of each window.

    Returns:
        List[Iterable]: The windows in left-to-right order. Each window is produced
            by slicing, so it has the same type as a slice of ``sequence``.
            The list is empty when ``sequence`` is shorter than ``n``.
    """

    # index of the last position at which a full window still fits
    last_offset = len(sequence) - n  # type: ignore
    if last_offset < 0:
        return []

    return [
        sequence[offset : offset + n]  # type: ignore
        for offset in range(last_offset + 1)
    ]




[docs]
def extract_supported_subsequences_indexes(
    sequence: Iterable[Any],
    tags: Iterable[Any],
    supported_tags: Set[Any],
    skipped_items: Set[Any],
) -> List[List[int]]:
    """Locate the maximal runs of consecutive supported items and return their positions.

    An item is "supported" when its tag is in ``supported_tags`` and the item itself
    is not in ``skipped_items``. Any other item ends the current run.

    Args:
        sequence (Iterable[Any]): The input items.
        tags (Iterable[Any]): One tag per item, paired with ``sequence`` via ``zip``
            (so the longer of the two is truncated).
        supported_tags (Set[Any]): Tags that mark an item as supported.
        skipped_items (Set[Any]): Items that are never supported, whatever their tag.

    Returns:
        List[List[int]]: One list of consecutive indexes per run, in order of appearance.

    Examples:

    .. code-block:: python

        sequence = [1, 2, 3, 4, 5, 6]
        tags = ['A', 'A', 'B', 'A', 'A', 'C']
        supported_tags = {'A'}
        skipped_items = {2}
        extract_supported_subsequences_indexes(sequence, tags, supported_tags, skipped_items)
        # [[0], [3, 4]]
    """

    runs: List[List[int]] = []
    run_is_open = False

    for position, (item, label) in enumerate(zip(sequence, tags)):
        # an unsupported item closes whatever run was in progress
        if (label not in supported_tags) or (item in skipped_items):
            run_is_open = False
            continue

        # first supported item after a gap starts a fresh run
        if not run_is_open:
            runs.append([])
            run_is_open = True
        runs[-1].append(position)

    return runs




[docs]
def extract_supported_subsequences(
    sequence: Iterable[Any],
    tags: Iterable[Any],
    supported_tags: Set[Any],
    skipped_items: Set[Any],
) -> List[List[Any]]:
    """Return the maximal runs of consecutive supported items from ``sequence``.

    The positions are computed by ``extract_supported_subsequences_indexes`` and
    then used to index into ``sequence``, so ``sequence`` must support integer
    indexing.

    Args:
        sequence (Iterable[Any]): The input items.
        tags (Iterable[Any]): Tags corresponding to each item in the sequence.
        supported_tags (Set[Any]): Tags that mark an item as supported.
        skipped_items (Set[Any]): Items that are excluded regardless of their tag.

    Returns:
        List[List[Any]]: One list of items per run, in order of appearance.

    Examples:

    .. code-block:: python

        sequence = [1, 2, 3, 4, 5, 6]
        tags = ['A', 'A', 'B', 'A', 'A', 'C']
        supported_tags = {'A'}
        skipped_items = {2}
        extract_supported_subsequences(sequence, tags, supported_tags, skipped_items)
        # [[1], [4, 5]]
    """

    index_groups = extract_supported_subsequences_indexes(
        sequence=sequence,
        tags=tags,
        supported_tags=supported_tags,
        skipped_items=skipped_items,
    )

    # translate each group of positions back into the items they point at
    picked = []
    for group in index_groups:
        picked.append([sequence[position] for position in group])  # type: ignore

    return picked




[docs]
def concatenate_sentence_terminals(sentences: List, start_token, end_token):
    """
    Attach sentence-boundary tokens at the joints between consecutive sentences
    (useful when the input is coming from a sentence tokenizer).

    Every sentence except the first gets ``start_token`` prepended and every
    sentence except the last gets ``end_token`` appended. The outer edges of the
    list (start of the first sentence, end of the last) are left untouched.

    Parameters:
        sentences (List): The sentences to process.
            A sentence can be a string, a list of tokens or any other type that supports the + operator.
        start_token: The token prepended to sentences. Must be the same type as a sentence.
        end_token: The token appended to sentences. Must be the same type as a sentence.

    Returns:
        List: A new list of sentences with start and end tokens inserted.

    Example:

    .. code-block:: python

        sentences = ["Hello!", "How are you?", "Goodbye."]
        start_token = "<start>"
        end_token = "<end>"
        result = concatenate_sentence_terminals(sentences, start_token, end_token)
        # Output: ["Hello!<end>", "<start>How are you?<end>", "<start>Goodbye."]
    """

    final_position = len(sentences) - 1

    joined = []
    for position, sentence in enumerate(sentences):
        opened = sentence if position == 0 else start_token + sentence
        closed = opened if position == final_position else opened + end_token
        joined.append(closed)

    return joined




[docs]
class ListRegex:
    """A utility class for finding sub-lists within a list of strings that match specified patterns.

    ListRegex provides methods for
    matching patterns against items in a list,
    searching for the first occurrence of patterns,
    finding all occurrences of patterns,
    and retrieving the starting and ending indices of matches.

    Patterns can be defined using:
        1. regular expressions (str)
        2. lists of patterns (regex (str) or a nested list of patterns)
        3. tuple containing the pattern and its interval quantifier ("\\w+", (2,None)).
    When using regular expressions, each pattern is matched against an individual item in the list
    (with ``re.match``, i.e. anchored at the start of the item only).
    When using a list of patterns, any of the patterns in the list can match an item.
    When using a tuple of pattern and counts, items in the specified range can match the pattern.
    ``None`` as min count means "no lower bound" and ``None`` as max count means "no upper bound".

    Matching is greedy and does not backtrack: a quantified pattern consumes as many
    consecutive items as it can (up to its max count) before the next pattern is tried.

    Examples:

    .. code-block:: python

        items = ["apple", "banana", "orange", "orange", "grape", "melon", "orange", "kiwi"]

        # Match the patterns against the items
        patterns = ["apple", "\\w+"]
        result = ListRegex.match(items, patterns)
        # Output: (0, 2)

        # Search for the first occurrence of the patterns
        patterns = [r"ba(na){2}", ("orange", (0,3))]
        result = ListRegex.search(items, patterns)
        # Output: (1, 4)

        # Find all occurrences of the patterns
        patterns = ["orange", ["grape", "kiwi"]]
        result = ListRegex.find_all(items, patterns)
        # Output: [['orange', 'grape'], ['orange', 'kiwi']]
    """

    @staticmethod
    def match(
        items: List[str], patterns: List[Union[str, List, Tuple]]
    ) -> Optional[Tuple[int, int]]:
        """
        Matches the given patterns against the items in the list.
        Applies the patterns at the start of the list of string.

        Every pattern must be satisfied for the match to succeed: if the items run
        out while a pattern still requires at least one more item, there is no match.

        Args:
            items (List[str]): The sequence of strings to be matched.
            patterns (List[str|List|Tuple]): The patterns to be matched against the items.

        Returns:
            Tuple[int, int] or None: A tuple containing the starting and ending indices of the matched items
            (the end index is exclusive), or None if no match is found.
            An empty ``items`` or ``patterns`` list never matches.
        """

        end = ListRegex._match_from(items, patterns, 0)
        return None if end is None else (0, end)

    @staticmethod
    def _split_quantifier(pattern) -> Tuple[Union[str, List], float, float]:
        """Normalize a pattern into ``(pattern, min_count, max_count)``; unquantified patterns get (1, 1)."""

        if not isinstance(pattern, tuple):
            return pattern, 1, 1

        # interval quantifiers: *, +, {n,m}
        # TODO: recursion on tuple
        assert (
            len(pattern) == 2 and len(pattern[-1]) == 2
        ), "use proper syntax for repetition: (pattern, (min_count, max_count))"
        inner_pattern, (min_count, max_count) = pattern
        min_count = min_count if min_count is not None else float("-inf")
        max_count = max_count if max_count is not None else float("inf")
        return inner_pattern, min_count, max_count

    @staticmethod
    def _match_from(items: List[str], patterns: List, start: int) -> Optional[int]:
        """Greedily match ``patterns`` against ``items`` beginning at ``start``; return the exclusive end index or None."""

        n_items = len(items)
        # kept for backward compatibility: nothing matches on an empty tail or with no patterns
        if start >= n_items or not patterns:
            return None

        position = start
        for raw_pattern in patterns:
            pattern, min_count, max_count = ListRegex._split_quantifier(raw_pattern)

            # Regex: abc, Character set: [abc],
            # TODO: Logical OR (ab|bc)  (sequence of items OR some other sequence of items)
            count = 0
            # checking `count < max_count` first means a max count of 0 consumes nothing
            while (
                count < max_count
                and position < n_items
                and ListRegex._match_item(items[position], pattern)
            ):
                position += 1
                count += 1

            # also covers running out of items while more repetitions are required
            if count < min_count:
                return None

        return position

    @staticmethod
    def _match_item(item: str, pattern):
        """
        Matches a single item against a pattern.
        A pattern could be a string regex or a list containing regex or list of regex and so on.

        Args:
            item (str): The item to be matched.
            pattern (str|list): The pattern to be matched against the item.

        Returns:
            bool: True if the item matches the pattern, False otherwise.

        Raises:
            ValueError: If the pattern is neither a str nor a list.
        """

        if isinstance(pattern, str):
            return bool(re.match(pattern, item))
        if isinstance(pattern, list):
            return any(ListRegex._match_item(item, pat) for pat in pattern)
        raise ValueError("unknown value of pattern, provide str or list")

    @staticmethod
    def search(items: List[str], patterns) -> Optional[Tuple[int, int]]:
        """
        Searches for the first occurrence of the patterns in the list of items.

        Every start position is tried in order, because patterns with a min count of
        zero may consume no items and so a match can be shorter than ``len(patterns)``.

        Args:
            items (List[str]): The list of strings to be searched.
            patterns (List[str]): The patterns to be searched for in the items.

        Returns:
            Tuple[int, int] or None: A tuple containing the starting and ending indices of the matched items
            (the end index is exclusive), or None if no match is found.
        """

        for start in range(len(items)):
            end = ListRegex._match_from(items, patterns, start)
            if end is not None:
                return (start, end)
        return None

    @staticmethod
    def find_all(items: List[str], patterns: List) -> List[List[str]]:
        """Finds all occurrences of the patterns in the list of items.

        Args:
            items (List[str]): The list of strings to be searched.
            patterns (List[str]): The patterns to be searched for in the items.

        Returns:
            List[List[str]]: A list of matched subsequences of items.
        """

        return [
            items[start:end] for start, end in ListRegex.find_all_spans(items, patterns)
        ]

    @staticmethod
    def find_all_spans(items: List[str], patterns: List) -> List[Tuple[int, int]]:
        """Finds the starting and ending indices of all occurrences of the patterns in the list of items.

        Occurrences are non-overlapping and reported left to right: after a match,
        scanning resumes at the end of that match. A zero-length match (possible when
        every pattern is optional) is reported and scanning then advances by one item,
        so the scan always terminates.

        Args:
            items (List[str]): The list of strings to be searched.
            patterns (List[str]): The patterns to be searched for in the items.

        Returns:
            List[Tuple[int,int]]: A list of tuples containing the starting and ending indices of the matched items
            (end indices are exclusive).
        """

        spans = []
        start = 0
        n_items = len(items)
        while start < n_items:
            end = ListRegex._match_from(items, patterns, start)
            if end is None:
                start += 1
                continue

            spans.append((start, end))
            # an empty span would otherwise be found again at the same position forever
            start = end if end > start else start + 1

        return spans




# Names exported by a star-import of this module; these are the same
# functions and class that are listed in the module docstring.
__all__ = [
    "make_ngrams",
    "extract_supported_subsequences_indexes",
    "extract_supported_subsequences",
    "concatenate_sentence_terminals",
    "ListRegex",
]