Source code for sign_language_translator.text.tokenizer

import re
from typing import Dict, Iterable, List, Optional

from sign_language_translator.text.utils import ListRegex


class SignTokenizer:
    """Regex-driven tokenizer that splits text into words and single characters.

    Every match of ``word_regex`` becomes one token; every character that is
    not part of a match becomes its own single-character token, so joining
    the tokens with ``detokenize`` reproduces the input text.

    Args:
        word_regex: Regular expression whose matches are kept as whole tokens.
        compound_words: Multi-token phrases that ``tokenize`` merges back into
            a single token when ``join_compound_words`` is true.
        end_of_sentence_tokens: Tokens that mark the end of a sentence in
            ``sentence_tokenize``.
        acronym_periods: End-of-sentence tokens that do NOT end the sentence
            when they directly follow one of ``non_sentence_end_words``.
        non_sentence_end_words: Tokens after which an acronym period is not
            treated as a sentence end.
        tokenized_word_sense_pattern: Pattern handed to
            ``ListRegex.find_all_spans`` to locate token runs that
            ``tokenize`` merges when ``join_word_sense`` is true. Defaults to
            a pattern of the form ``word ( word [- word]* )``.
    """

    def __init__(
        self,
        word_regex: str = r"\w+",
        compound_words: Iterable[str] = (),
        end_of_sentence_tokens: Iterable[str] = (".", "?", "!"),
        acronym_periods=(".",),
        non_sentence_end_words: Iterable[str] = ("A", "B", "C"),
        tokenized_word_sense_pattern: Optional[List] = None,
    ):
        self.word_regex = word_regex
        # Built with self.tokenize, so word_regex must already be set.
        self._first_subword_to_full = self._make_compound_word_map(compound_words)
        self.end_of_sentence_tokens = list(set(end_of_sentence_tokens))
        # These two are membership-tested repeatedly in sentence_tokenize; a
        # one-shot iterator would be exhausted by the first test.
        self.non_sentence_end_words = self._as_reusable(non_sentence_end_words)
        self.acronym_periods = self._as_reusable(acronym_periods)
        self.tokenized_word_sense_pattern = tokenized_word_sense_pattern or [
            r"\w+",
            r"\(",
            r"\w+",
            ([r"-", r"\w+"], (0, None)),  # interval quantifier
            r"\)",
        ]

    @staticmethod
    def _as_reusable(values):
        """Return ``values`` in a form that supports repeated ``in`` tests.

        Iterators (generators, ``iter(...)`` objects) return themselves from
        ``iter()`` and can be walked only once, so they are copied into a
        tuple. Anything else is returned unchanged.
        """
        try:
            is_one_shot = iter(values) is values
        except TypeError:
            # Not iterable at all: store as given, exactly as before.
            return values
        return tuple(values) if is_one_shot else values

    def tokenize(
        self, text: str, join_compound_words: bool = True, join_word_sense: bool = False
    ) -> List[str]:
        """Split ``text`` into word tokens and single-character tokens.

        Args:
            text: The string to tokenize.
            join_compound_words: Merge token runs that spell one of the
                configured compound words into a single token.
            join_word_sense: Merge token runs matched by
                ``tokenized_word_sense_pattern`` into a single token.

        Returns:
            The list of tokens, in order of appearance.
        """
        tokens: List[str] = []
        cursor = 0
        for match in re.finditer(self.word_regex, text):
            start, end = match.span()
            # break up non-words into single characters
            tokens.extend(text[cursor:start])
            if start != end:  # skip empty matches
                tokens.append(text[start:end])
            cursor = end
        tokens.extend(text[cursor:])

        if join_compound_words:
            tokens = self._join_subwords(tokens)
        if join_word_sense:
            tokens = self._join_word_sense(tokens)

        return tokens

    def sentence_tokenize(self, text: str) -> List[str]:
        """Split ``text`` into sentences.

        A sentence ends after a run of ``end_of_sentence_tokens``; the next
        token of any other kind starts a new sentence. An end-of-sentence
        token that is also in ``acronym_periods`` and directly follows a
        token in ``non_sentence_end_words`` does not end the sentence.

        Args:
            text: The string to split.

        Returns:
            The sentences, each rebuilt with ``detokenize``. Characters
            between sentences (e.g. a space) stay at the start of the
            following sentence.
        """
        tokens = self.tokenize(text)  # todo: split on end_of_sentence_tokens directly

        sentences = []
        sentence = []
        previous_token = None
        ended = False
        for token in tokens:
            if token in self.end_of_sentence_tokens:
                ended = True
                if (
                    token in self.acronym_periods
                    and previous_token in self.non_sentence_end_words  # type: ignore
                ):
                    ended = False
            elif ended:
                # first token after the terminator(s): close the sentence
                sentences.append(self.detokenize(sentence))
                sentence = []
                ended = False
            sentence.append(token)
            previous_token = token

        if sentence:
            sentences.append(self.detokenize(sentence))

        return sentences

    def detokenize(self, tokens: Iterable[str]) -> str:
        """Concatenate ``tokens`` back into a single string."""
        return "".join(tokens)

    def _make_compound_word_map(
        self, word_list: Iterable[str]
    ) -> Dict[str, List[List[str]]]:
        """Index compound words by their first sub-token.

        Args:
            word_list: Compound words; duplicates are ignored, and entries
                that tokenize to a single token are skipped.

        Returns:
            A mapping from the first sub-token to the token sequences of all
            compounds starting with it, longest sequence first.
        """
        mapper: Dict[str, List] = {}
        for word in set(word_list):
            subwords = self.tokenize(word, join_compound_words=False)
            if len(subwords) > 1:
                mapper.setdefault(subwords[0], []).append(subwords)

        # sort descending by length so that the longest sequence gets joined first
        for first_subword in mapper:
            mapper[first_subword] = sorted(
                mapper[first_subword], key=len, reverse=True
            )

        return mapper

    def _join_subwords(self, tokens: List[str]) -> List[str]:
        """Merge token runs that spell a known compound word.

        Args:
            tokens: Tokens as produced by the basic split in ``tokenize``.

        Returns:
            A new token list in which the longest matching compound at each
            position is replaced by one detokenized token.
        """
        new_tokens = []
        i = 0
        while i < len(tokens):
            matched = None
            for compound in self._first_subword_to_full.get(tokens[i], ()):
                if tokens[i : i + len(compound)] == compound:
                    matched = compound
                    break

            if matched is not None:
                new_tokens.append(self.detokenize(matched))
                i += len(matched)
            else:
                new_tokens.append(tokens[i])
                i += 1

        return new_tokens

    def _join_word_sense(self, tokens: List[str]) -> List[str]:
        """Merge token runs matched by ``tokenized_word_sense_pattern``.

        Args:
            tokens: The token list to scan.

        Returns:
            A new token list where each span reported by
            ``ListRegex.find_all_spans`` is concatenated into one token.
        """
        # NOTE(review): assumes find_all_spans returns ordered, non-overlapping
        # (start, end) index pairs - confirm against ListRegex.
        spans = ListRegex.find_all_spans(tokens, self.tokenized_word_sense_pattern)
        # Sentinels: a zero-width "match" ending at 0 and one starting at the end.
        spans = [(None, 0)] + spans + [(len(tokens), None)]

        new_tokens = []
        for i in range(len(spans) - 1):
            prev_end = spans[i][1]
            start, end = spans[i + 1][0], spans[i + 1][1]

            # tokens between the previous match and this one are copied as-is
            new_tokens.extend(tokens[prev_end:start])
            if end is not None:
                new_tokens.append("".join(tokens[start:end]))

        return new_tokens