import re
from abc import ABC
from abc import abstractmethod
from typing import List
try:
import spacy
except ImportError as err:
raise ImportError(
'Failed to import spacy, please install manually or reinstall dianna with '
'text support: `pip install dianna[text]`') from err
[docs]
class Tokenizer(ABC):
    """Abstract base class for tokenizing.

    Has the same interface as (part of) the transformers Tokenizer class.
    Subclasses must implement both ``tokenize`` and
    ``convert_tokens_to_string``; the base class itself cannot be
    instantiated.
    """

    def __init__(self, mask_token: str):
        """Tokenizer initializer.

        Args:
            mask_token (str): Token used as mask
        """
        self.mask_token = mask_token

    @abstractmethod
    def tokenize(self, sentence: str) -> List[str]:
        """Split sentence into list of tokens.

        Args:
            sentence (str): Text to split

        Returns:
            The tokens found in ``sentence``.
        """

    @abstractmethod
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Merge list of tokens back to sentence.

        Args:
            tokens: Tokens to merge

        Returns:
            A single string built from ``tokens``.
        """
[docs]
class SpacyTokenizer(Tokenizer):
    """Spacy tokenizer for natural language.

    Before a sentence is handed to spacy, whitespace is inserted around every
    occurrence of the mask token that touches a non-whitespace character, so
    the mask token is always separated from the neighboring text.
    """

    # Default patterns, matching the default mask token 'UNKWORDZ'. They are
    # replaced per instance in ``__init__`` when another mask token is used.
    # Mask token with non-whitespace on both sides.
    MATCH_token_unk_token = re.compile(r'(\S)(UNKWORDZ)(\S)')
    # Mask token with non-whitespace before, whitespace or end of text after.
    MATCH_token_unk_white = re.compile(r'(\S)(UNKWORDZ)(\s|$)')
    # Mask token with start of text or whitespace before, non-whitespace after.
    MATCH_white_unk_token = re.compile(r'(^|\s)(UNKWORDZ)(\S)')

    def __init__(self,
                 name: str = 'en_core_web_sm',
                 mask_token: str = 'UNKWORDZ'):
        """Spacy tokenizer initializer.

        Args:
            name: Name of the Spacy tokenizer to use
            mask_token (str): Token used as mask
        """
        super().__init__(mask_token)
        if mask_token and mask_token != 'UNKWORDZ':
            # The class-level patterns only know the default mask token;
            # build equivalent patterns for the token that is actually used.
            # An empty mask token keeps the default patterns.
            mask = re.escape(mask_token)
            self.MATCH_token_unk_token = re.compile(
                r'(\S)(' + mask + r')(\S)')
            self.MATCH_token_unk_white = re.compile(
                r'(\S)(' + mask + r')(\s|$)')
            self.MATCH_white_unk_token = re.compile(
                r'(^|\s)(' + mask + r')(\S)')
        self.spacy_tokenizer = get_tokenizer('spacy', name)

    def tokenize(self, sentence: str) -> List[str]:
        """Tokenize sentence.

        Args:
            sentence (str): Text to tokenize

        Returns:
            The result of the spacy tokenizer applied to ``sentence`` after
            the whitespace around mask tokens has been fixed.
        """
        return self.spacy_tokenizer(self._fix_whitespace(sentence))

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Paste together with spaces in between."""
        return ' '.join(tokens)

    def _fix_whitespace(self, sentence: str) -> str:
        """Apply fixes for the punctuation/special characters problem.

        Inserts a space between the mask token and any directly adjacent
        non-whitespace character.

        For more info, see:
        https://github.com/dianna-ai/dianna/issues/531
        """
        # This pattern consumes the character after the mask token, and
        # re.sub never returns overlapping matches. In
        # 'x-UNKWORDZ-UNKWORDZ-y' the '-' shared by both masks is used up by
        # the first match, so one pass leaves the second mask untouched.
        # Repeat until nothing changes; every substitution surrounds its mask
        # with spaces, so the same occurrence cannot match again.
        replaced = 1
        while replaced:
            sentence, replaced = self.MATCH_token_unk_token.subn(
                r'\1 \2 \3', sentence)
        sentence = self.MATCH_token_unk_white.sub(r'\1 \2\3', sentence)
        sentence = self.MATCH_white_unk_token.sub(r'\1\2 \3', sentence)
        return sentence
[docs]
def get_tokenizer(_spacy, name):
    """Get tokenizer using spacy.

    Args:
        _spacy: Not used by this function.
        name: Name passed to ``spacy.load``

    Returns:
        A function that takes a text and returns a list with the text of
        every token spacy finds in it.
    """
    pipeline = spacy.load(name)

    def spacy_tokenizer(text):
        # Process the text with the loaded pipeline and keep only the token
        # strings.
        token_texts = []
        for token in pipeline(text):
            token_texts.append(token.text)
        return token_texts

    return spacy_tokenizer