# Source code for _movie_model

import numpy as np
import pandas as pd
from _shared import data_directory
from scipy.special import expit as sigmoid
from dianna import utils
from dianna.utils.tokenizers import SpacyTokenizer


class MovieReviewsModelRunner:
    """Creates runner for movie review model.

    The runner turns raw sentences into lists of vocabulary indices (looked
    up in ``self.keys``) and feeds them to the wrapped model one sentence at
    a time.
    """

    def __init__(self, model, word_vector_path=None, max_filter_size=5):
        """Initializes the class.

        Args:
            model: Passed to ``utils.get_function`` to obtain the callable
                that runs the model.
            word_vector_path: Space-delimited text file whose first column
                holds the vocabulary tokens. Defaults to
                ``movie_reviews_word_vectors.txt`` inside ``data_directory``.
            max_filter_size: Minimum number of tokens per sentence; shorter
                sentences are padded with ``'<pad>'`` up to this length.
        """
        if word_vector_path is None:
            word_vector_path = data_directory / 'movie_reviews_word_vectors.txt'

        self.run_model = utils.get_function(model)
        # Only the first column (the tokens) is kept; a token's position in
        # this list is its numerical id.
        self.keys = list(
            pd.read_csv(word_vector_path, header=None, delimiter=' ')[0])
        self.max_filter_size = max_filter_size
        self.tokenizer = SpacyTokenizer()

    def __call__(self, sentences):
        """Call Runner.

        Args:
            sentences: A single sentence (``str``) or an iterable of
                sentences. Each sentence is lower-cased before tokenizing.

        Returns:
            Array with one row per sentence and two columns:
            ``[negativity, positivity]``, where ``negativity`` is
            ``1 - positivity``.

        Raises:
            ValueError: If a token is not in the vocabulary and the
                vocabulary has no ``'<unk>'`` entry.
        """
        # ensure the input has a batch axis
        if isinstance(sentences, str):
            sentences = [sentences]

        # build the token -> id lookup once for the whole batch
        vocab_index = self._build_vocab_index()

        output = []
        for sentence in sentences:
            tokens = self.tokenizer.tokenize(sentence.lower())
            tokens_numerical = self._numericalize(tokens, vocab_index)

            # run the model, applying a sigmoid because the model outputs
            # logits, remove any remaining batch axis
            pred = float(sigmoid(self.run_model([tokens_numerical])).flat[0])
            output.append(pred)

        # output two classes
        positivity = np.array(output)
        negativity = 1 - positivity
        return np.transpose([negativity, positivity])

    def tokenize(self, sentence: str):
        """Tokenize sentence.

        Unlike ``__call__``, the sentence is NOT lower-cased here; it is
        tokenized as given.

        Args:
            sentence: Text to tokenize.

        Returns:
            List of vocabulary indices, padded with the ``'<pad>'`` index to
            at least ``max_filter_size`` entries.

        Raises:
            ValueError: If a token is not in the vocabulary and the
                vocabulary has no ``'<unk>'`` entry.
        """
        tokens = self.tokenizer.tokenize(sentence)
        return self._numericalize(tokens, self._build_vocab_index())

    def _build_vocab_index(self):
        """Return a dict mapping each token in ``self.keys`` to its index."""
        vocab_index = {}
        for position, key in enumerate(self.keys):
            # keep the first occurrence, matching ``list.index`` semantics
            vocab_index.setdefault(key, position)
        return vocab_index

    def _numericalize(self, tokens, vocab_index):
        """Pad tokens to ``max_filter_size`` and map them to vocabulary ids."""
        # copy so the list returned by the tokenizer is never mutated
        padded = list(tokens)
        missing = self.max_filter_size - len(padded)
        if missing > 0:
            padded.extend(['<pad>'] * missing)

        tokens_numerical = []
        for token in padded:
            token_id = vocab_index.get(token)
            if token_id is None:
                # unknown tokens fall back to the '<unk>' entry
                token_id = vocab_index.get('<unk>')
                if token_id is None:
                    raise ValueError(
                        f"token {token!r} is not in the vocabulary and the "
                        "vocabulary has no '<unk>' entry")
            tokens_numerical.append(token_id)
        return tokens_numerical