Source code for _model_utils

from pathlib import Path
from typing import Iterable
import numpy as np
import onnx
import pandas as pd
import torch
import xgboost
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel
from transformers import AutoTokenizer
from dianna.utils.tokenizers import SpacyTokenizer


def load_data(file):
    """Read a CSV file into a pandas DataFrame with a leading ``Index`` column.

    Parameters
    ----------
    file
        Path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with a copy of its row index inserted as the first
        column under the name ``'Index'``.
    """
    frame = pd.read_csv(file, parse_dates=True)
    # Expose the row index as a regular first column.
    frame.insert(loc=0, column='Index', value=frame.index)
    return frame
def preprocess_function(image):
    """Divide the image by 256 and cast the result to float32.

    Used with LIME: the (binary MNIST) model was fed input divided by 256,
    whereas LIME works with RGB values, so the same scaling is applied here.

    Parameters
    ----------
    image
        Array of pixel values.

    Returns
    -------
    numpy.ndarray
        ``image / 256`` as ``np.float32``.
    """
    scaled = image / 256
    return scaled.astype(np.float32)
def fill_segmentation(values, segmentation):
    """Paint every segment of a segmentation map with its value.

    Used with KernelSHAP: each pixel receives the SHAP value of the segment
    it belongs to.

    Parameters
    ----------
    values
        Sequence in which position ``i`` holds the value for segment ``i``.
    segmentation
        Array of segment ids.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``segmentation``; entries whose segment id
        has no corresponding value remain 0.
    """
    filled = np.zeros(segmentation.shape)
    for segment_id, segment_value in enumerate(values):
        # Boolean mask selects all pixels belonging to this segment.
        filled[segmentation == segment_id] = segment_value
    return filled
def load_model(file):
    """Load an ONNX model from ``file`` with ``onnx.load`` and return it."""
    return onnx.load(file)
def load_labels(file):
    """Read class labels, one per line, from a path or a binary file object.

    Parameters
    ----------
    file
        Either a path (``str`` or ``pathlib.Path``) to a labels file, or an
        already opened binary file-like object providing ``readlines()``.

    Returns
    -------
    list of str
        The decoded lines with trailing whitespace removed.

    Raises
    ------
    ValueError
        If the input consists of a single blank line, i.e. the parsed
        labels equal ``['']``.
    """
    if isinstance(file, (str, Path)):
        # The file is opened here, so it is also closed here; previously the
        # handle was left open.
        with open(file, 'rb') as handle:
            raw_lines = handle.readlines()
    else:
        # A caller-supplied file object stays open: the caller owns it.
        raw_lines = file.readlines()

    labels = [line.decode().rstrip() for line in raw_lines]
    if labels == ['']:
        raise ValueError(labels)
    return labels
def load_training_data(file):
    """Load data stored in NumPy format and convert it to float32.

    Pickled object arrays are refused (``allow_pickle=False``).

    Parameters
    ----------
    file
        Path or file-like object accepted by ``numpy.load``.

    Returns
    -------
    The loaded data converted with ``np.float32``.
    """
    stored = np.load(file, allow_pickle=False)
    return np.float32(stored)
def load_sunshine(file):
    """Prepare the tabular sunshine example.

    The CSV file is read with ``load_data`` and then split into a training
    part and a test part.

    Parameters
    ----------
    file
        CSV file containing at least the columns ``DATE``, ``MONTH`` and
        ``BASEL_sunshine``.

    Returns
    -------
    tuple
        ``(X_train, X_test)``: the training features as a float32 NumPy
        array, and the test features as a DataFrame with a fresh leading
        ``'Index'`` column.
    """
    table = load_data(file)

    # Features: all rows except the last, without the unused columns.
    # Target: the sunshine column from the second row onwards, so the two
    # have equal length and are offset by one row.
    features = table.drop(columns=['DATE', 'MONTH', 'Index'])[:-1]
    target = table.loc[1:]["BASEL_sunshine"]

    # First split: 70 % training, 30 % hold-out.
    first_split = train_test_split(features,
                                   target,
                                   test_size=0.3,
                                   random_state=0)
    train_features = first_split[0]
    holdout_features = first_split[1]
    holdout_target = first_split[3]

    # Second split: the test set is the second half of the hold-out data.
    second_split = train_test_split(holdout_features,
                                    holdout_target,
                                    test_size=0.5,
                                    random_state=0)
    test_features = second_split[1].reset_index(drop=True)
    test_features.insert(0, 'Index', test_features.index)

    return train_features.to_numpy(dtype=np.float32), test_features
def load_penguins(penguins):
    """Prepare the data for the penguin model example, as in the notebook.

    Parameters
    ----------
    penguins
        DataFrame containing at least the columns ``island``, ``sex`` and
        ``species``.

    Returns
    -------
    tuple
        ``(X_train, X_test)``: the training features as a float32 NumPy
        array, and the test features as a DataFrame with a fresh leading
        ``'Index'`` column.
    """
    # Drop the categorical columns, then every row that still has a NaN.
    cleaned = penguins.drop(columns=['island', 'sex'])
    cleaned = cleaned.dropna()

    # Inputs are everything but the species; the target is one-hot encoded.
    inputs = cleaned.drop(columns=['species'])
    one_hot_species = pd.get_dummies(cleaned['species'])

    split = train_test_split(inputs,
                             one_hot_species,
                             test_size=0.2,
                             random_state=0,
                             shuffle=True,
                             stratify=one_hot_species)
    train_inputs = split[0]
    test_inputs = split[1].reset_index(drop=True)
    test_inputs.insert(0, 'Index', test_inputs.index)

    return train_inputs.to_numpy(dtype=np.float32), test_inputs
def features_eulaw(texts: list[str], model_tag="law-ai/InLegalBERT"):
    """Create a feature vector for every text in a list of texts.

    Each text is run through the pretrained transformer identified by
    ``model_tag``; its feature vector is the mean of the last hidden state
    over the token axis.

    Parameters
    ----------
    texts
        The texts to create features for.
    model_tag
        Name of the pretrained model and tokenizer to load.

    Returns
    -------
    numpy.ndarray
        The feature vectors of all texts, concatenated along the first axis.
    """
    max_length = 512
    tokenizer = AutoTokenizer.from_pretrained(model_tag)
    model = AutoModel.from_pretrained(model_tag)

    def embed_batch(batch: Iterable[str]):
        """Return the mean-pooled last hidden state for each text in ``batch``."""
        # Crop by characters first; the tokenizer then truncates by tokens.
        shortened = [text[:max_length] for text in batch]
        encoded = tokenizer(shortened,
                            padding='longest',
                            truncation=True,
                            max_length=max_length,
                            return_tensors="pt")
        # Inference only, so no gradients need to be tracked.
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        return hidden_states.mean(dim=1)

    # A batch size of 1 was the quickest during development.
    loader = DataLoader(texts, batch_size=1)
    progress = tqdm(loader, desc='Creating features')
    per_batch_features = [embed_batch(batch) for batch in progress]

    stacked = torch.cat(per_batch_features, dim=0)
    return np.array(stacked)
def classify_texts_eulaw(texts: list[str],
                         model_path,
                         return_proba: bool = False):
    """Classify each text in ``texts`` with the xgboost model at ``model_path``.

    Features are first extracted from every text by a large language model
    (see ``features_eulaw``). The stored xgboost model is then loaded and
    applied to those features. For training the xgboost model, see
    train_legalbert_xgboost.py.

    Parameters
    ----------
    texts
        A list of strings of which each needs to be classified.
    model_path
        The path to a stored xgboost model.
    return_proba
        If True, return the model's class probabilities instead of its
        predictions.

    Returns
    -------
    The classifications (or probabilities), one entry for every text in the
    list.
    """
    embeddings = features_eulaw(texts)

    classifier = xgboost.XGBClassifier()
    classifier.load_model(model_path)

    predictor = classifier.predict_proba if return_proba else classifier.predict
    return predictor(embeddings)
class StatementClassifierEUlaw:
    """Callable that returns class probabilities for EU-law statements.

    Instances wrap ``classify_texts_eulaw`` with a fixed model path, so they
    can be called directly on a sentence or a list of sentences.
    """

    def __init__(self, model_path):
        """Create a spaCy tokenizer and remember the xgboost model path.

        Parameters
        ----------
        model_path
            The path to a stored xgboost model.
        """
        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')
        self.model_path = model_path

    def __call__(self, sentences):
        """Return a two-column probability array for ``sentences``.

        Parameters
        ----------
        sentences
            A single string or a list of strings.

        Returns
        -------
        numpy.ndarray
            One row per sentence. The first column is column 0 of the
            model's probabilities, the second column is its complement.
        """
        # A bare string is treated as a batch of one.
        if isinstance(sentences, str):
            sentences = [sentences]

        probabilities = classify_texts_eulaw(sentences,
                                             self.model_path,
                                             return_proba=True)
        first_class = probabilities[:, 0]
        return np.array([first_class, 1 - first_class]).T