Source code for _model_utils

from pathlib import Path
from typing import Iterable
import numpy as np
import onnx
import pandas as pd
import torch
import xgboost
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModel
from transformers import AutoTokenizer
from dianna.utils.tokenizers import SpacyTokenizer



[docs]
def load_data(file):
    """Read a CSV file into a pandas DataFrame and prepend an 'Index' column.

    The file is parsed with ``pd.read_csv(..., parse_dates=True)``. The
    frame's own index is then copied into a new first column named 'Index'.
    """
    frame = pd.read_csv(file, parse_dates=True)
    # Expose the row index as a regular column in first position
    frame.insert(loc=0, column='Index', value=frame.index)
    return frame




[docs]
def preprocess_function(image):
    """Divide the image by 256 and cast the result to float32.

    For LIME: the model (binary mnist) was fed input data divided by 256,
    while LIME needs RGB values, so the same scaling is applied here.
    """
    scaled = image / 256
    return scaled.astype(np.float32)




[docs]
def fill_segmentation(values, segmentation):
    """For KernelSHAP: paint every pixel with the value of its segment.

    Returns an array of zeros with the shape of ``segmentation`` in which all
    positions where ``segmentation == i`` are set to ``values[i]``. Positions
    whose segment id has no entry in ``values`` stay zero.
    """
    filled = np.zeros(shape=segmentation.shape)
    for segment_id, segment_value in enumerate(values):
        mask = segmentation == segment_id
        filled[mask] = segment_value
    return filled




[docs]
def load_model(file):
    """Load a model from ``file`` with ``onnx.load`` and return it."""
    return onnx.load(file)




[docs]
def load_labels(file):
    """Read class labels, one per line, from a path or a binary file object.

    Parameters
    ----------
    file
        Either a path (``str`` or ``Path``) to a file, or an already opened
        binary file-like object providing ``readlines()``.

    Returns
    -------
        List of label strings, decoded from bytes with trailing whitespace
        (including line endings) stripped.

    Raises
    ------
    ValueError
        If the file contains only a single blank line.
    """
    if isinstance(file, (str, Path)):
        # We opened the file ourselves, so make sure it is closed again
        with open(file, 'rb') as handle:
            raw_lines = handle.readlines()
    else:
        # Caller-owned file object: read it but leave closing to the caller
        raw_lines = file.readlines()

    labels = [line.decode().rstrip() for line in raw_lines]
    if labels == ['']:
        raise ValueError(labels)
    return labels




[docs]
def load_training_data(file):
    """Load an array with ``np.load`` (pickle disabled) and convert it to float32."""
    loaded = np.load(file, allow_pickle=False)
    return np.float32(loaded)




[docs]
def load_sunshine(file):
    """Tabular sunshine example.

    Read the csv file into a pandas DataFrame and split it into a train and
    a test set. The features are all rows except the last one, and the target
    is the "BASEL_sunshine" column from the second row onwards, so the target
    is shifted by one row relative to the features.

    Returns the training features as a float32 numpy array and the test
    features as a DataFrame with a fresh 'Index' column in first position.
    """
    frame = load_data(file)

    # Features: drop the unused columns and the final row
    features = frame.drop(columns=['DATE', 'MONTH', 'Index']).iloc[:-1]
    # Target: sunshine values starting at the second row
    target = frame.loc[1:, "BASEL_sunshine"]

    # 70% train / 30% holdout, then the holdout is halved; the second half is the test set
    X_train, X_holdout, _, y_holdout = train_test_split(
        features, target, test_size=0.3, random_state=0)
    X_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=0)[1]

    # Renumber the test rows and expose that numbering as an 'Index' column
    X_test = X_test.reset_index(drop=True)
    X_test.insert(loc=0, column='Index', value=X_test.index)

    return X_train.to_numpy(dtype=np.float32), X_test



[docs]
def load_penguins(penguins):
    """Prepare the data for the penguin model example, as in the notebook.

    Drops the 'island' and 'sex' columns and all rows containing NaN values,
    then makes a stratified 80/20 train/test split on the one-hot encoded
    'species' column.

    Returns the training features as a float32 numpy array and the test
    features as a DataFrame with a fresh 'Index' column in first position.
    """
    # Remove categorical columns and NaN values
    cleaned = penguins.drop(columns=['island', 'sex']).dropna()

    # Separate the one-hot encoded target from the input features
    species_onehot = pd.get_dummies(cleaned['species'])
    feature_frame = cleaned.drop(columns=['species'])

    parts = train_test_split(feature_frame, species_onehot, test_size=0.2,
                             random_state=0, shuffle=True, stratify=species_onehot)
    X_train, X_test = parts[0], parts[1]

    # Renumber the test rows and expose that numbering as an 'Index' column
    X_test = X_test.reset_index(drop=True)
    X_test.insert(loc=0, column='Index', value=X_test.index)

    return X_train.to_numpy(dtype=np.float32), X_test




[docs]
def features_eulaw(texts: list[str], model_tag="law-ai/InLegalBERT"):
    """Create features for a list of texts.

    A tokenizer and model are loaded from ``model_tag``. Every text is cropped
    to its first 512 characters, tokenized (truncated to at most 512 tokens)
    and passed through the model without gradient tracking. The feature vector
    of a text is the mean of the model's last hidden state over dim 1.

    Returns a numpy array built from the per-batch features concatenated
    along dim 0.
    """
    max_length = 512
    tokenizer = AutoTokenizer.from_pretrained(model_tag)
    model = AutoModel.from_pretrained(model_tag)

    def process_batch(batch: Iterable[str]):
        # Crop on characters first; the tokenizer truncates on tokens afterwards
        snippets = [entry[:max_length] for entry in batch]
        encoded = tokenizer(
            snippets,
            padding='longest',
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        with torch.no_grad():
            model_output = model(**encoded)
        # Average the last hidden state over dim 1 to get one vector per text
        return model_output.last_hidden_state.mean(dim=1)

    # batch size of 1 was quickest during development
    loader = DataLoader(texts, batch_size=1)
    collected = []
    for batch in tqdm(loader, desc='Creating features'):
        collected.append(process_batch(batch))
    stacked = torch.cat(collected, dim=0)
    return np.array(stacked)




[docs]
def classify_texts_eulaw(texts: list[str], model_path, return_proba: bool = False):
    """Classify every text in a list using the xgboost model stored in model_path.

    The texts are first turned into feature vectors by ``features_eulaw``
    (a large language model does the feature extraction for every text). The
    xgboost model is then loaded from ``model_path`` and applied to those
    features.
    For training the xgboost model, see train_legalbert_xgboost.py.

    Parameters
    ----------
    texts
        A list of strings of which each needs to be classified.
    model_path
        The path to a stored xgboost model
    return_proba
        If True, return the output of ``predict_proba`` instead of ``predict``.

    Returns
    -------
        The classifier's predictions (or probabilities) for the given texts.

    """
    text_features = features_eulaw(texts)

    classifier = xgboost.XGBClassifier()
    classifier.load_model(model_path)

    # Pick the prediction method once, then apply it to the features
    predict = classifier.predict_proba if return_proba else classifier.predict
    return predict(text_features)




[docs]
class StatementClassifierEUlaw():
    """Callable wrapper around ``classify_texts_eulaw`` for a stored model.

    Calling an instance with one sentence or a list of sentences returns a
    two-column array: the first column is the first probability column
    returned by the classifier, the second column is one minus that value.
    """

    def __init__(self, model_path):
        """Store the model path and create a spaCy tokenizer."""
        # Tokenizer attribute; it is not used by __call__ in this class
        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')
        # Path handed to classify_texts_eulaw on every call
        self.model_path = model_path

    def __call__(self, sentences):
        """Return the two-column probability array for ``sentences``."""
        # ensure the input has a batch axis
        if isinstance(sentences, str):
            sentences = [sentences]

        probabilities = classify_texts_eulaw(sentences, self.model_path, return_proba=True)

        # First column as returned by the model, second column its complement
        first_column = probabilities[:, 0]
        return np.array([first_column, 1 - first_column]).T