CheckList test of two SRL sytems
CheckList test of two SRL sytems
In this notebook, we’ll apply CheckList tests to two Semantic Role Labeling (SRL) models:
- A logistic regression model, trained on three features.
- A DistillBERT model, fine-tuned on a CoNLL SRL dataset.
Importing dependencies
# For the BERT model
import time
import pandas as pd
import transformers
import numpy as np
import torch
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
from utils import read_data_as_sentence,map_labels_in_dataframe,tokenize_and_align_labels,get_label_mapping,get_labels_from_map,load_srl_model,load_dataset,compute_metrics,write_predictions_to_csv,compute_evaluation_metrics_from_csv, print_sentences
from bert_srl import main, define_args
# For the logistic regression model
import json
import sys
import pickle
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
sys.path.append('feature_extraction')
from extract_position_rel2pred import extract_word_position_and_voice
from extract_dependency_path import extract_dependency_paths
from extract_predicate import extract_predicate_lemma
1. Declare standalone functions
1.1 Logistic regression
def extract_features(data):
"""
Extract features from the data.
Returns a list of samples.
"""
samples = []
for sentence in data:
# Extract features
positions_rel2pred, verb_voice = extract_word_position_and_voice(sentence)
d_paths = extract_dependency_paths(sentence)
predicate_lemma = extract_predicate_lemma(sentence)
# Create a sample for each token in the sentence.
for i, token in enumerate(sentence):
# Skip predicate tokens.
if token['predicate'] == '_':
sample = {
'token': token['form'],
'position_rel2pred': positions_rel2pred[i] + verb_voice,
'dep_path+lemma': d_paths[i] + predicate_lemma
}
samples.append(sample)
return samples
def format_sentence(sentence, predicate_location, argument_labels, predicate_form):
"""
Formats a sentence into a list of dictionaries, to match the input format of the feature extraction functions.
Args:
sentence (list): A list of words.
predicate_location (list): A one-hot vector, indicating the location of the predicate.
argument_labels (list): A list of argument labels.
predicate_form (str): The sense label of the predicate.
"""
output = []
for i, word in enumerate(sentence):
word_dict = {
"form": word,
"predicate": predicate_form if predicate_location[i] == 1 else "_",
"argument": argument_labels[i]
}
output.append(word_dict)
return output
def classify_sentence_logreg(sentence, predicate_location, argument_labels, predicate_sense, model, vectorizer):
"""
The standalone function that takes a sentence and predicts the argument labels, using logistic regression
Args:
sentence (list): A list of words.
predicate_location (list): A one-hot vector, indicating the location of the predicate.
argument_labels (list): A list of argument labels.
predicate_sense (str): The sense label of the predicate.
"""
formatted_output = format_sentence(sentence, predicate_location, argument_labels, predicate_sense)
sample = extract_features([formatted_output])
feature_vectors = vectorizer.transform(sample)
predictions = model.predict(feature_vectors)
predictions = np.insert(predictions, predicate_location.index(1), '_')
return predictions
sentence = ["The", "dog", "ran", "and", "the", "man", "fell", "."]
predicate_location = [0, 0, 1, 0, 0, 0, 0, 0]
argument_labels = ['_', 'ARG0', '_', '_', '_', '_', '_', '_']
predicate_sense = "run.01"
with open("learned-models/vectorizer.pkl", "rb") as f:
vectorizer = pickle.load(f)
with open("learned-models/model.pkl", "rb") as f:
model = pickle.load(f)
/Users/krisstallenberg/anaconda3/envs/adv-nlp-final-exam/lib/python3.12/site-packages/sklearn/base.py:380: InconsistentVersionWarning: Trying to unpickle estimator DictVectorizer from version 1.6.0 when using version 1.6.1. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
warnings.warn(
/Users/krisstallenberg/anaconda3/envs/adv-nlp-final-exam/lib/python3.12/site-packages/sklearn/base.py:380: InconsistentVersionWarning: Trying to unpickle estimator LogisticRegression from version 1.6.0 when using version 1.6.1. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
warnings.warn(
1.2 DistillBERT
def create_input_sequence(sentence, predicate_position, argument_labels):
"""
Creates a DataFrame with columns 'input_form' and 'argument' for a single sentence.
Parameters:
- sentence (list of str): The words in the sentence.
- predicate_position (list of int): One-hot encoding indicating the predicate position.
- argument_labels (list of str): The argument labels for each token in the sentence.
Returns:
- DataFrame with two columns: 'input_form' and 'argument'.
"""
# Ensure input lengths match
assert len(sentence) == len(predicate_position) == len(argument_labels), "Input lists must have the same length."
# Determine the predicate form based on the one-hot encoding
predicate_index = predicate_position.index(1)
predicate_form = sentence[predicate_index]
# Append special tokens to input_form and argument lists
input_form = sentence + ['[SEP]', predicate_form]
argument = argument_labels + [None, None]
# Create a DataFrame
df = pd.DataFrame([{"input_form": input_form, "argument": argument}])
return df
def map_labels_to_words(predicted_labels, gold_labels, dataset):
tokens = []
for i, (predictions, gold_labels) in enumerate(zip(predicted_labels, gold_labels)):
subword_tokens = tokenizer.convert_ids_to_tokens(dataset[i]["input_ids"], skip_special_tokens=True)
word_tokens = []
word_labels_gold = []
word_labels_pred = []
current_word = ""
current_gold_label = None
current_pred_label = None
for idx, (subword, gold, pred) in enumerate(zip(subword_tokens, gold_labels, predictions)):
if subword.startswith("##"): # Continuation of a word
current_word += subword[2:]
else: # New word starts
if current_word: # Save the previous word and its label
word_tokens.append(current_word)
word_labels_gold.append(current_gold_label)
word_labels_pred.append(current_pred_label)
current_word = subword # Start new word
current_gold_label = gold # Take the first subword's label
current_pred_label = pred # Take the first subword's label
if current_word:
word_tokens.append(current_word)
word_labels_gold.append(current_gold_label)
word_labels_pred.append(current_pred_label)
tokens.extend(zip(word_tokens, word_labels_gold, word_labels_pred))
# Create a dataframe and write to CVS
df = pd.DataFrame(tokens, columns=["word", "gold_label", "predicted_label"])
return df
def classify_sentence_bert(sentence, predicate_location, argument_labels, predicate_sense, trainer, tokenizer):
"""
The standalone function that takes a sentence and predicts the argument labels, using DistilBERT.
Args:
sentence (list): A list of words.
predicate_location (list): A one-hot vector, indicating the location of the predicate.
argument_labels (list): A list of argument labels.
predicate_sense (str): The sense label of the predicate.
trainer: The HuggingFace Trainer instance to predict labels with.
tokenizer: The HuggingFace Tokenizer to tokenize input sequences with.
"""
inference_input = create_input_sequence(sentence, predicate_location, argument_labels)
label_map = {'_': 0, 'ARG0': 1, 'ARG1': 2, 'ARG1-DSP': 3, 'ARG2': 4, 'ARG3': 5, 'ARG4': 6, 'ARG5': 7, 'ARGA': 8, 'ARGM-ADJ': 9, 'ARGM-ADV': 10, 'ARGM-CAU': 11, 'ARGM-COM': 12, 'ARGM-CXN': 13, 'ARGM-DIR': 14, 'ARGM-DIS': 15, 'ARGM-EXT': 16, 'ARGM-GOL': 17, 'ARGM-LOC': 18, 'ARGM-LVB': 19, 'ARGM-MNR': 20, 'ARGM-MOD': 21, 'ARGM-NEG': 22, 'ARGM-PRD': 23, 'ARGM-PRP': 24, 'ARGM-PRR': 25, 'ARGM-REC': 26, 'ARGM-TMP': 27, 'C-ARG0': 28, 'C-ARG1': 29, 'C-ARG1-DSP': 30, 'C-ARG2': 31, 'C-ARG3': 32, 'C-ARG4': 33, 'C-ARGM-ADV': 34, 'C-ARGM-COM': 35, 'C-ARGM-CXN': 36, 'C-ARGM-DIR': 37, 'C-ARGM-EXT': 38, 'C-ARGM-GOL': 39, 'C-ARGM-LOC': 40, 'C-ARGM-MNR': 41, 'C-ARGM-PRP': 42, 'C-ARGM-PRR': 43, 'C-ARGM-TMP': 44, 'R-ARG0': 45, 'R-ARG1': 46, 'R-ARG2': 47, 'R-ARG3': 48, 'R-ARG4': 49, 'R-ARGM-ADJ': 50, 'R-ARGM-ADV': 51, 'R-ARGM-CAU': 52, 'R-ARGM-COM': 53, 'R-ARGM-DIR': 54, 'R-ARGM-GOL': 55, 'R-ARGM-LOC': 56, 'R-ARGM-MNR': 57, 'R-ARGM-TMP': 58, None: None}
inference_data = map_labels_in_dataframe(inference_input, label_map)
tokenized_input = tokenize_and_align_labels(tokenizer, inference_data, label_all_tokens=True)
dataset_inference_sample = load_dataset(tokenized_input)
label_list = get_labels_from_map(label_map)
predictions, labels, _ = trainer.predict(dataset_inference_sample)
argmax_predictions = np.argmax(predictions, axis=2)
# Extract predicted labels for each token, filtering out special tokens
predicted_labels = [
[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(argmax_predictions, labels)
]
return predicted_labels[0]
tokenizer = AutoTokenizer.from_pretrained("learned-models/tokenizer.save_pretrained.distillbert-base-uncased-finetuned-srl")
bert_model = AutoModelForTokenClassification.from_pretrained("learned-models/model.save_pretrained.distillbert-base-uncased-finetuned-srl")
training_args = TrainingArguments(output_dir="learned-models/trainer.save_model.distillbert-base-uncased-finetuned-srl")
trainer = Trainer(
model=bert_model,
args=training_args,
tokenizer=tokenizer
)
/var/folders/d9/p0hwqj9x1sx30sdq622dyn1r0000gn/T/ipykernel_1437/3635224896.py:99: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
trainer = Trainer(
2. Perform CheckList evaluation
Load the testing-dataset.json from the data directory as a Python dictionary.
with open("data/testing-dataset.json", "r", encoding="utf-8") as file:
checklist_dataset = json.load(file)
Iterate over the capabilities, tests and samples from the CheckList challenge dataset. The dataset contains MFT and INV tests. Each is evaluated in its own way:
- INV: For each model, the predicted labels for two items in a test are compared. If they’re the same, the test passes.
- MFT: For each model, the predicted label is compared to a gold label. If they’re the same, the test passes.
Write the results to checklist-results.json and print().
def get_argument_index(argument_labels):
"""
Get the argument label's index.
Args:
argument_labels:
Array of arguments strings representing the gold labels.
"""
relevant_indices = [i for i, label in enumerate(argument_labels) if label != "_"]
assert len(relevant_indices) == 1, f"Expected exactly one relevant label, found: {relevant_indices}"
return relevant_indices[0]
# Initialize a dictionary to collect all results
aggregated_results = {"capabilities": []}
# Iterate over capabilities in the dataset
for capability in checklist_dataset:
cap_name = capability['capability_name']
cap_description = capability['capability_description']
cap_category = capability['capability_category']
print(f"Capability: {cap_name} ({cap_category})")
cap_result = {
"capability_name": cap_name,
"capability_description": cap_description,
"capability_category": cap_category,
"tests": []
}
# Initialize variables to track capability performance
cap_total_gold = 0
cap_failures_logreg = 0
cap_failures_bert = 0
# Iterate over tests for current capability
for test in capability['tests']:
test_type = test['test_type']
test_result = {
"test_name": test['test_name'],
"test_description": test['test_description'],
"test_type": test_type,
"samples": []
}
print(f"\n * Test: {test_result['test_name']} ({test_result['test_type']})")
# Initialize variables to track test performance
test_total_gold = 0
test_failures_logreg = 0
test_failures_bert = 0
# Iterate over the samples in this test
for sample in test['samples']:
sample_results = []
if test_type == "MFT":
for item in sample:
tokens = item['tokens']
argument_labels = item['argument_labels']
predicate_sense = item['predicate_name']
num_classes = len(tokens)
# Convert predicate position to one-hot vector
predicate_position = torch.tensor([item['predicate_position']])
predicate_position = torch.nn.functional.one_hot(predicate_position, num_classes).squeeze(0).tolist()
# Get inferences from models
logreg_inference = classify_sentence_logreg(
tokens, predicate_position, argument_labels, predicate_sense, model, vectorizer
)
bert_inference = classify_sentence_bert(
tokens, predicate_position, argument_labels, predicate_sense, trainer, tokenizer
)
# Compare predicted labels to gold label
for gold_label, logreg_label, bert_label in zip(argument_labels, logreg_inference, bert_inference):
if gold_label != "_":
test_total_gold += 1
cap_total_gold += 1
logreg_correct = (gold_label == logreg_label)
bert_correct = (gold_label == bert_label)
if not logreg_correct:
test_failures_logreg += 1
cap_failures_logreg += 1
if not bert_correct:
test_failures_bert += 1
cap_failures_bert += 1
sample_results.append({
"gold_label": gold_label,
"logreg_label": logreg_label,
"bert_label": bert_label,
"logreg_correct": logreg_correct,
"bert_correct": bert_correct
})
elif test_type == "INV":
# INV test samples have two items (a 'before' and 'after')
if len(sample) != 2:
print("Warning: INV test sample does not contain exactly 2 items.")
continue
item1, item2 = sample[0], sample[1]
# Process item 1
tokens1 = item1['tokens']
num_classes1 = len(tokens1)
argument_labels_1 = item1['argument_labels']
predicate_sense_1 = item1['predicate_name']
# Find the index of the argument gold label
argument_index_1 = get_argument_index(argument_labels_1)
# Convert predicate position to one-hot vector
predicate_position_tensor1 = torch.tensor([item1['predicate_position']])
predicate_position1 = torch.nn.functional.one_hot(predicate_position_tensor1, num_classes1).squeeze(0).tolist()
# Infer both SRL models for item 1
logreg_inference_1 = classify_sentence_logreg(
tokens1, predicate_position1, argument_labels_1, predicate_sense_1, model, vectorizer
)
bert_inference_1 = classify_sentence_bert(
tokens1, predicate_position1, argument_labels_1, predicate_sense_1, trainer, tokenizer
)
# Process item 2
tokens2 = item2['tokens']
num_classes2 = len(tokens2)
argument_labels_2 = item2['argument_labels']
predicate_sense_2 = item2['predicate_name']
argument_index_2 = get_argument_index(argument_labels_2)
# Convert predicate position to one-hot vector
predicate_position_tensor2 = torch.tensor([item2['predicate_position']])
predicate_position2 = torch.nn.functional.one_hot(predicate_position_tensor2, num_classes2).squeeze(0).tolist()
# Infer both SRL models for item 2
logreg_inference_2 = classify_sentence_logreg(
tokens2, predicate_position2, argument_labels_2, predicate_sense_2, model, vectorizer
)
bert_inference_2 = classify_sentence_bert(
tokens2, predicate_position2, argument_labels_2, predicate_sense_2, trainer, tokenizer
)
arg_logreg_1 = logreg_inference_1[argument_index_1]
arg_bert_1 = bert_inference_1[argument_index_1]
arg_logreg_2 = logreg_inference_2[argument_index_2]
arg_bert_2 = bert_inference_2[argument_index_2]
# Increment counters
test_total_gold += 1
cap_total_gold += 1
# Test passes if the predicted labels are the same for both items
logreg_correct = (arg_logreg_1 == arg_logreg_2)
bert_correct = (arg_bert_1 == arg_bert_2)
if not logreg_correct:
test_failures_logreg += 1
cap_failures_logreg += 1
if not bert_correct:
test_failures_bert += 1
cap_failures_bert += 1
sample_results.append({
"relevant_index_item1": argument_index_1,
"relevant_index_item2": argument_index_2,
"logreg_prediction_item1": pred_logreg_1,
"logreg_prediction_item2": pred_logreg_2,
"bert_prediction_item1": pred_bert_1,
"bert_prediction_item2": pred_bert_2,
"logreg_correct": logreg_correct,
"bert_correct": bert_correct
})
# Save the detailed results for this sample
test_result["samples"].append(sample_results)
# Calculate failure rates for this test (as percentages)
test_result["failure_rate_logreg"] = (
100.0 * test_failures_logreg / test_total_gold if test_total_gold else None
)
test_result["failure_rate_bert"] = (
100.0 * test_failures_bert / test_total_gold if test_total_gold else None
)
print(f"""
* Failure rates:
Logistic regression: {test_result["failure_rate_logreg"]}%
DistillBERT: {test_result["failure_rate_bert"]}%""")
# Add the test result to the capability's results
cap_result["tests"].append(test_result)
# Calculate overall failure rates for the capability
cap_result["failure_rate_logreg"] = (
100.0 * cap_failures_logreg / cap_total_gold if cap_total_gold else None
)
cap_result["failure_rate_bert"] = (
100.0 * cap_failures_bert / cap_total_gold if cap_total_gold else None
)
print(f"""
> Failure rates (total for capability):
Logistic regression: {cap_result["failure_rate_logreg"]}%
DistillBERT: {cap_result["failure_rate_bert"]}%
=====================================================================================
""")
# Add the capability result to the aggregated results
aggregated_results["capabilities"].append(cap_result)
# Write the CheckList test results to a JSON file
with open("checklist-results.json", "w", encoding="utf-8") as outfile:
json.dump(aggregated_results, outfile, indent=2)
Capability: Long-distance dependencies between predicate and ARG0 (syntactic)
* Test: Effect of injecting relative clause between predicate and ARG0 (INV)
* Failure rates:
Logistic regression: 20.0%
DistillBERT: 30.0%
* Test: Sentences without relative clause between predicate and ARG0 (MFT)
* Failure rates:
Logistic regression: 70.0%
DistillBERT: 20.0%
* Test: Sentences with relative clause between predicate and ARG0 (MFT)
* Failure rates:
Logistic regression: 90.0%
DistillBERT: 50.0%
> Failure rates (total for capability):
Logistic regression: 60.0%
DistillBERT: 33.333333333333336%
=====================================================================================
Capability: Long-distance dependencies between predicate and ARG1 (syntactic)
* Test: Effect of injecting adverbial or participial phrase between predicate and ARG1 (INV)
* Failure rates:
Logistic regression: 60.0%
DistillBERT: 60.0%
* Test: Sentence without adverbial or participial phrase between predicate and ARG1 (MFT)
* Failure rates:
Logistic regression: 10.0%
DistillBERT: 0.0%
* Test: Sentence with adverbial or participial phrase between predicate and ARG1 (MFT)
* Failure rates:
Logistic regression: 70.0%
DistillBERT: 60.0%
> Failure rates (total for capability):
Logistic regression: 46.666666666666664%
DistillBERT: 40.0%
=====================================================================================
Capability: Robustness to noise in the form of typos in ARG0 on ARG0 labeling (lexical)
* Test: Effect of typos in proper nouns as ARG0 on ARG0 labeling (INV)
* Failure rates:
Logistic regression: 30.0%
DistillBERT: 20.0%
* Test: Effect of typos in ARG0 as common noun (INV)
* Failure rates:
Logistic regression: 20.0%
DistillBERT: 0.0%
* Test: Sentences without typos in ARG0 as proper noun (MFT)
* Failure rates:
Logistic regression: 60.0%
DistillBERT: 0.0%
* Test: Sentences without typos in ARG0 as proper noun (MFT)
* Failure rates:
Logistic regression: 90.0%
DistillBERT: 20.0%
* Test: Sentences without typos in ARG0 as common noun (MFT)
* Failure rates:
Logistic regression: 60.0%
DistillBERT: 0.0%
* Test: Sentences with typos in ARG0 as common noun (MFT)
* Failure rates:
Logistic regression: 80.0%
DistillBERT: 0.0%
> Failure rates (total for capability):
Logistic regression: 56.666666666666664%
DistillBERT: 6.666666666666667%
=====================================================================================
Capability: Effect of semantic atypicality in active voice syntactically simple SVO sentences on ARG0 labeling (lexical)
* Test: Animate objects as ARG0 (MFT)
* Failure rates:
Logistic regression: 90.0%
DistillBERT: 0.0%
* Test: Inanimate objects as ARG0 (MFT)
* Failure rates:
Logistic regression: 100.0%
DistillBERT: 40.0%
* Test: Effect of animate versus inanimate concepts as ARG0 (INV)
* Failure rates:
Logistic regression: 10.0%
DistillBERT: 40.0%
* Test: Non-abstract concepts as ARG0 (MFT)
* Failure rates:
Logistic regression: 100.0%
DistillBERT: 70.0%
* Test: Abstract concepts as ARG0 (MFT)
* Failure rates:
Logistic regression: 100.0%
DistillBERT: 90.0%
* Test: Effect of abstract versus non-abstract concepts as ARG0 (INV)
* Failure rates:
Logistic regression: 0.0%
DistillBERT: 30.0%
> Failure rates (total for capability):
Logistic regression: 66.66666666666667%
DistillBERT: 45.0%
=====================================================================================
Capability: Dealing with dative verb alternations (syntactic)
* Test: Effect of dative verb alternations on ARG1 (INV)
* Failure rates:
Logistic regression: 0.0%
DistillBERT: 0.0%
* Test: Dative verb alternations with prepositional dative construction for ARG1 (MFT)
* Failure rates:
Logistic regression: 80.0%
DistillBERT: 30.0%
* Test: Dative verb alternations with double object construction for ARG1 (MFT)
* Failure rates:
Logistic regression: 80.0%
DistillBERT: 30.0%
> Failure rates (total for capability):
Logistic regression: 53.333333333333336%
DistillBERT: 20.0%
=====================================================================================