Load evaluation files
import csv
import os
Load evaluation files
# Create a function to load the data from the file
def load_evaluation(emissor_directory: str, filename: str) -> list:
    """
    Load the evaluation data from the <emissor directory>/<filename> CSV file.

    The two arguments are joined as-is; no extension is appended to the filename.

    The data is expected to be in the following format:
    - The first line is the header, with the column names.
    - The following lines are the data, with the following columns:
        - 'Turn': The dialogue turn
        - 'Speaker': LEOLANI or SPEAKER
        - 'Response': The utterance of the speaker
        - 'Reference Response': The reference response
        - 'Overall Human Rating': the average of the 8 quality ratings
        - 'Interesting': rating from 1 to 5.
        - 'Engaging': rating from 1 to 5.
        - 'Specific': rating from 1 to 5.
        - 'Relevant': rating from 1 to 5.
        - 'Correct': rating from 1 to 5.
        - 'Semantically Appropriate': rating from 1 to 5.
        - 'Understandable': rating from 1 to 5.
        - 'Fluent': rating from 1 to 5.

    Parameters
    ----------
    emissor_directory : str
        The directory where the emissor data is stored.
    filename : str
        The name of the file to load.

    Returns
    -------
    list
        A list of dictionaries, each dictionary represents a row in the CSV file.
        The nine rating columns are converted to float (a decimal comma is
        accepted); a rating cell that is empty is kept as the empty string.

    Raises
    ------
    FileNotFoundError
        If the file does not exist in the emissor directory.
    KeyError
        If one of the rating columns is missing from the header.
    ValueError
        If a non-empty rating cell cannot be parsed as a number.
    """
    numeric_columns = (
        'Overall Human Rating', 'Interesting', 'Engaging', 'Specific', 'Relevant',
        'Correct', 'Semantically Appropriate', 'Understandable', 'Fluent',
    )

    # Check if the file exists in the emissor directory
    file_path = os.path.join(emissor_directory, filename)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found.")

    # newline='' hands the raw line endings to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted fields
    # (e.g. a multi-line response) are silently rewritten.
    with open(file_path, 'r', newline='') as file:
        data = list(csv.DictReader(file))

    # Turn the string values of the numerical columns into numbers
    for row in data:
        for column in numeric_columns:
            # Accept a decimal comma ("3,5") as well as a decimal point
            value = row[column].replace(',', '.')
            # Empty cells stay empty strings so callers can tell "not rated" apart from 0
            row[column] = float(value) if value != '' else value
    return data
# Where the manual evaluation sheets live, and the four sheets to read
emissor_directory = './48ebae93-ae78-41ba-908a-a5f1fa2afe3d'
filename_1, filename_2, filename_3, filename_avg = (
    'manual_evaluation_1.csv',
    'manual_evaluation_2.csv',
    'manual_evaluation_3.csv',
    'manual_evaluation_avg.csv',
)

# Parse each sheet, in order, into a list of row dictionaries
human_annotations_1, human_annotations_2, human_annotations_3, human_annotations_avg = [
    load_evaluation(emissor_directory, sheet)
    for sheet in (filename_1, filename_2, filename_3, filename_avg)
]
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[2], line 8
5 filename_3 = 'manual_evaluation_3.csv'
6 filename_avg = 'manual_evaluation_avg.csv'
----> 8 human_annotations_1 = load_evaluation(emissor_directory, filename_1)
9 human_annotations_2 = load_evaluation(emissor_directory, filename_2)
10 human_annotations_3 = load_evaluation(emissor_directory, filename_3)
Cell In[1], line 38, in load_evaluation(emissor_directory, filename)
4 """
5 Load the evaluation data from the <emissor directory>/<filename>.csv file.
6
(...)
34 A list of dictionaries, each dictionary represents a row in the CSV file.
35 """
37 # Check if the file exists in the emissor directory
---> 38 file_path = os.path.join(emissor_directory, filename)
39 if not os.path.exists(file_path):
40 raise FileNotFoundError(f"File {file_path} not found.")
NameError: name 'os' is not defined
def remove_human_turns(human_annotations: list) -> list:
    """
    Keep only the annotation rows whose 'Speaker' is LEOLANI.

    Parameters
    ----------
    human_annotations : list
        The list of human annotations (one dictionary per row).

    Returns
    -------
    list
        A new list holding the same row dictionaries, in their original
        order, with every non-LEOLANI row left out.
    """
    leolani_rows = []
    for annotation in human_annotations:
        # Anything not spoken by LEOLANI is dropped
        if annotation['Speaker'] != 'LEOLANI':
            continue
        leolani_rows.append(annotation)
    return leolani_rows
# Reduce every annotation sheet to its LEOLANI rows
(
    human_annotations_only_leolani_1,
    human_annotations_only_leolani_2,
    human_annotations_only_leolani_3,
    human_annotations_only_leolani_avg,
) = map(
    remove_human_turns,
    (human_annotations_1, human_annotations_2, human_annotations_3, human_annotations_avg),
)
Calculate the averages per quality score
# Calculate the average score per column per file
def calculate_average_score(human_annotations: list) -> dict:
    """
    Calculate the average score for each column in the human annotations.

    The columns 'Turn', 'Speaker', 'Response' and 'Reference Response' are
    ignored; every other column is treated as a score. Cells that are empty
    (None, '' or whitespace only) are skipped, so they neither raise nor
    count towards the average of their column.

    Parameters
    ----------
    human_annotations : list
        The list of human annotations (one dictionary per row).

    Returns
    -------
    dict
        A dictionary with the average score for each column. A column that has
        no rated cell at all is absent from the result; an empty input gives {}.

    Raises
    ------
    ValueError
        If a non-empty score cell cannot be converted to float.
    """
    non_score_columns = {'Turn', 'Speaker', 'Response', 'Reference Response'}

    # Running sum and number of rated cells per score column
    sum_scores = {}
    count_scores = {}
    for row in human_annotations:
        for key, value in row.items():
            if key in non_score_columns:
                continue
            # Unrated cells are left as empty strings by the loader; skip them
            # instead of letting float('') abort the whole computation.
            if value is None or (isinstance(value, str) and not value.strip()):
                continue
            sum_scores[key] = sum_scores.get(key, 0.0) + float(value)
            count_scores[key] = count_scores.get(key, 0) + 1

    # Calculate the average score for each column
    return {key: total / count_scores[key] for key, total in sum_scores.items()}
# Average every score column of each LEOLANI-only sheet
average_scores_1, average_scores_2, average_scores_3, average_scores_avg = [
    calculate_average_score(sheet_rows)
    for sheet_rows in (
        human_annotations_only_leolani_1,
        human_annotations_only_leolani_2,
        human_annotations_only_leolani_3,
        human_annotations_only_leolani_avg,
    )
]

# Print the average scores, one section per sheet
for sheet_name, sheet_averages in (
    (filename_1, average_scores_1),
    (filename_2, average_scores_2),
    (filename_3, average_scores_3),
    (filename_avg, average_scores_avg),
):
    print(f"Average scores for {emissor_directory}/{sheet_name}:")
    for key, value in sheet_averages.items():
        print(f"{key}: {value:.2f}")
Average scores for ./48ebae93-ae78-41ba-908a-a5f1fa2afe3d/manual_evaluation_1.csv:
Overall Human Rating: 3.09
Interesting: 3.05
Engaging: 2.45
Specific: 3.50
Relevant: 1.45
Correct: 1.75
Semantically Appropriate: 2.75
Understandable: 4.85
Fluent: 4.95
Average scores for ./48ebae93-ae78-41ba-908a-a5f1fa2afe3d/manual_evaluation_2.csv:
Overall Human Rating: 2.09
Interesting: 1.75
Engaging: 1.55
Specific: 2.25
Relevant: 1.10
Correct: 1.10
Semantically Appropriate: 1.20
Understandable: 4.15
Fluent: 3.65
Average scores for ./48ebae93-ae78-41ba-908a-a5f1fa2afe3d/manual_evaluation_3.csv:
Overall Human Rating: 3.09
Interesting: 2.60
Engaging: 1.55
Specific: 3.85
Relevant: 1.40
Correct: 1.30
Semantically Appropriate: 4.80
Understandable: 4.25
Fluent: 4.95
Average scores for ./48ebae93-ae78-41ba-908a-a5f1fa2afe3d/manual_evaluation_avg.csv:
Overall Human Rating: 2.76
Interesting: 2.47
Engaging: 1.85
Specific: 3.20
Relevant: 1.32
Correct: 1.38
Semantically Appropriate: 2.92
Understandable: 4.42
Fluent: 4.53
Calculate the standard deviation per metric per annotator
This measures the variability behind an annotator's average score per metric.
import math
def calculate_annotator_std_dev(annotations):
    """
    Calculate the standard deviation for each metric for a single annotator.

    The metric names are taken from the keys of the first annotation, leaving
    out 'Turn', 'Speaker', 'Response' and 'Reference Response'. The population
    standard deviation (dividing by the number of scores) is used. Unrated
    cells (None or '') are skipped.

    Args:
        annotations (list): A list of dictionaries, each containing metric scores for one annotation.

    Returns:
        dict: A dictionary with metrics as keys and their standard deviations as values.
            An empty input gives {}; a metric without any rated score maps to NaN.

    Raises:
        KeyError: If a later annotation lacks a metric that the first one has.
    """
    # Nothing to measure; also avoids indexing into an empty list below
    if not annotations:
        return {}

    non_metric_keys = {"Turn", "Speaker", "Response", "Reference Response"}
    # Extract all metrics from the first annotation
    metrics = [key for key in annotations[0] if key not in non_metric_keys]

    std_dev = {}
    for metric in metrics:
        # Collect the rated scores only; the loader leaves unrated cells as ''
        scores = [
            annotation[metric]
            for annotation in annotations
            if annotation[metric] is not None and annotation[metric] != ''
        ]
        if not scores:
            # No data for this metric: the deviation is undefined
            std_dev[metric] = math.nan
            continue
        mean = sum(scores) / len(scores)
        variance = sum((x - mean) ** 2 for x in scores) / len(scores)
        std_dev[metric] = math.sqrt(variance)
    return std_dev
# Per-metric spread of each individual annotator's LEOLANI ratings
std_devs_1, std_devs_2, std_devs_3 = [
    calculate_annotator_std_dev(sheet_rows)
    for sheet_rows in (
        human_annotations_only_leolani_1,
        human_annotations_only_leolani_2,
        human_annotations_only_leolani_3,
    )
]

# One printed section per annotator
for section_title, annotator_std_devs in (
    ("Standard deviations for annotator 1:", std_devs_1),
    ("Standard deviations for annotator 2:", std_devs_2),
    ("Standard deviations for annotator 3:", std_devs_3),
):
    print(section_title)
    for key, value in annotator_std_devs.items():
        print(f"{key}: {value:.2f}")
Standard deviations for annotator 1:
Overall Human Rating: 0.39
Interesting: 1.16
Engaging: 0.74
Specific: 1.12
Relevant: 0.80
Correct: 1.13
Semantically Appropriate: 0.83
Understandable: 0.36
Fluent: 0.22
Standard deviations for annotator 2:
Overall Human Rating: 0.52
Interesting: 1.13
Engaging: 1.07
Specific: 1.51
Relevant: 0.44
Correct: 0.44
Semantically Appropriate: 0.60
Understandable: 1.15
Fluent: 0.91
Standard deviations for annotator 3:
Overall Human Rating: 0.45
Interesting: 0.80
Engaging: 0.86
Specific: 1.31
Relevant: 1.02
Correct: 0.95
Semantically Appropriate: 0.60
Understandable: 0.54
Fluent: 0.22
Calculate the standard deviation between annotators
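This measures the variability among the annotators’ average ratings for a specific metric (e.g., “Overall Human Rating”): for each metric, the standard deviation is taken over the three per-annotator average scores. It tells you how consistent the annotators were with each other.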
import math
def calculate_std_dev(dict1, dict2, dict3):
    """
    Compute, metric by metric, the population standard deviation of three scores.

    Args:
        dict1, dict2, dict3: Dictionaries mapping a metric name to a score.
            The metric names are taken from dict1; dict2 and dict3 must
            contain the same names.

    Returns:
        A dictionary with the metrics of dict1 as keys and the standard
        deviation of the three corresponding scores as values.
    """
    def _population_std(samples):
        # Root of the mean squared distance to the mean (divides by n, not n - 1)
        centre = sum(samples) / len(samples)
        mean_square = sum((sample - centre) ** 2 for sample in samples) / len(samples)
        return math.sqrt(mean_square)

    return {
        name: _population_std([dict1[name], dict2[name], dict3[name]])
        for name in dict1.keys()
    }
# Spread of the three annotators' average scores, per metric
std_devs = calculate_std_dev(
    average_scores_1,
    average_scores_2,
    average_scores_3,
)
print("Standard deviation between annotators' averages:")
for metric_name, metric_std in std_devs.items():
    print(f"{metric_name}: {metric_std:.2f}")
Standard deviation between annotators' averages:
Overall Human Rating: 0.47
Interesting: 0.54
Engaging: 0.42
Specific: 0.69
Relevant: 0.15
Correct: 0.27
Semantically Appropriate: 1.47
Understandable: 0.31
Fluent: 0.61
Calculate Spearman’s pairwise correlations
import pandas as pd
from scipy.stats import spearmanr
import numpy as np
def calculate_iaa(annotator1, annotator2):
    """
    Compute the Spearman rank correlation between two annotators per score column.

    The rows of both annotators are paired by position, so both lists must
    describe the same items in the same order.

    Args:
        annotator1, annotator2 (list): Lists of dictionaries, one per rated item,
            each holding the nine score columns listed in `score_columns` below.

    Returns:
        tuple: (correlations, overall_iaa) where `correlations` maps every score
        column to its Spearman correlation, or NaN when the correlation is
        undefined because at least one annotator gave the same value to every
        item (or there are no items); `overall_iaa` is the mean of the non-NaN
        correlations, or NaN if there are none.

    Raises:
        ValueError: If the two lists do not have the same length.
        KeyError: If a row lacks one of the score columns.
    """
    # Ensure both annotators have the same length (same number of sentences)
    if len(annotator1) != len(annotator2):
        raise ValueError("The lists of dictionaries for both annotators must have the same length.")

    # The columns containing the scores
    score_columns = ['Overall Human Rating', 'Interesting', 'Engaging', 'Specific', 'Relevant',
                     'Correct', 'Semantically Appropriate', 'Understandable', 'Fluent']

    # Calculate Spearman correlations for each score type
    correlations = {}
    for col in score_columns:
        scores1 = [entry[col] for entry in annotator1]
        scores2 = [entry[col] for entry in annotator2]
        # A rank correlation needs variation on both sides. Reporting NaN here
        # (rather than a made-up -1 or 1) keeps constant columns from skewing
        # the overall average below.
        if len(set(scores1)) <= 1 or len(set(scores2)) <= 1:
            correlations[col] = np.nan
            continue
        corr, _ = spearmanr(scores1, scores2)
        correlations[col] = corr

    # Calculate the overall IAA (average correlation across all score types)
    valid_correlations = [v for v in correlations.values() if not np.isnan(v)]
    overall_iaa = sum(valid_correlations) / len(valid_correlations) if valid_correlations else np.nan
    return correlations, overall_iaa
# Pairwise agreement: annotator 1 vs annotator 2
correlations, overall_iaa = calculate_iaa(human_annotations_only_leolani_1, human_annotations_only_leolani_2)
print("Spearman Correlations for each score between annotator 1 and 2:")
print(correlations)
print("\nOverall IAA between annotator 1 and 2 (average correlation):")
print(overall_iaa)
print("\n")

# Pairwise agreement: annotator 1 vs annotator 3
correlations, overall_iaa = calculate_iaa(human_annotations_only_leolani_1, human_annotations_only_leolani_3)
print("Spearman Correlations for each score between annotator 1 and 3:")
print(correlations)
print("\nOverall IAA between annotator 1 and 3 (average correlation):")
print(overall_iaa)
print("\n")

# Pairwise agreement: annotator 2 vs annotator 3
correlations, overall_iaa = calculate_iaa(human_annotations_only_leolani_2, human_annotations_only_leolani_3)
# The header must name the pair that was actually compared (2 and 3)
print("Spearman Correlations for each score between annotator 2 and 3:")
print(correlations)
print("\nOverall IAA between annotator 2 and 3 (average correlation):")
print(overall_iaa)
Spearman Correlations for each score between annotator 1 and 2:
{'Overall Human Rating': 0.44956390093073445, 'Interesting': 0.2733788898584856, 'Engaging': 0.11466128743706253, 'Specific': 0.3433128603382996, 'Relevant': 0.4686717179335142, 'Correct': 0.39804896992221994, 'Semantically Appropriate': 0.07856742013183862, 'Understandable': 0.22304336650369327, 'Fluent': -0.332871332649303}
Overall IAA between annotator 1 and 2 (average correlation):
0.22404189782294948
Spearman Correlations for each score between annotator 1 and 3:
{'Overall Human Rating': 0.7418581000416071, 'Interesting': 0.3758468291461006, 'Engaging': 0.1142691300746327, 'Specific': 0.5778086325804389, 'Relevant': 0.38355192625785905, 'Correct': 0.4170251804797899, 'Semantically Appropriate': 0.07856742013183862, 'Understandable': 0.21768641667093697, 'Fluent': -0.05263157894736842}
Overall IAA between annotator 1 and 3 (average correlation):
0.3171091173817595
Spearman Correlations for each score between annotator 1 and 2:
{'Overall Human Rating': 0.38895397930111214, 'Interesting': 0.34664554021794824, 'Engaging': 0.4596692981699865, 'Specific': 0.4567230529598133, 'Relevant': 0.5124889426948922, 'Correct': 0.6491103290458, 'Semantically Appropriate': 0.1111111111111111, 'Understandable': -0.4400465413432393, 'Fluent': 0.16643566632465154}
Overall IAA between annotator 2 and 3 (average correlation):
0.29456570872023063