import csv
import os

Load evaluation files

# Create a function to load the data from the file

def load_evaluation(emissor_directory: str, filename: str) -> list:
    """
    Load the evaluation data from the <emissor directory>/<filename> file.

    The filename is used as given; no extension is appended.

    The data is expected to be in the following format:
    - The first line is the header, with the column names.
    - The following lines are the data, with the following columns:
      - 'Turn': The dialogue turn
      - 'Speaker': LEOLANI or SPEAKER
      - 'Response': The utterance of the speaker
      - 'Reference Response': The reference response
      - 'Overall Human Rating': the average of the 8 quality ratings
      - 'Interesting': rating from 1 to 5.
      - 'Engaging': rating from 1 to 5.
      - 'Specific': rating from 1 to 5.
      - 'Relevant': rating from 1 to 5.
      - 'Correct': rating from 1 to 5.
      - 'Semantically Appropriate': rating from 1 to 5.
      - 'Understandable': rating from 1 to 5.
      - 'Fluent': rating from 1 to 5.

    Parameters
    ----------
    emissor_directory : str
        The directory where the emissor data is stored.
    filename : str
        The name of the file to load.

    Returns
    -------
    list
        A list of dictionaries, each dictionary represents a row in the CSV file.
        Values in the rating columns are converted to float (a decimal comma is
        accepted); missing or blank ratings are stored as the empty string ''.

    Raises
    ------
    FileNotFoundError
        If the file does not exist in the emissor directory.
    KeyError
        If a data row exists but a rating column is absent from the header.
    ValueError
        If a non-blank rating cannot be parsed as a number.
    """
    numeric_columns = (
        'Overall Human Rating', 'Interesting', 'Engaging', 'Specific', 'Relevant',
        'Correct', 'Semantically Appropriate', 'Understandable', 'Fluent',
    )

    # Check if the file exists in the emissor directory
    file_path = os.path.join(emissor_directory, filename)
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found.")

    # newline='' lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields (e.g. multi-line responses) survive intact.
    with open(file_path, 'r', newline='') as file:
        data = list(csv.DictReader(file))

    # Turn the string values of the numerical columns into numbers
    for row in data:
        for column in numeric_columns:
            raw_value = row[column]

            # DictReader fills cells missing from a short row with None;
            # treat those the same as blank cells.
            if raw_value is None:
                row[column] = ''
                continue

            # Accept a decimal comma by replacing it with a dot
            cleaned = raw_value.replace(',', '.').strip()
            row[column] = float(cleaned) if cleaned else ''

    return data

# Load the evaluation data
emissor_directory = './48ebae93-ae78-41ba-908a-a5f1fa2afe3d'

# Evaluation files to read from the emissor directory
filename_1, filename_2, filename_3, filename_avg = (
    'manual_evaluation_1.csv',
    'manual_evaluation_2.csv',
    'manual_evaluation_3.csv',
    'manual_evaluation_avg.csv',
)

# Load each file into its own list of row dictionaries (evaluated in order)
(
    human_annotations_1,
    human_annotations_2,
    human_annotations_3,
    human_annotations_avg,
) = (
    load_evaluation(emissor_directory, filename_1),
    load_evaluation(emissor_directory, filename_2),
    load_evaluation(emissor_directory, filename_3),
    load_evaluation(emissor_directory, filename_avg),
)

---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

Cell In[2], line 8
      5 filename_3 = 'manual_evaluation_3.csv'
      6 filename_avg = 'manual_evaluation_avg.csv'
----> 8 human_annotations_1 = load_evaluation(emissor_directory, filename_1)
      9 human_annotations_2 = load_evaluation(emissor_directory, filename_2)
     10 human_annotations_3 = load_evaluation(emissor_directory, filename_3)


Cell In[1], line 38, in load_evaluation(emissor_directory, filename)
      4 """
      5 Load the evaluation data from the <emissor directory>/<filename>.csv file.
      6 
   (...)
     34         A list of dictionaries, each dictionary represents a row in the CSV file.
     35 """
     37 # Check if the file exists in the emissor directory
---> 38 file_path = os.path.join(emissor_directory, filename)
     39 if not os.path.exists(file_path):
     40     raise FileNotFoundError(f"File {file_path} not found.")


NameError: name 'os' is not defined

def remove_human_turns(human_annotations: list) -> list:
    """
    Keep only the rows whose Speaker is LEOLANI.

    Parameters
    ----------
    human_annotations
        The list of human annotations (row dictionaries with a 'Speaker' key)

    Returns
    -------
    list
        A new list holding the LEOLANI rows, in their original order.
    """
    leolani_rows = []
    for annotation in human_annotations:
        # Anything not spoken by LEOLANI (e.g. SPEAKER turns) is dropped
        if annotation['Speaker'] != 'LEOLANI':
            continue
        leolani_rows.append(annotation)
    return leolani_rows

# Apply the LEOLANI-only filter to each loaded annotation set (evaluated in order)
(
    human_annotations_only_leolani_1,
    human_annotations_only_leolani_2,
    human_annotations_only_leolani_3,
    human_annotations_only_leolani_avg,
) = (
    remove_human_turns(human_annotations_1),
    remove_human_turns(human_annotations_2),
    remove_human_turns(human_annotations_3),
    remove_human_turns(human_annotations_avg),
)

Calculate the averages per quality score

# Calculate the average score per column per file

def calculate_average_score(human_annotations: list) -> dict:
    """
    Calculate the average score for each column in the human annotations.

    The columns 'Turn', 'Speaker', 'Response' and 'Reference Response' are
    ignored; every other column is treated as a score column.

    Parameters
    ----------
    human_annotations : list
        The list of human annotations

    Returns
    -------
    dict
        A dictionary with the average score for each column. Missing values
        (None, or an empty/blank string) are left out of the average, and a
        column without any usable value is absent from the result. An empty
        input gives an empty dictionary.

    Raises
    ------
    ValueError
        If a non-blank value in a score column cannot be converted to float.
    """
    non_score_columns = {'Turn', 'Speaker', 'Response', 'Reference Response'}

    # Running sum and number of usable scores per column
    sum_scores = {}
    count_scores = {}

    for row in human_annotations:
        for key, value in row.items():
            if key in non_score_columns:
                continue

            # Unrated cells are kept as '' by the loader; float('') would raise,
            # so skip them instead of aborting the whole computation.
            if value is None or (isinstance(value, str) and not value.strip()):
                continue

            sum_scores[key] = sum_scores.get(key, 0.0) + float(value)
            count_scores[key] = count_scores.get(key, 0) + 1

    # Calculate the average score for each column
    return {key: total / count_scores[key] for key, total in sum_scores.items()}

average_scores_1 = calculate_average_score(human_annotations_only_leolani_1)
average_scores_2 = calculate_average_score(human_annotations_only_leolani_2)
average_scores_3 = calculate_average_score(human_annotations_only_leolani_3)
average_scores_avg = calculate_average_score(human_annotations_only_leolani_avg)

# Print the average scores, one report per evaluation file
for report_filename, report_scores in (
    (filename_1, average_scores_1),
    (filename_2, average_scores_2),
    (filename_3, average_scores_3),
    (filename_avg, average_scores_avg),
):
    print(f"Average scores for {emissor_directory}/{report_filename}:")
    for key, value in report_scores.items():
        print(f"{key}: {value:.2f}")

Average scores for ./48ebae93-ae78-41ba-908a-a5f1fa2afe3d/manual_evaluation_1.csv:
Overall Human Rating: 3.09
Interesting: 3.05
Engaging: 2.45
Specific: 3.50
Relevant: 1.45
Correct: 1.75
Semantically Appropriate: 2.75
Understandable: 4.85
Fluent: 4.95
Average scores for ./48ebae93-ae78-41ba-908a-a5f1fa2afe3d/manual_evaluation_2.csv:
Overall Human Rating: 2.09
Interesting: 1.75
Engaging: 1.55
Specific: 2.25
Relevant: 1.10
Correct: 1.10
Semantically Appropriate: 1.20
Understandable: 4.15
Fluent: 3.65
Average scores for ./48ebae93-ae78-41ba-908a-a5f1fa2afe3d/manual_evaluation_3.csv:
Overall Human Rating: 3.09
Interesting: 2.60
Engaging: 1.55
Specific: 3.85
Relevant: 1.40
Correct: 1.30
Semantically Appropriate: 4.80
Understandable: 4.25
Fluent: 4.95
Average scores for ./48ebae93-ae78-41ba-908a-a5f1fa2afe3d/manual_evaluation_avg.csv:
Overall Human Rating: 2.76
Interesting: 2.47
Engaging: 1.85
Specific: 3.20
Relevant: 1.32
Correct: 1.38
Semantically Appropriate: 2.92
Understandable: 4.42
Fluent: 4.53

Calculate the standard deviation per metric per annotator

This measures the variability behind each annotator's average score per metric.

import math

def calculate_annotator_std_dev(annotations):
    """
    Calculate the standard deviation for each metric for a single annotator.

    The population standard deviation is used (the variance is divided by the
    number of scores, not by that number minus one).

    Args:
        annotations (list): A list of dictionaries, each containing metric scores for one annotation.
            The metric names are taken from the first dictionary, excluding
            "Turn", "Speaker", "Response" and "Reference Response".

    Returns:
        dict: A dictionary with metrics as keys and their standard deviations as values.
            Missing scores (absent key, None, or an empty/blank string) are ignored;
            a metric with no usable score is left out. An empty input gives an
            empty dictionary.
    """
    # Without any annotation there is no first row to read the metric names from
    if not annotations:
        return {}

    non_metric_keys = {"Turn", "Speaker", "Response", "Reference Response"}

    # Extract all metrics from the first annotation
    metrics = [key for key in annotations[0] if key not in non_metric_keys]

    std_dev = {}
    for metric in metrics:
        # Collect the usable scores; unrated cells are kept as '' by the loader
        # and would otherwise break sum()
        scores = []
        for annotation in annotations:
            score = annotation.get(metric)
            if score is None or (isinstance(score, str) and not score.strip()):
                continue
            scores.append(float(score))

        if not scores:
            continue

        mean = sum(scores) / len(scores)
        variance = sum((x - mean) ** 2 for x in scores) / len(scores)
        std_dev[metric] = math.sqrt(variance)

    return std_dev

std_devs_1 = calculate_annotator_std_dev(human_annotations_only_leolani_1)
std_devs_2 = calculate_annotator_std_dev(human_annotations_only_leolani_2)
std_devs_3 = calculate_annotator_std_dev(human_annotations_only_leolani_3)

# Print one report per annotator, numbered from 1
for annotator_number, annotator_std_devs in enumerate(
    (std_devs_1, std_devs_2, std_devs_3), start=1
):
    print(f"Standard deviations for annotator {annotator_number}:")
    for key, value in annotator_std_devs.items():
        print(f"{key}: {value:.2f}")

Standard deviations for annotator 1:
Overall Human Rating: 0.39
Interesting: 1.16
Engaging: 0.74
Specific: 1.12
Relevant: 0.80
Correct: 1.13
Semantically Appropriate: 0.83
Understandable: 0.36
Fluent: 0.22
Standard deviations for annotator 2:
Overall Human Rating: 0.52
Interesting: 1.13
Engaging: 1.07
Specific: 1.51
Relevant: 0.44
Correct: 0.44
Semantically Appropriate: 0.60
Understandable: 1.15
Fluent: 0.91
Standard deviations for annotator 3:
Overall Human Rating: 0.45
Interesting: 0.80
Engaging: 0.86
Specific: 1.31
Relevant: 1.02
Correct: 0.95
Semantically Appropriate: 0.60
Understandable: 0.54
Fluent: 0.22

Calculate the standard deviation between annotators

This measures the variability among the annotators’ average scores for a specific metric (e.g., “Overall Human Rating”). It tells you how consistent the annotators were with each other on average.

import math

def calculate_std_dev(dict1, dict2, dict3):
    """
    Compute, for every metric, the standard deviation of the three annotators' scores.

    The population form is used: the squared deviations are averaged over the
    three values.

    Args:
        dict1, dict2, dict3: Dictionaries mapping metric names to scores. The
            metric names are taken from dict1 and looked up in all three.

    Returns:
        A dictionary with metrics as keys and their standard deviations as values.
    """
    def population_std(samples):
        # Mean first, then the mean squared distance from it
        centre = sum(samples) / len(samples)
        spread = sum((sample - centre) ** 2 for sample in samples) / len(samples)
        return math.sqrt(spread)

    return {
        name: population_std([dict1[name], dict2[name], dict3[name]])
        for name in dict1
    }

# Spread of the three annotators' average scores, per metric
std_devs = calculate_std_dev(
    average_scores_1,
    average_scores_2,
    average_scores_3,
)

print("Standard deviation between annotators' averages:")
for key, value in std_devs.items():
    print("{}: {:.2f}".format(key, value))

Standard deviation between annotators' averages:
Overall Human Rating: 0.47
Interesting: 0.54
Engaging: 0.42
Specific: 0.69
Relevant: 0.15
Correct: 0.27
Semantically Appropriate: 1.47
Understandable: 0.31
Fluent: 0.61

Calculate Spearman’s pairwise correlations

import pandas as pd
from scipy.stats import spearmanr
import numpy as np

def calculate_iaa(annotator1, annotator2):
    """
    Calculate the inter-annotator agreement (IAA) between two annotators.

    For every score column, Spearman's rank correlation is computed between
    the two annotators' scores, pairing the rows by position.

    Args:
        annotator1 (list): A list of dictionaries with the scores of the first annotator.
        annotator2 (list): A list of dictionaries with the scores of the second annotator,
            in the same order and of the same length as annotator1.

    Returns:
        tuple: (correlations, overall_iaa) where correlations maps each score
            column to its Spearman correlation and overall_iaa is the mean of
            the correlations that are not NaN (NaN if there are none).
            A column's correlation is NaN when either annotator gave the same
            score on every row, because rank correlation is undefined for a
            constant input.

    Raises:
        ValueError: If the two lists do not have the same length.
    """
    # Ensure both annotators have the same length (same number of sentences)
    if len(annotator1) != len(annotator2):
        raise ValueError("The lists of dictionaries for both annotators must have the same length.")

    # The columns containing the scores
    score_columns = ['Overall Human Rating', 'Interesting', 'Engaging', 'Specific', 'Relevant',
                     'Correct', 'Semantically Appropriate', 'Understandable', 'Fluent']

    # Calculate Spearman correlations for each score type
    correlations = {}
    for col in score_columns:
        scores1 = [entry[col] for entry in annotator1]
        scores2 = [entry[col] for entry in annotator2]

        # A constant (or empty) series has no rank variation, so the correlation
        # is undefined. Report NaN instead of inventing a value, so that it is
        # excluded from the overall average below.
        if len(set(scores1)) < 2 or len(set(scores2)) < 2:
            correlations[col] = np.nan
            continue

        corr, _ = spearmanr(scores1, scores2)
        correlations[col] = corr

    # Calculate the overall IAA (average correlation across all score types)
    valid_correlations = [v for v in correlations.values() if not np.isnan(v)]
    overall_iaa = sum(valid_correlations) / len(valid_correlations) if valid_correlations else np.nan

    return correlations, overall_iaa

# Pairwise agreement: annotator 1 vs annotator 2
correlations, overall_iaa = calculate_iaa(human_annotations_only_leolani_1, human_annotations_only_leolani_2)
print("Spearman Correlations for each score between annotator 1 and 2:")
print(correlations)
print("\nOverall IAA between annotator 1 and 2 (average correlation):")
print(overall_iaa)
print("\n")

# Pairwise agreement: annotator 1 vs annotator 3
correlations, overall_iaa = calculate_iaa(human_annotations_only_leolani_1, human_annotations_only_leolani_3)
print("Spearman Correlations for each score between annotator 1 and 3:")
print(correlations)
print("\nOverall IAA between annotator 1 and 3 (average correlation):")
print(overall_iaa)
print("\n")

# Pairwise agreement: annotator 2 vs annotator 3
# (the heading previously said "annotator 1 and 2", mislabelling these results)
correlations, overall_iaa = calculate_iaa(human_annotations_only_leolani_2, human_annotations_only_leolani_3)
print("Spearman Correlations for each score between annotator 2 and 3:")
print(correlations)
print("\nOverall IAA between annotator 2 and 3 (average correlation):")
print(overall_iaa)

Spearman Correlations for each score between annotator 1 and 2:
{'Overall Human Rating': 0.44956390093073445, 'Interesting': 0.2733788898584856, 'Engaging': 0.11466128743706253, 'Specific': 0.3433128603382996, 'Relevant': 0.4686717179335142, 'Correct': 0.39804896992221994, 'Semantically Appropriate': 0.07856742013183862, 'Understandable': 0.22304336650369327, 'Fluent': -0.332871332649303}

Overall IAA between annotator 1 and 2 (average correlation):
0.22404189782294948


Spearman Correlations for each score between annotator 1 and 3:
{'Overall Human Rating': 0.7418581000416071, 'Interesting': 0.3758468291461006, 'Engaging': 0.1142691300746327, 'Specific': 0.5778086325804389, 'Relevant': 0.38355192625785905, 'Correct': 0.4170251804797899, 'Semantically Appropriate': 0.07856742013183862, 'Understandable': 0.21768641667093697, 'Fluent': -0.05263157894736842}

Overall IAA between annotator 1 and 3 (average correlation):
0.3171091173817595


Spearman Correlations for each score between annotator 1 and 2:
{'Overall Human Rating': 0.38895397930111214, 'Interesting': 0.34664554021794824, 'Engaging': 0.4596692981699865, 'Specific': 0.4567230529598133, 'Relevant': 0.5124889426948922, 'Correct': 0.6491103290458, 'Semantically Appropriate': 0.1111111111111111, 'Understandable': -0.4400465413432393, 'Fluent': 0.16643566632465154}

Overall IAA between annotator 2 and 3 (average correlation):
0.29456570872023063