diff --git a/requirements.txt b/requirements.txt index 87c52ac..f70acc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,7 @@ pytest mypy docformatter pre-commit -pydocstyle==6.1.1 \ No newline at end of file +pydocstyle==6.1.1 +statsmodels>=0.12.2 +pandas==1.3.1 +matplotlib \ No newline at end of file diff --git a/results/quantitative_analysis/README.md b/results/quantitative_analysis/README.md index e69de29..ca39997 100644 --- a/results/quantitative_analysis/README.md +++ b/results/quantitative_analysis/README.md @@ -0,0 +1,70 @@ +# Results of Quantitative Analysis + +The following sections provide the results reported in the paper along with the scripts used to generate the numbers. + +## One-way ANOVA + +![alt text](oneway-anova-table.png) + +The results are generated using [this script](../../scripts/data_analysis/anova.py) and the following command: + +`` python -m scripts.data_analysis.anova --type one-way `` + +## Two-way ANOVA + +![alt text](twoway-anova-table.png) + +The results are generated using [this script](../../scripts/data_analysis/anova.py) and the following command: + +`` python -m scripts.data_analysis.anova --type two-way `` + +## Mean scores + +Mean scores for response usefulness and explanation user ratings for different quality of the explanations and presentation mode can be generated using [this script](../../scripts/data_analysis/mean_scores.py). 
+ +### Mean response usefulness scores for explanations with different levels of accuracy + +![](mean_scores/means_usefulness_explanation_quality.png) + +### Mean response usefulness scores for explanations with different presentation modes + +![](mean_scores/means_usefulness_explanation_presentation.png) + +### Mean explanation ratings for explanations with different levels of accuracy + +![](mean_scores/means_explanation_ratings_explanation_quality.png) + +### Mean explanation ratings for explanations with different presentation modes + +![](mean_scores/means_explanation_ratings_explanation_presentation.png) + +Mean scores for other response dimensions for different quality of the explanations and presentation mode can be generated by adapting [the same script](../../scripts/data_analysis/mean_scores.py). + +## Data distribution + +The distribution of user-judged response dimensions per query for both user studies can be generated using [this script](../../scripts/data_analysis/data_distribution.py) and the following command: + +`` python -m scripts.data_analysis.data_distribution `` + +![](data_distribution.png) + +## Demographic information + +Demographic information about the crowd workers participating in the user study is presented below: + +| Demographic Information | Option | Number of workers | +| --- | --- | --- | +| age | 18-30 | 39 | +| age | 31-45 | 76 | +| age | 46-60 | 41 | +| age | 60+ | 4 | +| age | Prefer not to say | 0 | +| education | High School | 12 | +| education | Bachelor's Degree | 111 | +| education | Master's Degree | 34 | +| education | Ph.D. 
or higher | 3 | +| education | Prefer not to say | 0 | +| gender | Male | 95 | +| gender | Female | 60 | +| gender | Other | 5 | +| gender | Prefer not to say | 0 | \ No newline at end of file diff --git a/results/quantitative_analysis/data_distribution.png b/results/quantitative_analysis/data_distribution.png new file mode 100644 index 0000000..0e85a51 Binary files /dev/null and b/results/quantitative_analysis/data_distribution.png differ diff --git a/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_presentation.png b/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_presentation.png new file mode 100644 index 0000000..7b73c52 Binary files /dev/null and b/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_presentation.png differ diff --git a/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_quality.png b/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_quality.png new file mode 100644 index 0000000..a76e8c2 Binary files /dev/null and b/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_quality.png differ diff --git a/results/quantitative_analysis/mean_scores/means_usefulness_explanation_presentation.png b/results/quantitative_analysis/mean_scores/means_usefulness_explanation_presentation.png new file mode 100644 index 0000000..f8240af Binary files /dev/null and b/results/quantitative_analysis/mean_scores/means_usefulness_explanation_presentation.png differ diff --git a/results/quantitative_analysis/mean_scores/means_usefulness_explanation_quality.png b/results/quantitative_analysis/mean_scores/means_usefulness_explanation_quality.png new file mode 100644 index 0000000..e9f234c Binary files /dev/null and b/results/quantitative_analysis/mean_scores/means_usefulness_explanation_quality.png differ diff --git a/results/quantitative_analysis/oneway-anova-table.png 
b/results/quantitative_analysis/oneway-anova-table.png new file mode 100644 index 0000000..cc036d0 Binary files /dev/null and b/results/quantitative_analysis/oneway-anova-table.png differ diff --git a/results/quantitative_analysis/twoway-anova-table.png b/results/quantitative_analysis/twoway-anova-table.png new file mode 100644 index 0000000..af11441 Binary files /dev/null and b/results/quantitative_analysis/twoway-anova-table.png differ diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/data_analysis/__init__.py b/scripts/data_analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/data_analysis/data_distribution.py b/scripts/data_analysis/data_distribution.py new file mode 100644 index 0000000..55a974c --- /dev/null +++ b/scripts/data_analysis/data_distribution.py @@ -0,0 +1,100 @@ +import collections +from typing import List + +import matplotlib.pyplot as plt +import pandas as pd + + +def process_data_for_distribution_plot( + response_dimensions: List[str], data_df: pd.DataFrame, main_feature: str +): + """Processes the data for the distribution plot. + + Args: + response_dimensions: The response dimensions used in a user study. + data_df: Dataframe containing the results from a user study. + + Returns: + A dictionary containing the data for the distribution plot. 
+ """ + values_per_query = collections.defaultdict(dict) + + for value in list(set(list(data_df[main_feature]))): + for response_dimension in response_dimensions: + query_sub_df = data_df[data_df[main_feature] == value] + query_response_dimension_values = list( + query_sub_df[response_dimension] + ) + values_per_query[value][ + response_dimension + ] = query_response_dimension_values + + return values_per_query + + +if __name__ == "__main__": + aggregated_data = pd.read_csv( + "results/user_study_output/output_processed_aggregated.csv" + ) + + for feature in [ + "source_usefulness", + "warning_usefulness", + "confidence_usefulness", + "conversational_frequency", + "voice_frequency", + ]: + f = [ + int(d.replace("option_", "")) + for d in list(aggregated_data[feature]) + ] + aggregated_data[feature] = f + + response_dimensions = [ + "familiarity", + "interest", + "search_prob", + "relevance", + "correctness", + "completeness", + "comprehensiveness", + "conciseness", + "serendipity", + "coherence", + "factuality", + "fairness", + "readability", + "satisfaction", + "usefulness", + ] + + feature = "questions_ids" + values_per_query = process_data_for_distribution_plot( + response_dimensions, aggregated_data, feature + ) + + n_col = 5 + fig, axs = plt.subplots(3, n_col, figsize=(15, 9)) + + for id, response_dimension in enumerate(response_dimensions): + boxplot_data = [] + + for value in range(1, 11): + boxplot_data.append( + values_per_query[value][response_dimension], + ) + + axs[int(id / n_col)][id % n_col].boxplot(boxplot_data) + axs[int(id / n_col)][id % n_col].set_xlabel("Query ID") + axs[int(id / n_col)][id % n_col].set_ylabel( + "Worker Self-Reported Rating" + ) + axs[int(id / n_col)][id % n_col].set_title( + response_dimension.replace( + "search_prob", "search probability" + ).title() + ) + + fig.tight_layout(pad=1.0) + plt.figure(dpi=2000) + fig.savefig("results/quantitative_analysis/data_distribution.png") diff --git a/scripts/data_analysis/mean_scores.py 
b/scripts/data_analysis/mean_scores.py new file mode 100644 index 0000000..1c6a02c --- /dev/null +++ b/scripts/data_analysis/mean_scores.py @@ -0,0 +1,386 @@ +from typing import List + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + + +def add_bar_labels(ax, bar, labels): + for rect, bar_mean in zip(bar, labels): + height = rect.get_height() + ax.text( + rect.get_x() + rect.get_width() / 2.0, + 1.01 * height, + str(round(bar_mean, 2)), + ha="center", + va="bottom", + fontsize=14, + ) + + +def get_mean_usefulness_scores( + dist_variable_variants: List[str], + dist_variable: str, + data_cond_variants: List[str], + data_cond_variable: str, + resp_dim: str, + data_df: pd.DataFrame, +): + means = {} + means["All data"] = {} + for value in dist_variable_variants: + sub_df = data_df[data_df[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df[resp_dim]) + means["All data"][value] = sum(response_dimension_values) / len( + response_dimension_values + ) + + if data_cond_variable == "response_quality": + sub_label = " Response" + elif data_cond_variable == "additional_info_quality": + sub_label = " Additional Info" + + for data_cond_variant in data_cond_variants: + data_df_sub = data_df[data_df[data_cond_variable] == data_cond_variant] + means[data_cond_variant + sub_label] = {} + + for value in dist_variable_variants: + sub_df = data_df_sub[data_df_sub[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df[resp_dim]) + means[data_cond_variant + sub_label][value] = sum( + response_dimension_values + ) / len(response_dimension_values) + + return means + + +if __name__ == "__main__": + aggregated_data = pd.read_csv( + "results/user_study_output/output_processed_aggregated.csv" + ) + + for feature in [ + "source_usefulness", + "warning_usefulness", + "confidence_usefulness", + "conversational_frequency", + "voice_frequency", + ]: + f = [ + int(d.replace("option_", "")) + for d in 
list(aggregated_data[feature]) + ] + aggregated_data[feature] = f + + aggregated_data = aggregated_data.replace({np.nan: "None"}) + + # Mean response usefulness scores for explanations with different levels of accuracy + + plt.rcParams.update({"font.size": 11}) + + resp_dim = "usefulness" + barWidth = 0.25 + fig, ax = plt.subplots(figsize=(7, 3)) + + means = get_mean_usefulness_scores( + ["Good", "Flawed", "None"], + "additional_info_quality", + ["Good", "Flawed"], + "response_quality", + resp_dim, + aggregated_data, + ) + + br1 = np.arange(len(list(means.keys()))) + br2 = [x + barWidth for x in br1] + br3 = [x + barWidth for x in br2] + + plt.ylim(2, 4) + + print(means) + + bar_1_values = [var_means["Good"] for var_means in means.values()] + bar1 = plt.bar( + br1, + bar_1_values, + color="sandybrown", + width=barWidth, + edgecolor="gray", + label="Accurate", + ) + add_bar_labels(ax, bar1, bar_1_values) + + bar_2_values = [var_means["Flawed"] for var_means in means.values()] + bar2 = plt.bar( + br2, + bar_2_values, + color="bisque", + width=barWidth, + edgecolor="gray", + label="Noisy", + ) + add_bar_labels(ax, bar2, bar_2_values) + + bar_3_values = [var_means["None"] for var_means in means.values()] + bar3 = plt.bar( + br3, + bar_3_values, + color="silver", + width=barWidth, + edgecolor="gray", + label="None", + ) + add_bar_labels(ax, bar3, bar_3_values) + + plt.xticks( + [0.25, 1.25, 2.35], + ["All data", "Perfect response", "Imperfect response"], + fontsize=14, + ) + + plt.legend( + loc="lower right", + fontsize=13, + title="Explanations quality", + title_fontsize=13, + ) + plt.show() + + fig.savefig( + "results/quantitative_analysis/mean_scores/means_usefulness_explanation_quality.png" + ) + + # Mean response usefulness scores for explanations with different presentation modes + + plt.rcParams.update({"font.size": 11}) + + resp_dim = "usefulness" + barWidth = 0.25 + fig, ax = plt.subplots(figsize=(7, 3)) + + dist_variable_variants = ["T", "V", "None"] + 
dist_variable = "presentation_modes" + + data_cond_variants = ["Good", "Flawed"] + data_cond_variable = "additional_info_quality" + + means = get_mean_usefulness_scores( + dist_variable_variants, + dist_variable, + data_cond_variants, + data_cond_variable, + resp_dim, + aggregated_data, + ) + + br1 = np.arange(len(list(means.keys()))) + br2 = [x + barWidth for x in br1] + br3 = [x + barWidth for x in br2[:1]] + + plt.ylim(2, 4) + + bar_1_values = [var_means["T"] for var_means in means.values()] + bar1 = plt.bar( + br1, + bar_1_values, + color="#dfc27d", + width=barWidth, + edgecolor="grey", + label="Textual", + ) + add_bar_labels(ax, bar1, bar_1_values) + + bar_2_values = [var_means["V"] for var_means in means.values()] + bar2 = plt.bar( + br2, + bar_2_values, + color="#80cdc1", + width=barWidth, + edgecolor="grey", + label="Visual", + ) + add_bar_labels(ax, bar2, bar_2_values) + + bar_3_values = [means["All data"]["None"]] + print(bar_3_values) + bar3 = plt.bar( + br3, + bar_3_values, + color="silver", + width=barWidth, + edgecolor="grey", + label="None", + ) + add_bar_labels(ax, bar3, bar_3_values) + + plt.xticks( + [0.25, 1.1, 2.2], + ["All data", "Accurate explanations", "Noisy explanations"], + fontsize=14, + ) + + plt.legend( + loc="lower right", + fontsize=13, + title="Presentation mode", + title_fontsize=13, + ) + plt.show() + + fig.savefig( + "results/quantitative_analysis/mean_scores/means_usefulness_explanation_presentation.png" + ) + + # Mean explanation ratings for explanations with different levels of accuracy + + plt.rcParams.update({"font.size": 11}) + + barWidth = 0.33 + fig, ax = plt.subplots(figsize=(5.5, 3)) + + dist_variable_variants = ["Good", "Flawed"] + dist_variable = "additional_info_quality" + means = {} + + means["Source Revealment"] = {} + for value in dist_variable_variants: + sub_df = aggregated_data[aggregated_data[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df["source_usefulness"]) + 
means["Source Revealment"][value] = sum( + response_dimension_values + ) / len(response_dimension_values) + + means["Confidence Revealment"] = {} + for value in dist_variable_variants: + sub_df = aggregated_data[aggregated_data[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df["confidence_usefulness"]) + means["Confidence Revealment"][value] = sum( + response_dimension_values + ) / len(response_dimension_values) + + br1 = np.arange(len(list(means.keys()))) + br2 = [x + barWidth for x in br1] + + plt.ylim(2, 4) + + print(means) + + bar_1_values = [var_means["Good"] for var_means in means.values()] + bar1 = plt.bar( + br1, + bar_1_values, + color="sandybrown", + width=barWidth, + edgecolor="gray", + label="Accurate", + ) + add_bar_labels(ax, bar1, bar_1_values) + + bar_2_values = [var_means["Flawed"] for var_means in means.values()] + bar2 = plt.bar( + br2, + bar_2_values, + color="bisque", + width=barWidth, + edgecolor="gray", + label="Noisy", + ) + add_bar_labels(ax, bar2, bar_2_values) + + plt.xticks( + [r + barWidth / 2 for r in range(len(list(means.keys())))], + ["Source", "Confidence"], + fontsize=14, + ) + + plt.legend( + loc="lower right", + fontsize=13, + title="Explanations quality", + title_fontsize=13, + ) + plt.show() + + fig.savefig( + "results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_quality.png" + ) + + # Mean explanation ratings for explanations with different presentation modes + + plt.rcParams.update({"font.size": 11}) + + barWidth = 0.33 + fig, ax = plt.subplots(figsize=(5.5, 3)) + + dist_variable_variants = ["T", "V"] + dist_variable = "presentation_modes" + means = {} + + means["Limitation Revealment"] = {} + for value in dist_variable_variants: + sub_df = aggregated_data[aggregated_data[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df["warning_usefulness"]) + means["Limitation Revealment"][value] = sum( + response_dimension_values + ) / 
len(response_dimension_values) + + means["Confidence Revealment"] = {} + for value in dist_variable_variants: + sub_df = aggregated_data[aggregated_data[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df["confidence_usefulness"]) + means["Confidence Revealment"][value] = sum( + response_dimension_values + ) / len(response_dimension_values) + + br1 = np.arange(len(list(means.keys()))) + br2 = [x + barWidth for x in br1] + + plt.ylim(2, 4) + + print(means) + + bar_1_values = [var_means["T"] for var_means in means.values()] + bar1 = plt.bar( + br1, + bar_1_values, + color="#dfc27d", + width=barWidth, + edgecolor="gray", + label="Textual", + ) + add_bar_labels(ax, bar1, bar_1_values) + + bar_2_values = [var_means["V"] for var_means in means.values()] + bar2 = plt.bar( + br2, + bar_2_values, + color="#80cdc1", + width=barWidth, + edgecolor="gray", + label="Visual", + ) + add_bar_labels(ax, bar2, bar_2_values) + + plt.xticks( + [r + barWidth / 2 for r in range(len(list(means.keys())))], + ["Limitation", "Confidence"], + fontsize=14, + ) + + plt.legend( + loc="lower right", + fontsize=13, + title="Presentation mode", + title_fontsize=13, + ) + plt.show() + + fig.savefig( + "results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_presentation.png" + )