diff --git a/requirements.txt b/requirements.txt index 87c52ac..f70acc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,7 @@ pytest mypy docformatter pre-commit -pydocstyle==6.1.1 \ No newline at end of file +pydocstyle==6.1.1 +statsmodels>=0.12.2 +pandas==1.3.1 +matplotlib \ No newline at end of file diff --git a/results/quantitative_analysis/README.md b/results/quantitative_analysis/README.md index e69de29..ca39997 100644 --- a/results/quantitative_analysis/README.md +++ b/results/quantitative_analysis/README.md @@ -0,0 +1,70 @@ +# Results of Quantitative Analysis + +The following sections provide the results reported in the paper along with the scripts used to generate the numbers. + +## One-way ANOVA + +![alt text](oneway-anova-table.png) + +The results are generated using [this script](../../scripts/data_analysis/anova.py) and the following command: + +`` python -m scripts.data_analysis.anova --type one-way `` + +## Two-way ANOVA + +![alt text](twoway-anova-table.png) + +The results are generated using [this script](../../scripts/data_analysis/anova.py) and the following command: + +`` python -m scripts.data_analysis.anova --type two-way `` + +## Mean scores + +Mean scores for response usefulness and explanation user ratings for different quality of the explanations and presentation mode can be generated using [this script](../../scripts/data_analysis/mean_scores.py). 
+ +### Mean response usefulness scores for explanations with different levels of accuracy + +![](mean_scores/means_usefulness_explanation_quality.png) + +### Mean response usefulness scores for explanations with different presentation modes + +![](mean_scores/means_usefulness_explanation_presentation.png) + +### Mean explanation ratings for explanations with different levels of accuracy + +![](mean_scores/means_explanation_ratings_explanation_quality.png) + +### Mean explanation ratings for explanations with different presentation modes + +![](mean_scores/means_explanation_ratings_explanation_presentation.png) + +Mean scores for other response dimensions for different quality of the explanations and presentation mode can be generated by adapting [the same script](../../scripts/data_analysis/mean_scores.py). + +## Data distribution + +The distribution of user-judged response dimensions per query for both user studies can be generated using [this script](../../scripts/data_analysis/data_distribution.py) and the following command: + +`` python -m scripts.data_analysis.data_distribution `` + +![](data_distribution.png) + +## Demographic information + +Demographic information about the crowd workers participating in the user study is presented below: + +| Demographic Information | Option | Number of workers | +| --- | --- | --- | +| age | 18-30 | 39 | +| age | 31-45 | 76 | +| age | 46-60 | 41 | +| age | 60+ | 4 | +| age | Prefer not to say | 0 | +| education | High School | 12 | +| education | Bachelor's Degree | 111 | +| education | Master's Degree | 34 | +| education | Ph.D. 
or higher | 3 | +| education | Prefer not to say | 0 | +| gender | Male | 95 | +| gender | Female | 60 | +| gender | Other | 5 | +| gender | Prefer not to say | 0 | \ No newline at end of file diff --git a/results/quantitative_analysis/data_distribution.png b/results/quantitative_analysis/data_distribution.png new file mode 100644 index 0000000..0e85a51 Binary files /dev/null and b/results/quantitative_analysis/data_distribution.png differ diff --git a/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_presentation.png b/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_presentation.png new file mode 100644 index 0000000..7b73c52 Binary files /dev/null and b/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_presentation.png differ diff --git a/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_quality.png b/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_quality.png new file mode 100644 index 0000000..a76e8c2 Binary files /dev/null and b/results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_quality.png differ diff --git a/results/quantitative_analysis/mean_scores/means_usefulness_explanation_presentation.png b/results/quantitative_analysis/mean_scores/means_usefulness_explanation_presentation.png new file mode 100644 index 0000000..f8240af Binary files /dev/null and b/results/quantitative_analysis/mean_scores/means_usefulness_explanation_presentation.png differ diff --git a/results/quantitative_analysis/mean_scores/means_usefulness_explanation_quality.png b/results/quantitative_analysis/mean_scores/means_usefulness_explanation_quality.png new file mode 100644 index 0000000..e9f234c Binary files /dev/null and b/results/quantitative_analysis/mean_scores/means_usefulness_explanation_quality.png differ diff --git a/results/quantitative_analysis/oneway-anova-table.png 
b/results/quantitative_analysis/oneway-anova-table.png new file mode 100644 index 0000000..cc036d0 Binary files /dev/null and b/results/quantitative_analysis/oneway-anova-table.png differ diff --git a/results/quantitative_analysis/twoway-anova-table.png b/results/quantitative_analysis/twoway-anova-table.png new file mode 100644 index 0000000..af11441 Binary files /dev/null and b/results/quantitative_analysis/twoway-anova-table.png differ diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/data_analysis/__init__.py b/scripts/data_analysis/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/data_analysis/data_distribution.py b/scripts/data_analysis/data_distribution.py new file mode 100644 index 0000000..55a974c --- /dev/null +++ b/scripts/data_analysis/data_distribution.py @@ -0,0 +1,100 @@ +import collections +from typing import List + +import matplotlib.pyplot as plt +import pandas as pd + + +def process_data_for_distribution_plot( + response_dimensions: List[str], data_df: pd.DataFrame, main_feature: str +): + """Processes the data for the distribution plot. + + Args: + response_dimensions: The response dimensions used in a user study. + data_df: Dataframe containing the results from a user study. + + Returns: + A dictionary containing the data for the distribution plot. 
+ """ + values_per_query = collections.defaultdict(dict) + + for value in list(set(list(data_df[main_feature]))): + for response_dimension in response_dimensions: + query_sub_df = data_df[data_df[main_feature] == value] + query_response_dimension_values = list( + query_sub_df[response_dimension] + ) + values_per_query[value][ + response_dimension + ] = query_response_dimension_values + + return values_per_query + + +if __name__ == "__main__": + aggregated_data = pd.read_csv( + "results/user_study_output/output_processed_aggregated.csv" + ) + + for feature in [ + "source_usefulness", + "warning_usefulness", + "confidence_usefulness", + "conversational_frequency", + "voice_frequency", + ]: + f = [ + int(d.replace("option_", "")) + for d in list(aggregated_data[feature]) + ] + aggregated_data[feature] = f + + response_dimensions = [ + "familiarity", + "interest", + "search_prob", + "relevance", + "correctness", + "completeness", + "comprehensiveness", + "conciseness", + "serendipity", + "coherence", + "factuality", + "fairness", + "readability", + "satisfaction", + "usefulness", + ] + + feature = "questions_ids" + values_per_query = process_data_for_distribution_plot( + response_dimensions, aggregated_data, feature + ) + + n_col = 5 + fig, axs = plt.subplots(3, n_col, figsize=(15, 9)) + + for id, response_dimension in enumerate(response_dimensions): + boxplot_data = [] + + for value in range(1, 11): + boxplot_data.append( + values_per_query[value][response_dimension], + ) + + axs[int(id / n_col)][id % n_col].boxplot(boxplot_data) + axs[int(id / n_col)][id % n_col].set_xlabel("Query ID") + axs[int(id / n_col)][id % n_col].set_ylabel( + "Worker Self-Reported Rating" + ) + axs[int(id / n_col)][id % n_col].set_title( + response_dimension.replace( + "search_prob", "search probability" + ).title() + ) + + fig.tight_layout(pad=1.0) + plt.figure(dpi=2000) + fig.savefig("results/quantitative_analysis/data_distribution.png") diff --git a/scripts/data_analysis/mean_scores.py 
b/scripts/data_analysis/mean_scores.py new file mode 100644 index 0000000..1c6a02c --- /dev/null +++ b/scripts/data_analysis/mean_scores.py @@ -0,0 +1,386 @@ +from typing import List + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + + +def add_bar_labels(ax, bar, labels): + for rect, bar_mean in zip(bar, labels): + height = rect.get_height() + ax.text( + rect.get_x() + rect.get_width() / 2.0, + 1.01 * height, + str(round(bar_mean, 2)), + ha="center", + va="bottom", + fontsize=14, + ) + + +def get_mean_usefulness_scores( + dist_variable_variants: List[str], + dist_variable: str, + data_cond_variants: List[str], + data_cond_variable: str, + resp_dim: str, + data_df: pd.DataFrame, +): + means = {} + means["All data"] = {} + for value in dist_variable_variants: + sub_df = data_df[data_df[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df[resp_dim]) + means["All data"][value] = sum(response_dimension_values) / len( + response_dimension_values + ) + + if data_cond_variable == "response_quality": + sub_label = " Response" + elif data_cond_variable == "additional_info_quality": + sub_label = " Additional Info" + + for data_cond_variant in data_cond_variants: + data_df_sub = data_df[data_df[data_cond_variable] == data_cond_variant] + means[data_cond_variant + sub_label] = {} + + for value in dist_variable_variants: + sub_df = data_df_sub[data_df_sub[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df[resp_dim]) + means[data_cond_variant + sub_label][value] = sum( + response_dimension_values + ) / len(response_dimension_values) + + return means + + +if __name__ == "__main__": + aggregated_data = pd.read_csv( + "results/user_study_output/output_processed_aggregated.csv" + ) + + for feature in [ + "source_usefulness", + "warning_usefulness", + "confidence_usefulness", + "conversational_frequency", + "voice_frequency", + ]: + f = [ + int(d.replace("option_", "")) + for d in 
list(aggregated_data[feature]) + ] + aggregated_data[feature] = f + + aggregated_data = aggregated_data.replace({np.nan: "None"}) + + # Mean response usefulness scores for explanations with different levels of accuracy + + plt.rcParams.update({"font.size": 11}) + + resp_dim = "usefulness" + barWidth = 0.25 + fig, ax = plt.subplots(figsize=(7, 3)) + + means = get_mean_usefulness_scores( + ["Good", "Flawed", "None"], + "additional_info_quality", + ["Good", "Flawed"], + "response_quality", + resp_dim, + aggregated_data, + ) + + br1 = np.arange(len(list(means.keys()))) + br2 = [x + barWidth for x in br1] + br3 = [x + barWidth for x in br2] + + plt.ylim(2, 4) + + print(means) + + bar_1_values = [var_means["Good"] for var_means in means.values()] + bar1 = plt.bar( + br1, + bar_1_values, + color="sandybrown", + width=barWidth, + edgecolor="gray", + label="Accurate", + ) + add_bar_labels(ax, bar1, bar_1_values) + + bar_2_values = [var_means["Flawed"] for var_means in means.values()] + bar2 = plt.bar( + br2, + bar_2_values, + color="bisque", + width=barWidth, + edgecolor="gray", + label="Noisy", + ) + add_bar_labels(ax, bar2, bar_2_values) + + bar_3_values = [var_means["None"] for var_means in means.values()] + bar3 = plt.bar( + br3, + bar_3_values, + color="silver", + width=barWidth, + edgecolor="gray", + label="None", + ) + add_bar_labels(ax, bar3, bar_3_values) + + plt.xticks( + [0.25, 1.25, 2.35], + ["All data", "Perfect response", "Imperfect response"], + fontsize=14, + ) + + plt.legend( + loc="lower right", + fontsize=13, + title="Explanations quality", + title_fontsize=13, + ) + plt.show() + + fig.savefig( + "results/quantitative_analysis/mean_scores/means_usefulness_explanation_quality.png" + ) + + # Mean response usefulness scores for explanations with different presentation modes + + plt.rcParams.update({"font.size": 11}) + + resp_dim = "usefulness" + barWidth = 0.25 + fig, ax = plt.subplots(figsize=(7, 3)) + + dist_variable_variants = ["T", "V", "None"] + 
dist_variable = "presentation_modes" + + data_cond_variants = ["Good", "Flawed"] + data_cond_variable = "additional_info_quality" + + means = get_mean_usefulness_scores( + dist_variable_variants, + dist_variable, + data_cond_variants, + data_cond_variable, + resp_dim, + aggregated_data, + ) + + br1 = np.arange(len(list(means.keys()))) + br2 = [x + barWidth for x in br1] + br3 = [x + barWidth for x in br2[:1]] + + plt.ylim(2, 4) + + bar_1_values = [var_means["T"] for var_means in means.values()] + bar1 = plt.bar( + br1, + bar_1_values, + color="#dfc27d", + width=barWidth, + edgecolor="grey", + label="Textual", + ) + add_bar_labels(ax, bar1, bar_1_values) + + bar_2_values = [var_means["V"] for var_means in means.values()] + bar2 = plt.bar( + br2, + bar_2_values, + color="#80cdc1", + width=barWidth, + edgecolor="grey", + label="Visual", + ) + add_bar_labels(ax, bar2, bar_2_values) + + bar_3_values = [means["All data"]["None"]] + print(bar_3_values) + bar3 = plt.bar( + br3, + bar_3_values, + color="silver", + width=barWidth, + edgecolor="grey", + label="None", + ) + add_bar_labels(ax, bar3, bar_3_values) + + plt.xticks( + [0.25, 1.1, 2.2], + ["All data", "Accurate explanations", "Noisy explanations"], + fontsize=14, + ) + + plt.legend( + loc="lower right", + fontsize=13, + title="Presentation mode", + title_fontsize=13, + ) + plt.show() + + fig.savefig( + "results/quantitative_analysis/mean_scores/means_usefulness_explanation_presentation.png" + ) + + # Mean explanation ratings for explanations with different levels of accuracy + + plt.rcParams.update({"font.size": 11}) + + barWidth = 0.33 + fig, ax = plt.subplots(figsize=(5.5, 3)) + + dist_variable_variants = ["Good", "Flawed"] + dist_variable = "additional_info_quality" + means = {} + + means["Source Revealment"] = {} + for value in dist_variable_variants: + sub_df = aggregated_data[aggregated_data[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df["source_usefulness"]) + 
means["Source Revealment"][value] = sum( + response_dimension_values + ) / len(response_dimension_values) + + means["Confidence Revealment"] = {} + for value in dist_variable_variants: + sub_df = aggregated_data[aggregated_data[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df["confidence_usefulness"]) + means["Confidence Revealment"][value] = sum( + response_dimension_values + ) / len(response_dimension_values) + + br1 = np.arange(len(list(means.keys()))) + br2 = [x + barWidth for x in br1] + + plt.ylim(2, 4) + + print(means) + + bar_1_values = [var_means["Good"] for var_means in means.values()] + bar1 = plt.bar( + br1, + bar_1_values, + color="sandybrown", + width=barWidth, + edgecolor="gray", + label="Accurate", + ) + add_bar_labels(ax, bar1, bar_1_values) + + bar_2_values = [var_means["Flawed"] for var_means in means.values()] + bar2 = plt.bar( + br2, + bar_2_values, + color="bisque", + width=barWidth, + edgecolor="gray", + label="Noisy", + ) + add_bar_labels(ax, bar2, bar_2_values) + + plt.xticks( + [r + barWidth / 2 for r in range(len(list(means.keys())))], + ["Source", "Confidence"], + fontsize=14, + ) + + plt.legend( + loc="lower right", + fontsize=13, + title="Explanations quality", + title_fontsize=13, + ) + plt.show() + + fig.savefig( + "results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_quality.png" + ) + + # Mean explanation ratings for explanations with different presentation modes + + plt.rcParams.update({"font.size": 11}) + + barWidth = 0.33 + fig, ax = plt.subplots(figsize=(5.5, 3)) + + dist_variable_variants = ["T", "V"] + dist_variable = "presentation_modes" + means = {} + + means["Limitation Revealment"] = {} + for value in dist_variable_variants: + sub_df = aggregated_data[aggregated_data[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df["warning_usefulness"]) + means["Limitation Revealment"][value] = sum( + response_dimension_values + ) / 
len(response_dimension_values) + + means["Confidence Revealment"] = {} + for value in dist_variable_variants: + sub_df = aggregated_data[aggregated_data[dist_variable] == value] + if len(sub_df) > 0: + response_dimension_values = list(sub_df["confidence_usefulness"]) + means["Confidence Revealment"][value] = sum( + response_dimension_values + ) / len(response_dimension_values) + + br1 = np.arange(len(list(means.keys()))) + br2 = [x + barWidth for x in br1] + + plt.ylim(2, 4) + + print(means) + + bar_1_values = [var_means["T"] for var_means in means.values()] + bar1 = plt.bar( + br1, + bar_1_values, + color="#dfc27d", + width=barWidth, + edgecolor="gray", + label="Textual", + ) + add_bar_labels(ax, bar1, bar_1_values) + + bar_2_values = [var_means["V"] for var_means in means.values()] + bar2 = plt.bar( + br2, + bar_2_values, + color="#80cdc1", + width=barWidth, + edgecolor="gray", + label="Visual", + ) + add_bar_labels(ax, bar2, bar_2_values) + + plt.xticks( + [r + barWidth / 2 for r in range(len(list(means.keys())))], + ["Limitation", "Confidence"], + fontsize=14, + ) + + plt.legend( + loc="lower right", + fontsize=13, + title="Presentation mode", + title_fontsize=13, + ) + plt.show() + + fig.savefig( + "results/quantitative_analysis/mean_scores/means_explanation_ratings_explanation_presentation.png" + )