N_report.py

"""Generate Reports for Validation Data

This script allows the user to generate human-readable reports based on the validation data
produced by 'M_validate.py'.
The reports provide insights into the accuracy of a search algorithm by analyzing various 
statistics and metrics.

The script offers the following features:
1. Display statistics about the validation data and examples used.
2. Calculate the success rate of the search algorithm.
3. Analyze the distribution of correct matches across different positions in the search results.
4. Examine the number of results provided for each query, both for matches and non-matches.
5. Determine the average number of results for correct matches.
6. Identify the most frequently suggested bad entries in the results.
7. Generate alternative statistics after removing the top bad entries.

Usage:
Run this script from the console to select a specific validation file from available validations 
(generated by 'M_validate.py').
The script will then generate a report based on the selected validation data.

Example:
python N_report.py

Note:
    Before using this script, ensure that you have already run 'M_validate.py' to generate the 
    validation data.
"""

import json
import os
import pickle
import re
import sys


def generate_report(validation_file):
    """
    Generate a human-readable report based on the provided validation data.

    Args:
        validation_file (str): The name of the validation data file (pickle format).

    Outputs:
        Displays a comprehensive report on various aspects of the validation data, 
        providing insights into the search algorithm's accuracy.
    """

    # load up raw_data
    with open(f"files/{validation_file}", "rb") as file:
        raw_data = pickle.load(file)

    report_str = str()  # a placeholder for the report as it gets generated

    report_str = (
        report_str
        + """
+-------------------------------------------------+
| How Accurate is the Search Algorithm - A Report |
+-------------------------------------------------+
"""
    )

    with open("englishidioms/phrases.json", encoding="UTF-8") as f:
        phrases = json.load(f)

    report_str = (
        report_str
        + "\n"
        + "{:<30}{:<30}".format(
            "Entries in phrases.json", format(len(phrases["dictionary"]), ",d")
        )
    )

    # how many examples are there in examples.pickle
    with open("files/examples.pickle", "rb") as file:
        examples = pickle.load(file)

    examples_count = 0
    for entry in examples:
        examples_count = examples_count + len(entry[1])

    report_str = (
        report_str
        + "\n"
        + "{:<30}{:<30}".format("Examples ", format(examples_count, ",d"))
    )

    correct_match_count = 0
    for d in raw_data:
        if d["match"]:
            correct_match_count += 1

    report_str = (
        report_str
        + "\n"
        + "{:<30}{:<30}".format("Correct matches", format(correct_match_count, ",d"))
    )

    report_str = (
        report_str
        + "\n"
        + "{:<30}{:<30}".format(
            "Success rate (%)", round((correct_match_count / examples_count) * 100)
        )
    )

    # let's find out how many correct matches were the 1st result, 2nd, 3rd .. etc
    First, Second, Third, Fourth, Fifth, Sixth, Seventh, Eighth, Ninth, Tenth = (
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    )
    for d in raw_data:
        if d["match"] and d["match_index"] == 1:
            First += 1
        elif d["match"] and d["match_index"] == 2:
            Second += 1
        elif d["match"] and d["match_index"] == 3:
            Third += 1
        elif d["match"] and d["match_index"] == 4:
            Fourth += 1
        elif d["match"] and d["match_index"] == 5:
            Fifth += 1
        elif d["match"] and d["match_index"] == 6:
            Sixth += 1
        elif d["match"] and d["match_index"] == 7:
            Seventh += 1
        elif d["match"] and d["match_index"] == 8:
            Eighth += 1
        elif d["match"] and d["match_index"] == 9:
            Ninth += 1
        elif d["match"] and d["match_index"] == 10:
            Tenth += 1

    report_str = report_str + "\n" + "\nDistribution of Correct Matches by Position\n"
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "First", First, f"{round((First / correct_match_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Second", Second, f"{round((Second / correct_match_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Third", Third, f"{round((Third / correct_match_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Fourth", Fourth, f"{round((Fourth / correct_match_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Fifth", Fifth, f"{round((Fifth / correct_match_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Sixth", Sixth, f"{round((Sixth / correct_match_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Seventh", Seventh, f"{round((Seventh / correct_match_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Eighth", Eighth, f"{round((Eighth / correct_match_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Ninth", Ninth, f"{round((Ninth / correct_match_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Tenth", Tenth, f"{round((Tenth / correct_match_count) * 100, 2)}%"
        )
    )

    # let's calculate how many results given for each query - matches and non-matches
    (
        r_zero,
        r_one,
        r_two,
        r_three,
        r_four,
        r_five,
        r_six,
        r_seven,
        r_eight,
        r_nine,
        r_ten,
    ) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
    for d in raw_data:
        if d["results_count"] == 0:
            r_zero += 1
        if d["results_count"] == 1:
            r_one += 1
        if d["results_count"] == 2:
            r_two += 1
        if d["results_count"] == 3:
            r_three += 1
        if d["results_count"] == 4:
            r_four += 1
        if d["results_count"] == 5:
            r_five += 1
        if d["results_count"] == 6:
            r_six += 1
        if d["results_count"] == 7:
            r_seven += 1
        if d["results_count"] == 8:
            r_eight += 1
        if d["results_count"] == 9:
            r_nine += 1
        if d["results_count"] == 10:
            r_ten += 1

    # let's calculate how many results were given for all queries
    all_results_count = 0
    for d in raw_data:
        all_results_count += d["results_count"]

    report_str = (
        report_str
        + "\n"
        + "\nNumber of Results Provided for Each Query - Matches and Non-Matches\n"
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Zero", r_zero, f"{round((r_zero / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "One", r_one, f"{round((r_one / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Two", r_two, f"{round((r_two / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Three", r_three, f"{round((r_three / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Four", r_four, f"{round((r_four / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Five", r_five, f"{round((r_five / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Six", r_six, f"{round((r_six / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Seven", r_seven, f"{round((r_seven / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Eight", r_eight, f"{round((r_eight / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Nine", r_nine, f"{round((r_nine / all_results_count) * 100, 2)}%"
        )
    )
    report_str = (
        report_str
        + "\n"
        + "{:<15}{:<15}{:<15}".format(
            "Ten", r_ten, f"{round((r_ten / all_results_count) * 100, 2)}%"
        )
    )

    # let's see how many results were given in the case of correct matches, 
    # average results count per match, the most repeated bad results
    total_results_count = 0
    for d in raw_data:
        if d["match"]:
            total_results_count += d["results_count"]

    report_str = (
        report_str
        + "\n\n"
        + f"Results given for correct matches: {format(total_results_count, ',d')} "
        + f"(avg. {round(total_results_count / correct_match_count)} result/match)"
    )

    # let's find out the most repeated bad entries
    bad_results = []
    for d in raw_data:
        if d["match"]:
            results_copy = list(d["results"])
            del results_copy[d["match_index"] - 1]
            bad_results.extend(results_copy)

    xx = [tuple(x) for x in bad_results]  # convert the lists in bad_results to tuples

    bad_results_count = {}
    for r in xx:
        bad_results_count[r] = bad_results_count.get(r, 0) + 1

    lst = []
    for key, val in bad_results_count.items():
        newtup = (val, key)
        lst.append(newtup)

    lst = sorted(lst, reverse=True)

    slst = list(lst[:318])  # sliced copy of lst

    bcount = 0  # how many times the top 100 items of bad_results were mentioned
    for val, key in slst:
        # print(key, val)
        bcount = bcount + val

    report_str = (
        report_str
        + "\n\n"
        + f"The {len(slst)} most frequently occurring incorrect entries were suggested"
        + f" {format(bcount, ',d')} times. \n"
        + f"If these {len(slst)} entries were removed from phrases.json, \n"
        + f"results provided for correct matches would total {format(total_results_count - bcount, ',d')}"
        + f" with an average of {round((total_results_count - bcount) / correct_match_count)} "
        + "results per match.\n\n"
    )

    print(report_str)


# check if user has already run 'M_validate.py' and validation data is available
files = os.listdir("files")
validation_files = [f for f in files if "validation_data" in f]

if not validation_files:
    sys.exit("no validations are available, you need to run 'M_validate.py' first")

print("Available Validations")
for i, f in enumerate(validation_files):
    date = re.findall(r"\d+", f)
    print(
        "{:<30}".format(f"{i+1}) {date[0]} {date[1]} {date[2]} {date[3]}:{date[4]}"),
        end="",
    )
    if (i + 1) % 2 == 0 or (i + 1) == len(validation_files):
        print("")

while True:
    user_input = input(
        f"select a validation to generate a report (1-{len(validation_files)}), q to exit "
    )
    if user_input == "q":
        break
    if user_input not in [str(x) for x in range(1, len(validation_files) + 1)]:
        continue

    generate_report(validation_files[int(user_input) - 1])