# benchmark/plots.py

import matplotlib.pyplot as plt
import numpy as np
from imgcat import imgcat

from aider.dump import dump  # noqa: F401


def plot_timing(df):
    """Plot the average duration of each (model, edit_format) as grouped bars.

    Reads the "model", "edit_format" and "avg_duration" columns of ``df``,
    averages the duration for every (model, edit_format) pair, saves the chart
    to ``tmp_timing.svg`` in the current directory and displays it via
    ``imgcat``.

    Side effects: updates the global matplotlib ``rcParams`` (hatch and font
    settings), so later figures in the same process inherit them.
    """
    from matplotlib import rc

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Pivot so that rows are models and columns are edit formats.
    grouped = df.groupby(["model", "edit_format"])["avg_duration"].mean().unstack()
    num_models, num_formats = grouped.shape

    pos = np.arange(num_models)
    # Each model gets a 0.8-wide slot shared evenly between its formats.
    width = 0.8 / num_formats
    edge = dict(edgecolor="#ffffff", linewidth=1.5)

    for i, fmt in enumerate(grouped.columns):
        durations = grouped[fmt]
        rects = ax.bar(
            pos + i * width,
            durations,
            width * 0.95,
            label=fmt,
            color="#b3e6a8" if "diff" in fmt else "#b3d1e6",
            hatch="////" if "func" in fmt else "",
            zorder=2,  # keep the bars above the grid lines (zorder=0)
            **edge,
        )
        ax.bar_label(rects, padding=4, labels=[f"{v:.1f}s" for v in durations], size=6)

    # Bars of one model are centred at pos, pos + width, ..., so the middle of
    # the group sits (num_formats - 1) * width / 2 to the right of pos. This
    # equals the previous fixed 0.5 * width offset when there are two formats.
    ax.set_xticks(pos + (num_formats - 1) * width / 2)
    ax.set_xticklabels(grouped.index)

    ax.set_ylabel("Average GPT response time\nper exercise (sec)")
    ax.set_title("GPT Code Editing Speed\n(time per coding task)")
    ax.legend(
        title="Edit Format",
        loc="upper left",
    )
    # Leave 10% headroom above the tallest bar; DataFrame.max() skips NaN cells
    # that unstack() creates for missing (model, edit_format) combinations.
    ax.set_ylim(top=grouped.max().max() * 1.1)

    plt.tight_layout()
    plt.savefig("tmp_timing.svg")
    imgcat(fig)


def _split_model_label(model):
    """Wrap a dash-separated model name onto two lines after its second piece.

    Names with two or fewer pieces are returned unchanged, so they do not end
    up with a dangling "-" followed by an empty second line.
    """
    pieces = model.split("-")
    if len(pieces) <= 2:
        return model
    return "-".join(pieces[:2]) + "-\n" + "-".join(pieces[2:])


def plot_outcomes(df, repeats, repeat_hi, repeat_lo, repeat_avg):
    """Plot pass rates per (model, edit_format) as overlaid grouped bars.

    Reads the "model", "edit_format", "pass_rate_1" and "pass_rate_2" columns
    of ``df``. The mean ``pass_rate_2`` bars are drawn first and carry the
    legend entries and percentage labels; the mean ``pass_rate_1`` bars are
    drawn on top of them. The chart is saved to ``tmp.svg`` and displayed via
    ``imgcat``.

    Args:
        df: DataFrame with the columns listed above.
        repeats: Sized collection; an error bar is drawn only when it is
            non-empty (only its length is inspected here).
        repeat_hi: Upper extent of the error bar, passed as the upper ``yerr``.
        repeat_lo: Lower extent of the error bar, passed as the lower ``yerr``.
        repeat_avg: Y position of the error bar.

    Side effects: updates the global matplotlib ``rcParams`` (hatch and font
    settings).
    """
    from matplotlib import rc

    by_model_and_format = df.groupby(["model", "edit_format"])
    # Order matters: the first series is the labelled one, the second is
    # painted over it with a higher zorder.
    tries = [
        by_model_and_format["pass_rate_2"].mean(),
        by_model_and_format["pass_rate_1"].mean(),
    ]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    for zorder, grouped in enumerate(tries, start=2):
        # Rows are models, columns are edit formats.
        table = grouped.unstack()
        num_models, num_formats = table.shape

        pos = np.arange(num_models)
        width = 0.8 / num_formats
        models = table.index

        # Only the first (bottom) series gets legend entries and bar labels.
        is_labelled = zorder == 2

        for i, fmt in enumerate(table.columns):
            bar_kwargs = dict(edgecolor="#ffffff", linewidth=1.5)
            if is_labelled:
                bar_kwargs["label"] = fmt

            rects = ax.bar(
                pos + i * width,
                table[fmt],
                width * 0.95,
                color="#b3e6a8" if "diff" in fmt else "#b3d1e6",
                hatch="////" if "func" in fmt else "",
                zorder=zorder,
                **bar_kwargs,
            )
            if is_labelled:
                ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in table[fmt]], size=6)

    if len(repeats):
        # NOTE(review): x=1.4 is hard-coded; presumably it targets one specific
        # bar of the benchmark dataset -- confirm before reusing with other data.
        ax.errorbar(
            1.4,
            repeat_avg,
            yerr=[[repeat_lo], [repeat_hi]],
            fmt="none",
            zorder=5,
            capsize=2.5,
            elinewidth=1,
            markeredgewidth=1,
        )

    # Centre each tick under its group of bars; this equals the previous fixed
    # 0.5 * width offset when there are exactly two edit formats.
    ax.set_xticks(pos + (num_formats - 1) * width / 2)
    ax.set_xticklabels([_split_model_label(model) for model in models])

    # NOTE(review): the arrow targets below are fixed data coordinates,
    # presumably tuned by hand for a particular set of results.
    top = 95
    ax.annotate(
        "First attempt,\nbased on\nnatural language\ninstructions",
        xy=(2.20, 41),
        xytext=(2, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )
    ax.annotate(
        "Second attempt,\nincluding unit test\nerror output",
        xy=(2.55, 56),
        xytext=(3.5, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title("GPT Code Editing Skill\n(percent coding tasks correct)")
    ax.legend(
        title="Edit Format",
        loc="upper left",
    )
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)


def plot_outcomes_claude(df):
    """Plot per-model pass rates as overlaid bars, one bar position per row.

    Reads the "model", "pass_rate_1" and "pass_rate_2" columns of ``df``. The
    ``pass_rate_2`` bars are drawn first and carry the percentage labels; the
    ``pass_rate_1`` bars are drawn afterwards at the same positions. The chart
    is saved to ``tmp.svg`` and displayed via ``imgcat``.

    Side effects:
        * Rewrites ``df["model"]`` in the caller's DataFrame, replacing
          "gpt-4-0314" with "gpt-4-0613".
        * Prints the full frame and each plotted sub-frame to stdout.
        * Updates the global matplotlib ``rcParams`` (hatch and font settings).
    """
    from matplotlib import rc

    print(df)

    # Fix wrong column label
    df["model"] = df["model"].replace("gpt-4-0314", "gpt-4-0613")

    # Order matters: the first frame is the labelled one, the second is drawn
    # afterwards at the same bar positions.
    tries = [
        df[["model", "pass_rate_2"]],
        df[["model", "pass_rate_1"]],
    ]

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # Both frames have one row per input row, so the layout is shared.
    pos = np.arange(len(df))
    width = 0.6
    bar_centers = pos + 0.5 * width

    # Per-bar colors, by row position.
    colors = [
        "#b3e6a8",
        "#b3e6a8",
        "#b3e6a8",
        "#b3d1e6",
    ]

    for zorder, attempt in enumerate(tries, start=2):
        print(attempt)

        pass_rates = attempt.iloc[:, 1]
        is_labelled = zorder == 2

        bar_kwargs = dict(edgecolor="#ffffff", linewidth=1.5)
        if is_labelled:
            # Placeholder label; no legend is drawn for this chart.
            bar_kwargs["label"] = "??"

        rects = ax.bar(
            bar_centers,
            pass_rates,
            width * 0.95,
            color=colors,
            **bar_kwargs,
        )
        if is_labelled:
            ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in pass_rates], size=6)

    ax.set_xticks(bar_centers)

    # Multi-line tick labels for known models; unknown names are shown as-is.
    model_map = {
        "gpt-4-0613": "gpt-4-\n0613",
        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
    }
    model_labels = [model_map.get(model, model) for model in df["model"]]
    ax.set_xticklabels(model_labels, rotation=0)

    # NOTE(review): the arrow targets below are fixed data coordinates,
    # presumably tuned by hand for a particular set of results.
    top = 95
    ax.annotate(
        "First attempt,\nbased on\nnatural language\ninstructions",
        xy=(1.0, 53),
        xytext=(0.75, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )
    ax.annotate(
        "Second attempt,\nincluding unit test\nerror output",
        xy=(1.55, 65),
        xytext=(1.9, top),
        horizontalalignment="center",
        verticalalignment="top",
        arrowprops={"arrowstyle": "->", "connectionstyle": "arc3,rad=0.3"},
    )

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title("Code Editing Skill")
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)


def plot_refactoring(df):
    """Plot the mean ``pass_rate_1`` per (model, edit_format) as grouped bars.

    Reads the "model", "edit_format" and "pass_rate_1" columns of ``df``. The
    first two models (in groupby order) are shown in swapped order. The chart
    is saved to ``tmp.svg`` and displayed via ``imgcat``.

    Side effects: prints debug output through ``dump`` and updates the global
    matplotlib ``rcParams`` (hatch and font settings).
    """
    from matplotlib import rc

    plt.rcParams["hatch.linewidth"] = 0.5
    plt.rcParams["hatch.color"] = "#444444"
    rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"], "size": 10})

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.grid(axis="y", zorder=0, lw=0.2)

    # From here on ``df`` is the pivoted table: rows are models, columns are
    # edit formats.
    df = df.groupby(["model", "edit_format"])["pass_rate_1"].mean().unstack()
    num_models, num_formats = df.shape

    if num_models >= 2:
        # Show the second model before the first. Whole rows are reordered so
        # the index labels travel with their values and the tick labels can be
        # derived from the data below.
        # NOTE(review): presumably this puts the models in release order -- confirm.
        df = df.iloc[[1, 0, *range(2, num_models)]]
    dump(df)

    pos = np.arange(num_models)
    width = 0.8 / num_formats

    formats = df.columns
    models = df.index

    dump(df)
    dump(models)
    dump(formats)

    # edit format -> (legend label, hatch pattern); unknown formats fall back
    # to their own name with no hatch instead of hitting an unbound ``label``.
    format_styles = {
        "diff": ("Search/replace blocks", ""),
        "udiff": ("Unified diffs", ""),
        "difffolk": ("Baseline + blind, no hands, $2k tip, etc", "////"),
        "udifffolk": ("Unified diffs + blind, no hands, $2k tip, etc", "////"),
    }

    # Per-bar colors, by model position within each format.
    colors = [
        "#b3e6a8",
        "#b3e6a8",
        "#b3d1e6",
    ]

    for i, fmt in enumerate(formats):
        label, hatch = format_styles.get(fmt, (fmt, ""))

        rects = ax.bar(
            pos + i * width,
            df[fmt],
            width * 0.95,
            color=colors,
            hatch=hatch,
            zorder=2,  # keep the bars above the grid lines (zorder=0)
            edgecolor="#ffffff",
            linewidth=1.5,
            label=label,
        )
        ax.bar_label(rects, padding=4, labels=[f"{v:.0f}%" for v in df[fmt]], size=6)

    # Centre each tick under its group of bars; with a single edit format this
    # is the bar position itself (offset 0).
    ax.set_xticks(pos + (num_formats - 1) * width / 2)

    # Multi-line tick labels for known models; unknown names are shown as-is.
    model_map = {
        "gpt-4-0125-preview": "gpt-4-\n0125-preview",
        "gpt-4-1106-preview": "gpt-4-\n1106-preview",
        "gpt-4-turbo-2024-04-09": "gpt-4-turbo-\n2024-04-09\n(GPT-4 Turbo with Vision)",
    }
    model_labels = [model_map.get(model, model) for model in models]
    ax.set_xticklabels(model_labels, rotation=0)

    ax.set_ylabel("Percent of exercises completed successfully")
    ax.set_title('Refactoring "Laziness" Benchmark')
    ax.set_ylim(top=100)

    plt.tight_layout()
    plt.savefig("tmp.svg")
    imgcat(fig)