# test-small-llms/visualize.py

from typing import Callable, Optional
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from suite_settings.models import models
from suite_settings.techniques import techniques
from suite_settings.tests import tests


# Font family name handed to matplotlib/seaborn for every title, axis label,
# tick label, legend and cell annotation drawn by the plotting functions below.
FONT_FAMILY = "NewComputerModern08"


def get_df(path: str = 'saved_results.json') -> pd.DataFrame:
    """Load the saved test results into a sorted DataFrame.

    The JSON file is expected to be a mapping ``hash -> record`` where every
    record carries the keys ``model_id``, ``technique_id``, ``test_id``,
    ``seed`` and ``validation``. The ids are resolved through the project
    registries ``models``, ``techniques`` and ``tests``.

    Args:
        path: Location of the results file. Defaults to ``saved_results.json``
            in the current working directory.

    Returns:
        One row per saved result with the columns ``hash``, ``model_name``,
        ``model_size``, ``technique_name``, ``model_technique``, ``seed``,
        ``test_name`` and ``validation``, sorted by ``model_size`` and then
        ``model_name``. ``technique_name`` and ``test_name`` are ordered
        categoricals so that pivots keep a fixed column/row order.

    Raises:
        KeyError: If a record references an id missing from a registry.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    raw_data = []
    for test_hash, test_data in data.items():
        # Resolve each registry entry once per record.
        model = models[test_data['model_id']]
        technique = techniques[test_data['technique_id']]
        raw_data.append({
            "hash": test_hash,
            "model_name": model.display_name,
            "model_size": model.parameter_count_in_b,
            "technique_name": technique.name,
            "model_technique": f"{model.display_name}:{technique.name}",
            "seed": test_data['seed'],
            "test_name": tests[test_data['test_id']].name,
            "validation": test_data['validation']
        })

    # Naming the columns keeps the schema intact even when the file holds no
    # results, so the column accesses below do not fail on an empty frame.
    df = pd.DataFrame(
        raw_data,
        columns=[
            "hash",
            "model_name",
            "model_size",
            "technique_name",
            "model_technique",
            "seed",
            "test_name",
            "validation",
        ],
    )

    # Categorical ordering for 'technique_name'.
    # NOTE: names that are not listed here become NaN in the column.
    df['technique_name'] = pd.Categorical(
        df['technique_name'],
        categories=[
            techniques[1].name,
            techniques[572].name,
            techniques[903].name
        ],
        ordered=True
    )

    # Categorical ordering for 'test_name' (same NaN caveat as above).
    df['test_name'] = pd.Categorical(
        df['test_name'],
        categories=[
            tests[693].name,
            tests[363].name,
            tests[120].name,
            tests[283].name,
            tests[260].name,
            tests[856].name
        ],
        ordered=True
    )

    # Sort by model_size first, then alphabetically by model_name
    return df.sort_values(['model_size', 'model_name'], ascending=[True, True])


def insert_average_models(pt: pd.DataFrame, df: pd.DataFrame, pivot: int) -> pd.DataFrame:
    """Add three summary rows of per-column means to a model pivot table.

    Rows of ``pt`` are split by model size into "up to ``pivot``" and
    "above ``pivot``". The returned table contains:

    * ``"Average up to {pivot}b"`` - placed directly after the last row (in
      ``pt``'s own order) whose model size is ``<= pivot``, or at the very top
      when no such row exists;
    * ``"Average above {pivot}b"`` - appended at the end;
    * ``"Average Total"`` - appended last, the mean over all model rows.

    All three averages are computed from the original model rows only, so the
    summary rows never feed into each other.

    Args:
        pt: Pivot table indexed by ``model_technique`` labels.
        df: Source rows; must provide ``model_technique`` and ``model_size``.
        pivot: Size threshold (same unit as ``model_size``).

    Returns:
        A new DataFrame; ``pt`` itself is left untouched.

    Raises:
        KeyError: If a label of ``pt.index`` does not occur in
            ``df['model_technique']``.
    """
    # One size per model_technique label, looked up in pt's row order.
    model_sizes = df.groupby('model_technique')['model_size'].first()
    sizes_in_row_order = model_sizes.loc[pt.index]

    up_to_pivot = pt.index[(sizes_in_row_order <= pivot).to_numpy()]
    above_pivot = pt.index[(sizes_in_row_order > pivot).to_numpy()]

    # Compute every average before any summary row exists in the table.
    avg_up_to_pivot = pt.loc[up_to_pivot].mean()
    avg_above_pivot = pt.loc[above_pivot].mean()
    avg_total = pt.mean()

    up_to_label = f"Average up to {pivot}b"
    above_label = f"Average above {pivot}b"

    # Insert the "up to" row right behind the last small model; with no small
    # model at all it goes to the top (its values are NaN in that case).
    new_index = list(pt.index)
    if len(up_to_pivot) > 0:
        insert_at = new_index.index(up_to_pivot[-1]) + 1
    else:
        insert_at = 0
    new_index.insert(insert_at, up_to_label)

    result = pt.reindex(new_index)
    result.loc[up_to_label] = avg_up_to_pivot
    result.loc[above_label] = avg_above_pivot
    result.loc["Average Total"] = avg_total
    return result

def insert_average_test_y(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Append an ``"Average"`` row holding the mean of every column.

    ``pt`` is modified in place and also returned. ``df`` is not used.
    """
    column_means = pt.loc[pt.index].mean(axis=0)
    pt.loc["Average"] = column_means
    return pt

def insert_average_test_x(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``"Average"`` column holding the mean of every row.

    ``pt`` is modified in place and also returned. ``df`` is not used.
    """
    row_means = pt.mean(axis="columns")
    pt["Average"] = row_means
    return pt

def insert_average_technique(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``"Average"`` column with the per-row mean over all columns.

    ``pt`` is modified in place and also returned. ``df`` is not used.
    """
    per_row_mean = pt.mean(axis="columns")  # one value per row, across columns
    pt['Average'] = per_row_mean
    return pt

# Ids (keys into `models`) of the models whose rows remove_unfitting() drops
# from the results; the trailing comment on each entry names the model.
UNFITTING = [
    903, # tinyllama
    404, # llama3 groq TU
    120, # llama3 groq TU 70b
    890  # Command R+
]

def remove_unfitting(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every row that belongs to a model listed in ``UNFITTING``.

    Rows are matched on the ``model_name`` column against the display names of
    the listed model ids.

    Args:
        df: Results frame with a ``model_name`` column.

    Returns:
        The filtered rows, or ``df`` itself when ``UNFITTING`` is empty.

    Raises:
        KeyError: If an id in ``UNFITTING`` is missing from ``models``.
    """
    if not UNFITTING:
        return df
    excluded_names = [models[model_id].display_name for model_id in UNFITTING]
    # A single membership mask replaces one filtering pass per excluded model.
    return df.loc[~df['model_name'].isin(excluded_names)]

def trendline(df: pd.DataFrame) -> None:
    """Plot pass rate against model size with a linear trendline.

    The pass rate of a model size is the mean of ``validation`` (cast to
    0/1) over all rows with that ``model_size``. The chart is written to
    ``size-trendline.pdf`` in the current working directory.

    Args:
        df: Results frame with ``model_size`` and ``validation`` columns.
            It is not modified.
    """
    # Step 1: pass rate per model size. `assign` works on a copy, so the
    # caller's 'validation' column keeps its original dtype.
    pass_rate_df = (
        df.assign(validation=df['validation'].astype(int))
        .groupby('model_size')
        .agg(pass_rate=('validation', 'mean'))
        .reset_index()
    )
    sizes = pass_rate_df['model_size']
    pass_rate_pct = pass_rate_df['pass_rate'] * 100

    # Step 2: plotting
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.scatter(sizes, pass_rate_pct, label='Pass Rate (%)', color='blue')

        # Degree-1 least-squares fit, evaluated at the observed sizes.
        fit = np.poly1d(np.polyfit(sizes, pass_rate_pct, 1))
        plt.plot(sizes, fit(sizes), color='red', label='Trendline')

        # Set font for axis tick labels
        font_properties = {'fontname': FONT_FAMILY, 'fontsize': 12}
        plt.xticks(**font_properties)
        plt.yticks(**font_properties)

        # Step 3: customize plot
        plt.title('Model Size vs Pass Rate', font=FONT_FAMILY)
        plt.xlabel('Model Size (in billions of parameters)', font=FONT_FAMILY)
        plt.ylabel('Pass Rate (%)', font=FONT_FAMILY)
        plt.grid(True)
        plt.legend(prop={'family': FONT_FAMILY})

        # Save the plot
        plt.savefig('size-trendline.pdf', format='pdf', dpi=1200)
    finally:
        # Release the figure so repeated calls do not pile up open figures.
        plt.close(fig)


def heatmap_models_plus_techniues(df: pd.DataFrame, color: str, title: Optional[str]= None, get_weight: Optional[Callable]= None) -> None:
    """Render the model+technique x test heatmap and save it as a PDF.

    Each cell is the mean of ``validation`` for one ``model_technique`` row
    and one ``test_name`` column, shown multiplied by 100. Rows keep the order
    in which ``model_technique`` values first appear in ``df``. Summary rows
    (``insert_average_models`` with a pivot of 10) and a summary column
    (``insert_average_test_x``) are appended before plotting.

    Args:
        df: Results frame providing ``model_technique``, ``test_name``,
            ``validation`` and ``model_size``.
        color: Palette spec passed to ``sns.color_palette(..., as_cmap=True)``.
        title: Optional name of the adjustment. When given it is added to the
            plot title and (lower-cased) to the file name, and the ``%`` sign
            is left off the cell annotations.
        get_weight: Optional callable taking a list of model sizes and
            returning a factor. Every row is multiplied by the factor obtained
            for its own model's size.

    Raises:
        IndexError: If ``get_weight`` is given and a row's model name (the
            part of the label before the first ``":"``) is not in ``models``.
    """
    ordered_techniques = df['model_technique'].unique()
    pt = pd.pivot_table(
        df,
        values='validation',
        index='model_technique',
        columns='test_name',
        observed=False,
        aggfunc="mean",
        fill_value=0
    )

    pt = pt.loc[ordered_techniques]

    if get_weight:
        # display name -> parameter count; the first registry entry wins when
        # two models share a display name.
        size_by_name = {}
        for model_id in models:
            model = models[model_id]
            size_by_name.setdefault(model.display_name, model.parameter_count_in_b)

        def row_weight(label: str) -> float:
            name = label.split(":")[0]
            if name not in size_by_name:
                raise IndexError(f"Model {name} not found in models.")
            return get_weight([size_by_name[name]])

        # Scale all rows in one step instead of rewriting the table row by
        # row while iterating over it.
        weights = pd.Series([row_weight(label) for label in pt.index], index=pt.index)
        pt = pt.mul(weights, axis=0)

    pt = insert_average_models(pt=pt, df=df, pivot=10)
    pt = insert_average_test_x(pt=pt, df=df)

    fig = plt.figure(figsize=(8, 12))
    try:
        ax = sns.heatmap(
            pt * 100,
            annot=True,
            # Small overall averages get one decimal so they do not all
            # round to the same integer.
            fmt=".0f" if pt.tail(1)['Average'].item() > 0.1 else ".1f",
            cmap=sns.color_palette(color, as_cmap=True),
            cbar=True,
            annot_kws={"size": 10, "fontname": FONT_FAMILY}
        )

        # Update the annotations to display percentages
        for text in ax.texts:
            o = text.get_text()
            text.set_text(f"{o if o != '0.0' else '0'}{'%' if not title else ''}")
            text.set_fontname(FONT_FAMILY)

        # Highlight the summary rows/columns on both axes.
        for text in [*ax.yaxis.get_ticklabels(), *ax.xaxis.get_ticklabels()]:
            if 'average' in text.get_text().lower():
                text.set_color('red')
            text.set_fontname(FONT_FAMILY)

        # Set fonts for titles, labels, and tick labels
        plt.title(f'Model+Technique Performance{"" if not title else ": " + title + " adjusted"}', fontsize=16, fontname=FONT_FAMILY)
        plt.xlabel('Test Name', fontsize=14, fontname=FONT_FAMILY)
        plt.ylabel('Model and Technique', fontsize=14, fontname=FONT_FAMILY)

        plt.xticks(rotation=45, ha='right', fontname=FONT_FAMILY)
        plt.yticks(fontname=FONT_FAMILY)

        plt.tight_layout()
        # File name spelling is kept as-is so existing references to the
        # generated PDFs keep working.
        plt.savefig(f"modeles-plus-techniques-heatmap{'' if not title else '-' + title.lower()}.pdf", format='pdf', dpi=1200)
    finally:
        # Release the figure so repeated calls do not pile up open figures.
        plt.close(fig)


def heatmap_techniques(df: pd.DataFrame, color: str, title: Optional[str]= None, get_weight: Optional[Callable]= None) -> None:
    """Render the test x technique heatmap and save it as a PDF.

    Each cell is the mean of ``validation`` for one ``test_name`` row and one
    ``technique_name`` column, shown multiplied by 100. An ``"Average"`` row
    and an ``"Average"`` column are appended before plotting.

    Args:
        df: Results frame providing ``test_name``, ``technique_name`` and
            ``validation``.
        color: Palette spec passed to ``sns.color_palette(..., as_cmap=True)``.
        title: Optional name of the adjustment. When given it is added to the
            plot title and (lower-cased) to the file name, and the ``%`` sign
            is left off the cell annotations.
        get_weight: Optional callable taking a list of model sizes and
            returning a factor. The ``Native`` column is scaled with the
            factor for models that support tools, ``LSM`` and ``T2S`` with the
            factor for models that do not.
    """
    pt = pd.pivot_table(
        df,
        values='validation',
        index='test_name',
        observed=False,
        columns='technique_name',
        aggfunc="mean",
        fill_value=0
    )
    if get_weight:
        # NOTE(review): both size lists are built from the whole `models`
        # registry, not only from the models present in `df` - confirm that
        # this is intended when `df` has been filtered.
        native =     [ models[m].parameter_count_in_b for m in models if     models[m].supports_tools ]
        artificial = [ models[m].parameter_count_in_b for m in models if not models[m].supports_tools ]
        weight_native     = get_weight(native)
        weight_artificial = get_weight(artificial)
        pt['Native'] = pt['Native'] * weight_native
        pt['LSM']    = pt['LSM'] * weight_artificial
        pt['T2S']    = pt['T2S'] * weight_artificial

    pt = insert_average_test_y(pt=pt, df=df)
    pt = insert_average_technique(pt=pt, df=df)

    fig = plt.figure(figsize=(8, 4))
    try:
        ax = sns.heatmap(
            pt * 100,
            annot=True,
            # Small overall averages get one decimal so they do not all
            # round to the same integer.
            fmt=".0f" if pt.tail(1)['Average'].item() > 0.2 else ".1f",
            cmap=sns.color_palette(color, as_cmap=True),
            cbar=True,
            annot_kws={"size": 10, "fontname": FONT_FAMILY}
        )

        # Add percentage sign to annotations
        for text in ax.texts:
            o = text.get_text()
            text.set_text(f"{o if o != '0.0' else '0'}{'%' if not title else ''}")
            text.set_fontname(FONT_FAMILY)

        # Highlight the summary row/column on both axes.
        for text in [*ax.yaxis.get_ticklabels(), *ax.xaxis.get_ticklabels()]:
            if 'average' in text.get_text().lower():
                text.set_color('red')
            text.set_fontname(FONT_FAMILY)

        # Customize the plot with labels and a title
        plt.title(f"Technique Performance{'' if not title else ': ' + title + ' adjusted'}", fontsize=16, fontname=FONT_FAMILY)
        plt.ylabel('Test Name', fontsize=14, fontname=FONT_FAMILY)
        plt.xlabel('Technique', fontsize=14, fontname=FONT_FAMILY)

        plt.xticks(rotation=0, fontname=FONT_FAMILY)
        plt.yticks(fontname=FONT_FAMILY)

        plt.tight_layout()
        plt.savefig(f"techniques-heatmap{'' if not title else '-' + title.lower()}.pdf", format='pdf', dpi=1200)
    finally:
        # Release the figure so repeated calls do not pile up open figures.
        plt.close(fig)


def size(sizes: list[float]) -> float:
    """Return 100 divided by the arithmetic mean of ``sizes``.

    Raises:
        ZeroDivisionError: If ``sizes`` is empty or its mean is zero.
    """
    mean_size = sum(sizes) / len(sizes)
    return 100 / mean_size

def performance(sizes: list[float]) -> float:
    """Return the mean of ``1 / ln(size + 1)`` over all ``sizes``."""
    inverse_logs = []
    for parameter_count in sizes:
        inverse_logs.append(1 / np.log(parameter_count + 1))
    return sum(inverse_logs) / len(inverse_logs)

if __name__ == "__main__":
    # Full results feed the per-model heatmaps; the filtered frame (without
    # the UNFITTING models) feeds the trendline and the technique heatmaps.
    df = get_df()
    dff = remove_unfitting(df)

    trendline(dff.copy())

    # (palette, title, weight function) for each per-model heatmap, in order.
    model_heatmap_runs = [
        ("blend:#100,#255,#4a3", None, None),
        ("blend:#100,#236,#44a,#a4d,#fff,#ffc,#ffa,#ff7,#ff4,#ff0,#af0,#7f0,#3f0,#0f0", "Size", size),
        ("blend:#100,#d44,#dc2,#dcc,#cff", "Performance", performance),
    ]
    for palette, label, weight_fn in model_heatmap_runs:
        heatmap_models_plus_techniues(df.copy(), color=palette, title=label, get_weight=weight_fn)

    # Same triple layout for the per-technique heatmaps.
    technique_heatmap_runs = [
        ("blend:#100,#255,#4a3", None, None),
        ("blend:#100,#236,#44a,#a4d", "Size", size),
        ("blend:#100,#d44,#dc2", "Performance", performance),
    ]
    for palette, label, weight_fn in technique_heatmap_runs:
        heatmap_techniques(df=dff.copy(), color=palette, title=label, get_weight=weight_fn)