Files
test-small-llms/visualize.py
Lennart J. Kurzweg (Nx2) 74be23050d eps -> pdf
2024-09-30 15:51:15 +02:00

311 lines
11 KiB
Python

from typing import Callable, Optional
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from suite_settings.models import models
from suite_settings.techniques import techniques
from suite_settings.tests import tests
# Font family name passed to matplotlib for the titles, axis labels, tick labels,
# legends and heatmap annotations drawn by the plotting functions in this file.
# NOTE(review): presumably this font has to be installed/registered with matplotlib - confirm.
FONT_FAMILY = "NewComputerModern08"
def get_df() -> pd.DataFrame:
    """Load ``saved_results.json`` and return the recorded runs as a DataFrame.

    The JSON file is an object keyed by test hash. Every entry becomes one row
    with the columns ``hash``, ``model_name``, ``model_size``,
    ``technique_name``, ``model_technique`` (``"<model name>:<technique name>"``),
    ``seed``, ``test_name`` and ``validation``. Names and sizes are resolved
    through the ``models``, ``techniques`` and ``tests`` registries using the
    ids stored in the entry.

    ``technique_name`` and ``test_name`` are converted to ordered categoricals
    so that pivot tables and plots list them in a fixed order.

    Returns:
        The rows sorted by ``model_size`` and then ``model_name``, ascending.

    Raises:
        KeyError: if an entry references an id missing from a registry.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open('saved_results.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    raw_data = []
    for test_hash, test_data in data.items():
        # Resolve each registry entry once per row instead of once per column.
        model = models[test_data['model_id']]
        technique = techniques[test_data['technique_id']]
        raw_data.append({
            "hash": test_hash,
            "model_name": model.display_name,
            "model_size": model.parameter_count_in_b,
            "technique_name": technique.name,
            "model_technique": f"{model.display_name}:{technique.name}",
            "seed": test_data['seed'],
            "test_name": tests[test_data['test_id']].name,
            "validation": test_data['validation']
        })
    df = pd.DataFrame(raw_data)

    # Categorical ordering for 'technique_name'.
    # NOTE: pandas turns any value that is not listed in `categories` into NaN.
    df['technique_name'] = pd.Categorical(
        df['technique_name'],
        categories=[
            techniques[1].name,
            techniques[572].name,
            techniques[903].name
        ],
        ordered=True
    )

    # Categorical ordering for 'test_name' (same NaN caveat as above).
    df['test_name'] = pd.Categorical(
        df['test_name'],
        categories=[
            tests[693].name,
            tests[363].name,
            tests[120].name,
            tests[283].name,
            tests[260].name,
            tests[856].name
        ],
        ordered=True
    )

    # Sort by model_size first, then alphabetically by model_name.
    return df.sort_values(['model_size', 'model_name'], ascending=[True, True])
def insert_average_models(pt: pd.DataFrame, df: pd.DataFrame, pivot: int) -> pd.DataFrame:
    """Add size-group averages and an overall average as extra rows of ``pt``.

    Args:
        pt: Pivot table whose index holds ``model_technique`` labels.
        df: Raw results; must contain ``model_technique`` and ``model_size``
            so that every row of ``pt`` can be mapped to a model size.
        pivot: Size threshold separating the two groups (``<= pivot`` versus
            ``> pivot``).

    Returns:
        A new table (``pt`` itself is not modified) with three extra rows:

        * ``"Average up to {pivot}b"`` - inserted directly after the last row
          of ``pt`` whose model size is ``<= pivot`` (or as the first row when
          no such model exists),
        * ``"Average above {pivot}b"`` - appended at the end,
        * ``"Average Total"`` - appended last.

        A group without any model yields a row of NaN.
    """
    small_label = f"Average up to {pivot}b"
    large_label = f"Average above {pivot}b"

    # One size per model_technique label, aligned to the rows of the pivot table.
    model_sizes = df.groupby('model_technique')['model_size'].first()
    sizes = model_sizes.loc[pt.index]

    up_to_pivot = pt.index[sizes <= pivot]
    above_pivot = pt.index[sizes > pivot]

    # Compute all averages while the table still contains model rows only.
    # Computing the total after inserting the group averages would count
    # those summary rows as if they were additional models.
    avg_up_to_pivot = pt.loc[up_to_pivot].mean()
    avg_above_pivot = pt.loc[above_pivot].mean()
    avg_total = pt.mean()

    # Place the "up to" average right behind the last small model; fall back
    # to the top of the table when there is no model at or below the pivot.
    new_index = list(pt.index)
    if len(up_to_pivot) > 0:
        insert_at = new_index.index(up_to_pivot[-1]) + 1
    else:
        insert_at = 0
    new_index.insert(insert_at, small_label)

    # reindex() returns a new frame with an all-NaN row for the new label.
    pt = pt.reindex(new_index)
    pt.loc[small_label] = avg_up_to_pivot
    pt.loc[large_label] = avg_above_pivot
    pt.loc["Average Total"] = avg_total
    return pt
def insert_average_test_y(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Append an ``"Average"`` row containing the mean of every column.

    ``pt`` is modified in place and returned. ``df`` is accepted for a uniform
    signature with the other ``insert_average_*`` helpers but is not used.
    """
    column_means = pt.loc[pt.index].mean(axis="index")
    pt.loc["Average"] = column_means
    return pt
def insert_average_test_x(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``"Average"`` column containing the mean of every row.

    ``pt`` is modified in place and returned. ``df`` is accepted but not used.
    """
    row_means = pt.mean(axis="columns")
    pt["Average"] = row_means
    return pt
def insert_average_technique(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``'Average'`` column with the per-row mean across all columns.

    The table is changed in place and handed back to the caller; the ``df``
    argument is not consulted.
    """
    per_row_mean = pt.mean(axis="columns")  # one value per row of the table
    pt['Average'] = per_row_mean
    return pt
# Ids of the models that remove_unfitting() filters out of the results.
# Each id is looked up in `models` to obtain the display name to drop.
UNFITTING = [
    903, # tinyllama
    404, # llama3 groq TU
    120, # llama3 groq TU 70b
    890 # Command R+
]
def remove_unfitting(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the rows belonging to the models in ``UNFITTING``.

    Rows are matched by comparing the ``model_name`` column with the display
    names of the listed model ids.

    Args:
        df: Results table containing a ``model_name`` column.

    Returns:
        ``df`` itself when ``UNFITTING`` is empty, otherwise a filtered frame
        that keeps the original row order.

    Raises:
        KeyError: if an id in ``UNFITTING`` is not present in ``models``.
    """
    if not UNFITTING:
        return df
    excluded_names = [models[model_id].display_name for model_id in UNFITTING]
    # A single membership mask replaces one filtering pass per excluded model.
    return df.loc[~df['model_name'].isin(excluded_names)]
def trendline(df: pd.DataFrame) -> None:
    """Plot pass rate against model size with a linear trendline.

    The runs in ``df`` are grouped by ``model_size``; the pass rate of a group
    is the mean of its ``validation`` flags. The scatter plot and a degree-1
    least-squares fit are written to ``size-trendline.pdf``.

    Args:
        df: Results table with ``model_size`` and ``validation`` columns.
            The caller's frame is left untouched.
    """
    # Work on a private copy so the caller's 'validation' column keeps its dtype.
    df = df.copy()

    # Step 1: pass rate per model size (mean of 1/0 flags = fraction of passes)
    df['validation'] = df['validation'].astype(int)
    pass_rate_df = df.groupby('model_size').agg(
        pass_rate=('validation', 'mean')
    ).reset_index()
    sizes = pass_rate_df['model_size']
    pass_rate_pct = pass_rate_df['pass_rate'] * 100

    # Step 2: scatter plot plus linear trendline
    fig = plt.figure(figsize=(10, 6))
    plt.scatter(sizes, pass_rate_pct, label='Pass Rate (%)', color='blue')

    z = np.polyfit(sizes, pass_rate_pct, 1)  # Linear trendline
    p = np.poly1d(z)
    plt.plot(sizes, p(sizes), color='red', label='Trendline')

    # Set font for axis tick labels
    font_properties = {'fontname': FONT_FAMILY, 'fontsize': 12}
    plt.xticks(**font_properties)
    plt.yticks(**font_properties)

    # Step 3: titles, labels, grid and legend
    plt.title('Model Size vs Pass Rate', font=FONT_FAMILY)
    plt.xlabel('Model Size (in billions of parameters)', font=FONT_FAMILY)
    plt.ylabel('Pass Rate (%)', font=FONT_FAMILY)
    plt.grid(True)
    plt.legend(prop={'family': FONT_FAMILY})

    # Save the plot, then release the figure: pyplot keeps every figure alive
    # until it is closed explicitly.
    plt.savefig('size-trendline.pdf', format='pdf', dpi=1200)
    plt.close(fig)
def heatmap_models_plus_techniues(df: pd.DataFrame, color: str, title: Optional[str] = None, get_weight: Optional[Callable] = None) -> None:
    """Draw a model+technique by test heatmap of pass rates and save it as PDF.

    Rows are the ``model_technique`` labels in their order of appearance in
    ``df``; columns are the test names. Group averages, a total average row
    and an ``Average`` column are added before plotting. Cell values are
    shown multiplied by 100.

    Args:
        df: Results table with ``model_technique``, ``model_size``,
            ``test_name`` and ``validation`` columns.
        color: Palette specification handed to ``sns.color_palette``.
        title: Optional name of the weighting. When given it is appended to
            the plot title and (lower-cased) to the output file name, and the
            ``%`` suffix is omitted from the cell annotations.
        get_weight: Optional callable receiving a one-element list with the
            model size and returning the factor every row of that model is
            multiplied with.

    Raises:
        IndexError: if ``get_weight`` is given and a row's model name is not
            found in ``models``.
    """
    ordered_techniques = df['model_technique'].unique()
    pt = pd.pivot_table(
        df,
        values='validation',
        index='model_technique',
        columns='test_name',
        observed=False,
        aggfunc="mean",
        fill_value=0
    )
    # The pivot table sorts its index; restore the order of `df`.
    pt = pt.loc[ordered_techniques]

    if get_weight:
        def get_model_size_by_name(name: str) -> float:
            for model_id in models:
                if name == models[model_id].display_name:
                    return models[model_id].parameter_count_in_b
            raise IndexError(f"Model {name} not found in models.")

        for (index, row) in pt.iterrows():
            # Row labels have the form "<model name>:<technique name>".
            # NOTE(review): assumes the model name itself contains no ':' - confirm.
            model_name = index.split(":")[0]
            pt.loc[index] = row * get_weight([get_model_size_by_name(model_name)])

    pt = insert_average_models(pt=pt, df=df, pivot=10)
    pt = insert_average_test_x(pt=pt, df=df)

    fig = plt.figure(figsize=(8, 12))
    ax = sns.heatmap(
        pt * 100,
        annot=True,
        # Show one decimal when the overall average (last row) is small.
        fmt=".0f" if pt.tail(1)['Average'].item() > 0.1 else ".1f",
        cmap=sns.color_palette(color, as_cmap=True),
        cbar=True,
        annot_kws={"size": 10, "fontname": FONT_FAMILY}
    )

    # Update the annotations to display percentages (unweighted plot only)
    for text in ax.texts:
        o = text.get_text()
        text.set_text(f"{o if o != '0.0' else '0'}{'%' if not title else ''}")
        text.set_fontname(FONT_FAMILY)

    # Highlight every "average" row/column label in red.
    for text in ax.yaxis.get_ticklabels():
        if 'average' in text.get_text().lower():
            text.set_color('red')
        text.set_fontname(FONT_FAMILY)
    for text in ax.xaxis.get_ticklabels():
        if 'average' in text.get_text().lower():
            text.set_color('red')
        text.set_fontname(FONT_FAMILY)

    # Set fonts for titles, labels, and tick labels
    plt.title(f'Model+Technique Performance{"" if not title else ": " + title + " adjusted"}', fontsize=16, fontname=FONT_FAMILY)
    plt.xlabel('Test Name', fontsize=14, fontname=FONT_FAMILY)
    plt.ylabel('Model and Technique', fontsize=14, fontname=FONT_FAMILY)
    plt.xticks(rotation=45, ha='right', fontname=FONT_FAMILY)
    plt.yticks(fontname=FONT_FAMILY)
    plt.tight_layout()

    # The spelling of the file name is kept unchanged so the output path stays the same.
    plt.savefig(f"modeles-plus-techniques-heatmap{'' if not title else '-' + title.lower()}.pdf", format='pdf', dpi=1200)
    # pyplot keeps figures alive until they are closed explicitly.
    plt.close(fig)
def heatmap_techniques(df: pd.DataFrame, color: str, title: Optional[str] = None, get_weight: Optional[Callable] = None) -> None:
    """Draw a test by technique heatmap of pass rates and save it as PDF.

    Rows are the test names and columns the technique names; an ``Average``
    row and an ``Average`` column are added before plotting. Cell values are
    shown multiplied by 100.

    Args:
        df: Results table with ``test_name``, ``technique_name`` and
            ``validation`` columns.
        color: Palette specification handed to ``sns.color_palette``.
        title: Optional name of the weighting. When given it is appended to
            the plot title and (lower-cased) to the output file name, and the
            ``%`` suffix is omitted from the cell annotations.
        get_weight: Optional callable mapping a list of model sizes to a
            factor. The ``Native`` column is scaled with the factor computed
            from the models that support tools, the ``LSM`` and ``T2S``
            columns with the factor computed from the models that do not.
    """
    pt = pd.pivot_table(
        df,
        values='validation',
        index='test_name',
        observed=False,
        columns='technique_name',
        aggfunc="mean",
        fill_value=0
    )

    if get_weight:
        native = [models[m].parameter_count_in_b for m in models if models[m].supports_tools]
        artificial = [models[m].parameter_count_in_b for m in models if not models[m].supports_tools]
        weight_native = get_weight(native)
        weight_artificial = get_weight(artificial)
        pt['Native'] = pt['Native'] * weight_native
        pt['LSM'] = pt['LSM'] * weight_artificial
        pt['T2S'] = pt['T2S'] * weight_artificial

    pt = insert_average_test_y(pt=pt, df=df)
    pt = insert_average_technique(pt=pt, df=df)

    fig = plt.figure(figsize=(8, 4))
    ax = sns.heatmap(
        pt * 100,
        annot=True,
        # Show one decimal when the overall average (last row) is small.
        fmt=".0f" if pt.tail(1)['Average'].item() > 0.2 else ".1f",
        cmap=sns.color_palette(color, as_cmap=True),
        cbar=True,
        annot_kws={"size": 10, "fontname": FONT_FAMILY}
    )

    # Add percentage sign to annotations (unweighted plot only)
    for text in ax.texts:
        o = text.get_text()
        text.set_text(f"{o if o != '0.0' else '0'}{'%' if not title else ''}")
        text.set_fontname(FONT_FAMILY)

    # Highlight the "Average" row/column labels in red.
    for text in ax.yaxis.get_ticklabels():
        if 'average' in text.get_text().lower():
            text.set_color('red')
        text.set_fontname(FONT_FAMILY)
    for text in ax.xaxis.get_ticklabels():
        if 'average' in text.get_text().lower():
            text.set_color('red')
        text.set_fontname(FONT_FAMILY)

    # Customize the plot with labels and a title
    plt.title(f"Technique Performance{'' if not title else ': ' + title + ' adjusted'}", fontsize=16, fontname=FONT_FAMILY)
    plt.ylabel('Test Name', fontsize=14, fontname=FONT_FAMILY)
    plt.xlabel('Technique', fontsize=14, fontname=FONT_FAMILY)
    plt.xticks(rotation=0, fontname=FONT_FAMILY)
    plt.yticks(fontname=FONT_FAMILY)
    plt.tight_layout()

    plt.savefig(f"techniques-heatmap{'' if not title else '-' + title.lower()}.pdf", format='pdf', dpi=1200)
    # pyplot keeps figures alive until they are closed explicitly.
    plt.close(fig)
def size(sizes: list[float]) -> float:
    """Return 100 divided by the arithmetic mean of ``sizes``.

    Raises:
        ZeroDivisionError: if ``sizes`` is empty or its mean is zero.
    """
    mean_size = sum(sizes) / len(sizes)
    return 100 / mean_size
def performance(sizes: list[float]) -> float:
    """Return the mean of ``1 / ln(size + 1)`` taken over ``sizes``.

    Raises:
        ZeroDivisionError: if ``sizes`` is empty.
    """
    total = 0
    for model_size in sizes:
        total += 1 / np.log(model_size + 1)
    return total / len(sizes)
if __name__ == "__main__":
    # Load all recorded runs once; `dff` additionally drops the UNFITTING models.
    df = get_df()
    dff = remove_unfitting(df)

    trendline(dff.copy())

    # (palette, title, weighting function) for each model+technique heatmap:
    # unweighted first, then the size- and performance-adjusted variants.
    model_variants = (
        ("blend:#100,#255,#4a3", None, None),
        ("blend:#100,#236,#44a,#a4d,#fff,#ffc,#ffa,#ff7,#ff4,#ff0,#af0,#7f0,#3f0,#0f0", "Size", size),
        ("blend:#100,#d44,#dc2,#dcc,#cff", "Performance", performance),
    )
    for palette, variant_title, weight_fn in model_variants:
        heatmap_models_plus_techniues(df.copy(), color=palette, title=variant_title, get_weight=weight_fn)

    # Same three variants for the per-technique heatmap, on the filtered data.
    technique_variants = (
        ("blend:#100,#255,#4a3", None, None),
        ("blend:#100,#236,#44a,#a4d", "Size", size),
        ("blend:#100,#d44,#dc2", "Performance", performance),
    )
    for palette, variant_title, weight_fn in technique_variants:
        heatmap_techniques(df=dff.copy(), color=palette, title=variant_title, get_weight=weight_fn)