"""Build pass-rate plots (trendline and heatmaps) from ``saved_results.json``."""

import json
from typing import Callable, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from suite_settings.models import models
from suite_settings.techniques import techniques
from suite_settings.tests import tests

FONT_FAMILY = "NewComputerModern08"

# Signature of the weighting callbacks accepted by the heatmap functions
# (see ``size`` and ``performance`` at the bottom of this file).
WeightFn = Callable[[list[float]], float]


def get_df() -> pd.DataFrame:
    """Load ``saved_results.json`` into a flat, sorted DataFrame.

    Each entry of the JSON object becomes one row with the columns ``hash``,
    ``model_name``, ``model_size``, ``technique_name``, ``model_technique``
    (``"<model>:<technique>"``), ``seed``, ``test_name`` and ``validation``.

    Returns:
        The rows sorted by ``model_size`` and then ``model_name`` (both
        ascending), with ``technique_name`` and ``test_name`` converted to
        ordered categoricals.
    """
    with open('saved_results.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    raw_data = []
    for test_hash, test_data in data.items():
        try:
            model = models[test_data['model_id']]
            technique = techniques[test_data['technique_id']]
            test = tests[test_data['test_id']]
            seed = test_data['seed']
            validation = test_data['validation']
        except KeyError:
            # Best effort: a record that lacks one of the keys, or whose id
            # is missing from models/techniques/tests, is left out.
            continue
        raw_data.append({
            "hash": test_hash,
            "model_name": model.display_name,
            "model_size": model.parameter_count_in_b,
            "technique_name": technique.name,
            "model_technique": f"{model.display_name}:{technique.name}",
            "seed": seed,
            "test_name": test.name,
            "validation": validation,
        })

    df = pd.DataFrame(raw_data)

    # Categorical ordering for 'technique_name'
    df['technique_name'] = pd.Categorical(
        df['technique_name'],
        categories=[
            techniques[1].name,
            techniques[572].name,
            techniques[903].name,
        ],
        ordered=True,
    )

    # Categorical ordering for 'test_name'
    df['test_name'] = pd.Categorical(
        df['test_name'],
        categories=[
            tests[693].name,
            tests[363].name,
            tests[120].name,
            tests[283].name,
            tests[260].name,
            tests[856].name,
        ],
        ordered=True,
    )

    # Sort by model_size first, then alphabetically by model_name
    return df.sort_values(['model_size', 'model_name'], ascending=[True, True])


def insert_average_models(pt: pd.DataFrame, df: pd.DataFrame, pivot: int) -> pd.DataFrame:
    """Add three average rows to a pivot table indexed by ``model_technique``.

    Args:
        pt: Pivot table whose index values are ``model_technique`` labels
            present in ``df``.
        df: Source frame; provides the ``model_size`` of each label.
        pivot: Size threshold. Rows with ``model_size <= pivot`` form the
            "up to" group, rows with ``model_size > pivot`` the "above" group.

    Returns:
        A new table with ``"Average up to {pivot}b"`` placed right after the
        last "up to" row (or first, if that group is empty), followed at the
        end by ``"Average above {pivot}b"`` and ``"Average Total"``.
    """
    # Use the df's model_size for calculations
    model_sizes = df.groupby('model_technique')['model_size'].first()
    sizes_in_row_order = model_sizes.loc[pt.index]

    # Split the pivot table rows into two groups based on model size
    up_to_pivot = pt.index[(sizes_in_row_order <= pivot).to_numpy()]
    above_pivot = pt.index[(sizes_in_row_order > pivot).to_numpy()]

    # All averages are taken from the original model rows only.  Computing
    # the total after the group averages were inserted would count both
    # groups a second time and bias the result towards the smaller group.
    avg_up_to_pivot = pt.loc[up_to_pivot].mean()
    avg_above_pivot = pt.loc[above_pivot].mean()
    avg_total = pt.mean()

    up_to_label = f"Average up to {pivot}b"
    new_index = list(pt.index)
    if len(up_to_pivot) > 0:
        insert_at = new_index.index(up_to_pivot[-1]) + 1
    else:
        insert_at = 0
    new_index.insert(insert_at, up_to_label)

    # reindex() returns a new frame, so the caller's table is not modified
    result = pt.reindex(new_index)
    result.loc[up_to_label] = avg_up_to_pivot
    result.loc[f"Average above {pivot}b"] = avg_above_pivot
    result.loc["Average Total"] = avg_total
    return result


def insert_average_test_y(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Append an ``"Average"`` row holding the mean of every column.

    ``pt`` is modified in place and returned; ``df`` is not used.
    """
    avg = pt.loc[pt.index].mean()
    pt.loc["Average"] = avg
    return pt


def insert_average_test_x(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Append an ``"Average"`` column holding the mean of every row.

    ``pt`` is modified in place and returned; ``df`` is not used.
    """
    avg = pt.mean(axis=1)
    pt["Average"] = avg
    return pt


def insert_average_technique(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Append an ``"Average"`` column holding the mean of every row.

    ``pt`` is modified in place and returned; ``df`` is not used.
    """
    avg = pt.mean(axis=1)  # Calculate the average across columns (axis=1)
    pt['Average'] = avg  # Insert the average as a new column
    return pt


# Ids (keys of ``models``) of the models dropped by ``remove_unfitting``.
UNFITTING = [
    903,  # tinyllama
    404,  # llama3 groq TU
    120,  # llama3 groq TU 70b
    890,  # Command R+
]


def remove_unfitting(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the rows of the models listed in ``UNFITTING``.

    Rows are matched on ``model_name`` against the models' display names.
    When ``UNFITTING`` is empty, ``df`` itself is returned.
    """
    if not UNFITTING:
        return df
    excluded_names = [models[model_id].display_name for model_id in UNFITTING]
    return df.loc[~df['model_name'].isin(excluded_names)]


def trendline(df: pd.DataFrame) -> None:
    """Plot pass rate against model size with a linear fit.

    The figure is written to ``size-trendline.pdf``. ``df`` is not modified.
    """
    # Step 1: Calculate pass rate for each model size.
    # The mean of the 1/0 validation flags is the share of runs validated
    # as correct; scale it to percent.
    validated = df['validation'].astype(int)
    pass_rate = validated.groupby(df['model_size']).mean()
    model_sizes = pass_rate.index.to_numpy()
    pass_rate_pct = pass_rate.to_numpy() * 100

    # Step 2: Plotting
    fig = plt.figure(figsize=(10, 6))
    plt.scatter(model_sizes, pass_rate_pct, label='Pass Rate (%)', color='blue')

    # Fit a linear trendline
    fit = np.poly1d(np.polyfit(model_sizes, pass_rate_pct, 1))
    plt.plot(model_sizes, fit(model_sizes), color='red', label='Trendline')

    # Set font for axis tick labels
    font_properties = {'fontname': FONT_FAMILY, 'fontsize': 12}
    plt.xticks(**font_properties)
    plt.yticks(**font_properties)

    # Step 3: Customize plot
    plt.title('Model Size vs Pass Rate', font=FONT_FAMILY)
    plt.xlabel('Model Size (in billions of parameters)', font=FONT_FAMILY)
    plt.ylabel('Pass Rate (%)', font=FONT_FAMILY)
    plt.grid(True)
    plt.legend(prop={'family': FONT_FAMILY})

    # Save the plot, then release the figure so repeated calls do not
    # accumulate open figures.
    plt.savefig('size-trendline.pdf', format='pdf', dpi=1200)
    plt.close(fig)


def _style_heatmap_text(ax: plt.Axes, show_percent: bool) -> None:
    """Apply the shared text styling to a heatmap drawn on ``ax``.

    Cell annotations reading ``'0.0'`` are shortened to ``'0'``; a ``%`` is
    appended to every annotation when ``show_percent`` is true. Tick labels
    containing "average" (case-insensitive) are coloured red. All of these
    texts get ``FONT_FAMILY``.
    """
    suffix = '%' if show_percent else ''
    for text in ax.texts:
        value = text.get_text()
        text.set_text(f"{value if value != '0.0' else '0'}{suffix}")
        text.set_fontname(FONT_FAMILY)

    for label in (*ax.yaxis.get_ticklabels(), *ax.xaxis.get_ticklabels()):
        if 'average' in label.get_text().lower():
            label.set_color('red')
        label.set_fontname(FONT_FAMILY)


def heatmap_models_plus_techniues(df: pd.DataFrame, color: str, title: Optional[str] = None, get_weight: Optional[WeightFn] = None) -> None:
    """Save a heatmap of mean validation per model+technique and test.

    Args:
        df: Frame as produced by ``get_df``.
        color: Palette spec passed to ``sns.color_palette(..., as_cmap=True)``.
        title: Optional label; when given it is added to the plot title and
            the output file name, and the ``%`` suffix on the cell
            annotations is omitted.
        get_weight: Optional callback. Each row is multiplied by
            ``get_weight([model_size])`` of the row's model.

    Raises:
        IndexError: If ``get_weight`` is given and a row's model name is not
            the display name of any entry in ``models``.
    """
    ordered_techniques = df['model_technique'].unique()

    pt = pd.pivot_table(
        df,
        values='validation',
        index='model_technique',
        columns='test_name',
        observed=False,
        aggfunc="mean",
        fill_value=0,
    )
    # Restore the row order of df (the pivot sorts its index)
    pt = pt.loc[ordered_techniques]

    if get_weight:
        # Build the display-name -> size lookup once; on duplicate display
        # names the first entry of ``models`` wins.
        size_by_name: dict = {}
        for model_id in models:
            size_by_name.setdefault(
                models[model_id].display_name,
                models[model_id].parameter_count_in_b,
            )

        row_weights = []
        for label in pt.index:
            # Row labels are "<model>:<technique>"
            model_name = label.split(":")[0]
            if model_name not in size_by_name:
                raise IndexError(f"Model {model_name} not found in models.")
            row_weights.append(get_weight([size_by_name[model_name]]))

        pt = pt.mul(pd.Series(row_weights, index=pt.index), axis=0)

    pt = insert_average_models(pt=pt, df=df, pivot=10)
    pt = insert_average_test_x(pt=pt, df=df)

    # One decimal is shown when the bottom-right cell (average of the last
    # row) is small, so low values do not all round to the same integer.
    overall_average = pt['Average'].iloc[-1]

    fig = plt.figure(figsize=(8, 12))
    ax = sns.heatmap(
        pt * 100,
        annot=True,
        fmt=".0f" if overall_average > 0.1 else ".1f",
        cmap=sns.color_palette(color, as_cmap=True),
        cbar=True,
        annot_kws={"size": 10, "fontname": FONT_FAMILY},
    )

    # Update the annotations to display percentages and style the labels
    _style_heatmap_text(ax, show_percent=not title)

    # Set fonts for titles, labels, and tick labels
    title_suffix = "" if not title else ": " + title + " adjusted"
    plt.title(f'Model+Technique Performance{title_suffix}', fontsize=16, fontname=FONT_FAMILY)
    plt.xlabel('Test Name', fontsize=14, fontname=FONT_FAMILY)
    plt.ylabel('Model and Technique', fontsize=14, fontname=FONT_FAMILY)
    plt.xticks(rotation=45, ha='right', fontname=FONT_FAMILY)
    plt.yticks(fontname=FONT_FAMILY)

    plt.tight_layout()
    file_suffix = '' if not title else '-' + title.lower()
    plt.savefig(f"modeles-plus-techniques-heatmap{file_suffix}.pdf", format='pdf', dpi=1200)
    plt.close(fig)


def heatmap_techniques(df: pd.DataFrame, color: str, title: Optional[str] = None, get_weight: Optional[WeightFn] = None) -> None:
    """Save a heatmap of mean validation per test and technique.

    Args:
        df: Frame as produced by ``get_df``.
        color: Palette spec passed to ``sns.color_palette(..., as_cmap=True)``.
        title: Optional label; when given it is added to the plot title and
            the output file name, and the ``%`` suffix on the cell
            annotations is omitted.
        get_weight: Optional callback. The ``Native`` column is multiplied
            by its value for the sizes of all models with ``supports_tools``,
            the ``LSM`` and ``T2S`` columns by its value for the sizes of
            all other models.
    """
    pt = pd.pivot_table(
        df,
        values='validation',
        index='test_name',
        observed=False,
        columns='technique_name',
        aggfunc="mean",
        fill_value=0,
    )

    if get_weight:
        native = [
            models[m].parameter_count_in_b
            for m in models if models[m].supports_tools
        ]
        artificial = [
            models[m].parameter_count_in_b
            for m in models if not models[m].supports_tools
        ]
        weight_native = get_weight(native)
        weight_artificial = get_weight(artificial)

        pt['Native'] = pt['Native'] * weight_native
        pt['LSM'] = pt['LSM'] * weight_artificial
        pt['T2S'] = pt['T2S'] * weight_artificial

    pt = insert_average_test_y(pt=pt, df=df)
    pt = insert_average_technique(pt=pt, df=df)

    # One decimal is shown when the bottom-right cell (average of the
    # "Average" row) is small.
    overall_average = pt['Average'].iloc[-1]

    fig = plt.figure(figsize=(8, 4))
    ax = sns.heatmap(
        pt * 100,
        annot=True,
        fmt=".0f" if overall_average > 0.2 else ".1f",
        cmap=sns.color_palette(color, as_cmap=True),
        cbar=True,
        annot_kws={"size": 10, "fontname": FONT_FAMILY},
    )

    # Add percentage sign to annotations and style the labels
    _style_heatmap_text(ax, show_percent=not title)

    # Customize the plot with labels and a title
    title_suffix = '' if not title else ': ' + title + ' adjusted'
    plt.title(f"Technique Performance{title_suffix}", fontsize=16, fontname=FONT_FAMILY)
    plt.ylabel('Test Name', fontsize=14, fontname=FONT_FAMILY)
    plt.xlabel('Technique', fontsize=14, fontname=FONT_FAMILY)
    plt.xticks(rotation=0, fontname=FONT_FAMILY)
    plt.yticks(fontname=FONT_FAMILY)

    plt.tight_layout()
    file_suffix = '' if not title else '-' + title.lower()
    plt.savefig(f"techniques-heatmap{file_suffix}.pdf", format='pdf', dpi=1200)
    plt.close(fig)


def size(sizes: list[float]) -> float:
    """Return 100 divided by the arithmetic mean of ``sizes``.

    Raises:
        ZeroDivisionError: If ``sizes`` is empty or its mean is zero.
    """
    return 100/(sum(sizes) / len(sizes))


def performance(sizes: list[float]) -> float:
    """Return the arithmetic mean of ``1 / ln(x + 1)`` over ``sizes``.

    Raises:
        ZeroDivisionError: If ``sizes`` is empty.
    """
    weights_list = [1/np.log(x+1) for x in sizes]
    weight = sum(weights_list) / len(weights_list)
    return weight


if __name__ == "__main__":
    df = get_df()
    dff = remove_unfitting(df)

    trendline(dff.copy())

    heatmap_models_plus_techniues(df.copy(), color="blend:#100,#255,#4a3")
    heatmap_models_plus_techniues(df.copy(), color="blend:#100,#236,#44a,#a4d,#fff,#ffc,#ffa,#ff7,#ff4,#ff0,#af0,#7f0,#3f0,#0f0", title="Size", get_weight=size)
    heatmap_models_plus_techniues(df.copy(), color="blend:#100,#d44,#dc2,#dcc,#cff", title="Performance", get_weight=performance)

    heatmap_techniques(df=dff.copy(), color="blend:#100,#255,#4a3")
    heatmap_techniques(df=dff.copy(), color="blend:#100,#236,#44a,#a4d", title="Size", get_weight=size)
    heatmap_techniques(df=dff.copy(), color="blend:#100,#d44,#dc2", title="Performance", get_weight=performance)