This commit is contained in:
Lennart J. Kurzweg (Nx2)
2024-09-27 21:34:21 +02:00
parent 3c2429ce78
commit 76b2b436be
8 changed files with 297 additions and 133 deletions

2
.gitignore vendored
View File

@@ -3,5 +3,5 @@
.direnv .direnv
.vscode .vscode
saved_results.json saved_results.json
*.png *.eps

BIN
assets/NewCM10-Regular.otf Normal file

Binary file not shown.

View File

@@ -71,7 +71,7 @@ def run_tests(models: dict[int, Model], seeds: list[int], tests: dict[int, Test]
'seed': seed, 'seed': seed,
'technique_id': technique_id 'technique_id': technique_id
} }
hash_key = str(nxhash(json.dumps(combination, sort_keys=True))) hash_key = nxhash(json.dumps(combination, sort_keys=True))
combination.update({ combination.update({
'test_name': test.name, 'test_name': test.name,

View File

@@ -157,9 +157,9 @@ def get_notes_containing(patterns: Union[list[str], str]) -> str:
return ret return ret
@tool @tool
def write_note(command: str) -> str: def write_note(note: str) -> str:
"""Write a not with the current time to the database.""" """Write a not with the current time to the database."""
return command return "Written."
@tool @tool
def save_python_repl(command: str): def save_python_repl(command: str):
@@ -171,6 +171,7 @@ def save_python_repl(command: str):
"^ *subprocess\\.", "^ *subprocess\\.",
"^ *(with)? ?open\\(", "^ *(with)? ?open\\(",
"^ *shutil\\.", "^ *shutil\\.",
"^ *requests\\.",
] ]
valid = True valid = True
for pattern in blocked_patterns: for pattern in blocked_patterns:

View File

View File

@@ -29,6 +29,6 @@ models = {
295: Model( display_name="Qwen2 7b", identifier="qwen2:7b", supports_tools=False, parameter_count_in_b=7 ), 295: Model( display_name="Qwen2 7b", identifier="qwen2:7b", supports_tools=False, parameter_count_in_b=7 ),
655: Model( display_name="Qwen2 72b", identifier="qwen2:72b", supports_tools=False, parameter_count_in_b=72 ), 655: Model( display_name="Qwen2 72b", identifier="qwen2:72b", supports_tools=False, parameter_count_in_b=72 ),
780: Model( display_name="Hermes3 8b", identifier="hermes3:8b", supports_tools=True, parameter_count_in_b=8 ), 780: Model( display_name="Hermes3 8b", identifier="hermes3:8b", supports_tools=True, parameter_count_in_b=8 ),
68: Model( display_name="Aya 8b", identifier="aya:8b", supports_tools=False, parameter_count_in_b=8 ), 68: Model( display_name="Aya 8b", identifier="aya:8b", supports_tools=False, parameter_count_in_b=8 ),
397: Model( display_name="Aya 35b", identifier="aya:35b", supports_tools=False, parameter_count_in_b=35 ), 397: Model( display_name="Aya 35b", identifier="aya:35b", supports_tools=False, parameter_count_in_b=35 ),
} }

View File

@@ -33,10 +33,7 @@ tests = {
"tools": {"add": add, "multiply": multiply}, "tools": {"add": add, "multiply": multiply},
}, },
validator=regex_match_any, validator=regex_match_any,
validation_input={ validation_input={"patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]},
"patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
# "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
},
), ),
120: Test( 120: Test(
name="Complex Multiplication", name="Complex Multiplication",
@@ -47,10 +44,10 @@ tests = {
"tools": {"add": add, "multiply": multiply}, "tools": {"add": add, "multiply": multiply},
}, },
validator=regex_match_any, validator=regex_match_any,
validation_input={"patterns": ["6134205", "6.134.205", "6,134,205"]}, validation_input={"patterns": ["6134205", "6,134,205"]},
), ),
363: Test( 363: Test(
name="Complex Multiplication Python", name="Python Remainder",
runnable=one_tool_call_answer, runnable=one_tool_call_answer,
runnable_input={ runnable_input={
"system_msg": 'You are a helpful assistant.', "system_msg": 'You are a helpful assistant.',
@@ -58,9 +55,7 @@ tests = {
"tools": { "save_python_repl": save_python_repl }, "tools": { "save_python_repl": save_python_repl },
}, },
validator=regex_match_any, validator=regex_match_any,
validation_input={ validation_input={"patterns": [ "236", "two ?hundred and thirty ?six", "two ?hundred thirty ?six" ]}
"patterns": [ "236", "two ?hundred and thirty ?six", "two ?hundred thirty ?six" ]
}
), ),
283: Test( 283: Test(
name="Notes from last Saturday", name="Notes from last Saturday",
@@ -96,7 +91,7 @@ tests = {
validator=system_human_answer_match, validator=system_human_answer_match,
validation_input={ validation_input={
"criteria": dedent("""- containing the information that the Human should call Wolfgang "criteria": dedent("""- containing the information that the Human should call Wolfgang
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""") - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
}, },
), ),
856: Test( 856: Test(
@@ -113,11 +108,11 @@ tests = {
AIMessage("I'm afraid I cannot be of great help, since I obviously know charlotte way less than you, but last year you two went out to Cavalinons and you got her a rose necklace as a present. And she liked it. So maybe a pair of earrings would be something she'd like?", name="example_assistant"), AIMessage("I'm afraid I cannot be of great help, since I obviously know charlotte way less than you, but last year you two went out to Cavalinons and you got her a rose necklace as a present. And she liked it. So maybe a pair of earrings would be something she'd like?", name="example_assistant"),
HumanMessage("Did I write down anything yesterday or the day before that?"), HumanMessage("Did I write down anything yesterday or the day before that?"),
AIMessage( "", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "21"}]), AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "21"}]),
ToolMessage("Wednesday the 7th of August 2024 16:23", tool_call_id="21"), ToolMessage("Wednesday the 7th of August 2024 16:23", tool_call_id="21"),
AIMessage( "", tool_calls=[ { "name": "get_notes_in_timespan", "args": {"begin": "2024/08/05", "to": "2024/08/06"}, "id": "22"}]), AIMessage("", tool_calls=[ { "name": "get_notes_in_timespan", "args": {"begin": "2024/08/05", "to": "2024/08/06"}, "id": "22"}]),
ToolMessage( "2024/08/05 11:45 Ask Dr. Mills about the side effects of the new medication he got me.\n\n2024/08/06 18:30 Pick up the dry cleaning on Thursday; they close early on Fridays.", tool_call_id="22"), ToolMessage( "2024/08/05 11:45 Ask Dr. Mills about the side effects of the new medication he got me.\n\n2024/08/06 18:30 Pick up the dry cleaning on Thursday; they close early on Fridays.", tool_call_id="22"),
AIMessage( "Yes. I found two entries.\n- From yesterday stating that you wanted to pickup the dry cleaning on Thursday, because they close early on Fridays\n- From Monday a note saying that you want to ask Dr. Mills about the side effects of the new medication he got you.", name="example_assistant"), AIMessage("Yes. I found two entries.\n- From yesterday stating that you wanted to pickup the dry cleaning on Thursday, because they close early on Fridays\n- From Monday a note saying that you want to ask Dr. Mills about the side effects of the new medication he got you.", name="example_assistant"),
], ],
"human_msg": "Last Saturday, who did grandma want me to call?", "human_msg": "Last Saturday, who did grandma want me to call?",
"tools": { "tools": {

View File

@@ -1,142 +1,310 @@
from typing import Callable, Optional
import json import json
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import numpy as np
from suite_settings.models import models from suite_settings.models import models
from suite_settings.techniques import techniques from suite_settings.techniques import techniques
from suite_settings.tests import tests from suite_settings.tests import tests
# Load the JSON data
with open('saved_results.json', 'r') as f:
data = json.load(f)
# Convert JSON data into a DataFrame
results = []
for test_hash, test_data in data.items():
results.append({
"hash": test_hash,
"model_name": models[test_data['model_id']].display_name,
"model_size": models[test_data['model_id']].parameter_count_in_b,
"technique_name": techniques[test_data['technique_id']].name,
"model_technique": f"{models[test_data['model_id']].display_name}:{ techniques[test_data['technique_id']].name}",
"seed": test_data['seed'],
"test_name": tests[test_data['test_id']].name,
"validation": test_data['validation']
})
df = pd.DataFrame(results)
FONT_FAMILY = "NewComputerModern08"
df['technique_name'] = pd.Categorical(df['technique_name'], categories=[techniques[1].name, techniques[572].name, techniques[903].name],ordered=True)
df['test_name'] = pd.Categorical(df['test_name'], categories=[tests[607].name, tests[693].name, tests[363].name, tests[120].name, tests[283].name, tests[260].name, tests[856].name],ordered=True)
sorted_df = df.sort_values('model_size')
# Perform the groupby and unstack operation
result_df = (
sorted_df.groupby(['model_name', 'validation']).size()
.unstack(fill_value=0) # Unstack and fill NaN with 0
)
## 1st Chart
# Count the number of validation results for each technique_name
validation_counts = result_df.loc[sorted_df['model_name'].drop_duplicates()]
validation_counts.columns = ['Failed', 'Passed']
# Plot the validation results by technique_name
plt.figure(figsize=(10, 6))
validation_counts.plot(kind='bar', stacked=True, color=['red', 'green'], ax=plt.gca())
plt.title('Validation Results by Model and Technique')
plt.xlabel('Model and Technique')
plt.ylabel('Number of Tests')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Validation')
plt.tight_layout()
plt.savefig('model-bar-chart.png')
def get_df() -> pd.DataFrame:
with open('saved_results.json', 'r') as f:
data = json.load(f)
raw_data = []
for test_hash, test_data in data.items():
raw_data.append({
"hash": test_hash,
"model_name": models[test_data['model_id']].display_name,
"model_size": models[test_data['model_id']].parameter_count_in_b,
"technique_name": techniques[test_data['technique_id']].name,
"model_technique": f"{models[test_data['model_id']].display_name}:{ techniques[test_data['technique_id']].name}",
"seed": test_data['seed'],
"test_name": tests[test_data['test_id']].name,
"validation": test_data['validation']
})
df = pd.DataFrame(raw_data)
# Categorical ordering for 'technique_name'
df['technique_name'] = pd.Categorical(
df['technique_name'],
categories=[
techniques[1].name,
techniques[572].name,
techniques[903].name
],
ordered=True
)
# Categorical ordering for 'test_name'
df['test_name'] = pd.Categorical(
df['test_name'],
categories=[
tests[693].name,
tests[363].name,
tests[120].name,
tests[283].name,
tests[260].name,
tests[856].name
],
ordered=True
)
# Sort by model_size first, then alphabetically by model_name
sorted_df = df.sort_values(['model_size', 'model_name'], ascending=[True, True])
return sorted_df
def insert_average_models(pt: pd.DataFrame, df: pd.DataFrame, pivot: int) -> pd.DataFrame:
# Use the df's model_size for calculations
model_sizes = df.groupby('model_technique')['model_size'].first()
# Split the pivot table into two groups based on model size
up_to_pivot = pt.index[model_sizes.loc[pt.index] <= pivot]
above_pivot = pt.index[model_sizes.loc[pt.index] > pivot]
# Calculate average pass rate for models up to and including 10B
avg_up_to_pivot = pt.loc[up_to_pivot].mean()
# Find the last model with exactly 10B parameters
last_pivot_model_index = up_to_pivot[up_to_pivot.shape[0]-1]
# Reindex the pivot table to insert the new row after the last 10B model
new_index = list(pt.index)
last_10b_position = new_index.index(last_pivot_model_index)
# Insert the row "Average up to 10B" right after the last 10B model
new_index.insert(last_10b_position+1, f"Average up to {pivot}b")
pt = pt.reindex(new_index)
# Set the values for the "Average up to 10B" row
pt.loc[f"Average up to {pivot}b"] = avg_up_to_pivot
# Calculate the average pass rate for models larger than 10B
avg_above_pivot = pt.loc[above_pivot].mean()
# Add a new row for the average of models larger than 10B at the end
pt.loc[f"Average above {pivot}b"] = avg_above_pivot
# Calculate the average pass rate for models larger than 10B
avg_total = pt.loc[pt.index].mean()
# Add a new row for the average of models larger than 10B at the end
pt.loc["Average Total"] = avg_total
return pt
def insert_average_test_y(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
avg = pt.loc[pt.index].mean()
pt.loc["Average"] = avg
return pt
def insert_average_test_x(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
avg = pt.mean(axis=1)
pt["Average"] = avg
return pt
def insert_average_technique(pt: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
avg = pt.mean(axis=1) # Calculate the average across columns (axis=1)
pt['Average'] = avg # Insert the average as a new column
return pt
UNFITTING = [
903, # tinyllama
404, # llama3 groq TU
120, # llama3 groq TU 70b
890 # Command R+
]
def remove_unfitting(df: pd.DataFrame) -> pd.DataFrame:
if len(UNFITTING) > 0:
dff = df.loc[df['model_name'] != models[UNFITTING[0]].display_name]
if len(UNFITTING) > 1:
for id in UNFITTING[1:]:
dff = dff.loc[dff['model_name'] != models[id].display_name]
return dff
return df
def trendline(df: pd.DataFrame) -> None:
# Step 1: Calculate pass rate for each model size
# Group by 'model_size' and calculate the percentage of runs validated as correct
df['validation'] = df['validation'].astype(int) # Convert validation boolean to 1/0
pass_rate_df = df.groupby('model_size').agg(
pass_rate=('validation', 'mean') # Mean gives us the percentage of correct validations
).reset_index()
# Step 2: Plotting
plt.figure(figsize=(10, 6))
plt.scatter(pass_rate_df['model_size'], pass_rate_df['pass_rate'] * 100, label='Pass Rate (%)', color='blue')
# Fit a trendline
z = np.polyfit(pass_rate_df['model_size'], pass_rate_df['pass_rate'] * 100, 1) # Linear trendline
p = np.poly1d(z)
plt.plot(pass_rate_df['model_size'], p(pass_rate_df['model_size']), color='red', label='Trendline')
# Set font for axis tick labels
font_properties = {'fontname': FONT_FAMILY, 'fontsize': 12}
plt.xticks(**font_properties)
plt.yticks(**font_properties)
# Step 3: Customize plot
plt.title('Model Size vs Pass Rate', font=FONT_FAMILY)
plt.xlabel('Model Size (in billions of parameters)', font=FONT_FAMILY)
plt.ylabel('Pass Rate (%)', font=FONT_FAMILY)
plt.grid(True)
plt.legend(prop={'family': FONT_FAMILY})
# Save the plot
plt.savefig('size-trendline.eps', format='eps', dpi=1200)
def heatmap_models_plus_techniues(df: pd.DataFrame, color: str, title: Optional[str]= None, get_weight: Optional[Callable]= None) -> None:
ordered_techniques = df['model_technique'].unique()
pt = pd.pivot_table(
df,
values='validation',
index='model_technique',
columns='test_name',
observed=False,
aggfunc="mean",
fill_value=0
)
pt = pt.loc[ordered_techniques]
if get_weight:
def get_model_size_by_name(name: str) -> float:
for id in models:
if name == models[id].display_name:
return models[id].parameter_count_in_b
raise IndexError(f"Model {name} not found in models.")
for (index, row) in pt.iterrows():
pt.loc[index] = row * get_weight([ get_model_size_by_name(index.split(":")[0]) ])
pt = insert_average_models(pt=pt, df=df, pivot=10)
pt = insert_average_test_x(pt=pt, df=df)
plt.figure(figsize=(8, 12))
sns.heatmap(
pt * 100,
annot=True,
fmt=".0f" if pt.tail(1)['Average'].item() > 0.1 else ".1f",
cmap=sns.color_palette(color, as_cmap=True),
cbar=True,
annot_kws={"size": 10, "fontname": FONT_FAMILY}
)
## 2nd Chart # Update the annotations to display percentages
# Plot the validation results by test name for text in plt.gca().texts:
test_name_counts = df.groupby(['test_name', 'validation']).size().unstack().fillna(0) o = text.get_text()
test_name_counts.columns = ['Failed', 'Passed'] text.set_text(f"{o if o != '0.0' else '0'}{'%' if not title else ''}")
text.set_fontname(FONT_FAMILY)
plt.figure(figsize=(10, 6)) for text in plt.gca().yaxis.get_ticklabels():
test_name_counts.plot(kind='barh', stacked=True, color=['red', 'green'], ax=plt.gca()) if 'average' in text.get_text().lower():
plt.title('Validation Results by Test Name') text.set_color('red')
plt.xlabel('Number of Tests') text.set_fontname(FONT_FAMILY)
plt.ylabel('Test Name')
plt.legend(title='Validation') for text in plt.gca().xaxis.get_ticklabels():
plt.tight_layout() if 'average' in text.get_text().lower():
plt.savefig('test-bar-chart.png') text.set_color('red')
text.set_fontname(FONT_FAMILY)
# Set fonts for titles, labels, and tick labels
plt.title(f'Model+Technique Performance{"" if not title else ": " + title + "adjsuted"}', fontsize=16, fontname=FONT_FAMILY)
plt.xlabel('Test Name', fontsize=14, fontname=FONT_FAMILY)
plt.ylabel('Model and Technique', fontsize=14, fontname=FONT_FAMILY)
plt.xticks(rotation=45, ha='right', fontname=FONT_FAMILY)
plt.yticks(fontname=FONT_FAMILY)
plt.tight_layout()
plt.savefig(f"modeles-plus-techniques-heatmap{'' if not title else '-' + title.lower()}.eps", format='eps', dpi=1200)
def heatmap_techniques(df: pd.DataFrame, color: str, title: Optional[str]= None, get_weight: Optional[Callable]= None) -> None:
pt = pd.pivot_table(
df,
values='validation',
index='test_name',
observed=False,
columns='technique_name',
aggfunc="mean",
fill_value=0
)
if get_weight:
native = [ models[m].parameter_count_in_b for m in models if models[m].supports_tools ]
artificial = [ models[m].parameter_count_in_b for m in models if not models[m].supports_tools ]
weight_native = get_weight(native)
weight_artificial = get_weight(artificial)
pt['Native'] = pt['Native'] * weight_native
pt['LSM'] = pt['LSM'] * weight_artificial
pt['T2S'] = pt['T2S'] * weight_artificial
sorted_df = df.sort_values('model_size' ) pt = insert_average_test_y(pt=pt, df=df)
pt = insert_average_technique(pt=pt, df=df)
plt.figure(figsize=(8, 4))
sns.heatmap(
pt * 100,
annot=True,
fmt=".0f" if pt.tail(1)['Average'].item() > 0.2 else ".1f",
cmap=sns.color_palette(color, as_cmap=True),
cbar=True,
annot_kws={"size": 10, "fontname": FONT_FAMILY}
)
# Get the unique order of 'model_technique' based on sorted_df # Add percentage sign to annotations
ordered_techniques = sorted_df['model_technique'].unique() for text in plt.gca().texts:
o = text.get_text()
text.set_text(f"{o if o != '0.0' else '0'}{'%' if not title else ''}")
text.set_fontname(FONT_FAMILY)
# Create the pivot table with the correct order of model_technique for text in plt.gca().yaxis.get_ticklabels():
pass_rate = pd.pivot_table( if 'average' in text.get_text().lower():
sorted_df, text.set_color('red')
values='validation', text.set_fontname(FONT_FAMILY)
index='model_technique',
columns='test_name', for text in plt.gca().xaxis.get_ticklabels():
aggfunc="mean", if 'average' in text.get_text().lower():
fill_value=0 text.set_color('red')
) text.set_fontname(FONT_FAMILY)
# Customize the plot with labels and a title
plt.title(f"Technique Performance{'' if not title else ': ' + title + ' adjusted'}", fontsize=16, fontname=FONT_FAMILY)
plt.ylabel('Test Name', fontsize=14, fontname=FONT_FAMILY)
plt.xlabel('Technique', fontsize=14, fontname=FONT_FAMILY)
# Reorder the rows in the pivot table based on the ordered techniques plt.xticks(rotation=0, fontname=FONT_FAMILY)
pass_rate = pass_rate.loc[ordered_techniques] plt.yticks(fontname=FONT_FAMILY)
# Plot the heatmap plt.tight_layout()
plt.figure(figsize=(8, 12)) plt.savefig(f"techniques-heatmap{'' if not title else '-' + title.lower()}.eps", format='eps', dpi=1200)
sns.heatmap(
pass_rate * 100,
annot=True,
fmt=".0f",
cmap=sns.color_palette("blend:#100,#255,#4a3", as_cmap=True),
cbar=True,
annot_kws={"size": 10}
)
# Add percentage sign to annotations
for text in plt.gca().texts:
text.set_text(f"{text.get_text()}%")
# Customize the plot with labels and a title
plt.title('Model Technique Performance on Each Test', fontsize=16)
plt.xlabel('Test Name', fontsize=14)
plt.ylabel('Model and Technique', fontsize=14)
# Rotate x-axis labels by 45 degrees
plt.xticks(rotation=45, ha='right')
# Adjust layout to ensure labels don't get cut off
plt.tight_layout()
# Save the heatmap
plt.savefig('modelTechnique_heatmap.png')
## 4th Chart: Technique Performance on Each Test (Aggregated Heatmap) def size(sizes: list[float]) -> float:
technique_pass_rate = pd.pivot_table(sorted_df, values='validation', index='test_name', columns='technique_name', aggfunc="mean", fill_value=0) return 100/(sum(sizes) / len(sizes))
plt.figure(figsize=(8, 4))
sns.heatmap(technique_pass_rate*100, annot=True, fmt=".0f", cmap=sns.color_palette("blend:#100,#255,#4a3", as_cmap=True), cbar=True, annot_kws={"size": 10})
# Add percentage sign to annotations def performance(sizes: list[float]) -> float:
for text in plt.gca().texts: weights_list = [ 1/np.log(x+1) for x in sizes ]
text.set_text(f"{text.get_text()}%") weight = sum(weights_list) / len(weights_list)
return weight
# Customize the plot with labels and a title if __name__ == "__main__":
plt.title('Technique Performance on Each Test', fontsize=16) df = get_df()
plt.ylabel('Test Name', fontsize=14) dff = remove_unfitting(df)
plt.xlabel('Technique', fontsize=14) trendline(dff.copy())
plt.xticks(rotation=0) heatmap_models_plus_techniues(df.copy(), color="blend:#100,#255,#4a3")
plt.tight_layout() heatmap_models_plus_techniues(df.copy(), color="blend:#100,#236,#44a,#a4d,#fff,#ffc,#ffa,#ff7,#ff4,#ff0,#af0,#7f0,#3f0,#0f0", title="Size", get_weight=size)
plt.savefig('technique_heatmap.png') heatmap_models_plus_techniues(df.copy(), color="blend:#100,#d44,#dc2,#dcc,#cff", title="Performance", get_weight=performance)
heatmap_techniques(df=dff.copy(), color="blend:#100,#255,#4a3")
heatmap_techniques(df=dff.copy(), color="blend:#100,#236,#44a,#a4d", title="Size", get_weight=size)
heatmap_techniques(df=dff.copy(), color="blend:#100,#d44,#dc2", title="Performance", get_weight=performance)