190 lines
8.0 KiB
Python
190 lines
8.0 KiB
Python
from libs.classes import Test, Model
|
|
from libs.functions import nxhash
|
|
from typing import Union
|
|
|
|
import json
|
|
|
|
def get_len(collection: Union[list, dict]) -> int:
    """Return the length of the longest display label in *collection*.

    The label that is measured depends on what the collection holds:

    * ``list`` (seeds): ``len(str(seed))`` of every element.
    * ``dict`` whose first value is a ``Model``: ``len(value.display_name)``.
    * ``dict`` whose first value is a ``Test``: ``len(value.name)``.

    Only the first value of a dict is inspected to decide which kind it is.

    Args:
        collection: A list of seeds, or a dict of ``Model`` / ``Test`` objects.

    Returns:
        The maximum label length, or ``0`` when the collection is empty.

    Raises:
        TypeError: If *collection* is neither a list nor a dict, or is a dict
            whose first value is neither a ``Model`` nor a ``Test``.
    """
    if isinstance(collection, list):
        return max((len(str(seed)) for seed in collection), default=0)

    if not isinstance(collection, dict):
        raise TypeError("get_len: unsupported collection_type")

    # An empty dict has no first value to inspect; treat it like an empty
    # list instead of failing with IndexError.
    if not collection:
        return 0

    first_value = next(iter(collection.values()))
    if isinstance(first_value, Model):
        return max(len(model.display_name) for model in collection.values())
    if isinstance(first_value, Test):
        return max(len(test.name) for test in collection.values())

    raise TypeError("get_len: unsupported collection_type")
|
|
|
|
|
|
|
|
def _format_status(color: str, lead: str, verb: str, model_name: str, seed, test_name: str,
                   hash_key: str, widths: tuple, tail: str = "") -> str:
    """Build one colored, column-aligned status line for a model/seed/test combination.

    Args:
        color: ANSI escape sequence used for the fixed (non-value) text.
        lead: Text in front of the model name (e.g. ``"Model"``).
        verb: Text between the seed and the word ``test`` (e.g. ``"finished"``).
        model_name: Model label; right-padded with spaces to ``widths[0]``.
        seed: Seed value; left-padded with zeros to ``widths[1]``.
        test_name: Test label; right-padded with spaces to ``widths[2]``.
        hash_key: Identifier of the combination, shown in parentheses.
        widths: ``(model_width, seed_width, test_width)`` column widths.
        tail: Extra colored text appended after the closing parenthesis.

    Returns:
        The formatted line, without a trailing newline.
    """
    reset = "\033[0m"
    black = "\033[0;30m"  # colors the zero padding in front of the seed
    model_width, seed_width, test_width = widths
    seed_text = str(seed)
    return (
        f"{color}{lead} '{reset}{model_name}{color}'"
        + " " * (model_width - len(model_name))
        + f" with seed {reset}{black}"
        + "0" * (seed_width - len(seed_text))
        + f"{reset}{seed_text}"
        + f"{color} {verb} test '{reset}{test_name}{color}'"
        + " " * (test_width - len(test_name))
        + f" ({reset}{hash_key}{color}){tail}{reset}"
    )


def run_tests(models: dict[int, Model], seeds: list[int], tests: dict[int, Test], base_url: str):
    """Run every model/test/seed combination that has no saved result yet.

    Each combination is identified by ``str(nxhash(...))`` over the JSON dump
    of its ``test_id``, ``model_id`` and ``seed``. Combinations whose key is
    already present in ``./saved_results.json`` are skipped. For the others,
    ``test.runnable`` is called and its answer is afterwards passed to
    ``test.validator``. Validated entries are merged into the saved results,
    which are written back to ``./saved_results.json`` when at least one
    combination was run.

    Errors raised while running or validating a single combination are
    printed and that combination is skipped; the remaining ones continue.

    Args:
        models: Models to test, keyed by model id.
        seeds: Seeds to run each model/test pair with.
        tests: Tests to run, keyed by test id.
        base_url: Passed through unchanged to ``test.runnable`` and
            ``test.validator``.

    Returns:
        The dict of saved results (previously stored plus newly validated
        entries), keyed by combination hash.
    """
    results_path = "./saved_results.json"

    try:
        print("Trying to load saved_results.json")
        # Explicit UTF-8: the file is written with ensure_ascii=False, so the
        # platform default encoding must not be relied on.
        with open(results_path, "r", encoding="utf-8") as f:
            saved_results = json.load(fp=f)
        print("Loaded.")
    except FileNotFoundError:
        print("saved_results.json not found. Initializing empty.")
        saved_results = {}

    # Column widths are the same for every status line, so compute them once.
    # With any input empty no status line is ever printed, so skip the
    # measurement entirely in that case.
    if models and seeds and tests:
        widths = (get_len(models), get_len(seeds), get_len(tests))
    else:
        widths = (0, 0, 0)

    # Get Results
    run_results = {}
    print("Starting to run Tests ... ")
    for model_id, model in models.items():
        for test_id, test in tests.items():
            for seed in seeds:
                # Only the identifying fields take part in the hash; the
                # display names are added afterwards.
                combination = {
                    'test_id': test_id,
                    'model_id': model_id,
                    'seed': seed,
                }
                hash_key = str(nxhash(json.dumps(combination, sort_keys=True)))
                combination['test_name'] = test.name
                combination['model_name'] = model.display_name

                if hash_key in saved_results:
                    print("\r" + _format_status(
                        "\033[0;34m", "Model", "skipped",
                        model.display_name, seed, test.name, hash_key, widths,
                        tail=" becasue its results exists in saved_results.json",
                    ))
                    continue

                try:
                    # No newline: the line is overwritten via "\r" once the
                    # test finishes. flush=True makes it visible while the
                    # test is still running.
                    print(
                        _format_status(
                            "\033[0;35m", "Model", "now runs",
                            model.display_name, seed, test.name, hash_key, widths,
                        ),
                        end="",
                        flush=True,
                    )
                    answer = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
                    if isinstance(answer, str):
                        combination['answer'] = answer
                    elif isinstance(answer, dict):  # answer plus tool calls
                        combination['answer'] = answer['answer']
                        combination['tool_calls'] = answer['tool_calls']
                    else:
                        raise TypeError(f"runnable returned unkown type {type(answer)}.")

                    # Keep the Test object so the validation pass can call
                    # its validator; it is not copied into the saved entry.
                    combination['test'] = test
                    run_results[hash_key] = combination
                    print("\r" + _format_status(
                        "\033[0;32m", "Model", "finished",
                        model.display_name, seed, test.name, hash_key, widths,
                    ))
                except Exception as e:
                    # Per-combination boundary: report and move on.
                    print("\r\033[0;31mError: <\033[0m" + str(e) + "\033[0;31m> at (\033[0m" + hash_key + "\033[0;31m). Continuing...\033[0m ")

    # Validate Results
    if run_results:
        print("\nStarting validation of tests ...")
        for hash_key, result in run_results.items():
            try:
                entry = {
                    'test_name': result['test_name'],
                    'test_id': result['test_id'],
                    'model_name': result['model_name'],
                    'model_id': result['model_id'],
                    'seed': result['seed'],
                    'answer': result['answer'],
                    'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url),
                }
            except Exception as e:
                print("\033[0;31mError validating entry (\033[0m" + hash_key + "\033[0;31m). <\033[0m" + str(e) + "\033[0;31m> Continuing...\033[0m ")
                continue

            # tool_calls only exists when the runnable returned a dict.
            if 'tool_calls' in result:
                entry['tool_calls'] = result['tool_calls']

            saved_results[hash_key] = entry  # add result with validation to saved results

            verdict = '\033[0;32mcorrect\033[0m' if entry['validation'] else '\033[0;31mincorrect\033[0m'
            print(
                _format_status(
                    "\033[0;36m", "Test results of model", "on",
                    entry['model_name'], entry['seed'], entry['test_name'], hash_key, widths,
                    tail=" evaluated to ",
                )
                + verdict
            )

        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(fp=f, obj=saved_results, indent=4, sort_keys=True, ensure_ascii=False)

    return saved_results
|