caching, tests as dict, new tests
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
from libs.test_class import Test
|
||||
from libs.validators import system_human_answer_match
|
||||
from libs.runnables import basic
|
||||
from typing import Union
|
||||
|
||||
import json
|
||||
|
||||
@@ -16,18 +15,36 @@ def nxhash(text:str): # @BenVida StackOverflow
|
||||
hash = ( hash*281 ^ ord(ch)*997) & 0xFFFFFFFF
|
||||
return hex(hash)[2:].upper().zfill(8)
|
||||
|
||||
def get_len(l: list) -> int:
    """Return the longest display length found among the entries of ``l``.

    Each entry is measured according to its type:
      * ``Test`` -- length of its ``name`` attribute
      * ``str``  -- length of the string itself
      * ``int``  -- length of ``str(entry)``

    An empty list yields 0.

    Raises:
        Exception: if an entry is not a ``Test``, ``str`` or ``int``.
    """
    # Seed with 0 so an empty input still has a well-defined maximum.
    widths = [0]
    for entry in l:
        if isinstance(entry, Test):
            text = entry.name
        elif isinstance(entry, str):
            text = entry
        elif isinstance(entry, int):
            text = str(entry)
        else:
            raise Exception(f"get_len() only supports lits of Test, str or int but got {type(entry)}")
        widths.append(len(text))
    return max(widths)
|
||||
def get_len(collection: Union[list, dict]) -> int:
    """Return the longest display length among the entries of ``collection``.

    How entries are measured depends on the shape of ``collection``:
      * ``dict`` -- every value must expose a ``name`` attribute; the
        longest ``name`` is returned (keys are ignored).
      * ``list`` whose first element is a ``str`` -- the longest string.
      * ``list`` whose first element is an ``int`` -- the longest
        ``str(element)``.

    An empty list or dict yields 0.

    Raises:
        TypeError: if ``collection`` is neither a list nor a dict, or if it
            is a list whose first element is neither ``str`` nor ``int``.
    """
    if isinstance(collection, dict):
        return max((len(test.name) for test in collection.values()), default=0)

    if not isinstance(collection, list):
        raise TypeError("get_len: unsupported collection_type")

    if not collection:
        # Nothing to measure. Guarding here avoids the IndexError that
        # peeking at collection[0] would raise on an empty list.
        return 0

    # The first element decides how the whole list is interpreted.
    first = collection[0]
    if isinstance(first, str):
        return max(len(model_name) for model_name in collection)
    if isinstance(first, int):
        return max(len(str(seed)) for seed in collection)

    raise TypeError("get_len: unsupported collection_type")
|
||||
|
||||
|
||||
|
||||
@@ -44,29 +61,63 @@ def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url:
|
||||
run_results = {}
|
||||
print("Starting to run Tests ... ")
|
||||
for model in models:
|
||||
for seed in seeds:
|
||||
for test in tests:
|
||||
|
||||
for test_id in tests:
|
||||
test = tests[test_id]
|
||||
for seed in seeds:
|
||||
# Init dict
|
||||
combination = {
|
||||
'test_name': test.name,
|
||||
'test_id': test_id,
|
||||
'model': model,
|
||||
'seed': seed,
|
||||
}
|
||||
hash_key = str(nxhash(json.dumps(combination, sort_keys=True)))
|
||||
combination['test_name'] = test.name
|
||||
|
||||
# if hash_key == "DE3D137E":
|
||||
# pass
|
||||
|
||||
if hash_key not in saved_results.keys():
|
||||
try:
|
||||
combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
|
||||
print("\033[0;35mModel '\033[0m" +
|
||||
model +
|
||||
"\033[0;35m'" +
|
||||
(" " * (get_len(models) - len(model))) +
|
||||
" with seed \033[0m\033[0;30m" +
|
||||
("0" * (get_len(seeds) - len(str(seed)))) +
|
||||
"\033[0m" +
|
||||
str(seed) +
|
||||
"\033[0;35m now runs test '\033[0m" +
|
||||
test.name +
|
||||
"\033[0;35m'" +
|
||||
(" " * (get_len(tests) - len(test.name))) +
|
||||
" (\033[0m" +
|
||||
hash_key +
|
||||
"\033[0;35m)\033[0m",
|
||||
end=""
|
||||
)
|
||||
answer = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
|
||||
if isinstance(answer, str): # tool capabile return tools called as a list[dict]
|
||||
combination['answer'] = answer
|
||||
# combination['tool_calls'] = [] # no entry
|
||||
del answer
|
||||
elif isinstance(answer, dict): # calls
|
||||
combination['answer'] = answer['answer']
|
||||
combination['tool_calls'] = answer['tool_calls']
|
||||
del answer
|
||||
else:
|
||||
raise Exception(f"runnable returd unkown type {type(answer)}.")
|
||||
|
||||
|
||||
combination['test'] = test
|
||||
run_results[hash_key] = combination
|
||||
print("\033[0;32mModel '\033[0m" +
|
||||
print("\r\033[0;32mModel '\033[0m" +
|
||||
model +
|
||||
"\033[0;32m'" +
|
||||
(" " * (get_len(models) - len(model))) +
|
||||
" with seed \033[0m" +
|
||||
" with seed \033[0m\033[0;30m" +
|
||||
("0" * (get_len(seeds) - len(str(seed)))) +
|
||||
"\033[0m" +
|
||||
str(seed) +
|
||||
(" " * (get_len(seeds) - len(str(seed)))) +
|
||||
"\033[0;32m finished test '\033[0m" +
|
||||
test.name +
|
||||
"\033[0;32m'" +
|
||||
@@ -76,15 +127,16 @@ def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url:
|
||||
"\033[0;32m)\033[0m"
|
||||
)
|
||||
except Exception as e:
|
||||
print("\033[0;31mError: <\033[0m " + str(e) + "\033[0;31m>\033[0m trying to continue...")
|
||||
print("\r\033[0;31mError: <\033[0m" + str(e) + "\033[0;31m> at (\033[0m" + hash_key + "\033[0;31m). Continuing...")
|
||||
else:
|
||||
print("\033[0;34mModel '\033[0m" +
|
||||
print("\r\033[0;34mModel '\033[0m" +
|
||||
model +
|
||||
"\033[0;34m'" +
|
||||
(" " * (get_len(models) - len(model))) +
|
||||
" with seed \033[0m" +
|
||||
" with seed \033[0m\033[0;30m" +
|
||||
("0" * (get_len(seeds) - len(str(seed)))) +
|
||||
"\033[0m" +
|
||||
str(seed) +
|
||||
(" " * (get_len(seeds) - len(str(seed)))) +
|
||||
"\033[0;34m skipped test '\033[0m" +
|
||||
test.name +
|
||||
"\033[0;34m'" +
|
||||
@@ -100,25 +152,37 @@ def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url:
|
||||
for hash_key in run_results:
|
||||
result = run_results[hash_key]
|
||||
|
||||
entry = {
|
||||
'test_name': result['test_name'],
|
||||
'model': result['model'],
|
||||
'seed': result['seed'],
|
||||
'answer': result['answer'],
|
||||
'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url)
|
||||
}
|
||||
try:
|
||||
entry = {
|
||||
'test_name': result['test_name'],
|
||||
'test_id': result['test_id'],
|
||||
'model': result['model'],
|
||||
'seed': result['seed'],
|
||||
'answer': result['answer'],
|
||||
'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url),
|
||||
}
|
||||
except Exception as e:
|
||||
print("\033[0;31mError validating entry (\033[0m" + hash_key + "\033[0;31m). <\033[0m" + str(e) + "\033[0;31m> Continuing...\033[0m")
|
||||
continue
|
||||
|
||||
try:
|
||||
entry['tool_calls'] = result['tool_calls']
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
saved_results[hash_key] = entry # add result with validation to saved results
|
||||
|
||||
print("\033[0;36mTest results of model '\033[0m" +
|
||||
model +
|
||||
entry['model'] +
|
||||
"\033[0;36m'" +
|
||||
(" " * (get_len(models) - len(entry['model']))) +
|
||||
" with seed \033[0m" +
|
||||
str(seed) +
|
||||
(" " * (get_len(seeds) - len(str(entry['seed'])))) +
|
||||
" with seed \033[0m\033[0;30m" +
|
||||
("0" * (get_len(seeds) - len(str(entry['seed'])))) +
|
||||
"\033[0m" +
|
||||
str(entry['seed']) +
|
||||
"\033[0;36m on test '\033[0m" +
|
||||
test.name +
|
||||
entry['test_name'] +
|
||||
"\033[0;36m'" +
|
||||
(" " * (get_len(tests) - len(entry['test_name']))) +
|
||||
" (\033[0m" +
|
||||
|
||||
Reference in New Issue
Block a user