caching, tests as dict, new tests

2024-08-14 21:03:03 +02:00
parent 298d8c83ef
commit 2f46056e0d
4 changed files with 501 additions and 74 deletions
--- a/libs/run_tests.py
+++ b/libs/run_tests.py
@@ -1,6 +1,5 @@
 from libs.test_class import Test
-from libs.validators import system_human_answer_match
-from libs.runnables import basic
+from typing import Union

 import json

@@ -16,18 +15,36 @@ def nxhash(text:str): # @BenVida StackOverflow
        hash = ( hash*281  ^ ord(ch)*997) & 0xFFFFFFFF
    return hex(hash)[2:].upper().zfill(8)

-def get_len(l: list) -> int:
-    m = 0
-    for e in l:
-        if isinstance(e, Test):
-            m = max(m, len(e.name))
-        elif isinstance(e, str):
-            m = max(m, len(e))
-        elif isinstance(e, int):
-            m = max(m, len(str(e)))
-        else:
-            raise Exception(f"get_len() only supports lits of Test, str or int but got {type(e)}")
-    return m
+def get_len(collection: Union[list, dict]) -> int:
+    maximum_length = 0
+
+    if isinstance(collection, dict):
+        collection_type = "tests"
+    elif isinstance(collection, list):
+        if isinstance(collection[0], str):
+                collection_type = "models"
+        elif isinstance(collection[0], int):
+            collection_type = "seeds"
+        else: 
+            raise TypeError("get_len: unsupported collection_type")
+    else:
+        raise TypeError("get_len: unsupported collection_type")
+
+    match collection_type:
+        case "models":
+            for model_name in collection:
+                maximum_length = max(maximum_length, len(model_name))
+        case "seeds":
+            for seed in collection:
+                maximum_length = max(maximum_length, len(str(seed)))
+        case "tests":
+            for test_id in collection:
+                maximum_length = max(maximum_length, len(collection[test_id].name))
+        case _:
+            for model_name in collection:
+                raise TypeError("get_len: unsupported collection_type")
+        
+    return maximum_length

    
    
@@ -44,29 +61,63 @@ def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url:
    run_results = {}
    print("Starting to run Tests ... ")
    for model in models:
-        for seed in seeds:
-            for test in tests:
-
+        for test_id in tests:
+            test = tests[test_id]
+            for seed in seeds:
                # Init dict
                combination = { 
-                    'test_name': test.name,
+                    'test_id': test_id,
                    'model': model,
                    'seed': seed,
                }
                hash_key = str(nxhash(json.dumps(combination, sort_keys=True)))
+                combination['test_name'] = test.name
+
+                # if hash_key == "DE3D137E":
+                    # pass

                if hash_key not in saved_results.keys():
                    try: 
-                        combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
+                        print("\033[0;35mModel '\033[0m" +
+                              model +
+                              "\033[0;35m'" +
+                              (" " * (get_len(models) - len(model))) +
+                              " with seed \033[0m\033[0;30m" +
+                              ("0" * (get_len(seeds) - len(str(seed)))) +
+                              "\033[0m" +
+                              str(seed) +
+                              "\033[0;35m now runs test '\033[0m" +
+                              test.name +
+                              "\033[0;35m'" +
+                              (" " * (get_len(tests) - len(test.name))) +
+                              " (\033[0m" +
+                              hash_key +
+                              "\033[0;35m)\033[0m",
+                              end=""
+                        )
+                        answer = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
+                        if isinstance(answer, str): # tool capabile return tools called as a list[dict]
+                            combination['answer'] = answer
+                            # combination['tool_calls'] = [] # no entry
+                            del answer
+                        elif isinstance(answer, dict): # calls
+                            combination['answer'] = answer['answer']
+                            combination['tool_calls'] = answer['tool_calls']
+                            del answer
+                        else: 
+                            raise Exception(f"runnable returd unkown type {type(answer)}.")
+                            
+                            
                        combination['test'] = test
                        run_results[hash_key] = combination
-                        print("\033[0;32mModel '\033[0m" +
+                        print("\r\033[0;32mModel '\033[0m" +
                              model +
                              "\033[0;32m'" +
                              (" " * (get_len(models) - len(model))) +
-                              " with seed \033[0m" +
+                              " with seed \033[0m\033[0;30m" +
+                              ("0" * (get_len(seeds) - len(str(seed)))) +
+                              "\033[0m" +
                              str(seed) +
-                              (" " * (get_len(seeds) - len(str(seed)))) +
                              "\033[0;32m finished test '\033[0m" +
                              test.name +
                              "\033[0;32m'" +
@@ -76,15 +127,16 @@ def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url:
                              "\033[0;32m)\033[0m"
                        )
                    except Exception as e:
-                          print("\033[0;31mError: <\033[0m " + str(e) + "\033[0;31m>\033[0m trying to continue...")
+                          print("\r\033[0;31mError: <\033[0m" + str(e) + "\033[0;31m> at (\033[0m" + hash_key + "\033[0;31m). Continuing...")
                else: 
-                    print("\033[0;34mModel '\033[0m" +
+                    print("\r\033[0;34mModel '\033[0m" +
                          model +
                          "\033[0;34m'" +
                          (" " * (get_len(models) - len(model))) +
-                          " with seed \033[0m" +
+                          " with seed \033[0m\033[0;30m" +
+                          ("0" * (get_len(seeds) - len(str(seed)))) +
+                          "\033[0m" +
                          str(seed) +
-                          (" " * (get_len(seeds) - len(str(seed)))) +
                          "\033[0;34m skipped  test '\033[0m" +
                          test.name +
                          "\033[0;34m'" +
@@ -100,25 +152,37 @@ def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url:
    for hash_key in run_results:
        result = run_results[hash_key]
        
-        entry = {
-            'test_name':  result['test_name'],
-            'model':      result['model'],
-            'seed':       result['seed'],
-            'answer':     result['answer'],
-            'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url)
-        }
+        try:
+            entry = {
+                'test_name':  result['test_name'],
+                'test_id':    result['test_id'],
+                'model':      result['model'],
+                'seed':       result['seed'],
+                'answer':     result['answer'],
+                'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url),
+            }
+        except Exception as e:
+            print("\033[0;31mError validating entry (\033[0m" + hash_key + "\033[0;31m). <\033[0m" + str(e) + "\033[0;31m> Continuing...\033[0m")
+            continue
+
+        try:
+            entry['tool_calls'] = result['tool_calls'] 
+        except:
+            pass
+            

        saved_results[hash_key] = entry # add result with validation to saved results

        print("\033[0;36mTest results of model '\033[0m" +
-              model +
+              entry['model'] +
              "\033[0;36m'" +
              (" " * (get_len(models) - len(entry['model']))) +
-              " with seed \033[0m" +
-              str(seed) +
-              (" " * (get_len(seeds) - len(str(entry['seed'])))) +
+              " with seed \033[0m\033[0;30m" +
+              ("0" * (get_len(seeds) - len(str(entry['seed'])))) +
+              "\033[0m" +
+              str(entry['seed']) +
              "\033[0;36m on test '\033[0m" +
-              test.name +
+              entry['test_name'] +
              "\033[0;36m'" +
              (" " * (get_len(tests) - len(entry['test_name']))) +
              " (\033[0m" +