Saving results (hash is always different, so skipping doesn't work)

2024-08-07 21:27:40 +02:00
parent abd6320ce9
commit a448b957ae
4 changed files with 87 additions and 28 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 */__pycache__/*
 .direnv
 .vscode
 saved_results.json
--- a/libs/run_tests.py
+++ b/libs/run_tests.py
@@ -2,28 +2,69 @@ from libs.test_class import Test
 from libs.validators import system_human_answer_match
 from libs.runnables import basic
 import json
 def padd(list, element):
    longest = 0
    for s in list:
        longest = max(longest, len(str(s))) 
    return str(element).ljust(longest)
 def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url: str):
-    results = []
+
-    esc = "\033"
+    # try:
    with open("./saved_results.json", "r") as f:
        saved_results = json.load(fp=f)
    # except: 
        # saved_results = {}
    # Get Results
    run_results = {}
    for model in models:
        for seed in seeds:
            for test in tests:
-                try: 
+
-                    result = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
+                # Init dict
-                    results.append({"test": test,"model": model, "seed": seed, "result": result})
+                combination = { 
                    'test_name': test.name,
                    'model': model,
                    'seed': seed,
                }
                hash_key = str(hash(json.dumps(combination, sort_keys=True)))
                if hash_key not in saved_results.keys():
                    # try: 
                    combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
                    combination['test'] = test
                    run_results[hash_key] = combination
                    print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with test '{test.name}'.")
-                except Exception as e:
+                    # except Exception as e:
-                    print("\033[0;31mError:\033[0m " + str(e))
+                        # print("\033[0;31mError:\033[0m " + str(e))
                else: 
                    print(f"Skipped {combination}")
    for result in results:
        result['validation'] = result['test'].validator(test=result['test'], answer=result['result'], base_url=base_url)
-        print(f"Validation of answer from test {result['test'].name} by {result['model']} with seed {result['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if result['validation'] == True else '\033[0;31mincorrect\033[0m'))
+    # Validate Results
    for hash_key in run_results:
        result = run_results[hash_key]
-    return results
+        entry = {
            'test_name':  result['test_name'],
            'model':      result['model'],
            'seed':       result['seed'],
            'answer':     result['answer'],
            'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url)
        }
        saved_results[hash_key] = entry # add result with validation to saved results
        print(f"Validation of answer from test {entry['test_name']} by {entry['model']} with seed {entry['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if entry['validation'] == True else '\033[0;31mincorrect\033[0m'))
    with open("./saved_results.json", "w") as f:
        json.dump(fp=f, obj=saved_results, indent=4, ensure_ascii=False)
        print("Dumped")
    return saved_results
--- a/libs/validators.py
+++ b/libs/validators.py
@@ -1,18 +1,20 @@
 from langchain_ollama.chat_models import ChatOllama
 from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
-from langchain.tools import Tool
+from langchain.tools import tool
 from libs.test_class import Test
 def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
-    def rate(rating: bool) -> None:
+    @tool
    def rate(rating: bool) -> bool:
        """Rate answer as correct (True) or as incorrect (False)."""
        return rating
    prompt = ChatPromptTemplate.from_messages([
-        SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is  
+        SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is  
-                                                  {validation_input}
+{validation_input}
-                                                  else as incorrect. Only use the rate tool. Do not answer conversationally."""),
+else as incorrect. Only use the rate tool. Do not answer conversationally."""),
 #         SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if 
 # {validation_input}
@@ -24,14 +26,14 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
        HumanMessagePromptTemplate.from_template(template="""System Message:
 {system_msg}
-Query:
+Human query:
 {human_msg}
-Answer:
+LLM answer:
 {answer}
 """)
    ]).invoke({
-        "validation_input": test.validation_input,
+        "validation_input": test.validation_input['criteria'],
        "system_msg": test.runnable_input['system_msg'],
        "human_msg": test.runnable_input['human_msg'],
        "answer": answer
@@ -46,7 +48,10 @@ Answer:
    ai_msg = llm.invoke(prompt)
    try:
-        return ai_msg.tool_calls[0]['args']['rating']
+        ret_str = rate.invoke(ai_msg.tool_calls[0]).content
        if ret_str.lower() == 'true': return True
        elif ret_str.lower() == 'false': return False
        else: raise Exception(f"rate tool retured {ret_str}")
    except IndexError as e:
        print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
        return system_human_answer_match(test=test, answer=answer)
--- a/test_small_llms.py
+++ b/test_small_llms.py
@@ -19,9 +19,9 @@ def main():
        # "tinyllama:1.1b",
    ]
    seeds = [
-        # 2,
+        2,
-        222,
+        # 222,
-        22222,
+        # 22222,
        # 2222222
    ]
    tests = [
@@ -34,9 +34,13 @@ def main():
    	    },
            validator=system_human_answer_match,
    	    validation_input={
-        	    "criteria": """- in Mandarin Chinese
+        	    "criteria": """- in Mandarin Chinese from front to finnish
-                            - factually correct
+- factually correct
-                            - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""",
+- about healthy vegetables
 - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
 Again, the message has to be entirely in Manadarin Chineese.
 That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct""",
    	    }
        ),
        Test(
@@ -53,6 +57,7 @@ def main():
            validator=regex_match_any,
    	    validation_input={
    	        "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
    	        # "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
    	    }
        ),
    ]
@@ -65,7 +70,14 @@ def main():
    )
    print()
-    for result in results: print(f"\n\033[0;36mtest_name:\033[0m {result['test'].name}\n\033[0;36mmodel:\033[0m {result['model']}\n\033[0;36mseed:\033[0m {result['seed']}\n\033[0;36mvalidation_result:\033[0m {result['validation']}\n\033[0;36manswer: ⏎\033[0m\n{result['result']}")
+    for hash_key in results:
        result = results[hash_key]
        print(f"""
 \033[0;36mtest_name:\033[0m {result['test_name']}
 \033[0;36mmodel:\033[0m {result['model']}
 \033[0;36mseed:\033[0m {result['seed']}
 \033[0;36mvalidation_result:\033[0m {result['validation']}
 \033[0;36manswer: »\033[0m{result['answer']}\033[0;36m«\033[0m""")
 if __name__ == "__main__":
    main()