From a448b957aeecdb2815aab8f7fd9dfa51c3527dbb Mon Sep 17 00:00:00 2001
From: "Lennart J. Kurzweg (Nx2)" <git@nx2.site>
Date: Wed, 7 Aug 2024 21:27:40 +0200
Subject: [PATCH] Save results under a stable hash key so finished runs are skipped

---
 .gitignore         |  1 +
 libs/run_tests.py  | 63 ++++++++++++++++++++++++++++++++++++++--------
 libs/validators.py | 25 ++++++++++--------
 test_small_llms.py | 26 +++++++++++++------
 4 files changed, 87 insertions(+), 28 deletions(-)

diff --git a/.gitignore b/.gitignore
index 33563d3..a4b6044 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 */__pycache__/*
 .direnv
 .vscode
+saved_results.json
diff --git a/libs/run_tests.py b/libs/run_tests.py
index fd9390f..da1389a 100644
--- a/libs/run_tests.py
+++ b/libs/run_tests.py
@@ -2,28 +2,69 @@ from libs.test_class import Test
 from libs.validators import system_human_answer_match
 from libs.runnables import basic
 
+import hashlib
+import json
+
 def padd(list, element):
     longest = 0
     for s in list:
         longest = max(longest, len(str(s))) 
     return str(element).ljust(longest)
 
+
 def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url: str):
-    results = []
-    esc = "\033"
+
+    try:
+        with open("./saved_results.json", "r") as f:
+            saved_results = json.load(fp=f)
+    except FileNotFoundError:
+        saved_results = {}  # first run: nothing has been saved yet
+
+
+    # Get Results
+    run_results = {}
     for model in models:
         for seed in seeds:
             for test in tests:
-                try: 
-                    result = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
-                    results.append({"test": test,"model": model, "seed": seed, "result": result})
+
+                # Init dict
+                combination = {
+                    'test_name': test.name,
+                    'model': model,
+                    'seed': seed,
+                }
+                # hash() salts str hashes per process; a SHA-256 digest is stable across runs
+                hash_key = hashlib.sha256(json.dumps(combination, sort_keys=True).encode("utf-8")).hexdigest()
+                if hash_key not in saved_results:
+                    # try:
+                    combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
+                    combination['test'] = test
+                    run_results[hash_key] = combination
                     print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with test '{test.name}'.")
-                except Exception as e:
-                    print("\033[0;31mError:\033[0m " + str(e))
+                    # except Exception as e:
+                        # print("\033[0;31mError:\033[0m " + str(e))
+                else:
+                    print(f"Skipped {combination}")
 
-    for result in results:
-        result['validation'] = result['test'].validator(test=result['test'], answer=result['result'], base_url=base_url)
 
-        print(f"Validation of answer from test {result['test'].name} by {result['model']} with seed {result['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if result['validation'] == True else '\033[0;31mincorrect\033[0m'))
+    # Validate Results
+    for hash_key in run_results:
+        result = run_results[hash_key]
+
+        entry = {
+            'test_name':  result['test_name'],
+            'model':      result['model'],
+            'seed':       result['seed'],
+            'answer':     result['answer'],
+            'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url)
+        }
 
-    return results
+        saved_results[hash_key] = entry # add result with validation to saved results
+
+        print(f"Validation of answer from test {entry['test_name']} by {entry['model']} with seed {entry['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if entry['validation'] == True else '\033[0;31mincorrect\033[0m'))
+
+    with open("./saved_results.json", "w") as f:
+        json.dump(fp=f, obj=saved_results, indent=4, ensure_ascii=False)
+        print("Dumped")
+
+    return saved_results
diff --git a/libs/validators.py b/libs/validators.py
index 3b45fd2..3c8fb83 100644
--- a/libs/validators.py
+++ b/libs/validators.py
@@ -1,18 +1,20 @@
 from langchain_ollama.chat_models import ChatOllama
 from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
-from langchain.tools import Tool
+from langchain.tools import tool
 from libs.test_class import Test
 
 def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
 
-    def rate(rating: bool) -> None:
+    @tool
+    def rate(rating: bool) -> bool:
         """Rate answer as correct (True) or as incorrect (False)."""
+        return rating
 
     prompt = ChatPromptTemplate.from_messages([
-        SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is  
-                                                  {validation_input}
-                                                  
-                                                  else as incorrect. Only use the rate tool. Do not answer conversationally."""),
+        SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is  
+{validation_input}
+
+else as incorrect. Only use the rate tool. Do not answer conversationally."""),
 #         SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if 
 # {validation_input}
 
@@ -24,14 +26,14 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
         HumanMessagePromptTemplate.from_template(template="""System Message:
 {system_msg}
 
-Query:
+Human query:
 {human_msg}
 
-Answer:
+LLM answer:
 {answer}
 """)
     ]).invoke({
-        "validation_input": test.validation_input,
+        "validation_input": test.validation_input['criteria'],
         "system_msg": test.runnable_input['system_msg'],
         "human_msg": test.runnable_input['human_msg'],
         "answer": answer
@@ -46,7 +48,10 @@ Answer:
     ai_msg = llm.invoke(prompt)
 
     try:
-        return ai_msg.tool_calls[0]['args']['rating']
+        ret_str = rate.invoke(ai_msg.tool_calls[0]).content
+        if ret_str.lower() == 'true': return True
+        elif ret_str.lower() == 'false': return False
+        else: raise Exception(f"rate tool retured {ret_str}")
     except IndexError as e:
         print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
-        return system_human_answer_match(test=test, answer=answer)
+        return system_human_answer_match(test=test, answer=answer, base_url=base_url)
diff --git a/test_small_llms.py b/test_small_llms.py
index 542b15d..060f501 100644
--- a/test_small_llms.py
+++ b/test_small_llms.py
@@ -19,9 +19,9 @@ def main():
         # "tinyllama:1.1b",
     ]
     seeds = [
-        # 2,
-        222,
-        22222,
+        2,
+        # 222,
+        # 22222,
         # 2222222
     ]
     tests = [
@@ -34,9 +34,13 @@ def main():
     	    },
             validator=system_human_answer_match,
     	    validation_input={
-        	    "criteria": """- in Mandarin Chinese
-                            - factually correct
-                            - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""",
+        	    "criteria": """- in Mandarin Chinese from front to finnish
+- factually correct
+- about healthy vegetables
+- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
+
+Again, the message has to be entirely in Manadarin Chineese.
+That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct""",
     	    }
         ),
         Test(
@@ -53,6 +57,7 @@ def main():
             validator=regex_match_any,
     	    validation_input={
     	        "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
+    	        # "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
     	    }
         ),
     ]
@@ -65,7 +70,14 @@ def main():
     )
 
     print()
-    for result in results: print(f"\n\033[0;36mtest_name:\033[0m {result['test'].name}\n\033[0;36mmodel:\033[0m {result['model']}\n\033[0;36mseed:\033[0m {result['seed']}\n\033[0;36mvalidation_result:\033[0m {result['validation']}\n\033[0;36manswer: ⏎\033[0m\n{result['result']}")
+    for hash_key in results:
+        result = results[hash_key]
+        print(f"""
+\033[0;36mtest_name:\033[0m {result['test_name']}
+\033[0;36mmodel:\033[0m {result['model']}
+\033[0;36mseed:\033[0m {result['seed']}
+\033[0;36mvalidation_result:\033[0m {result['validation']}
+\033[0;36manswer: »\033[0m{result['answer']}\033[0;36m«\033[0m""")
 
 if __name__ == "__main__":
     main()