From a448b957aeecdb2815aab8f7fd9dfa51c3527dbb Mon Sep 17 00:00:00 2001 From: "Lennart J. Kurzweg (Nx2)" Date: Wed, 7 Aug 2024 21:27:40 +0200 Subject: [PATCH] Saving results (hash always different) = skipping doesn't work --- .gitignore | 1 + libs/run_tests.py | 63 ++++++++++++++++++++++++++++++++++++++-------- libs/validators.py | 25 ++++++++++-------- test_small_llms.py | 26 +++++++++++++------ 4 files changed, 87 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 33563d3..a4b6044 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ */__pycache__/* .direnv .vscode +saved_results.json diff --git a/libs/run_tests.py b/libs/run_tests.py index fd9390f..da1389a 100644 --- a/libs/run_tests.py +++ b/libs/run_tests.py @@ -2,28 +2,69 @@ from libs.test_class import Test from libs.validators import system_human_answer_match from libs.runnables import basic +import json + def padd(list, element): longest = 0 for s in list: longest = max(longest, len(str(s))) return str(element).ljust(longest) + + def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url: str): - results = [] - esc = "\033" + + # try: + with open("./saved_results.json", "r") as f: + saved_results = json.load(fp=f) + # except: + # saved_results = {} + + + # Get Results + run_results = {} for model in models: for seed in seeds: for test in tests: - try: - result = test.runnable(model=model, seed=seed, test=test, base_url=base_url) - results.append({"test": test,"model": model, "seed": seed, "result": result}) + + # Init dict + combination = { + 'test_name': test.name, + 'model': model, + 'seed': seed, + } + hash_key = str(hash(json.dumps(combination, sort_keys=True))) + + if hash_key not in saved_results.keys(): + # try: + combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url) + combination['test'] = test + run_results[hash_key] = combination print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with 
test '{test.name}'.") - except Exception as e: - print("\033[0;31mError:\033[0m " + str(e)) + # except Exception as e: + # print("\033[0;31mError:\033[0m " + str(e)) + else: + print(f"Skipped {combination}") - for result in results: - result['validation'] = result['test'].validator(test=result['test'], answer=result['result'], base_url=base_url) - print(f"Validation of answer from test {result['test'].name} by {result['model']} with seed {result['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if result['validation'] == True else '\033[0;31mincorrect\033[0m')) + # Validate Results + for hash_key in run_results: + result = run_results[hash_key] + + entry = { + 'test_name': result['test_name'], + 'model': result['model'], + 'seed': result['seed'], + 'answer': result['answer'], + 'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url) + } - return results + saved_results[hash_key] = entry # add result with validation to saved results + + print(f"Validation of answer from test {entry['test_name']} by {entry['model']} with seed {entry['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if entry['validation'] == True else '\033[0;31mincorrect\033[0m')) + + with open("./saved_results.json", "w") as f: + json.dump(fp=f, obj=saved_results, indent=4, ensure_ascii=False) + print("Dumped") + + return saved_results diff --git a/libs/validators.py b/libs/validators.py index 3b45fd2..3c8fb83 100644 --- a/libs/validators.py +++ b/libs/validators.py @@ -1,18 +1,20 @@ from langchain_ollama.chat_models import ChatOllama from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate -from langchain.tools import Tool +from langchain.tools import tool from libs.test_class import Test def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool: - def rate(rating: bool) -> None: + @tool + def rate(rating: bool) -> bool: """Rate answer as correct (True) or as incorrect 
(False).""" + return rating prompt = ChatPromptTemplate.from_messages([ - SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is - {validation_input} - - else as incorrect. Only use the rate tool. Do not answer conversationally."""), + SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is +{validation_input} + +else as incorrect. Only use the rate tool. Do not answer conversationally."""), # SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if # {validation_input} @@ -24,14 +26,14 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool: HumanMessagePromptTemplate.from_template(template="""System Message: {system_msg} -Query: +Human query: {human_msg} -Answer: +LLM answer: {answer} """) ]).invoke({ - "validation_input": test.validation_input, + "validation_input": test.validation_input['criteria'], "system_msg": test.runnable_input['system_msg'], "human_msg": test.runnable_input['human_msg'], "answer": answer @@ -46,7 +48,10 @@ Answer: ai_msg = llm.invoke(prompt) try: - return ai_msg.tool_calls[0]['args']['rating'] + ret_str = rate.invoke(ai_msg.tool_calls[0]).content + if ret_str.lower() == 'true': return True + elif ret_str.lower() == 'false': return False + else: raise Exception(f"rate tool retured {ret_str}") except IndexError as e: print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...") return system_human_answer_match(test=test, answer=answer) diff --git a/test_small_llms.py b/test_small_llms.py index 542b15d..060f501 100644 --- a/test_small_llms.py +++ b/test_small_llms.py @@ -19,9 +19,9 @@ def main(): # "tinyllama:1.1b", ] seeds = [ - # 2, - 222, - 22222, + 2, + # 222, + # 22222, # 2222222 ] tests = [ @@ -34,9 +34,13 @@ def main(): }, 
validator=system_human_answer_match, validation_input={ - "criteria": """- in Mandarin Chinese - - factually correct - - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""", + "criteria": """- in Mandarin Chinese from front to finnish +- factually correct +- about healthy vegetables +- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes) + +Again, the message has to be entirely in Manadarin Chineese. +That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct""", } ), Test( @@ -53,6 +57,7 @@ def main(): validator=regex_match_any, validation_input={ "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"] + # "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245 } ), ] @@ -65,7 +70,14 @@ def main(): ) print() - for result in results: print(f"\n\033[0;36mtest_name:\033[0m {result['test'].name}\n\033[0;36mmodel:\033[0m {result['model']}\n\033[0;36mseed:\033[0m {result['seed']}\n\033[0;36mvalidation_result:\033[0m {result['validation']}\n\033[0;36manswer: ⏎\033[0m\n{result['result']}") + for hash_key in results: + result = results[hash_key] + print(f""" +\033[0;36mtest_name:\033[0m {result['test_name']} +\033[0;36mmodel:\033[0m {result['model']} +\033[0;36mseed:\033[0m {result['seed']} +\033[0;36mvalidation_result:\033[0m {result['validation']} +\033[0;36manswer: »\033[0m{result['answer']}\033[0;36m«\033[0m""") if __name__ == "__main__": main()