Saving results (hash always different) = skipping doesn't work

This commit is contained in:
Lennart J. Kurzweg (Nx2)
2024-08-07 21:27:40 +02:00
parent abd6320ce9
commit a448b957ae
4 changed files with 87 additions and 28 deletions

View File

@@ -2,28 +2,69 @@ from libs.test_class import Test
from libs.validators import system_human_answer_match
from libs.runnables import basic
import json
def padd(list, element):
    """Left-justify ``element`` to the width of the widest entry in ``list``.

    Args:
        list: Values whose string forms determine the column width. The name
            shadows the builtin but is kept so keyword callers keep working.
        element: Value to render with ``str`` and pad.

    Returns:
        ``str(element)`` padded on the right with spaces up to the length of
        the longest ``str(item)`` in ``list``. The text is never truncated,
        and it is returned unpadded when ``list`` is empty.
    """
    # default=0 mirrors the original starting width for an empty collection.
    width = max((len(str(item)) for item in list), default=0)
    return str(element).ljust(width)
def _combination_key(combination: dict) -> str:
    """Return a key for ``combination`` that is identical across interpreter runs.

    The builtin ``hash()`` is salted per process for ``str`` values, so it
    cannot be used as a persistent cache key. A SHA-256 digest of the
    canonical (sorted-key) JSON form is stable instead.
    """
    import hashlib

    payload = json.dumps(combination, sort_keys=True).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()


def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url: str):
    """Run every model/seed/test combination that has no saved result yet.

    Previously saved results are read from ``./saved_results.json``. Each
    combination whose key is already present there is skipped; the others are
    executed through ``test.runnable``, validated through ``test.validator``
    and merged into the saved results, which are then written back to the
    same file.

    Args:
        models: Model names passed to each test's ``runnable``.
        seeds: Seeds passed to each test's ``runnable``.
        tests: Tests to execute; each must provide ``name``, ``runnable`` and
            ``validator``.
        base_url: Forwarded unchanged to ``runnable`` and ``validator``.

    Returns:
        The saved-results dict (key -> entry with ``test_name``, ``model``,
        ``seed``, ``answer`` and ``validation``), including entries loaded
        from disk.

    Raises:
        Whatever ``runnable`` or ``validator`` raise; such errors are not
        caught here, so nothing is written to disk in that case.
    """
    results_path = "./saved_results.json"

    # A missing file simply means nothing has been saved yet (first run).
    # A corrupt file is deliberately NOT swallowed, so it is not silently
    # overwritten below.
    try:
        with open(results_path, "r", encoding="utf-8") as f:
            saved_results = json.load(fp=f)
    except FileNotFoundError:
        saved_results = {}

    # Get results
    run_results = {}
    for model in models:
        for seed in seeds:
            for test in tests:
                combination = {
                    'test_name': test.name,
                    'model': model,
                    'seed': seed,
                }
                # Key is computed before 'test' is attached: the Test object
                # is not JSON serializable and must not influence the key.
                key = _combination_key(combination)
                if key in saved_results:
                    print(f"Skipped {combination}")
                    continue
                combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
                combination['test'] = test
                run_results[key] = combination
                print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with test '{test.name}'.")

    # Validate results
    for key, result in run_results.items():
        test = result['test']
        entry = {
            'test_name': result['test_name'],
            'model': result['model'],
            'seed': result['seed'],
            'answer': result['answer'],
            'validation': test.validator(test=test, answer=result['answer'], base_url=base_url),
        }
        saved_results[key] = entry  # add result with validation to saved results
        verdict = '\033[0;32mcorrect\033[0m' if entry['validation'] else '\033[0;31mincorrect\033[0m'
        print(f"Validation of answer from test {entry['test_name']} by {entry['model']} with seed {entry['seed']} evaluated to " + verdict)

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # pinned instead of relying on the platform default.
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(fp=f, obj=saved_results, indent=4, ensure_ascii=False)
    print("Dumped")

    return saved_results

View File

@@ -1,18 +1,20 @@
from langchain_ollama.chat_models import ChatOllama
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
from langchain.tools import Tool
from langchain.tools import tool
from libs.test_class import Test
def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
def rate(rating: bool) -> None:
@tool
def rate(rating: bool) -> bool:
"""Rate answer as correct (True) or as incorrect (False)."""
return rating
prompt = ChatPromptTemplate.from_messages([
SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is
{validation_input}
else as incorrect. Only use the rate tool. Do not answer conversationally."""),
SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is
{validation_input}
else as incorrect. Only use the rate tool. Do not answer conversationally."""),
# SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if
# {validation_input}
@@ -24,14 +26,14 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
HumanMessagePromptTemplate.from_template(template="""System Message:
{system_msg}
Query:
Human query:
{human_msg}
Answer:
LLM answer:
{answer}
""")
]).invoke({
"validation_input": test.validation_input,
"validation_input": test.validation_input['criteria'],
"system_msg": test.runnable_input['system_msg'],
"human_msg": test.runnable_input['human_msg'],
"answer": answer
@@ -46,7 +48,10 @@ Answer:
ai_msg = llm.invoke(prompt)
try:
return ai_msg.tool_calls[0]['args']['rating']
ret_str = rate.invoke(ai_msg.tool_calls[0]).content
if ret_str.lower() == 'true': return True
elif ret_str.lower() == 'false': return False
else: raise Exception(f"rate tool retured {ret_str}")
except IndexError as e:
print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
return system_human_answer_match(test=test, answer=answer)