Saving results (hash always different) = skipping doesn't work

This commit is contained in:
Lennart J. Kurzweg (Nx2)
2024-08-07 21:27:40 +02:00
parent abd6320ce9
commit a448b957ae
4 changed files with 87 additions and 28 deletions

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@
*/__pycache__/*
.direnv
.vscode
saved_results.json

View File

@@ -2,28 +2,69 @@ from libs.test_class import Test
from libs.validators import system_human_answer_match
from libs.runnables import basic
import json
def padd(list, element):
    """Return ``str(element)`` left-justified to the widest item of ``list``.

    Used to align console output: every value taken from the same collection
    is padded with trailing spaces to the length of the longest string
    representation in that collection.

    Args:
        list: Iterable of values whose ``str()`` lengths determine the width.
            (The name shadows the builtin inside this function; it is kept
            unchanged so existing callers continue to work.)
        element: The value to render and pad.

    Returns:
        ``str(element)`` padded on the right with spaces. If ``element`` is
        already wider than every item, or ``list`` is empty, it is returned
        unpadded.
    """
    # default=0 keeps the empty-collection case working (no padding at all).
    longest = max((len(str(item)) for item in list), default=0)
    return str(element).ljust(longest)
def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url: str):
    """Run every not-yet-cached (model, seed, test) combination and validate it.

    Results are cached in ``./saved_results.json``. Each combination is keyed
    by the SHA-256 digest of its canonical JSON form (test name, model, seed),
    so a combination that was answered in an earlier run is skipped.

    Args:
        models: Model identifiers passed through to each test's runnable.
        seeds: Seeds passed through to each test's runnable.
        tests: Tests to execute; each provides ``name``, ``runnable`` and
            ``validator``.
        base_url: Forwarded unchanged to the runnable and the validator.

    Returns:
        The complete cache as a dict mapping key -> entry, where every entry
        written by this function has the keys ``test_name``, ``model``,
        ``seed``, ``answer`` and ``validation``. Entries loaded from the file
        are returned as stored.
    """
    # Local import so this function stays self-contained.
    import hashlib

    results_path = "./saved_results.json"

    # A missing cache file simply means nothing has been run yet.
    try:
        with open(results_path, "r", encoding="utf-8") as f:
            saved_results = json.load(fp=f)
    except FileNotFoundError:
        saved_results = {}

    # Get results
    run_results = {}
    for model in models:
        for seed in seeds:
            for test in tests:
                combination = {
                    'test_name': test.name,
                    'model': model,
                    'seed': seed,
                }
                # The builtin hash() of a str is salted per interpreter
                # process, so it can never match a key stored by a previous
                # run. A content digest is stable across runs.
                canonical = json.dumps(combination, sort_keys=True)
                hash_key = hashlib.sha256(canonical.encode("utf-8")).hexdigest()

                if hash_key in saved_results:
                    print(f"Skipped {combination}")
                    continue

                combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
                # Keep the Test object around for validation only; it is not
                # JSON-serialisable and is never written to the cache.
                combination['test'] = test
                run_results[hash_key] = combination
                print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with test '{test.name}'.")

    # Validate results
    for hash_key, result in run_results.items():
        entry = {
            'test_name': result['test_name'],
            'model': result['model'],
            'seed': result['seed'],
            'answer': result['answer'],
            'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url)
        }
        saved_results[hash_key] = entry  # add result with validation to saved results
        print(f"Validation of answer from test {entry['test_name']} by {entry['model']} with seed {entry['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if entry['validation'] == True else '\033[0;31mincorrect\033[0m'))

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be explicit rather than the platform default.
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(fp=f, obj=saved_results, indent=4, ensure_ascii=False)
    print("Dumped")

    return saved_results

View File

@@ -1,15 +1,17 @@
from langchain_ollama.chat_models import ChatOllama
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
from langchain.tools import Tool
from langchain.tools import tool
from libs.test_class import Test
def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
def rate(rating: bool) -> None:
@tool
def rate(rating: bool) -> bool:
"""Rate answer as correct (True) or as incorrect (False)."""
return rating
prompt = ChatPromptTemplate.from_messages([
SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is
SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is
{validation_input}
else as incorrect. Only use the rate tool. Do not answer conversationally."""),
@@ -24,14 +26,14 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
HumanMessagePromptTemplate.from_template(template="""System Message:
{system_msg}
Query:
Human query:
{human_msg}
Answer:
LLM answer:
{answer}
""")
]).invoke({
"validation_input": test.validation_input,
"validation_input": test.validation_input['criteria'],
"system_msg": test.runnable_input['system_msg'],
"human_msg": test.runnable_input['human_msg'],
"answer": answer
@@ -46,7 +48,10 @@ Answer:
ai_msg = llm.invoke(prompt)
try:
return ai_msg.tool_calls[0]['args']['rating']
ret_str = rate.invoke(ai_msg.tool_calls[0]).content
if ret_str.lower() == 'true': return True
elif ret_str.lower() == 'false': return False
else: raise Exception(f"rate tool retured {ret_str}")
except IndexError as e:
print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
return system_human_answer_match(test=test, answer=answer)

View File

@@ -19,9 +19,9 @@ def main():
# "tinyllama:1.1b",
]
seeds = [
# 2,
222,
22222,
2,
# 222,
# 22222,
# 2222222
]
tests = [
@@ -34,9 +34,13 @@ def main():
},
validator=system_human_answer_match,
validation_input={
"criteria": """- in Mandarin Chinese
"criteria": """- in Mandarin Chinese from front to finnish
- factually correct
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""",
- about healthy vegetables
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
Again, the message has to be entirely in Manadarin Chineese.
That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct""",
}
),
Test(
@@ -53,6 +57,7 @@ def main():
validator=regex_match_any,
validation_input={
"patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
# "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
}
),
]
@@ -65,7 +70,14 @@ def main():
)
print()
for result in results: print(f"\n\033[0;36mtest_name:\033[0m {result['test'].name}\n\033[0;36mmodel:\033[0m {result['model']}\n\033[0;36mseed:\033[0m {result['seed']}\n\033[0;36mvalidation_result:\033[0m {result['validation']}\n\033[0;36manswer: ⏎\033[0m\n{result['result']}")
for hash_key in results:
result = results[hash_key]
print(f"""
\033[0;36mtest_name:\033[0m {result['test_name']}
\033[0;36mmodel:\033[0m {result['model']}
\033[0;36mseed:\033[0m {result['seed']}
\033[0;36mvalidation_result:\033[0m {result['validation']}
\033[0;36manswer: »\033[0m{result['answer']}\033[0;36m«\033[0m""")
if __name__ == "__main__":
main()