Saving results (hash always different) = skipping doesn't work

This commit is contained in:
Lennart J. Kurzweg (Nx2)
2024-08-07 21:27:40 +02:00
parent abd6320ce9
commit a448b957ae
4 changed files with 87 additions and 28 deletions

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@
*/__pycache__/* */__pycache__/*
.direnv .direnv
.vscode .vscode
saved_results.json

View File

@@ -2,28 +2,69 @@ from libs.test_class import Test
from libs.validators import system_human_answer_match from libs.validators import system_human_answer_match
from libs.runnables import basic from libs.runnables import basic
import json
def padd(list, element):
    """Return ``str(element)`` left-justified to the widest item of ``list``.

    Every item of ``list`` is measured by the length of its ``str()`` form;
    ``element`` is padded on the right with spaces up to that width. An empty
    ``list`` yields a width of 0, so ``element`` is returned unpadded.
    """
    # NOTE: the parameter name shadows the builtin ``list``; it is kept because
    # it is part of the function's signature.
    width = max((len(str(item)) for item in list), default=0)
    return str(element).ljust(width)
def _result_key(test_name: str, model: str, seed: int) -> str:
    """Return a cache key for one (test, model, seed) combination that is stable across runs."""
    import hashlib  # local import so the block does not depend on a file-level import

    # The built-in hash() is salted per interpreter process for strings, so its
    # value differs on every run and can never match a key loaded from disk.
    # A SHA-256 digest of the canonical JSON form is deterministic.
    payload = json.dumps({'test_name': test_name, 'model': model, 'seed': seed}, sort_keys=True)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url: str):
    """Run every (model, seed, test) combination not yet cached, validate it, and persist the results.

    Previously stored results are loaded from ``./saved_results.json``. A
    combination whose key is already present there is skipped; every other
    combination is executed through ``test.runnable`` and then checked with
    ``test.validator``. New entries are merged into the stored results and
    the file is rewritten.

    Args:
        models: Model names to run each test with.
        seeds: Seeds to run each test with.
        tests: Tests to execute; each provides ``name``, ``runnable`` and ``validator``.
        base_url: Passed through unchanged to ``runnable`` and ``validator``.

    Returns:
        A dict mapping the combination key to an entry with the keys
        ``test_name``, ``model``, ``seed``, ``answer`` and ``validation``.
        It contains both previously saved and newly produced entries.
    """
    results_path = "./saved_results.json"

    # A missing file simply means nothing has been cached yet (first run).
    try:
        with open(results_path, "r", encoding="utf-8") as f:
            saved_results = json.load(fp=f)
    except FileNotFoundError:
        saved_results = {}

    # Get Results
    pending = []  # (key, test, partial entry) for every combination run in this call
    for model in models:
        for seed in seeds:
            for test in tests:
                combination = {
                    'test_name': test.name,
                    'model': model,
                    'seed': seed,
                }
                hash_key = _result_key(test_name=test.name, model=model, seed=seed)
                if hash_key in saved_results:
                    print(f"Skipped {combination}")
                    continue
                combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
                pending.append((hash_key, test, combination))
                print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with test '{test.name}'.")

    # Validate Results
    try:
        for hash_key, test, result in pending:
            entry = {
                'test_name': result['test_name'],
                'model': result['model'],
                'seed': result['seed'],
                'answer': result['answer'],
                'validation': test.validator(test=test, answer=result['answer'], base_url=base_url),
            }
            saved_results[hash_key] = entry  # add result with validation to saved results
            print(f"Validation of answer from test {entry['test_name']} by {entry['model']} with seed {entry['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if entry['validation'] == True else '\033[0;31mincorrect\033[0m'))
    finally:
        # Persist in a finally block so entries validated before a failing
        # validator call are not lost. UTF-8 is set explicitly because
        # ensure_ascii=False writes non-ASCII answers verbatim.
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(fp=f, obj=saved_results, indent=4, ensure_ascii=False)
        print("Dumped")

    return saved_results

View File

@@ -1,18 +1,20 @@
from langchain_ollama.chat_models import ChatOllama from langchain_ollama.chat_models import ChatOllama
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
from langchain.tools import Tool from langchain.tools import tool
from libs.test_class import Test from libs.test_class import Test
def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool: def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
def rate(rating: bool) -> None: @tool
def rate(rating: bool) -> bool:
"""Rate answer as correct (True) or as incorrect (False).""" """Rate answer as correct (True) or as incorrect (False)."""
return rating
prompt = ChatPromptTemplate.from_messages([ prompt = ChatPromptTemplate.from_messages([
SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is
{validation_input} {validation_input}
else as incorrect. Only use the rate tool. Do not answer conversationally."""), else as incorrect. Only use the rate tool. Do not answer conversationally."""),
# SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if # SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if
# {validation_input} # {validation_input}
@@ -24,14 +26,14 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
HumanMessagePromptTemplate.from_template(template="""System Message: HumanMessagePromptTemplate.from_template(template="""System Message:
{system_msg} {system_msg}
Query: Human query:
{human_msg} {human_msg}
Answer: LLM answer:
{answer} {answer}
""") """)
]).invoke({ ]).invoke({
"validation_input": test.validation_input, "validation_input": test.validation_input['criteria'],
"system_msg": test.runnable_input['system_msg'], "system_msg": test.runnable_input['system_msg'],
"human_msg": test.runnable_input['human_msg'], "human_msg": test.runnable_input['human_msg'],
"answer": answer "answer": answer
@@ -46,7 +48,10 @@ Answer:
ai_msg = llm.invoke(prompt) ai_msg = llm.invoke(prompt)
try: try:
return ai_msg.tool_calls[0]['args']['rating'] ret_str = rate.invoke(ai_msg.tool_calls[0]).content
if ret_str.lower() == 'true': return True
elif ret_str.lower() == 'false': return False
else: raise Exception(f"rate tool retured {ret_str}")
except IndexError as e: except IndexError as e:
print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...") print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
return system_human_answer_match(test=test, answer=answer) return system_human_answer_match(test=test, answer=answer)

View File

@@ -19,9 +19,9 @@ def main():
# "tinyllama:1.1b", # "tinyllama:1.1b",
] ]
seeds = [ seeds = [
# 2, 2,
222, # 222,
22222, # 22222,
# 2222222 # 2222222
] ]
tests = [ tests = [
@@ -34,9 +34,13 @@ def main():
}, },
validator=system_human_answer_match, validator=system_human_answer_match,
validation_input={ validation_input={
"criteria": """- in Mandarin Chinese "criteria": """- in Mandarin Chinese from front to finnish
- factually correct - factually correct
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""", - about healthy vegetables
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
Again, the message has to be entirely in Manadarin Chineese.
That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct""",
} }
), ),
Test( Test(
@@ -53,6 +57,7 @@ def main():
validator=regex_match_any, validator=regex_match_any,
validation_input={ validation_input={
"patterns": ["33549659245", "33,549,659,245", "33.549.659.245"] "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
# "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
} }
), ),
] ]
@@ -65,7 +70,14 @@ def main():
) )
print() print()
for result in results: print(f"\n\033[0;36mtest_name:\033[0m {result['test'].name}\n\033[0;36mmodel:\033[0m {result['model']}\n\033[0;36mseed:\033[0m {result['seed']}\n\033[0;36mvalidation_result:\033[0m {result['validation']}\n\033[0;36manswer: ⏎\033[0m\n{result['result']}") for hash_key in results:
result = results[hash_key]
print(f"""
\033[0;36mtest_name:\033[0m {result['test_name']}
\033[0;36mmodel:\033[0m {result['model']}
\033[0;36mseed:\033[0m {result['seed']}
\033[0;36mvalidation_result:\033[0m {result['validation']}
\033[0;36manswer: »\033[0m{result['answer']}\033[0;36m«\033[0m""")
if __name__ == "__main__": if __name__ == "__main__":
main() main()