Saving results (hash is always different, so skipping doesn't work)
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@
|
||||
*/__pycache__/*
|
||||
.direnv
|
||||
.vscode
|
||||
saved_results.json
|
||||
|
||||
@@ -2,28 +2,69 @@ from libs.test_class import Test
|
||||
from libs.validators import system_human_answer_match
|
||||
from libs.runnables import basic
|
||||
|
||||
import json
|
||||
|
||||
def padd(list, element):
    """Return ``str(element)`` left-justified to the widest item of *list*.

    Used to line up console output: every value taken from the same
    collection is padded with trailing spaces to a common width.

    Args:
        list: Iterable of values whose string forms define the column width.
            (The name shadows the builtin; it is kept so existing callers
            that pass it by keyword keep working.)
        element: The value to render and pad.

    Returns:
        ``str(element)`` padded on the right with spaces. If *list* is empty
        or *element* is already wider than every item, the string is returned
        unpadded.
    """
    # default=0 keeps an empty collection from raising ValueError in max().
    longest = max((len(str(item)) for item in list), default=0)
    return str(element).ljust(longest)
|
||||
|
||||
|
||||
|
||||
def _combination_key(combination: dict) -> str:
    """Return a cache key for *combination* that is stable across runs.

    The builtin ``hash()`` is salted per interpreter process for ``str``
    objects, so a key derived from it never matches a key stored by an
    earlier run. A SHA-256 digest of the canonical JSON form does.
    """
    import hashlib  # local import keeps this block self-contained

    canonical = json.dumps(combination, sort_keys=True)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()


def run_tests(models: list[str], seeds: list[int], tests: list["Test"], base_url: str):
    """Run every model/seed/test combination that has no saved result yet.

    Previously stored results are read from ``./saved_results.json``. Each
    combination is identified by a digest of its test name, model and seed;
    combinations already present in the saved results are skipped. New
    answers are validated with the test's validator and written back to
    ``./saved_results.json``.

    Args:
        models: Model names passed to each test's ``runnable``.
        seeds: Seeds passed to each test's ``runnable``.
        tests: Tests to execute; each provides ``name``, ``runnable`` and
            ``validator``.
        base_url: Passed through to ``runnable`` and ``validator``.

    Returns:
        A dict mapping combination key to an entry with the keys
        ``test_name``, ``model``, ``seed``, ``answer`` and ``validation``,
        containing both the previously saved and the newly produced results.
    """
    results_path = "./saved_results.json"

    # A first run has no results file yet; start with an empty store.
    try:
        with open(results_path, "r", encoding="utf-8") as f:
            saved_results = json.load(fp=f)
    except FileNotFoundError:
        saved_results = {}

    # Get Results
    run_results = {}
    for model in models:
        for seed in seeds:
            for test in tests:
                combination = {
                    'test_name': test.name,
                    'model': model,
                    'seed': seed,
                }
                # Key is computed before 'answer'/'test' are added, so it
                # depends only on the identifying fields.
                hash_key = _combination_key(combination)

                if hash_key in saved_results:
                    print(f"Skipped {combination}")
                    continue

                combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
                combination['test'] = test
                run_results[hash_key] = combination
                print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with test '{test.name}'.")

    # Validate Results
    for hash_key, result in run_results.items():
        entry = {
            'test_name': result['test_name'],
            'model': result['model'],
            'seed': result['seed'],
            'answer': result['answer'],
            'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url),
        }

        saved_results[hash_key] = entry  # add result with validation to saved results

        print(f"Validation of answer from test {entry['test_name']} by {entry['model']} with seed {entry['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if entry['validation'] == True else '\033[0;31mincorrect\033[0m'))

        # Written after every entry so already validated results are kept
        # if a later validation fails. UTF-8 is explicit because
        # ensure_ascii=False emits non-ASCII answers verbatim.
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(fp=f, obj=saved_results, indent=4, ensure_ascii=False)
        print("Dumped")

    return saved_results
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
from langchain_ollama.chat_models import ChatOllama
|
||||
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
|
||||
from langchain.tools import Tool
|
||||
from langchain.tools import tool
|
||||
from libs.test_class import Test
|
||||
|
||||
def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
|
||||
|
||||
def rate(rating: bool) -> None:
|
||||
@tool
|
||||
def rate(rating: bool) -> bool:
|
||||
"""Rate answer as correct (True) or as incorrect (False)."""
|
||||
return rating
|
||||
|
||||
prompt = ChatPromptTemplate.from_messages([
|
||||
SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is
|
||||
SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is
|
||||
{validation_input}
|
||||
|
||||
else as incorrect. Only use the rate tool. Do not answer conversationally."""),
|
||||
@@ -24,14 +26,14 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
|
||||
HumanMessagePromptTemplate.from_template(template="""System Message:
|
||||
{system_msg}
|
||||
|
||||
Query:
|
||||
Human query:
|
||||
{human_msg}
|
||||
|
||||
Answer:
|
||||
LLM answer:
|
||||
{answer}
|
||||
""")
|
||||
]).invoke({
|
||||
"validation_input": test.validation_input,
|
||||
"validation_input": test.validation_input['criteria'],
|
||||
"system_msg": test.runnable_input['system_msg'],
|
||||
"human_msg": test.runnable_input['human_msg'],
|
||||
"answer": answer
|
||||
@@ -46,7 +48,10 @@ Answer:
|
||||
ai_msg = llm.invoke(prompt)
|
||||
|
||||
try:
|
||||
return ai_msg.tool_calls[0]['args']['rating']
|
||||
ret_str = rate.invoke(ai_msg.tool_calls[0]).content
|
||||
if ret_str.lower() == 'true': return True
|
||||
elif ret_str.lower() == 'false': return False
|
||||
else: raise Exception(f"rate tool retured {ret_str}")
|
||||
except IndexError as e:
|
||||
print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
|
||||
return system_human_answer_match(test=test, answer=answer)
|
||||
|
||||
@@ -19,9 +19,9 @@ def main():
|
||||
# "tinyllama:1.1b",
|
||||
]
|
||||
seeds = [
|
||||
# 2,
|
||||
222,
|
||||
22222,
|
||||
2,
|
||||
# 222,
|
||||
# 22222,
|
||||
# 2222222
|
||||
]
|
||||
tests = [
|
||||
@@ -34,9 +34,13 @@ def main():
|
||||
},
|
||||
validator=system_human_answer_match,
|
||||
validation_input={
|
||||
"criteria": """- in Mandarin Chinese
|
||||
"criteria": """- in Mandarin Chinese from front to finnish
|
||||
- factually correct
|
||||
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""",
|
||||
- about healthy vegetables
|
||||
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
|
||||
|
||||
Again, the message has to be entirely in Manadarin Chineese.
|
||||
That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct""",
|
||||
}
|
||||
),
|
||||
Test(
|
||||
@@ -53,6 +57,7 @@ def main():
|
||||
validator=regex_match_any,
|
||||
validation_input={
|
||||
"patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
|
||||
# "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
|
||||
}
|
||||
),
|
||||
]
|
||||
@@ -65,7 +70,14 @@ def main():
|
||||
)
|
||||
|
||||
print()
|
||||
for result in results: print(f"\n\033[0;36mtest_name:\033[0m {result['test'].name}\n\033[0;36mmodel:\033[0m {result['model']}\n\033[0;36mseed:\033[0m {result['seed']}\n\033[0;36mvalidation_result:\033[0m {result['validation']}\n\033[0;36manswer: ⏎\033[0m\n{result['result']}")
|
||||
for hash_key in results:
|
||||
result = results[hash_key]
|
||||
print(f"""
|
||||
\033[0;36mtest_name:\033[0m {result['test_name']}
|
||||
\033[0;36mmodel:\033[0m {result['model']}
|
||||
\033[0;36mseed:\033[0m {result['seed']}
|
||||
\033[0;36mvalidation_result:\033[0m {result['validation']}
|
||||
\033[0;36manswer: »\033[0m{result['answer']}\033[0;36m«\033[0m""")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user