Saving results (hash always different) = skipping doesn't work
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -2,3 +2,4 @@
|
|||||||
*/__pycache__/*
|
*/__pycache__/*
|
||||||
.direnv
|
.direnv
|
||||||
.vscode
|
.vscode
|
||||||
|
saved_results.json
|
||||||
|
|||||||
@@ -2,28 +2,69 @@ from libs.test_class import Test
|
|||||||
from libs.validators import system_human_answer_match
|
from libs.validators import system_human_answer_match
|
||||||
from libs.runnables import basic
|
from libs.runnables import basic
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
def padd(list, element):
|
def padd(list, element):
|
||||||
longest = 0
|
longest = 0
|
||||||
for s in list:
|
for s in list:
|
||||||
longest = max(longest, len(str(s)))
|
longest = max(longest, len(str(s)))
|
||||||
return str(element).ljust(longest)
|
return str(element).ljust(longest)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url: str):
|
def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url: str):
|
||||||
results = []
|
|
||||||
esc = "\033"
|
# try:
|
||||||
|
with open("./saved_results.json", "r") as f:
|
||||||
|
saved_results = json.load(fp=f)
|
||||||
|
# except:
|
||||||
|
# saved_results = {}
|
||||||
|
|
||||||
|
|
||||||
|
# Get Results
|
||||||
|
run_results = {}
|
||||||
for model in models:
|
for model in models:
|
||||||
for seed in seeds:
|
for seed in seeds:
|
||||||
for test in tests:
|
for test in tests:
|
||||||
try:
|
|
||||||
result = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
|
# Init dict
|
||||||
results.append({"test": test,"model": model, "seed": seed, "result": result})
|
combination = {
|
||||||
|
'test_name': test.name,
|
||||||
|
'model': model,
|
||||||
|
'seed': seed,
|
||||||
|
}
|
||||||
|
hash_key = str(hash(json.dumps(combination, sort_keys=True)))
|
||||||
|
|
||||||
|
if hash_key not in saved_results.keys():
|
||||||
|
# try:
|
||||||
|
combination['answer'] = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
|
||||||
|
combination['test'] = test
|
||||||
|
run_results[hash_key] = combination
|
||||||
print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with test '{test.name}'.")
|
print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with test '{test.name}'.")
|
||||||
except Exception as e:
|
# except Exception as e:
|
||||||
print("\033[0;31mError:\033[0m " + str(e))
|
# print("\033[0;31mError:\033[0m " + str(e))
|
||||||
|
else:
|
||||||
|
print(f"Skipped {combination}")
|
||||||
|
|
||||||
for result in results:
|
|
||||||
result['validation'] = result['test'].validator(test=result['test'], answer=result['result'], base_url=base_url)
|
|
||||||
|
|
||||||
print(f"Validation of answer from test {result['test'].name} by {result['model']} with seed {result['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if result['validation'] == True else '\033[0;31mincorrect\033[0m'))
|
# Validate Results
|
||||||
|
for hash_key in run_results:
|
||||||
|
result = run_results[hash_key]
|
||||||
|
|
||||||
return results
|
entry = {
|
||||||
|
'test_name': result['test_name'],
|
||||||
|
'model': result['model'],
|
||||||
|
'seed': result['seed'],
|
||||||
|
'answer': result['answer'],
|
||||||
|
'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url)
|
||||||
|
}
|
||||||
|
|
||||||
|
saved_results[hash_key] = entry # add result with validation to saved results
|
||||||
|
|
||||||
|
print(f"Validation of answer from test {entry['test_name']} by {entry['model']} with seed {entry['seed']} evaluated to " + ('\033[0;32mcorrect\033[0m' if entry['validation'] == True else '\033[0;31mincorrect\033[0m'))
|
||||||
|
|
||||||
|
with open("./saved_results.json", "w") as f:
|
||||||
|
json.dump(fp=f, obj=saved_results, indent=4, ensure_ascii=False)
|
||||||
|
print("Dumped")
|
||||||
|
|
||||||
|
return saved_results
|
||||||
|
|||||||
@@ -1,18 +1,20 @@
|
|||||||
from langchain_ollama.chat_models import ChatOllama
|
from langchain_ollama.chat_models import ChatOllama
|
||||||
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
|
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
|
||||||
from langchain.tools import Tool
|
from langchain.tools import tool
|
||||||
from libs.test_class import Test
|
from libs.test_class import Test
|
||||||
|
|
||||||
def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
|
def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
|
||||||
|
|
||||||
def rate(rating: bool) -> None:
|
@tool
|
||||||
|
def rate(rating: bool) -> bool:
|
||||||
"""Rate answer as correct (True) or as incorrect (False)."""
|
"""Rate answer as correct (True) or as incorrect (False)."""
|
||||||
|
return rating
|
||||||
|
|
||||||
prompt = ChatPromptTemplate.from_messages([
|
prompt = ChatPromptTemplate.from_messages([
|
||||||
SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is
|
SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is
|
||||||
{validation_input}
|
{validation_input}
|
||||||
|
|
||||||
else as incorrect. Only use the rate tool. Do not answer conversationally."""),
|
else as incorrect. Only use the rate tool. Do not answer conversationally."""),
|
||||||
# SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if
|
# SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if
|
||||||
# {validation_input}
|
# {validation_input}
|
||||||
|
|
||||||
@@ -24,14 +26,14 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
|
|||||||
HumanMessagePromptTemplate.from_template(template="""System Message:
|
HumanMessagePromptTemplate.from_template(template="""System Message:
|
||||||
{system_msg}
|
{system_msg}
|
||||||
|
|
||||||
Query:
|
Human query:
|
||||||
{human_msg}
|
{human_msg}
|
||||||
|
|
||||||
Answer:
|
LLM answer:
|
||||||
{answer}
|
{answer}
|
||||||
""")
|
""")
|
||||||
]).invoke({
|
]).invoke({
|
||||||
"validation_input": test.validation_input,
|
"validation_input": test.validation_input['criteria'],
|
||||||
"system_msg": test.runnable_input['system_msg'],
|
"system_msg": test.runnable_input['system_msg'],
|
||||||
"human_msg": test.runnable_input['human_msg'],
|
"human_msg": test.runnable_input['human_msg'],
|
||||||
"answer": answer
|
"answer": answer
|
||||||
@@ -46,7 +48,10 @@ Answer:
|
|||||||
ai_msg = llm.invoke(prompt)
|
ai_msg = llm.invoke(prompt)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return ai_msg.tool_calls[0]['args']['rating']
|
ret_str = rate.invoke(ai_msg.tool_calls[0]).content
|
||||||
|
if ret_str.lower() == 'true': return True
|
||||||
|
elif ret_str.lower() == 'false': return False
|
||||||
|
else: raise Exception(f"rate tool retured {ret_str}")
|
||||||
except IndexError as e:
|
except IndexError as e:
|
||||||
print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
|
print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
|
||||||
return system_human_answer_match(test=test, answer=answer)
|
return system_human_answer_match(test=test, answer=answer)
|
||||||
|
|||||||
@@ -19,9 +19,9 @@ def main():
|
|||||||
# "tinyllama:1.1b",
|
# "tinyllama:1.1b",
|
||||||
]
|
]
|
||||||
seeds = [
|
seeds = [
|
||||||
# 2,
|
2,
|
||||||
222,
|
# 222,
|
||||||
22222,
|
# 22222,
|
||||||
# 2222222
|
# 2222222
|
||||||
]
|
]
|
||||||
tests = [
|
tests = [
|
||||||
@@ -34,9 +34,13 @@ def main():
|
|||||||
},
|
},
|
||||||
validator=system_human_answer_match,
|
validator=system_human_answer_match,
|
||||||
validation_input={
|
validation_input={
|
||||||
"criteria": """- in Mandarin Chinese
|
"criteria": """- in Mandarin Chinese from front to finnish
|
||||||
- factually correct
|
- factually correct
|
||||||
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""",
|
- about healthy vegetables
|
||||||
|
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
|
||||||
|
|
||||||
|
Again, the message has to be entirely in Manadarin Chineese.
|
||||||
|
That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct""",
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
Test(
|
Test(
|
||||||
@@ -53,6 +57,7 @@ def main():
|
|||||||
validator=regex_match_any,
|
validator=regex_match_any,
|
||||||
validation_input={
|
validation_input={
|
||||||
"patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
|
"patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
|
||||||
|
# "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
@@ -65,7 +70,14 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
for result in results: print(f"\n\033[0;36mtest_name:\033[0m {result['test'].name}\n\033[0;36mmodel:\033[0m {result['model']}\n\033[0;36mseed:\033[0m {result['seed']}\n\033[0;36mvalidation_result:\033[0m {result['validation']}\n\033[0;36manswer: ⏎\033[0m\n{result['result']}")
|
for hash_key in results:
|
||||||
|
result = results[hash_key]
|
||||||
|
print(f"""
|
||||||
|
\033[0;36mtest_name:\033[0m {result['test_name']}
|
||||||
|
\033[0;36mmodel:\033[0m {result['model']}
|
||||||
|
\033[0;36mseed:\033[0m {result['seed']}
|
||||||
|
\033[0;36mvalidation_result:\033[0m {result['validation']}
|
||||||
|
\033[0;36manswer: »\033[0m{result['answer']}\033[0;36m«\033[0m""")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user