caching, tests as dict, new tests
This commit is contained in:
@@ -2,6 +2,8 @@ from langchain_ollama.chat_models import ChatOllama
|
||||
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
|
||||
from langchain.tools import tool
|
||||
from libs.test_class import Test
|
||||
from re import search
|
||||
from textwrap import dedent
|
||||
|
||||
def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
|
||||
|
||||
@@ -11,27 +13,19 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
|
||||
return rating
|
||||
|
||||
prompt = ChatPromptTemplate.from_messages([
|
||||
SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is
|
||||
{validation_input}
|
||||
SystemMessagePromptTemplate.from_template(template=dedent("""You evaluate LLMs. Rate the LLM answer as correct, if the answer is
|
||||
{validation_input}
|
||||
|
||||
else as incorrect. Only use the rate tool. Do not answer conversationally."""),
|
||||
# SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if
|
||||
# {validation_input}
|
||||
else as incorrect. Only use the rate tool. Do not answer conversationally.""")),
|
||||
HumanMessagePromptTemplate.from_template(template=dedent("""System Message:
|
||||
{system_msg}
|
||||
|
||||
# If the answer does not match these criteria, rate the answer as incorrect. If the answer is a "refusal" or a "declaration of incapability", the answer is automatically incorrect.
|
||||
Human query:
|
||||
{human_msg}
|
||||
|
||||
# **Only use the rate tool. Do not under any circumstances answer conversationally**.
|
||||
# DO NOT ANSWER WITH <I'm sorry but I do not have the capability to perform this task for you...> or anything like it.
|
||||
# Use the rate tool!"""),
|
||||
HumanMessagePromptTemplate.from_template(template="""System Message:
|
||||
{system_msg}
|
||||
|
||||
Human query:
|
||||
{human_msg}
|
||||
|
||||
LLM answer:
|
||||
{answer}
|
||||
""")
|
||||
LLM answer:
|
||||
{answer}
|
||||
"""))
|
||||
]).invoke({
|
||||
"validation_input": test.validation_input['criteria'],
|
||||
"system_msg": test.runnable_input['system_msg'],
|
||||
@@ -48,7 +42,10 @@ LLM answer:
|
||||
ai_msg = llm.invoke(prompt)
|
||||
|
||||
try:
|
||||
ret_str = rate.invoke(ai_msg.tool_calls[0]).content
|
||||
tool_call = ai_msg.tool_calls[0]
|
||||
if tool_call['name'] != "rate":
|
||||
raise Exception(f"Verificaiton model tried to tool `{tool_call['name']}` not `rate`")
|
||||
ret_str = rate.invoke(tool_call).content
|
||||
if ret_str.lower() == 'true': return True
|
||||
elif ret_str.lower() == 'false': return False
|
||||
else: raise Exception(f"rate tool retured {ret_str}")
|
||||
@@ -56,8 +53,6 @@ LLM answer:
|
||||
print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
|
||||
return system_human_answer_match(test=test, answer=answer, base_url=base_url)
|
||||
|
||||
from re import search
|
||||
|
||||
def regex_match_any(test: Test, answer: str, base_url: str) -> bool:
|
||||
match = False
|
||||
for pattern in test.validation_input['patterns']:
|
||||
|
||||
Reference in New Issue
Block a user