caching, tests as dict, new tests

2024-08-14 21:03:03 +02:00
parent 298d8c83ef
commit 2f46056e0d
4 changed files with 501 additions and 74 deletions
--- a/libs/validators.py
+++ b/libs/validators.py
@@ -2,6 +2,8 @@ from langchain_ollama.chat_models import ChatOllama
 from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
 from langchain.tools import tool
 from libs.test_class import Test
+from re import search
+from textwrap import dedent

 def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:

@@ -11,27 +13,19 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
        return rating

    prompt = ChatPromptTemplate.from_messages([
-        SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is  
-{validation_input}
+        SystemMessagePromptTemplate.from_template(template=dedent("""You evaluate LLMs. Rate the LLM answer as correct, if the answer is  
+        {validation_input}

-else as incorrect. Only use the rate tool. Do not answer conversationally."""),
-#         SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if 
-# {validation_input}
+        else as incorrect. Only use the rate tool. Do not answer conversationally.""")),
+        HumanMessagePromptTemplate.from_template(template=dedent("""System Message:
+        {system_msg}

-# If the answer does not match these criteria, rate the answer as incorrect. If the answer is a "refusal" or a "declaration of incapability", the answer is automatically incorrect.
+        Human query:
+        {human_msg}

-# **Only use the rate tool. Do not under any circumstances answer conversationally**.
-# DO NOT ANSWER WITH <I'm sorry but I do not have the capability to perform this task for you...> or anything like it.
-# Use the rate tool!"""),
-        HumanMessagePromptTemplate.from_template(template="""System Message:
-{system_msg}
-
-Human query:
-{human_msg}
-
-LLM answer:
-{answer}
-""")
+        LLM answer:
+        {answer}
+        """))
    ]).invoke({
        "validation_input": test.validation_input['criteria'],
        "system_msg": test.runnable_input['system_msg'],
@@ -48,7 +42,10 @@ LLM answer:
    ai_msg = llm.invoke(prompt)

    try:
-        ret_str = rate.invoke(ai_msg.tool_calls[0]).content
+        tool_call = ai_msg.tool_calls[0]
+        if tool_call['name'] != "rate":
+            raise Exception(f"Verificaiton model tried to tool `{tool_call['name']}` not `rate`") 
+        ret_str = rate.invoke(tool_call).content
        if ret_str.lower() == 'true': return True
        elif ret_str.lower() == 'false': return False
        else: raise Exception(f"rate tool retured {ret_str}")
@@ -56,8 +53,6 @@ LLM answer:
        print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
        return system_human_answer_match(test=test, answer=answer, base_url=base_url)

-from re import search
-
 def regex_match_any(test: Test, answer: str, base_url: str) -> bool:
    match = False
    for pattern in test.validation_input['patterns']: