Saving results (hash always different) = skipping doesn't work

2024-08-07 21:27:40 +02:00
parent abd6320ce9
commit a448b957ae
4 changed files with 87 additions and 28 deletions
--- a/libs/validators.py
+++ b/libs/validators.py
@@ -1,18 +1,20 @@
 from langchain_ollama.chat_models import ChatOllama
 from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
-from langchain.tools import Tool
+from langchain.tools import tool
 from libs.test_class import Test

 def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:

-    def rate(rating: bool) -> None:
+    @tool
+    def rate(rating: bool) -> bool:
        """Rate answer as correct (True) or as incorrect (False)."""
+        return rating

    prompt = ChatPromptTemplate.from_messages([
-        SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is  
-                                                  {validation_input}
-                                                  
-                                                  else as incorrect. Only use the rate tool. Do not answer conversationally."""),
+        SystemMessagePromptTemplate.from_template(template="""You evaluate LLMs. Rate the LLM answer as correct, if the answer is  
+{validation_input}
+
+else as incorrect. Only use the rate tool. Do not answer conversationally."""),
 #         SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if 
 # {validation_input}

@@ -24,14 +26,14 @@ def system_human_answer_match(test: Test, answer: str, base_url: str) -> bool:
        HumanMessagePromptTemplate.from_template(template="""System Message:
 {system_msg}

-Query:
+Human query:
 {human_msg}

-Answer:
+LLM answer:
 {answer}
 """)
    ]).invoke({
-        "validation_input": test.validation_input,
+        "validation_input": test.validation_input['criteria'],
        "system_msg": test.runnable_input['system_msg'],
        "human_msg": test.runnable_input['human_msg'],
        "answer": answer
@@ -46,7 +48,10 @@ Answer:
    ai_msg = llm.invoke(prompt)

    try:
-        return ai_msg.tool_calls[0]['args']['rating']
+        ret_str = rate.invoke(ai_msg.tool_calls[0]).content
+        if ret_str.lower() == 'true': return True
+        elif ret_str.lower() == 'false': return False
+        else: raise Exception(f"rate tool retured {ret_str}")
    except IndexError as e:
        print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
        return system_human_answer_match(test=test, answer=answer)