building of pipeline (validation flaky)

2024-08-04 20:50:11 +02:00
parent e56fa9225c
commit 52a180b936
8 changed files with 168 additions and 53 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .venv
-__pycache
+*/__pycache__/*
 .direnv
 .vscode
--- a/libs/init.py
+++ b/libs/init.py
--- a/libs/query_fits_to_answer.py
+++ b/libs/query_fits_to_answer.py
@@ -1,52 +0,0 @@
 from langchain_ollama.chat_models import ChatOllama
 from langchain_core.messages import SystemMessage
 from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate
 from langchain.tools import Tool
 def query_fits_to_answer(query: str, answer: str, max_retries: int = 10) -> bool:
    """Ask a tool-calling LLM whether `answer` is a correct, fitting reply to `query`.

    The model is given a single `rate` tool and the `rating` argument of its
    first tool call is returned. When the reply carries no usable tool call,
    the same prompt is sent again, at most `max_retries` additional times.

    Args:
        query: The question that was asked.
        answer: The candidate answer to be rated.
        max_retries: Extra attempts allowed after the first failed one.

    Returns:
        The `rating` argument supplied by the model's tool call.

    Raises:
        RuntimeError: If no attempt produced a usable `rate` tool call.
    """
    # Only handed to `bind_tools` as the tool definition; never called here.
    def rate(rating: bool) -> None:
        """Rate answer as correct (True) or as incorrect (False)."""
    prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content="""You are a rating machine. You rate answers as correct if they are
        1. factually correct (every statement made)
        2. fitting response to the query answering all questions prompted
        if the answer does not mach these criteria, rate the answer as incorrect. **Only use the rate tool. Do not answer conversationally**.
        Do not answer with <I'm sorry but I do not have the capability to perform this task for you, I am happy to help you with any other queries you may have.> or anything like it. Just use the `rate` tool."""),
        HumanMessagePromptTemplate.from_template(template="""Query:
        {query}
        Answer:
        {answer}
        """)
    ]).invoke({"query": query, "answer": answer})
    llm = ChatOllama(model="llama3-groq-tool-use:70b").bind_tools([rate])
    # Bounded loop instead of self-recursion: a model that never calls the
    # tool can no longer exhaust the interpreter's recursion limit.
    for attempts_left in range(max(max_retries, 0), -1, -1):
        ai_msg = llm.invoke(prompt)
        try:
            return ai_msg.tool_calls[0]['args']['rating']
        except (IndexError, KeyError):
            # IndexError: no tool call at all; KeyError: call without `rating`.
            if attempts_left:
                print(f"\rValidation Error of <{ai_msg.content}> Retrying...")
    raise RuntimeError(
        f"No usable `rate` tool call after {max(max_retries, 0) + 1} attempts."
    )
 if __name__ == "__main__":
    # Manual smoke test: rate one hand-written answer and print the verdict.
    # Alternative sample answers (one correct, one off-topic, two factually
    # wrong) are kept below for quick swapping.
    # print(query_fits_to_answer(
    #     query="Who is Obama?",
    #     answer="Barack Obama was the 44th President of the United States, serving two terms from January 2009 to January 2017. He was a significant figure in American politics and made history as the first African American to hold the office."
    # ))
    # print(query_fits_to_answer(
    #     query="Who is Obama?",
    #     answer="Quantum computing is a revolutionary technology that uses the principles of quantum mechanics to perform calculations and operations on data. It's a fundamentally different approach from classical computing, which is based on bits (0s and 1s) and transistors."
    # ))
    # print(query_fits_to_answer(
    #     query="Who is Obama?",
    #     answer="Barack Obama was the 72th President of the United States, serving two terms from January 2005 to January 2013. He was a significant figure in American politics and made history as the first Chinese American to hold the office."
    # ))
    sample_query = "Who is Obama?"
    sample_answer = "Barack Obama was the 45th President of the United States, serving two terms from January 2009 to January 2017. He was a significant figure in American politics and made history as the first Chinese American to hold the office."
    verdict = query_fits_to_answer(query=sample_query, answer=sample_answer)
    print(verdict)
--- a/libs/run_tests.py
+++ b/libs/run_tests.py
@@ -0,0 +1,29 @@
 from libs.test_class import Test
 from libs.validators import system_human_answer_match
 from libs.runnables import basic
 def padd(list, element):
    """Return `str(element)` left-justified to the widest entry of `list`.

    Width is the length of the longest `str()` form among the items of
    `list`; an empty `list` yields a width of 0, i.e. no padding.
    """
    # NOTE: the parameter name shadows the builtin `list`; it is kept because
    # it is part of the signature callers may use by keyword.
    width = max((len(str(item)) for item in list), default=0)
    return str(element).ljust(width)
 def run_tests(models: list[str], seeds: list[int], tests: list[Test], base_url: str):
    """Run every test for every model/seed combination, then validate the answers.

    Phase 1 calls each test's `runnable` and collects the answers; a failing
    run is reported and skipped so the remaining combinations still execute.
    Phase 2 calls each collected test's own `validator` on its answer.

    Args:
        models: Model names passed through to the runnables.
        seeds: Seeds passed through to the runnables.
        tests: Test cases providing `name`, `runnable` and `validator`.
        base_url: Server URL passed to both runnables and validators.

    Returns:
        A list of dicts with the keys `test`, `model`, `seed`, `result` and
        `validation`, one per successful run.
    """
    results = []
    for model in models:
        for seed in seeds:
            for test in tests:
                try:
                    result = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
                except Exception as e:
                    # Best-effort: report and move on. Formatting via f-string
                    # because `str + Exception` raises TypeError.
                    print(f"\033[0;31mError:\033[0m {e}")
                    continue
                results.append({"test": test, "model": model, "seed": seed, "result": result})
                print(f"Model {padd(models, model)} starting with seed {padd(seeds, seed)} is done with test '{test.name}'.")
    for result in results:
        # Use the validator of the test that produced this result, not the
        # loop variable left over from the last iteration above.
        checked_test = result['test']
        result['validation'] = checked_test.validator(test=checked_test, answer=result['result'], base_url=base_url)
        # The value originates from LLM tool arguments and may not be a strict
        # bool, so only a value equal to True counts as correct.
        verdict = '\033[0;32mcorrect\033[0m' if result['validation'] == True else '\033[0;31mincorrect\033[0m'
        print(f"Validation of answer from test {checked_test.name} by {result['model']} with seed {result['seed']} evaluated to " + verdict)
    return results
--- a/libs/runnables.py
+++ b/libs/runnables.py
@@ -0,0 +1,16 @@
 from langchain_ollama.chat_models import ChatOllama
 from langchain_core.messages import SystemMessage, HumanMessage
 from libs.test_class import Test
 def basic(model: str, seed: int, test: Test, base_url: str) -> str:
    """Send the test's system and human message to a ChatOllama model.

    Args:
        model: Model name for `ChatOllama`.
        seed: Seed for `ChatOllama`.
        test: Test case providing `system_msg` (may be None) and `human_msg`.
        base_url: Server URL for `ChatOllama`.

    Returns:
        The `content` of the model's reply.
    """
    def as_message(message_cls, value):
        # Bare strings in a prompt list are treated as human messages, so the
        # role has to be made explicit; ready-made message objects pass through.
        return message_cls(content=value) if isinstance(value, str) else value

    prompt = [as_message(HumanMessage, test.human_msg)]
    if test.system_msg is not None:
        prompt.insert(0, as_message(SystemMessage, test.system_msg))
    llm = ChatOllama(
        model=model,
        seed=seed,
        base_url=base_url
    )
    ai_msg = llm.invoke(prompt)
    return ai_msg.content
--- a/libs/test_class.py
+++ b/libs/test_class.py
@@ -0,0 +1,14 @@
 from dataclasses import dataclass, field
 from typing import Callable, Optional
@dataclass
class Test:
    """One benchmark case: the prompt, how to run it and how to judge the answer.

    All six fields are required and keep their original order, so existing
    positional and keyword construction is unchanged. The original wrote
    `field(default=...)` in the annotation position, which sets no default and
    leaves the field without a usable type. Real defaults cannot be added for
    `system_msg` / `validation_info` without also defaulting every later
    field, so the intended texts are offered as class constants instead.
    """
    # Unannotated on purpose: these are plain class attributes, not dataclass
    # fields. NOTE(review): wording (including typos) is kept verbatim.
    DEFAULT_SYSTEM_MSG = "You are a helful AI assistant."
    DEFAULT_VALIDATION_INFO = """- it is factually correct
 - it fits/answers the system message and human query
 - it is just the answer, and doesn't have any AI fragments (A/B versions, "end of message" parts, unfiting discalimers or notes)"""

    # Label used in progress and result output.
    name: str
    # System prompt; the `basic` runnable omits it when this is None.
    system_msg: Optional[str]
    # User prompt sent to the model under test.
    human_msg: str
    # Criteria text inserted into the validator's prompt.
    validation_info: str
    # Called with model=, seed=, test=, base_url= to produce the answer.
    runnable: Callable
    # Called with test=, answer=, base_url= to judge the answer.
    validator: Callable
--- a/libs/validators.py
+++ b/libs/validators.py
@@ -0,0 +1,53 @@
 from langchain_ollama.chat_models import ChatOllama
 from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate
 from langchain.tools import Tool
 from libs.test_class import Test
 def system_human_answer_match(test: Test, answer: str, base_url: str, max_retries: int = 10) -> bool:
    """Let a tool-calling LLM rate `answer` against the test's validation criteria.

    The prompt contains the test's `validation_info`, `system_msg`,
    `human_msg` and the answer. The model is given a single `rate` tool and
    the `rating` argument of its first tool call is returned. When the reply
    carries no usable tool call, the same prompt is sent again, at most
    `max_retries` additional times.

    Args:
        test: The test case the answer belongs to.
        answer: The answer to be rated.
        base_url: Server URL for `ChatOllama`.
        max_retries: Extra attempts allowed after the first failed one.

    Returns:
        The `rating` argument supplied by the model's tool call.

    Raises:
        RuntimeError: If no attempt produced a usable `rate` tool call.
    """
    # Only handed to `bind_tools` as the tool definition; never called here.
    def rate(rating: bool) -> None:
        """Rate answer as correct (True) or as incorrect (False)."""
    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(template="""Rate the answer as correct, if the answer is  
                                                  {validation_info}
                                                  else as incorrect. Only use the rate tool. Do not answer conversationally."""),
 #         SystemMessagePromptTemplate.from_template(template="""You are a rating machine. You are given 3 things: The system message, the Human query, and the AI response. You evaluate the response as correct if 
 # {validation_info}
 # If the answer does not match these criteria, rate the answer as incorrect. If the answer is a "refusal" or a "declaration of incapability", the answer is automatically incorrect.
 # **Only use the rate tool. Do not under any circumstances answer conversationally**.
 # DO NOT ANSWER WITH <I'm sorry but I do not have the capability to perform this task for you...> or anything like it.
 # Use the rate tool!"""),
        HumanMessagePromptTemplate.from_template(template="""System Message:
 {system_msg}
 Query:
 {human_msg}
 Answer:
 {answer}
 """)
    ]).invoke({
        "validation_info": test.validation_info,
        "system_msg": test.system_msg,
        "human_msg": test.human_msg,
        "answer": answer
    })
    llm = ChatOllama(
        model="llama3.1:70b",
        # model="llama3-groq-tool-use:70b",
        base_url=base_url
    ).bind_tools([rate])
    # Bounded loop instead of self-recursion: the recursive retry dropped the
    # required `base_url` argument and had no upper limit.
    for attempts_left in range(max(max_retries, 0), -1, -1):
        ai_msg = llm.invoke(prompt)
        try:
            return ai_msg.tool_calls[0]['args']['rating']
        except (IndexError, KeyError):
            # IndexError: no tool call at all; KeyError: call without `rating`.
            if attempts_left:
                print(f"\033[0;31mValidation Error \033[0mof {test.name} <{ai_msg.content[:20]}...> Retrying...")
    raise RuntimeError(
        f"No usable `rate` tool call for test {test.name!r} after {max(max_retries, 0) + 1} attempts."
    )
--- a/test_small_llms.py
+++ b/test_small_llms.py
@@ -0,0 +1,54 @@
 from libs.test_class import Test
 from libs.run_tests import run_tests
 from libs.runnables import * 
 from libs.validators import * 
 from pprint import pprint
 def main():
    """Run the configured tests against the selected models and print a report.

    Builds the model, seed and test lists, hands them to `run_tests` together
    with the server URL, then prints one coloured summary per result.
    """
    # Uncomment entries to include more models in a run.
    candidate_models = [
        # "llama3.1", # 8b
        # "llama3.1:70b",
        # "llama3-groq-tool-use", # latest
        # "llama3-groq-tool-use:70b",
        # "mixtral:8x7b",
        # "mixtral:8x22b",
        # "gemma2:2b",
        # "phi3", # 3.8b
        "tinyllama:1.1b",
    ]
    seed_values = [
        # 2,
        222,
        # 22222,
        2222222,
    ]
    # NOTE(review): the case is named "Chinese Fruit" although the prompt asks
    # which vegetable is the healthiest.
    chinese_case = Test(
        name="Chinese Fruit",
        system_msg="You are a helpful assistant. You serve people across the globe. You can be a freind, but stay professional.",
        human_msg="什么蔬菜最健康？",
        validation_info="""- in Mandarin Chinese
 - factually correct
 - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""",
        runnable=basic,
        validator=system_human_answer_match,
    )
    # Planned but not yet written: a "Simple Multiplication" case.
    test_cases = [chinese_case]
    outcomes = run_tests(
        models=candidate_models,
        seeds=seed_values,
        tests=test_cases,
        base_url="http://bolt.hs-mittweida.de:11434",
    )
    print()
    for entry in outcomes:
        # Adjacent f-strings are joined into the same single report string.
        report = (
            f"\n\033[0;36mtest_name:\033[0m {entry['test'].name}"
            f"\n\033[0;36mmodel:\033[0m {entry['model']}"
            f"\n\033[0;36mseed:\033[0m {entry['seed']}"
            f"\n\033[0;36mvalidation_result:\033[0m {entry['validation']}"
            f"\n\033[0;36manswer: ⏎\033[0m\n{entry['result']}"
        )
        print(report)
 # Run the benchmark only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()