# test-small-llms/suite_settings/tests.py

from libs.classes import Test
from libs.runnables import basic_prompt, one_tool_call_answer, agent_with_tools
from libs.validators import regex_match_any, system_human_answer_match
from libs.tools import add, multiply, get_current_date_and_time, get_notes_in_timespan, get_notes_containing, write_note, save_python_repl
from textwrap import dedent
from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage

# Prompt pieces shared by the three "Notes from last Saturday" variants. They
# are defined once so the variants differ only in the tools / few-shot
# messages they are given and cannot silently drift apart.
_NOTES_AGENT_SYSTEM_MSG = 'You are a helpful assistant. You can use tools to accomplish tasks. Once you\'ve called a tool, the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling. To know the date, use the tool get_current_date_and_time.'
_GRANDMA_CALL_QUESTION = "Last Saturday, who did grandma want me to call?"

# NOTE: the literal starts with a line continuation (`"""\`) so that the first
# line carries the same indentation as the others; otherwise dedent() finds an
# empty common margin and leaves the source indentation inside the criteria.
_CALL_WOLFGANG_CRITERIA = dedent("""\
    - containing the information that the Human should call Wolfgang
    - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting disclaimers or notes, what specific tool was used to get the answer, etc.)""")

# Registry of test cases, keyed by a numeric test id.
#
# Each entry is a `Test` built from:
#   name             - human-readable label of the test
#   runnable         - callable from libs.runnables that drives the model
#   runnable_input   - keyword data handed to the runnable (prompts, tools, ...)
#   validator        - callable from libs.validators that judges the answer
#   validation_input - keyword data handed to the validator
tests = {
    # Plain prompt, no tools: the answer must stay in the language of the question.
    607: Test(
        name="Healthy Vegetables in Chinese",
        runnable=basic_prompt,
        runnable_input={
            "system_msg": "You are a helpful assistant. You serve people across the globe.",
            "human_msg": "什么蔬菜最健康？",
        },
        validator=system_human_answer_match,
        validation_input={
            "criteria": dedent("""\
                - in Mandarin Chinese from front to finish
                - factually correct
                - about healthy vegetables
                - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting disclaimers or notes)

                Again, the message has to be entirely in Mandarin Chinese.
                That means if the answer is not in Chinese the answer is NOT correct! Only if the message is in Chinese rate as correct"""),
        },
    ),
    # One multiplication with the `multiply` tool; 234215 * 143243 = 33549659245.
    693: Test(
        name="Simple Multiplication",
        runnable=one_tool_call_answer,
        runnable_input={
            "system_msg": "You are a helpful assistant.",
            "human_msg": "What is 234215 times 143243?",
            "tools": {"add": add, "multiply": multiply},
        },
        validator=regex_match_any,
        validation_input={
            # Patterns are regular expressions, so a bare "." would match any
            # character. Separators (",", "." or a space) are allowed only at
            # the thousands boundaries; allowing one after every digit was
            # rejected because it would also accept e.g. 3.354.965.9245.
            "patterns": [r"33[,. ]?549[,. ]?659[,. ]?245"],
        },
    ),
    # Chained multiplication over several tool rounds; 235 * 1243 * 21 = 6134205.
    120: Test(
        name="Complex Multiplication",
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": 'You are a helpful assistant. You can use tools to accomplish the task. Once you\'ve called a tool. the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling.',
            "human_msg": "What is 235 times 1243 times 21?",
            "tools": {"add": add, "multiply": multiply},
        },
        validator=regex_match_any,
        validation_input={
            # Same separator rule as in test 693 (no unescaped "." wildcard).
            "patterns": [r"6[,. ]?134[,. ]?205"],
        },
    ),
    # Remainder computed via the Python REPL tool; 31515261 % 425 == 236.
    # NOTE(review): the name says "Multiplication" although the task is a
    # divisibility/remainder check - left unchanged in case results are keyed
    # by name; confirm and rename if safe.
    363: Test(
        name="Complex Multiplication Python",
        runnable=one_tool_call_answer,
        runnable_input={
            "system_msg": "You are a helpful assistant.",
            "human_msg": "Is 31515261 divisible by 425? If not, whats the remainder?",
            "tools": {"save_python_repl": save_python_repl},
        },
        validator=regex_match_any,
        validation_input={
            "patterns": ["236", "two ?hundred and thirty ?six", "two ?hundred thirty ?six"],
        },
    ),
    # Note lookup with both the time-span and the full-text search tool available.
    # NOTE(review): the "Write note" key (here and in the two variants below)
    # breaks the convention of keying each tool by its function name - confirm
    # the runnables do not look tools up by the name the model calls.
    283: Test(
        name="Notes from last Saturday",
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": _NOTES_AGENT_SYSTEM_MSG,
            "human_msg": _GRANDMA_CALL_QUESTION,
            "tools": {
                "get_current_date_and_time": get_current_date_and_time,
                "get_notes_in_timespan": get_notes_in_timespan,
                "get_notes_containing": get_notes_containing,
                "Write note": write_note,
            },
        },
        validator=system_human_answer_match,
        validation_input={"criteria": _CALL_WOLFGANG_CRITERIA},
    ),
    # Same question, but only the time-span lookup is offered (no text search).
    260: Test(
        name="Notes from last Saturday TSO",  # time span only
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": _NOTES_AGENT_SYSTEM_MSG,
            "human_msg": _GRANDMA_CALL_QUESTION,
            "tools": {
                "get_current_date_and_time": get_current_date_and_time,
                "get_notes_in_timespan": get_notes_in_timespan,
                "Write note": write_note,
            },
        },
        validator=system_human_answer_match,
        validation_input={"criteria": _CALL_WOLFGANG_CRITERIA},
    ),
    # Time-span-only variant preceded by two worked example conversations
    # ("fsp_messages"; FSP presumably stands for few-shot prompting).
    856: Test(
        name="Notes from last Saturday TSO FSP",
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": _NOTES_AGENT_SYSTEM_MSG,
            "fsp_messages": [
                # Example 1: look up the note written on the same date one year earlier.
                HumanMessage("Tomorrow is the anniversary! Any tips what I should by her?"),  # One year ago
                AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "11"}]),
                ToolMessage("Wednesday the 31st of Juli 2024 09:31", tool_call_id="11"),
                AIMessage("", tool_calls=[{"name": "get_notes_in_timespan", "args": {"begin": "2023/08/01", "to": "2023/08/01"}, "id": "12"}]),
                ToolMessage("2023/08/01 23:10 Went out with Charlotte for our anniversary. Pizza at Cavalinos. She loved the rose necklace!", tool_call_id="12"),
                AIMessage("I'm afraid I cannot be of great help, since I obviously know charlotte way less than you, but last year you two went out to Cavalinons and you got her a rose necklace as a present. And she liked it. So maybe a pair of earrings would be something she'd like?", name="example_assistant"),

                # Example 2: resolve "yesterday or the day before" into a two-day span.
                HumanMessage("Did I write down anything yesterday or the day before that?"),
                AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "21"}]),
                ToolMessage("Wednesday the 7th of August 2024 16:23", tool_call_id="21"),
                AIMessage("", tool_calls=[{"name": "get_notes_in_timespan", "args": {"begin": "2024/08/05", "to": "2024/08/06"}, "id": "22"}]),
                ToolMessage("2024/08/05 11:45 Ask Dr. Mills about the side effects of the new medication he got me.\n\n2024/08/06 18:30 Pick up the dry cleaning on Thursday; they close early on Fridays.", tool_call_id="22"),
                AIMessage("Yes. I found two entries.\n- From yesterday stating that you wanted to pickup the dry cleaning on Thursday, because they close early on Fridays\n- From Monday a note saying that you want to ask Dr. Mills about the side effects of the new medication he got you.", name="example_assistant"),
            ],
            "human_msg": _GRANDMA_CALL_QUESTION,
            "tools": {
                "get_current_date_and_time": get_current_date_and_time,
                "get_notes_in_timespan": get_notes_in_timespan,
                "Write note": write_note,
            },
        },
        validator=system_human_answer_match,
        validation_input={"criteria": _CALL_WOLFGANG_CRITERIA},
    ),
    # Placeholder ids without a test definition yet (presumably reserved for
    # future tests - confirm before reusing or removing).
    # 600: Test(),
    # 221: Test(),
    # 985: Test(),
    # 634: Test(),
    # 927: Test(),
    # 346: Test(),
    # 995: Test(),
    # 404: Test(),
    # 299: Test(),
    # 275: Test(),
    # 852: Test(),
    # 376: Test(),
    # 263: Test(),
    # 432: Test(),
    # 270: Test(),
    # 797: Test(),
    # 340: Test(),
    # 489: Test(),
    # 786: Test(),
    # 121: Test(),
    # 971: Test(),
    # 436: Test(),
    # 147: Test(),
    # 534: Test(),
    # 190: Test(),
    # 158: Test(),
    # 191: Test(),
}