Files
test-small-llms/suite_settings/tests.py
Lennart J. Kurzweg (Nx2) ae2ea6d3c9 better pattern
2024-09-30 22:24:34 +02:00

158 lines
8.6 KiB
Python

from libs.classes import Test
from libs.runnables import basic_prompt, one_tool_call_answer, agent_with_tools
from libs.validators import regex_match_any, system_human_answer_match
from libs.tools import add, multiply, get_current_date_and_time, get_notes_in_timespan, get_notes_containing, write_note, save_python_repl
from textwrap import dedent
from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage
# Registry of test cases for the small-LLM suite, keyed by an integer test ID.
# Each entry is a `Test` built from:
#   name             - human-readable label
#   runnable         - the callable that drives the model
#   runnable_input   - kwargs for the runnable (prompts, tools, few-shot messages)
#   validator        - the callable that judges the model's answer
#   validation_input - kwargs for the validator (regex patterns or judge criteria)
#
# NOTE(review): prompt strings are kept verbatim (including typos) because they
# are model input; changing them would make results incomparable with earlier
# runs.

# Prompt, question and judge criteria shared by the three "Notes from last
# Saturday" variants (283, 260, 856). Defined once so the variants cannot
# drift apart by accident; strings are immutable, so sharing them is safe.
_NOTES_SYSTEM_MSG = 'You are a helpful assistant. You can use tools to accomplish tasks. Once you\'ve called a tool, the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling. To know the date, use the tool get_current_date_and_time.'
_NOTES_HUMAN_MSG = "Last Saturday, who did grandma want me to call?"
# The text starts on the line after the opening quotes (with a backslash) so
# that dedent() has a common indent to strip from every line.
_NOTES_CRITERIA = dedent(
    """\
    - containing the information that the Human should call Wolfgang
    - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)"""
)

tests = {
    # Disabled test, kept for reference.
    # 607: Test(
    #     name="Healthy Vegetables in Chinese",
    #     runnable=basic_prompt,
    #     runnable_input={
    #         "system_msg": "You are a helpful assistant. You serve people across the globe.",
    #         "human_msg": "什么蔬菜最健康?",
    #     },
    #     validator=system_human_answer_match,
    #     validation_input={
    #         "criteria": dedent("""- in Mandarin Chinese from front to finnish
    # - factually correct
    # - about healthy vegetables
    # - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
    # Again, the message has to be entirely in Manadarin Chineese.
    # That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct"""),
    #     },
    # ),
    693: Test(
        name="Simple Multiplication",
        runnable=one_tool_call_answer,
        runnable_input={
            "system_msg": "You are a helpful assistant.",
            "human_msg": "What is 234215 times 143243?",
            "tools": {"add": add, "multiply": multiply},
        },
        validator=regex_match_any,
        # 234215 * 143243 == 33549659245; "," or "_" may separate digit groups.
        validation_input={"patterns": [r"33[,_]?549[,_]?659[,_]?245"]},
    ),
    120: Test(
        name="Complex Multiplication",
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": 'You are a helpful assistant. You can use tools to accomplish the task. Once you\'ve called a tool. the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling.',
            "human_msg": "What is 235 times 1243 times 21?",
            "tools": {"add": add, "multiply": multiply},
        },
        validator=regex_match_any,
        # 235 * 1243 * 21 == 6134205. Same separator-tolerant form as test 693;
        # it still matches both "6134205" and "6,134,205".
        validation_input={"patterns": [r"6[,_]?134[,_]?205"]},
    ),
    363: Test(
        name="Python Remainder",
        runnable=one_tool_call_answer,
        runnable_input={
            "system_msg": 'You are a helpful assistant.',
            "human_msg": 'Is 31515261 divisible by 425? If not, whats the remainder?',
            "tools": {"save_python_repl": save_python_repl},
        },
        validator=regex_match_any,
        # 31515261 % 425 == 236; the answer may be given in digits or in words.
        validation_input={
            "patterns": [
                r"236",
                r"two ?hundred and thirty ?six",
                r"two ?hundred thirty ?six",
            ]
        },
    ),
    283: Test(
        name="Notes from last Saturday",
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": _NOTES_SYSTEM_MSG,
            "human_msg": _NOTES_HUMAN_MSG,
            "tools": {
                "get_current_date_and_time": get_current_date_and_time,
                "get_notes_in_timespan": get_notes_in_timespan,
                "get_notes_containing": get_notes_containing,
                # NOTE(review): unlike the other keys, this one is not the
                # function name and contains a space - confirm how the
                # runnable uses the key before relying on it.
                "Write note": write_note,
            },
        },
        validator=system_human_answer_match,
        validation_input={"criteria": _NOTES_CRITERIA},
    ),
    260: Test(
        name="Notes from last Saturday TSO",  # time span only
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": _NOTES_SYSTEM_MSG,
            "human_msg": _NOTES_HUMAN_MSG,
            # Same as test 283 but without get_notes_containing.
            "tools": {
                "get_current_date_and_time": get_current_date_and_time,
                "get_notes_in_timespan": get_notes_in_timespan,
                "Write note": write_note,  # NOTE(review): see key remark in test 283
            },
        },
        validator=system_human_answer_match,
        validation_input={"criteria": _NOTES_CRITERIA},
    ),
    856: Test(
        # Same as test 260, plus two worked example conversations passed as
        # `fsp_messages` (FSP presumably stands for few-shot prompting - confirm).
        name="Notes from last Saturday TSO FSP",
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": _NOTES_SYSTEM_MSG,
            "fsp_messages": [
                # Example 1: look up the note written one year before "tomorrow".
                HumanMessage("Tomorrow is the anniversary! Any tips what I should by her?"),  # One year ago
                AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "11"}]),
                # NOTE(review): "Juli" is kept verbatim; confirm it mirrors the
                # real output format of get_current_date_and_time.
                ToolMessage("Wednesday the 31st of Juli 2024 09:31", tool_call_id="11"),
                AIMessage("", tool_calls=[{"name": "get_notes_in_timespan", "args": {"begin": "2023/08/01", "to": "2023/08/01"}, "id": "12"}]),
                ToolMessage("2023/08/01 23:10 Went out with Charlotte for our anniversary. Pizza at Cavalinos. She loved the rose necklace!", tool_call_id="12"),
                AIMessage("I'm afraid I cannot be of great help, since I obviously know charlotte way less than you, but last year you two went out to Cavalinons and you got her a rose necklace as a present. And she liked it. So maybe a pair of earrings would be something she'd like?", name="example_assistant"),
                # Example 2: look up notes from the two days before "today".
                HumanMessage("Did I write down anything yesterday or the day before that?"),
                AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "21"}]),
                ToolMessage("Wednesday the 7th of August 2024 16:23", tool_call_id="21"),
                AIMessage("", tool_calls=[{"name": "get_notes_in_timespan", "args": {"begin": "2024/08/05", "to": "2024/08/06"}, "id": "22"}]),
                ToolMessage("2024/08/05 11:45 Ask Dr. Mills about the side effects of the new medication he got me.\n\n2024/08/06 18:30 Pick up the dry cleaning on Thursday; they close early on Fridays.", tool_call_id="22"),
                AIMessage("Yes. I found two entries.\n- From yesterday stating that you wanted to pickup the dry cleaning on Thursday, because they close early on Fridays\n- From Monday a note saying that you want to ask Dr. Mills about the side effects of the new medication he got you.", name="example_assistant"),
            ],
            "human_msg": _NOTES_HUMAN_MSG,
            "tools": {
                "get_current_date_and_time": get_current_date_and_time,
                "get_notes_in_timespan": get_notes_in_timespan,
                "Write note": write_note,  # NOTE(review): see key remark in test 283
            },
        },
        validator=system_human_answer_match,
        validation_input={"criteria": _NOTES_CRITERIA},
    ),
    # Placeholder IDs without a test definition yet (presumably reserved for
    # future tests - confirm before reusing or removing).
    # 600: Test(),
    # 221: Test(),
    # 985: Test(),
    # 634: Test(),
    # 927: Test(),
    # 346: Test(),
    # 995: Test(),
    # 404: Test(),
    # 299: Test(),
    # 275: Test(),
    # 852: Test(),
    # 376: Test(),
    # 263: Test(),
    # 432: Test(),
    # 270: Test(),
    # 797: Test(),
    # 340: Test(),
    # 489: Test(),
    # 786: Test(),
    # 121: Test(),
    # 971: Test(),
    # 436: Test(),
    # 147: Test(),
    # 534: Test(),
    # 190: Test(),
    # 158: Test(),
    # 191: Test(),
}