From 298d8c83efb4f55fb29c0ee71a3d8fe9e346415e Mon Sep 17 00:00:00 2001
From: "Lennart J. Kurzweg (Nx2)"
Date: Wed, 14 Aug 2024 21:02:12 +0200
Subject: [PATCH] restructuring

---
 suite_settings/models.py |  15 +++
 suite_settings/seeds.py  |  19 ++++
 suite_settings/tests.py  | 171 +++++++++++++++++++++++++++++++++++++++
 test_small_llms.py       |  75 -----------------
 test_suite.py            |  20 ++++
 5 files changed, 225 insertions(+), 75 deletions(-)
 create mode 100644 suite_settings/models.py
 create mode 100644 suite_settings/seeds.py
 create mode 100644 suite_settings/tests.py
 delete mode 100644 test_small_llms.py
 create mode 100644 test_suite.py

diff --git a/suite_settings/models.py b/suite_settings/models.py
new file mode 100644
index 0000000..4b3181d
--- /dev/null
+++ b/suite_settings/models.py
@@ -0,0 +1,15 @@
+"""Model identifiers that test_suite.py passes to run_tests as `models`."""
+
+models = [
+    "llama3.1", # 8b
+    "llama3.1:70b",
+    "llama3-groq-tool-use", # latest
+    "llama3-groq-tool-use:70b",
+    # "mixtral:8x7b",
+    "mixtral:8x22b",
+    # "gemma2:2b",
+    # "phi3", # 3.8b
+    # "tinyllama:1.1b",
+    "mistral-nemo:12b",
+    "command-r-plus:104b",
+]
diff --git a/suite_settings/seeds.py b/suite_settings/seeds.py
new file mode 100644
index 0000000..5d5734e
--- /dev/null
+++ b/suite_settings/seeds.py
@@ -0,0 +1,19 @@
+"""Seed values that test_suite.py passes to run_tests as `seeds`."""
+
+seeds = [
+    12345678,
+    123456,
+    2,
+    222,
+    22222,
+    2222222,
+    31415926,
+    404,
+    62951413,
+    8008,
+    99999999,
+    789654,
+    10293847,
+    42,
+    911,
+]
diff --git a/suite_settings/tests.py b/suite_settings/tests.py
new file mode 100644
index 0000000..c004ae2
--- /dev/null
+++ b/suite_settings/tests.py
@@ -0,0 +1,171 @@
+"""Test definitions for the suite: a dict mapping an integer key to a Test.
+
+test_suite.py imports `tests` from here and passes it to run_tests.
+"""
+
+from libs.test_class import Test
+from libs.runnables import *
+from libs.validators import *
+from libs.tools import *
+from textwrap import dedent
+from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage
+
+# NOTE: every dedent literal below opens with a backslash-newline so that its
+# first line is indented like the following ones. If the text started right
+# after the opening quotes, dedent would find no common margin and strip nothing.
+
+tests = {
+    607: Test(
+        name="Healthy Vegetables in Chinese",
+        runnable=basic,
+        runnable_input={
+            "system_msg": "You are a helpful assistant. You serve people across the globe.",
+            "human_msg": "什么蔬菜最健康?",
+        },
+        validator=system_human_answer_match,
+        validation_input={
+            "criteria": dedent("""\
+                - in Mandarin Chinese from front to finnish
+                - factually correct
+                - about healthy vegetables
+                - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
+
+                Again, the message has to be entirely in Manadarin Chineese.
+                That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct"""),
+        }
+    ),
+    693: Test(
+        name="Simple Multiplication",
+        runnable=one_tool_call_answer,
+        runnable_input={
+            "system_msg": "You are a helpful assistant.",
+            "human_msg": "What is 234215 times 143243?",
+            "tools": {
+                "add": add,
+                "multiply": multiply
+            }
+        },
+        validator=regex_match_any,
+        validation_input={
+            "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
+            # "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
+        }
+    ),
+    120: Test(
+        name="Complex Multiplication",
+        runnable=agent_with_tools,
+        runnable_input={
+            "system_msg": 'You are a helpful assistant. You can use tools to accomplish the task. Once you\'ve called a tool. the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling.',
+            "human_msg": "What is 235 times 1243 times 21?",
+            "tools": {
+                "add": add,
+                "multiply": multiply
+            }
+        },
+        validator=regex_match_any,
+        validation_input={
+            "patterns": [ "6134205", "6.134.205", "6,134,205" ]
+        }
+    ),
+    283: Test(
+        name="Notes from last Saturday",
+        runnable=agent_with_tools,
+        runnable_input={
+            "system_msg": "You are a helpful assistant. You can use tools to accomplish tasks. Once you've called a tool, the resulting tool_message content can be taken into consideration again. With that you can do \"multiple rounds\" of tool calling. To know the date, use the tool get_current_date_and_time.",
+            "human_msg": "Last Saturday, who did grandma want me to call?",
+            "tools": {
+                "get_current_date_and_time": get_current_date_and_time,
+                "get_notes_in_timespan": get_notes_in_timespan,
+                "get_notes_containing": get_notes_containing,
+                "Write note": write_note
+            }
+        },
+        validator=system_human_answer_match,
+        validation_input={
+            "criteria": dedent("""\
+                - containing the information that the Human should call Wolfgang
+                - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
+        }
+    ),
+    260: Test(
+        name="Notes from last Saturday TSO", # time span only
+        runnable=agent_with_tools,
+        runnable_input={
+            "system_msg": "You are a helpful assistant. You can use tools to accomplish tasks. Once you've called a tool, the resulting tool_message content can be taken into consideration again. With that you can do \"multiple rounds\" of tool calling. To know the date, use the tool get_current_date_and_time.",
+            "human_msg": "Last Saturday, who did grandma want me to call?",
+            "tools": {
+                "get_current_date_and_time": get_current_date_and_time,
+                "get_notes_in_timespan": get_notes_in_timespan,
+                "Write note": write_note
+            }
+        },
+        validator=system_human_answer_match,
+        validation_input={
+            "criteria": dedent("""\
+                - containing the information that the Human should call Wolfgang
+                - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
+        }
+    ),
+    856: Test(
+        name="Notes from last Saturday TSO FSP",
+        runnable=agent_with_tools_fsp,
+        runnable_input={
+            "system_msg": "You are a helpful assistant. You can use tools to accomplish tasks. Once you've called a tool, the resulting tool_message content can be taken into consideration again. With that you can do \"multiple rounds\" of tool calling. To know the date, use the tool get_current_date_and_time.",
+            "fsp_messages": [
+                HumanMessage("Tomorrow is the anniversary! Any tips what I should by her?"), # One year ago
+                AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "11" }]),
+                ToolMessage("Wednesday the 31st of Juli 2024 09:31", tool_call_id="11" ),
+                AIMessage("", tool_calls=[{"name": "get_notes_in_timespan", "args": {"begin": "2023/08/01", "to": "2023/08/01"}, "id": "12"}], ),
+                ToolMessage("2023/08/01 23:10 Went out with Charlotte for our anniversary. Pizza at Cavalinos. She loved the rose necklace!", tool_call_id="12"),
+                AIMessage("I'm afraid I cannot be of great help, since I obviously know charlotte way less than you, but last year you two went out to Cavalinons and you got her a rose necklace as a present. And she liked it. So maybe a pair of earrings would be something she'd like?", name="example_assistant", ),
+
+                HumanMessage("Did I write down anything yesterday or the day before that?"),
+                AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "21" }], ),
+                ToolMessage("Wednesday the 7th of August 2024 16:23", tool_call_id="21" ),
+                AIMessage("", tool_calls=[{"name": "get_notes_in_timespan", "args": {"begin": "2024/08/05", "to": "2024/08/06"}, "id": "22"}], ),
+                ToolMessage("2024/08/05 11:45 Ask Dr. Mills about the side effects of the new medication he got me.\n\n2024/08/06 18:30 Pick up the dry cleaning on Thursday; they close early on Fridays.", tool_call_id="22"),
+                AIMessage("Yes. I found two entries.\n- From yesterday stating that you wanted to pickup the dry cleaning on Thursday, because they close early on Fridays\n- From Monday a note saying that you want to ask Dr. Mills about the side effects of the new medication he got you.", name="example_assistant", ),
+            ],
+            "human_msg": "Last Saturday, who did grandma want me to call?",
+            "tools": {
+                "get_current_date_and_time": get_current_date_and_time,
+                "get_notes_in_timespan": get_notes_in_timespan,
+                "Write note": write_note
+            }
+        },
+        validator=system_human_answer_match,
+        validation_input={
+            "criteria": dedent("""\
+                - containing the information that the Human should call Wolfgang
+                - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
+        }
+    ),
+    # 363: Test(),
+    # 600: Test(),
+    # 221: Test(),
+    # 985: Test(),
+    # 634: Test(),
+    # 927: Test(),
+    # 346: Test(),
+    # 995: Test(),
+    # 404: Test(),
+    # 299: Test(),
+    # 275: Test(),
+    # 852: Test(),
+    # 376: Test(),
+    # 263: Test(),
+    # 432: Test(),
+    # 270: Test(),
+    # 797: Test(),
+    # 340: Test(),
+    # 489: Test(),
+    # 786: Test(),
+    # 121: Test(),
+    # 971: Test(),
+    # 436: Test(),
+    # 147: Test(),
+    # 534: Test(),
+    # 190: Test(),
+    # 158: Test(),
+    # 191: Test(),
+}
diff --git a/test_small_llms.py b/test_small_llms.py
deleted file mode 100644
index 1d1af44..0000000
--- a/test_small_llms.py
+++ /dev/null
@@ -1,75 +0,0 @@
-from libs.test_class import Test
-from libs.run_tests import run_tests
-from libs.runnables import *
-from libs.validators import *
-from libs.tools import *
-
-from pprint import pprint
-
-def main():
-    models = [
-        "llama3.1", # 8b
-        "llama3.1:70b",
-        "llama3-groq-tool-use", # latest
-        "llama3-groq-tool-use:70b",
-        # "mixtral:8x7b",
-        "mixtral:8x22b",
-        # "gemma2:2b",
-        # "phi3", # 3.8b
-        # "tinyllama:1.1b",
-        "mistral-nemo:12b",
-        # "command-r-plus:104b",
-    ]
-    seeds = [
-        2,
-        222,
-        22222,
-        2222222
-    ]
-    tests = [
-        Test(
-            name="Chinese Fruit",
-            runnable=basic,
-            runnable_input={
-                "system_msg": "You are a helpful assistant. You serve people across the globe.",
-                "human_msg": "什么蔬菜最健康?",
-            },
-            validator=system_human_answer_match,
-            validation_input={
-                "criteria": """- in Mandarin Chinese from front to finnish
-- factually correct
-- about healthy vegetables
-- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
-
-Again, the message has to be entirely in Manadarin Chineese.
-That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct""",
-            }
-        ),
-        Test(
-            name="Simple Multiplication",
-            runnable=one_tool_call_answer,
-            runnable_input={
-                "system_msg": "You are a helpful assistant.",
-                "human_msg": "What is 234215 times 143243?",
-                "tools": {
-                    "add": add,
-                    "multiply": multiply
-                }
-            },
-            validator=regex_match_any,
-            validation_input={
-                "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
-                # "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
-            }
-        ),
-    ]
-
-    results = run_tests(
-        models=models,
-        seeds=seeds,
-        tests=tests,
-        base_url="http://bolt.hs-mittweida.de:11434"
-    )
-
-if __name__ == "__main__":
-    main()
diff --git a/test_suite.py b/test_suite.py
new file mode 100644
index 0000000..aa91b29
--- /dev/null
+++ b/test_suite.py
@@ -0,0 +1,20 @@
+"""Entry point that hands the suite_settings models, seeds and tests to run_tests."""
+
+from libs.run_tests import run_tests
+from suite_settings.models import models
+from suite_settings.seeds import seeds
+from suite_settings.tests import tests
+
+
+def main():
+    """Call run_tests with the configured models, seeds, tests and server URL."""
+    run_tests(
+        models=models,
+        seeds=seeds,
+        tests=tests,
+        base_url="http://bolt.hs-mittweida.de:11434",
+    )
+
+
+if __name__ == "__main__":
+    main()