This commit is contained in:
Lennart J. Kurzweg (Nx2)
2024-08-26 21:20:47 +02:00
parent 2723ced901
commit 5d7ce3cf71
12 changed files with 2055 additions and 2350 deletions

View File

@@ -1,19 +0,0 @@
#!/usr/bin/env bash
# Forces a rebuild of the direnv environment for the project directory below
# and then refreshes the timestamps so that the result counts as up to date.
set -e

project_dir="/home/nx2/test-small-llms"
envrc_file="$project_dir/.envrc"

# Bail out early when the project directory is gone (e.g. it was moved).
if ! [[ -d "$project_dir" ]]; then
    echo "Cannot find source directory; Did you move it?"
    echo "(Looking for $project_dir)"
    echo 'Cannot force reload with this script - use "direnv reload" manually and then try again'
    exit 1
fi

# Rebuild the cache forcefully.
_nix_direnv_force_reload=1 direnv exec "$project_dir" true

# Bump the mtime of .envrc: direnv reloads once more, but without re-building.
touch "$envrc_file"

# Give every profile rc file the same timestamp as .envrc, which marks it as
# up to date.
touch -r "$envrc_file" "$project_dir/.direnv"/*.rc

View File

@@ -1 +0,0 @@
/nix/store/j2vf461mp9h2y9awkklbfawf3dz7cs1p-nix-shell-env

File diff suppressed because it is too large Load Diff

View File

@@ -20,11 +20,10 @@ from langchain_core.language_models import LanguageModelInput
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage, BaseMessage, ToolCall
from langchain_core.outputs import ChatGeneration, ChatResult
from langchain_core.prompts import SystemMessagePromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import Runnable
from langchain_core.tools import BaseTool, Tool
from langchain_core.utils.pydantic import is_basemodel_instance, is_basemodel_subclass
from textwrap import dedent
from libs.functions import nxhash
@@ -98,14 +97,15 @@ def _is_pydantic_class(obj: Any) -> bool:
is_basemodel_subclass(obj) or BaseModel in obj.__bases__
)
class OllamaFunctions(ChatOllama):
class OllamaFunctionsBase(ChatOllama):
"""Function chat model that uses Ollama API."""
tool_system_prompt_template: str = DEFAULT_SYTEM_PROMPT
tool_system_prompt_template_with_history: str = DEFAULT_SYTEM_PROMPT_WITH_HISTORY
max_tool_call_fails: int = 5
def __init__(self, max_tool_call_fails, **kwargs: Any) -> None:
def __init__(self, **kwargs: Any) -> None:
super().__init__(**kwargs)
def bind_tools(
@@ -115,6 +115,8 @@ class OllamaFunctions(ChatOllama):
) -> Runnable[LanguageModelInput, BaseMessage]:
return self.bind(functions=tools, **kwargs)
# Hook that builds the list of messages which is finally passed to the model.
# It receives the conversation (`messages`) and the JSON text describing the
# bound tools (`functions_str`); the generation code calls it right before
# delegating to the parent class' `_generate`.
# This base implementation only raises NotImplementedError, so a subclass has
# to provide the actual prompt layout.
def _get_final_message(self, messages: list, functions_str: str) -> list:
raise NotImplementedError
def _generate(self, messages: List[BaseMessage], stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None, **kwargs: Any) -> ChatResult:
def _convert_to_ollama_tool(self, tool: Any) -> Dict:
@@ -177,11 +179,11 @@ class OllamaFunctions(ChatOllama):
return called_tool
def _extract_conversaional_response(self, d: dict) -> str:
if ("tool_input" in d and "response" in d["tool_input"]):
if ("tool_input" in d and d["tool_input"] and "response" in d["tool_input"]):
response = d["tool_input"]["response"]
elif ("input" in d and "response" in d["input"]):
elif ("input" in d and d["input"] and "response" in d["input"]):
response = d["input"]["response"]
elif ("args" in d and "response" in d["args"]):
elif ("args" in d and d["args"] and "response" in d["args"]):
response = d["args"]["response"]
elif "response" in d:
response = d["response"]
@@ -220,66 +222,6 @@ class OllamaFunctions(ChatOllama):
called_tool_args = {}
return called_tool_args
def _get_final_message(self, messages: list, functions_str: str) -> list:
    """Build the list of messages that is sent to the model.

    Without any ToolMessage in ``messages`` the tool system prompt is simply
    put in front of the unchanged conversation. As soon as a ToolMessage is
    present, the conversation is collapsed into one single system message:
    ``messages[0]`` is handed to the template as ``system_msg`` and every
    later message is rendered into a plain-text ``history``.

    Args:
        messages: The conversation so far.
        functions_str: Description of the available tools; passed to the
            prompt template as ``tools``.

    Returns:
        The messages to generate from.

    Raises:
        TypeError: If the history contains a message that is neither a
            SystemMessage, HumanMessage, AIMessage nor ToolMessage.
    """

    def _format_tools_for_history(tool_calls: list[ToolCall]) -> str:
        # Ids are shortened to the last four characters of their nxhash so
        # that a tool reply can be matched to its call inside the text.
        calls = [
            {
                "id": nxhash(c['id'])[-4:],
                "tool": c['name'],
                "args": c['args']
            }
            for c in tool_calls
        ]
        # A single call is dumped as an object, several calls as a list.
        payload = calls[0] if len(calls) == 1 else calls
        return json.dumps(obj=payload, ensure_ascii=False, indent=2)

    def _render_for_history(m: BaseMessage) -> str:
        # The order of the checks matters: an AIMessage with tool calls is
        # rendered as a tool call, one without as a plain statement.
        if isinstance(m, SystemMessage):
            return "The system provided the info:\n" + str(m.content)
        if isinstance(m, HumanMessage):
            return "The Human said:\n" + str(m.content)
        if isinstance(m, AIMessage) and m.tool_calls:
            return "So you called the tool" + (":\n" if len(m.tool_calls) == 1 else "s:\n") + _format_tools_for_history(m.tool_calls)
        if isinstance(m, ToolMessage):
            return "To which the tool (" + nxhash(m.tool_call_id)[-4:] + ") replied with:\n" + str(m.content)
        if isinstance(m, AIMessage):
            return "You said:\n" + str(m.content)
        raise TypeError("OllamaFunctions only supports SystemMessage HumanMessage ToolMessage AIMessage but got " + str(type(m)))

    # prepare generation with history
    if any(isinstance(m, ToolMessage) for m in messages):
        formated_history = "\n\n".join(_render_for_history(m) for m in messages[1:])
        system_message_prompt_template = SystemMessagePromptTemplate.from_template(self.tool_system_prompt_template_with_history)
        # NOTE(review): the first message object itself (not its .content) is
        # passed as system_msg, exactly as before - confirm this is intended.
        system_message = system_message_prompt_template.format(
            tools=functions_str,
            history=formated_history,
            system_msg=messages[0]
        )
        return [ system_message ]

    # prepare generation without history
    system_message_prompt_template = SystemMessagePromptTemplate.from_template(self.tool_system_prompt_template)
    system_message = system_message_prompt_template.format(
        tools=functions_str
    )
    return [ system_message ] + messages
def gen(self, failed_tool_calls: int, messages: list) -> ChatResult:
@@ -289,7 +231,7 @@ class OllamaFunctions(ChatOllama):
functions_str = json.dumps(functions_list, indent=2)
# get messages to prompt with
final_messages = _get_final_message(self, messages=messages, functions_str=functions_str)
final_messages = self._get_final_message(messages=messages, functions_str=functions_str)
# genrerate chat result
response_message = super()._generate(final_messages, stop=stop, run_manager=run_manager, **kwargs)
@@ -329,6 +271,125 @@ class OllamaFunctions(ChatOllama):
return gen(self, failed_tool_calls=0, messages=messages)
class OllamaFunctionsLSM(OllamaFunctionsBase):
    """Function chat model that uses Ollama API (LSM: Long System Message).

    Once the conversation contains a ToolMessage, the whole conversation is
    rendered as text and packed into one single system message.
    """

    def _get_final_message(self, messages: list, functions_str: str) -> list:
        """Build the list of messages that is sent to the model.

        Without any ToolMessage in ``messages`` the tool system prompt is
        simply put in front of the unchanged conversation. As soon as a
        ToolMessage is present, ``messages[0]`` is handed to the history
        template as ``system_msg`` and every later message is rendered into a
        plain-text ``history``; the result is one single system message.

        Args:
            messages: The conversation so far.
            functions_str: Description of the available tools; passed to the
                prompt template as ``tools``.

        Returns:
            The messages to generate from.

        Raises:
            TypeError: If the history contains a message that is neither a
                SystemMessage, HumanMessage, AIMessage nor ToolMessage.
        """

        def _format_tools_for_history(tool_calls: list[ToolCall]) -> str:
            # Ids are shortened to the last four characters of their nxhash so
            # that a tool reply can be matched to its call inside the text.
            calls = [
                {
                    "id": nxhash(c['id'])[-4:],
                    "tool": c['name'],
                    "args": c['args']
                }
                for c in tool_calls
            ]
            # A single call is dumped as an object, several calls as a list.
            payload = calls[0] if len(calls) == 1 else calls
            return json.dumps(obj=payload, ensure_ascii=False, indent=2)

        def _render_for_history(m: BaseMessage) -> str:
            # The order of the checks matters: an AIMessage with tool calls is
            # rendered as a tool call, one without as a plain statement.
            if isinstance(m, SystemMessage):
                return "The system provided the info:\n" + str(m.content)
            if isinstance(m, HumanMessage):
                return "The Human said:\n" + str(m.content)
            if isinstance(m, AIMessage) and m.tool_calls:
                return "So you called the tool" + (":\n" if len(m.tool_calls) == 1 else "s:\n") + _format_tools_for_history(m.tool_calls)
            if isinstance(m, ToolMessage):
                return "To which the tool (" + nxhash(m.tool_call_id)[-4:] + ") replied with:\n" + str(m.content)
            if isinstance(m, AIMessage):
                return "You said:\n" + str(m.content)
            raise TypeError("OllamaFunctions only supports SystemMessage HumanMessage ToolMessage AIMessage but got " + str(type(m)))

        # prepare generation with history
        if any(isinstance(m, ToolMessage) for m in messages):
            formated_history = "\n\n".join(_render_for_history(m) for m in messages[1:])
            system_message_prompt_template = SystemMessagePromptTemplate.from_template(self.tool_system_prompt_template_with_history)
            # NOTE(review): the first message object itself (not its .content)
            # is passed as system_msg, exactly as before - confirm this is
            # intended.
            system_message = system_message_prompt_template.format(
                tools=functions_str,
                history=formated_history,
                system_msg=messages[0]
            )
            return [ system_message ]

        # prepare generation without history
        system_message_prompt_template = SystemMessagePromptTemplate.from_template(self.tool_system_prompt_template)
        system_message = system_message_prompt_template.format(
            tools=functions_str
        )
        return [ system_message ] + messages

    @property
    def _llm_type(self) -> str:
        """Identifier of this chat model variant."""
        return "ollama_functions_lsm"
class OllamaFunctionsT2S(OllamaFunctionsBase):
    """Function chat model that uses Ollama API (T2S: Tool to System Messages).

    Tool traffic is replayed as ordinary chat messages: tool results become
    system messages and tool calls become the JSON text of the call.
    """

    def _get_final_message(self, messages: list, functions_str: str) -> list:
        """Build the list of messages that is sent to the model.

        The tool system prompt always comes first. Without any ToolMessage the
        conversation follows unchanged. Otherwise every ToolMessage is turned
        into a SystemMessage quoting the reply, every AIMessage carrying tool
        calls is turned into an AIMessage whose content is the JSON of the
        call(s), and all other messages are passed through as they are.

        Args:
            messages: The conversation so far.
            functions_str: Description of the available tools; passed to the
                prompt template as ``tools``.

        Returns:
            The messages to generate from.
        """
        # Both layouts start with the same system prompt, so build it once.
        system_message_prompt_template = SystemMessagePromptTemplate.from_template(self.tool_system_prompt_template)
        system_message = system_message_prompt_template.format(tools=functions_str)

        # prepare generation without history
        if not any(isinstance(m, ToolMessage) for m in messages):
            return [ system_message ] + messages

        # prepare generation with history
        transformed_messages = []
        for m in messages:
            if isinstance(m, ToolMessage):
                # NOTE(review): m.name may be unset on hand-written
                # ToolMessages, which yields "The Tool 'None' ..." - confirm
                # whether a fallback is wanted.
                transformed_messages.append(SystemMessage(content=(
                    f"The Tool '{m.name}' replied with:" + "\n" + str(m.content)
                )))
            elif isinstance(m, AIMessage) and m.tool_calls:
                calls = [
                    {
                        "tool": call['name'],
                        "tool_input": call['args']
                    }
                    for call in m.tool_calls
                ]
                # A single call is dumped as an object, several as a list.
                payload = calls[0] if len(calls) == 1 else calls
                transformed_messages.append(AIMessage(content=json.dumps(payload)))
            else:
                # Catch-all: human/system messages and AI messages without
                # tool calls must stay in the conversation instead of being
                # dropped.
                transformed_messages.append(m)

        return [ system_message ] + transformed_messages

    @property
    def _llm_type(self) -> str:
        """Identifier of this chat model variant."""
        return "ollama_functions_t2s"

View File

@@ -79,8 +79,8 @@ def run_tests(models: dict[int, Model], seeds: list[int], tests: dict[int, Test]
'technique_name': technique.name,
})
# if hash_key == "DE3D137E":
# pass
if hash_key == "0DEB2030":
pass
if hash_key not in saved_results.keys():
try:
@@ -105,7 +105,7 @@ def run_tests(models: dict[int, Model], seeds: list[int], tests: dict[int, Test]
"\033[0;35m)\033[0m",
end=""
)
answer = test.runnable(model=model, seed=seed, test=test, base_url=base_url)
answer = test.runnable(model=model, seed=seed, test=test, technique=technique, base_url=base_url)
if isinstance(answer, str):
combination['answer'] = answer
# combination['tool_calls'] = [] # no entry
@@ -172,13 +172,15 @@ def run_tests(models: dict[int, Model], seeds: list[int], tests: dict[int, Test]
try:
entry = {
'test_name': result['test_name'],
'test_id': result['test_id'],
'model_name': result['model_name'],
'model_id': result['model_id'],
'seed': result['seed'],
'answer': result['answer'],
'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url),
'test_name': result['test_name'],
'test_id': result['test_id'],
'model_name': result['model_name'],
'model_id': result['model_id'],
'technique_name': result['technique_name'],
'technique_id': result['technique_id'],
'seed': result['seed'],
'answer': result['answer'],
'validation': result['test'].validator(test=result['test'], answer=result['answer'], base_url=base_url),
}
except Exception as e:
print("\033[0;31mError validating entry (\033[0m" + hash_key + "\033[0;31m). <\033[0m" + str(e) + "\033[0;31m> Continuing...\033[0m ")

View File

@@ -1,8 +1,8 @@
from types import NoneType
from langchain_ollama.chat_models import ChatOllama
from libs.ollama_functions import OllamaFunctions
from libs.ollama_functions import OllamaFunctionsLSM, OllamaFunctionsT2S
from langchain_core.messages import AIMessage, SystemMessage, HumanMessage, ToolMessage
from libs.classes import Test, Model
from libs.classes import Technique, Test, Model
from langchain.tools import Tool
from typing import Literal
@@ -10,22 +10,31 @@ from langgraph.graph import StateGraph, MessagesState
import json
from pydantic import ValidationError
def _get_llm(model: Model, base_url: str, seed: int, tools: list[Tool]|NoneType = None):
if model.supports_tools:
from suite_settings.techniques import techniques
def _get_llm(model: Model, base_url: str, seed: int, technique: Technique, tools: list[Tool]|NoneType = None):
if technique == techniques[1]: # Native
llm = ChatOllama(
model=model.identifier,
seed=seed,
base_url=base_url
)
else:
llm = OllamaFunctions(
elif technique == techniques[903]: # Long System Message
llm = OllamaFunctionsLSM(
model=model.identifier,
seed=seed,
base_url=base_url,
format="json",
max_tool_call_fails=3,
temperature=0.0
)
elif technique == techniques[572]: # ToolMessages to SystemMessages
llm = OllamaFunctionsT2S(
model=model.identifier,
seed=seed,
base_url=base_url,
format="json",
)
else:
raise ValueError("Unkown Technique in _get_llm()")
if tools:
llm = llm.bind_tools(tools=tools)
@@ -33,7 +42,7 @@ def _get_llm(model: Model, base_url: str, seed: int, tools: list[Tool]|NoneType
return llm
def basic_prompt(model: Model, seed: int, test: Test, base_url: str) -> str:
def basic_prompt(model: Model, seed: int, test: Test, technique: Technique, base_url: str) -> str:
messages = [SystemMessage(test.runnable_input['system_msg'])]
try:
@@ -42,20 +51,20 @@ def basic_prompt(model: Model, seed: int, test: Test, base_url: str) -> str:
pass
messages += [ HumanMessage(test.runnable_input['human_msg']) ]
llm = _get_llm(model=model, base_url=base_url, seed=seed)
llm = _get_llm(model=model, base_url=base_url, technique=technique, seed=seed)
ai_msg = llm.invoke(messages)
assert isinstance(ai_msg.content, str)
return ai_msg.content
def one_tool_call_answer(model: Model, seed: int, test: Test, base_url: str) -> dict:
def one_tool_call_answer(model: Model, seed: int, test: Test, technique: Technique, base_url: str) -> dict:
tools_dict = test.runnable_input['tools']
tools = []
for key in tools_dict:
tools.append(tools_dict[key])
llm = _get_llm(model=model, base_url=base_url, seed=seed, tools=tools)
llm = _get_llm(model=model, base_url=base_url, seed=seed, technique=technique, tools=tools)
messages = [SystemMessage(test.runnable_input['system_msg'])]
try:
@@ -108,7 +117,7 @@ def one_tool_call_answer(model: Model, seed: int, test: Test, base_url: str) ->
"tool_calls": tool_calls,
}
def agent_with_tools(model: Model, seed: int, test: Test, base_url: str) -> dict[str, str|list]:
def agent_with_tools(model: Model, seed: int, test: Test, technique: Technique, base_url: str) -> dict[str, str|list]:
tool_calls = []
index = -1
@@ -173,7 +182,7 @@ def agent_with_tools(model: Model, seed: int, test: Test, base_url: str) -> dict
for key in tools_dict:
tools.append(tools_dict[key])
tool_node = NxToolNode(tools)
llm = _get_llm(model=model, base_url=base_url, seed=seed, tools=tools)
llm = _get_llm(model=model, base_url=base_url, seed=seed, technique=technique, tools=tools)
workflow = StateGraph(MessagesState)

View File

@@ -141,6 +141,9 @@ def get_notes_containing(patterns: Union[list[str], str]) -> str:
ret += f"{datetime.strftime(entry.time, '%Y/%m/%d %H:%M')} {entry.content}"
is_first = False
if ret == "":
ret = "No matching notes were found. Try diffrent patterns."
return ret
@tool

View File

@@ -1,96 +0,0 @@
import json
import os
import shutil
import sys
def print_help():
    """Write the usage examples of this script to stdout."""
    usage = """Example usages:
python print_saved_results.py
python print_saved_results.py -m llama3.1
python print_saved_results.py -m llama3.1,mixtral-nemo:12b
python print_saved_results.py -m llama3.1 -s 2222,2 -t "Healthy Vegetables in Chinese"
Note: If one of the "fileters" does not exist, no error is thrown."""
    print(usage)
def _pop_option(args: list[str], flag: str) -> "list[str] | None":
    """Remove ``flag`` and its value from ``args`` and return the comma-split value.

    Returns None when ``flag`` is not present. Raises ValueError when the flag
    has no value, or when the value is empty or starts with ``-``.
    """
    if flag not in args:
        return None
    position = args.index(flag)
    if position + 1 >= len(args):
        raise ValueError(f"missing value for {flag}")
    value = args[position + 1]
    if not value or value.startswith("-"):
        raise ValueError(f"invalid value for {flag}")
    del args[position:position + 2]
    return value.split(",")


def _model_label(result: dict) -> str:
    """Return the model name stored in a saved result."""
    # NOTE(review): the result entries built by the test suite appear to use
    # the key 'model_name'; 'model' is still preferred when present. Confirm
    # against the layout of saved_results.json.
    if "model" in result:
        return result["model"]
    return result["model_name"]


def main(argv: list[str]) -> None:
    """Print the entries of ./saved_results.json, optionally filtered.

    Supported arguments (after the script name in ``argv[0]``):
    ``-h`` prints the usage and exits; ``-m``, ``-s`` and ``-t`` each take a
    comma-separated list and restrict the output to the given models, seeds
    and test names. A filter value that matches nothing simply yields no
    output for it.

    Args:
        argv: The full argument vector including the script name. It is not
            modified.

    Raises:
        SystemExit: With code 0 after ``-h``; with code 1 when the results
            file cannot be loaded or the arguments are malformed.
    """
    # Work on a copy so the caller's list (usually sys.argv) stays untouched.
    args = list(argv)

    # Help must work even when no results file exists yet.
    if "-h" in args:
        print_help()
        sys.exit(0)

    try:
        with open("./saved_results.json", "r", encoding="utf-8") as f:
            saved_results = json.load(fp=f)
    except FileNotFoundError:
        print("saved_results.json not found. Try running test_suite.py first.")
        sys.exit(1)
    except (OSError, json.JSONDecodeError) as err:
        print(f"saved_results.json could not be read: {err}")
        sys.exit(1)

    try:
        models = _pop_option(args, "-m")
        seeds = _pop_option(args, "-s")
        tests = _pop_option(args, "-t")
    except ValueError:
        print("Syntax error. Run `python print_saved_results.py -h` for help.")
        print_help()
        sys.exit(1)

    # Everything that is left besides the script name is an unknown argument.
    leftover = args[1:]
    if leftover:
        print("Syntax error. Run `python print_saved_results.py -h` for help.")
        print(f"Got unkown argument{'s' if len(leftover) != 1 else ''}: {leftover}")
        print_help()
        sys.exit(1)

    # shutil falls back to a default size when stdout is not a terminal
    # (e.g. when the output is piped), where os.get_terminal_size() raises.
    columns = shutil.get_terminal_size().columns

    first_print = True
    for result in saved_results.values():
        model = _model_label(result)
        if models is not None and model not in models:
            continue
        # Seeds are compared as strings because the filter comes from argv.
        if seeds is not None and str(result['seed']) not in seeds:
            continue
        if tests is not None and result['test_name'] not in tests:
            continue

        # Separator line between entries, but not before the first one.
        if not first_print:
            print('-' * columns)
        print(
            "\n" +
            "\033[0;36mTest name:\033[0m " +
            result['test_name'] +
            "\n\033[0;36mModel:\033[0m " +
            model +
            "\n\033[0;36mSeed:\033[0m " +
            str(result['seed']) +
            "\n\033[0;36mValidation result:\033[0m " +
            str(result['validation']) +
            "\n\033[0;36mAnswer: »\033[0m" +
            result['answer'] +
            "\033[0;36m«\033[0m" +
            "\n"
        )
        first_print = False
# Script entry point: run the command-line interface with the process
# arguments only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
main(argv=sys.argv)

1651
saved_results-bak.json Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,90 +1,90 @@
from libs.classes import Model
models = {
245: Model(
display_name="llama3.1 8b",
identifier="llama3.1",
supports_tools=True,
parameter_count_in_b=8
),
238: Model(
display_name="llama3.1 70b",
identifier="llama3.1:70b",
supports_tools=True,
parameter_count_in_b=70
),
120: Model(
display_name="llama3 groq TU 8b",
identifier="llama3-groq-tool-use",
supports_tools=True,
parameter_count_in_b=8
),
890: Model(
display_name="llama3 groq TU 70b",
identifier="llama3-groq-tool-use:70b",
supports_tools=True,
parameter_count_in_b=70
),
348: Model(
display_name="Mixtral MoE 8x7b",
identifier="mixtral:8x7b",
supports_tools=False,
parameter_count_in_b=13,
),
789: Model(
display_name="Mixtral MoE 8x22b",
identifier="mixtral:8x22b",
supports_tools=True,
parameter_count_in_b=39
),
445: Model(
display_name="Gemma2 2b",
identifier="gemma2:2b",
supports_tools=False,
parameter_count_in_b=2
),
475: Model(
display_name="Gemma2 9b",
identifier="gemma2:2b",
supports_tools=False,
parameter_count_in_b=9
),
626: Model(
display_name="Gemma2 27b",
identifier="gemma2:2b",
supports_tools=False,
parameter_count_in_b=27
),
229: Model(
display_name="Phi3 3.8b",
identifier="phi3",
supports_tools=False,
parameter_count_in_b=3.8
),
903: Model(
display_name="Tinyllama 1.1b",
identifier="tinyllama:1.1b",
supports_tools=False,
parameter_count_in_b=1.1
),
670: Model(
display_name="Mistral Nemo 12b",
identifier="mistral-nemo:12b",
supports_tools=True,
parameter_count_in_b=12
),
404: Model(
display_name="Command R+ 104b",
identifier="command-r-plus:104b",
supports_tools=True,
parameter_count_in_b=104
),
701: Model(
display_name="Yi 6b",
identifier="yi:6b",
supports_tools=False,
parameter_count_in_b=6
),
# 245: Model(
# display_name="llama3.1 8b",
# identifier="llama3.1",
# supports_tools=True,
# parameter_count_in_b=8
# ),
# 238: Model(
# display_name="llama3.1 70b",
# identifier="llama3.1:70b",
# supports_tools=True,
# parameter_count_in_b=70
# ),
# 120: Model(
# display_name="llama3 groq TU 8b",
# identifier="llama3-groq-tool-use",
# supports_tools=True,
# parameter_count_in_b=8
# ),
# 890: Model(
# display_name="llama3 groq TU 70b",
# identifier="llama3-groq-tool-use:70b",
# supports_tools=True,
# parameter_count_in_b=70
# ),
# 348: Model(
# display_name="Mixtral MoE 8x7b",
# identifier="mixtral:8x7b",
# supports_tools=False,
# parameter_count_in_b=13,
# ),
# 789: Model(
# display_name="Mixtral MoE 8x22b",
# identifier="mixtral:8x22b",
# supports_tools=True,
# parameter_count_in_b=39
# ),
# 445: Model(
# display_name="Gemma2 2b",
# identifier="gemma2:2b",
# supports_tools=False,
# parameter_count_in_b=2
# ),
# 475: Model(
# display_name="Gemma2 9b",
# identifier="gemma2:9b",
# supports_tools=False,
# parameter_count_in_b=9
# ),
# 626: Model(
# display_name="Gemma2 27b",
# identifier="gemma2:27b",
# supports_tools=False,
# parameter_count_in_b=27
# ),
# 229: Model(
# display_name="Phi3 3.8b",
# identifier="phi3",
# supports_tools=False,
# parameter_count_in_b=3.8
# ),
# 903: Model(
# display_name="Tinyllama 1.1b",
# identifier="tinyllama:1.1b",
# supports_tools=False,
# parameter_count_in_b=1.1
# ),
# 670: Model(
# display_name="Mistral Nemo 12b",
# identifier="mistral-nemo:12b",
# supports_tools=True,
# parameter_count_in_b=12
# ),
# 404: Model(
# display_name="Command R+ 104b",
# identifier="command-r-plus:104b",
# supports_tools=True,
# parameter_count_in_b=104
# ),
# 701: Model(
# display_name="Yi 6b",
# identifier="yi:6b",
# supports_tools=False,
# parameter_count_in_b=6
# ),
704: Model(
display_name="Yi 9b",
identifier="yi:9b",

View File

@@ -1,19 +1,19 @@
from libs.classes import Technique
techniques = {
190: Technique(
1: Technique(
name="Native",
for_supports_tools=True,
for_not_supports_tools=False,
),
903: Technique(
name="Long System Message",
name="LSM", # Long System Message
for_supports_tools=False,
for_not_supports_tools=True,
),
572: Technique(
name="T2S", # Tool to System Messages
for_supports_tools=False,
for_not_supports_tools=True,
),
# 572: Technique(
# name="Tool to System Messsages",
# for_supports_tools=False,
# for_not_supports_tools=True,
# ),
}

View File

@@ -1,160 +1,150 @@
from libs.classes import Test
from libs.runnables import basic_prompt, one_tool_call_answer, agent_with_tools
from libs.validators import regex_match_any, system_human_answer_match
from libs.tools import add, multiply, get_current_date_and_time, get_notes_in_timespan, get_notes_containing, write_note
from textwrap import dedent
from libs.classes import Test
from libs.runnables import basic_prompt, one_tool_call_answer, agent_with_tools
from libs.validators import regex_match_any, system_human_answer_match
from libs.tools import add, multiply, get_current_date_and_time, get_notes_in_timespan, get_notes_containing, write_note
from textwrap import dedent
from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage
tests = {
607: Test(
name="Healthy Vegetables in Chinese",
runnable=basic_prompt,
runnable_input={
"system_msg": "You are a helpful assistant. You serve people across the globe.",
"human_msg": "什么蔬菜最健康?",
},
validator=system_human_answer_match,
validation_input={
"criteria": dedent("""- in Mandarin Chinese from front to finnish
- factually correct
- about healthy vegetables
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
Again, the message has to be entirely in Manadarin Chineese.
That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct"""),
}
),
693: Test(
name="Simple Multiplication",
runnable=one_tool_call_answer,
runnable_input={
"system_msg": "You are a helpful assistant.",
"human_msg": "What is 234215 times 143243?",
"tools": {
"add": add,
"multiply": multiply
}
},
validator=regex_match_any,
validation_input={
"patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
# "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
}
),
120: Test(
name="Complex Multiplication",
runnable=agent_with_tools,
runnable_input={
"system_msg": 'You are a helpful assistant. You can use tools to accomplish the task. Once you\'ve called a tool. the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling.',
"human_msg": "What is 235 times 1243 times 21?",
"tools": {
"add": add,
"multiply": multiply
}
},
validator=regex_match_any,
validation_input={
"patterns": [ "6134205", "6.134.205", "6,134,205" ]
}
),
283: Test(
name="Notes from last Saturday",
runnable=agent_with_tools,
runnable_input={
"system_msg": "You are a helpful assistant. You can use tools to accomplish tasks. Once you've called a tool, the resulting tool_message content can be taken into consideration again. With that you can do \"multiple rounds\" of tool calling. To know the date, use the tool get_current_date_and_time.",
"human_msg": "Last Saturday, who did grandma want me to call?",
"tools": {
"get_current_date_and_time": get_current_date_and_time,
"get_notes_in_timespan": get_notes_in_timespan,
"get_notes_containing": get_notes_containing,
"Write note": write_note,
}
},
validator=system_human_answer_match,
validation_input={
"criteria": dedent("""- containing the information that the Human should call Wolfgang
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
}
),
260: Test(
name="Notes from last Saturday TSO", # time span only
runnable=agent_with_tools,
runnable_input={
"system_msg": "You are a helpful assistant. You can use tools to accomplish tasks. Once you've called a tool, the resulting tool_message content can be taken into consideration again. With that you can do \"multiple rounds\" of tool calling. To know the date, use the tool get_current_date_and_time.",
"human_msg": "Last Saturday, who did grandma want me to call?",
"tools": {
"get_current_date_and_time": get_current_date_and_time,
"get_notes_in_timespan": get_notes_in_timespan,
"Write note": write_note
}
},
validator=system_human_answer_match,
validation_input={
"criteria": dedent("""- containing the information that the Human should call Wolfgang
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
}
),
856: Test(
name="Notes from last Saturday TSO FSP",
runnable=agent_with_tools,
runnable_input={
"system_msg": "You are a helpful assistant. You can use tools to accomplish tasks. Once you've called a tool, the resulting tool_message content can be taken into consideration again. With that you can do \"multiple rounds\" of tool calling. To know the date, use the tool get_current_date_and_time.",
"fsp_messages": [
HumanMessage("Tomorrow is the anniversary! Any tips what I should by her?"), # One year ago
AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "11" }]),
ToolMessage("Wednesday the 31st of Juli 2024 09:31", tool_call_id="11" ),
AIMessage("", tool_calls=[{"name": "get_notes_in_timespan", "args": {"begin": "2023/08/01", "to": "2023/08/01"}, "id": "12"}], ),
ToolMessage("2023/08/01 23:10 Went out with Charlotte for our anniversary. Pizza at Cavalinos. She loved the rose necklace!", tool_call_id="12"),
AIMessage("I'm afraid I cannot be of great help, since I obviously know charlotte way less than you, but last year you two went out to Cavalinons and you got her a rose necklace as a present. And she liked it. So maybe a pair of earrings would be something she'd like?", name="example_assistant", ),
HumanMessage("Did I write down anything yesterday or the day before that?"),
AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "21" }], ),
ToolMessage("Wednesday the 7th of August 2024 16:23", tool_call_id="21" ),
AIMessage("", tool_calls=[{"name": "get_notes_in_timespan", "args": {"begin": "2024/08/05", "to": "2024/08/06"}, "id": "22"}], ),
ToolMessage("2024/08/05 11:45 Ask Dr. Mills about the side effects of the new medication he got me.\n\n2024/08/06 18:30 Pick up the dry cleaning on Thursday; they close early on Fridays.", tool_call_id="22"),
AIMessage("Yes. I found two entries.\n- From yesterday stating that you wanted to pickup the dry cleaning on Thursday, because they close early on Fridays\n- From Monday a note saying that you want to ask Dr. Mills about the side effects of the new medication he got you.", name="example_assistant", ),
],
"human_msg": "Last Saturday, who did grandma want me to call?",
"tools": {
"get_current_date_and_time": get_current_date_and_time,
"get_notes_in_timespan": get_notes_in_timespan,
"Write note": write_note
}
},
607: Test(
name="Healthy Vegetables in Chinese",
runnable=basic_prompt,
runnable_input={
"system_msg": "You are a helpful assistant. You serve people across the globe.",
"human_msg": "什么蔬菜最健康?",
},
validator=system_human_answer_match,
validation_input={
"criteria": dedent("""- containing the information that the Human should call Wolfgang
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
}
),
# 363: Test(),
# 600: Test(),
# 221: Test(),
# 985: Test(),
# 634: Test(),
# 927: Test(),
# 346: Test(),
# 995: Test(),
# 404: Test(),
# 299: Test(),
# 275: Test(),
# 852: Test(),
# 376: Test(),
# 263: Test(),
# 432: Test(),
# 270: Test(),
# 797: Test(),
# 340: Test(),
# 489: Test(),
# 786: Test(),
# 121: Test(),
# 971: Test(),
# 436: Test(),
# 147: Test(),
# 534: Test(),
# 190: Test(),
# 158: Test(),
# 191: Test(),
}
validation_input={
"criteria": dedent("""- in Mandarin Chinese from front to finnish
- factually correct
- about healthy vegetables
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)
Again, the message has to be entirely in Manadarin Chineese.
That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct"""),
},
),
693: Test(
name="Simple Multiplication",
runnable=one_tool_call_answer,
runnable_input={
"system_msg": "You are a helpful assistant.",
"human_msg": "What is 234215 times 143243?",
"tools": {"add": add, "multiply": multiply},
},
validator=regex_match_any,
validation_input={
"patterns": ["33549659245", "33,549,659,245", "33.549.659.245"]
# "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"] # Would accept 3.354.965.9245
},
),
120: Test(
name="Complex Multiplication",
runnable=agent_with_tools,
runnable_input={
"system_msg": 'You are a helpful assistant. You can use tools to accomplish the task. Once you\'ve called a tool. the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling.',
"human_msg": "What is 235 times 1243 times 21?",
"tools": {"add": add, "multiply": multiply},
},
validator=regex_match_any,
validation_input={"patterns": ["6134205", "6.134.205", "6,134,205"]},
),
    # Test 283 "Notes from last Saturday": the question is relative to the
    # current date ("last Saturday"), and the system prompt points the model at
    # get_current_date_and_time to resolve it. All four note/date tools are
    # offered. The expected answer names Wolfgang.
    283: Test(
        name="Notes from last Saturday",
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": 'You are a helpful assistant. You can use tools to accomplish tasks. Once you\'ve called a tool, the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling. To know the date, use the tool get_current_date_and_time.',
            "human_msg": "Last Saturday, who did grandma want me to call?",
            "tools": {
                "get_current_date_and_time": get_current_date_and_time,
                "get_notes_in_timespan": get_notes_in_timespan,
                "get_notes_containing": get_notes_containing,
                # NOTE(review): unlike the other keys, this one contains a
                # space and does not match the function name; confirm that is
                # intentional and accepted wherever the keys are used.
                "Write note": write_note,
            },
        },
        # presumably the answer is judged against the free-text criteria
        # below; confirm against the definition of system_human_answer_match
        validator=system_human_answer_match,
        validation_input={
            # dedent() changes nothing here: both lines of the literal already
            # start at column 0, so there is no common indentation to strip.
            "criteria": dedent("""- containing the information that the Human should call Wolfgang
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
        },
    ),
    # Test 260 "Notes from last Saturday TSO": same prompts and criteria as
    # test 283, but get_notes_containing is not offered, so notes can only be
    # read through get_notes_in_timespan ("TSO" = time span only).
    260: Test(
        name="Notes from last Saturday TSO",  # time span only
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": 'You are a helpful assistant. You can use tools to accomplish tasks. Once you\'ve called a tool, the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling. To know the date, use the tool get_current_date_and_time.',
            "human_msg": "Last Saturday, who did grandma want me to call?",
            "tools": {
                "get_current_date_and_time": get_current_date_and_time,
                "get_notes_in_timespan": get_notes_in_timespan,
                # NOTE(review): key contains a space, unlike the other keys;
                # confirm that is intentional.
                "Write note": write_note,
            },
        },
        # presumably the answer is judged against the free-text criteria
        # below; confirm against the definition of system_human_answer_match
        validator=system_human_answer_match,
        validation_input={
            # dedent() changes nothing here: both lines of the literal already
            # start at column 0.
            "criteria": dedent("""- containing the information that the Human should call Wolfgang
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
        },
    ),
    # Test 856 "Notes from last Saturday TSO FSP": same tools, question and
    # criteria as test 260, plus an "fsp_messages" list holding two worked
    # example conversations (FSP presumably stands for few-shot prompting;
    # confirm). Each example follows the same shape: human question ->
    # get_current_date_and_time call -> get_notes_in_timespan call -> final
    # answer from "example_assistant".
    856: Test(
        name="Notes from last Saturday TSO FSP",
        runnable=agent_with_tools,
        runnable_input={
            "system_msg": 'You are a helpful assistant. You can use tools to accomplish tasks. Once you\'ve called a tool, the resulting tool_message content can be taken into consideration again. With that you can do "multiple rounds" of tool calling. To know the date, use the tool get_current_date_and_time.',
            # NOTE(review): the example texts contain spelling oddities
            # ("by her", "Juli", "Cavalinons" vs. "Cavalinos", "charlotte").
            # They are left as-is because they are part of the prompt sent to
            # the model; confirm whether they are intentional.
            "fsp_messages": [
                # Example 1: "today" is 2024-07-31, so "tomorrow" is 08/01 and
                # the assistant looks up the note from 2023/08/01.
                HumanMessage("Tomorrow is the anniversary! Any tips what I should by her?"), # the note looked up below is from one year earlier
                AIMessage("", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "11"}]),
                ToolMessage("Wednesday the 31st of Juli 2024 09:31", tool_call_id="11"),
                AIMessage("", tool_calls=[ { "name": "get_notes_in_timespan", "args": {"begin": "2023/08/01", "to": "2023/08/01"}, "id": "12", } ]),
                ToolMessage("2023/08/01 23:10 Went out with Charlotte for our anniversary. Pizza at Cavalinos. She loved the rose necklace!", tool_call_id="12"),
                AIMessage("I'm afraid I cannot be of great help, since I obviously know charlotte way less than you, but last year you two went out to Cavalinons and you got her a rose necklace as a present. And she liked it. So maybe a pair of earrings would be something she'd like?", name="example_assistant"),
                # Example 2: "today" is Wednesday 2024-08-07, so "yesterday or
                # the day before" is the span 2024/08/05 to 2024/08/06.
                HumanMessage("Did I write down anything yesterday or the day before that?"),
                AIMessage( "", tool_calls=[{"name": "get_current_date_and_time", "args": {}, "id": "21"}]),
                ToolMessage("Wednesday the 7th of August 2024 16:23", tool_call_id="21"),
                AIMessage( "", tool_calls=[ { "name": "get_notes_in_timespan", "args": {"begin": "2024/08/05", "to": "2024/08/06"}, "id": "22"}]),
                ToolMessage( "2024/08/05 11:45 Ask Dr. Mills about the side effects of the new medication he got me.\n\n2024/08/06 18:30 Pick up the dry cleaning on Thursday; they close early on Fridays.", tool_call_id="22"),
                AIMessage( "Yes. I found two entries.\n- From yesterday stating that you wanted to pickup the dry cleaning on Thursday, because they close early on Fridays\n- From Monday a note saying that you want to ask Dr. Mills about the side effects of the new medication he got you.", name="example_assistant"),
            ],
            # The actual question under test.
            "human_msg": "Last Saturday, who did grandma want me to call?",
            "tools": {
                "get_current_date_and_time": get_current_date_and_time,
                "get_notes_in_timespan": get_notes_in_timespan,
                # NOTE(review): key contains a space, unlike the other keys;
                # confirm that is intentional.
                "Write note": write_note,
            },
        },
        # presumably the answer is judged against the free-text criteria
        # below; confirm against the definition of system_human_answer_match
        validator=system_human_answer_match,
        validation_input={
            # dedent() changes nothing here: both lines of the literal already
            # start at column 0.
            "criteria": dedent("""- containing the information that the Human should call Wolfgang
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes, what specific tool was used to get the answer, etc.)""")
        },
    ),
# 363: Test(),
# 600: Test(),
# 221: Test(),
# 985: Test(),
# 634: Test(),
# 927: Test(),
# 346: Test(),
# 995: Test(),
# 404: Test(),
# 299: Test(),
# 275: Test(),
# 852: Test(),
# 376: Test(),
# 263: Test(),
# 432: Test(),
# 270: Test(),
# 797: Test(),
# 340: Test(),
# 489: Test(),
# 786: Test(),
# 121: Test(),
# 971: Test(),
# 436: Test(),
# 147: Test(),
# 534: Test(),
# 190: Test(),
# 158: Test(),
# 191: Test(),
}