84 lines
2.5 KiB
Python
84 lines
2.5 KiB
Python
from libs.test_class import Test
|
|
from libs.run_tests import run_tests
|
|
from libs.runnables import *
|
|
from libs.validators import *
|
|
from libs.tools import *
|
|
|
|
from pprint import pprint
|
|
|
|
def _build_tests():
    """Return the list of ``Test`` cases that make up the benchmark."""
    # NOTE(review): this prompt is handed over verbatim as validation input,
    # so its wording (including the typos) is left untouched; correcting it
    # could shift validation results between runs.
    chinese_criteria = """- in Mandarin Chinese from front to finnish
- factually correct
- about healthy vegetables
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)

Again, the message has to be entirely in Manadarin Chineese.
That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct"""

    return [
        # NOTE(review): the name says "Fruit" although the criteria are about
        # vegetables; the name is kept because results are reported under it.
        Test(
            name="Chinese Fruit",
            runnable=basic,
            runnable_input={
                "system_msg": "You are a helpful assistant. You serve people across the globe.",
                "human_msg": "什么蔬菜最健康?",
            },
            validator=system_human_answer_match,
            validation_input={
                "criteria": chinese_criteria,
            },
        ),
        Test(
            name="Simple Multiplication",
            runnable=one_tool_call_answer,
            runnable_input={
                "system_msg": "You are a helpful assistant.",
                "human_msg": "What is 234215 times 143243?",
                "tools": {
                    "add": add,
                    "multiply": multiply,
                },
            },
            validator=regex_match_any,
            validation_input={
                # 234215 * 143243 = 33549659245, listed plain and with "," or
                # "." as thousands separator.
                # A single permissive regex allowing an optional separator
                # between every digit was rejected: it would also accept
                # malformed groupings such as 3.354.965.9245.
                # NOTE(review): if these are evaluated as regular expressions,
                # the unescaped "." in the last pattern matches any character
                # - confirm against the validator.
                "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"],
            },
        ),
    ]


def _print_results(results):
    """Print a colour-labelled summary for every entry in *results*.

    *results* is iterated for its keys and each key is looked up again, so it
    is presumably a dict keyed by a per-run hash (verify against
    ``run_tests``). Every entry must provide the keys ``test_name``,
    ``model``, ``seed``, ``validation`` and ``answer``.
    """
    print()
    for hash_key in results:
        result = results[hash_key]
        # "\033[0;36m" switches the terminal to cyan for the labels and
        # "\033[0m" resets it, so only the field names are coloured.
        print(f"""
\033[0;36mtest_name:\033[0m {result['test_name']}
\033[0;36mmodel:\033[0m {result['model']}
\033[0;36mseed:\033[0m {result['seed']}
\033[0;36mvalidation_result:\033[0m {result['validation']}
\033[0;36manswer: »\033[0m{result['answer']}\033[0;36m«\033[0m""")


def main(base_url: str = "http://bolt.hs-mittweida.de:11434") -> None:
    """Run every configured test for each model/seed pair and print the results.

    Args:
        base_url: Address of the model server passed through to ``run_tests``.
            Defaults to the previously hard-coded host, so calling ``main()``
            without arguments behaves as before.
    """
    # Un-comment entries to widen the benchmark matrix.
    models = [
        "llama3.1",  # 8b
        # "llama3.1:70b",
        # "llama3-groq-tool-use", # latest
        # "llama3-groq-tool-use:70b",
        # "mixtral:8x7b",
        # "mixtral:8x22b",
        # "gemma2:2b",
        # "phi3", # 3.8b
        # "tinyllama:1.1b",
    ]
    seeds = [
        2,
        # 222,
        # 22222,
        # 2222222
    ]

    results = run_tests(
        models=models,
        seeds=seeds,
        tests=_build_tests(),
        base_url=base_url,
    )

    _print_results(results)
|
|
# Script entry point: start the benchmark only when this file is executed
# directly, so importing the module does not trigger a run.
if __name__ == "__main__":
    main()