72 lines
2.2 KiB
Python
72 lines
2.2 KiB
Python
from libs.test_class import Test
|
|
from libs.run_tests import run_tests
|
|
from libs.runnables import *
|
|
from libs.validators import *
|
|
from libs.tools import *
|
|
|
|
from pprint import pprint
|
|
|
|
def main():
    """Run every configured test for each enabled model and seed, then print the outcomes.

    The model names, seeds and test definitions are declared inline and handed
    to ``run_tests`` together with the server URL. Each returned result is
    printed as a colourised block (ANSI cyan labels) showing the test name,
    model, seed, validation result and the raw answer.
    """
    # Models passed to run_tests; the commented entries are alternatives that
    # can be switched on by uncommenting them.
    models = [
        "llama3.1",  # 8b
        # "llama3.1:70b",
        # "llama3-groq-tool-use",  # latest
        # "llama3-groq-tool-use:70b",
        # "mixtral:8x7b",
        # "mixtral:8x22b",
        # "gemma2:2b",
        # "phi3",  # 3.8b
        # "tinyllama:1.1b",
    ]

    # Seeds passed to run_tests; commented values are currently disabled.
    seeds = [
        # 2,
        222,
        22222,
        # 2222222
    ]

    # NOTE(review): the human message asks which vegetable is the healthiest,
    # although the test is named "Chinese Fruit" — confirm the intended name.
    # NOTE(review): "discalimers" in the criteria text is misspelt; it is kept
    # unchanged here because the text is a runtime input to the validator.
    # The continuation lines of the criteria string start at column 0 so that
    # no code indentation ends up inside the string.
    chinese_answer_test = Test(
        name="Chinese Fruit",
        runnable=basic,
        runnable_input={
            "system_msg": "You are a helpful assistant. You serve people across the globe.",
            "human_msg": "什么蔬菜最健康?",
        },
        validator=system_human_answer_match,
        validation_input={
            "criteria": """- in Mandarin Chinese
- factually correct
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""",
        },
    )

    # 234215 * 143243 = 33549659245; the patterns accept the plain number and
    # two thousands-separator spellings.
    # NOTE(review): if regex_match_any treats these as regular expressions,
    # the unescaped "." in the last pattern matches any character — confirm
    # that this is intended.
    multiplication_test = Test(
        name="Simple Multiplication",
        runnable=one_tool_call_answer,
        runnable_input={
            "system_msg": "You are a helpful assistant.",
            "human_msg": "What is 234215 times 143243?",
            "tools": {
                "add": add,
                "multiply": multiply,
            },
        },
        validator=regex_match_any,
        validation_input={
            "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"],
        },
    )

    tests = [chinese_answer_test, multiplication_test]

    server_url = "http://bolt.hs-mittweida.de:11434"
    results = run_tests(
        models=models,
        seeds=seeds,
        tests=tests,
        base_url=server_url,
    )

    # Blank line to separate the report from anything printed before it.
    print()
    for result in results:
        print(f"\n\033[0;36mtest_name:\033[0m {result['test'].name}\n\033[0;36mmodel:\033[0m {result['model']}\n\033[0;36mseed:\033[0m {result['seed']}\n\033[0;36mvalidation_result:\033[0m {result['validation']}\n\033[0;36manswer: ⏎\033[0m\n{result['result']}")
|
|
|
|
# Run the test suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|