# test-small-llms/test_small_llms.py

from libs.test_class import Test
from libs.run_tests import run_tests
from libs.runnables import *
from libs.validators import *
from libs.tools import *

from pprint import pprint

# Server address used when main() is called without an explicit base_url.
DEFAULT_BASE_URL = "http://bolt.hs-mittweida.de:11434"


def _build_tests():
    """Return the list of Test cases handed to run_tests."""
    return [
        Test(
            # NOTE(review): the name says "Fruit", but the prompt (蔬菜) and the
            # criteria are about vegetables. Left unchanged because the name
            # presumably surfaces as test_name in the results - confirm
            # before renaming.
            name="Chinese Fruit",
            runnable=basic,
            runnable_input={
                "system_msg": "You are a helpful assistant. You serve people across the globe.",
                "human_msg": "什么蔬菜最健康？",
            },
            validator=system_human_answer_match,
            validation_input={
                # Runtime prompt text for the validator: kept byte-for-byte
                # (spelling included) so earlier runs stay comparable.
                "criteria": """- in Mandarin Chinese from front to finnish
- factually correct
- about healthy vegetables
- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)

Again, the message has to be entirely in Manadarin Chineese.
That means If the answer is not in Chinese the answer is NOT correct! Only if the message in in Chinese rate as correct""",
            },
        ),
        Test(
            name="Simple Multiplication",
            runnable=one_tool_call_answer,
            runnable_input={
                "system_msg": "You are a helpful assistant.",
                "human_msg": "What is 234215 times 143243?",
                "tools": {
                    "add": add,
                    "multiply": multiply,
                },
            },
            validator=regex_match_any,
            validation_input={
                # 234215 * 143243 == 33549659245, listed without grouping and
                # with "," / "." as thousands separators.
                "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"],
                # A single separator-tolerant pattern was not used because it
                # would also accept malformed groupings such as 3.354.965.9245:
                # "patterns": ["3[,\. ]?3[,\. ]?5[,\. ]?4[,\. ]?9[,\. ]?6[,\. ]?5[,\. ]?9[,\. ]?2[,\. ]?4[,\. ]?5"]
            },
        ),
    ]


def _print_results(results):
    """Print one colourised report per entry of *results*.

    Args:
        results: Mapping whose values are subscriptable by 'test_name',
            'model', 'seed', 'validation' and 'answer'. The keys are not
            used for the report.
    """
    cyan = "\033[0;36m"
    reset = "\033[0m"

    print()
    for result in results.values():
        # The leading newline inside the literal separates consecutive reports.
        print(f"""
{cyan}test_name:{reset} {result['test_name']}
{cyan}model:{reset} {result['model']}
{cyan}seed:{reset} {result['seed']}
{cyan}validation_result:{reset} {result['validation']}
{cyan}answer: »{reset}{result['answer']}{cyan}«{reset}""")


def main(base_url: str = DEFAULT_BASE_URL) -> None:
    """Pass the configured models, seeds and tests to run_tests and print the results.

    Args:
        base_url: Forwarded unchanged to run_tests as its ``base_url``
            argument. Defaults to DEFAULT_BASE_URL, so calling ``main()``
            with no arguments behaves as before.
    """
    # Entries are toggled by (un)commenting; only active ones are passed on.
    models = [
        "llama3.1", # 8b
        # "llama3.1:70b",
        # "llama3-groq-tool-use", # latest
        # "llama3-groq-tool-use:70b",
        # "mixtral:8x7b",
        # "mixtral:8x22b",
        # "gemma2:2b",
        # "phi3", # 3.8b
        # "tinyllama:1.1b",
    ]
    seeds = [
        2,
        # 222,
        # 22222,
        # 2222222
    ]

    results = run_tests(
        models=models,
        seeds=seeds,
        tests=_build_tests(),
        base_url=base_url,
    )
    _print_results(results)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()