# test-small-llms/test_small_llms.py

from libs.test_class import Test
from libs.run_tests import run_tests
from libs.runnables import *
from libs.validators import *
from libs.tools import *

from pprint import pprint

# Endpoint used when main() is called without arguments.
_DEFAULT_BASE_URL = "http://bolt.hs-mittweida.de:11434"

# ANSI escape sequences used to highlight the field labels in the report.
_CYAN = "\033[0;36m"
_RESET = "\033[0m"


def _format_result(result):
    """Render one result record as a colorized, multi-line report entry.

    Args:
        result: Mapping with the keys ``test`` (an object exposing ``.name``),
            ``model``, ``seed``, ``validation`` and ``result``.

    Returns:
        The report text for this record, starting with a blank line.
    """
    def label(text):
        return f"{_CYAN}{text}{_RESET}"

    return (
        f"\n{label('test_name:')} {result['test'].name}"
        f"\n{label('model:')} {result['model']}"
        f"\n{label('seed:')} {result['seed']}"
        f"\n{label('validation_result:')} {result['validation']}"
        f"\n{label('answer: ⏎')}\n{result['result']}"
    )


def main(base_url=_DEFAULT_BASE_URL):
    """Run the configured tests and print a report of every result.

    The models, seeds and tests defined below are handed to ``run_tests``;
    each record it returns is then printed with its test name, model, seed,
    validation outcome and raw answer.

    Args:
        base_url: URL of the model server passed through to ``run_tests``.
            Defaults to the endpoint this script has always used.
    """
    # Comment entries in or out to choose which models are exercised.
    models = [
        "llama3.1",  # 8b
        # "llama3.1:70b",
        # "llama3-groq-tool-use", # latest
        # "llama3-groq-tool-use:70b",
        # "mixtral:8x7b",
        # "mixtral:8x22b",
        # "gemma2:2b",
        # "phi3", # 3.8b
        # "tinyllama:1.1b",
    ]
    seeds = [
        # 2,
        222,
        22222,
        # 2222222
    ]
    tests = [
        # NOTE(review): the name says "Fruit" but the prompt asks which
        # vegetable is healthiest. The name is kept because results may be
        # keyed on it elsewhere -- confirm before renaming.
        Test(
            name="Chinese Fruit",
            runnable=basic,
            runnable_input={
                "system_msg": "You are a helpful assistant. You serve people across the globe.",
                "human_msg": "什么蔬菜最健康？",
            },
            validator=system_human_answer_match,
            validation_input={
                # The continuation lines' leading whitespace is part of the
                # string and is kept exactly as before.
                "criteria": """- in Mandarin Chinese
                            - factually correct
                            - just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting disclaimers or notes)""",
            },
        ),
        Test(
            name="Simple Multiplication",
            runnable=one_tool_call_answer,
            runnable_input={
                "system_msg": "You are a helpful assistant.",
                "human_msg": "What is 234215 times 143243?",
                "tools": {
                    "add": add,
                    "multiply": multiply,
                },
            },
            validator=regex_match_any,
            validation_input={
                # 234215 * 143243 == 33549659245, with common digit groupings.
                "patterns": ["33549659245", "33,549,659,245", "33.549.659.245"],
            },
        ),
    ]

    results = run_tests(
        models=models,
        seeds=seeds,
        tests=tests,
        base_url=base_url,
    )

    print()
    for result in results:
        print(_format_result(result))

# Run the test suite only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()