building of pipeline (validation flaky)

2024-08-04 20:50:11 +02:00
parent e56fa9225c
commit 52a180b936
8 changed files with 168 additions and 53 deletions
--- a/test_small_llms.py
+++ b/test_small_llms.py
@@ -0,0 +1,54 @@
+from libs.test_class import Test
+from libs.run_tests import run_tests
+from libs.runnables import * 
+from libs.validators import * 
+
+from pprint import pprint
+
+def main():
+    models = [
+        # "llama3.1", # 8b
+        # "llama3.1:70b",
+        # "llama3-groq-tool-use", # latest
+        # "llama3-groq-tool-use:70b",
+        # "mixtral:8x7b",
+        # "mixtral:8x22b",
+        # "gemma2:2b",
+        # "phi3", # 3.8b
+        "tinyllama:1.1b",
+    ]
+    seeds = [
+        # 2,
+        222,
+        # 22222,
+        2222222
+    ]
+    tests = [
+        Test(
+    	    name="Chinese Fruit",
+    	    system_msg="You are a helpful assistant. You serve people across the globe. You can be a freind, but stay professional.",
+    	    human_msg="什么蔬菜最健康？",
+    	    validation_info="""- in Mandarin Chinese
+- factually correct
+- just one single conversational answer, without any AI fragments (A/B versions, "end of message" parts, unfitting discalimers or notes)""",
+            runnable=basic,
+            validator=system_human_answer_match
+        ),
+        # Test(
+        #     name="Simple Multiplication",
+        #     system_msg=
+        # )
+    ]
+
+    results = run_tests(
+        models=models,
+        seeds=seeds,
+        tests=tests,
+        base_url="http://bolt.hs-mittweida.de:11434"
+    )
+
+    print()
+    for result in results: print(f"\n\033[0;36mtest_name:\033[0m {result['test'].name}\n\033[0;36mmodel:\033[0m {result['model']}\n\033[0;36mseed:\033[0m {result['seed']}\n\033[0;36mvalidation_result:\033[0m {result['validation']}\n\033[0;36manswer: ⏎\033[0m\n{result['result']}")
+
+# Run the tests only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()