Source code for testing_model.test

"""File for testing llm model"""

import json
import logging
import re
from typing import Any, Callable, Dict, List, Optional

import ollama
from llama_cpp import Llama
from omegaconf import DictConfig
from openai import OpenAI
from pydantic import BaseModel, ConfigDict, Field

from training_model.utils import get_user_prompt

# from .deepeval_func import test_mention_number_of_values
from .functions_to_test_game import test_actions

ollama.base_url = "http://localhost:11434"


[docs] class Content(BaseModel): """The inner element of the pydantic schema for testing model""" model_config = ConfigDict(extra="forbid") Action: str = Field(..., description="The action associated with the message.")
[docs] class MainModel(BaseModel): """Pydantic schema for testing model""" model_config = ConfigDict(extra="forbid") MessageText: str = Field(..., description="The text of the message.") Content: Content
[docs] def dataset_to_json_for_test(dataset: Dict[str, Any], filename: str) -> None: """ Convert a dataset to a JSON file for testing purposes. Args: dataset (Dict[str, Any]): The dataset containing system and example information. filename (str): The path to the output JSON file. Returns: None """ json_objects: List[Dict[str, str]] = [] system = dataset["system"] dataset = dataset["examples"] with open(filename, "w", encoding="utf-8") as file: file.write("") for row in dataset.keys(): system_message = system user_message = get_user_prompt(dataset[row]["prompt"]) # user_message = str(dataset[row]['prompt']) bot_message = str(dataset[row]["answer"]) json_object = { "system": system_message, "user": user_message, "bot": bot_message, } json_objects.append(json_object) with open(filename, "a", encoding="utf-8") as file: file.write(json.dumps(json_objects, indent=4, ensure_ascii=False))
[docs] def ollama_generate( client: ollama.Client, model_name: str | bytes, prompt: str, schema: Dict ) -> Dict: """ Wrapper function to generate a response using Ollama's structured outputs. Args: client (ollama.Client): The Ollama client instance. model_name (str): The name of the model in Ollama. prompt (str): The formatted prompt to send to the model. schema (Dict): The JSON schema for the expected response. Returns: Dict: The parsed JSON response conforming to the schema. """ response = client.generate( model=model_name, prompt=prompt, format=schema, ) logging.debug(response) return response["response"]
[docs] def call_llm(prompt: str, model: str, client: OpenAI) -> str: """ Sends a prompt to the LLM and returns its response as a dictionary. Args: prompt (str): The user prompt. model (str): The model identifier. client (OpenAI): openai client for the llm. Returns: dict: Parsed LLM response. """ response = client.chat.completions.create( model=model, messages=[{"role": "user", "content": prompt}] ) if not response.choices: raise RuntimeError("No choices returned from OpenAI response") content = response.choices[0].message.content model_answer = content return model_answer
[docs] def run_tests( cfg: DictConfig, client: OpenAI | ollama.Client, test_dataset_path: str = "data/test_ru.json", test_file: str = "test.json", test_func: callable = None, use_ollama: bool = False, ) -> None: """ Runs tests by comparing the LLM responses with expected answers from a dataset. Args: cfg (DictConfig): Configuration with model settings. client (OpenAI | ollama.client): OpenAI or ollama client for LLM interaction. test_dataset_path (str, optional): Path to the test dataset JSON file. Defaults to "data/test_ru.json". test_file (str, optional): Path to save the processed test file. Defaults to "test.json". test_func (callable, optional): Additional test function to execute on each result. This function should accept the user prompt, LLM's message text and the correct answer. use_ollama (bool) : Flag to indicate if Ollama should be used for testing. Returns: None """ with open(test_dataset_path, "r", encoding="utf-8") as file: test_dataset = json.load(file) dataset_to_json_for_test(test_dataset, test_file) with open(test_file, "r", encoding="utf-8") as f: prompts = json.load(f) prompts_to_check = [prompt["user"] for prompt in prompts] expected_answers = [answer["bot"] for answer in prompts] passed_test = 0 for number in range(len(prompts_to_check)): prompt = prompts_to_check[number] correct_answer = expected_answers[number].strip() if not use_ollama: model_answer = call_llm( prompt, client=client, model=cfg.model.outfile, ).strip() else: schema = MainModel.model_json_schema() model_answer = ollama_generate( client=client, model_name=cfg.model.outfile.replace(".gguf", ""), prompt=prompt, schema=schema, ) if test_func is not None: try: test_func(prompt, model_answer, correct_answer) passed_test += 1 except AssertionError as e: logging.error( f"Test failed for prompt: {prompt}.\n Error: {e}\nModel answer: {model_answer}\nExpected answer: {correct_answer}\n" ) total_tests = len(prompts_to_check) final_metric = passed_test / total_tests if total_tests > 0 else 0 logging.info( f"Metrics: {final_metric:.2f} ({passed_test}/{total_tests} tests passed)" )
[docs] def test_llm( cfg: DictConfig, path_test_dataset: str = "data/test_ru.json", test_file: str = "test.json", test_func: Optional[List[Callable]] = None, llm_url: Optional[str] = "http://localhost:1234/v1/", use_ollama: bool = False, ollama_client: Optional[ollama.Client] = None, ) -> None: """ Test the LLM via LM Studio by comparing model responses with expected answers. Args: cfg (DictConfig): Configuration dictionary containing model settings. path_test_dataset (str, optional): Path to the test dataset JSON file. Defaults to "data/test_ru.json". test_file (str, optional): Path to save the processed test file. Defaults to "test.json". test_func (Optional[List[Callable]]): List of additional test functions to execute on each result. llm_url (str, optional): URL of the LLM service. Defaults to "http://localhost:1234/v1/". use_ollama (bool) : Flag to indicate if Ollama should be used for testing. ollama_client (Optional[ollama.Client]): Ollama client for connection Returns: None Raises: Logs errors for failed tests and prints accuracy metrics. """ if test_func is None: test_func = [test_actions] if ollama_client is None: client = OpenAI(api_key="dummy", base_url=llm_url) else: client = ollama_client for test in test_func: try: run_tests( cfg=cfg, client=client, test_dataset_path=path_test_dataset, test_file=test_file, test_func=test, use_ollama=use_ollama, ) except Exception as e: logging.error(f"Test function {test.__name__} failed with error: {e}")
[docs] def llamacpp_execute_test( llm, system_prompt: str, prompt: str, expected_answer: str, max_tokens: int, temperature: float, ) -> tuple[dict, bool]: """ Выполняет тест для одного запроса. Args: llm: Модель для генерации ответов. system_prompt (str): Системный промпт с инструкциями. prompt (str): Пользовательский запрос. expected_answer (str): Ожидаемый результат. max_tokens (int): Максимальное число генерируемых токенов. temperature (float): Параметр температуры для генерации. Returns: tuple: Кортеж, содержащий словарь с результатами теста и булевое значение (True, если тест пройден). """ formatted_prompt = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]" response = llm( formatted_prompt, max_tokens=max_tokens, temperature=temperature, stop=["</s>"], ) response_text = response["choices"][0]["text"] json_match = re.search(r"(\{.*\})", response_text, re.DOTALL) if json_match: try: json_response = json.loads(json_match.group(1)) predicted_action = json_response.get("Content", {}).get("Action") passed = predicted_action == expected_answer result = { "prompt": prompt, "expected": expected_answer, "predicted": predicted_action, "full_response": response_text, "passed": passed, } if passed: logging.info("Test passed") else: logging.error("Test failed") logging.error(f"Expected: {expected_answer}, Got: {predicted_action}") except json.JSONDecodeError: logging.error("Test failed: Invalid JSON response") logging.error(f"Response: {response_text}") result = { "prompt": prompt, "expected": expected_answer, "predicted": "ERROR: Invalid JSON", "full_response": response_text, "passed": False, } passed = False else: logging.error("Test failed: No JSON found in response") logging.error(f"Response: {response_text}") result = { "prompt": prompt, "expected": expected_answer, "predicted": "ERROR: No JSON found", "full_response": response_text, "passed": False, } passed = False return result, passed
[docs] def test_via_llamacpp( model_path: str | bytes, test_dataset: str = "data/test_ru.json", test_file: str = "test.json", n_gpu_layers: int = -1, n_ctx: int = 2048, temperature: float = 0.7, max_tokens: int = 2048, test_func: Callable = llamacpp_execute_test, system_prompt: Optional[str] = None, ) -> float: """ Тестирование GGUF модели через llama.cpp с использованием передаваемой функции тестирования. Args: model_path (str | bytes): Путь к файлу модели GGUF. test_dataset (str, optional): Путь к JSON файлу с тестовыми данными. По умолчанию "data/test_ru.json". test_file (str, optional): Путь для сохранения обработанного тестового файла. По умолчанию "test.json". n_gpu_layers (int, optional): Количество слоёв для вычислений на GPU. По умолчанию -1 (все слои). n_ctx (int, optional): Размер окна контекста. По умолчанию 2048. temperature (float, optional): Температура сэмплинга. По умолчанию 0.7. max_tokens (int, optional): Максимальное количество генерируемых токенов. По умолчанию 2048. test_func (Callable): Функция, реализующая принцип тестирования. system_prompt (Optional[str], optional): Системный промпт для модели. Returns: float: Значение точности (accuracy). """ llm = Llama( model_path=model_path, n_gpu_layers=n_gpu_layers, n_ctx=n_ctx, verbose=True ) json_schema = MainModel.model_json_schema() with open(test_dataset, "r", encoding="utf-8") as file: test_dataset_data = json.load(file) dataset_to_json_for_test(test_dataset_data, test_file) with open(test_file, "r", encoding="utf-8") as f: prompts = json.load(f) prompts_to_check = [prompt["user"] for prompt in prompts] answers = [ test_dataset_data["examples"][bot]["answer"]["Content"]["Action"] for bot in test_dataset_data["examples"] ] logging.debug(f"Expected answers: {answers}") logging.debug(f"Number of prompts: {len(prompts_to_check)}") count = 0 results = [] if system_prompt is None: system_prompt = ( "Ты – помощник по имени ВИКА на заброшенной космической станции. " "У тебя есть доступ к системам станции. " "Отвечай только в формате JSON с ключами 'MessageText' и 'Content', " "где Content содержит ключ 'Action' с одним из доступных тебе действий. " f"Используй следующую JSON схему: {json.dumps(json_schema, ensure_ascii=False)} " "Заканчивай ответ символом }." ) for number, prompt in enumerate(prompts_to_check): result, passed = test_func( llm=llm, system_prompt=system_prompt, prompt=prompt, expected_answer=answers[number], max_tokens=max_tokens, temperature=temperature, ) results.append(result) if passed: count += 1 logging.info(f"Test {number} passed") else: logging.error(f"Test {number} failed") accuracy = count / len(prompts_to_check) logging.info(f"Accuracy: {accuracy:.4f} ({count}/{len(prompts_to_check)})") with open("test_results.json", "w", encoding="utf-8") as f: json.dump( {"accuracy": accuracy, "results": results}, f, ensure_ascii=False, indent=2 ) return accuracy