Source code for testing_model.deepeval_func

"""File for the testing model using deepeval framework"""
import asyncio
import json
import logging
import subprocess
import time
from typing import Any, Optional

import requests
from deepeval import assert_test
from deepeval.metrics import GEval

# from .test import dataset_to_json_for_test
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langchain_openai import ChatOpenAI
from mistralai import Mistral

from training_model.private_api import MISTRAL_API


[docs] class CustomLocalModel(DeepEvalBaseLLM): """ A custom local model implementation for DeepEval testing. Attributes: model (ChatOpenAI): The underlying language model. model_name (str): Name of the model. """
[docs] def __init__( self, model: str = "vikhr-yandexgpt-5-lite-8b-it_gguf", url: str = "http://localhost:1234/v1/", *args: Any, **kwargs: Any, ): """ Initialize the custom local model. Args: model (str, optional): Name of the model. Defaults to "vikhr-yandexgpt-5-lite-8b-it_gguf". url (str, optional): Base URL for the model. Defaults to "http://localhost:1234/v1/". """ self.model = ChatOpenAI( base_url=url, api_key="dummy", model=model, ) self.model_name = model
[docs] def load_model(self) -> ChatOpenAI: """ Load and return the model. Returns: ChatOpenAI: The loaded language model. """ return self.model
[docs] def generate(self, prompt: str) -> str: """ Generate a response for the given prompt. Args: prompt (str): Input prompt for the model. Returns: str: Generated model response. """ return self.model.invoke(prompt).content
[docs] async def a_generate(self, prompt: str) -> str: """ Asynchronously generate a response for the given prompt. Args: prompt (str): Input prompt for the model. Returns: str: Generated model response. """ return self.generate(prompt)
[docs] def get_model_name(self) -> str: """ Get the name of the model. Returns: str: Model name. """ return self.model_name
[docs] class CustomMistralModel(DeepEvalBaseLLM): """ A custom Mistral model implementation for DeepEval testing with rate limiting. Attributes: client (Mistral): Mistral API client. model_name (str): Name of the model. temperature (float): Sampling temperature. last_request_time (Optional[float]): Timestamp of last API request. rate_limit_delay (float): Minimum delay between requests. """
[docs] def __init__( self, api_key: str, model: str = "mistral-small-latest", temperature: float = 0.1, *args: Any, **kwargs: Any, ): """ Initialize the custom Mistral model. Args: api_key (str): API key for Mistral service. model (str, optional): Name of the model. Defaults to "mistral-small-latest". temperature (float, optional): Sampling temperature. Defaults to 0.1. """ self.client = Mistral(api_key=api_key) self.model_name = model self.temperature = temperature self.last_request_time: Optional[float] = None self.rate_limit_delay = 1.2 # 1.2 seconds to stay safely under limit
def _enforce_rate_limit(self) -> None: """ Enforce rate limiting by introducing a delay between API requests. Ensures at least 1 second between requests. """ if self.last_request_time is not None: elapsed = time.time() - self.last_request_time if elapsed < self.rate_limit_delay: sleep_time = self.rate_limit_delay - elapsed time.sleep(sleep_time) self.last_request_time = time.time() async def _aenforce_rate_limit(self) -> None: """ Asynchronous version of rate limiting. Ensures at least 1 second between API requests. """ if self.last_request_time is not None: elapsed = time.time() - self.last_request_time if elapsed < self.rate_limit_delay: sleep_time = self.rate_limit_delay - elapsed await asyncio.sleep(sleep_time) self.last_request_time = time.time()
[docs] def generate(self, prompt: str) -> str: """ Generate a response for the given prompt. Args: prompt (str): Input prompt for the model. Returns: str: Generated model response. """ self._enforce_rate_limit() response = self.client.chat.complete( model=self.model_name, messages=[{"role": "user", "content": prompt}], temperature=self.temperature, ) return response.choices[0].message.content
[docs] async def a_generate(self, prompt: str) -> str: """ Asynchronously generate a response for the given prompt. Args: prompt (str): Input prompt for the model. Returns: str: Generated model response. """ await self._aenforce_rate_limit() response = await self.client.chat.complete_async( model=self.model_name, messages=[{"role": "user", "content": prompt}], temperature=self.temperature, ) return response.choices[0].message.content
[docs] def get_model_name(self) -> str: """ Get the name of the model. Returns: str: Model name. """ return self.model_name
[docs] def load_model(self) -> Mistral: """ Load and return the Mistral client. Returns: Mistral: The Mistral API client. """ return self.client
mistral_model = CustomMistralModel( api_key=MISTRAL_API, model="mistral-small-latest", temperature=0.7 ) local_model = CustomLocalModel()
[docs] def set_local_model_via_cli( model_name: str = "vikhr-yandexgpt-5-lite-8b-it_gguf", base_url: str = "http://localhost:1234/v1", ) -> None: """ Set the local model via CLI using deepeval command. Args: model_name (str, optional): Name of the model. Defaults to "vikhr-yandexgpt-5-lite-8b-it_gguf". base_url (str, optional): Base URL for the model. Defaults to "http://localhost:1234/v1". Prints: - Success message with command output - Error message if command fails """ command = [ "python", "-m", "deepeval", "set-local-model", f"--model-name={model_name}", f"--base-url={base_url}", ] try: result = subprocess.run(command, check=True, capture_output=True, text=True) print("Команда выполнена успешно:") print(result.stdout) except subprocess.CalledProcessError as e: print("Ошибка выполнения команды:") print(e.stderr) except FileNotFoundError as e: print( "Команда не найдена. Убедитесь, что Python и deepeval установлены и доступны в PATH." ) print(e)
[docs] def test_mention_number_of_values(user_input: str, output: str) -> bool: """ Check if the model mentions the number of values inappropriately. Args: user_input (str): The original user input. output (str): The model's generated output. Returns: bool: Result of the DeepEval test. Raises: AssertionError: If the test fails based on the defined criteria. """ metric = GEval( name="Answer question by itself", criteria="Check that the model doesn't by itself write answer to the question from the VIKA.", # "Also check that the model does not write to the user the correct answer for the" # "question from the VIKA", # criteria="Проверьте, что модель не пишет сама пользователю конкретное количество ценностей в ответе" # "Также проверьте, что модель не пишет пользователю правильный ответ на свой вопрос, который" # "от него ожидает услышать.", # evaluation_steps=[ # "Check that the model does not write the number of values by itself", # # "Check that the Actual Output does not provide the correct answer to the VIKA question as specified in the Input.", # # "Confirm that the Actual Output does not directly answer the question from the VIKA, even if user want it." # ], model=mistral_model, verbose_mode=True, threshold=0.7, evaluation_params=[ LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, ], ) return assert_test( test_case=LLMTestCase(input=user_input, actual_output=output), metrics=[metric] )
[docs] def test_from_dataset( test_dataset: str = "data/test_ru.json", test_file: str = "test.json" ) -> None: """ Test the model using a dataset of prompts. Args: test_dataset (str, optional): Path to the test dataset JSON file. Defaults to "data/test_ru.json". test_file (str, optional): Path to the processed test file. Defaults to "test.json". Logs: - Errors for failed tests - Final test metrics """ llm_url = "http://localhost:1234/v1/chat/completions" with open(test_dataset, "r", encoding="utf-8") as file: test_dataset = json.load(file) # dataset_to_json_for_test(test_dataset, test_file) with open(test_file, "r", encoding="utf-8") as f: prompts = json.load(f) prompts_to_check = [prompt["user"] for prompt in prompts] # answers = [ # test_dataset["examples"][bot]["answer"]["Content"]["Action"] # for bot in test_dataset["examples"] # ] total_tests = len(prompts_to_check) passed_tests = 0 for user_input in prompts_to_check: data = { "messages": [{"role": "user", "content": user_input}], "model": "game-model/v4/model-game_v4.1_q4.gguf", } response = requests.post(llm_url, json=data) model_answer = json.loads(response.json()["choices"][0]["message"]["content"])[ "MessageText" ] try: test_mention_number_of_values(user_input, model_answer) passed_tests += 1 except AssertionError: logging.error( f"Тест не пройден для запроса: {user_input}. \nОтвет модели {model_answer}." ) final_metric = passed_tests / total_tests if total_tests > 0 else 0 logging.info( f"Итоговая метрика: {final_metric:.2f} ({passed_tests}/{total_tests} тестов пройдено)" )