"""File for the testing model using deepeval framework"""
import asyncio
import json
import logging
import subprocess
import time
from typing import Any, Optional
import requests
from deepeval import assert_test
from deepeval.metrics import GEval
# from .test import dataset_to_json_for_test
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langchain_openai import ChatOpenAI
from mistralai import Mistral
from training_model.private_api import MISTRAL_API
[docs]
class CustomLocalModel(DeepEvalBaseLLM):
    """
    DeepEval model wrapper around a locally served chat model.

    All requests go through a ``ChatOpenAI`` client that is pointed at the
    base URL given to the constructor.

    Attributes:
        model (ChatOpenAI): Chat client used for every request.
        model_name (str): Name of the model passed to the client.
    """

    def __init__(
        self,
        model: str = "vikhr-yandexgpt-5-lite-8b-it_gguf",
        url: str = "http://localhost:1234/v1/",
        *args: Any,
        **kwargs: Any,
    ):
        """
        Initialize the custom local model.

        Args:
            model (str, optional): Name of the model.
                Defaults to "vikhr-yandexgpt-5-lite-8b-it_gguf".
            url (str, optional): Base URL for the model.
                Defaults to "http://localhost:1234/v1/".
            *args: Accepted but not used by this class.
            **kwargs: Accepted but not used by this class.
        """
        # NOTE(review): DeepEvalBaseLLM.__init__ is not called here; confirm
        # that skipping the base initializer is intended.
        self.model = ChatOpenAI(
            base_url=url,
            # Placeholder key; presumably the local server ignores it - confirm.
            api_key="dummy",
            model=model,
        )
        self.model_name = model

    def load_model(self) -> ChatOpenAI:
        """
        Return the chat client created in the constructor.

        Returns:
            ChatOpenAI: The underlying chat client.
        """
        return self.model

    def generate(self, prompt: str) -> str:
        """
        Generate a response for the given prompt (blocking call).

        Args:
            prompt (str): Input prompt for the model.

        Returns:
            str: The ``content`` of the client's reply.
        """
        return self.model.invoke(prompt).content

    async def a_generate(self, prompt: str) -> str:
        """
        Asynchronously generate a response for the given prompt.

        Uses the client's awaitable ``ainvoke`` rather than the blocking
        ``invoke``, so the event loop is not stalled while the model replies.

        Args:
            prompt (str): Input prompt for the model.

        Returns:
            str: The ``content`` of the client's reply.
        """
        reply = await self.model.ainvoke(prompt)
        return reply.content

    def get_model_name(self) -> str:
        """
        Get the name of the model.

        Returns:
            str: The model name given to the constructor.
        """
        return self.model_name
[docs]
class CustomMistralModel(DeepEvalBaseLLM):
    """
    A custom Mistral model implementation for DeepEval testing with rate limiting.

    Attributes:
        client (Mistral): Mistral API client.
        model_name (str): Name of the model.
        temperature (float): Sampling temperature.
        last_request_time (Optional[float]): Wall-clock time (``time.time()``)
            for which the most recent request was scheduled; ``None`` until
            the first request.
        rate_limit_delay (float): Minimum delay between requests, in seconds.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "mistral-small-latest",
        temperature: float = 0.1,
        *args: Any,
        **kwargs: Any,
    ):
        """
        Initialize the custom Mistral model.

        Args:
            api_key (str): API key for Mistral service.
            model (str, optional): Name of the model.
                Defaults to "mistral-small-latest".
            temperature (float, optional): Sampling temperature.
                Defaults to 0.1.
            *args: Accepted but not used by this class.
            **kwargs: Accepted but not used by this class.
        """
        # NOTE(review): DeepEvalBaseLLM.__init__ is not called here; confirm
        # that skipping the base initializer is intended.
        self.client = Mistral(api_key=api_key)
        self.model_name = model
        self.temperature = temperature
        self.last_request_time: Optional[float] = None
        self.rate_limit_delay = 1.2  # 1.2 seconds to stay safely under limit

    def _reserve_request_slot(self) -> float:
        """
        Reserve the next request slot and return how long to wait for it.

        The slot is written to ``last_request_time`` *before* the caller
        sleeps. The method contains no ``await``, so coroutines on one event
        loop cannot interleave inside it: each concurrent caller gets its own
        slot, ``rate_limit_delay`` seconds after the previous one, instead of
        all of them waking up at the same moment.

        Returns:
            float: Seconds the caller must wait before sending its request
            (0.0 when the request may be sent immediately).
        """
        now = time.time()
        if self.last_request_time is None:
            slot = now
        else:
            slot = max(now, self.last_request_time + self.rate_limit_delay)
        self.last_request_time = slot
        return slot - now

    def _enforce_rate_limit(self) -> None:
        """
        Block until at least ``rate_limit_delay`` seconds separate this
        request from the previously scheduled one.
        """
        wait = self._reserve_request_slot()
        if wait > 0:
            time.sleep(wait)

    async def _aenforce_rate_limit(self) -> None:
        """
        Asynchronous version of rate limiting.

        Suspends the calling coroutine until at least ``rate_limit_delay``
        seconds separate this request from the previously scheduled one.
        """
        wait = self._reserve_request_slot()
        if wait > 0:
            await asyncio.sleep(wait)

    def generate(self, prompt: str) -> str:
        """
        Generate a response for the given prompt.

        Args:
            prompt (str): Input prompt for the model.

        Returns:
            str: Content of the first choice in the API response.
        """
        self._enforce_rate_limit()
        response = self.client.chat.complete(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=self.temperature,
        )
        return response.choices[0].message.content

    async def a_generate(self, prompt: str) -> str:
        """
        Asynchronously generate a response for the given prompt.

        Args:
            prompt (str): Input prompt for the model.

        Returns:
            str: Content of the first choice in the API response.
        """
        await self._aenforce_rate_limit()
        response = await self.client.chat.complete_async(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=self.temperature,
        )
        return response.choices[0].message.content

    def get_model_name(self) -> str:
        """
        Get the name of the model.

        Returns:
            str: Model name.
        """
        return self.model_name

    def load_model(self) -> Mistral:
        """
        Return the Mistral client created in the constructor.

        Returns:
            Mistral: The Mistral API client.
        """
        return self.client
# Module-level model instances, created at import time.
# ``mistral_model`` is the model handed to GEval in
# ``test_mention_number_of_values``.
mistral_model = CustomMistralModel(
    api_key=MISTRAL_API,
    model="mistral-small-latest",
    temperature=0.7,
)
# Local model wrapper built with its default model name and base URL.
local_model = CustomLocalModel()
[docs]
def set_local_model_via_cli(
    model_name: str = "vikhr-yandexgpt-5-lite-8b-it_gguf",
    base_url: str = "http://localhost:1234/v1",
) -> None:
    """
    Set the local model via CLI using the ``deepeval set-local-model`` command.

    The command is run as ``<interpreter> -m deepeval ...`` in a subprocess.
    Nothing is raised for a failing command; the outcome is printed instead.

    Args:
        model_name (str, optional): Name of the model.
            Defaults to "vikhr-yandexgpt-5-lite-8b-it_gguf".
        base_url (str, optional): Base URL for the model.
            Defaults to "http://localhost:1234/v1".

    Prints:
        - Success message with the command's stdout
        - Error message with the command's stderr if it exits non-zero
        - A hint plus the exception if the executable cannot be found
    """
    import sys  # local import keeps this function self-contained

    # Run deepeval with the interpreter executing this module, so the command
    # sees the same environment; a bare "python" from PATH may be a different
    # installation. Fall back to "python" if the path is unavailable.
    interpreter = sys.executable or "python"
    command = [
        interpreter,
        "-m",
        "deepeval",
        "set-local-model",
        f"--model-name={model_name}",
        f"--base-url={base_url}",
    ]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print("Ошибка выполнения команды:")
        print(e.stderr)
    except FileNotFoundError as e:
        print(
            "Команда не найдена. Убедитесь, что Python и deepeval установлены и доступны в PATH."
        )
        print(e)
    else:
        print("Команда выполнена успешно:")
        print(result.stdout)
[docs]
def test_mention_number_of_values(user_input: str, output: str) -> bool:
    """
    Run a GEval check on a single model reply.

    The metric's criteria ask the judge to verify that the model does not,
    by itself, write the answer to the question from the VIKA. The judge is
    the module-level ``mistral_model``; it is given the input and the actual
    output of the test case, with a threshold of 0.7.

    Args:
        user_input (str): The original user input.
        output (str): The model's generated output.

    Returns:
        bool: Whatever ``assert_test`` returns.
        NOTE(review): the annotation says ``bool``; confirm what
        ``assert_test`` actually returns.

    Raises:
        AssertionError: Expected when the metric check fails (the caller in
            this module catches it to count a failed test).
    """
    judged_fields = [
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
    ]
    answer_metric = GEval(
        name="Answer question by itself",
        criteria="Check that the model doesn't by itself write answer to the question from the VIKA.",
        model=mistral_model,
        verbose_mode=True,
        threshold=0.7,
        evaluation_params=judged_fields,
    )
    case = LLMTestCase(input=user_input, actual_output=output)
    return assert_test(test_case=case, metrics=[answer_metric])
[docs]
def test_from_dataset(
    test_dataset: str = "data/test_ru.json",
    test_file: str = "test.json",
    *,
    llm_url: str = "http://localhost:1234/v1/chat/completions",
    model_name: str = "game-model/v4/model-game_v4.1_q4.gguf",
    request_timeout: float = 300.0,
) -> None:
    """
    Test the model using a dataset of prompts.

    Each entry's ``"user"`` field from ``test_file`` is sent to the chat
    completions endpoint. The reply content is parsed as JSON, and its
    ``"MessageText"`` value is checked with ``test_mention_number_of_values``.

    Args:
        test_dataset (str, optional): Path to the test dataset JSON file.
            Defaults to "data/test_ru.json".
        test_file (str, optional): Path to the processed test file.
            Defaults to "test.json".
        llm_url (str, optional): Chat completions endpoint of the model
            under test. Defaults to "http://localhost:1234/v1/chat/completions".
        model_name (str, optional): Model name sent in the request payload.
            Defaults to "game-model/v4/model-game_v4.1_q4.gguf".
        request_timeout (float, optional): Timeout in seconds for each HTTP
            request. Defaults to 300.0.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.

    Logs:
        - Errors for failed tests and for replies that cannot be parsed
        - Final test metrics
    """
    # The dataset contents are not used while the conversion step is disabled;
    # it is still parsed so a missing or malformed file raises as before.
    with open(test_dataset, "r", encoding="utf-8") as file:
        json.load(file)

    with open(test_file, "r", encoding="utf-8") as f:
        prompts = json.load(f)
    prompts_to_check = [prompt["user"] for prompt in prompts]

    total_tests = len(prompts_to_check)
    passed_tests = 0
    for user_input in prompts_to_check:
        payload = {
            "messages": [{"role": "user", "content": user_input}],
            "model": model_name,
        }
        # Without a timeout a stalled server would hang the run indefinitely.
        response = requests.post(llm_url, json=payload, timeout=request_timeout)
        # Surface HTTP errors directly instead of a confusing KeyError below.
        response.raise_for_status()

        try:
            content = response.json()["choices"][0]["message"]["content"]
            model_answer = json.loads(content)["MessageText"]
        except (ValueError, KeyError, IndexError, TypeError) as err:
            # A reply without the expected JSON structure counts as a failed
            # test rather than aborting the remaining prompts.
            logging.error(
                "Некорректный ответ модели для запроса: %s. Ошибка: %r",
                user_input,
                err,
            )
            continue

        try:
            test_mention_number_of_values(user_input, model_answer)
        except AssertionError:
            logging.error(
                "Тест не пройден для запроса: %s. \nОтвет модели %s.",
                user_input,
                model_answer,
            )
        else:
            passed_tests += 1

    final_metric = passed_tests / total_tests if total_tests > 0 else 0
    logging.info(
        "Итоговая метрика: %.2f (%d/%d тестов пройдено)",
        final_metric,
        passed_tests,
        total_tests,
    )