docs: update test format documentation in README
Update the documentation to reflect the new TXT format with a separator for summarization tests, replacing the previous JSON format. Clarify that the expected field may be empty if summary generation fails.

feat: change test generation to TXT format with separator
Change test generation from JSON to TXT format using TEST_SEPARATOR. Add a filename sanitization function to handle MongoDB record IDs. Update the output path and file-naming logic. Attempt to generate the expected summary through the LLM, falling back to an empty string on failure.
This commit is contained in:
@@ -9,6 +9,9 @@ class CodegenBenchmark(Benchmark):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__("codegen")
|
||||
# Загружаем универсальный промпт
|
||||
with open('prompts/codegen.txt', 'r', encoding='utf-8') as f:
|
||||
self.universal_prompt = f.read().strip()
|
||||
|
||||
def load_test_data(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
@@ -29,7 +32,7 @@ class CodegenBenchmark(Benchmark):
|
||||
if len(parts) == 2:
|
||||
test_data.append({
|
||||
'name': filename.replace('.txt', ''),
|
||||
'prompt': parts[0],
|
||||
'prompt': self.universal_prompt.format(task=parts[0]),
|
||||
'expected': parts[1]
|
||||
})
|
||||
|
||||
|
||||
@@ -9,6 +9,9 @@ class SummarizationBenchmark(Benchmark):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__("summarization")
|
||||
# Загружаем универсальный промпт
|
||||
with open('prompts/summarization.txt', 'r', encoding='utf-8') as f:
|
||||
self.universal_prompt = f.read().strip()
|
||||
|
||||
def load_test_data(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
@@ -29,7 +32,7 @@ class SummarizationBenchmark(Benchmark):
|
||||
if len(parts) == 2:
|
||||
test_data.append({
|
||||
'name': filename.replace('.txt', ''),
|
||||
'prompt': parts[0],
|
||||
'prompt': self.universal_prompt.format(task=parts[0]),
|
||||
'expected': parts[1]
|
||||
})
|
||||
|
||||
|
||||
@@ -5,14 +5,17 @@ from typing import Dict, Any, List
|
||||
from benchmarks.base import Benchmark, TEST_SEPARATOR
|
||||
|
||||
class TranslationBenchmark(Benchmark):
|
||||
"""Бенчмарк для тестирования переводов."""
|
||||
"""Бенчмарк для тестирования перевода."""
|
||||
|
||||
def __init__(self):
|
||||
super().__init__("translation")
|
||||
# Загружаем универсальный промпт
|
||||
with open('prompts/translation.txt', 'r', encoding='utf-8') as f:
|
||||
self.universal_prompt = f.read().strip()
|
||||
|
||||
def load_test_data(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Загрузка тестовых данных для переводов.
|
||||
Загрузка тестовых данных для перевода.
|
||||
|
||||
Returns:
|
||||
Список тестовых случаев
|
||||
@@ -29,7 +32,7 @@ class TranslationBenchmark(Benchmark):
|
||||
if len(parts) == 2:
|
||||
test_data.append({
|
||||
'name': filename.replace('.txt', ''),
|
||||
'prompt': parts[0],
|
||||
'prompt': self.universal_prompt.format(text=parts[0]),
|
||||
'expected': parts[1]
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user