feat: Add context size support for benchmarks and update example usage

This commit adds support for specifying a context size when running benchmarks; the value is passed to the Ollama client as the `num_ctx` option. The changes include:

- Updated the `run` method in the base benchmark class to accept an optional `context_size` parameter
- Modified the Ollama client call to include context size in the options when provided
- Updated the `run_benchmarks` function to accept and pass through the context size
- Added example usage to the help output showing how to use the new context size parameter
- Fixed prompt formatting in the summarization benchmark to use `text` instead of `task`

These changes make it possible to run benchmarks with custom context sizes, which is useful for testing models with different context window limits.
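For context, here is a minimal sketch of how a context size reaches Ollama through the `options` dict, assuming the official `ollama` Python client; the host, model name, and prompt below are illustrative placeholders, not values taken from this commit:

```python
from ollama import Client

# Illustrative values only; adjust host and model to your setup.
client = Client(host="http://localhost:11434")
context_size = 16000

options = {"temperature": 0.7}
if context_size is not None:
    # Ollama reads the context window size from the num_ctx option
    options["num_ctx"] = context_size

response = client.generate(
    model="second_constantine/t-lite-it-1.0:7b",
    prompt="Summarize the following text: ...",
    options=options,
)

# Newer ollama clients return a GenerateResponse object, older ones a dict
text = response.response if hasattr(response, "response") else response["response"]
print(text)
```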
This commit is contained in:
second_constantine 2026-01-26 15:21:55 +03:00
parent 2048e4e40d
commit f60dbf49f1
10 changed files with 44 additions and 16 deletions

run.sh
View File

@@ -73,7 +73,8 @@ else
echo ""
echo "Примеры использования:"
echo " * ./run.sh run -m second_constantine/t-lite-it-1.0:7b -b translation summarization"
echo " * ./run.sh run -m second_constantine/t-lite-it-1.0:7b --num-ctx 16000"
echo " * ./run.sh run -m second_constantine/t-lite-it-1.0:7b -u http://10.0.0.4:11434 -c 2048 -b translation summarization"
echo " * ./run.sh run -m translategemma:4b -u http://10.0.0.4:11434 -c 128000 -b summarization"
echo " * ./run.sh gen"
echo " * ./run.sh gen-mongo 507f1f77bcf86cd799439011"
echo " * ./run.sh gen-mongo --id-file ids.txt"

View File

@@ -46,14 +46,13 @@ class Benchmark(ABC):
"""
pass
def run(self, ollama_client: OllamaClient, model_name: str, num_ctx: int = 32000) -> Dict[str, Any]:
def run(self, ollama_client: OllamaClient, model_name: str, num_ctx: int = 32000, context_size: int = None) -> Dict[str, Any]:
"""
Run the benchmark.
Args:
ollama_client: Client for working with Ollama
model_name: Model name
num_ctx: Context size
Returns:
Benchmark results
@@ -73,12 +72,20 @@ class Benchmark(ABC):
# Get the model's response
prompt = test_case['prompt']
self.logger.debug(f"Prompt: {prompt[:200]}...") # Логируем начало промпта
# Prepare options for the call
options = {'temperature': 0.7}
if context_size is not None:
# For Ollama, context parameters are passed via options
options['num_ctx'] = context_size
self.logger.debug(f"Setting context size to {context_size}")
self.logger.debug(f"About to call generate with model={model_name}, prompt length={len(prompt)}, options={options}")
model_response = ollama_client.generate(
model=model_name,
prompt=prompt,
num_ctx=num_ctx,
options={'temperature': 0.7}
options=options
)
self.logger.debug(f"Generate call completed, response length={len(model_response) if model_response else 0}")
# Measure latency
latency = time.time() - start_time

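A hedged usage sketch for the updated `run` signature; `TranslationBenchmark` is a hypothetical concrete subclass and the model name is only an example:

```python
# Hypothetical subclass of Benchmark; any concrete implementation would do.
benchmark = TranslationBenchmark()
result = benchmark.run(
    ollama_client,
    model_name="second_constantine/t-lite-it-1.0:7b",
    context_size=16000,  # forwarded to Ollama as options['num_ctx']
)
```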
View File

@@ -18,7 +18,7 @@ def setup_logging(verbose: bool = False):
]
)
def run_benchmarks(ollama_client: OllamaClient, model_name: str, benchmarks: List[str], num_ctx: int) -> List[dict]:
def run_benchmarks(ollama_client: OllamaClient, model_name: str, benchmarks: List[str], context_size: int = None) -> List[dict]:
"""
Run the selected benchmarks.
@@ -26,7 +26,6 @@ def run_benchmarks(ollama_client: OllamaClient, model_name: str, benchmarks: Lis
ollama_client: Client for working with Ollama
model_name: Model name
benchmarks: List of benchmark names to run
num_ctx: Context size
Returns:
List of benchmark results
@@ -46,7 +45,7 @@ def run_benchmarks(ollama_client: OllamaClient, model_name: str, benchmarks: Lis
logging.info(f"Running {benchmark_name} benchmark...")
benchmark = benchmark_classes[benchmark_name]()
result = benchmark.run(ollama_client, model_name, num_ctx)
result = benchmark.run(ollama_client, model_name)
results.append(result)
return results
@@ -56,11 +55,11 @@ def main():
parser = argparse.ArgumentParser(description='LLM Benchmarking Tool')
parser.add_argument('-m', '--model', required=True, help='Model name to test')
parser.add_argument('-u', '--ollama-url', default='http://localhost:11434', help='URL for connecting to the Ollama server')
parser.add_argument('-c', '--context-size', type=int, default=32000, help='Context size for the model (default: 32000)')
parser.add_argument('-b', '--benchmarks', nargs='+', default=['translation', 'summarization', 'codegen'],
help='List of benchmarks to run (translation, summarization, codegen)')
parser.add_argument('-o', '--output', default='results', help='Directory for saving results')
parser.add_argument('-v', '--verbose', action='store_true', help='Verbose output mode')
parser.add_argument('--num-ctx', type=int, default=32000, help='Context size for the model (default: 32000)')
args = parser.parse_args()
@@ -72,12 +71,11 @@ def main():
logging.info(f"Benchmarks to run: {', '.join(args.benchmarks)}")
logging.info(f"Context size: {args.num_ctx}")
# Initialize the client
ollama_client = OllamaClient(args.ollama_url)
try:
# Initialize the client
ollama_client = OllamaClient(args.ollama_url)
# Run the benchmarks
results = run_benchmarks(ollama_client, args.model, args.benchmarks, args.num_ctx)
results = run_benchmarks(ollama_client, args.model, args.benchmarks, args.context_size)
# Generate reports
report_generator = ReportGenerator()
@@ -89,9 +87,8 @@ def main():
report_generator.generate_summary_report(results, args.output, args.model, args.ollama_url)
logging.info("Benchmarking completed successfully!")
except Exception as e:
logging.error(f"Error during benchmarking: {e}", exc_info=True)
logging.error(f"Error during benchmarking: {e}")
return 1
return 0

View File

@@ -45,7 +45,30 @@ class OllamaClient:
options=options,
**kwargs
)
return response['response']
# Inspect the response structure
self.logger.debug(f"Response structure: {response}")
self.logger.debug(f"Response type: {type(response)}")
# If this is a GenerateResponse object (as in ollama 0.3+)
if hasattr(response, 'response'):
return response.response
elif hasattr(response, 'text'):
return response.text
elif isinstance(response, dict):
if 'response' in response:
return response['response']
elif 'text' in response:
return response['text']
else:
# Try to extract any string value
for key, value in response.items():
if isinstance(value, str):
return value
raise ValueError(f"Unexpected response format - no text or response field: {response}")
elif isinstance(response, str):
return response
else:
raise ValueError(f"Unexpected response format: {response}")
except Exception as e:
error_msg = f"Error generating response for model {model}: {e}"
self.logger.error(error_msg)