feat: Add context size support for benchmarks and update example usage

This commit adds support for specifying a context size when running benchmarks; the value is passed to the Ollama client as the `num_ctx` option. The changes include:

- Updated the `run` method in the base benchmark class to accept an optional `context_size` parameter
- Modified the Ollama client call to include context size in the options when provided
- Updated the `run_benchmarks` function to accept and pass through the context size
- Added example usage to the help output showing how to use the new context size parameter
- Fixed prompt formatting in the summarization benchmark to use `text` instead of `task`

These changes make it possible to run benchmarks with custom context sizes, which is useful for testing models with different context window limits.
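For context, here is a minimal sketch of how a context size reaches Ollama through the `options` dict, assuming the official `ollama` Python client; the host, model name, and prompt below are illustrative placeholders, not values taken from this commit:

```python
from ollama import Client

# Illustrative values only; adjust host and model to your setup.
client = Client(host="http://localhost:11434")
context_size = 16000

options = {"temperature": 0.7}
if context_size is not None:
    # Ollama reads the context window size from the num_ctx option
    options["num_ctx"] = context_size

response = client.generate(
    model="second_constantine/t-lite-it-1.0:7b",
    prompt="Summarize the following text: ...",
    options=options,
)

# Newer ollama clients return a GenerateResponse object, older ones a dict
text = response.response if hasattr(response, "response") else response["response"]
print(text)
```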
This commit is contained in:
second_constantine 2026-01-26 15:21:55 +03:00
parent 2048e4e40d
commit f60dbf49f1
10 changed files with 44 additions and 16 deletions

run.sh
View File

@@ -73,7 +73,8 @@ else
echo ""
echo "Примеры использования:"
echo " * ./run.sh run -m second_constantine/t-lite-it-1.0:7b -b translation summarization"
echo " * ./run.sh run -m second_constantine/t-lite-it-1.0:7b --num-ctx 16000"
echo " * ./run.sh run -m second_constantine/t-lite-it-1.0:7b -u http://10.0.0.4:11434 -c 2048 -b translation summarization"
echo " * ./run.sh run -m translategemma:4b -u http://10.0.0.4:11434 -c 128000 -b summarization"
echo " * ./run.sh gen"
echo " * ./run.sh gen-mongo 507f1f77bcf86cd799439011"
echo " * ./run.sh gen-mongo --id-file ids.txt"

View File

@@ -46,14 +46,13 @@ class Benchmark(ABC):
"""
pass
def run(self, ollama_client: OllamaClient, model_name: str, num_ctx: int = 32000) -> Dict[str, Any]:
def run(self, ollama_client: OllamaClient, model_name: str, num_ctx: int = 32000, context_size: int = None) -> Dict[str, Any]:
"""
Run the benchmark.
Args:
ollama_client: Client for working with Ollama
model_name: Model name
num_ctx: Context size
Returns:
Benchmark results
@@ -73,12 +72,20 @@ class Benchmark(ABC):
# Get the model's response
prompt = test_case['prompt']
self.logger.debug(f"Prompt: {prompt[:200]}...") # Логируем начало промпта
# Prepare options for the call
options = {'temperature': 0.7}
if context_size is not None:
# For Ollama, context parameters are passed via options
options['num_ctx'] = context_size
self.logger.debug(f"Setting context size to {context_size}")
self.logger.debug(f"About to call generate with model={model_name}, prompt length={len(prompt)}, options={options}")
model_response = ollama_client.generate(
model=model_name,
prompt=prompt,
num_ctx=num_ctx,
options={'temperature': 0.7}
options=options
)
self.logger.debug(f"Generate call completed, response length={len(model_response) if model_response else 0}")
# Measure latency
latency = time.time() - start_time

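A hedged usage sketch for the updated `run` signature; `TranslationBenchmark` is a hypothetical concrete subclass and the model name is only an example:

```python
# Hypothetical subclass of Benchmark; any concrete implementation would do.
benchmark = TranslationBenchmark()
result = benchmark.run(
    ollama_client,
    model_name="second_constantine/t-lite-it-1.0:7b",
    context_size=16000,  # forwarded to Ollama as options['num_ctx']
)
```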
View File

@@ -18,7 +18,7 @@ def setup_logging(verbose: bool = False):
]
)
def run_benchmarks(ollama_client: OllamaClient, model_name: str, benchmarks: List[str], num_ctx: int) -> List[dict]:
def run_benchmarks(ollama_client: OllamaClient, model_name: str, benchmarks: List[str], context_size: int = None) -> List[dict]:
"""
Run the selected benchmarks.
@@ -26,7 +26,6 @@ def run_benchmarks(ollama_client: OllamaClient, model_name: str, benchmarks: Lis
ollama_client: Client for working with Ollama
model_name: Model name
benchmarks: List of benchmark names to run
num_ctx: Context size
Returns:
List of benchmark results
@@ -46,7 +45,7 @@ def run_benchmarks(ollama_client: OllamaClient, model_name: str, benchmarks: Lis
logging.info(f"Running {benchmark_name} benchmark...")
benchmark = benchmark_classes[benchmark_name]()
result = benchmark.run(ollama_client, model_name, num_ctx)
result = benchmark.run(ollama_client, model_name)
results.append(result)
return results
@@ -56,11 +55,11 @@ def main():
parser = argparse.ArgumentParser(description='LLM Benchmarking Tool')
parser.add_argument('-m', '--model', required=True, help='Model name to test')
parser.add_argument('-u', '--ollama-url', default='http://localhost:11434', help='URL for connecting to the Ollama server')
parser.add_argument('-c', '--context-size', type=int, default=32000, help='Context size for the model (default: 32000)')
parser.add_argument('-b', '--benchmarks', nargs='+', default=['translation', 'summarization', 'codegen'],
help='List of benchmarks to run (translation, summarization, codegen)')
parser.add_argument('-o', '--output', default='results', help='Directory for saving results')
parser.add_argument('-v', '--verbose', action='store_true', help='Verbose output mode')
parser.add_argument('--num-ctx', type=int, default=32000, help='Context size for the model (default: 32000)')
args = parser.parse_args()
@@ -72,12 +71,11 @@ def main():
logging.info(f"Benchmarks to run: {', '.join(args.benchmarks)}")
logging.info(f"Context size: {args.num_ctx}")
# Initialize the client
ollama_client = OllamaClient(args.ollama_url)
try:
# Initialize the client
ollama_client = OllamaClient(args.ollama_url)
# Run the benchmarks
results = run_benchmarks(ollama_client, args.model, args.benchmarks, args.num_ctx)
results = run_benchmarks(ollama_client, args.model, args.benchmarks, args.context_size)
# Generate reports
report_generator = ReportGenerator()
@@ -89,9 +87,8 @@ def main():
report_generator.generate_summary_report(results, args.output, args.model, args.ollama_url)
logging.info("Benchmarking completed successfully!")
except Exception as e:
logging.error(f"Error during benchmarking: {e}", exc_info=True)
logging.error(f"Error during benchmarking: {e}")
return 1
return 0

View File

@@ -45,7 +45,30 @@ class OllamaClient:
options=options,
**kwargs
)
return response['response']
# Inspect the response structure
self.logger.debug(f"Response structure: {response}")
self.logger.debug(f"Response type: {type(response)}")
# If this is a GenerateResponse object (as in ollama 0.3+)
if hasattr(response, 'response'):
return response.response
elif hasattr(response, 'text'):
return response.text
elif isinstance(response, dict):
if 'response' in response:
return response['response']
elif 'text' in response:
return response['text']
else:
# Try to extract any string value
for key, value in response.items():
if isinstance(value, str):
return value
raise ValueError(f"Unexpected response format - no text or response field: {response}")
elif isinstance(response, str):
return response
else:
raise ValueError(f"Unexpected response format: {response}")
except Exception as e:
error_msg = f"Error generating response for model {model}: {e}"
self.logger.error(error_msg)