feat: add MongoDB test generation and update dependencies

- Added pymongo==3.13.0 to requirements.txt for MongoDB connectivity
- Implemented generate_summarization_from_mongo.py script to generate summarization tests from MongoDB
- Updated run.sh to support 'gen-mongo' command for MongoDB test generation
- Enhanced scripts/README.md with documentation for new MongoDB functionality
- Improved help text in run.sh to clarify available commands and usage examples

This commit adds MongoDB integration for test generation and updates the documentation and scripts accordingly.
2026-01-22 20:11:52 +03:00
parent f117c7b23c
commit 8ef3a16e3a
41 changed files with 728 additions and 164 deletions
--- a/scripts/generate_summarization_from_mongo.py
+++ b/scripts/generate_summarization_from_mongo.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""
+Скрипт для генерации тестов пересказов из MongoDB.
+
+Извлекает текст статьи из коллекции rssNotification (поле .meta.topicContent)
+и генерирует тестовые данные в формате JSON для бенчмарка AI.
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Dict, Optional
+
+import pymongo
+from pymongo import MongoClient
+
+def connect_to_mongo(uri: Optional[str] = None) -> MongoClient:
+    """Create a MongoClient for the MongoDB cluster.
+
+    Args:
+        uri: MongoDB connection string. When omitted, the three-node cluster
+            address that this script has always used is taken.
+
+    Returns:
+        A MongoClient configured with 30-second connect, socket and
+        server-selection timeouts and with retryable reads and writes enabled.
+    """
+    if uri is None:
+        # Historical default kept so existing callers behave exactly as before.
+        uri = "mongodb://10.0.0.3:27017,10.0.0.4:27017,10.0.0.5:27017/"
+
+    return MongoClient(
+        uri,
+        connectTimeoutMS=30000,
+        socketTimeoutMS=30000,
+        serverSelectionTimeoutMS=30000,
+        retryWrites=True,
+        retryReads=True,
+    )
+
+def extract_text_from_topic_content(topic_content: object) -> Optional[str]:
+    """Extract the article text from the value of ``meta.topicContent``.
+
+    Args:
+        topic_content: Value of the ``meta.topicContent`` field of a MongoDB
+            record. Strings are returned unchanged; any other truthy value is
+            converted with ``str()``.
+
+    Returns:
+        The article text, or None when the value is empty or contains only
+        whitespace.
+    """
+    if not topic_content:
+        return None
+
+    if isinstance(topic_content, str):
+        # A whitespace-only article would yield a prompt with nothing to summarize.
+        return topic_content if topic_content.strip() else None
+
+    # NOTE(review): for dict/list values this yields the Python repr, not clean
+    # article text - confirm the real shape of meta.topicContent.
+    return str(topic_content)
+
+def _find_record(collection, record_id: str):
+    """Fetch a document by ``_id``, matching either the raw string or an ObjectId.
+
+    Args:
+        collection: Collection object exposing ``find_one``.
+        record_id: Identifier as given on the command line.
+
+    Returns:
+        The matching document, or None when nothing matches.
+    """
+    # bson ships with pymongo; imported locally so the file-level imports stay untouched.
+    from bson import ObjectId
+
+    if ObjectId.is_valid(record_id):
+        # NOTE(review): assumes _id may be stored as an ObjectId (the help
+        # examples use 24-hex ids) - confirm against the real collection.
+        return collection.find_one({"_id": {"$in": [record_id, ObjectId(record_id)]}})
+    return collection.find_one({"_id": record_id})
+
+def _write_next_test_file(output_path: Path, test_data: Dict[str, str]) -> Path:
+    """Write ``test_data`` to the first unused ``test<N>.json`` in ``output_path``.
+
+    The file is opened in exclusive-create mode, so a file that appears between
+    the check and the write is never overwritten.
+
+    Returns:
+        Path of the file that was created.
+    """
+    test_num = 1
+    while True:
+        test_file = output_path / f"test{test_num}.json"
+        try:
+            with open(test_file, "x", encoding="utf-8") as f:
+                json.dump(test_data, f, ensure_ascii=False, indent=2)
+            return test_file
+        except FileExistsError:
+            test_num += 1
+
+def generate_test_from_mongo_record(record_id: str, output_dir: str = "tests") -> bool:
+    """Generate a summarization test from a MongoDB record.
+
+    Reads the record from ``tracker_conbot.rssNotification``, takes the text
+    from ``meta.topicContent`` and writes a ``{"prompt", "expected"}`` JSON file
+    to ``<output_dir>/summarization/test<N>.json``.
+
+    Args:
+        record_id: ``_id`` of the record in MongoDB.
+        output_dir: Base directory for generated tests; the ``summarization``
+            sub-directory is created inside it.
+
+    Returns:
+        True when the test file was written, False on any failure (the reason
+        is printed).
+    """
+    client = None
+    try:
+        client = connect_to_mongo()
+        db = client['tracker_conbot']
+        collection = db['rssNotification']
+
+        # Fetch the record by ID
+        record = _find_record(collection, record_id)
+        if not record:
+            print(f"❌ Запись с ID {record_id} не найдена в коллекции")
+            return False
+
+        # Debug output
+        print(f"🔍 Найдена запись: {record_id}")
+        print("📋 Полная структура записи:")
+        print(json.dumps(record, ensure_ascii=False, indent=2, default=str))
+
+        # Pull the text out of meta.topicContent
+        meta_data = record.get('meta', {})
+        topic_content = meta_data.get('topicContent')
+        if not topic_content:
+            print(f"❌ В записи {record_id} отсутствует поле meta.topicContent")
+            return False
+
+        print(f"📝 Тип поля meta.topicContent: {type(topic_content)}")
+        print("📝 Содержимое meta.topicContent (первые 500 символов):")
+        print(str(topic_content)[:500])
+
+        article_text = extract_text_from_topic_content(topic_content)
+        if not article_text:
+            print(f"❌ Не удалось извлечь текст из meta.topicContent записи {record_id}")
+            return False
+
+        print(f"📝 Итоговый текст (первые 500 символов): {article_text[:500]}")
+
+        # Build the test; "expected" stays empty because the reference summary
+        # has to be produced separately.
+        test_data = {
+            "prompt": f"Summarize the following text in 1-2 sentences: '{article_text}'",
+            "expected": ""
+        }
+        if not validate_test(test_data):
+            return False
+
+        output_path = Path(output_dir) / "summarization"
+        output_path.mkdir(parents=True, exist_ok=True)
+
+        test_file = _write_next_test_file(output_path, test_data)
+
+        print(f"✅ Создан тест {test_file.as_posix()}")
+        print(f"   Источник: MongoDB запись {record_id}")
+        print(f"   Текст статьи (первые 100 символов): {article_text[:100]}...")
+
+        return True
+
+    except Exception as e:
+        # Top-level boundary of the script: report the failure and signal it
+        # through the return value instead of a traceback.
+        print(f"❌ Ошибка при генерации теста: {e}")
+        return False
+    finally:
+        if client is not None:
+            client.close()
+
+def validate_test(test_data: Dict[str, str]) -> bool:
+    """Check that test data is a dict with string 'prompt' and 'expected' fields.
+
+    Prints the first problem found.
+
+    Returns:
+        True when the data is valid (``prompt`` must be non-blank), else False.
+    """
+    if not isinstance(test_data, dict):
+        print("❌ Тест должен быть словарём (JSON объект)")
+        return False
+
+    # Presence is checked for both fields before their types, so the reported
+    # error order matches the field order.
+    for field in ("prompt", "expected"):
+        if field not in test_data:
+            print(f"❌ Отсутствует поле '{field}'")
+            return False
+
+    for field in ("prompt", "expected"):
+        if not isinstance(test_data[field], str):
+            print(f"❌ Поле '{field}' должно быть строкой")
+            return False
+
+    if not test_data["prompt"].strip():
+        print("❌ Поле 'prompt' не может быть пустым")
+        return False
+
+    return True
+
+def main():
+    """Parse command-line arguments and generate one test; exit with 1 on failure."""
+    parser = argparse.ArgumentParser(
+        description="Генератор тестов пересказов из MongoDB",
+        epilog="Примеры использования:\n"
+               "  python scripts/generate_summarization_from_mongo.py --record-id 507f1f77bcf86cd799439011 --output-dir tests\n"
+               "  python scripts/generate_summarization_from_mongo.py --record-id 507f1f77bcf86cd799439011 --output-dir results",
+        # Keep the line breaks of the epilog; the default formatter re-wraps it
+        # into a single paragraph.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--record-id",
+        type=str,
+        required=True,
+        help="ID записи в MongoDB (обязательный параметр)"
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="tests",
+        help="Директория для сохранения generated тестов (по умолчанию: tests)"
+    )
+
+    args = parser.parse_args()
+
+    print("🔍 Подключаюсь к MongoDB кластеру...")
+    print(f"📄 Извлекаю запись с ID: {args.record_id}")
+    print(f"💾 Сохраняю тест в: {args.output_dir}/summarization/")
+
+    # --output-dir is now honoured instead of being silently ignored.
+    success = generate_test_from_mongo_record(args.record_id, args.output_dir)
+
+    if success:
+        print("\n✨ Готово! Тест успешно generated.")
+    else:
+        print("\n❌ Не удалось generated тест.")
+        sys.exit(1)
+
+# Run the CLI only when the file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()