# Reconstructed post-patch code for ld_refresh.py (orphan student cleanup).
#
# The same patch carries two further changes outside these definitions:
#   * the config.load_config_file import becomes
#       from config.load_config_file import get_available_events, get_event_meta
#   * run_daily_refresh() re-enables its step 2:
#       delete_orphan_student_documents(TEAM_STUDENTS)
# The code below also relies on the module-level names `db` and `logging`.


def _valid_students_for_event(team_sources: dict, event_type: str) -> list:
    """Return the valid student identifiers for one event type.

    Validation is done against the event's own data source only, so
    identifiers coming from different sources are never mixed.

    Args:
        team_sources: mapping of data-source name to that team's student
            identifiers.
        event_type: event type whose metadata names the data source.

    Returns:
        De-duplicated identifiers in first-seen order; an empty list when the
        event has no metadata, no ``data_source``, or no students.
    """
    meta = get_event_meta(event_type)
    if not meta:
        return []

    data_source = meta.get("data_source")
    if not data_source:
        return []

    # `or []` tolerates a source explicitly mapped to None.
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(team_sources.get(data_source) or []))


def _event_type_matches(doc: dict, event_filter) -> bool:
    """Tell whether *doc* satisfies the ``event_type`` part of a query.

    Supports the two filter shapes used by the cleanup queries: a plain value
    (equality) and ``{"$exists": <bool>}``. ``None`` means "no filter".
    """
    if event_filter is None:
        return True
    if isinstance(event_filter, dict):
        exists_expected = event_filter.get("$exists")
        if exists_expected is None:
            return True
        return ("event_type" in doc) == bool(exists_expected)
    return doc.get("event_type") == event_filter


def _delete_invalid_students(collection, query: dict) -> int:
    """Delete the documents matching *query* and return how many were removed.

    Three collection shapes are supported, tried in this order:

    1. objects exposing ``delete_many`` (real Mongo collections);
    2. objects without a ``docs`` attribute: ``find`` + ``delete_one``;
    3. the test double used by the unit tests, which exposes its documents
       as ``docs``: the query is evaluated here in Python.

    Args:
        collection: the collection (or test double) to clean.
        query: filter with a ``student_name`` condition carrying ``$nin`` and
            an optional ``event_type`` condition (value or ``$exists``).

    Returns:
        Number of deleted documents.
    """
    if hasattr(collection, "delete_many"):
        return collection.delete_many(query).deleted_count

    docs = getattr(collection, "docs", None)
    if docs is None:
        deleted_count = 0
        for doc in collection.find(query):
            collection.delete_one({"_id": doc["_id"]})
            deleted_count += 1
        return deleted_count

    # "$nin" lists the students that are still valid, i.e. the ones to KEEP.
    valid_students = set(query["student_name"]["$nin"])
    event_filter = query.get("event_type")

    deleted_count = 0
    # Iterate over a copy: delete_one may mutate the underlying list.
    for doc in list(docs):
        # Documents without student_name never match "$exists": True.
        if "student_name" not in doc or doc["student_name"] in valid_students:
            continue
        if not _event_type_matches(doc, event_filter):
            continue
        collection.delete_one({"_id": doc["_id"]})
        deleted_count += 1

    return deleted_count


def delete_orphan_student_documents(team_students_map):
    """Delete documents of students that are no longer in the students map.

    For every team, the ``metrics``, ``factors`` and ``strategic_indicators``
    collections are cleaned. Validation is done per ``event_type`` against the
    data source associated with that event, to avoid mixing EXCEL names with
    TAIGA/GITHUB usernames. Documents without ``event_type`` (legacy) are
    validated against the union of all sources. Team-level documents have no
    ``student_name`` and are never touched.

    A deletion pass is skipped whenever its list of valid students is empty:
    an empty ``$nin`` list would match, and delete, every student document.

    Args:
        team_students_map: mapping ``team_id -> {source_name: [students]}``.

    Returns:
        None. If Mongo cannot list its collections, a warning is logged and
        the whole cleanup is skipped.
    """
    if not team_students_map:
        return

    # One round trip for the whole run: this function deletes documents only,
    # so the set of collection names does not change while it executes.
    try:
        existing_collections = set(db.list_collection_names())
    except Exception as exc:  # boundary: an outage must not abort the refresh
        logging.warning(
            "Skipping orphan student cleanup because Mongo is unavailable: %s",
            exc,
        )
        return

    # NOTE(review): assumes the available events are stable during one run.
    event_types = list(get_available_events())

    for team_id, sources in team_students_map.items():
        # Resolve the valid students once per team instead of once per
        # collection; events with no valid students are skipped entirely.
        valid_by_event = {}
        for event_type in event_types:
            valid_for_event = _valid_students_for_event(sources, event_type)
            if valid_for_event:
                valid_by_event[event_type] = valid_for_event

        # Fallback list to protect legacy docs that may not have event_type.
        all_valid_students = list(dict.fromkeys(
            student
            for source_name in ("EXCEL", "GITHUB", "TAIGA")
            for student in (sources.get(source_name) or ())
        ))

        # Clean each kind of collection.
        for prefix in ("metrics", "factors", "strategic_indicators"):
            collection_name = f"{prefix}.{team_id}"
            if collection_name not in existing_collections:
                continue

            collection = db[collection_name]
            deleted_count = 0

            # 1) Docs with a known event_type and a student that is invalid
            #    for that event's data source.
            for event_type, valid_for_event in valid_by_event.items():
                deleted_count += _delete_invalid_students(collection, {
                    "student_name": {"$exists": True, "$nin": valid_for_event},
                    "event_type": event_type,
                })

            # 2) Legacy docs without event_type: broad fallback validation.
            #    Same empty-list guard as step 1.
            if all_valid_students:
                deleted_count += _delete_invalid_students(collection, {
                    "student_name": {"$exists": True, "$nin": all_valid_students},
                    "event_type": {"$exists": False},
                })

            if deleted_count > 0:
                logging.info(
                    "Deleted %s orphan student documents from %s",
                    deleted_count,
                    collection_name,
                )