Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 102 additions & 30 deletions ld_refresh.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import time
from API_calls.StudentDatafromLDRESTAPI import build_team_students_map
from config.quality_model_config import load_qualitymodel_map, choose_qualitymodel
from config.load_config_file import get_available_events
from config.load_config_file import get_available_events, get_event_meta
from database.mongo_client import db

API_URL = os.getenv("EVAL_API_URL", "http://localhost:5001/api/event")
Expand Down Expand Up @@ -56,49 +56,121 @@ def delete_orphan_collections_from_mongo(actual_teams):
db.drop_collection(coll)


def _valid_students_for_event(team_sources: dict, event_type: str) -> list:
    """
    Resolve the student identifiers that are valid for one event type.

    The event's metadata names a single data source; only the identifiers
    registered under that source in ``team_sources`` are returned, so
    identifiers from other sources are never mixed in.

    Args:
        team_sources: Mapping of data-source name to student identifiers.
        event_type: Event type whose data source should be looked up.

    Returns:
        The identifiers for the event's data source, de-duplicated in
        first-seen order. Empty when the event has no metadata, the metadata
        has no ``data_source``, or the team has no entry for that source.
    """
    event_meta = get_event_meta(event_type)
    source_key = event_meta.get("data_source") if event_meta else None
    if not source_key:
        return []

    # A dict keeps insertion order, so it de-duplicates deterministically.
    first_seen = {}
    for student in team_sources.get(source_key, []):
        first_seen.setdefault(student, None)
    return list(first_seen)


def _delete_invalid_students(collection, query: dict) -> int:
"""
Delete documents that match the query.

Prefer delete_many on real Mongo collections, but fall back to the
test double API used in the unit tests.
"""
if hasattr(collection, "delete_many"):
result = collection.delete_many(query)
return result.deleted_count

deleted_count = 0

docs = getattr(collection, "docs", None)
if docs is None:
for doc in collection.find(query):
collection.delete_one({"_id": doc["_id"]})
deleted_count += 1
return deleted_count

required_event_type = query.get("event_type")
invalid_students = set(query["student_name"]["$nin"])

for doc in list(docs):
if "student_name" not in doc:
continue
if doc["student_name"] in invalid_students:
continue

if isinstance(required_event_type, dict):
exists_expected = required_event_type.get("$exists")
if exists_expected is not None:
has_event_type = "event_type" in doc
if bool(has_event_type) != bool(exists_expected):
continue
elif required_event_type is not None:
if doc.get("event_type") != required_event_type:
continue

collection.delete_one({"_id": doc["_id"]})
deleted_count += 1

return deleted_count

def delete_orphan_student_documents(team_students_map):
    """
    Delete student documents whose student is no longer in the team map.

    For every team, the ``metrics.<team>``, ``factors.<team>`` and
    ``strategic_indicators.<team>`` collections are cleaned. Validation is
    done per ``event_type`` against the data source associated with that
    event, so EXCEL names are never compared with TAIGA/GITHUB usernames.
    Documents without ``student_name`` (team-level documents) are never
    matched by the delete filters.

    Args:
        team_students_map: Mapping of team id to a dict of data-source name
            ("EXCEL", "GITHUB", "TAIGA") to the list of student identifiers.

    Returns:
        None. If listing the Mongo collections fails, a warning is logged
        and the whole cleanup stops.
    """
    for team_id, sources in team_students_map.items():
        # Fallback list to protect legacy docs that may not have event_type:
        # any identifier known in any source counts as valid for them.
        all_valid_students = list(dict.fromkeys(
            sources.get("EXCEL", []) + sources.get("GITHUB", []) + sources.get("TAIGA", [])
        ))

        try:
            collections = db.list_collection_names()
        except Exception as exc:
            # Best effort: cleanup is skipped rather than failing the caller.
            logging.warning(
                "Skipping orphan student cleanup for %s because Mongo is unavailable: %s",
                team_id,
                exc,
            )
            return

        # The valid list depends only on the team and the event type, so it
        # is resolved once per team instead of once per collection prefix.
        valid_by_event = {
            event_type: _valid_students_for_event(sources, event_type)
            for event_type in get_available_events()
        }

        for prefix in ("metrics", "factors", "strategic_indicators"):
            collection_name = f"{prefix}.{team_id}"
            if collection_name not in collections:
                continue

            collection = db[collection_name]
            deleted_count = 0

            # 1) Remove docs with a known event_type and a student that is
            #    not valid for that event's data source.
            for event_type, valid_for_event in valid_by_event.items():
                # An empty list would match every student document of this
                # event type, so such events are skipped.
                if not valid_for_event:
                    continue

                deleted_count += _delete_invalid_students(collection, {
                    "student_name": {"$exists": True, "$nin": valid_for_event},
                    "event_type": event_type,
                })

            # 2) Legacy docs without event_type: apply broad fallback validation.
            # NOTE(review): unlike step 1 there is no empty-list guard here, so a
            # team with no students in any source loses all legacy student docs
            # — confirm this is intended.
            deleted_count += _delete_invalid_students(collection, {
                "student_name": {"$exists": True, "$nin": all_valid_students},
                "event_type": {"$exists": False},
            })

            if deleted_count > 0:
                logging.info("Deleted %s orphan student documents from %s", deleted_count, collection_name)


def run_daily_refresh() -> None:
Expand All @@ -110,8 +182,8 @@ def run_daily_refresh() -> None:
delete_orphan_collections_from_mongo(actual_teams)

# 2. Eliminar documentos de estudiantes que ya no están en los equipos
# delete_orphan_student_documents(TEAM_STUDENTS)

delete_orphan_student_documents(TEAM_STUDENTS)
# 3. Recalcular métricas para todos los equipos activos
for team in TEAM_STUDENTS.keys(): # Get all the teams from the TEAM_STUDENTS map
"""
Expand Down
Loading