Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 36 additions & 26 deletions ferry/database/sync_db_courses.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,20 @@ def generate_diff(
old_df = old_df.set_index(pk)
new_df = new_df.set_index(pk)

deleted_rows = old_df[~old_df.index.isin(new_df.index)]
added_rows = new_df[~new_df.index.isin(old_df.index)]
# Use set for O(1) index lookups instead of O(n) scan
new_index_set = set(new_df.index)
old_index_set = set(old_df.index)

deleted_rows = old_df[~old_df.index.isin(new_index_set)]
added_rows = new_df[~new_df.index.isin(old_index_set)]

# Must sort index in order to compare dataframes cell-wise
shared_index = old_index_set & new_index_set
shared_rows_old = (
old_df[old_df.index.isin(new_df.index)].sort_index().sort_index(axis=1)
old_df[old_df.index.isin(shared_index)].sort_index().sort_index(axis=1)
)
shared_rows_new = (
new_df[new_df.index.isin(old_df.index)].sort_index().sort_index(axis=1)
new_df[new_df.index.isin(shared_index)].sort_index().sort_index(axis=1)
)
if (shared_rows_old.index != shared_rows_new.index).any():
print(shared_rows_old.index)
Expand All @@ -96,32 +101,37 @@ def generate_diff(
raise ValueError(
f"Column mismatch in table {table_name}. Run with --rewrite once to fix."
)
# Do not allow type changes unless one of them is NA
old_types = shared_rows_old.map(type)
new_types = shared_rows_new.map(type)
different_types = ~(
(old_types == new_types) | shared_rows_old.isna() | shared_rows_new.isna()

# Hash-based change detection: only do O(rows*cols) cell comparison for rows
# where hashes differ. When most rows are unchanged (typical for incremental
# sync), this reduces complexity from O(n*m) to O(n) + O(changed*cols).
old_hashes = pd.util.hash_pandas_object(shared_rows_old, index=False)
new_hashes = pd.util.hash_pandas_object(shared_rows_new, index=False)
unchanged_mask = (old_hashes == new_hashes).reindex(shared_rows_old.index)

unequal_mask = pd.DataFrame(
False, index=shared_rows_old.index, columns=shared_rows_old.columns
)
if different_types.any().any():
row, col = list(zip(*different_types.values.nonzero()))[0]
print(
f"Type mismatch in {table_name} at ({row}, {col}) (column {shared_rows_old.columns[col]})"
possibly_changed = ~unchanged_mask
if possibly_changed.any():
# Only compare cells for rows that might have changed (hash differed)
subset_old = shared_rows_old.loc[possibly_changed]
subset_new = shared_rows_new.loc[possibly_changed]
subset_unequal = ~(
(subset_old == subset_new)
| (subset_old.isna() & subset_new.isna())
)
print(f"Old type: {old_types.iat[row, col]}")
print(f"New type: {new_types.iat[row, col]}")
print(f"Old value: {shared_rows_old.iat[row, col]}")
print(f"New value: {shared_rows_new.iat[row, col]}")
raise TypeError("Type mismatch")
unequal_mask = ~(
(shared_rows_old == shared_rows_new)
| (shared_rows_old.isna() & shared_rows_new.isna())
)
unequal_mask.loc[possibly_changed] = subset_unequal
Comment on lines +120 to +124
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
python - <<'PY'
import pandas as pd

old = pd.DataFrame({"primary_crn": pd.Series([pd.NA], dtype="Int64")})
new = pd.DataFrame({"primary_crn": pd.Series([12345], dtype="Int64")})

current = ~((old == new) | (old.isna() & new.isna()))
fixed = old.ne(new).fillna(True) & ~(old.isna() & new.isna())

print("current mask:")
print(current)
print("current any:", current.any(axis=1).tolist())

print("\nfixed mask:")
print(fixed)
print("fixed any:", fixed.any(axis=1).tolist())
PY

Repository: coursetable/ferry

Length of output: 179


🏁 Script executed:

#!/bin/bash
pip install pandas --quiet
python - <<'PY'
import pandas as pd

old = pd.DataFrame({"primary_crn": pd.Series([pd.NA], dtype="Int64")})
new = pd.DataFrame({"primary_crn": pd.Series([12345], dtype="Int64")})

current = ~((old == new) | (old.isna() & new.isna()))
fixed = old.ne(new).fillna(True) & ~(old.isna() & new.isna())

print("current mask:")
print(current)
print("current any:", current.any(axis=1).tolist())

print("\nfixed mask:")
print(fixed)
print("fixed any:", fixed.any(axis=1).tolist())
PY

Repository: coursetable/ferry

Length of output: 431


🏁 Script executed:

cd /repo && cat -n ferry/database/sync_db_courses.py | sed -n '100,135p'

Repository: coursetable/ferry

Length of output: 116


🏁 Script executed:

cd /repo && cat -n ferry/database/sync_db_courses.py | sed -n '680,695p'

Repository: coursetable/ferry

Length of output: 116


🏁 Script executed:

find . -name "sync_db_courses.py" -type f

Repository: coursetable/ferry

Length of output: 96


🏁 Script executed:

find . -name "sync_db_courses.py" -type f -exec cat -n {} \; | sed -n '100,135p'

Repository: coursetable/ferry

Length of output: 2108


🏁 Script executed:

find . -name "sync_db_courses.py" -type f -exec cat -n {} \; | sed -n '680,695p'

Repository: coursetable/ferry

Length of output: 925


Make the per-cell diff mask NA-safe.

With pandas nullable dtypes, subset_old == subset_new returns pd.NA for null/value transitions. That propagates through the boolean expression, causing unequal_mask to contain <NA> cells. When unequal_mask.any(axis=1) is called at line 126, those NA values are treated as falsy, so rows with changes like primary_crn: <NA> → 12345 are silently dropped from the diff.

Suggested fix
-            subset_unequal = ~(
-                (subset_old == subset_new)
-                | (subset_old.isna() & subset_new.isna())
-            )
+            both_na = subset_old.isna() & subset_new.isna()
+            subset_unequal = subset_old.ne(subset_new).fillna(True) & ~both_na
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
subset_unequal = ~(
(subset_old == subset_new)
| (subset_old.isna() & subset_new.isna())
)
print(f"Old type: {old_types.iat[row, col]}")
print(f"New type: {new_types.iat[row, col]}")
print(f"Old value: {shared_rows_old.iat[row, col]}")
print(f"New value: {shared_rows_new.iat[row, col]}")
raise TypeError("Type mismatch")
unequal_mask = ~(
(shared_rows_old == shared_rows_new)
| (shared_rows_old.isna() & shared_rows_new.isna())
)
unequal_mask.loc[possibly_changed] = subset_unequal
both_na = subset_old.isna() & subset_new.isna()
subset_unequal = subset_old.ne(subset_new).fillna(True) & ~both_na
unequal_mask.loc[possibly_changed] = subset_unequal
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@ferry/database/sync_db_courses.py` around lines 120 - 124, The per-cell diff
currently lets pd.NA propagate (via subset_old == subset_new) so unequal_mask
gets <NA> values; change the logic in the block that computes subset_unequal
(using subset_old, subset_new, possibly_changed) to explicitly handle nulls—for
example compute na_mismatch = subset_old.isna() ^ subset_new.isna(), compute
value_diff = (subset_old != subset_new).fillna(False), then set subset_unequal =
value_diff | na_mismatch and assign that into
unequal_mask.loc[possibly_changed]; this ensures NA→value and value→NA
transitions are detected and yields a boolean mask.


changed_rows = shared_rows_new[unequal_mask.any(axis=1)].copy()
changed_row_mask = unequal_mask.any(axis=1)
changed_rows = shared_rows_new[changed_row_mask].copy()
if len(changed_rows) > 0:
changed_rows["columns_changed"] = unequal_mask[
unequal_mask.any(axis=1)
].apply(lambda row: shared_rows_new.columns[row].tolist(), axis=1)
# Vectorized: avoid slow apply(axis=1), use list comprehension over numpy
cols = shared_rows_new.columns
unequal_arr = unequal_mask.loc[changed_row_mask].values
changed_rows["columns_changed"] = [
cols[unequal_arr[i]].tolist() for i in range(len(changed_rows))
]
else:
changed_rows["columns_changed"] = pd.Series(dtype=object)

Expand Down
Loading