diff --git a/.github/workflows/cypress-tests.yml b/.github/workflows/cypress-tests.yml index 243b4db85..2baf505c1 100644 --- a/.github/workflows/cypress-tests.yml +++ b/.github/workflows/cypress-tests.yml @@ -9,6 +9,10 @@ on: - stable - 'jc/**' +concurrency: + group: e2e-${{ github.ref }} + cancel-in-progress: true + jobs: cypress-run: runs-on: ubuntu-latest diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index 6f12f2189..0658fe926 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -93,6 +93,7 @@ jobs: -e POSTGRES_HOST=postgres \ -e POSTGRES_PASSWORD=PdwPNS2mDN73Vfbc \ -e POSTGRES_DB=polis-test \ + -e SKIP_GOLDEN=1 \ delphi \ bash -c " \ set -e; \ diff --git a/.spr.yml b/.spr.yml new file mode 100644 index 000000000..a2862f74a --- /dev/null +++ b/.spr.yml @@ -0,0 +1,14 @@ +githubRepoOwner: compdemocracy +githubRepoName: polis +githubHost: github.com +githubRemote: origin +githubBranch: edge +requireChecks: true +requireApproval: true +defaultReviewers: [] +mergeMethod: squash +mergeQueue: false +prTemplateType: stack +forceFetchTags: false +showPrTitlesInStack: false +branchPushIndividually: false diff --git a/delphi/CLAUDE.md b/delphi/CLAUDE.md index 8d5bb9c17..92d0e8b0c 100644 --- a/delphi/CLAUDE.md +++ b/delphi/CLAUDE.md @@ -2,15 +2,9 @@ This document provides comprehensive guidance for working with the Delphi system, including database interactions, environment configuration, Docker services, and the distributed job queue system. It serves as both documentation and a practical reference for day-to-day operations. -## Documentation Directory +## Documentation -For a comprehensive list of all documentation files with descriptions, see: -[delphi/docs/DOCUMENTATION_DIRECTORY.md](docs/DOCUMENTATION_DIRECTORY.md) - -## Current work todos are located in - -delphi/docs/JOB_QUEUE_SCHEMA.md -delphi/docs/DISTRIBUTED_SYSTEM_ROADMAP.md +**Warning:** Many docs in `docs/` are outdated and should not be trusted. 
Always verify against the actual code. Start with `docs/PLAN_DISCREPANCY_FIXES.md` (canonical fix plan) and `docs/CLJ-PARITY-FIXES-JOURNAL.md` (session journal) for current Clojure parity work. ## Helpful terminology @@ -368,3 +362,25 @@ The system uses AWS Auto Scaling Groups to manage capacity: - Large Instance ASG: 1 instance by default, scales up to 3 based on demand CPU utilization triggers scaling actions (scale down when below 60%, scale up when above 80%). + + +## Testing + +Run tests with `pytest` on the `tests/` folder. + +### Reference datasets + +In `real_data`, we have several datasets of real conversations, exported from Polis, that can be used for testing and development. Those at the root of `real_data` are public. +In `real_data/.local`, we have some private datasets that can only be used internally. The comparer supports both public and private datasets via the `--include-local` flag. + +### Regressions and golden snapshots + +For regressions compared to the latest validated python code, there are both regression unit tests in `tests/`, as well as a test script that compares the output to "golden snapshots": `scripts/regression_comparer.py`. That script is more verbose than the tests, useful for debugging. + +Some amount of numerical error is OK, which is what the regression comparer library is for. + +### Old Clojure reference implementation, and moving to Sklearn + +For math, there is an older implementation in Clojure, in `polismath`. Until we can replace it, we run comparisons between the two implementations in `tests/*legacy*`. Those run the python code, and compare parts of the output to the `math blob`, which is the JSON output of the Clojure implementation, often stored in the PostgreSQL database, but for simplicity stored alongside the golden (python) snapshots used by the regression comparer, so we do not have to run Postgres nor Clojure to run those tests. 
+ +A lot of the current python code was ported from Clojure using an AI agent (Sonnet 3.5 last year), including a lot of home-made implementations of core algorithms. We are in the process of replacing those with standard implementations (such as sklearn for the PCA and K-means). This is ongoing work, and made harder by the fact that the Python code does not quite produce the same output as the Clojure code. So typically we have to check what the ported python code is doing differently from the clojure code, adjust the python code to match the clojure output, and then replace it with standard implementations, which may again produce slightly different output, so we have to adjust parameters until we get similar output. diff --git a/delphi/Dockerfile b/delphi/Dockerfile index 4fe6f4eab..019f4658d 100644 --- a/delphi/Dockerfile +++ b/delphi/Dockerfile @@ -7,6 +7,13 @@ ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 + ENV UV_SYSTEM_PYTHON=1 + + # Install uv for faster package installation (~2x faster than pip) + # Placed in /opt/uv (not /usr/local/bin) to avoid leaking into the final/prod image + # via the blanket COPY --from=builder /usr/local/bin in Stage 2. + COPY --from=ghcr.io/astral-sh/uv:0.11.2 /uv /opt/uv/uv + ENV PATH="/opt/uv:$PATH" RUN apt-get update && \ apt-get install -y --no-install-recommends \ @@ -28,21 +35,21 @@ COPY pyproject.toml requirements.lock ./ # Install dependencies from lock file (cached layer - reused unless requirements.lock changes) -# BuildKit cache mount keeps pip cache between builds for faster rebuilds +# BuildKit cache mount keeps uv cache between builds for faster rebuilds # If USE_CPU_TORCH is true, we install CPU-specific wheels and filter them out of the lockfile -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=/root/.cache/uv \ if [ "$USE_CPU_TORCH" = "true" ]; then \ echo "USE_CPU_TORCH=true: Installing CPU-only PyTorch..." 
&& \ - pip install --index-url https://download.pytorch.org/whl/cpu \ + uv pip install --index-url https://download.pytorch.org/whl/cpu \ torch==2.8.0 \ torchvision==0.23.0 \ torchaudio==2.8.0 && \ echo "Filtering standard torch packages from requirements.lock..." && \ grep -vE "^(torch|torchvision|torchaudio)==" requirements.lock > requirements.filtered.lock && \ - pip install -r requirements.filtered.lock; \ + uv pip install -r requirements.filtered.lock; \ else \ echo "USE_CPU_TORCH=false: Installing standard dependencies..." && \ - pip install -r requirements.lock; \ + uv pip install -r requirements.lock; \ fi # ===== OPTIMIZATION: Copy source code LAST (busts cache on code changes) ===== @@ -54,8 +61,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # Install the project package (without dependencies - they're already installed) # This registers entry points and installs the package in development mode - RUN --mount=type=cache,target=/root/.cache/pip \ - pip install --no-deps . + RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --no-deps . RUN echo "--- PyTorch Check (after pyproject.toml installation) ---" && \ pip show torch torchvision torchaudio && \ @@ -132,12 +139,14 @@ RUN apt-get update && \ && apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Copy pyproject.toml to install dev dependencies +# Copy uv from builder for faster package installation +COPY --from=builder /opt/uv/uv /usr/local/bin/uv +ENV UV_SYSTEM_PYTHON=1 COPY pyproject.toml . # Install dev dependencies (pytest, etc.) 
using caching -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install --no-cache-dir ".[dev]" +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install ".[dev]" # Default command for test container (can be overridden) CMD ["tail", "-f", "/dev/null"] diff --git a/delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md b/delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md new file mode 100644 index 000000000..d0092623a --- /dev/null +++ b/delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md @@ -0,0 +1,452 @@ +# Journal: Fixing Python-Clojure Discrepancies + +This is the ongoing tracking document for the TDD fix process described in +`PLAN_DISCREPANCY_FIXES.md`. It serves as the single source of truth for +our work, while commit messages and PR descriptions serve reviewers. + +--- + +## Initial Baseline (2026-02-25) + +### Branch: `series-of-fixes` (forked from `origin/kmeans_analysis_docs`) + +### Test Results + +**`test_legacy_clojure_regression.py`** (2 datasets: vw, biodiversity): +- 2 passed (`test_pca_components_match_clojure` × 2) +- 6 xfailed: + - `test_basic_outputs` × 2 — D9/D5/D7: empty `comment_repness` + - `test_group_clustering` × 2 — D2/D3: wrong participant threshold, missing k-smoother + - `test_comment_priorities` × 2 — D12: not implemented + +**Full test suite** (`tests/` except `test_batch_id.py`): +- 185 passed, 11 failed, 3 skipped, 6 xfailed +- Pre-existing failures (not caused by this work, inherited from stacked PRs): + - `test_clusters.py::test_init_clusters` — `init_clusters()` doesn't populate members when k > n_points + - `test_conversation.py::test_recompute` — clustering threshold (7.2) filters out all 20 participants in synthetic data + - `test_conversation.py::test_data_persistence` — same threshold issue + - `test_datasets.py` × 4 — DatasetInfo API changed (added `has_cold_start_blob`), tests use old 8-arg constructor + - `test_edge_cases.py::test_insufficient_data_for_pca` — repness returns empty dict for no-group case + - `test_repness_smoke.py` × 2 — 
repness empty due to D9/D5/D7 (the very discrepancies we're fixing) +- **Note**: CI runs on `main` which has different code — these failures are specific to this stacked branch. + 4 of them were already known (documented in MEMORY.md under "Known pre-existing test failures in #2393"). + +### Datasets Available + +| Dataset | Votes | Has cold-start Clojure blob? | +|---------|------:|-----| +| vw | 4,684 | Yes | +| biodiversity | 29,803 | Yes | +| *(5 private datasets)* | 91K–1M | **No** (need prodclone DB) | + +### Clojure Blob Structure (vw example) + +Repness entry keys: `tid`, `n-agree`, `p-test`, `repness-test`, `n-success`, +`repful-for`, `n-trials`, `repness`, `best-agree`, `p-success` + +Consensus structure: `{agree: [{tid, n-success, n-trials, p-success, p-test}], disagree: [...]}` + +Comment priorities: `{tid: priority_value}` — 125 entries for vw + +In-conv: list of 67 participant IDs (vw) + +--- + +## PR 0: Test Infrastructure (complete) + +### What was done +- Created `tests/test_discrepancy_fixes.py` with per-discrepancy test classes +- Created this journal +- Documented initial baseline + +### Test results for `test_discrepancy_fixes.py` + +After rebase onto updated `origin/kmeans_analysis_docs`: + +``` +7 passed, 19 skipped, 39 xfailed, 10 xpassed (with --include-local, 7 datasets) +``` + +- **7 passed**: Clojure formula sanity checks (prop_test, repness metric product, repful rat>rdt) + Clojure blob consistency checks (pat values) +- **19 skipped**: D15 moderation (no moderated comments), incomplete Clojure blobs, engage duplicate files +- **39 xfailed**: Discrepancy tests correctly fail (D2-D12 constants, formulas, and real-data comparisons) +- **10 xpassed** (all `strict=False`, so green): + - D2 in-conv × 2 on vw — small dataset where old/new thresholds coincide + - D6 two_prop_test × 1 — pseudocount difference too small to matter for this test case + - D9 repness_not_empty × 7 on all datasets — `comment_repness` list is populated (all + (group, 
comment) pairs) even with wrong thresholds; only `group_repness` selection is + affected. **TODO**: tighten this test when fixing D9 to check correct *number* of + representative comments, not just non-emptiness + +### Design decisions +- All tests that verify targets not yet implemented are marked `@pytest.mark.xfail` with the discrepancy ID in the reason +- `strict=False` on D2 because vw (small) coincidentally matches while biodiversity (larger) does not +- D5 `test_clojure_pat_values_consistent_with_formula` is a sanity check that verifies Clojure data matches the documented formula — it's NOT xfailed because it doesn't test Python code +- D6 `test_two_prop_test_with_pseudocounts` is xfailed (Python lacks pseudocounts) +- D15 uses `pytest.skip` when dataset has no moderated comments + +### Test classes summary + +| Class | Discrepancy | Tests | xfail? | +|-------|-------------|-------|--------| +| `TestD2InConvThreshold` | D2 | 2 (per dataset) | xfail(strict=False) | +| `TestD4Pseudocount` | D4 | 2 (1 constant + 1 per dataset) | both xfail | +| `TestD5ProportionTest` | D5 | 2 (1 formula + 1 sanity per dataset) | formula xfail, sanity passes | +| `TestD6TwoPropTest` | D6 | 1 | xfail | +| `TestD7RepnessMetric` | D7 | 1 | xfail | +| `TestD8FinalizeStats` | D8 | 1 | xfail | +| `TestD9ZScoreThresholds` | D9 | 3 (2 constants + 1 per dataset) | all xfail | +| `TestD10RepCommentSelection` | D10 | 1 (per dataset) | xfail | +| `TestD11ConsensusSelection` | D11 | 1 (per dataset) | xfail | +| `TestD12CommentPriorities` | D12 | 1 (per dataset) | xfail | +| `TestD15ModerationHandling` | D15 | 1 (per dataset) | skipped (no mod-out data) | +| `TestSyntheticEdgeCases` | multiple | 5 | 2 xfail (D4, D9), 3 pass | + +--- + +## PR 1: Fix D2 — In-Conv Participant Threshold (blocked on Clojure blob regeneration) + +### TDD steps +1. **Baseline (public only)**: 3 failed (2 D2 + 1 DynamoDB), 205 passed, 4 skipped, 20 xfailed, 3 xpassed +2. 
**Red**: Removed xfail from `TestD2InConvThreshold` — biodiversity fails (Python=428, Clojure=441, threshold 8.8 vs 7) +3. **Fix**: Changed `threshold = 7 + sqrt(n_cmts) * 0.1` → `threshold = min(7, n_cmts)` in `conversation.py:1270` +4. **Green (public only)**: All 4 D2 tests pass (vw + biodiversity × 2 tests each) +5. **Full suite (public only)**: 1 failed (DynamoDB only), 207 passed — but golden snapshots were re-recorded for biodiversity without proper verification +6. **Full suite (with private datasets)**: 15 failed, 6 errors — regression tests fail for private datasets (golden snapshots stale), `engage` dataset has duplicate vote files +7. Investigated regression failures — all caused by expected threshold change. Re-recorded after verification. +8. **Blocker**: 3 private datasets have incomplete Clojure blobs (no in-conv data). D2 tests fail on those — not a code issue. Delegated blob regeneration to separate session. + +### Investigation of regression failures +- Private dataset golden snapshots were never committed (unstaged in .local repo) — reverted automatically +- biodiversity: re-recorded after verifying D2 in-conv set matches Clojure exactly (428→441 participants) +- Private datasets: threshold dropped significantly for large conversations: + - bg2050 (7753 comments): old threshold ≈ 15.8, new threshold = 7 → 6609/7890 qualify (was fewer) + - This is the expected and correct effect of the D2 fix +- **Gotcha**: `--datasets ` alone won't find private datasets — must also pass `--include-local`! + Without it, private datasets are silently skipped (shown as `[NOTSET]`). +- Golden snapshots re-recorded for FLI, bg2018, pakistan, bg2050 after verifying all regression + failures are downstream of the expected threshold change. Committed in `.local` repo on + branch `series-of-fixes`. 
+ +### Incomplete Clojure blobs (RESOLVED) + +3 private datasets have incomplete Clojure cold-start blobs (4 keys instead of 23): +- Missing `in-conv`, `repness`, `consensus`, `comment-priorities`, etc. +- Root cause: the Clojure math worker relies on incremental processing and cannot + analyse these large conversations in a single cold-start pass. +- The 4 remaining datasets (vw, biodiversity, FLI, bg2018) have complete cold-start blobs (23 keys) + +**Resolution (session 4)**: Instead of regenerating blobs, we now test against both +incremental and cold-start blob types. `get_blob_variants()` discovers which blob types +have meaningful content per dataset (via `_is_blob_filled()`). Tests on datasets with +empty cold-start blobs only run against the incremental blob. D2 in-conv tests on +incremental blobs are xfailed (see session 4 notes). No blob regeneration needed. + +### What was NOT needed (revised — see D2c/D2d below) +- ~~`raw_rating_mat` vs `rating_mat` — not needed~~ **WRONG**: See D2c below, this IS needed. +- Greedy fallback / monotonic persistence — not needed for cold-start parity, but monotonicity tests needed for future-proofing (see D2d). 
+ +### D2c: Vote count source — raw vs filtered matrix (discovered in session 3) + +Deep investigation of Clojure vs Python revealed a **structural discrepancy** in how +votes are counted for the in-conv threshold, independent of delta vs full processing: + +| Aspect | Clojure | Python (current) | +|--------|---------|-----------------| +| Vote counts per participant | From `raw-rating-mat` (conversation.clj:217-225) — includes votes on moderated-out comments | From `self.rating_mat` (conversation.py:1226/1244) — excludes moderated-out columns | +| `n_cmts` (threshold cap) | From `rating-mat` (conversation.clj:214-215) — columns zeroed but still present, so count includes moderated-out | From `self.rating_mat` (conversation.py:1268) — moderated-out columns removed entirely | + +Key insight: Clojure's `zero-out-columns` (named_matrix.clj:214-228) sets moderated-out +column values to 0 but **keeps the columns** in `rating-mat`. Python's `_apply_moderation` +(conversation.py:308) **removes columns entirely**. This means both `user-vote-counts` +and `n-cmts` differ between implementations. + +**Fix**: Both `_compute_user_vote_counts` and `n_cmts` must use `self.raw_rating_mat`. +Two xfail unit tests planned (vote count source + n_cmts threshold). + +### D2d: In-conv monotonicity — full recompute vs persistence (discovered in session 3) + +Investigated whether Python needs to persist in-conv to DynamoDB (like Clojure persists to +`math_main`). Finding: **no, full recompute is better**. + +Clojure persists in-conv (conv_man.clj:55) because it uses delta vote processing. Python +rebuilds `raw_rating_mat` from all votes every time. Since votes are immutable in PostgreSQL +(can be updated, never deleted), a participant's vote count never decreases → monotonicity +is a free consequence of full recompute from `raw_rating_mat`. + +**Decision**: No DynamoDB persistence. 
Instead, 6 tests (T1-T6) guard the monotonicity +invariant, each documenting that switching to delta processing would require adding +persistence. See plan PR 1bis for test details. Ref: compdemocracy/polis#2358. + +### D2b: Base-cluster sort order (added from Copilot review) + +Copilot flagged that Python sorts base clusters by size (descending) and reassigns IDs, +while Clojure uses `(sort-by :id ...)` which preserves k-means' original cluster IDs. +The size-sort changes the encounter order of base-cluster centers fed into group-level +k-means (which uses first-k-distinct initialization), potentially diverging from Clojure. + +**Fix**: Removed size-sort and ID reassignment; now sort by k-means ID (ascending), +matching Clojure's `sort-by :id`. + +### Test results for PR 1 + +``` +17 passed, 16 skipped, 31 xfailed, 7 xpassed, 4 errors (with --include-local, 7 datasets) +``` + +- **17 passed**: D2 tests pass on 4 datasets with complete blobs (8 tests) + formula sanity checks + blob consistency +- **16 skipped**: D2 skipped on 3 datasets with incomplete Clojure blobs + D15 moderation + engage errors +- **31 xfailed**: Remaining discrepancy tests (D4-D12) +- **7 xpassed** (all `strict=False`, so green): + - D6 two_prop_test × 1 — pseudocount difference too small for this test case + - D9 repness_not_empty × 6 — test too weak (checks non-empty, not correct count) +- **4 errors**: engage dataset has duplicate vote files (pre-existing data issue) + +### Golden snapshot re-recording (BLOCKED) + +After D2b fix, regression tests fail on all datasets except vw (as expected — sort order +change cascades to different cluster assignments). biodiversity was re-recorded and verified. +Private datasets (FLI, bg2018, bg2050, pakistan) need re-recording but the recorder crashes +on `engage` (pre-existing duplicate vote files) before reaching the others. Need to record +them individually, but **blocked on fixes to PRs earlier in the stack** (#2393, #2397). 
+Will re-record after those are resolved and rebased. + +### What's Next + +1. **PR 1 (D2c)**: Implement the vote count source fix — switch `_compute_user_vote_counts` + and `n_cmts` to use `self.raw_rating_mat`. Write xfail tests first, then fix, then verify. +2. **PR 1bis (D2d)**: Write the 6 monotonicity tests (T1-T6). These should pass immediately + after D2c is fixed (full recompute + raw_rating_mat = monotonicity for free). Add the + code comment block and PR description documenting the design decision. +3. **PR 2 — Fix D4 (Pseudocount)**: Next in pipeline order after D2 is fully resolved. + +--- + +## Session Log + +### Session 1 (2026-02-25) + +- Assessed codebase: read test infra, repness module, conversation module, Clojure blob structure +- Created `tests/test_discrepancy_fixes.py` (30 tests, 12 classes) +- Created this journal with initial baseline +- Ran baseline: `test_legacy_clojure_regression.py` → 2 passed, 6 xfailed +- Ran full suite → 185 passed, 11 failed (pre-existing), 3 skipped, 6 xfailed +- Investigated 11 pre-existing failures: inherited from stacked PRs (DatasetInfo API change, threshold issues, D9/D5/D7 repness) +- Rebased onto updated `origin/kmeans_analysis_docs` after PR stack rebase +- Committed and created PR #2401 (`[Clj parity PR 0]`) + +### Session 2 (2026-02-26) + +- Rebased after stack update (base tests tightened, some xpassed→xfailed) +- Updated PR title convention: `[Clj parity PR N]` prefix for reviewer clarity +- Redacted private dataset names from git history across the full stack: + - `SESSION_HANDOFF_KMEANS.md` in `kmeans_clustering_tooling` (amended deep commit via `GIT_SEQUENCE_EDITOR` rebase) + - `PLAN_DISCREPANCY_FIXES.md` in `kmeans_analysis_docs` (amended tip) + - `CLJ-PARITY-FIXES-JOURNAL.md` in `series-of-fixes` (amended tip) + - Force-pushed all three branches, rebased the chain +- Tests unchanged: 5 passed, 2 skipped, 18 xfailed, 5 xpassed +- Renamed journal and plan to `CLJ-PARITY-FIXES-*.md`, amended 
introducing commits, rebased chain +- Set up private data repo infrastructure: + - Pushed data to bare repo at `~/polis/github/real_data_private` + - Created `link-to-polis-worktree.sh` for per-worktree clones with post-checkout branch sync + - Linked `.local` to this worktree, created `series-of-fixes` branch in private repo +- Created `CLAUDE.local.md` (via stow) and `CLAUDE.md` in private data repo +- D2 fix (TDD): + - Baseline (public only): 205 passed, 3 failed (2 D2 + 1 DynamoDB) + - Red: removed xfail from D2 tests, biodiversity fails (Python=428, Clojure=441) + - Fix: `threshold = min(7, n_cmts)` in `conversation.py:1270` + - Green: D2 tests pass on vw + biodiversity + - Full suite with private datasets (14 min): regression failures on all private datasets (expected — threshold change cascades to clustering) + - Investigated: all failures are downstream of threshold change, verified correct + - Re-recorded golden snapshots for biodiversity + 4 private datasets after verification + - Discovered 3 private datasets have incomplete Clojure blobs (4 keys instead of 23) — delegated regeneration to separate session + - Committed and pushed D2 fix (`df2d013ec`) + +### Session 3 (2026-03-04) + +- Deep investigation of in-conv vote counting: compared Clojure `user-vote-counts` + (conversation.clj:217-225, uses `raw-rating-mat`) vs Python `_compute_user_vote_counts` + (conversation.py:1226, uses `self.rating_mat`) +- Discovered D2c: structural discrepancy in both vote count source AND `n_cmts` — Clojure's + `zero-out-columns` keeps moderated-out columns (zeroed), Python's `_apply_moderation` + removes them entirely. Both vote counts and threshold cap differ. +- Investigated whether to persist in-conv to DynamoDB (like Clojure does to `math_main`). + Found: Clojure persists because it uses delta vote processing. Python does full recompute + → monotonicity is free. No persistence needed. 
+- Verified Clojure persistence path: `prep-main` (conv_man.clj:55) writes `:in-conv`, + `restructure-json-conv` (conv_man.clj:182) restores it on restart. +- Updated plan: added D2c (vote count source, 2 xfail tests) and D2d (monotonicity, 6 tests + guarding against future delta-processing refactor). Ref: compdemocracy/polis#2358. +- Corrected earlier journal entry that said `raw_rating_mat` was "not needed" — it IS needed. +- Added terminology rule to CLAUDE.local.md: always say "moderated-out" or "moderated-in", + never just "moderated". +- Committed plan update (`f2bf77c38`) + +### Session 4 (2026-03-10) + +- **Dual-blob test infrastructure** (committed on `jc/series-of-fixes`, #2420): + - Extended `get_dataset_files()` with `blob_type` parameter (explicit `'incremental'` or `'cold_start'`) + - Added `get_blob_variants()` to discover which blob types are available per dataset, + filtering out empty/unfilled blobs via `_is_blob_filled()` (checks for PCA data or + non-empty base-clusters) + - Extended `@pytest.mark.use_discovered_datasets` with optional `use_blobs=True` parameter + to parametrize tests with composite `dataset-blob_type` IDs (e.g., `biodiversity-incremental`) + - Added `parse_dataset_blob_id()` in conftest for splitting composite IDs + - Applied to `test_legacy_clojure_regression.py`, `test_discrepancy_fixes.py`, and + `test_legacy_repness_comparison.py` + - Conversation computation shared across blob variants via `_CONV_CACHE` (keyed by dataset + name) to avoid redundant recomputation + +- **D2 incremental xfail with rationale** (committed on `jc/clj-parity-d2-fix`, #2421): + - D2 in-conv tests on incremental blobs are xfailed with inline comments explaining why: + incremental blobs were built progressively as votes trickled in, so the threshold + `min(7, n_cmts)` was evaluated at each iteration with a smaller `n_cmts`, admitting + a few extra participants (1–2) during earlier iterations. 
Very large conversations + have empty cold-start blobs because the Clojure math worker relies on incremental + processing — it cannot analyse the whole conversation in a single cold-start pass. + - Matching incremental exactly would require simulating the progressive threshold — tracked + as future work under Replay Infrastructure (PRs A/B/C in the plan) + +- **Golden snapshots re-recorded** for all public + private datasets, 5 stale xfail markers removed +- **Final test results**: 245 passed, 5 skipped, 36 xfailed, 0 failures, 0 xpassed +- Updated PR #2421 description with incremental vs cold-start explanation +- Local handoff file created at `delphi/docs/HANDOFF_D2_INCREMENTAL_IN_CONV.md` (untracked) + for future investigation of how much in-conv sets differ between blob types + +### Session 5 (2026-03-11) + +- **D2c fix**: Switched `_compute_user_vote_counts` and `_get_in_conv_participants` to use + `self.raw_rating_mat` instead of `self.rating_mat`. Both vote counts and `n_cmts` now + include votes on moderated-out comments, matching Clojure's `user-vote-counts` + (conversation.clj:217-225) and `n-cmts` (conversation.clj:214-215). 
+- **D2c tests** (3 in `TestD2cVoteCountSource`): + - `test_vote_count_includes_moderated_out_votes`: 10 comments, 3 moderated-out, count=10 + - `test_n_cmts_includes_moderated_out_comments`: verifies threshold uses raw column count, + not filtered; also tests that a participant with 6 raw votes is correctly excluded + - `test_participant_stays_in_conv_after_moderation`: the critical scenario — participant + with 8 votes stays in-conv when 3 comments moderated-out (filtered count drops to 5) +- **D2d monotonicity tests** (5 in `TestD2dInConvMonotonicity`): + - T1: basic monotonicity across batch updates + - T2: survives moderation-out + - T3: worker restart + moderation (key delta-processing guard) + - T4: worker restart, moderation, no new votes + - T5: mixed participants with moderation + - All pass for free with D2c fix (full recompute from `raw_rating_mat`) +- **TDD discipline**: wrote tests first (D2c xfail, D2d no xfail), confirmed D2c red (3 + xfailed) and D2d red (T2-T5 failed, T1 passed), applied fix, confirmed all green. +- **Full test suite**: 253 passed, 5 skipped, 36 xfailed, 0 failures (+8 from session 4) +- **No regressions** on public datasets +- Added code comments on `_get_in_conv_participants` documenting the monotonicity design + decision and delta-processing caveat (ref: #2358) +- Updated plan: D2, D2b, D2c, D2d all marked DONE; PR 1bis merged into PR 1 +- Updated PR #2421 description with D2c/D2d sections + +### What's Next + +1. **PR 3 — Fix D9 (Z-score thresholds)**: Switch from two-tailed to one-tailed z-scores. +2. Regression test performance investigation (see `HANDOFF_REGRESSION_TEST_PERF.md`) + +--- + +## PR 2: Fix D4 — Pseudocount Formula + +### TDD steps +1. **Baseline**: 25 passed, 3 skipped, 28 xfailed (discrepancy tests) +2. **Red**: Removed xfail from 3 D4 tests → 6 failures (constant check, pa values × 4 datasets, synthetic) +3. **Fix**: `PSEUDO_COUNT = 1.5` → `2.0` in `repness.py` +4. **Green**: All 6 D4 tests pass +5. 
**Full suite**: 258 passed, 3 skipped, 30 xfailed, 0 failures (public datasets) +6. **Private datasets**: 60 passed, 6 skipped, 53 xfailed (discrepancy tests with --include-local) +7. Re-recorded golden snapshots for all 7 datasets + +### Changes +- `repness.py`: `PSEUDO_COUNT = 2.0`, updated comment to reference Beta(2,2) prior +- `test_discrepancy_fixes.py`: removed xfail from 3 D4 tests +- `test_repness_unit.py`, `test_old_format_repness.py`: import `PSEUDO_COUNT` instead of hardcoding 1.5 +- `simplified_repness_test.py`: updated hardcoded constant + +### Side finding: regression test performance +Large private datasets take 1-5 minutes per regression test due to: +- Benchmark mode running pipeline 3× (n_runs=3) +- O(participants × comments) per-participant loop in `_compute_participant_info_optimized` +- Intermediate stages computed redundantly + +Fitted model: `t ≈ 1.66 + 3.87e-5 × votes + 9.90e-7 × (ptpts × cmts)` (R²=0.9995). +Detailed analysis in `HANDOFF_REGRESSION_TEST_PERF.md` for a future session. + +### Session 6 (2026-03-11) + +- Created branch `jc/clj-parity-d4-fix` stacked on `jc/clj-parity-d2-fix` +- D4 fix (TDD): red (6 tests failed) → fix PSEUDO_COUNT → green (all 6 pass) +- Fixed 2 unit tests that hardcoded old pseudocount value (used import instead) +- Re-recorded golden snapshots for all 7 datasets (public + private) +- Investigated regression test performance (engage 317s, pakistan 179s) — confirmed + consistent with O(votes + ptpts×cmts) complexity, not a regression from D4 +- Created `HANDOFF_REGRESSION_TEST_PERF.md` for future optimization work +- Pushed branch, created PR + +### What's Next + +1. **PR 3 — Fix D9 (Z-score thresholds)**: `Z_90=1.645` → `1.2816`, `Z_95=1.96` → `1.6449` +2. Regression test performance optimization (separate session) + +--- + +## TDD Discipline + +**CRITICAL: For every fix, ALWAYS follow this order:** +1. **Run the full test suite** (all datasets, including private) to establish the baseline +2. 
**Remove xfail** from the target test(s) +3. **Run tests and confirm they FAIL** (red) — this validates the test actually catches the discrepancy +4. **Apply the fix** +5. **Run tests and confirm they PASS** (green) +6. **Run the full test suite** to check for regressions vs the baseline from step 1 +7. **If regression tests fail**: INVESTIGATE before re-recording golden snapshots (see below) + +Never skip step 3. A test that passes before the fix is applied is not testing anything useful. + +### Golden snapshots are precious + +**NEVER blindly re-record golden snapshots.** They are the regression safety net. +When a fix causes regression test failures: + +1. **Investigate** what changed: compare old vs new output, check which fields differ +2. **Verify** the new output is closer to Clojure (e.g., in-conv set now matches exactly) +3. **Only then** re-record, one dataset at a time, after confirming correctness +4. **Commit** the updated snapshots with a clear message explaining why they changed + +### Per-dataset testing for faster feedback + +Run each dataset as a **separate background task** instead of one big pytest invocation. +Smallest datasets finish first, giving early signal without waiting for the 1M-vote ones. + +**IMPORTANT**: Always pass `--include-local` when testing private datasets. +Without it, `--datasets ` silently skips private datasets (shown as `[NOTSET]`). + +### Pipelined worktree workflow + +Full test suite takes ~14 minutes. 
To avoid idle time: +- When tests start running on the current worktree, create the next worktree and start coding the next fix +- Each worktree = one fix, one branch, one PR — stacked like the PR chain +- Number of worktrees in flight depends on test duration vs coding speed +- If a lower fix needs amending, rebase the chain of worktrees above it +- Each worktree gets its own `.local` private data clone (via `link-to-polis-worktree.sh`) + +**When starting a new Claude session for the next fix**, ask the user whether they want +to create a new worktree. If yes, provide a prompt they can use to start that session: + +> Start working on [Clj parity] fix DN (<short description>). This is a new worktree +> stacked on `<base branch>`. Read `delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md` +> and `delphi/docs/CLJ-PARITY-FIXES-PLAN.md` for context. Follow the TDD discipline +> documented in the journal. + +--- + +## Notes for Future Sessions + +- Private datasets are in `delphi/real_data/.local/` (separate git repo, linked via `link-to-polis-worktree.sh`) +- `test_discrepancy_fixes.py` uses same parametrization pattern as `test_legacy_clojure_regression.py` (own `pytest_generate_tests` hook, `dataset_name` fixture). +- 11 pre-existing test failures are from the stacked branch, not from our work. They should be fixed in their respective PRs before merging to main. +- `strict=False` on xfail means xpass (unexpected pass) is reported but not a failure. Used when some datasets pass by coincidence. +- After rebase, D9 `test_repness_not_empty` started xpassing — the `comment_repness` list is populated (all pairs), but `group_repness` (selected reps) may still be affected by wrong thresholds. Consider tightening this test when fixing D9. 
+- To rebase when base is updated: `git fetch origin kmeans_analysis_docs && git rebase --onto origin/kmeans_analysis_docs series-of-fixes-base series-of-fixes && git tag -f series-of-fixes-base origin/kmeans_analysis_docs` diff --git a/delphi/docs/HANDOFF_REGRESSION_TEST_PERF.md b/delphi/docs/HANDOFF_REGRESSION_TEST_PERF.md new file mode 100644 index 000000000..2692b3c53 --- /dev/null +++ b/delphi/docs/HANDOFF_REGRESSION_TEST_PERF.md @@ -0,0 +1,95 @@ +# Handoff: Regression Test Performance Investigation + +## Problem + +The `test_regression.py` tests are slow for large private datasets, particularly +`engage` (317s) and `pakistan` (179s). This was noticed during the D4 pseudocount +fix session but is **not caused by D4** — the pseudocount is just a constant and +has zero performance impact. + +## Dataset Dimensions + +| Dataset | Votes | Participants | Comments | t_regression | t_stages | +|---------------|----------|--------------|----------|--------------|----------| +| vw | 4,683 | 69 | 125 | 3.02s | 0.62s | +| biodiversity | 29,802 | 536 | 316 | 15.83s | 2.40s | +| FLI | 91,364 | 1,090 | 1,454 | 8.72s | 9.61s | +| bg2018 | 226,232 | 2,044 | 896 | 13.66s | 11.21s | +| engage | 442,583 | 5,531 | 7,728 | 317.33s | 59.97s | +| pakistan | 400,075 | 18,081 | 9,034 | 196.47s | 178.89s | +| bg2050 |1,034,858 | 7,890 | 7,753 | 165.32s | 87.16s | + +- `t_regression` = `test_conversation_regression` (runs benchmark: 3x full pipeline) +- `t_stages` = `test_conversation_stages_individually` (single pipeline run) + +## Complexity Model + +Fitted a linear model on vw, biodiversity, FLI, bg2018, pakistan (5 points), +then predicted engage and bg2050 (held out): + +``` +t_stages ≈ 1.66 + 3.87e-5 × votes + 9.90e-7 × (participants × comments) +``` + +R² = 0.9995 on fit data. 
+ +| Dataset | Predicted | Observed | Ratio | +|---------------|-----------|-----------|-------| +| vw | 1.85s | 0.62s | 0.34 | (intercept dominates small datasets) +| biodiversity | 2.98s | 2.40s | 0.81 | +| FLI | 6.77s | 9.61s | 1.42 | +| bg2018 | 12.23s | 11.21s | 0.92 | +| **engage** | **61.12s** | **59.97s** | **0.98** | held out — predicted well +| pakistan | 178.91s | 178.89s | 1.00 | +| **bg2050** | **102.29s** | **87.16s** | **0.85** | held out — 15% over-prediction + +**Conclusion: timings are fully explained by O(votes) + O(participants × comments).** + +## Three Bottlenecks + +### 1. `_compute_participant_info_optimized` — O(participants × groups × comments) + +Location: `conversation.py:726-904` + +Per-participant Python loop computing correlations with each group. For each of +the N participants: +- Slice participant votes: O(comments) +- For each group: mask + `np.corrcoef` on valid comments: O(comments) +- Total: O(participants × groups × comments) + +For pakistan (18K × ~3 × 9K ≈ 486M ops), this dominates. + +**Fix ideas:** +- Vectorize: compute all participant-group correlations as a single matrix operation +- Use `np.corrcoef` on the full (participants × comments) matrix at once +- Or skip participant_info entirely if it's not needed downstream + +### 2. Benchmark mode runs pipeline 3× + +`compare_with_golden(benchmark=True)` calls `compute_all_stages_with_benchmark` +which runs `compute_all_stages` **3 times** (n_runs=3). This is the main reason +`t_regression ≈ 5× t_stages` for large datasets. + +**Fix: set `benchmark=False` in test_regression.py** (or make it opt-in). + +### 3. Intermediate stages redundancy + +`compute_all_stages` runs 6 stages including intermediate ones (empty, load-only, +PCA-only, PCA+clustering, full recompute, full data export). The full recompute +already includes everything — the intermediate stages exist for granular failure +detection in `test_conversation_stages_individually`. 
+ +**Question:** Are the intermediate stages actually useful? If a regression is +detected, the stage-level test tells you WHERE, but you could also just diff +the full recompute output. Consider: +- Keep intermediate stages for small datasets only +- Or compute them lazily (only if full recompute fails) +- Or remove them entirely and rely on diff-based debugging + +## Files to Modify + +- `tests/test_regression.py` — disable benchmark, possibly skip intermediate stages +- `polismath/regression/comparer.py:62` — `compare_with_golden(benchmark=True)` default +- `polismath/regression/utils.py:40` — `compute_all_stages()` intermediate stages +- `polismath/regression/utils.py:158` — `compute_all_stages_with_benchmark()` n_runs=3 +- `polismath/conversation/conversation.py:726` — `_compute_participant_info_optimized` diff --git a/delphi/docs/PLAN_DISCREPANCY_FIXES.md b/delphi/docs/PLAN_DISCREPANCY_FIXES.md new file mode 100644 index 000000000..b01f0ca5c --- /dev/null +++ b/delphi/docs/PLAN_DISCREPANCY_FIXES.md @@ -0,0 +1,513 @@ +# Plan: TDD Approach to Fixing Python-Clojure Discrepancies + +## Context + +The Delphi Python math pipeline has 15 documented discrepancies with the Clojure reference implementation (see `deep-analysis-for-julien/07-discrepancies.md` and `deep-analysis-for-julien/09-fix-plan.md`). We need to fix them one-by-one with a TDD approach: **first extend the regression test to verify the discrepancy exists, then fix it, then verify the test passes**. + +Each fix will be a separate PR to keep reviews manageable. PRs will be **stacked** (each builds on the previous), since fixes are ordered by pipeline execution order — fixing upstream affects downstream. PRs should be clearly labeled as stacked with their dependency chain, so reviewers know the order. We can use `git rebase -i` to clean up commit history before merging to main. 
+ +**PR naming convention**: Clojure parity fix PRs use the title prefix `[Clj parity PR N]` (e.g., `[Clj parity PR 0] Per-discrepancy test infrastructure`, `[Clj parity PR 1] Fix D2: in-conv participant threshold`). + +**Prerequisite**: The current `kmeans_work` branch has changes from `edge`. These should be separated into their own PR(s) first, before we start the discrepancy fix PRs. The discrepancy fix PRs should be based on the cleaned-up branch. + +**Action**: Before starting any fix, analyze the diff of `kmeans_work` vs `upstream/edge` to understand what's changed. Group those changes into logical PR(s) — e.g., test infrastructure improvements, doc updates, minor bug fixes. Each should be reviewable independently. The discrepancy fix PRs (PR 1+) then stack on top of this clean base. + +### Session Continuity + +Because this work will span multiple Claude Code sessions, we maintain: + +1. **`delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md`** — An ongoing (committed) tracking: + - Test baseline after each PR (which tests pass/fail) + - Gotchas and edge cases discovered along the way + - Keep the detailed context here, in addition to commit messages and PR descriptions. This document serves as the single source of truth for the fix process for our work, while commit messages are for reviewers and colleagues and future maintainers. + - Notes for the next session to pick up where we left off +2. **`delphi/CLAUDE.md`** — Updated with stable patterns and conventions discovered during fixes +3. **Commit messages and PR descriptions** — All information useful for reviewers goes here (the journal should reference commit hashes, but can also duplicate content: different audiences, info can overlap, even be identical if it helps.) +4. 
**Reference docs** — `deep-analysis-for-julien/` (architecture, discrepancies, fix plan), this plan file + +### Current Test Baseline +- `test_legacy_clojure_regression.py`: **12 failed, 6 passed, 7 xfailed** (with --include-local) +- Key failures: `test_basic_outputs` (repness empty!), `test_group_clustering` (wrong k, wrong membership) +- `test_legacy_repness_comparison.py`: 4 passed (but only checks structure, not values) +- Only `biodiversity` and `vw` have Clojure math blobs; private datasets do not yet + +### Testing Principles + +- **Granular tests per discrepancy**: Not just overall regression — each fix gets its own targeted test checking the specific aspect it addresses. Multiple discrepancies may affect `test_basic_outputs`; we need to see incremental improvement per fix. +- **Targeted pipeline-stage tests**: For D2/D3 (participant filtering, clustering), check in-conv count, cluster count, and cluster memberships against Clojure blob. For D12, check comment-priorities against Clojure blob. +- **All datasets, not just biodiversity**: Every fix must pass on ALL datasets. biodiversity is just one reference among many. +- **Synthetic edge-case tests**: Every time we discover an edge case specific to one conversation, extract it into a synthetic unit test with made-up data (never real data from private datasets). These run fast and document the intent clearly. +- **E2E awareness**: GitHub Actions has Cypress E2E tests (`cypress-tests.yml`) testing UI workflows, and `python-ci.yml` running pytest regression. The Cypress tests don't test math output values directly, but `python-ci.yml` will break if clustering/repness changes. Formula-level fixes (D4, D5, D6, D7, D8, D9) are pure computation — no E2E risk. Selection logic changes (D10, D11) and priority computation (D12) could affect what the TypeScript server returns. We decide case-by-case which PRs need E2E verification. 
+ +### Datasets Available (sorted by size, smallest first) + +| Dataset | Votes | Has Clojure blob? | Notes | +|---------|------:|-------------------|-------| +| vw | 4,684 | Yes | Fastest for iteration | +| biodiversity | 29,803 | Yes | Medium-size reference | +| *(5 private datasets in real_data/.local/)* | 91K–1M | **Need to generate** | Require prodclone DB | + +**Generating missing Clojure blobs**: Use `delphi/scripts/generate_cold_start_clojure.py` which creates a temporary conversation in Postgres, runs the Clojure poller via Docker, captures the math blob, and cleans up. Requires the prodclone Postgres database to be running. This should be done as a **pre-requisite step** (PR 0 or setup task) before starting fixes. Note: this is ONLY for a full cold-start blob generation. For incremental testing (D3, D1), we need the replay infrastructure to feed votes in batches and capture intermediate blobs. See below. + +```bash +cd delphi +python scripts/generate_cold_start_clojure.py --all --include-local --pause-math +``` + +If the prodclone DB is not running, we need to start it first. Also check if the main polis worktree already has generated blobs we can copy. +**Testing strategy**: Start with `vw` (fastest), then `biodiversity`, then progressively larger datasets. Expect that some discrepancies only manifest in certain conversations (different edge cases). When we find such a case, add a synthetic unit test for that pattern. + +### TypeScript Server Dependency + +The TypeScript server (`server/src/nextComment.ts`) depends on Delphi's math outputs: +- **`comment-priorities`**: Used by `getNextPrioritizedComment()` → `selectProbabilistically()` for weighted comment routing. **Currently NOT computed by Python** — falls back to uniform random (weight=1 for all). +- **`repness`**: Consumed by `client-report/src/components/lists/participantGroups.jsx` for displaying representative comments per group in the **legacy report mode**. 
+- **`consensus`**: Consumed by `client-report/` via `normalizeConsensus.js` for consensus display. + +**Two report modes exist**: +- **Legacy report** (`/report/{report_id}`) — uses `math["repness"]`, `math["group-votes"]`, `math["group-clusters"]` from the math blob +- **Narrative report** (`/narrativeReport/{report_id}`) — uses DynamoDB UMAP/topic data, independent of math blob repness + +The formula-level fixes (D4, D5, D6, D7, D8, D9) are pure math improvements — they make the computation correct regardless of consumer. The **selection logic** fixes (D10, D11) need a legacy-mode flag since `client-report/` may depend on current behavior. Comment priorities (D12) is a net-new computation that the TypeScript server will consume. + +**Future PRs** (after parity): Implement Python versions of TypeScript routing logic (`getNextPrioritizedComment`, `selectProbabilistically`) so core computations live in Python, with TypeScript as thin routing layer. Open a GitHub Issue upstream documenting this proposal with permalinks to the relevant TypeScript code. + +--- + +## Fix Order (one PR per fix) + +Fixes are ordered by **pipeline execution order**: participant filtering → probability estimates → proportion tests → z-score thresholds → repness metrics → comment stats → selection → consensus → clustering stability → priorities. This way, after each fix, we re-run and compare up to that point. + +**After each PR**: re-check the full baseline. Document results in the journal. Some earlier fixes may resolve downstream failures. + +--- + +### PR 0: Generate Missing Clojure Blobs + Test Infrastructure Setup + +**Pre-requisite** before any fixes. + +1. Start prodclone Postgres if not running +2. Run `generate_cold_start_clojure.py --all --include-local --pause-math` to generate cold-start blobs for all private datasets +3. 
Create `tests/test_discrepancy_fixes.py` with the test infrastructure: + - One test class per discrepancy, parametrized by ALL datasets + - Shared comparison utilities that call the same core comparer logic as `regression_comparer.py` + - Synthetic edge-case test scaffolding +4. Create `delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md` with initial baseline +5. Update `delphi/CLAUDE.md` with TypeScript/Delphi connection documentation + +--- + +### PR 1: Fix D2 — In-Conv Participant Threshold + +**Why first**: This is the **first step in the pipeline** — it gates which participants enter clustering. Wrong threshold → wrong participants → wrong clusters → wrong everything downstream. + +**File**: `delphi/polismath/conversation/conversation.py` + +**Current**: ~~`threshold = 7 + sqrt(n_cmts) * 0.1`~~ → **DONE**: `threshold = min(7, n_cmts)`. +**D2c**: ~~Vote counts and `n_cmts` from `rating_mat`~~ → **DONE**: Both use `raw_rating_mat` (includes moderated-out comments). Matches Clojure's `user-vote-counts` (conversation.clj:217-225). +**D2b**: ~~Base clusters sorted by size~~ → **DONE**: Sort by k-means ID (matches Clojure's `sort-by :id`). + +**Remaining (deferred)**: +- Greedy fallback (top-15 voters if <15 qualify) — not needed for current datasets +- Cluster comparison test at participant level — deferred to after repness fixes + +--- + +### PR 1bis: Fix D2d — In-Conv Monotonicity — **DONE** (merged into PR 1) + +**Related upstream issue**: [compdemocracy/polis#2358](https://github.com/compdemocracy/polis/issues/2358) — "Non-Deterministic K-Means Clustering Due to Worker Restart". That issue focuses on `group-clusterings` not being persisted. In-conv IS persisted in Clojure (see below), but we take a different — and better — approach. + +**Clojure behavior (verified)**: +- **Monotonic**: `(as-> (or (:in-conv conv) #{}) in-conv (into in-conv ...))` (conversation.clj:244) — starts from previous set, only adds, never removes. 
+- **Persisted to DB**: `:in-conv` is included in `prep-main` (conv_man.clj:55) and written to `math_main`. On restart, `load-or-init` (conv_man.clj:196) calls `restructure-json-conv` which restores it as a set (conv_man.clj:182). So in-conv **survives worker restarts** in Clojure. +- **Why Clojure needs persistence**: Clojure uses **delta vote processing** — during normal operation, only new votes since last timestamp are fed via `keep-votes` (conversation.clj:189-190) into `raw-rating-mat`. After a restart, `load-or-init` rebuilds `raw-rating-mat` from scratch (conv_man.clj:200-203, `conv-poll` with offset 0), but during incremental updates, old votes are not re-scanned. Without persisting in-conv, a restart + moderation could drop participants whose qualifying votes are on now-moderated-out comments, because those votes might not be re-counted in delta mode. + +**Why Python doesn't need persistence — full recompute is better**: + +Python rebuilds `raw_rating_mat` from **all** votes every time (no delta processing). Since: +1. **Votes are immutable** in PostgreSQL — they can be updated (agree→disagree) but never deleted. A participant's count of "comments voted on" never decreases. +2. **`raw_rating_mat` includes votes on moderated-out comments** — moderation only affects `rating_mat` (columns removed), not `raw_rating_mat`. +3. **Therefore monotonicity is a free consequence**: if a participant had ≥7 votes at time T1, they still have ≥7 votes at time T2. No persistence needed — the DB is the persistence. +4. **Worker restart changes nothing**: on restart, `raw_rating_mat` is rebuilt from all votes → same counts → same in-conv set. No DynamoDB schema change needed. + +This is **strictly better** than Clojure's approach: simpler code, no persistence to maintain, no risk of stale/corrupt persisted state, and deterministic regardless of restart history. Clojure only needs persistence because it uses delta updates. 
+ +**CRITICAL: future-proofing for delta vote processing**: + +If the code is ever refactored to process only delta votes (for performance), in-conv **must** be persisted to DynamoDB at that point. Without full recompute, the guarantees above break: after a restart, `raw_rating_mat` would only contain new votes, and participants whose qualifying votes are all in the past would be lost. + +This must be documented: +1. **In code**: a prominent comment block on `_get_in_conv_participants()` explaining WHY full recompute makes persistence unnecessary, and that switching to delta processing REQUIRES adding persistence (with a reference to how Clojure does it: conv_man.clj:55, conversation.clj:244). +2. **In the PR description**: a dedicated section explaining the design decision, referencing issue #2358 and the Clojure persistence code. + +**File**: `delphi/polismath/conversation/conversation.py` + +**DONE**: `_get_in_conv_participants()` uses `self.raw_rating_mat` (D2c fix). Monotonicity is a free consequence. Code comment on the function documents the design decision and the delta-processing caveat. + +**Tests implemented** (T1-T5 in `TestD2dInConvMonotonicity`): +- T1: Basic monotonicity across batch updates +- T2: Survives moderation-out of voted comments +- T3: Worker restart + moderation (key delta-processing guard) +- T4: Worker restart, moderation, no new votes +- T5: Mixed participants with moderation +- T6 (greedy fallback): deferred — greedy fallback not yet implemented + +All test docstrings explain what would break under delta processing. + +--- + +### PR 2: Fix D4 — Pseudocount Formula + +**Why before D9**: The pseudocount affects probability estimates (`pa`, `pd`), which feed into proportion tests and z-scores. Fixing this first gives us correct probability values, even if the z-score thresholds are still wrong. This follows pipeline execution order: probabilities are computed before significance testing. 
+ +**File**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current**: ~~`PSEUDO_COUNT = 1.5`~~ → **DONE**: `PSEUDO_COUNT = 2.0` → `pa = (na + 1) / (ns + 2)` (Beta(2,2) prior, matching Clojure) + +**Test-first approach**: +1. Add unit test: for known (na, ns) pairs from Clojure math blob repness, verify `pa` values match +2. Compare `pa`/`pd` values for specific (group, comment) pairs across ALL datasets +3. Fix: Change `PSEUDO_COUNT = 2.0` +4. Document which repness values improved and which are still off (due to remaining D5/D6/D7/D9) + +--- + +### PR 3: Fix D9 — Z-Score Significance Thresholds + +**Why next**: Now that probabilities are correct (D4 fixed), we fix the significance gates. This is likely why `comment_repness` is empty — Python requires 28% higher z-scores. + +**File**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current**: `Z_90 = 1.645` (two-tailed), `Z_95 = 1.96` (two-tailed) +**Target**: `Z_90 = 1.2816` (one-tailed), `Z_95 = 1.6449` (one-tailed) — matching Clojure + +**Note**: Python's own `stats.py` already uses 1.2816. This fix resolves an internal inconsistency. + +**Test-first approach**: +1. Add test checking repness produces non-empty `comment_repness` for ALL datasets +2. Add test comparing repness values (pa, pat, ra, rat, overall metric) to Clojure — expect partial match (D5/D6/D7 still unfixed, so z-scores will differ, but more comments should now pass significance) +3. Fix: Change constants +4. Re-run full suite — `test_basic_outputs` may now pass +5. Document which repness values now match Clojure and which still differ + +--- + +### PR 4: Fix D5 — Proportion Test Formula + +**Why next**: With correct probabilities (D4) and thresholds (D9), now fix the proportion test formula itself. This changes ALL `pat`/`pdt` z-scores. 
+ +**File**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Analysis**: + +| Aspect | Python (current) | Clojure (target) | +|--------|-----------------|------------------| +| Formula | `(p - 0.5) / sqrt(0.25/n)` | `2*sqrt(n+1)*((succ+1)/(n+1) - 0.5)` | +| Type | Standard z-test | Wilson-score-like with built-in continuity correction | +| Pseudocount | None in test (applied separately) | Built-in Beta(1,1) prior: `(succ+1)/(n+1)` | +| Small n | Can produce extreme z-scores | Regularized — `+1` terms shrink extremes | + +The Clojure formula is preferable for Polis: small group sizes are common, and the built-in regularization prevents spurious significance. + +**Code comments**: Check Clojure source (`stats.clj`) for explanatory comments and copy them to Python. Add clear docstring explaining what this test does, why the Clojure formula is used, and how it differs from a standard z-test. + +**Signature change**: `prop_test(p, n, p0)` → `prop_test(succ, n)`. All callers need updating including `prop_test_vectorized`. + +**Test-first approach**: +1. Unit tests: for known (succ, n) pairs, compare to Clojure formula output +2. Compare pat/pdt values against Clojure blob for ALL datasets +3. Fix: Replace both scalar and vectorized implementations, update all callers +4. Document improvements + +--- + +### PR 5: Fix D6 — Two-Proportion Test Adjustment + +**Why next**: Similar to D5, this fixes the group-vs-other z-scores (`rat`/`rdt`). + +**File**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current**: Standard two-proportion z-test, no pseudocounts +**Target**: Add +1 pseudocount to all 4 inputs (succ1, n1, succ2, n2), matching Clojure + +**Signature change**: From proportions to counts. All callers (including `two_prop_test_vectorized`) need updating. + +**Test-first approach**: +1. Unit tests with known values, compare to Clojure formula +2. Compare rat/rdt values against Clojure blob for ALL datasets +3. 
Fix: Add pseudocounts, change signatures, update callers + +--- + +### PR 6: Fix D7 — Repness Metric Formula + +**Why next**: With all underlying statistics now correct (D4, D5, D6, D9), fix the metric that ranks comments. + +**File**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Analysis**: + +| Aspect | Python (current) | Clojure (target) | +|--------|-----------------|------------------| +| Formula | `pa * (\|pat\| + \|rat\|)` | `ra * rat * pa * pat` | +| Type | Weighted sum of 2 absolute values | Product of 4 signed values | +| Zero sensitivity | Tolerant | Strict — any factor near 0 kills metric | +| Behavior | High probability OR high test score | Requires ALL dimensions to be strong | + +The Clojure product formula is more conservative and arguably better for finding truly representative comments. + +**Possibly unintentional change**: The Python formula may have been an LLM hallucination during the Clojure→Python port. Team has been asked to confirm. **Preserve the current formula behind a flag** with a TODO comment to delete or keep once confirmed. Default to Clojure formula for regression tests. + +**Test-first approach**: +1. Extract Clojure repness scores from math blob for ALL datasets +2. Compare agree_metric/disagree_metric per (group, comment) +3. Fix: Change to `ra * rat * pa * pat` (both scalar and vectorized), keep old formula behind flag +4. Verify ranking order matches Clojure + +--- + +### PR 7: Fix D8 — Finalize Comment Stats Logic + +**File**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current**: `if pa > 0.5 AND ra > 1.0 → 'agree'; elif pd > 0.5 AND rd > 1.0 → 'disagree'` +**Target**: Simple `rat > rdt → 'agree'; else → 'disagree'` (matching Clojure) + +**Test-first approach**: +1. Test: verify `repful` classification matches Clojure across ALL datasets +2. 
Fix: Replace threshold checks with `rat > rdt` (both scalar and vectorized) + +--- + +### PR 8: Fix D10 — Representative Comment Selection + +**File**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Pre-investigation needed**: Dig into `client-report/src/components/lists/participantGroups.jsx` to understand exactly how it consumes `repness`. The legacy report uses `math["repness"][gid]`, the narrative report uses DynamoDB topics. Determine if changing selection logic (3+2 → 5 total, agrees first) would break the legacy report display. + +**Approach**: Implement Clojure's selection as default, keep current selection behind `DELPHI_LEGACY_REPNESS_SELECTION=true` env var (update `example.env`). TODO comment: remove legacy mode once verified in production. + +**Test-first approach**: +1. Compare selected rep comments per group to Clojure's repness output for ALL datasets +2. Fix: Match Clojure's single-pass with beats-best-by-test, up to 5 total (agrees first) + +--- + +### PR 9: Fix D11 — Consensus Comment Selection + +**File**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current**: ALL groups `pa > 0.6`, top 2 overall +**Target**: Per-comment `pa > 0.5`, top 5 agree + 5 disagree with z-test scores + +Same legacy-mode flag approach as D10. TODO comment + GitHub Issue (upstream) proposing consolidation of all math into Python, with permalinks to the TypeScript code that would be simplified. + +**Test-first approach**: +1. Compare consensus comments to Clojure's `consensus` blob key for ALL datasets +2. Fix: Match Clojure's logic + +--- + +### PR 10: Fix D3 — K-Smoother Buffer + +**File**: `delphi/polismath/conversation/conversation.py` + +**This is a temporal stability feature**: Clojure requires k to be best for 4 consecutive updates before switching. Without it, groups flicker. + +**WARNING**: The k-smoother makes `k` depend on the full history of updates, which we don't have in cold-start math blobs. Cold-start blobs are single-shot. 
Therefore: + +- **Cold-start test**: Verify the smoother doesn't change cold-start behavior (buffer=4 means first k is always accepted) +- **Incremental test**: Feed votes in batches, verify k doesn't flicker — Python-only test, no Clojure comparison possible without replay infrastructure + +If we discover that cold-start fixes can't be completed without warm-start testing, we'll need the replay infrastructure earlier (see Replay Infrastructure section below). + +**Test approach**: +1. Cold-start: verify same results as without smoother (first run always accepts k) +2. Incremental: feed votes in batches, assert k stability +3. Fix: Add `group_k_smoother` state with buffer=4 + +--- + +### PR 11: Fix D12 — Comment Priorities + +**Files**: `delphi/polismath/conversation/conversation.py`, `delphi/polismath/pca_kmeans_rep/pca.py` + +**Critical for production**: Without comment priorities, TypeScript server falls back to uniform random selection. Currently `_compute_votes_base()` is dead code with a bug (line 1076: invalid pandas syntax). + +This replaces the reading from the math blob with a proper Python computation matching Clojure. + +**Test-first approach**: +1. Remove xfail from existing `test_comment_priorities` +2. Extract `comment-priorities` from Clojure math blob, compare rankings (Spearman correlation) for ALL datasets +3. Implement: comment projection/extremity in PCA, `importance_metric`, `priority_metric`, full computation +4. Fix buggy `_compute_votes_base()` method + +--- + +### PR 12: Fix D15 — Moderation Handling + +**File**: `delphi/polismath/conversation/conversation.py` + +Clojure zeros out moderated comments (keeps structure, sets values to 0). Python removes them entirely. Affects matrix dimensions and vote counts. + +**Test-first approach**: +1. Add test with dataset containing moderated comments, compare behavior +2. 
Fix or document the difference + +--- + +### PR 13: Fix D1/D1b — PCA Sign Flip Prevention + +**Files**: `delphi/polismath/pca_kmeans_rep/pca.py` + +**This MUST be fixed**: We cannot afford sign flips between updates — they cause participants to appear to jump to the opposite side of the visualization. + +**Approach**: +1. Add a test that detects sign flips (feed incremental votes, check projection consistency) +2. Implement sign-consistency logic: after computing PCA, compare to previous components and flip signs to maintain consistency +3. Consider `IncrementalPCA` for warm-starting (also addresses D14 for free) + +This is non-trivial and should be one of the last fixes. + +--- + +### PR 14: Cleanup — Remove Dead Code + +**Files**: Multiple (see `08-dead-code.md`) +- Custom kmeans chain in `clusters.py` +- Non-vectorized repness functions in `repness.py` +- Buggy `_compute_votes_base()` (after D12 replaces it) +- `stats.py` inconsistencies (after D9 makes `repness.py` authoritative) + +By this point, we should have good test coverage from all the per-discrepancy tests, giving confidence that dead code removal won't break anything. Still run full regression suite after cleanup. + +--- + +### Explicitly Deferred + +- **D13 — Subgroup Clustering**: Not implemented in Python, never used by TypeScript consumers. No fix needed. +- **D14 — Large Conversation Optimization**: Clojure needed mini-batch PCA because it was slow. Python's sklearn SVD is fast enough. No fix needed unless we adopt `IncrementalPCA` for sign consistency (PR 13). 
+ +--- + +### Future PRs (after parity) + +- **Python implementation of TypeScript routing logic**: `getNextPrioritizedComment`, `selectProbabilistically`, returning a pre-computed list of next comments so TypeScript becomes thin routing layer +- **GitHub Issue**: Document proposal to consolidate math in Python with permalinks to TypeScript code + +--- + +## Discrepancy Coverage Checklist + +| ID | Discrepancy | PR | Status | +|----|-------------|-----|--------| +| D1 | PCA sign flips | PR 13 | Fix (sign consistency) | +| D1b | Projection input | PR 13 | Fix with D1 | +| D2 | In-conv threshold | **PR 1** | **DONE** ✓ | +| D2b | Base-cluster sort order | **PR 1** | **DONE** ✓ | +| D2c | Vote count source (raw vs filtered matrix) | **PR 1** | **DONE** ✓ | +| D2d | In-conv monotonicity (once in, always in) | **PR 1** | **DONE** ✓ (5 guard tests, T1-T5) | +| D3 | K-smoother buffer | PR 10 | Fix | +| D4 | Pseudocount formula | **PR 2** | **DONE** ✓ | +| D5 | Proportion test | PR 4 | Fix | +| D6 | Two-proportion test | PR 5 | Fix | +| D7 | Repness metric | PR 6 | Fix (with flag for old formula) | +| D8 | Finalize cmt stats | PR 7 | Fix | +| D9 | Z-score thresholds | **PR 3** | Fix | +| D10 | Rep comment selection | PR 8 | Fix (with legacy env var) | +| D11 | Consensus selection | PR 9 | Fix (with legacy env var) | +| D12 | Comment priorities | PR 11 | Fix (implement from scratch) | +| D13 | Subgroup clustering | — | **Deferred** (unused) | +| D14 | Large conv optimization | — | **Deferred** (Python fast enough) | +| D15 | Moderation handling | PR 12 | Fix | + +--- + +## Test Infrastructure + +### `tests/test_discrepancy_fixes.py` — New test file + +**Two types of tests per discrepancy**: + +1. **Real-data parametrized tests**: One test class per discrepancy, parametrized by ALL datasets. Loads conversation + Clojure math blob, checks specific aspect, designed to FAIL before fix and PASS after. + +2. 
**Synthetic edge-case tests**: When we discover an edge case in a real conversation, extract the pattern into a test with completely made-up data (never real data from private datasets). These run fast, document the edge case clearly, and prevent regressions. + +**All datasets must have cold start Clojure blobs** — generate missing ones using `generate_cold_start_clojure.py` before starting fixes. + +### Shared comparison logic + +Both `pytest` tests and `regression_comparer.py` should use the **same core comparison logic**. The pytest tests call it with assertions; the script calls it with report generation. This ensures consistency and reduces duplication. Extend `ClojureComparer` with methods like: +- `compare_in_conv_participants()` — participant set comparison +- `compare_repness_values()` — per-(group, comment) metric comparison +- `compare_consensus()` — consensus comment overlap +- `compare_priorities()` — priority ranking correlation (Spearman) + +### Replay Infrastructure (separate PRs, needed for D3/D1) + +Three separate PRs for temporal/incremental testing: + +**Replay PR A**: Core replay infrastructure +- Script that takes a conversation's votes CSV, splits into batches of configurable size +- For each batch: feed votes to Python `Conversation.update()`, record state +- For each batch: feed votes to Clojure via DB + poller, record math blob +- Compare Python vs Clojure state after each batch +- Requires separate test Postgres database (not touching main environment) +- Generation can be done once but must be scripted for reproducibility + +**Replay PR B**: Use replay infrastructure for tests +- Test D3 (k-smoother stability) with real incremental data +- Test D1 (PCA sign consistency) with real incremental data +- Test D2 (in-conv threshold) on incremental blobs: currently xfailed because early + participants (low PIDs) were admitted when `n_cmts` was still < 7 during early + iterations, making the threshold equal to `n_cmts` rather than 7. 
All four datasets + with both blob types exhibit this (1–2 extra participants each). Matching incremental + behaviour requires simulating the progressive threshold evaluation. +- Compare Python's incremental behavior to Clojure's + +**Replay PR C**: Visualization movie generation +- Adapt existing static visualization scripts (`delphi/scripts/visualize_cluster_comparison.py`) for temporal replay +- Generate frame-by-frame visualizations showing conversation evolution +- Fixed x/y ranges across all frames for consistent animation +- Useful for debugging, tutorials, and showcasing + +--- + +## Dataset Size Strategy + +When iterating on a fix: +1. Start with **vw** (4.7K votes) — fastest feedback loop +2. Then **biodiversity** (30K votes) — another reference +3. Then private datasets (91K–1M) — progressively larger +4. Full suite with ALL datasets only after fix is stable on smaller ones + +All datasets are equally important. No single dataset is "the main" reference — each may expose different edge cases. + +### On Test Redundancy: `pytest` vs `regression_comparer.py` + +**Keep both** — they serve different purposes: + +| | `pytest` tests | `regression_comparer.py` | +|---|---|---| +| **Purpose** | Binary pass/fail | Detailed human-readable reports | +| **CI-friendly** | Yes | No (interactive use) | +| **When** | After each change (automated) | Investigating failures (manual) | + +**Critical**: Both must use the same core comparison logic (shared in `polismath/regression/`), so updating one automatically benefits the other. + +### Golden snapshot updates + +After each fix: `uv run python scripts/regression_recorder.py` + +--- + +## Verification After Each PR + +1. Run targeted discrepancy test on **vw**: `uv run pytest tests/test_discrepancy_fixes.py -k --datasets=vw` +2. If pass, run on **biodiversity**: `--datasets=biodiversity` +3. Run on **all datasets**: `--include-local` +4. 
Run full legacy suite: `uv run pytest tests/test_legacy_clojure_regression.py --include-local` — no new failures +5. Run full regression suite: `uv run pytest tests/test_regression.py --include-local` — re-record golden snapshots if needed +6. Optional deep inspection: `uv run python scripts/regression_comparer.py --include-local` +7. Document updated baseline in `CLJ-PARITY-FIXES-JOURNAL.md` +8. Write clear commit message and PR description with test results diff --git a/delphi/docs/deep-analysis-for-julien/01-overview-and-architecture.md b/delphi/docs/deep-analysis-for-julien/01-overview-and-architecture.md new file mode 100644 index 000000000..35da75cf7 --- /dev/null +++ b/delphi/docs/deep-analysis-for-julien/01-overview-and-architecture.md @@ -0,0 +1,159 @@ +# Polis Math Pipeline: Overview and Architecture + +## Document Index + +This analysis is split across multiple files for manageability: + +1. **01-overview-and-architecture.md** (this file) — High-level architecture, data flow, storage +2. **02-pca-analysis.md** — PCA implementations (Clojure vs Python), mathematical formulas +3. **03-clustering-analysis.md** — K-means clustering, silhouette, k-selection +4. **04-repness-analysis.md** — Representativeness metrics, statistical tests, consensus +5. **05-participant-filtering.md** — In-conv logic, comment priorities, vote structures +6. **06-comment-routing.md** — TypeScript comment routing, topical vs prioritized +7. **07-discrepancies.md** — All Clojure vs Python discrepancies (THE KEY DOCUMENT) +8. **08-dead-code.md** — Dead/unreachable code identified +9. **09-fix-plan.md** — Plan to bring Python to Clojure parity + +--- + +## 1. 
High-Level Architecture + +The Polis math pipeline takes raw participant votes on comments and produces: + +- **PCA projections**: 2D coordinates for each participant in opinion space +- **Clusters**: Hierarchical grouping (base clusters → group clusters → optional subgroups) +- **Representativeness**: Which comments best characterize each group +- **Consensus**: Comments that all groups broadly agree on +- **Comment priorities**: Scores used to route the next comment to a participant + +### 1.1 Two Implementations + +| Aspect | Legacy (Clojure) | Delphi (Python) | +|--------|-----------------|-----------------| +| Location | `math/src/polismath/math/` | `delphi/polismath/` | +| Framework | Plumbing Graph (DAG computation) | Imperative class methods | +| Matrix repr | Custom `NamedMatrix` | pandas `DataFrame` | +| PCA method | Power iteration (custom) | sklearn SVD | +| Status | **CORRECT reference** | Has behavioral gaps | + +### 1.2 Computation Flow + +**Clojure** (`conversation.clj` lines 136–702): +``` +votes → customs (cap ptpts/cmts) + → raw-rating-mat (NamedMatrix) + → rating-mat (zero out mod-out columns) + → mat (replace nil with column means) + → pca (power iteration) + → proj (sparsity-aware projection) + → proj-nmat + → in-conv (filter participants) + → base-clusters (k-means, k=100) + → base-clusters-proj → bucket-dists + → group-clusterings (k=2..max-k) + → group-clusterings-silhouettes + → group-k-smoother (buffer before switching k) + → group-clusters + → subgroup-clusterings → subgroup-clusters + → votes-base → group-votes → subgroup-votes + → comment-priorities + → group-aware-consensus + → repness, subgroup-repness + → consensus + → ptpt-stats, subgroup-ptpt-stats +``` + +**Python** (`conversation.py`, `Conversation` class): +``` +update_votes() → raw_rating_mat (DataFrame) + → _apply_moderation() → rating_mat + → _compute_vote_stats() + → recompute(): + → _compute_pca() + → _compute_clusters() + → _compute_repness() + → 
_compute_participant_info() +``` + +### 1.3 Key Architectural Differences + +1. **Plumbing Graph vs Imperative**: Clojure uses a declarative computation graph where each node declares its dependencies. Python uses sequential method calls. + +2. **Immutability**: Clojure's pipeline is functionally pure — each step produces new data. Python's `Conversation` class mutates `self` attributes. + +3. **Missing in Python**: subgroup clustering, comment priorities computation, large-conv mini-batch PCA, k-smoother buffer, proper consensus selection (Clojure's `consensus-stats` + `select-consensus-comments`). + +--- + +## 2. Data Storage + +### 2.1 PostgreSQL (Source of Truth) + +**Read by both implementations:** + +| Table | Purpose | Key columns | +|-------|---------|-------------| +| `votes` | Individual vote records | `zid, pid, tid, vote, created` | +| `comments` | All comments | `zid, tid, txt, mod, active, is_seed` | +| `math_main` | Serialized math results | `zid, data, math_tick, caching_tick` | +| `math_ticks` | Update tracking | `zid, math_env` | +| `worker_tasks` | Task queue | `zid, math_env, task_type` | + +**Vote sign convention at the boundary:** +- Postgres stores: `AGREE = -1`, `DISAGREE = 1` (historical Polis convention) +- Delphi internal: `AGREE = 1`, `DISAGREE = -1` (standard convention) +- `postgres.py` line ~: `postgres_vote_to_delphi()` flips signs at the boundary + +### 2.2 DynamoDB (Delphi Output) + +Delphi writes results to these DynamoDB tables (`dynamodb.py`): + +| Table | Content | +|-------|---------| +| `Delphi_PCAConversationConfig` | PCA config (center, components) | +| `Delphi_PCAResults` | Summary results per conversation | +| `Delphi_KMeansClusters` | Cluster assignments and centers | +| `Delphi_CommentRouting` | Comment priority scores for routing | +| `Delphi_RepresentativeComments` | Per-group representative comments | +| `Delphi_PCAParticipantProjections` | Per-participant 2D projections | + +### 2.3 Clojure Math Storage + +Clojure 
serializes its entire conversation state (including all computed fields) into the `math_main` table as a single large JSON blob under the `data` column. The TypeScript server reads this blob via `getPca(zid, 0)` to extract `comment-priorities` for routing. + +--- + +## 3. Pipeline Entry Points + +### 3.1 Clojure + +- **`conv_man.clj`**: Conversation manager, polls for new votes +- **`conversation.clj:conv-update`** (line 766): Dispatches to `small-conv-update` or `large-conv-update` based on: + - `ptpt-cutoff = 10000` participants + - `cmt-cutoff = 5000` comments +- `small-conv-update` = `eager-profiled-compiler(small-conv-update-graph)` +- `large-conv-update` = same graph but with mini-batch PCA override + +### 3.2 Python (Delphi) + +- **`poller.py`**: Polls Postgres for new votes/moderation/tasks on separate threads +- **`run_math_pipeline.py`**: CLI tool for one-shot processing +- **`manager.py`**: Thread-safe management of multiple `Conversation` objects +- **`conversation.py:Conversation.update_votes()`**: Main entry, calls `recompute()` + +### 3.3 Default Configuration (Clojure) + +From `conversation.clj` lines 142–152: +```clojure +{:n-comps 2 + :pca-iters 100 + :base-iters 100 + :base-k 100 + :max-k 5 + :group-iters 100 + :max-ptpts 100000 + :max-cmts 10000 + :group-k-buffer 4} +``` + +Python matches `BASE_K=100`, `MAX_K=5`, but is missing `group-k-buffer` entirely. diff --git a/delphi/docs/deep-analysis-for-julien/02-pca-analysis.md b/delphi/docs/deep-analysis-for-julien/02-pca-analysis.md new file mode 100644 index 000000000..0703e28d6 --- /dev/null +++ b/delphi/docs/deep-analysis-for-julien/02-pca-analysis.md @@ -0,0 +1,204 @@ +# PCA Implementation Analysis + +## 1. Mathematical Foundation + +Both implementations perform PCA on the vote matrix `V` of shape `(n_participants × n_comments)` where entries are `{-1, 0, 1, NaN}`. The goal is to find a 2D projection that captures maximum variance in participant opinions. 
+ +### 1.1 Missing Data Imputation + +Both implementations replace `NaN` (unvoted) entries with column means before PCA: + +**Clojure** (`conversation.clj` lines 356–377): +```clojure +;; replace? nil? ← only replaces nil, not 0 (pass votes) +column-averages = mean of non-nil values per column +mat[i][j] = column-averages[j] if mat[i][j] is nil +``` + +**Python** (`pca.py` lines 42–50): +```python +col_means = np.nanmean(matrix_data, axis=0) +nan_indices = np.where(np.isnan(matrix_data)) +matrix_data_no_nan[nan_indices] = col_means[nan_indices[1]] +``` + +**Status: MATCH** — Both impute with column means. Clojure replaces `nil?` (not `0`), Python replaces `NaN`. Since pass votes are stored as `0` in Clojure and also `0` in Python (with `NaN` for unvoted), these are equivalent. + +--- + +## 2. PCA Computation + +### 2.1 Clojure: Power Iteration (`pca.clj`) + +The Clojure implementation uses **power iteration** (also called the power method) to find eigenvectors of the covariance matrix iteratively. + +**Algorithm** (`pca.clj:powerit-pca`): + +Given centered data matrix `X` (n × p), find top-k eigenvectors: + +``` +For each component i = 1..k: + v_i ← random unit vector of dimension p + For iter = 1..100: + v_i ← X^T · X · v_i (multiply by covariance matrix) + v_i ← v_i - Σ_{j<i} (v_i·v_j) v_j (deflate: remove projections onto previously found components) + v_i ← v_i / ||v_i|| (normalize) +``` + +For large conversations (>10K participants), Clojure uses mini-batch PCA with learning rate 0.01; Python has no such optimization. + +--- + +## 3. Sparsity-Aware Projection + +Both implementations apply identical sparsity-aware scaling to projections. This is critical for fair treatment of participants who have voted on few comments.
+ +### 3.1 Mathematical Formula + +For participant `i` with projection `p_i = [p_{i,1}, p_{i,2}]`: + +``` +n_votes_i = number of non-NaN entries in row i +n_comments = total number of comments +scaling_i = sqrt(n_comments / max(n_votes_i, 1)) +scaled_projection_i = p_i × scaling_i +``` + +Participants who have voted on fewer comments get **pushed outward** (larger scaling factor), compensating for the bias toward the center caused by mean imputation. + +### 3.2 Clojure (`pca.clj:sparsity-aware-project-ptpt`, lines 134–157) + +```clojure +(defn sparsity-aware-project-ptpt [votes {:keys [comps center]}] + (let [n-cmnts (count votes) + [pc1 pc2] comps + [n-votes p1 p2] + (reduce + (fn [[n-votes p1 p2] [x-n cntr-n pc1-n pc2-n]] + (if x-n ;; if voted (non-nil) + (let [x-n' (- x-n cntr-n)] ;; subtract center + [(inc n-votes) (+ p1 (* x-n' pc1-n)) (+ p2 (* x-n' pc2-n))]) + [n-votes p1 p2])) ;; skip if nil (unvoted) + [0 0.0 0.0] + (zip votes center pc1 pc2))] + (* (Math/sqrt (/ n-cmnts (max n-votes 1))) [p1 p2]))) +``` + +**Critical detail**: Clojure projects using the RAW votes with nils. Unvoted entries are **skipped** in the dot product (contribute 0). The projection uses the `rating-mat` (raw matrix with nils), NOT the imputed `mat`. + +### 3.3 Python (`pca.py` lines 96–102) + +```python +n_cmnts = matrix_data.shape[1] +n_seen = np.sum(~np.isnan(matrix_data), axis=1) +n_seen_safe = np.maximum(n_seen, 1) +proportions = np.sqrt(n_seen_safe / n_cmnts) +scaled_projections = projections / proportions[:, np.newaxis] +``` + +**IMPORTANT**: Python divides by `sqrt(n_seen/n_cmnts)`, which equals multiplying by `sqrt(n_cmnts/n_seen)`. The scaling is mathematically equivalent to the Clojure formula. 
+ +### 3.4 DISCREPANCY: Projection Input + +The scaling is equivalent, but the **projection computation** differs subtly: + +| | Clojure | Python | +|---|---------|--------| +| Projection input | Raw votes (nils skipped in dot product) | Fully imputed matrix (NaN → col mean) | +| Effect of unvoted | Contributes 0 to dot product | Contributes `(col_mean - center) × loading` | + +Since `center ≈ col_mean` (both computed on the imputed matrix), the unvoted contribution in Python is approximately 0. The two approaches produce very similar but not identical results. The difference grows when the distribution of voters per comment is highly skewed. + +**Status: NEAR-MATCH** — Scaling equivalent; projection input differs subtly. + +--- + +## 4. Comment Projection and Extremity + +**Clojure only** (`conversation.clj` lines 338–349): + +Clojure also projects comments into the PCA space and computes their "extremity" (distance from origin): + +```clojure +cmnt-proj = pca-project-cmnts(pca) ;; project each comment onto PC axes +cmnt-extremity[j] = ||cmnt-proj[j]|| ;; L2 norm of comment's projection +``` + +This extremity is used in the `priority-metric` for comment routing. + +**Python**: Does NOT compute comment projection or extremity. This is a **missing feature** that affects comment priority computation. + +--- + +## 5. Large Conversation Handling + +### 5.1 Clojure Mini-Batch PCA (`conversation.clj` lines 706–755) + +For conversations with >10K participants or >5K comments, Clojure uses **mini-batch PCA**: + +``` +sample_size = interpolate(n_ptpts, start=(100,1500), stop=(1500,150000)) +For iter = 1..pca_iters: + indices = random_sample(range(n_ptpts), size=sample_size) + subset = mat[indices, :] + part_pca = powerit_pca(subset, start_vectors=current_pca.comps, iters=10) + pca.center = 0.99 * pca.center + 0.01 * part_pca.center + pca.comps = 0.99 * pca.comps + 0.01 * part_pca.comps +``` + +The learning rate of 0.01 ensures gradual adaptation. 
The `sample-size-fn` linearly interpolates between (100 samples at 1500 participants) and (1500 samples at 150000 participants). + +### 5.2 Python + +**No large-conversation optimization exists.** Python always runs full sklearn PCA on the entire matrix. This could become a performance bottleneck for very large conversations. + +**Status: MISSING in Python** diff --git a/delphi/docs/deep-analysis-for-julien/03-clustering-analysis.md b/delphi/docs/deep-analysis-for-julien/03-clustering-analysis.md new file mode 100644 index 000000000..c7066184c --- /dev/null +++ b/delphi/docs/deep-analysis-for-julien/03-clustering-analysis.md @@ -0,0 +1,249 @@ +# Clustering Analysis + +## 1. Two-Level Hierarchical Clustering + +Both implementations use a two-level clustering approach: + +``` +Level 1: Participants → ~100 base clusters (k-means on PCA projections) +Level 2: Base clusters → 2-5 group clusters (weighted k-means on base cluster centers) +``` + +The base clustering reduces the computational cost of the group-level clustering while preserving the distribution of participants in opinion space. + +--- + +## 2. K-Means Implementation + +### 2.1 Clojure (`clusters.clj`) + +**Initialization** (`init-clusters`): +```clojure +;; Take first k distinct rows from the NamedMatrix in encounter order +(defn init-clusters [nmat k] + (->> (nm/get-matrix nmat) + distinct + (take k) + (mapv (fn [row] {:center row :members [] :id nil})))) +``` + +**Iteration** (`kmeans`): +``` +1. Initialize clusters (init-clusters or clean-start-clusters if previous exists) +2. Repeat up to max-iters: + a. Assign each point to nearest cluster center (Euclidean distance) + b. Update centers as weighted mean of members + c. Remove empty clusters + d. If converged (same clustering as previous step, threshold=0.01), stop +3. Sort by :id +``` + +**Clean-start** (`clean-start-clusters`): When previous clusters exist: +``` +1. Start with old cluster centers +2. 
Assign all current data points to nearest old center +3. If need more clusters: split largest cluster using most-distal point +4. If need fewer: merge closest pair +5. Run k-means from these starting positions +``` + +**Convergence** (`same-clustering?`, lines 68–76): Two clusterings are "same" if EVERY pairwise center distance is < threshold (0.01 default). Centers are sorted before comparison. + +### 2.2 Python (`clusters.py`) + +Python has TWO k-means implementations: + +**Custom `kmeans()`** (lines 383–419): Mirrors Clojure's approach: +```python +clusters = clean_start_clusters(data, k, last_clusters) +for _ in range(max_iters): + new_clusters = cluster_step(data, clusters, weights) + if same_clustering(clusters, new_clusters): + break + clusters = new_clusters +``` + +**sklearn `kmeans_sklearn()`** (lines 595–665): Used in the actual pipeline: +```python +init = _get_first_k_distinct_centers(data, k) # match Clojure init +kmeans = KMeans(n_clusters=k, init=init, n_init=1, algorithm='lloyd') +kmeans.fit(data, sample_weight=weights) +``` + +**Which is actually used?** The `_compute_clusters()` method in `conversation.py` (line 576) calls `kmeans_sklearn()`, NOT the custom `kmeans()`. The custom implementation is **dead code**. + +--- + +## 3. Silhouette Coefficient + +The silhouette coefficient measures clustering quality. For each point `i`: + +``` +a(i) = mean distance to other points in same cluster +b(i) = min over other clusters C: mean distance to points in C +s(i) = (b(i) - a(i)) / max(a(i), b(i)) +``` + +Overall silhouette = mean of all s(i). Range: [-1, 1]. Higher is better. 
+ +### 3.1 Clojure (`clusters.clj:silhouette`) + +Custom implementation using a **named distance matrix** (`named-dist-matrix`): +```clojure +;; Precomputes pairwise distances between all base cluster centers +;; Then computes silhouette using those distances +(defn silhouette [dist-matrix clustering] + ;; For each cluster, for each member: + ;; a = avg distance to same-cluster members + ;; b = min avg distance to other-cluster members + ;; s = (b - a) / max(a, b) + ;; Return mean of all s values + ) +``` + +### 3.2 Python + +**Custom `silhouette()`** (lines 444–503): Direct implementation matching Clojure logic. + +**sklearn `calculate_silhouette_sklearn()`** (lines 668–686): Wrapper around `sklearn.metrics.silhouette_score`. + +The actual pipeline uses `calculate_silhouette_sklearn()` (`conversation.py` line 633). + +--- + +## 4. Optimal k Selection + +### 4.1 Clojure: k-smoother with Buffer + +**Max k calculation** (`conversation.clj` lines 271–276): +```clojure +(defn max-k-fn [data max-max-k] + (min max-max-k + (+ 2 (int (/ (count (nm/rownames data)) 12))))) +;; max-max-k defaults to 5 +;; Example: 100 base clusters → max-k = min(5, 2 + 100/12) = min(5, 10) = 5 +``` + +**k-smoother** (`conversation.clj` lines 452–468): +``` +State: {last-k, last-k-count, smoothed-k} +Algorithm: + this-k = argmax_k silhouette(k) # best k for current data + if this-k == last-k: + this-k-count = last-k-count + 1 + else: + this-k-count = 1 + if this-k-count >= group-k-buffer (default 4): + smoothed-k = this-k # switch to new k + else: + smoothed-k = previous smoothed-k # keep old k +``` + +**Purpose**: Prevents the number of groups from flickering between values. A new k must be the best for **4 consecutive updates** before the system switches to it. 
+ +### 4.2 Python: Direct Best Silhouette + +**Max k** (`conversation.py` line 614): +```python +max_k = min(MAX_K, 2 + len(base_clusters) // 12) +max_k = max(2, min(max_k, len(base_clusters))) +``` + +**k selection** (`conversation.py` lines 624–641): +```python +best_k = 2 +best_score = -1 +for k in range(2, max_k + 1): + labels, centers, members = kmeans_sklearn(base_centers, k=k, weights=base_weights) + score = calculate_silhouette_sklearn(base_centers, labels) + if score > best_score: + best_score = score + best_k = k +``` + +### 4.3 DISCREPANCY: No k-smoother in Python + +Python picks the best k by silhouette **every single update** without any buffering. This means: + +1. Groups can flicker between k=2 and k=3 (or other values) on consecutive updates +2. Participants may see their group assignment change frequently +3. The visualization becomes unstable + +**Impact**: HIGH — This is a user-visible instability. + +--- + +## 5. Subgroup Clustering + +### 5.1 Clojure (`conversation.clj` lines 480–570) + +For each group cluster, Clojure runs another round of clustering: +``` +For each group g: + Get base clusters belonging to g + For k = 2..max-k: + Run k-means on those base cluster centers + Apply k-smoother (same buffer logic) + Store best subgroup clustering +``` + +This produces a second level of hierarchy: groups → subgroups. + +### 5.2 Python + +**No subgroup clustering exists.** The `subgroup_clusters` attribute is initialized to `{}` and never populated. + +**Status: MISSING in Python** + +--- + +## 6. `determine_k()` Function — Python Dead Code + +Python has a `determine_k()` function (`clusters.py` lines 689–726) that uses a logarithmic heuristic: +```python +if n_rows < 10: return 2 +if n_rows >= 500: k = 2 + int(min(1, log2(n_rows) / 10)) +else: k = 2 + int(min(2, log2(n_rows) / 5)) +``` + +This function is **NOT used** in the actual pipeline. The pipeline uses the Clojure-matching `max_k` formula with silhouette selection. 
`determine_k()` is only called from `cluster_dataframe()`, which itself is not used in the main pipeline (the pipeline calls `kmeans_sklearn()` directly). + +--- + +## 7. Cluster Output Format + +### 7.1 Clojure + +Group clusters are sorted by `:id` and contain: +```clojure +{:id 0 + :center [x, y] + :members [bid1, bid2, ...] ;; base cluster IDs + :count n} +``` + +Base clusters contain: +```clojure +{:id 0 + :center [x, y] + :members [pid1, pid2, ...] ;; participant IDs + :count n} +``` + +### 7.2 Python + +Both levels use the same dict format: +```python +{'id': 0, 'center': [x, y], 'members': [...]} +``` + +Group clusters are sorted by size (descending) and re-assigned sequential IDs. This matches Clojure's behavior of sorting by `:id` after creation. + +### 7.3 Base Cluster Folding + +For serialization, Clojure uses a "folded" format: +```clojure +{:id [0 1 2 ...], :members [[...] [...] ...], :x [...], :y [...], :count [...]} +``` + +Python's `_fold_base_clusters()` (`conversation.py` lines 1250–1269) matches this format. diff --git a/delphi/docs/deep-analysis-for-julien/04-repness-analysis.md b/delphi/docs/deep-analysis-for-julien/04-repness-analysis.md new file mode 100644 index 000000000..da10ef297 --- /dev/null +++ b/delphi/docs/deep-analysis-for-julien/04-repness-analysis.md @@ -0,0 +1,377 @@ +# Representativeness (Repness) Analysis + +## 1. Overview + +The representativeness module determines which comments best characterize each opinion group. For each (group, comment) pair, it computes: + +1. **Proportion test**: Is this group's agreement on this comment significantly different from 50%? +2. **Representativeness ratio**: Does this group agree/disagree more than other groups? +3. **Two-proportion test**: Is the difference between group and others statistically significant? +4. **Composite metric**: Final ranking score combining the above. + +--- + +## 2. 
Vote Counting + +### 2.1 Clojure (`repness.clj:comment-stats-graphimpl`) + +```clojure +;; CRITICAL: Vote signs are INVERTED in Clojure repness +;; na counts votes == -1 (which is AGREE in Postgres convention) +;; nd counts votes == 1 (which is DISAGREE in Postgres convention) +;; Comment in code: "Change when we flip votes" +na = count of votes == -1 in group +nd = count of votes == 1 in group +ns = na + nd ;; total directional votes (excludes pass/nil) +``` + +### 2.2 Python (`repness.py:comment_stats`, line 133) + +```python +n_agree = np.sum(group_votes == AGREE) # AGREE = 1 +n_disagree = np.sum(group_votes == DISAGREE) # DISAGREE = -1 +n_votes = n_agree + n_disagree +``` + +### 2.3 DISCREPANCY: Vote Sign Convention + +This is a **critical semantic difference**. In Clojure, `na` counts votes equal to `-1`, which in the Postgres convention IS the agree vote. In Python, `na` counts votes equal to `+1` (AGREE constant). + +Since Delphi flips signs at the Postgres boundary (`postgres_vote_to_delphi()`), AGREE becomes `+1` internally. So both are counting "agrees" — just detecting them with different numeric values. **This is consistent** as long as the sign flip is correctly applied at the boundary. + +**Status: CONSISTENT** (but confusing and fragile) + +--- + +## 3. Pseudocount Smoothing (Bayesian Prior) + +### 3.1 Clojure (`repness.clj`) + +```clojure +pa = (1 + na) / (2 + ns) +pd = (1 + nd) / (2 + ns) +``` + +This is equivalent to adding 1 virtual agree and 1 virtual disagree to the counts (a Beta(2,2) prior, or equivalently Laplace smoothing with pseudocount=2). 
+ +### 3.2 Python (`repness.py`, lines 138–139) + +```python +PSEUDO_COUNT = 1.5 +pa = (n_agree + PSEUDO_COUNT/2) / (n_votes + PSEUDO_COUNT) + = (n_agree + 0.75) / (n_votes + 1.5) +pd = (n_disagree + PSEUDO_COUNT/2) / (n_votes + PSEUDO_COUNT) + = (n_disagree + 0.75) / (n_votes + 1.5) +``` + +### 3.3 DISCREPANCY: Pseudocount Values + +| | Clojure | Python | +|---|---------|--------| +| Numerator add | +1 | +0.75 | +| Denominator add | +2 | +1.5 | +| Effective prior | Beta(2,2) | Beta(1.75,1.75) | + +**Impact**: Python's weaker prior pulls probabilities less strongly toward 0.5. For small sample sizes (e.g., n=3 votes, 2 agrees): +- Clojure: pa = (1+2)/(2+3) = 0.60 +- Python: pa = (2+0.75)/(3+1.5) = 0.611 + +The effect diminishes with larger samples but matters for small groups. + +--- + +## 4. Proportion Test (p-test) + +### 4.1 Clojure (`stats.clj:prop-test`) + +```clojure +(defn prop-test [succ n] + (* 2 (Math/sqrt (inc n)) (+ (/ (inc succ) (inc n)) -0.5))) +``` + +Expanding: `z = 2 * sqrt(n+1) * ((succ+1)/(n+1) - 0.5)` + +This is a **custom formula** that: +- Adds 1 to both `succ` and `n` (pseudocount) +- Tests departure from 0.5 +- Uses `2*sqrt(n+1)` as the scaling factor instead of standard error + +### 4.2 Python (`repness.py:prop_test`, lines 64–86) + +```python +def prop_test(p, n, p0): + se = math.sqrt(p0 * (1 - p0) / n) + return (p - p0) / se +``` + +This is the **standard one-proportion z-test**: `z = (p - p0) / sqrt(p0*(1-p0)/n)` + +When `p0 = 0.5`: `z = (p - 0.5) / sqrt(0.25/n) = (p - 0.5) / (0.5/sqrt(n)) = 2*sqrt(n)*(p - 0.5)` + +### 4.3 DISCREPANCY: Proportion Test Formula + +| | Clojure | Python | +|---|---------|--------| +| Formula | `2*sqrt(n+1)*((succ+1)/(n+1) - 0.5)` | `(p - 0.5) / sqrt(0.25/n)` | +| Pseudocount | +1/+1 built into test | None (uses pre-smoothed p) | +| Denominator | `1/(2*sqrt(n+1))` | `0.5/sqrt(n)` | + +For n=10, succ=8: +- Clojure: `2*sqrt(11)*((9/11)-0.5) = 2*3.317*0.318 = 2.11` +- Python (with pa from Clojure's smoothing): 
p=9/12=0.75, `(0.75-0.5)/sqrt(0.25/10) = 0.25/0.158 = 1.58` + +**Impact**: Clojure's test is more liberal (produces larger z-scores), making it easier to reach significance. + +### 4.4 Note: Python has TWO different prop_test functions + +There is also `stats.py:prop_test` which uses yet another formula: +```python +# stats.py version (NOT used by repness) +succ_adjusted = succ + 1 +n_adjusted = n + 2 +p = succ_adjusted / n_adjusted +se = math.sqrt(p * (1 - p) / n_adjusted) +z = (p - 0.5) / se +``` + +This third variant is not used in the repness pipeline. It exists in `stats.py` but is **dead code** relative to repness computation. + +--- + +## 5. Two-Proportion Test (Representativeness Test) + +### 5.1 Clojure (`stats.clj:two-prop-test`) + +```clojure +(defn two-prop-test [succ1 n1 succ2 n2] + (let [succ1 (inc succ1) n1 (inc n1) + succ2 (inc succ2) n2 (inc n2) + p1 (/ succ1 n1) p2 (/ succ2 n2) + p (/ (+ succ1 succ2) (+ n1 n2))] + (/ (- p1 p2) + (Math/sqrt (* p (- 1 p) (+ (/ 1 n1) (/ 1 n2))))))) +``` + +Key: Increments ALL four inputs by 1 before computing. + +### 5.2 Python (`repness.py:two_prop_test`, lines 89–115) + +```python +def two_prop_test(p1, n1, p2, n2): + p = (p1 * n1 + p2 * n2) / (n1 + n2) # pooled proportion + se = math.sqrt(p * (1 - p) * (1/n1 + 1/n2)) + return (p1 - p2) / se +``` + +Key: Takes proportions directly (already smoothed), no additional pseudocount. + +### 5.3 DISCREPANCY: Two-Proportion Test + +Clojure adds pseudocounts (+1 to each of 4 parameters) to prevent division by zero and stabilize estimates. Python takes pre-smoothed proportions and raw counts without additional adjustment. + +**Impact**: Different z-scores for the same data, especially at small sample sizes. Clojure is more conservative (z-scores pulled toward 0 by the +1 adjustments). + +--- + +## 6. 
Representativeness Ratio + +### 6.1 Clojure (`repness.clj:add-comparitive-stats`) + +```clojure +ra = pa_group / ((1 + sum(na_rest)) / (2 + sum(ns_rest))) +rd = pd_group / ((1 + sum(nd_rest)) / (2 + sum(ns_rest))) +``` + +The denominator for "others" uses the same pseudocount formula as group stats. + +### 6.2 Python (`repness.py:add_comparative_stats`, lines 172–173) + +```python +ra = pa_group / other_pa # other_pa already computed with PSEUDO_COUNT smoothing +rd = pd_group / other_pd +``` + +**Status: STRUCTURAL MATCH** — Both compute the ratio of group probability to "other" probability. The numeric values differ due to different pseudocount formulas (see Section 3). + +--- + +## 7. Repness Metric (Composite Score) + +### 7.1 Clojure (`repness.clj:repness-metric`) + +```clojure +(defn repness-metric [{:keys [repness repness-test p-success p-test]}] + (* repness repness-test p-success p-test)) +``` + +This is a **PRODUCT of 4 values**: +- `repness` = representativeness ratio (ra or rd) +- `repness-test` = two-proportion z-score (rat or rdt) +- `p-success` = agreement probability (pa or pd) +- `p-test` = proportion z-score (pat or pdt) + +### 7.2 Python (`repness.py:repness_metric`, lines 189–210) + +```python +def repness_metric(stats, key_prefix): + p = stats[f'p{key_prefix}'] # pa or pd + p_test = stats[f'p{key_prefix}t'] # pat or pdt + r = stats[f'r{key_prefix}'] # ra or rd (unused!) + r_test = stats[f'r{key_prefix}t'] # rat or rdt + p_factor = p if key_prefix == 'a' else (1 - p) + return p_factor * (abs(p_test) + abs(r_test)) +``` + +This is `p_factor * (|p_test| + |r_test|)` — a **weighted SUM of 2 absolute z-scores**. 
+ +### 7.3 DISCREPANCY: Repness Metric Formula + +| | Clojure | Python | +|---|---------|--------| +| Formula | `ra * rat * pa * pat` | `pa * (\|pat\| + \|rat\|)` | +| Combination | Product of 4 | Weighted sum of 2 | +| Uses ratio | Yes (as multiplier) | No (only ratio's z-test) | +| Uses abs | No | Yes | + +**Impact**: SEVERE — These produce fundamentally different rankings. The Clojure product heavily penalizes any factor near zero, while Python's sum is more forgiving. Additionally, Python takes absolute values of z-scores, meaning it doesn't distinguish direction. + +--- + +## 8. Finalize Comment Stats (Agree vs Disagree) + +### 8.1 Clojure (`repness.clj:finalize-cmt-stats`) + +```clojure +;; Simple comparison: which representativeness test is larger? +(if (> rat rdt) + ;; agree is more representative + {:repful :agree, :repness ra, :repness-test rat, ...} + ;; disagree is more representative + {:repful :disagree, :repness rd, :repness-test rdt, ...}) +``` + +Decision criterion: `rat > rdt` — whichever direction has a larger representativeness z-score wins. + +### 8.2 Python (`repness.py:finalize_cmt_stats`, lines 213–243) + +```python +if pa > 0.5 and ra > 1.0: + result['repful'] = 'agree' +elif pd > 0.5 and rd > 1.0: + result['repful'] = 'disagree' +else: + if agree_metric >= disagree_metric: + result['repful'] = 'agree' + else: + result['repful'] = 'disagree' +``` + +Decision criterion: Priority to agree if `pa > 0.5 AND ra > 1.0`, then disagree if `pd > 0.5 AND rd > 1.0`, then fallback to comparing metrics. + +### 8.3 DISCREPANCY: Agree/Disagree Selection + +Clojure uses a simple `rat > rdt` comparison. Python adds threshold checks (`pa > 0.5`, `ra > 1.0`) that can override the metric-based decision. + +**Impact**: Comments near the boundary between agree and disagree may be classified differently. + +--- + +## 9. Significance Thresholds + +### 9.1 Clojure (`stats.clj`) + +```clojure +(defn z-sig-90? [z-val] (> z-val 1.2816)) ;; one-tailed 90% +(defn z-sig-95? 
[z-val] (> z-val 1.6449)) ;; one-tailed 95% +``` + +These are **one-tailed** z-values (P(Z > z) < alpha). + +### 9.2 Python — INTERNAL INCONSISTENCY + +**`repness.py`** (line 19): +```python +Z_90 = 1.645 # two-tailed 90% (or one-tailed 95%) +Z_95 = 1.96 # two-tailed 95% +``` + +**`stats.py`** (used elsewhere, not in repness): +```python +z_sig_90 threshold = 1.2816 # matches Clojure +``` + +### 9.3 DISCREPANCY: Z-score Thresholds + +| | Clojure | Python repness.py | Python stats.py | +|---|---------|------------------|-----------------| +| 90% threshold | 1.2816 (one-tailed) | 1.645 (two-tailed) | 1.2816 (one-tailed) | +| 95% threshold | 1.6449 (one-tailed) | 1.96 (two-tailed) | — | + +**Impact**: Python's `repness.py` requires MUCH higher z-scores to pass significance tests. Many comments that would be "significant" in Clojure will fail significance in Python, leading to fewer representative comments being selected. + +--- + +## 10. Representative Comment Selection + +### 10.1 Clojure (`repness.clj:select-rep-comments`) + +Single-pass algorithm: +``` +For each (group, comment) pair: + If repful == :agree: + If beats-best-by-test? OR (passes-by-test? AND beats-best-agr?): + Add to best-agrees + If repful == :disagree: + Similar logic for best-disagrees +Final: Take top 5 (agrees before disagrees) +``` + +### 10.2 Python (`repness.py:select_rep_comments`, lines 315–392) + +Separate lists: +```python +agree_comments = sorted(best_agree(all_stats), key=agree_metric, reverse=True) +disagree_comments = sorted(best_disagree(all_stats), key=disagree_metric, reverse=True) +selected = top 3 agrees + top 2 disagrees +``` + +### 10.3 DISCREPANCY: Selection Algorithm + +| | Clojure | Python | +|---|---------|--------| +| Agree count | Up to 5 total | 3 agrees + 2 disagrees | +| Algorithm | Single-pass with replacement | Sort-and-take | +| Criteria | beats-best-by-test?, passes-by-test? | pa > pd, significance tests | + +--- + +## 11. 
Consensus Comments + +### 11.1 Clojure (`repness.clj:consensus-stats` + `select-consensus-comments`) + +```clojure +;; consensus-stats: For each comment, compute pa with Bayesian smoothing +;; select-consensus-comments: +;; - format-stat adds test scores +;; - Takes top 5 agrees (pa > 0.5) and top 5 disagrees (pd > 0.5) +;; - Filtered by mod-out +``` + +### 11.2 Python (`repness.py:select_consensus_comments`, lines 413–454) + +```python +# Group by comment +# Check if ALL groups have pa > 0.6 +# Sort by average agreement +# Take top 2 +``` + +### 11.3 DISCREPANCY: Consensus Selection + +| | Clojure | Python | +|---|---------|--------| +| Criterion | Per-comment pa > 0.5 | ALL groups pa > 0.6 | +| Output | Top 5 agree + 5 disagree | Top 2 overall | +| Smoothing | Bayesian (same as repness) | Same | + +**Impact**: Python is much more restrictive (requires all groups to agree at 60%+) and returns fewer consensus comments. diff --git a/delphi/docs/deep-analysis-for-julien/05-participant-filtering.md b/delphi/docs/deep-analysis-for-julien/05-participant-filtering.md new file mode 100644 index 000000000..9b5ed383c --- /dev/null +++ b/delphi/docs/deep-analysis-for-julien/05-participant-filtering.md @@ -0,0 +1,278 @@ +# Participant Filtering, Comment Priorities, and Vote Structures + +## 1. Participant Filtering (in-conv) + +The "in-conv" filter determines which participants have voted enough to be included in clustering. This is critical because participants with too few votes produce unreliable PCA projections. 
+ +### 1.1 Clojure (`conversation.clj` lines 240–266) + +```clojure +:in-conv (plmb/fnk [conv user-vote-counts n-cmts] + (as-> (or (:in-conv conv) #{}) in-conv + ;; Step 1: Include anyone with >= min(7, n-cmts) votes + (into in-conv + (map first + (filter + (fn [[rowname cnt]] + (>= cnt (min 7 n-cmts))) + user-vote-counts))) + ;; Step 2: If fewer than 15 participants, greedily add top voters + (let [greedy-n 15 + n-in-conv (count in-conv)] + (if (< n-in-conv greedy-n) + (->> user-vote-counts + (remove (fn [[k v]] (in-conv k))) + (sort-by (comp - second)) + (map first) + (take (- greedy-n n-in-conv)) + (into in-conv)) + in-conv)))) +``` + +**Algorithm**: +1. Start with previous in-conv set (once in, always in) +2. Add any participant with `vote_count >= min(7, n_comments)` +3. If result has fewer than 15 participants, greedily add the highest-voting participants until reaching 15 + +**Key property**: Monotonic — once a participant is in-conv, they stay in-conv forever (across updates). + +### 1.2 Python (`conversation.py` lines 1225–1248) + +```python +def _get_in_conv_participants(self) -> Set[str]: + n_cmts = len(self.rating_mat.columns) + threshold = 7 + np.sqrt(n_cmts) * 0.1 + vote_counts = self._compute_user_vote_counts() + in_conv = {pid for pid, count in vote_counts.items() if count >= threshold} + return in_conv +``` + +**Algorithm**: +1. Threshold = `7 + sqrt(n_comments) * 0.1` +2. Include any participant with `vote_count >= threshold` +3. No greedy fallback +4. 
No persistence (recomputed from scratch each time) + +### 1.3 DISCREPANCY: In-Conv Filtering + +| Property | Clojure | Python | +|----------|---------|--------| +| Threshold | `min(7, n_cmts)` | `7 + sqrt(n_cmts) * 0.1` | +| Greedy fallback | Yes (top-15 voters) | No | +| Persistence | Monotonic (once in, always in) | Recomputed each time | +| At 100 comments | 7 votes needed | 8 votes needed | +| At 1000 comments | 7 votes needed | 10.2 votes needed | + +**Impact**: HIGH +- Python's threshold is HIGHER and GROWS with comment count, excluding more participants +- Python lacks the greedy fallback, so small conversations may have too few participants for clustering +- Python recalculates from scratch, so participants can drop OUT of in-conv if new comments are added (increasing the threshold) + +### 1.4 Note on to_dict() In-Conv + +Interestingly, `conversation.py:to_dict()` (lines 1637–1644) has a DIFFERENT in-conv calculation for the serialized output: + +```python +min_votes = min(7, self.comment_count) +for pid, count in result['user-vote-counts'].items(): + if count >= min_votes: + in_conv.append(pid) +``` + +This matches Clojure's threshold formula but is only used for serialization output, NOT for the actual clustering computation. The clustering uses `_get_in_conv_participants()` with the different formula. + +--- + +## 2. User Vote Counts + +### 2.1 Clojure (`conversation.clj` lines 217–225) + +```clojure +:user-vote-counts + (->> (mapv + (fn [rowname row] + [rowname (count (remove nil? row))]) + (nm/rownames raw-rating-mat) + (nm/get-matrix raw-rating-mat)) + (into {})) +``` + +Counts non-nil entries per row of the **raw** rating matrix (before moderation). 
+ +### 2.2 Python (`conversation.py:_compute_user_vote_counts`, lines 1179–1223) + +```python +# Uses self.rating_mat (AFTER moderation, not raw) +non_nan_mask = ~np.isnan(self.rating_mat.values) +row_sums = np.sum(non_nan_mask, axis=1) +``` + +### 2.3 DISCREPANCY: Raw vs Moderated Matrix + +Clojure counts votes on the **raw** matrix; Python counts on the **moderated** matrix. If comments are moderated out, Python undercounts participant votes. + +**Impact**: A participant who voted on 8 comments (including 2 moderated-out) would have count=8 in Clojure but count=6 in Python, potentially dropping below the in-conv threshold. + +--- + +## 3. Customs (Participant/Comment Caps) + +### 3.1 Clojure (`conversation.clj` lines 164–186) + +```clojure +:customs (plmb/fnk [conv votes opts'] + (reduce + (fn [{:keys [pids tids] :as result} {:keys [pid tid] :as vote}] + (let [pid-room (< (count pids) (:max-ptpts opts')) ;; 100000 + tid-room (< (count tids) (:max-cmts opts')) ;; 10000 + pid-in (pids pid) + tid-in (tids tid)] + (if (and (or pid-room pid-in) + (or tid-room tid-in)) + ;; Accept vote + ... + ;; Reject vote + result))) + ...)) +``` + +This enforces hard caps: max 100,000 participants, max 10,000 comments. A vote is accepted only if: +- The participant is already known OR there's room for new participants +- AND the comment is already known OR there's room for new comments + +### 3.2 Python + +**No customs/cap mechanism exists.** All votes are accepted regardless of conversation size. + +**Status: MISSING in Python** — Could be a problem for very large conversations, but less critical than other discrepancies. + +--- + +## 4. Comment Priorities + +### 4.1 Clojure (`conversation.clj` lines 308–669) + +Comment priorities determine which comments are shown next to participants. 
The pipeline computes: + +**Step 1: Votes Base** (`votes-base`, lines 583–590) +``` +For each comment tid: + For each base cluster: + A[bid] = count of agree votes from cluster members + D[bid] = count of disagree votes from cluster members + S[bid] = count of total votes from cluster members +``` + +**Step 2: Group Votes** (`group-votes`, lines 278–305) +``` +For each group: + For each comment: + A = sum of A[bid] for base clusters in this group + D = sum of D[bid] ... + S = sum of S[bid] ... +``` + +**Step 3: Importance Metric** (lines 308–312) +``` +importance_metric(A, P, S, E) = (1 - p) * (E + 1) * a +where: + p = (P + 1) / (S + 2) ;; pass probability (Bayesian) + a = (A + 1) / (S + 2) ;; agree probability (Bayesian) + P = S - A - D ;; pass count + E = comment extremity ;; L2 norm of comment's PCA projection +``` + +**Step 4: Priority Metric** (lines 318–327) +``` +priority_metric(is_meta, A, P, S, E) = + if is_meta: + meta_priority^2 = 7^2 = 49 + else: + (importance_metric(A, P, S, E) * (1 + 8 * 2^(S/-5)))^2 +``` + +The `(1 + 8 * 2^(S/-5))` factor is a **novelty boost** that gives high priority to comments with few total votes (S). As S increases: +- S=0: factor = 1 + 8*1 = 9 +- S=5: factor = 1 + 8*0.5 = 5 +- S=10: factor = 1 + 8*0.25 = 3 +- S=20: factor = 1 + 8*0.0625 = 1.5 +- S=∞: factor → 1 + +The squaring amplifies differences between priorities. + +**Step 5: Comment Priorities** (lines 638–669) +``` +For each tid: + Sum A, D, S, P across all groups + Get extremity from PCA + priority = priority_metric(is_meta, total_A, total_P, total_S, extremity) +``` + +### 4.2 Python + +**Comment priorities are NOT computed in the main pipeline.** The `_compute_votes_base()` method exists (lines 1047–1093) but: + +1. It has a **BUG** on line 1076: `self.rating_mat[:, 'tid']` is invalid pandas syntax (should be `self.rating_mat[tid]`) +2. It is marked with `TODO(julien): why is that not called anywhere?` +3. The `recompute()` method does NOT call it +4. 
No `importance_metric` or `priority_metric` functions exist in Python +5. Comment extremity is not computed (no comment projection) + +**Status: MISSING in Python** — Comment priorities are critical for the TypeScript server's comment routing. Without them, all comments get equal weight (default priority = 1). + +--- + +## 5. Group-Aware Consensus + +### 5.1 Clojure (`conversation.clj` lines 614–636) + +```clojure +:group-aware-consensus + ;; For each (group, comment): + ;; prob = (A + 1.0) / (S + 2.0) ;; Laplace-smoothed agree probability + ;; For each comment: + ;; consensus = product of all group probabilities + ;; Result: {tid → consensus_value} +``` + +Mathematical formula for comment `j`: +``` +consensus(j) = Π_{g ∈ groups} [ (A_g(j) + 1) / (S_g(j) + 2) ] +``` + +This is a product of Bayesian-smoothed agreement probabilities across all groups. High consensus means ALL groups agree. + +### 5.2 Python (`conversation.py:_compute_group_aware_consensus`, lines 1291–1349) + +```python +# Same formula: consensus_value *= (agree_count + 1.0) / (total_count + 2.0) +``` + +**Status: MATCH** — Both compute the same product of Laplace-smoothed probabilities. + +Also implemented inline in `to_dict()` (lines 1596–1631) with identical logic. + +--- + +## 6. Votes-Base Structure + +### 6.1 Clojure + +```clojure +;; {tid {:A [a0 a1 a2 ...], :D [...], :S [...]}} +;; where indices correspond to base cluster IDs (sorted) +;; a0 = number of agree votes from base cluster 0 for this comment +``` + +This per-base-cluster granularity enables efficient group-level aggregation. + +### 6.2 Python (`to_dict()`, lines 1497–1521) + +```python +# {tid: {'A': total_agree, 'D': total_disagree, 'S': total_votes}} +# NOT per-base-cluster — just global totals +``` + +### 6.3 DISCREPANCY: Granularity + +Clojure's votes-base has per-base-cluster arrays; Python's has only global totals. This means Python cannot efficiently recompute group-level vote statistics when group assignments change. 
Python works around this by recomputing from the raw vote matrix each time. diff --git a/delphi/docs/deep-analysis-for-julien/06-comment-routing.md b/delphi/docs/deep-analysis-for-julien/06-comment-routing.md new file mode 100644 index 000000000..3dd233bea --- /dev/null +++ b/delphi/docs/deep-analysis-for-julien/06-comment-routing.md @@ -0,0 +1,274 @@ +# Comment Routing Analysis + +## 1. Overview + +Comment routing determines which comment a participant sees next when they're ready to vote. The TypeScript server (`server/src/nextComment.ts`) mediates between two routing strategies: + +1. **Prioritized routing**: Uses math-computed priority scores for weighted random selection +2. **Topical routing**: Uses Delphi-generated topic clusters filtered by participant preferences + +--- + +## 2. Entry Point + +### `getNextComment()` (`nextComment.ts` lines 339–381) + +```typescript +export async function getNextComment(zid, pid, withoutTids, lang) { + const ratio = Config.getValidTopicalRatio(); + // ratio is TOPICAL_COMMENT_RATIO env var, float [0,1] or null + + const shouldUseTopical = + typeof ratio === 'number' && ratio > 0 && + Math.random() < ratio && // probabilistic split + pid !== -1; // not anonymous + + let next = null; + if (shouldUseTopical) { + next = await getNextTopicalComment(zid, pid, withoutTids); + } else { + next = await getNextPrioritizedComment(zid, pid, withoutTids); + } + + // Fallback: if topical yielded nothing, try prioritized + if (!next && shouldUseTopical) { + next = await getNextPrioritizedComment(zid, pid, withoutTids); + } + + if (!next) return next; + await ensureTranslations(zid, next, lang); + return next; +} +``` + +**Key design**: The split is probabilistic. With `TOPICAL_COMMENT_RATIO=0.5`, each request has a 50% chance of using topical routing and 50% prioritized. + +--- + +## 3. 
Prioritized Routing (Legacy Path) + +### `getNextPrioritizedComment()` (`nextComment.ts` lines 51–103) + +```typescript +async function getNextPrioritizedComment(zid, pid, withoutTids) { + const [comments, mathRaw, remainingRows] = await Promise.all([ + getComments({zid, not_voted_by_pid: pid, ...}), // unvoted comments + getPca(zid, 0), // math results from DB + getNumberOfCommentsRemaining(zid, pid), // count + ]); + + const commentPriorities = math.asPOJO?.['comment-priorities'] || {}; + const selectedRow = selectProbabilistically(comments, commentPriorities); + selectedRow.remaining = remainingCount; + selectedRow.total = totalCount; + return selectedRow; +} +``` + +**Data flow**: +1. Fetch all comments the participant hasn't voted on +2. Fetch `comment-priorities` from the math results stored in `math_main` table +3. Use weighted random selection + +### `selectProbabilistically()` (`nextComment.ts` lines 105–140) + +```typescript +function selectProbabilistically(comments, priorities) { + // Build cumulative weight lookup + const lookup = _.reduce(comments, (o, comment) => { + const lookup_val = o.lastCount + (priorities[comment.tid] || 1); + // ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + // default weight = 1 if no priority + o.lookup.push([lookup_val, comment]); + o.lastCount = lookup_val; + return o; + }, { lastCount: 0, lookup: [] }); + + // Weighted random selection + const randomN = Math.random() * lookup.lastCount; + const result = _.find(lookup.lookup, (x) => x[0] > randomN); + return result[1]; +} +``` + +**Mathematical formulation**: + +Given comments `c_1, ..., c_n` with priorities `w_1, ..., w_n` (default `w_i = 1`): + +``` +P(select c_i) = w_i / Σ_j w_j +``` + +This is a single draw of **probability-proportional-to-size (PPS) sampling**. Each comment's selection probability is proportional to its priority weight. 
+ +**Example** with priorities {A: 49, B: 4, C: 1}: +``` +P(A) = 49/54 = 90.7% ← meta comment (priority^2 = 7^2) +P(B) = 4/54 = 7.4% ← normal comment +P(C) = 1/54 = 1.9% ← default (no priority computed) +``` + +--- + +## 4. Topical Routing (Delphi Path) + +### `getNextTopicalComment()` (`nextComment.ts` lines 285–327) + +```typescript +export async function getNextTopicalComment(zid, pid, withoutTids) { + const tids = await getTidsForParticipantTopicAgenda(zid, pid); + if (!tids || tids.length === 0) { + return getNextPrioritizedComment(zid, pid, withoutTids); // fallback + } + + const rows = await getComments({ + zid, + not_voted_by_pid: pid, + tids, // restrict to topic-relevant comments + withoutTids, + random: true, // uniform random within pool + limit: 1, + }); + + if (!rows[0]) { + return getNextPrioritizedComment(zid, pid, withoutTids); // fallback + } + + return { zid, tid: rows[0].tid, txt: rows[0].txt }; +} +``` + +### `getTidsForParticipantTopicAgenda()` (`nextComment.ts` lines 153–276) + +``` +1. Query PostgreSQL: topic_agenda_selections WHERE zid AND pid + → Get archetypal_selections array (participant's chosen topics) + +2. Extract unique topic_keys from selections + +3. For each topic_key: + Query DynamoDB: Delphi_CommentClustersLLMTopicNames + → Get (layer_id, cluster_id) for each topic + +4. For each (layer_id, cluster_id): + Query DynamoDB: Delphi_CommentHierarchicalClusterAssignments + → Get all comment tids in that cluster + +5. De-duplicate and return tids +``` + +**Key difference from prioritized**: Topical routing uses **uniform random** selection within the filtered pool (no weighting). The filtering is by LLM-generated topic clusters, not by math-computed priorities. + +--- + +## 5. 
How Comment Priorities Reach the Server + +### 5.1 Clojure Pipeline + +``` +conversation.clj:comment-priorities + → Serialized into math_main.data JSON blob in PostgreSQL + → Read by server via getPca(zid, 0) + → Extracted as math.asPOJO['comment-priorities'] + → Passed to selectProbabilistically() +``` + +The Clojure pipeline computes fresh priorities on each update and stores them in the `math_main` table. + +### 5.2 Delphi (Python) Pipeline + +Python does NOT compute comment priorities (see 05-participant-filtering.md Section 4). This means: + +- If running Delphi instead of Clojure math, `comment-priorities` will be empty (`{}`) +- `selectProbabilistically()` will default every comment to weight 1 +- Result: **uniform random selection** — all comments equally likely + +**Impact**: The entire priority-based routing degrades to random selection when using Delphi. + +--- + +## 6. Server API Endpoints + +### GET `/api/v3/nextComment` + +Handler: `handle_GET_nextComment()` in `routes/comments.ts` + +Parameters: +- `zid` — conversation ID +- `not_voted_by_pid` — participant ID (exclude already-voted) +- `without` — array of tids to exclude +- `lang` — language for translation + +Response: +```json +{ + "tid": 42, + "txt": "The comment text...", + "created": 1703000000000, + "remaining": 15, + "total": 50, + "currentPid": 7 +} +``` + +### POST `/api/v3/votes` + +Handler: `handle_POST_votes()` in `routes/votes.ts` + +After recording the vote, the handler calls `getNextComment()` and includes the next comment in the response, enabling seamless vote → next comment flow without a separate request. + +--- + +## 7. 
Comparison: Delphi vs Legacy Comment Routing + +| Aspect | Legacy (Clojure + Prioritized) | Delphi (Topical) | +|--------|-------------------------------|------------------| +| Selection pool | All unvoted comments | Only topic-relevant unvoted comments | +| Weighting | Priority scores (importance × novelty) | Uniform random within pool | +| Novelty boost | Yes (`2^(S/-5)` decay) | No | +| Meta comments | 7× priority boost (squared: 49×) | Not distinguished | +| Extremity factor | Yes (PCA comment projection norm) | No | +| Topic filtering | No | Yes (LLM-generated clusters) | +| Participant prefs | No | Yes (topic_agenda_selections) | +| Fallback | Uniform random if no priorities | Prioritized routing | +| Data source | math_main PostgreSQL table | DynamoDB + PostgreSQL | + +### 7.1 Strengths of Legacy Approach +- Comments with high information value (high extremity, low vote count) surface faster +- Meta/featured comments get strong priority boost +- Battle-tested across many conversations + +### 7.2 Strengths of Delphi Approach +- Respects participant interests (topic selection) +- LLM-generated topics can be more semantically meaningful +- Reduces survey fatigue by focusing on relevant topics + +### 7.3 Current Hybrid +The `TOPICAL_COMMENT_RATIO` parameter blends both approaches probabilistically. When set to 0.5, each comment request has a 50% chance of using each strategy. + +--- + +## 8. Client-Side Integration + +### `client-participation/js/stores/polis.js` (lines 157–194) + +```javascript +getNextComment(o) { + params = { + not_voted_by_pid: myPid, + limit: 1, + conversation_id: conversation_id, + lang: Utils.uiLanguage() + }; + if (o.notTid) { + params.without = [o.notTid]; // exclude current comment + } + return polisGet('api/v3/nextComment', params) + .then((c) => { + nextCommentCache = c; + if (c.currentPid) processPidResponse(c.currentPid); + }); +} +``` + +The client calls `getNextComment` after each vote, caches the response, and displays it. 
The client has no knowledge of the routing strategy — it simply requests and displays whatever the server returns. diff --git a/delphi/docs/deep-analysis-for-julien/07-discrepancies.md b/delphi/docs/deep-analysis-for-julien/07-discrepancies.md new file mode 100644 index 000000000..288e6d3ea --- /dev/null +++ b/delphi/docs/deep-analysis-for-julien/07-discrepancies.md @@ -0,0 +1,268 @@ +# ALL Discrepancies: Clojure (CORRECT) vs Python (Delphi) + +This is the critical reference document. Every discrepancy is rated by severity and lists the exact code locations. + +--- + +## SEVERITY LEGEND + +- **CRITICAL**: Fundamentally changes mathematical behavior, produces different results +- **HIGH**: Significant behavioral difference visible to users +- **MEDIUM**: Subtle numerical differences, may affect edge cases +- **LOW**: Cosmetic or minor structural difference + +--- + +## D1. PCA Algorithm [MEDIUM] + +| | Clojure | Python | +|---|---------|--------| +| File | `math/src/polismath/math/pca.clj` | `delphi/polismath/pca_kmeans_rep/pca.py` | +| Method | Power iteration (100 iters) | sklearn SVD | +| Warm-start | Yes (previous components) | No | +| Large conv | Mini-batch with learning rate 0.01 | Full SVD (no optimization) | + +**Why MEDIUM**: Both methods converge to the same eigenspace for well-separated eigenvalues. The main risk is temporal instability in Python (PCA can flip/rotate between updates). + +**Fix**: Could switch to incremental PCA (sklearn `IncrementalPCA`) for warm-starting, or accept SVD as equivalent and address stability separately. + +### D1b. Projection Input [LOW] + +| | Clojure | Python | +|---|---------|--------| +| File | `pca.clj:134–157` | `pca.py:76,96–102` | +| Input | Raw votes (nils skipped) | Imputed matrix (NaN → col mean) | + +Clojure projects against raw sparse votes, skipping unvoted entries. Python projects the fully imputed matrix. Since `center ≈ col_mean`, the difference is small but nonzero. See doc 02, Section 3.4 for details. 
+ +--- + +## D2. In-Conv Participant Threshold [CRITICAL] + +| | Clojure | Python | +|---|---------|--------| +| File | `conversation.clj:240–266` | `conversation.py:1225–1248` | +| Threshold | `min(7, n_cmts)` | `7 + sqrt(n_cmts) * 0.1` | +| Greedy fallback | Top-15 voters if < 15 qualify | None | +| Persistence | Monotonic (once in, always in) | Recomputed from scratch | +| Matrix used | raw_rating_mat | rating_mat (moderated) | + +**Example at 100 comments**: +- Clojure: need 7 votes → most participants qualify +- Python: need 8 votes → slightly fewer qualify + +**Example at 1000 comments**: +- Clojure: need 7 votes → most participants qualify +- Python: need 10.2 votes → significantly fewer qualify + +**Why CRITICAL**: This is the gatekeeper for clustering. Wrong threshold means different participants are clustered, cascading to different groups, different repness, different everything. + +**Fix**: Change Python to `min(7, n_cmts)`, add greedy fallback, add persistence, use `raw_rating_mat`. + +--- + +## D3. K-Smoother Buffer [HIGH] + +| | Clojure | Python | +|---|---------|--------| +| File | `conversation.clj:452–468` | `conversation.py:620–641` | +| Buffer | 4 consecutive updates before switching k | None (immediate switch) | +| State | Persisted across updates | None | + +**Why HIGH**: Without the buffer, the number of groups can flicker between values on successive updates, causing visible instability in the UI. + +**Fix**: Add `group_k_smoother` state to `Conversation` class with `{last_k, last_k_count, smoothed_k}`. + +--- + +## D4. Pseudocount Formula [MEDIUM] + +| | Clojure | Python | +|---|---------|--------| +| File | `repness.clj` | `repness.py:35,138–139` | +| Numerator add | +1 | +0.75 (`PSEUDO_COUNT/2`) | +| Denominator add | +2 | +1.5 (`PSEUDO_COUNT`) | +| Prior | Beta(2,2) | Beta(1.75,1.75) | + +**Why MEDIUM**: Small numerical differences that diminish with sample size but matter for small groups. 
 + +**Fix**: Change `PSEUDO_COUNT = 2.0` in Python (giving +1/+2 to match Clojure). + +--- + +## D5. Proportion Test Formula [CRITICAL] + +| | Clojure | Python | +|---|---------|--------| +| File | `stats.clj:prop-test` | `repness.py:prop_test` lines 64–86 | +| Clojure | `2*sqrt(n+1)*((succ+1)/(n+1) - 0.5)` | — | +| Python | — | `(p - 0.5) / sqrt(0.25/n)` (standard z-test) | + +**Why CRITICAL**: These are fundamentally different formulas. Clojure's built-in pseudocount in the test produces different z-scores, especially for small samples. Since these z-scores are used in significance tests that gate which comments become representative, different formulas → different representative comments. + +**Fix**: Implement Clojure's exact formula in Python. + +--- + +## D6. Two-Proportion Test [MEDIUM] + +| | Clojure | Python | +|---|---------|--------| +| File | `stats.clj:two-prop-test` | `repness.py:two_prop_test` lines 89–115 | +| Pseudocount | +1 to all 4 inputs | None (uses pre-smoothed proportions) | + +**Why MEDIUM**: The +1 adjustment provides regularization that prevents extreme z-scores when sample sizes are very small. Python's version can produce larger z-scores. + +**Fix**: Add +1 adjustment to match Clojure, or apply pseudocount before calling. + +--- + +## D7. Repness Metric Formula [CRITICAL] + +| | Clojure | Python | +|---|---------|--------| +| File | `repness.clj:repness-metric` | `repness.py:repness_metric` lines 189–210 | +| Formula | `ra * rat * pa * pat` (product of 4) | `pa * (\|pat\| + \|rat\|)` (weighted sum of 2) | +| Uses ratio directly | Yes (`ra`) | No (only `rat`) | +| Abs values | No | Yes | + +**Why CRITICAL**: Completely different mathematical formulas for ranking representative comments. The product form zeros out when any factor is near zero; the sum form is more tolerant. They will produce different rankings for the same data. + +**Fix**: Replace Python formula with `ra * rat * pa * pat` to match Clojure. + +--- + +## D8. 
Finalize Comment Stats [MEDIUM] + +| | Clojure | Python | +|---|---------|--------| +| File | `repness.clj:finalize-cmt-stats` | `repness.py:finalize_cmt_stats` lines 213–243 | +| Decision | `rat > rdt` (simple comparison) | `pa > 0.5 AND ra > 1.0` (threshold check first) | + +**Why MEDIUM**: Different logic for choosing agree vs disagree classification. The threshold checks in Python can override what would be the metric-based choice. + +**Fix**: Replace Python logic with simple `rat > rdt` comparison. + +--- + +## D9. Z-Score Significance Thresholds [CRITICAL] + +| | Clojure | Python repness.py | +|---|---------|------------------| +| File | `stats.clj` | `repness.py:19` | +| 90% threshold | 1.2816 (one-tailed) | 1.645 (two-tailed) | +| 95% threshold | 1.6449 (one-tailed) | 1.96 (two-tailed) | + +**Why CRITICAL**: Python requires 28% higher z-scores to pass significance at 90% confidence. Many comments that would be selected as representative in Clojure will fail to pass significance in Python. + +**Additional issue**: Python's own `stats.py` uses 1.2816 (matching Clojure), creating an **internal inconsistency** within the Python codebase. + +**Fix**: Change `Z_90 = 1.2816` and `Z_95 = 1.6449` in `repness.py`. + +--- + +## D10. Representative Comment Selection [HIGH] + +| | Clojure | Python | +|---|---------|--------| +| File | `repness.clj:select-rep-comments` | `repness.py:select_rep_comments` lines 315–392 | +| Algorithm | Single-pass with beats-best | Sort-and-take-top | +| Max agree | 5 total (agrees first) | 3 agrees | +| Max disagree | (from the 5) | 2 disagrees | +| Total | Up to 5 | Up to 5 | + +**Why HIGH**: Different selection counts and algorithm produce different representative comment sets. + +**Fix**: Match Clojure's selection logic (5 total, agrees first, single-pass). + +--- + +## D11. 
Consensus Comment Selection [HIGH] + +| | Clojure | Python | +|---|---------|--------| +| File | `repness.clj:consensus-stats` + `select-consensus-comments` | `repness.py:select_consensus_comments` lines 413–454 | +| Criterion | Per-comment `pa > 0.5` | ALL groups `pa > 0.6` | +| Output | Top 5 agree + 5 disagree with test scores | Top 2 overall | +| Test scores | Yes (format-stat adds z-tests) | No | + +**Why HIGH**: Python is much more restrictive and returns far fewer consensus comments. + +**Fix**: Match Clojure's consensus-stats + select-consensus-comments logic. + +--- + +## D12. Comment Priorities [CRITICAL] + +| | Clojure | Python | +|---|---------|--------| +| File | `conversation.clj:308–669` | `conversation.py:1047–1093` (dead code) | +| Status | Fully computed and stored | NOT COMPUTED (dead code with bug) | +| Components | importance × novelty, squared | N/A | +| Comment extremity | Yes (from PCA) | Not computed | +| Meta priority | 7 (squared = 49) | N/A | + +**Why CRITICAL**: Without comment priorities, the TypeScript server falls back to uniform random comment selection. This eliminates the information-theoretic comment routing that is core to Polis's design. + +**Fix**: Implement `importance_metric`, `priority_metric`, and the full `comment-priorities` computation. Requires first implementing comment projection and extremity in PCA. + +--- + +## D13. Subgroup Clustering [LOW] + +| | Clojure | Python | +|---|---------|--------| +| File | `conversation.clj:480–570` | N/A | +| Status | Full implementation with k-smoother | Not implemented | + +**Why LOW**: Subgroups are a secondary feature. The primary group clustering is more important. + +**Fix**: Implement if needed, but lower priority. + +--- + +## D14. 
Large Conversation Optimization [LOW] + +| | Clojure | Python | +|---|---------|--------| +| Dispatch | `ptpt-cutoff=10000`, `cmt-cutoff=5000` | No dispatch | +| Large PCA | Mini-batch with learning rate | Full SVD | +| Sample size | `interpolate(n, 100..1500, 1500..150000)` | N/A | + +**Why LOW**: Only matters for very large conversations. Can be addressed later. + +**Fix**: Use `IncrementalPCA` from sklearn for large conversations. + +--- + +## D15. Moderation Handling [MEDIUM] + +| | Clojure | Python | +|---|---------|--------| +| Method | `zero-out-columns` (set values to 0, keep structure) | `loc[keep_ptpts, keep_comments]` (remove entirely) | + +**Why MEDIUM**: Clojure keeps moderated comments in the matrix with zeroed values; Python removes them entirely. This affects matrix dimensions and potentially vote count calculations. + +**Fix**: Consider switching to zero-out approach, but current approach may be acceptable since Python works with DataFrames. + +--- + +## Summary Table + +| ID | Discrepancy | Severity | Fix Effort | +|----|-------------|----------|------------| +| D1 | PCA algorithm | MEDIUM | Low (accept or use IncrementalPCA) | +| D2 | In-conv threshold | CRITICAL | Low (change formula) | +| D3 | K-smoother buffer | HIGH | Medium (add state) | +| D4 | Pseudocount formula | MEDIUM | Low (change constant) | +| D5 | Proportion test | CRITICAL | Low (change formula) | +| D6 | Two-proportion test | MEDIUM | Low (add +1 adjustment) | +| D7 | Repness metric | CRITICAL | Low (change formula) | +| D8 | Finalize cmt stats | MEDIUM | Low (simplify logic) | +| D9 | Z-score thresholds | CRITICAL | Low (change constants) | +| D10 | Rep comment selection | HIGH | Medium (change algorithm) | +| D11 | Consensus selection | HIGH | Medium (match Clojure) | +| D12 | Comment priorities | CRITICAL | High (full implementation) | +| D13 | Subgroup clustering | LOW | High (full implementation) | +| D14 | Large conv optimization | LOW | Medium | +| D15 | Moderation 
handling | MEDIUM | Low | diff --git a/delphi/docs/deep-analysis-for-julien/08-dead-code.md b/delphi/docs/deep-analysis-for-julien/08-dead-code.md new file mode 100644 index 000000000..e27583ce2 --- /dev/null +++ b/delphi/docs/deep-analysis-for-julien/08-dead-code.md @@ -0,0 +1,131 @@ +# Dead and Unreachable Code + +## 1. Python (Delphi) Dead Code + +### 1.1 `_compute_votes_base()` — Dead Code with Bug + +**File**: `conversation.py` lines 1047–1093 +**Status**: DEAD — Never called from `recompute()` or any other pipeline method +**Bug**: Line 1076 uses `self.rating_mat[:, 'tid']` which is invalid pandas syntax. Should be `self.rating_mat[tid]`. +**Note**: Marked with `TODO(julien): why is that not called anywhere?` + +The `to_dict()` method (lines 1497–1521) has its own inline votes-base computation that IS used for serialization, making this method redundant. + +### 1.2 Custom `kmeans()` function — Dead Code + +**File**: `clusters.py` lines 383–419 +**Status**: DEAD — The pipeline uses `kmeans_sklearn()` exclusively +**Related dead code**: +- `cluster_step()` (lines 219–250) — only called by custom `kmeans()` +- `assign_points_to_clusters()` (lines 164–188) — only called by `cluster_step()` +- `update_cluster_centers()` (lines 191–203) — only called by `cluster_step()` +- `filter_empty_clusters()` (lines 206–216) — only called by `cluster_step()` +- `same_clustering()` (lines 135–161) — only called by custom `kmeans()` +- `clean_start_clusters()` (lines 318–380) — only called by custom `kmeans()` +- `split_cluster()` (lines 279–315) — only called by `clean_start_clusters()` +- `most_distal()` (lines 253–276) — only called by `split_cluster()` +- `distance_matrix()` (lines 422–441) — only called by custom `silhouette()` +- Custom `silhouette()` (lines 444–503) — pipeline uses `calculate_silhouette_sklearn()` + +### 1.3 `cluster_dataframe()` — Dead Code + +**File**: `clusters.py` lines 729–790 +**Status**: DEAD — The pipeline calls `kmeans_sklearn()` directly from 
`_compute_clusters()` +**Related**: `determine_k()` (lines 689–726) is only called by `cluster_dataframe()`, making it also dead. + +### 1.4 `clusters_from_dict()` — Dead Code + +**File**: `clusters.py` lines 536–564 +**Status**: DEAD — Used by `cluster_dataframe()` which is dead. The pipeline constructs clusters as plain dicts, not `Cluster` objects. + +### 1.5 `clusters_to_dict()` — Dead Code + +**File**: `clusters.py` lines 506–533 +**Status**: DEAD — Same reason as above. + +### 1.6 `Cluster` class — Partially Dead + +**File**: `clusters.py` lines 19–79 +**Status**: PARTIALLY DEAD — The `Cluster` class is used by the dead custom `kmeans()` chain but not by `kmeans_sklearn()`. The pipeline works with plain dicts instead. + +### 1.7 Legacy `comment_stats()`, `add_comparative_stats()`, etc. — Dead Code + +**File**: `repness.py` lines 118–392 +**Status**: DEAD — These are the non-vectorized versions. The pipeline uses the vectorized DataFrame functions (`compute_group_comment_stats_df`, `select_rep_comments_df`, etc.) exclusively via `conv_repness()`. + +Specifically dead: +- `comment_stats()` (lines 118–154) +- `add_comparative_stats()` (lines 157–186) +- `finalize_cmt_stats()` (lines 213–243) +- `passes_by_test()` (lines 246–268) +- `best_agree()` (lines 271–290) +- `best_disagree()` (lines 293–312) +- `select_rep_comments()` (lines 315–392) +- `select_consensus_comments()` (lines 413–454) + +The vectorized replacements (`compute_group_comment_stats_df`, `select_rep_comments_df`, `select_consensus_comments_df`) are used instead. + +### 1.8 `calculate_kl_divergence()` — Dead Code + +**File**: `repness.py` lines 395–410 +**Status**: DEAD — Never called from anywhere. + +### 1.9 `_compute_group_votes()` — Partially Dead + +**File**: `conversation.py` lines 1095–1177 +**Status**: Called by `_compute_group_aware_consensus()` which is called from nowhere in the main pipeline. The `to_dict()` method has its own inline group-votes computation. 
+ +### 1.10 `_compute_group_aware_consensus()` — Dead Code + +**File**: `conversation.py` lines 1291–1349 +**Status**: DEAD — Never called from `recompute()`. The `to_dict()` method has its own inline consensus computation. + +### 1.11 `comment_priorities` attribute — Dead Code + +**File**: `conversation.py` lines 1039–1041, 1452–1453 +**Status**: The attribute is checked in `to_dict()` and `get_full_data()` but never populated by any pipeline method. + +--- + +## 2. Python Code That Is Used But Has Issues + +### 2.1 `_get_in_conv_participants()` — Used but Wrong Formula + +**File**: `conversation.py` lines 1225–1248 +**Status**: ACTIVE but uses wrong threshold formula (see D2 in discrepancies) + +### 2.2 `to_dict()` In-Conv — Used but Different from Pipeline + +**File**: `conversation.py` lines 1637–1644 +**Status**: ACTIVE — Uses `min(7, n_cmts)` threshold (matching Clojure) but only for serialization output, not for the actual clustering pipeline. + +### 2.3 `repness.py` Z_90 = 1.645 — Used but Wrong Value + +**File**: `repness.py` line 19 +**Status**: ACTIVE — Should be 1.2816 to match Clojure (see D9 in discrepancies) + +--- + +## 3. Clojure Dead Code + +### 3.1 `choose-group-k` — Commented Out + +**File**: `conversation.clj` lines 97–101 +**Status**: DEAD — Commented out, replaced by silhouette-based k-selection. + +### 3.2 `cmnt-proj` binding — Commented Out + +**File**: `conversation.clj` lines 391–393 +**Status**: DEAD — Commented out in favor of computing projection+extremity together in `with-proj-and-extremtiy` (sic — typo in the Clojure source, `conversation.clj:338`). + +--- + +## 4. 
Summary + +| Category | Count | Impact | +|----------|-------|--------| +| Dead pipeline methods (Python) | 3 (`_compute_votes_base`, `_compute_group_votes`, `_compute_group_aware_consensus`) | None (unused) | +| Dead utility functions (Python) | ~15 (custom kmeans chain, non-vectorized repness) | None (unused) | +| Dead but referenced attributes (Python) | 1 (`comment_priorities`) | Confusing — suggests feature is implemented when it's not | +| Active code with bugs (Python) | 3 (in-conv threshold, Z_90 value, to_dict inconsistency) | HIGH | +| Dead Clojure code | 2 (commented out, harmless) | None | diff --git a/delphi/docs/deep-analysis-for-julien/09-fix-plan.md b/delphi/docs/deep-analysis-for-julien/09-fix-plan.md new file mode 100644 index 000000000..6dd0f9ec6 --- /dev/null +++ b/delphi/docs/deep-analysis-for-julien/09-fix-plan.md @@ -0,0 +1,395 @@ +# Fix Plan: Bringing Python to Clojure Parity + +> **Note**: This was a preliminary analysis. The canonical fix ordering is in +> [`PLAN_DISCREPANCY_FIXES.md`](../PLAN_DISCREPANCY_FIXES.md), which reordered +> some fixes (e.g., D4 before D9) based on pipeline execution order. + +## Prioritized Fix Order + +Fixes are ordered by: (1) cascading impact, (2) severity, (3) ease of implementation. Items that affect downstream computations come first. + +--- + +## Phase 1: Foundation Fixes (Highest Priority) + +### Fix 1: In-Conv Participant Threshold [D2] + +**Files to modify**: `delphi/polismath/conversation/conversation.py` + +**Current** (lines 1237–1238): +```python +threshold = 7 + np.sqrt(n_cmts) * 0.1 +``` + +**Target**: +```python +threshold = min(7, n_cmts) +``` + +**Additional changes needed**: +1. Use `self.raw_rating_mat` instead of `self.rating_mat` for vote counting +2. Add monotonic persistence: store `self._in_conv` set, union with new qualifiers +3. 
Add greedy fallback: if `len(in_conv) < 15`, add top voters until reaching 15 + +**Estimated effort**: 1–2 hours + +### Fix 2: Z-Score Significance Thresholds [D9] + +**Files to modify**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current** (line 19): +```python +Z_90 = 1.645 +Z_95 = 1.96 +``` + +**Target**: +```python +Z_90 = 1.2816 # one-tailed, matching Clojure +Z_95 = 1.6449 # one-tailed, matching Clojure +``` + +**Estimated effort**: 5 minutes + +### Fix 3: Pseudocount Formula [D4] + +**Files to modify**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current** (line 35): +```python +PSEUDO_COUNT = 1.5 +``` + +**Target**: +```python +PSEUDO_COUNT = 2.0 # +1 numerator, +2 denominator, matching Clojure Beta(2,2) prior +``` + +Verify that `pa = (na + PSEUDO_COUNT/2) / (ns + PSEUDO_COUNT)` with `PSEUDO_COUNT=2.0` gives `pa = (na + 1) / (ns + 2)`, which matches Clojure. ✓ + +**Estimated effort**: 5 minutes + +### Fix 4: Proportion Test Formula [D5] + +**Files to modify**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current** (`prop_test`, lines 64–86): +```python +def prop_test(p, n, p0): + se = math.sqrt(p0 * (1 - p0) / n) + return (p - p0) / se +``` + +**Target** (match Clojure's `stats.clj:prop-test`): +```python +def prop_test(succ, n): + """Clojure-compatible proportion test. + z = 2 * sqrt(n+1) * ((succ+1)/(n+1) - 0.5) + """ + if n == 0: + return 0.0 + return 2 * math.sqrt(n + 1) * ((succ + 1) / (n + 1) - 0.5) +``` + +**Note**: This changes the function signature from `(p, n, p0)` to `(succ, n)`. All callers need to be updated. The vectorized version (`prop_test_vectorized`) also needs updating. 
+ +**Estimated effort**: 1 hour (including caller updates and vectorized version) + +--- + +## Phase 2: Repness Metric Fixes + +### Fix 5: Repness Metric Formula [D7] + +**Files to modify**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current** (`repness_metric`, lines 189–210): +```python +p_factor = p if key_prefix == 'a' else (1 - p) +return p_factor * (abs(p_test) + abs(r_test)) +``` + +**Target** (match Clojure's product of 4): +```python +def repness_metric(stats, key_prefix): + p = stats[f'p{key_prefix}'] # pa or pd + p_test = stats[f'p{key_prefix}t'] # pat or pdt + r = stats[f'r{key_prefix}'] # ra or rd + r_test = stats[f'r{key_prefix}t'] # rat or rdt + return r * r_test * p * p_test +``` + +**Also update vectorized version** in `compute_group_comment_stats_df`: +```python +# Current (lines 649–650): +stats_df['agree_metric'] = stats_df['pa'] * (stats_df['pat'].abs() + stats_df['rat'].abs()) +stats_df['disagree_metric'] = (1 - stats_df['pd']) * (stats_df['pdt'].abs() + stats_df['rdt'].abs()) + +# Target: +stats_df['agree_metric'] = stats_df['ra'] * stats_df['rat'] * stats_df['pa'] * stats_df['pat'] +stats_df['disagree_metric'] = stats_df['rd'] * stats_df['rdt'] * stats_df['pd'] * stats_df['pdt'] +``` + +**Estimated effort**: 30 minutes + +### Fix 6: Finalize Comment Stats Logic [D8] + +**Files to modify**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current** (`finalize_cmt_stats`, lines 230–241): +```python +if result['pa'] > 0.5 and result['ra'] > 1.0: + result['repful'] = 'agree' +elif result['pd'] > 0.5 and result['rd'] > 1.0: + result['repful'] = 'disagree' +else: ... 
+``` + +**Target** (match Clojure's simple comparison): +```python +if result['rat'] > result['rdt']: + result['repful'] = 'agree' +else: + result['repful'] = 'disagree' +``` + +**Also update vectorized version** in `compute_group_comment_stats_df` (lines 656–663): +```python +# Target: +stats_df['repful'] = np.where(stats_df['rat'] > stats_df['rdt'], 'agree', 'disagree') +``` + +**Estimated effort**: 15 minutes + +### Fix 7: Two-Proportion Test Adjustment [D6] + +**Files to modify**: `delphi/polismath/pca_kmeans_rep/repness.py` + +**Current** (`two_prop_test`, lines 89–115): +```python +def two_prop_test(p1, n1, p2, n2): + p = (p1 * n1 + p2 * n2) / (n1 + n2) + ... +``` + +**Target** (add +1 pseudocount like Clojure): +```python +def two_prop_test(succ1, n1, succ2, n2): + """Clojure-compatible two-proportion test with +1 pseudocounts.""" + succ1, n1 = succ1 + 1, n1 + 1 + succ2, n2 = succ2 + 1, n2 + 1 + p1 = succ1 / n1 + p2 = succ2 / n2 + p = (succ1 + succ2) / (n1 + n2) + se = math.sqrt(p * (1 - p) * (1/n1 + 1/n2)) + if se == 0: + return 0.0 + return (p1 - p2) / se +``` + +**Note**: Signature changes from proportions to counts. Callers must be updated. + +**Estimated effort**: 1 hour + +--- + +## Phase 3: Selection & Consensus Fixes + +### Fix 8: Representative Comment Selection [D10] + +**Files to modify**: `delphi/polismath/pca_kmeans_rep/repness.py` + +Match Clojure's `select-rep-comments` logic: +- Single pass with beats-best-by-test? and passes-by-test? 
+- Up to 5 total comments (agrees prioritized over disagrees) + +**Estimated effort**: 2–3 hours (algorithm redesign) + +### Fix 9: Consensus Comment Selection [D11] + +**Files to modify**: `delphi/polismath/pca_kmeans_rep/repness.py` + +Match Clojure's `consensus-stats` + `select-consensus-comments`: +- Per-comment stats with `pa = (A+1)/(S+2)` smoothing +- Top 5 agree (pa > 0.5) + top 5 disagree (pd > 0.5) +- Include z-test scores via `format-stat` + +**Estimated effort**: 2–3 hours + +--- + +## Phase 4: K-Smoother and Comment Priorities + +### Fix 10: K-Smoother Buffer [D3] + +**Files to modify**: `delphi/polismath/conversation/conversation.py` + +Add state tracking to `Conversation`: +```python +# In __init__: +self.group_k_smoother = { + 'last_k': None, + 'last_k_count': 0, + 'smoothed_k': None +} + +# In _compute_clusters(), after finding best_k by silhouette: +GROUP_K_BUFFER = 4 +smoother = self.group_k_smoother +this_k = best_k +same = (smoother['last_k'] is not None and this_k == smoother['last_k']) +this_k_count = smoother['last_k_count'] + 1 if same else 1 + +if this_k_count >= GROUP_K_BUFFER: + smoothed_k = this_k +else: + smoothed_k = smoother['smoothed_k'] if smoother['smoothed_k'] is not None else this_k + +self.group_k_smoother = { + 'last_k': this_k, + 'last_k_count': this_k_count, + 'smoothed_k': smoothed_k +} + +# Use smoothed_k instead of best_k for the actual clustering +``` + +**Estimated effort**: 1–2 hours + +### Fix 11: Comment Priorities [D12] + +**Files to modify**: `delphi/polismath/conversation/conversation.py`, `delphi/polismath/pca_kmeans_rep/pca.py` + +**Step 1**: Add comment projection and extremity to PCA: +```python +# In pca.py, after computing components: +comment_projection = pca.components_ # (n_comps × n_comments) +comment_extremity = np.linalg.norm(comment_projection, axis=0) +# Add to pca_results +pca_results['comment-projection'] = comment_projection +pca_results['comment-extremity'] = comment_extremity +``` + +**Step 2**: 
Implement `importance_metric`: +```python +def importance_metric(A, P, S, E): + p = (P + 1) / (S + 2) + a = (A + 1) / (S + 2) + return (1 - p) * (E + 1) * a +``` + +**Step 3**: Implement `priority_metric`: +```python +META_PRIORITY = 7 + +def priority_metric(is_meta, A, P, S, E): + if is_meta: + return META_PRIORITY ** 2 # = 49 + imp = importance_metric(A, P, S, E) + novelty = 1 + 8 * (2 ** (S / -5)) + return (imp * novelty) ** 2 +``` + +**Step 4**: Compute priorities in `recompute()`: +```python +def _compute_comment_priorities(self): + group_votes = self._compute_group_votes() + extremities = dict(zip( + self.rating_mat.columns, + self.pca.get('comment-extremity', np.zeros(len(self.rating_mat.columns))) + )) + + priorities = {} + for tid in self.rating_mat.columns: + total_A = total_D = total_S = total_P = 0 + for gid, gdata in group_votes.items(): + v = gdata['votes'].get(tid, {'A': 0, 'D': 0, 'S': 0}) + total_A += v['A'] + total_D += v['D'] + total_S += v['S'] + total_P += v['S'] - v['A'] - v['D'] + + E = extremities.get(tid, 0) + is_meta = tid in self.meta_tids if self.meta_tids else False + priorities[tid] = priority_metric(is_meta, total_A, total_P, total_S, E) + + self.comment_priorities = priorities +``` + +**Step 5**: Call from `recompute()` after `_compute_repness()`. 
+ +**Estimated effort**: 4–6 hours + +--- + +## Phase 5: Cleanup + +### Fix 12: Remove Dead Code + +Remove the extensive dead code identified in `08-dead-code.md`: +- Custom kmeans chain in `clusters.py` +- Non-vectorized repness functions in `repness.py` +- Buggy `_compute_votes_base()` in `conversation.py` + +**Estimated effort**: 1–2 hours + +### Fix 13: Resolve Internal Inconsistencies + +- Remove or align `stats.py:prop_test` with the repness version +- Remove `stats.py:z_sig_90` (uses 1.2816) since `repness.py` is authoritative + +**Estimated effort**: 30 minutes + +--- + +## Testing Strategy + +### Unit Tests Per Fix + +Each fix should include tests comparing Python output to known Clojure output: + +1. **In-conv test**: Given a vote matrix with known vote counts, verify the same participants qualify +2. **Proportion test**: Compare z-scores for specific (succ, n) pairs +3. **Repness metric test**: Compute metric for known stats, compare to Clojure value +4. **Full pipeline test**: Run both implementations on the same small dataset and compare: + - PCA projections (direction, not exact values due to SVD vs power iteration) + - Cluster assignments + - Representative comments per group + - Consensus comments + - Comment priorities + +### Integration Test + +Use a real conversation dataset (export from production) and run both pipelines, comparing: +- Group count +- Group membership overlap +- Representative comment overlap +- Priority ranking correlation (Spearman rank) + +--- + +## Timeline Estimate + +| Phase | Fixes | Effort | Dependencies | +|-------|-------|--------|-------------| +| Phase 1 | D2, D9, D4, D5 | 3–4 hours | None | +| Phase 2 | D7, D8, D6 | 2–3 hours | Phase 1 | +| Phase 3 | D10, D11 | 4–6 hours | Phase 2 | +| Phase 4 | D3, D12 | 6–8 hours | Phase 3 | +| Phase 5 | Cleanup | 2–3 hours | All above | +| Testing | Unit + integration | 4–6 hours | All above | +| **Total** | | **21–30 hours** | | + +--- + +## Future: sklearn Streamlining + +Once 
parity is achieved, opportunities to leverage sklearn more effectively: + +1. **PCA**: Already using sklearn. Could use `IncrementalPCA` for warm-starting and large-conversation support. +2. **K-means**: Already using sklearn. The custom `kmeans()` is dead code and can be removed. +3. **Silhouette**: Already using sklearn. +4. **Statistical tests**: Could use `scipy.stats.proportions_ztest` instead of custom implementations, BUT need to ensure the pseudocount behavior matches Clojure. +5. **Clustering pipeline**: Could use sklearn's `Pipeline` or custom pipeline class for cleaner composition. + +**Important**: Any sklearn adoption must preserve the exact mathematical behavior of the Clojure implementation. The goal is code simplification, not behavioral change. diff --git a/delphi/polismath/conversation/conversation.py b/delphi/polismath/conversation/conversation.py index 04662179a..0c88ce4bd 100644 --- a/delphi/polismath/conversation/conversation.py +++ b/delphi/polismath/conversation/conversation.py @@ -589,11 +589,10 @@ def _compute_clusters(self) -> None: 'members': member_pids }) - # Sort base clusters by size (descending) for consistency - base_clusters.sort(key=lambda c: len(c['members']), reverse=True) - # Reassign IDs based on sorted order - for i, cluster in enumerate(base_clusters): - cluster['id'] = i + # Keep base clusters in k-means ID order (matching Clojure's sort-by :id) + # Do NOT sort by size or reassign IDs — that would change the encounter + # order of centers used in group clustering's first-k-distinct initialization. + base_clusters.sort(key=lambda c: c['id']) logger.info(f"Created {len(base_clusters)} base clusters") @@ -1211,25 +1210,31 @@ def _compute_user_vote_counts(self) -> Dict[str, int]: """ Compute the number of votes per participant. + Uses raw_rating_mat (not rating_mat) so that votes on moderated-out + comments are still counted. This matches Clojure's user-vote-counts + (conversation.clj:217-225) which reads from raw-rating-mat. 
+ Fix D2c: see PLAN_DISCREPANCY_FIXES.md. + Returns: Dictionary mapping participant IDs to vote counts """ import time start_time = time.time() - logger.info(f"Starting _compute_user_vote_counts for {self.rating_mat.shape[0]} participants") + mat = self.raw_rating_mat + logger.info(f"Starting _compute_user_vote_counts for {mat.shape[0]} participants") vote_counts = {} # Use more efficient approach for large datasets - if self.rating_mat.shape[0] > 1000: + if mat.shape[0] > 1000: # Create a mask of non-nan values across the entire matrix - non_nan_mask = ~np.isnan(self.rating_mat.values) + non_nan_mask = ~np.isnan(mat.values) # Sum across rows using vectorized operation row_sums = np.sum(non_nan_mask, axis=1) # Convert to dictionary - for i, pid in enumerate(self.rating_mat.index): + for i, pid in enumerate(mat.index): if i < len(row_sums): vote_counts[pid] = int(row_sums[i]) else: @@ -1239,9 +1244,9 @@ def _compute_user_vote_counts(self) -> Dict[str, int]: logger.info(f"Computed vote counts for {len(vote_counts)} participants using vectorized approach in {time.time() - start_time:.4f}s") else: # Original approach for smaller datasets - for i, pid in enumerate(self.rating_mat.index): + for i, pid in enumerate(mat.index): # Get row of votes for this participant - row = self.rating_mat.values[i, :] + row = mat.values[i, :] # Count non-nan values count = np.sum(~np.isnan(row)) @@ -1259,14 +1264,24 @@ def _get_in_conv_participants(self) -> Set[str]: Matches Clojure's in-conv logic from conversation.clj lines 239-266. - Threshold: participant must have voted on at least: - 7 + sqrt(n_comments) * 0.1 comments + Threshold: participant must have voted on at least min(7, n_comments) + comments (Clojure parity fix D2). + + Both vote counts and n_cmts use raw_rating_mat (fix D2c), which includes + votes on moderated-out comments. This matches Clojure, where + zero-out-columns keeps moderated-out columns in the matrix (zeroed but + present). 
Since raw_rating_mat contains all historical votes and votes + are immutable in PostgreSQL, monotonicity is guaranteed without explicit + persistence — a participant who once qualified can never lose votes. + If the code is ever refactored to use delta vote processing, in-conv + MUST be persisted to DynamoDB. See compdemocracy/polis#2358 and + Clojure's approach in conv_man.clj:55, conversation.clj:244. Returns: Set of participant IDs that meet the threshold """ - n_cmts = len(self.rating_mat.columns) if hasattr(self.rating_mat, 'columns') else 0 - threshold = 7 + np.sqrt(n_cmts) * 0.1 + n_cmts = len(self.raw_rating_mat.columns) if hasattr(self.raw_rating_mat, 'columns') else 0 + threshold = min(7, n_cmts) # Get vote counts for all participants vote_counts = self._compute_user_vote_counts() diff --git a/delphi/polismath/pca_kmeans_rep/repness.py b/delphi/polismath/pca_kmeans_rep/repness.py index abbd83515..df4714b15 100644 --- a/delphi/polismath/pca_kmeans_rep/repness.py +++ b/delphi/polismath/pca_kmeans_rep/repness.py @@ -19,20 +19,18 @@ Z_90 = 1.645 # Z-score for 90% confidence Z_95 = 1.96 # Z-score for 95% confidence -# Pseudocount for Bayesian smoothing (Laplace smoothing / additive smoothing) +# Pseudocount for Bayesian smoothing (Beta prior) # # Why use pseudocounts? # - Prevents extreme probabilities (0 or 1) when sample sizes are small -# - With PSEUDO_COUNT = 1.5, we effectively add 0.75 "virtual" agrees and -# 0.75 "virtual" disagrees to each comment's vote count -# - This pulls probabilities toward 0.5 (the prior), with the effect diminishing -# as sample size grows +# - With PSEUDO_COUNT = 2.0, we add 1 "virtual" agree and 1 "virtual" disagree +# to each comment's vote count — equivalent to a Beta(2,2) prior +# - This pulls probabilities toward 0.5, with the effect diminishing as n grows # - Formula: p_agree = (n_agree + PSEUDO_COUNT/2) / (n_votes + PSEUDO_COUNT) +# i.e. 
(n_agree + 1) / (n_votes + 2) # -# Example: With 3 agrees out of 4 votes: -# - Raw probability: 3/4 = 0.75 -# - Smoothed (PSEUDO_COUNT=1.5): (3 + 0.75) / (4 + 1.5) = 3.75/5.5 ≈ 0.68 -PSEUDO_COUNT = 1.5 +# Matches Clojure's implementation (repness.clj). +PSEUDO_COUNT = 2.0 def z_score_sig_90(z: float) -> bool: diff --git a/delphi/polismath/regression/__init__.py b/delphi/polismath/regression/__init__.py index 981673587..e6c57293c 100644 --- a/delphi/polismath/regression/__init__.py +++ b/delphi/polismath/regression/__init__.py @@ -15,6 +15,7 @@ get_dataset_info, get_dataset_files, get_dataset_report_id, + get_blob_variants, ) __all__ = [ 'ConversationRecorder', @@ -26,4 +27,5 @@ 'get_dataset_info', 'get_dataset_files', 'get_dataset_report_id', + 'get_blob_variants', ] diff --git a/delphi/polismath/regression/datasets.py b/delphi/polismath/regression/datasets.py index 73d4cd4d8..0f8e4551b 100644 --- a/delphi/polismath/regression/datasets.py +++ b/delphi/polismath/regression/datasets.py @@ -156,12 +156,15 @@ def get_dataset_report_id(name: str) -> str: return get_dataset_info(name).report_id -def get_dataset_files(name: str, prefer_cold_start: bool = True) -> Dict[str, str]: +def get_dataset_files(name: str, prefer_cold_start: bool = True, blob_type: Optional[str] = None) -> Dict[str, str]: """Get file paths for a dataset. Args: name: Dataset name - prefer_cold_start: If True (default), use cold-start blob when available + prefer_cold_start: If True (default), use cold-start blob when available. + Ignored if blob_type is specified. + blob_type: Explicit blob type to use: 'incremental' or 'cold_start'. + If specified, overrides prefer_cold_start. 
""" info = get_dataset_info(name) rid = info.report_id @@ -174,11 +177,19 @@ def find_file(pattern: str) -> str: raise ValueError(f"Multiple files matching {pattern} in {info.path}: {matches}") return str(matches[0].resolve()) - # Check for cold-start blob first, fall back to original + # Determine which blob to use cold_start_blob = info.path / f"{rid}_math_blob_cold_start.json" original_blob = info.path / f"{rid}_math_blob.json" - if prefer_cold_start and cold_start_blob.exists(): + if blob_type == 'cold_start': + if not cold_start_blob.exists(): + raise FileNotFoundError(f"No cold-start blob for {name}") + math_blob_path = str(cold_start_blob) + elif blob_type == 'incremental': + if not original_blob.exists(): + raise FileNotFoundError(f"No incremental blob for {name}") + math_blob_path = str(original_blob) + elif prefer_cold_start and cold_start_blob.exists(): math_blob_path = str(cold_start_blob) else: math_blob_path = str(original_blob) @@ -193,6 +204,44 @@ def find_file(pattern: str) -> str: } +def _is_blob_filled(blob_path: Path) -> bool: + """Check if a math blob has meaningful content (PCA, non-empty clusters, etc.).""" + import json + if not blob_path.exists(): + return False + try: + with open(blob_path) as f: + data = json.load(f) + # A blob is "filled" if it has PCA data or non-empty base-clusters + has_pca = 'pca' in data and 'comps' in data.get('pca', {}) + bc = data.get('base-clusters', {}) + has_clusters = isinstance(bc, dict) and len(bc.get('id', [])) > 0 + return has_pca or has_clusters + except (json.JSONDecodeError, OSError): + return False + + +def get_blob_variants(name: str) -> List[str]: + """Get available filled blob variants for a dataset. + + Returns a list of blob_type strings ('incremental', 'cold_start') for blobs + that exist and contain meaningful data. 
+ """ + info = get_dataset_info(name) + rid = info.report_id + variants = [] + + full_blob = info.path / f"{rid}_math_blob.json" + if _is_blob_filled(full_blob): + variants.append('incremental') + + cold_start_blob = info.path / f"{rid}_math_blob_cold_start.json" + if _is_blob_filled(cold_start_blob): + variants.append('cold_start') + + return variants + + # Legacy aliases def get_dataset_directory(report_id: str, dataset_name: Optional[str] = None) -> Path: """Find dataset directory by report_id.""" diff --git a/delphi/pyproject.toml b/delphi/pyproject.toml index abf2c4824..4bfe0ac2d 100644 --- a/delphi/pyproject.toml +++ b/delphi/pyproject.toml @@ -124,8 +124,12 @@ include = [ # Pytest configuration [tool.pytest.ini_options] -# When using pytest-xdist (-n), group tests by xdist_group marker for efficient fixture sharing -addopts = "--dist=loadgroup" +# Force sequential execution (-n0) to leverage the session-scoped conversation cache. +# The cache shares computed conversations across test files, but each xdist worker +# has its own process with a separate cache. With only 2 datasets (biodiversity, vw), +# sequential execution with caching is ~6x faster than parallel execution where each +# worker recomputes the conversations independently. 
+addopts = "-n0" filterwarnings = [ # Ignore python_multipart deprecation warning from ddtrace (third-party) "ignore:Please use `import python_multipart`:PendingDeprecationWarning:ddtrace.internal.module", diff --git a/delphi/scripts/clojure_comparer.py b/delphi/scripts/clojure_comparer.py index 3ec6a5691..d724daa8f 100755 --- a/delphi/scripts/clojure_comparer.py +++ b/delphi/scripts/clojure_comparer.py @@ -86,12 +86,7 @@ def main(datasets: tuple, include_local: bool, log_level: str, from polismath.regression.clojure_comparer import ClojureComparer, load_clojure_math_blob from polismath.regression.datasets import list_available_datasets, get_dataset_files from polismath.conversation import Conversation - try: - from tests.common_utils import load_votes - except ImportError: - click.echo("Error: Could not import tests.common_utils.load_votes.", err=True) - click.echo("Run this script from the delphi/ directory with: uv run python scripts/clojure_comparer.py", err=True) - raise SystemExit(1) + from polismath.benchmarks.benchmark_utils import load_votes_from_csv # Get available datasets with Clojure math_blob all_datasets = list_available_datasets(include_local=include_local) @@ -138,7 +133,7 @@ def main(datasets: tuple, include_local: bool, log_level: str, # Load and process Python output dataset_files = get_dataset_files(dataset) - votes_data = load_votes(dataset_files['votes']) + votes_data = load_votes_from_csv(dataset_files['votes']) click.echo(f"✓ Loaded votes data: {len(votes_data['votes'])} votes") @@ -226,9 +221,7 @@ def main(datasets: tuple, include_local: bool, log_level: str, click.echo(f"✓ {dataset}: PASS") else: click.echo(f"✗ {dataset}: FAIL") - click.echo(f"\nLikely cause: Initialization algorithm difference") - click.echo(f" Python: K-means++ (seed 42)") - click.echo(f" Clojure: First k distinct points") + click.echo(f" See docs/PLAN_DISCREPANCY_FIXES.md for known discrepancies") click.echo(f"{'='*60}") except Exception as e: @@ -260,8 +253,8 @@ def 
main(datasets: tuple, include_local: bool, log_level: str, click.echo(f"\n✗ Failed:") for name in failed_datasets: click.echo(f" {name}") - click.echo("\nNote: Failures may be due to initialization differences (K-means++ vs first-k).") - click.echo("This is expected until clustering initialization is aligned.") + click.echo("\nNote: Failures may be due to remaining Python-Clojure discrepancies.") + click.echo("See docs/PLAN_DISCREPANCY_FIXES.md for the fix plan.") return 1 else: click.echo("\n✓ All datasets passed!") diff --git a/delphi/scripts/generate_cold_start_clojure.py b/delphi/scripts/generate_cold_start_clojure.py index 51109347d..91684cd7b 100755 --- a/delphi/scripts/generate_cold_start_clojure.py +++ b/delphi/scripts/generate_cold_start_clojure.py @@ -14,8 +14,8 @@ This approach works with the Clojure poller's design rather than against it. Usage: - python scripts/generate_cold_start_clojure.py biodiversity --stop-math - python scripts/generate_cold_start_clojure.py --all --stop-math + python scripts/generate_cold_start_clojure.py biodiversity + python scripts/generate_cold_start_clojure.py --all --include-local python scripts/generate_cold_start_clojure.py biodiversity --no-cleanup """ @@ -23,6 +23,7 @@ import os import subprocess import sys +import tempfile import threading import time from pathlib import Path @@ -45,89 +46,115 @@ MATH_ENV = os.environ.get('MATH_ENV', 'prod') -def get_running_math_containers() -> list[str]: - """Get list of running math worker container names.""" +# Marker file used to track whether we (coldstart runs) paused the math worker. +# If the user paused it manually, no marker exists and we won't unpause it. 
+PAUSE_MARKER = Path(tempfile.gettempdir()) / 'polis-math-coldstart-paused' + + +def get_running_math_workers() -> list[str]: + """Get running math worker containers (excluding our coldstart containers).""" result = subprocess.run( ['docker', 'ps', '--filter', 'name=math', '--format', '{{.Names}}'], - capture_output=True, text=True + capture_output=True, text=True, ) - return [line for line in result.stdout.splitlines() if 'math' in line.lower()] + return [ + name for name in result.stdout.splitlines() + if name and 'coldstart' not in name + ] -def check_math_worker_running() -> bool: - """Check if math worker container is running.""" - return len(get_running_math_containers()) > 0 +def get_running_coldstart_containers(exclude: str = '') -> list[str]: + """Get running coldstart containers, optionally excluding our own.""" + result = subprocess.run( + ['docker', 'ps', '--filter', 'name=polis-math-coldstart', '--format', '{{.Names}}'], + capture_output=True, text=True, + ) + return [name for name in result.stdout.splitlines() if name and name != exclude] -def pause_math_workers() -> list[str]: - """Pause all running math worker containers. +def ensure_math_workers_paused() -> None: + """Pause math workers if running, using a marker file for coordination. - Returns list of container names that were paused. + Multiple concurrent coldstart runs coordinate via the marker file: + - First run to find workers running pauses them and creates the marker. + - Subsequent runs see workers already paused; they check the marker to + confirm it was us (not a manual user pause) and proceed. 
""" - containers = get_running_math_containers() - if not containers: - return [] + workers = get_running_math_workers() + if not workers: + return # Nothing running (either already paused or not started) - paused = [] - for container in containers: + click.echo(f"Pausing {len(workers)} running math worker(s) to prevent conflicts...") + for container in workers: click.echo(f" Pausing {container}...") - result = subprocess.run(['docker', 'pause', container], capture_output=True) - if result.returncode == 0: - paused.append(container) - else: - click.echo(f" Warning: Failed to pause {container}", err=True) - - return paused + subprocess.run(['docker', 'pause', container], capture_output=True) + # Create marker so the last coldstart run knows to unpause + PAUSE_MARKER.touch() + click.echo(f" ✓ Paused {len(workers)} container(s)") -def unpause_math_workers(containers: list[str]) -> int: - """Unpause previously paused math worker containers. - Args: - containers: List of container names to unpause +def maybe_unpause_math_workers(own_container: str) -> None: + """Unpause math workers if we're the last coldstart run and we caused the pause. - Returns the number of containers unpaused. + Checks two conditions before unpausing: + 1. No other coldstart containers are still running (we're the last one). + 2. The pause marker file exists (we caused the pause, not the user). 
""" - if not containers: - return 0 - - unpaused = 0 - for container in containers: - click.echo(f" Resuming {container}...") - result = subprocess.run(['docker', 'unpause', container], capture_output=True) - if result.returncode == 0: - unpaused += 1 - else: - click.echo(f" Warning: Failed to unpause {container}", err=True) + if not PAUSE_MARKER.exists(): + return # Pause wasn't caused by us - return unpaused + siblings = get_running_coldstart_containers(exclude=own_container) + if siblings: + click.echo(f"\n Skipping math worker unpause ({len(siblings)} other coldstart run(s) still active)") + return + # We're the last one — unpause and clean up marker + result = subprocess.run( + ['docker', 'ps', '--filter', 'name=math', '--filter', 'status=paused', '--format', '{{.Names}}'], + capture_output=True, text=True, + ) + paused = [name for name in result.stdout.splitlines() if name and 'coldstart' not in name] -def stop_math_workers() -> int: - """Stop all running math worker containers. + if paused: + click.echo(f"\nResuming {len(paused)} paused math worker(s)...") + for container in paused: + click.echo(f" Resuming {container}...") + subprocess.run(['docker', 'unpause', container], capture_output=True) + click.echo(f" ✓ Resumed {len(paused)} container(s)") - Returns the number of containers stopped. 
- """ - containers = get_running_math_containers() - if not containers: - return 0 + PAUSE_MARKER.unlink(missing_ok=True) - for container in containers: - click.echo(f" Stopping {container}...") - subprocess.run(['docker', 'stop', container], capture_output=True) - return len(containers) +def stop_poller_container(process: subprocess.Popen, container_name: str) -> None: + """Stop a poller process and force-remove its container.""" + if process.poll() is None: + click.echo("\n Stopping poller...") + process.terminate() + try: + process.wait(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + # Force-remove the container in case --rm didn't clean it up + subprocess.run(['docker', 'rm', '-f', container_name], capture_output=True) def get_db_connection(): - """Create a connection to the Postgres database.""" + """Create a connection to the Postgres database. + + Uses DATABASE_URL from env / dotenv. When running host-side (not in Docker), + DATABASE_URL may contain 'host.docker.internal' which needs translating to + 'localhost' for the psycopg2 connection. + """ database_url = os.environ.get('DATABASE_URL') if not database_url: raise ValueError( "DATABASE_URL environment variable is not set. " "Please set it in the main polis/.env file." ) - return psycopg2.connect(database_url) + # When running host-side, translate Docker-internal hostname to localhost + host_url = database_url.replace('host.docker.internal', 'localhost') + return psycopg2.connect(host_url) def get_zid_from_report_id(conn, report_id: str) -> int | None: @@ -189,6 +216,62 @@ def create_fake_conversation(conn, source_zid: int) -> int: return fake_zid +def copy_participants(conn, source_zid: int, fake_zid: int) -> int: + """Copy participants from source to fake conversation. + + Required because comments have a FK constraint on (zid, pid) -> participants. 
+ """ + cursor = conn.cursor() + cursor.execute(""" + INSERT INTO participants (zid, pid, uid, created) + SELECT %s, pid, uid, created + FROM participants + WHERE zid = %s + """, (fake_zid, source_zid)) + count = cursor.rowcount + conn.commit() + cursor.close() + return count + + +def copy_comments_with_fresh_timestamps(conn, source_zid: int, fake_zid: int) -> int: + """Copy comments from source to fake conversation with fresh modified timestamps. + + The Clojure mod-poller queries comments WHERE modified > last_mod_timestamp, + so fresh timestamps ensure moderation data is picked up. Without comments, + the poller has no mod-in set, causing all tids to be filtered out and PCA to + fail with 'nil has zero dimensionality'. + + The tid_auto trigger auto-assigns tids, so we disable triggers for this + session only (using session_replication_role) to preserve original tids. + This is safe for concurrent use — only affects the current DB session. + """ + cursor = conn.cursor() + now_ms = int(time.time() * 1000) + + # Disable triggers for this session only (safe for concurrent use) + cursor.execute("SET session_replication_role = 'replica'") + + try: + cursor.execute(""" + INSERT INTO comments (zid, tid, pid, txt, created, velocity, mod, active, + modified, uid, anon, is_seed, curation, is_meta) + SELECT %s, tid, pid, txt, created, velocity, mod, active, + %s, uid, anon, is_seed, curation, is_meta + FROM comments + WHERE zid = %s + """, (fake_zid, now_ms, source_zid)) + count = cursor.rowcount + conn.commit() + finally: + # Restore normal trigger behavior for this session + cursor.execute("SET session_replication_role = 'origin'") + conn.commit() + + cursor.close() + return count + + def copy_votes_with_fresh_timestamps(conn, source_zid: int, fake_zid: int) -> int: """ Copy votes from source conversation to fake conversation with fresh timestamps. 
@@ -270,7 +353,12 @@ def cleanup_fake_conversation(conn, fake_zid: int) -> dict: cursor.execute("DELETE FROM votes WHERE zid = %s", (fake_zid,)) cleanup_stats['votes'] = cursor.rowcount - # participants (if any were auto-created) + # comments (before participants due to FK) + cursor.execute("DELETE FROM comments WHERE zid = %s", (fake_zid,)) + if cursor.rowcount > 0: + cleanup_stats['comments'] = cursor.rowcount + + # participants cursor.execute("DELETE FROM participants WHERE zid = %s", (fake_zid,)) if cursor.rowcount > 0: cleanup_stats['participants'] = cursor.rowcount @@ -384,17 +472,19 @@ def stream_output(self, process: subprocess.Popen, prefix: str = " [clj] "): pass # Process terminated -def run_poller_for_zid(fake_zid: int, timeout_seconds: int = 300, verbose: bool = False) -> tuple[subprocess.Popen, PollerMonitor]: +def run_poller_for_zid(fake_zid: int, timeout_seconds: int = 300, verbose: bool = False) -> tuple[subprocess.Popen, PollerMonitor, str]: """ Start the Clojure poller restricted to process only the fake zid. - Returns tuple of (Popen process handle, PollerMonitor for error detection). + Returns tuple of (Popen process handle, PollerMonitor, container name). 
""" # Use MATH_ZID_ALLOWLIST to only process our fake conversation # This speeds things up significantly and avoids touching other conversations log_level = 'debug' if verbose else 'info' + container_name = f'polis-math-coldstart-{fake_zid}' cmd = [ 'docker', 'compose', 'run', '--rm', + '--name', container_name, '-e', 'POLL_FROM_DAYS_AGO=1', # Only need very recent votes (ours) '-e', f'MATH_ZID_ALLOWLIST={fake_zid}', '-e', f'LOGGING_LEVEL={log_level}', @@ -422,7 +512,7 @@ def run_poller_for_zid(fake_zid: int, timeout_seconds: int = 300, verbose: bool ) stream_thread.start() - return process, monitor + return process, monitor, container_name def generate_cold_start_via_fake_conversation( @@ -443,6 +533,7 @@ def generate_cold_start_via_fake_conversation( """ fake_zid = None poller_process = None + container_name = None try: # Step 1: Create fake conversation @@ -460,7 +551,7 @@ def generate_cold_start_via_fake_conversation( # Step 3: Run poller click.echo("\n[3/4] Running Clojure poller...") - poller_process, poller_monitor = run_poller_for_zid(fake_zid, timeout_seconds) + poller_process, poller_monitor, container_name = run_poller_for_zid(fake_zid, timeout_seconds) # Step 4: Wait for math computation click.echo("\n[4/4] Waiting for math computation...") @@ -477,14 +568,8 @@ def generate_cold_start_via_fake_conversation( return math_blob finally: - # Always kill the poller process if running - if poller_process and poller_process.poll() is None: - click.echo("\n Stopping poller...") - poller_process.terminate() - try: - poller_process.wait(timeout=10) - except subprocess.TimeoutExpired: - poller_process.kill() + if poller_process and container_name: + stop_poller_container(poller_process, container_name) def generate_cold_start_for_dataset( @@ -519,8 +604,14 @@ def generate_cold_start_for_dataset( click.echo(f"Error connecting to database: {e}", err=True) return False + # Pause any running math workers to prevent them from processing our + # fake conversation's 
votes. Coordinated with other coldstart runs via + # marker file — only the last run to finish will unpause. + ensure_math_workers_paused() + fake_zid = None poller_process = None + container_name = None try: # Look up zid from report_id @@ -542,21 +633,31 @@ def generate_cold_start_for_dataset( click.echo(f"\n--- Starting cold-start generation ---") # Create temporary conversation (so we can track it for cleanup) - click.echo("\n[1/4] Creating temporary conversation...") + click.echo("\n[1/6] Creating temporary conversation...") fake_zid = create_fake_conversation(conn, source_zid) click.echo(f" ✓ Created temporary conversation with zid {fake_zid}") + # Copy participants (required by comments FK constraint) + click.echo("\n[2/6] Copying participants...") + copied_participants = copy_participants(conn, source_zid, fake_zid) + click.echo(f" ✓ Copied {copied_participants} participants") + + # Copy comments with fresh timestamps (required for moderation data) + click.echo("\n[3/6] Copying comments with fresh timestamps...") + copied_comments = copy_comments_with_fresh_timestamps(conn, source_zid, fake_zid) + click.echo(f" ✓ Copied {copied_comments} comments") + # Copy votes - click.echo("\n[2/4] Copying votes with fresh timestamps...") + click.echo("\n[4/6] Copying votes with fresh timestamps...") copied_votes = copy_votes_with_fresh_timestamps(conn, source_zid, fake_zid) click.echo(f" ✓ Copied {copied_votes} votes") # Run poller - click.echo("\n[3/4] Running Clojure poller...") - poller_process, poller_monitor = run_poller_for_zid(fake_zid, timeout_seconds, verbose=verbose) + click.echo("\n[5/6] Running Clojure poller...") + poller_process, poller_monitor, container_name = run_poller_for_zid(fake_zid, timeout_seconds, verbose=verbose) # Wait for computation - click.echo("\n[4/4] Waiting for math computation...") + click.echo("\n[6/6] Waiting for math computation...") try: math_blob = wait_for_math_computation(conn, fake_zid, MATH_ENV, timeout_seconds, 
monitor=poller_monitor) except PollerError as e: @@ -593,14 +694,9 @@ def generate_cold_start_for_dataset( return True finally: - # Always kill the poller process if running - if poller_process and poller_process.poll() is None: - click.echo("\n Stopping poller...") - poller_process.terminate() - try: - poller_process.wait(timeout=10) - except subprocess.TimeoutExpired: - poller_process.kill() + # Stop and remove our poller container + if poller_process and container_name: + stop_poller_container(poller_process, container_name) # Always clean up temporary conversation data if fake_zid is not None: @@ -613,6 +709,9 @@ def generate_cold_start_for_dataset( conn.close() + # Unpause math workers if we're the last coldstart run + maybe_unpause_math_workers(container_name or '') + @click.command() @click.argument('datasets', nargs=-1) @@ -620,9 +719,8 @@ def generate_cold_start_for_dataset( @click.option('--include-local', is_flag=True, default=False, help='Include datasets from real_data/.local/') @click.option('--no-cleanup', is_flag=True, help='Do not cleanup temporary conversation (for debugging)') @click.option('--timeout', default=300, help='Timeout in seconds for math computation (default: 300)') -@click.option('--pause-math', is_flag=True, help='Automatically pause running math workers (resumes after completion)') @click.option('--verbose', '-v', is_flag=True, help='Show detailed output including Clojure poller logs') -def main(datasets: tuple, process_all: bool, include_local: bool, no_cleanup: bool, timeout: int, pause_math: bool, verbose: bool): +def main(datasets: tuple, process_all: bool, include_local: bool, no_cleanup: bool, timeout: int, verbose: bool): """ Generate cold-start Clojure math blobs for fair Python comparison. @@ -630,6 +728,13 @@ def main(datasets: tuple, process_all: bool, include_local: bool, no_cleanup: bo votes from the source), and runs the Clojure poller to compute a true cold-start math blob. 
+ Each dataset gets its own isolated Clojure poller container (restricted + via MATH_ZID_ALLOWLIST). If a math worker is already running, it is + automatically paused to prevent it from racing on the temporary + conversation's votes. Multiple concurrent runs coordinate via a marker + file — only the last run to finish unpauses the math worker, and only + if it was paused by us (not manually by the user). + Examples: # Generate for single dataset @@ -644,9 +749,6 @@ def main(datasets: tuple, process_all: bool, include_local: bool, no_cleanup: bo # Generate for all datasets including .local/ python scripts/generate_cold_start_clojure.py --all --include-local - # Automatically pause math workers (resumes after completion) - python scripts/generate_cold_start_clojure.py biodiversity --pause-math - # Verbose mode: show Clojure poller output in real-time python scripts/generate_cold_start_clojure.py biodiversity -v @@ -660,21 +762,6 @@ def main(datasets: tuple, process_all: bool, include_local: bool, no_cleanup: bo click.echo(f" Looked in: {POLIS_DIR}/.env", err=True) raise click.Abort() - # Check if math worker is running - paused_containers: list[str] = [] - if check_math_worker_running(): - if pause_math: - click.echo("Pausing running math worker containers...") - paused_containers = pause_math_workers() - click.echo(f" ✓ Paused {len(paused_containers)} container(s)") - else: - click.echo("✗ ERROR: Math worker container is running!", err=True) - click.echo("\nThe math worker must be paused/stopped to prevent conflicts.", err=True) - click.echo("Either use --pause-math to pause it automatically, or stop it manually:", err=True) - click.echo(f" cd {POLIS_DIR}", err=True) - click.echo(" docker compose stop math", err=True) - raise click.Abort() - # Determine which datasets to process available_datasets = discover_datasets(include_local=include_local) @@ -740,11 +827,7 @@ def main(datasets: tuple, process_all: bool, include_local: bool, no_cleanup: bo click.echo("\n✓ All 
datasets processed successfully!") finally: - # Resume any paused math workers - if paused_containers: - click.echo("\nResuming paused math worker containers...") - n_resumed = unpause_math_workers(paused_containers) - click.echo(f" ✓ Resumed {n_resumed} container(s)") + pass # Pause/unpause is handled per-dataset in generate_cold_start_for_dataset if __name__ == '__main__': diff --git a/delphi/tests/conftest.py b/delphi/tests/conftest.py index 8fb6c1db4..297c87fbe 100644 --- a/delphi/tests/conftest.py +++ b/delphi/tests/conftest.py @@ -5,43 +5,110 @@ - Command line options --include-local and --datasets for dataset selection - Fixtures for accessing dataset information - @pytest.mark.use_discovered_datasets for dynamic dataset parametrization -- Helper functions for parallel test execution with xdist_group markers +- Session-scoped conversation cache for efficient test execution """ +from copy import deepcopy + import pytest + +from polismath.conversation.conversation import Conversation +from polismath.regression import get_dataset_files from polismath.regression.datasets import ( discover_datasets, list_regression_datasets, + get_blob_variants, ) +from tests.common_utils import load_votes, load_comments # ============================================================================= -# Parallel Execution Helpers +# Session-scoped Conversation Cache # ============================================================================= -def make_dataset_params(datasets: list[str]) -> list: +_SESSION_CONV_CACHE: dict = {} + + +@pytest.fixture(scope="session") +def get_or_compute_conversation(): + """Session-wide conversation cache shared across all test files. + + Returns a function that computes a Conversation once per dataset and + returns a deepcopy each time to preserve test isolation. + + Only ONE dataset is kept in memory at a time. When a different dataset + is requested, the previous one is evicted. 
This works because tests are + reordered by pytest_collection_modifyitems to group all tests for a + dataset together (across all test files). """ - Create pytest.param objects with xdist_group markers for parallel execution. + import gc + + def _get(dataset_name: str) -> dict: + if dataset_name not in _SESSION_CONV_CACHE: + # Evict previous dataset (we only keep one at a time) + for ds in list(_SESSION_CONV_CACHE.keys()): + _SESSION_CONV_CACHE.pop(ds, None) + Conversation._reset_conversion_cache() + gc.collect() - When using pytest-xdist with --dist=loadgroup, tests with the same - xdist_group marker will run on the same worker. This ensures fixtures - are computed only once per dataset per worker. + files = get_dataset_files(dataset_name, blob_type='incremental') + votes = load_votes(files['votes']) + comments = load_comments(files['comments']) + + conv = Conversation(dataset_name) + conv = conv.update_votes(votes) + conv = conv.recompute() + + _SESSION_CONV_CACHE[dataset_name] = { + 'conv': conv, + 'dataset_name': dataset_name, + 'files': files, + 'comments': comments, + } + + return deepcopy(_SESSION_CONV_CACHE[dataset_name]) + + return _get + + +# ============================================================================= +# Dataset Parametrization Helpers +# ============================================================================= + +def make_dataset_params(datasets: list[str]) -> list: + """ + Create pytest.param objects for dataset parametrization. Args: - datasets: List of dataset names + datasets: List of dataset names (or "dataset-blob_type" composite IDs) Returns: - List of pytest.param objects with xdist_group markers + List of pytest.param objects Example: @pytest.mark.parametrize("dataset_name", make_dataset_params(["biodiversity", "vw"])) def test_something(dataset_name): ... 
""" - return [ - pytest.param(ds, marks=pytest.mark.xdist_group(ds)) - for ds in datasets - ] + return [pytest.param(ds) for ds in datasets] + + +def parse_dataset_blob_id(composite_id: str) -> tuple[str, str]: + """Parse a 'dataset-blob_type' composite ID into (dataset_name, blob_type). + + Examples: + 'biodiversity-incremental' -> ('biodiversity', 'incremental') + 'bg2050-cold_start' -> ('bg2050', 'cold_start') + """ + if composite_id.endswith('-cold_start'): + return composite_id[:-len('-cold_start')], 'cold_start' + elif composite_id.endswith('-incremental'): + return composite_id[:-len('-incremental')], 'incremental' + else: + raise ValueError( + f"Invalid composite dataset ID: {composite_id}. " + f"Expected format: 'dataset-incremental' or 'dataset-cold_start'" + ) def pytest_addoption(parser): @@ -83,8 +150,10 @@ def pytest_configure(config): """Register custom markers.""" config.addinivalue_line( "markers", - "use_discovered_datasets: dynamically parametrize with discovered " - "datasets, respecting --include-local and --datasets CLI options" + "use_discovered_datasets(use_blobs=False): dynamically parametrize with discovered " + "datasets, respecting --include-local and --datasets CLI options. " + "With use_blobs=True, parametrize with 'dataset-blob_type' composite IDs " + "(e.g., 'biodiversity-incremental', 'engage-cold_start') for each filled blob variant." ) @@ -113,19 +182,91 @@ def pytest_generate_tests(metafunc): These tests must declare a 'dataset_name' parameter. They will be parametrized with all regression datasets, filtered by --include-local and --datasets. - Uses xdist_group markers for efficient parallel execution with pytest-xdist. + With use_blobs=True, parametrize with 'dataset-blob_type' composite IDs + (e.g., 'biodiversity-incremental', 'engage-cold_start') for each filled blob variant. + + Uses the session-scoped conversation cache for efficient test execution. 
""" - if not list(metafunc.definition.iter_markers("use_discovered_datasets")): + markers = list(metafunc.definition.iter_markers("use_discovered_datasets")) + if not markers: return include_local = metafunc.config.getoption("--include-local") requested = _get_requested_datasets(metafunc.config) - datasets = list_regression_datasets(include_local=include_local) - if requested: - datasets = [d for d in datasets if d in requested] + # Check if use_blobs=True was passed to the marker + use_blobs = any(m.kwargs.get('use_blobs', False) for m in markers) + + if use_blobs: + # Parametrize with composite 'dataset-blob_type' IDs + datasets = discover_datasets(include_local=include_local) + blob_ids = [] + for name, info in datasets.items(): + if not (info.has_votes and info.has_comments and info.has_clojure_reference): + continue + if requested and name not in requested: + continue + for blob_type in get_blob_variants(name): + blob_ids.append(f"{name}-{blob_type}") + metafunc.parametrize("dataset_name", make_dataset_params(blob_ids)) + else: + # Parametrize with plain dataset names + datasets = list_regression_datasets(include_local=include_local) + if requested: + datasets = [d for d in datasets if d in requested] + metafunc.parametrize("dataset_name", make_dataset_params(datasets)) + + +# ============================================================================= +# Test Reordering for Cache Efficiency +# ============================================================================= + +def _extract_dataset_from_test(item) -> str: + """Extract the dataset name from a test item's parameters. - metafunc.parametrize("dataset_name", make_dataset_params(datasets)) + Handles both plain dataset names ('biodiversity') and composite IDs + ('biodiversity-incremental'). Returns empty string if no dataset parameter. 
+ """ + # Check callspec for parametrized values + if hasattr(item, 'callspec') and item.callspec.params: + for param_name in ('dataset_name', 'dataset_blob_id'): + if param_name in item.callspec.params: + value = item.callspec.params[param_name] + # Extract base dataset name from composite IDs + if value.endswith('-incremental'): + return value[:-len('-incremental')] + elif value.endswith('-cold_start'): + return value[:-len('-cold_start')] + return value + return '' + + +def pytest_collection_modifyitems(session, config, items): + """Reorder tests to group by dataset for cache efficiency. + + Groups all tests for a dataset together (across all test files) so that + the session-scoped conversation cache only needs to hold ONE dataset at + a time. This reduces peak memory from O(N datasets) to O(1 dataset). + + Order: dataset1[file1, file2, ...], dataset2[file1, file2, ...], ... + Within each dataset, original test order is preserved. + """ + # Separate tests into dataset-parametrized and non-parametrized + dataset_tests = [] + other_tests = [] + + for item in items: + ds = _extract_dataset_from_test(item) + if ds: + dataset_tests.append((ds, item)) + else: + other_tests.append(item) + + # Sort dataset tests by dataset name (stable sort preserves order within dataset) + dataset_tests.sort(key=lambda x: x[0]) + + # Rebuild items list: non-parametrized first, then dataset tests grouped + items[:] = other_tests + [item for _, item in dataset_tests] # Provide summary of discovered datasets at start of test run diff --git a/delphi/tests/simplified_repness_test.py b/delphi/tests/simplified_repness_test.py index cb25b2e01..4097f9dcc 100644 --- a/delphi/tests/simplified_repness_test.py +++ b/delphi/tests/simplified_repness_test.py @@ -17,7 +17,7 @@ # Constants Z_90 = 1.645 # Z-score for 90% confidence -PSEUDO_COUNT = 1.5 # Pseudocount for Bayesian smoothing +PSEUDO_COUNT = 2.0 # Pseudocount for Bayesian smoothing (Beta(2,2) prior, matches Clojure) def prop_test(p: float, 
n: int, p0: float) -> float: """One-proportion z-test.""" diff --git a/delphi/tests/test_conversation.py b/delphi/tests/test_conversation.py index 6447d5af9..c6c950243 100644 --- a/delphi/tests/test_conversation.py +++ b/delphi/tests/test_conversation.py @@ -427,7 +427,7 @@ def test_recompute(self): } # Create two distinct opinion groups with enough votes per participant - # to meet the vote threshold (7 + sqrt(n_comments) * 0.1) + # to meet the vote threshold (min(7, n_comments)) for i in range(20): pid = f'p{i}' diff --git a/delphi/tests/test_discrepancy_fixes.py b/delphi/tests/test_discrepancy_fixes.py new file mode 100644 index 000000000..9652d7dca --- /dev/null +++ b/delphi/tests/test_discrepancy_fixes.py @@ -0,0 +1,925 @@ +""" +Per-discrepancy tests for Python-Clojure parity fixes. + +Each test class targets ONE specific discrepancy from the fix plan +(delphi/docs/PLAN_DISCREPANCY_FIXES.md). Tests are designed to FAIL before +the fix is applied and PASS after. They are parametrized by ALL available +datasets with Clojure reference blobs. 
+ +Discrepancies tested: + D2 - In-conv participant threshold + D4 - Pseudocount formula + D5 - Proportion test formula + D6 - Two-proportion test adjustment + D7 - Repness metric formula + D8 - Finalize comment stats logic + D9 - Z-score significance thresholds + D10 - Representative comment selection + D11 - Consensus comment selection + D12 - Comment priorities + D15 - Moderation handling + +Not tested here (deferred or tested elsewhere): + D1/D1b - PCA sign flips (needs replay infrastructure) + D3 - K-smoother buffer (needs replay infrastructure) + D13 - Subgroup clustering (unused, deferred) + D14 - Large conv optimization (deferred) +""" + +import math + +import numpy as np +import pytest +import pytest_check as check + +from polismath.conversation.conversation import Conversation +from polismath.pca_kmeans_rep.repness import ( + PSEUDO_COUNT, + Z_90, + Z_95, + prop_test, + two_prop_test, + repness_metric, + finalize_cmt_stats, +) +from polismath.regression import get_dataset_files, get_blob_variants +from polismath.regression.datasets import discover_datasets +from conftest import _get_requested_datasets, make_dataset_params, parse_dataset_blob_id +from tests.common_utils import load_clojure_output + + +# --------------------------------------------------------------------------- +# Dataset+blob parametrization (same pattern as test_legacy_clojure_regression.py) +# --------------------------------------------------------------------------- + +def _get_clojure_dataset_blob_ids(include_local: bool, requested: set[str] | None = None) -> list[str]: + """Get composite 'dataset-blob_type' IDs for all filled blobs.""" + datasets = discover_datasets(include_local=include_local) + result = [] + for name, info in datasets.items(): + if not (info.has_votes and info.has_comments and info.has_clojure_reference): + continue + if requested and name not in requested: + continue + for blob_type in get_blob_variants(name): + result.append(f"{name}-{blob_type}") + return 
result + + +def pytest_generate_tests(metafunc): + """Parametrize tests with Clojure dataset+blob_type at collection time.""" + if "dataset_name" in metafunc.fixturenames: + include_local = metafunc.config.getoption("--include-local", default=False) + requested = _get_requested_datasets(metafunc.config) + blob_ids = _get_clojure_dataset_blob_ids(include_local, requested) + params = make_dataset_params(blob_ids) + metafunc.parametrize("dataset_name", params, scope="class") + + +# --------------------------------------------------------------------------- +# Shared fixtures +# --------------------------------------------------------------------------- + +# Module-level cache for blobs (keyed by composite ID) +_BLOB_CACHE: dict = {} + + +@pytest.fixture(scope="class") +def conversation_data(dataset_name, get_or_compute_conversation): + """Class-scoped fixture: runs the full pipeline once per dataset+blob_type. + + dataset_name here is actually a composite 'dataset-blob_type' ID + (e.g., 'biodiversity-incremental'). The Conversation is shared across blob variants + via the session-scoped get_or_compute_conversation fixture. 
+ """ + global _BLOB_CACHE + ds_name, blob_type = parse_dataset_blob_id(dataset_name) + + # Get or compute the conversation (shared across blob variants via session cache) + conv_data = get_or_compute_conversation(ds_name) + + # Load the specific blob variant (cache per composite ID) + if dataset_name not in _BLOB_CACHE: + for bid in list(_BLOB_CACHE.keys()): + if not bid.startswith(ds_name + '-'): + _BLOB_CACHE.pop(bid, None) + files = get_dataset_files(ds_name, blob_type=blob_type) + clojure = load_clojure_output(files['math_blob']) + _BLOB_CACHE[dataset_name] = clojure + + return { + 'conv': conv_data['conv'], + 'clojure': _BLOB_CACHE[dataset_name], + 'dataset_name': ds_name, + 'blob_type': blob_type, + 'files': conv_data['files'], + 'comments': conv_data['comments'], + } + + +@pytest.fixture(scope="class") +def clojure_blob(conversation_data): + return conversation_data['clojure'] + + +@pytest.fixture(scope="class") +def conv(conversation_data): + return conversation_data['conv'] + + +# --------------------------------------------------------------------------- +# Helpers for extracting Clojure repness data +# --------------------------------------------------------------------------- + +def _clojure_repness_entries(blob: dict) -> list[dict]: + """Flatten Clojure repness dict {gid: [entries]} into a list with gid added.""" + repness = blob.get('repness', {}) + result = [] + for gid_str, entries in repness.items(): + gid = int(gid_str) + for entry in entries: + result.append({**entry, 'gid': gid}) + return result + + +def _clojure_in_conv_set(blob: dict) -> set[int]: + """Get the set of in-conv participant IDs from Clojure blob.""" + return set(blob.get('in-conv', [])) + + +# ============================================================================ +# D2 — In-Conv Participant Threshold +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD2InConvThreshold: + """ + D2: Clojure uses 
threshold = min(7, n_cmts) for in-conv filtering. + Python now matches (fixed from 7 + sqrt(n_cmts) * 0.1). + """ + + def test_in_conv_count_matches(self, conv, clojure_blob, dataset_name): + """Number of in-conv participants should match Clojure.""" + if 'in-conv' not in clojure_blob: + pytest.skip(f"[{dataset_name}] Clojure blob has no in-conv data") + if dataset_name.endswith('-incremental'): + # Incremental blobs were built progressively as votes trickled in, + # so the threshold min(7, n_cmts) was evaluated at each iteration + # with a smaller n_cmts than the final value. This admits a few + # extra participants to in-conv during earlier iterations. + # The difference is tiny (1-2 participants) when a cold-start blob + # is available. Very large conversations have empty cold-start blobs + # because Clojure can't process them in one pass. + pytest.xfail("D2: behaviour matches on cold-start, incremental deferred to future PR") + clojure_in_conv = _clojure_in_conv_set(clojure_blob) + python_in_conv_count = len(conv._get_in_conv_participants()) + + print(f"[{dataset_name}] In-conv: Python={python_in_conv_count}, Clojure={len(clojure_in_conv)}") + check.equal(python_in_conv_count, len(clojure_in_conv), + f"In-conv count mismatch: Python={python_in_conv_count}, Clojure={len(clojure_in_conv)}") + + def test_in_conv_set_matches(self, conv, clojure_blob, dataset_name): + """The actual set of in-conv participants should match Clojure.""" + if 'in-conv' not in clojure_blob: + pytest.skip(f"[{dataset_name}] Clojure blob has no in-conv data") + if dataset_name.endswith('-incremental'): + # Incremental blobs were built progressively as votes trickled in, + # so the threshold min(7, n_cmts) was evaluated at each iteration + # with a smaller n_cmts than the final value. This admits a few + # extra participants to in-conv during earlier iterations. + # The difference is tiny (1-2 participants) when a cold-start blob + # is available. 
Very large conversations have empty cold-start blobs + # because Clojure can't process them in one pass. + pytest.xfail("D2: behaviour matches on cold-start, incremental deferred to future PR") + clojure_in_conv = _clojure_in_conv_set(clojure_blob) + python_in_conv = conv._get_in_conv_participants() + # Convert python pids to int for comparison + python_in_conv_int = set() + for pid in python_in_conv: + try: + python_in_conv_int.add(int(pid)) + except (ValueError, TypeError): + python_in_conv_int.add(pid) + + only_python = python_in_conv_int - clojure_in_conv + only_clojure = clojure_in_conv - python_in_conv_int + + print(f"[{dataset_name}] Only in Python: {len(only_python)}, Only in Clojure: {len(only_clojure)}") + + check.equal(only_python, set(), f"Participants in Python but not Clojure: {len(only_python)}") + check.equal(only_clojure, set(), f"Participants in Clojure but not Python: {len(only_clojure)}") + + +# ============================================================================ +# D2c — Vote Count Source (raw vs filtered matrix) +# ============================================================================ + +def _build_conv_with_moderation( + n_comments: int = 10, + mod_out_tids: list | None = None, + participant_votes: dict | None = None, +) -> Conversation: + """Build a synthetic Conversation with moderation applied. + + Args: + n_comments: Total number of comments (tids 0..n_comments-1). + mod_out_tids: List of tids to moderate-out. + participant_votes: Dict mapping pid → list of tids they voted on. + If None, a single participant votes on all comments. + + Returns: + A Conversation with votes ingested and moderation applied (no recompute). 
+ """ + if participant_votes is None: + participant_votes = {0: list(range(n_comments))} + + votes_list = [] + for pid, tids in participant_votes.items(): + for tid in tids: + votes_list.append({'pid': pid, 'tid': tid, 'vote': 1}) + + conv = Conversation('test-d2c') + conv = conv.update_votes({'votes': votes_list}, recompute=False) + + if mod_out_tids: + conv = conv.update_moderation( + {'mod_out_tids': mod_out_tids}, recompute=False + ) + + return conv + + +class TestD2cVoteCountSource: + """ + D2c: Vote counts for in-conv threshold must come from raw_rating_mat + (includes votes on moderated-out comments), not rating_mat (filtered). + + Clojure's user-vote-counts (conversation.clj:217-225) uses raw-rating-mat. + Python currently uses self.rating_mat, undercounting when comments are + moderated-out. + """ + + def test_vote_count_includes_moderated_out_votes(self): + """Participant who voted on 10 comments (3 moderated-out) should have count=10.""" + conv = _build_conv_with_moderation( + n_comments=10, + mod_out_tids=[0, 1, 2], + participant_votes={0: list(range(10))}, + ) + + vote_counts = conv._compute_user_vote_counts() + assert vote_counts[0] == 10, ( + f"Vote count should be 10 (from raw_rating_mat), got {vote_counts[0]} " + f"(rating_mat has {len(conv.rating_mat.columns)} columns)" + ) + + def test_n_cmts_includes_moderated_out_comments(self): + """n_cmts in threshold should count all comments including moderated-out. + + With 10 total comments and 5 moderated-out, n_cmts should be 10. + This matters when non-moderated-out count < 7: the threshold would be + artificially low in Python (min(7,5)=5 vs correct min(7,10)=7). 
+        """
+        conv = _build_conv_with_moderation(
+            n_comments=10,
+            mod_out_tids=[0, 1, 2, 3, 4],
+            participant_votes={0: list(range(10))},
+        )
+
+        # raw_rating_mat has all columns; rating_mat has only non-moderated-out
+        n_cmts_raw = len(conv.raw_rating_mat.columns)
+        n_cmts_filtered = len(conv.rating_mat.columns)
+
+        assert n_cmts_raw == 10, f"raw_rating_mat should have 10 columns, got {n_cmts_raw}"
+        assert n_cmts_filtered == 5, f"rating_mat should have 5 columns, got {n_cmts_filtered}"
+
+        # The threshold used by _get_in_conv_participants should be min(7, 10) = 7,
+        # not min(7, 5) = 5. Verify indirectly: participant with exactly 6 votes
+        # should NOT be in-conv (threshold=7), but would be if n_cmts=5 (threshold=5).
+        conv2 = _build_conv_with_moderation(
+            n_comments=10,
+            mod_out_tids=[0, 1, 2, 3, 4],
+            participant_votes={
+                0: list(range(10)),  # 10 raw votes → in-conv
+                1: list(range(5, 11)),  # 6 raw votes (tids 5-10, none moderated-out) → NOT in-conv
+            },
+        )
+        in_conv = conv2._get_in_conv_participants()
+        assert 0 in in_conv, "P0 (10 raw votes) should be in-conv"
+        assert 1 not in in_conv, (
+            "P1 (6 raw votes) should NOT be in-conv with threshold=7, "
+            "but would be if n_cmts wrongly used filtered count (5)"
+        )
+
+    def test_participant_stays_in_conv_after_moderation(self):
+        """Participant with 8 votes stays in-conv even when 3 comments moderated-out.
+
+        This is the critical scenario: participant votes above threshold (8 >= 7),
+        comments get moderated-out between update_votes calls reducing filtered
+        count to 5, but raw count is still 8 so they should remain in-conv.
+ """ + # Participant 0: votes on 8 comments (above threshold of 7) + # Participant 1: votes on 7 comments (borderline) + conv = _build_conv_with_moderation( + n_comments=10, + mod_out_tids=[0, 1, 2], # 3 moderated-out + participant_votes={ + 0: list(range(8)), # votes on c0-c7; after mod, only c3-c7 visible (5) + 1: list(range(7)), # votes on c0-c6; after mod, only c3-c6 visible (4) + }, + ) + + in_conv = conv._get_in_conv_participants() + + # Both should be in-conv: raw counts are 8 and 7, threshold is min(7, 10) = 7 + assert 0 in in_conv, ( + f"Participant 0 (8 raw votes) should be in-conv, " + f"but filtered count is {np.sum(~np.isnan(conv.rating_mat.loc[0].values))}" + ) + assert 1 in in_conv, ( + f"Participant 1 (7 raw votes) should be in-conv, " + f"but filtered count is {np.sum(~np.isnan(conv.rating_mat.loc[1].values))}" + ) + + +# ============================================================================ +# D2d — In-Conv Monotonicity +# ============================================================================ + +class TestD2dInConvMonotonicity: + """ + D2d: Once a participant qualifies for in-conv, they must never be removed. + + These tests pass today because Python does full recompute from raw_rating_mat + (which includes all historical votes, even on moderated-out comments). + If the code is ever refactored to use delta vote processing, in-conv MUST be + persisted to DynamoDB — see compdemocracy/polis#2358 and the Clojure approach + in conv_man.clj:55, conversation.clj:244. + """ + + def test_t1_basic_monotonicity_across_updates(self): + """Participant who qualified in batch 1 stays in-conv after batch 2. + + P votes on 7 comments in batch 1 → qualifies. Batch 2 adds new comments + but P doesn't vote on them. P's count stays 7 → still in-conv. + + Would FAIL under delta processing without persistence if batch 2 only + contained new votes and P's old votes weren't re-scanned. 
+ """ + conv = Conversation('test-d2d-t1') + + # Batch 1: P0 votes on 7 comments, P1 votes on all 10 + batch1 = {'votes': [ + *[{'pid': 0, 'tid': tid, 'vote': 1} for tid in range(7)], + *[{'pid': 1, 'tid': tid, 'vote': 1} for tid in range(10)], + ]} + conv = conv.update_votes(batch1, recompute=False) + in_conv_after_b1 = conv._get_in_conv_participants() + assert 0 in in_conv_after_b1, "P0 should be in-conv after batch 1 (7 votes)" + + # Batch 2: new comments c10-c14, only P1 votes on them + batch2 = {'votes': [ + *[{'pid': 1, 'tid': tid, 'vote': 1} for tid in range(10, 15)], + ]} + conv = conv.update_votes(batch2, recompute=False) + in_conv_after_b2 = conv._get_in_conv_participants() + assert 0 in in_conv_after_b2, ( + "P0 should still be in-conv after batch 2 (7 votes unchanged)" + ) + + def test_t2_monotonicity_survives_moderation(self): + """Participant stays in-conv after their voted comments are moderated-out. + + P votes on 7 comments → qualifies. 3 comments moderated-out. P's raw + count is still 7 → still in-conv (because raw_rating_mat is used). + + Would FAIL under delta processing without persistence if moderation + caused a recount from only the filtered matrix. + """ + conv = Conversation('test-d2d-t2') + votes = {'votes': [ + *[{'pid': 0, 'tid': tid, 'vote': 1} for tid in range(7)], + *[{'pid': 1, 'tid': tid, 'vote': 1} for tid in range(10)], + ]} + conv = conv.update_votes(votes, recompute=False) + + in_conv_before = conv._get_in_conv_participants() + assert 0 in in_conv_before, "P0 should be in-conv before moderation" + + # Moderate out 3 of P0's comments + conv = conv.update_moderation({'mod_out_tids': [0, 1, 2]}, recompute=False) + + in_conv_after = conv._get_in_conv_participants() + assert 0 in in_conv_after, ( + "P0 should still be in-conv after moderation " + "(7 raw votes, only 4 on filtered matrix)" + ) + + def test_t3_worker_restart_with_moderation(self): + """Participant survives worker restart + moderation. 
+ + P votes on 7 comments → qualifies. Worker dies. 3 comments moderated-out. + New worker rebuilds from all votes. P still has 7 raw votes → in-conv. + + This is the KEY test that would FAIL under delta processing without + persistence: after restart, only new votes would be scanned. + """ + # Original worker + conv1 = Conversation('test-d2d-t3') + votes = {'votes': [ + *[{'pid': 0, 'tid': tid, 'vote': 1} for tid in range(7)], + *[{'pid': 1, 'tid': tid, 'vote': 1} for tid in range(10)], + ]} + conv1 = conv1.update_votes(votes, recompute=False) + assert 0 in conv1._get_in_conv_participants() + + # Simulate worker restart: new Conversation object, replay ALL votes + conv2 = Conversation('test-d2d-t3') + conv2 = conv2.update_votes(votes, recompute=False) + + # Apply moderation that happened while worker was dead + conv2 = conv2.update_moderation({'mod_out_tids': [0, 1, 2]}, recompute=False) + + in_conv = conv2._get_in_conv_participants() + assert 0 in in_conv, ( + "P0 should be in-conv after restart+moderation " + "(full recompute from all votes)" + ) + + def test_t4_worker_restart_moderation_no_new_votes(self): + """Rebuild from existing votes after moderation, no new votes needed. + + Same as T3 but verifies that recompute alone is sufficient — no "trigger" + of new votes is needed to re-evaluate in-conv. + """ + votes = {'votes': [ + *[{'pid': 0, 'tid': tid, 'vote': 1} for tid in range(7)], + *[{'pid': 1, 'tid': tid, 'vote': 1} for tid in range(10)], + ]} + + # Build from scratch with moderation already applied + conv = Conversation('test-d2d-t4') + conv = conv.update_votes(votes, recompute=False) + conv = conv.update_moderation({'mod_out_tids': [0, 1, 2]}, recompute=False) + + in_conv = conv._get_in_conv_participants() + assert 0 in in_conv, ( + "P0 should be in-conv with just existing votes + moderation" + ) + + def test_t5_mixed_participants_moderation(self): + """Both old and new participants correct after moderation. + + P0 votes on c0-c6. 
c0-c2 moderated-out. New participant Q votes on c3-c9. + Rebuild from all votes. Both should be in-conv: + - P0: 7 raw votes (c0-c6), threshold min(7,10)=7 → qualifies + - Q: 7 votes on non-moderated-out comments → qualifies + """ + conv = Conversation('test-d2d-t5') + votes = {'votes': [ + *[{'pid': 0, 'tid': tid, 'vote': 1} for tid in range(7)], # c0-c6 + *[{'pid': 1, 'tid': tid, 'vote': 1} for tid in range(3, 10)], # c3-c9 + ]} + conv = conv.update_votes(votes, recompute=False) + conv = conv.update_moderation({'mod_out_tids': [0, 1, 2]}, recompute=False) + + in_conv = conv._get_in_conv_participants() + assert 0 in in_conv, "P0 (7 raw votes) should be in-conv" + assert 1 in in_conv, "P1 (7 votes on non-moderated-out comments) should be in-conv" + + +# ============================================================================ +# D4 — Pseudocount Formula +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD4Pseudocount: + """ + D4: Python uses PSEUDO_COUNT = 1.5 → pa = (na + 0.75) / (ns + 1.5) + Clojure uses PSEUDO_COUNT = 2.0 → pa = (na + 1) / (ns + 2) + """ + + def test_pseudocount_constant(self): + """Verify the pseudocount constant matches Clojure's Beta(2,2) prior.""" + # Target: PSEUDO_COUNT = 2.0 + check.equal(PSEUDO_COUNT, 2.0, + f"PSEUDO_COUNT should be 2.0 (Clojure Beta(2,2) prior), got {PSEUDO_COUNT}") + + def test_pa_values_match_clojure(self, conv, clojure_blob, dataset_name): + """p-success values should match Clojure for specific (group, comment) pairs.""" + clojure_entries = _clojure_repness_entries(clojure_blob) + if not clojure_entries: + pytest.skip("No repness entries in Clojure blob") + + mismatches = 0 + total = 0 + for entry in clojure_entries: + clj_pa = entry.get('p-success') + clj_na = entry.get('n-success') + clj_ns = entry.get('n-trials') + if clj_pa is None or clj_na is None or clj_ns is None: + continue + + # Recompute with Python's current pseudocount + 
python_pa = (clj_na + PSEUDO_COUNT / 2) / (clj_ns + PSEUDO_COUNT) + total += 1 + + if abs(python_pa - clj_pa) > 1e-6: + mismatches += 1 + + print(f"[{dataset_name}] pa mismatches: {mismatches}/{total}") + check.equal(mismatches, 0, f"pa values differ for {mismatches}/{total} entries") + + +# ============================================================================ +# D9 — Z-Score Significance Thresholds +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD9ZScoreThresholds: + """ + D9: Python uses two-tailed z-scores (Z_90=1.645, Z_95=1.96) + Clojure uses one-tailed z-scores (Z_90=1.2816, Z_95=1.6449) + + Python's higher thresholds mean fewer comments pass significance, + leading to empty comment_repness. + """ + + @pytest.mark.xfail(reason="D9: Z_90=1.645 (two-tailed), target is 1.2816 (one-tailed)") + def test_z90_matches_clojure(self): + """Z_90 should be one-tailed (1.2816), not two-tailed (1.645).""" + check.almost_equal(Z_90, 1.2816, abs=0.001, + msg=f"Z_90 should be 1.2816 (one-tailed), got {Z_90}") + + @pytest.mark.xfail(reason="D9: Z_95=1.96 (two-tailed), target is 1.6449 (one-tailed)") + def test_z95_matches_clojure(self): + """Z_95 should be one-tailed (1.6449), not two-tailed (1.96).""" + check.almost_equal(Z_95, 1.6449, abs=0.001, + msg=f"Z_95 should be 1.6449 (one-tailed), got {Z_95}") + + def test_repness_not_empty(self, conv, dataset_name): + """Repness should produce non-empty comment_repness with correct thresholds.""" + repness = conv.repness + check.is_not_none(repness, "Repness should not be None") + if repness: + check.is_in('comment_repness', repness, "Should have comment_repness key") + if 'comment_repness' in repness: + check.greater(len(repness['comment_repness']), 0, + "comment_repness should not be empty") + + +# ============================================================================ +# D5 — Proportion Test Formula +# 
============================================================================ + +@pytest.mark.clojure_comparison +class TestD5ProportionTest: + """ + D5: Python uses standard z-test: (p - 0.5) / sqrt(0.25/n) + Clojure uses Wilson-score-like: 2*sqrt(n+1)*((succ+1)/(n+1) - 0.5) + + Clojure formula has built-in regularization via +1 terms. + """ + + @pytest.mark.xfail(reason="D5: Python standard z-test vs Clojure Wilson-score-like") + def test_prop_test_matches_clojure_formula(self): + """prop_test should match Clojure's formula for known inputs.""" + # Example: 12 successes out of 13 trials + succ, n = 12, 13 + # Clojure formula: 2 * sqrt(n+1) * ((succ+1)/(n+1) - 0.5) + expected = 2 * math.sqrt(n + 1) * ((succ + 1) / (n + 1) - 0.5) + + # Current Python: prop_test(p, n, 0.5) where p = (succ + pc/2) / (n + pc) + p = (succ + PSEUDO_COUNT / 2) / (n + PSEUDO_COUNT) + python_result = prop_test(p, n, 0.5) + + print(f"prop_test(succ={succ}, n={n}): Python={python_result:.4f}, Clojure={expected:.4f}") + check.almost_equal(python_result, expected, abs=0.01, + msg=f"prop_test mismatch: Python={python_result:.4f}, Clojure={expected:.4f}") + + def test_clojure_pat_values_consistent_with_formula(self, clojure_blob, dataset_name): + """Sanity check: Clojure's p-test values match the documented formula.""" + clojure_entries = _clojure_repness_entries(clojure_blob) + if not clojure_entries: + pytest.skip("No repness entries in Clojure blob") + + mismatches = 0 + total = 0 + max_diff = 0.0 + for entry in clojure_entries: + clj_pat = entry.get('p-test') + if clj_pat is None: + continue + clj_na = entry.get('n-success', 0) + clj_ns = entry.get('n-trials', 0) + if clj_ns == 0: + continue + + # Recompute using Clojure's documented formula + expected = 2 * math.sqrt(clj_ns + 1) * ((clj_na + 1) / (clj_ns + 1) - 0.5) + + total += 1 + diff = abs(clj_pat - expected) + if diff > 0.01: + mismatches += 1 + max_diff = max(max_diff, diff) + + print(f"[{dataset_name}] pat consistency: {total - 
mismatches}/{total} match formula (max_diff={max_diff:.4f})") + check.equal(mismatches, 0, f"Clojure p-test values don't match formula for {mismatches}/{total}") + + +# ============================================================================ +# D6 — Two-Proportion Test Adjustment +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD6TwoPropTest: + """ + D6: Python uses standard two-proportion z-test without pseudocounts. + Clojure adds +1 pseudocount to all 4 inputs (succ1, n1, succ2, n2). + """ + + def test_two_prop_test_with_pseudocounts(self): + """two_prop_test should add +1 pseudocounts matching Clojure.""" + # With pseudocounts: (succ+1)/(n+2) for both groups + succ1, n1 = 10, 20 + succ2, n2 = 15, 30 + + # Clojure formula adds +1 to successes and +2 to trials + p1_clj = (succ1 + 1) / (n1 + 2) + p2_clj = (succ2 + 1) / (n2 + 2) + p_pooled_clj = (succ1 + succ2 + 2) / (n1 + n2 + 4) + se_clj = math.sqrt(p_pooled_clj * (1 - p_pooled_clj) * (1 / (n1 + 2) + 1 / (n2 + 2))) + expected = (p1_clj - p2_clj) / se_clj if se_clj > 0 else 0.0 + + # Python currently doesn't add pseudocounts + p1_py = succ1 / n1 + p2_py = succ2 / n2 + python_result = two_prop_test(p1_py, n1, p2_py, n2) + + print(f"two_prop_test: Python={python_result:.4f}, Clojure(with pseudocounts)={expected:.4f}") + check.almost_equal(python_result, expected, abs=0.01, + msg=f"two_prop_test should include pseudocounts: Python={python_result:.4f}, expected={expected:.4f}") + + +# ============================================================================ +# D7 — Repness Metric Formula +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD7RepnessMetric: + """ + D7: Python uses pa * (|pat| + |rat|) — weighted sum of absolutes + Clojure uses ra * rat * pa * pat — product of signed values + + The Clojure product formula is more conservative: any factor near 0 + 
kills the whole metric. + """ + + @pytest.mark.xfail(reason="D7: Python uses pa*(|pat|+|rat|), target is ra*rat*pa*pat") + def test_metric_formula_is_product(self): + """repness_metric should use product formula (ra * rat * pa * pat).""" + stats = { + 'pa': 0.8, 'pat': 2.5, 'ra': 1.3, 'rat': 1.8, + 'pd': 0.2, 'pdt': -1.5, 'rd': 0.7, 'rdt': -0.9, + } + + # Clojure formula for agree: ra * rat * pa * pat + expected_agree = stats['ra'] * stats['rat'] * stats['pa'] * stats['pat'] + # Current Python formula: pa * (|pat| + |rat|) + current_python = stats['pa'] * (abs(stats['pat']) + abs(stats['rat'])) + + result = repness_metric(stats, 'a') + print(f"agree_metric: current={result:.4f}, expected(Clojure)={expected_agree:.4f}, current_formula={current_python:.4f}") + + check.almost_equal(result, expected_agree, abs=0.01, + msg=f"agree_metric should be ra*rat*pa*pat={expected_agree:.4f}, got {result:.4f}") + + +# ============================================================================ +# D8 — Finalize Comment Stats Logic +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD8FinalizeStats: + """ + D8: Python uses if pa > 0.5 AND ra > 1.0 → 'agree'; elif pd > 0.5 AND rd > 1.0 → 'disagree' + Clojure uses simple rat > rdt → 'agree'; else → 'disagree' + """ + + @pytest.mark.xfail(reason="D8: Python uses pa/ra thresholds, target is rat>rdt comparison") + def test_repful_uses_rat_vs_rdt(self): + """repful classification should use rat > rdt (Clojure logic).""" + # Case where Python and Clojure disagree: + # pa > 0.5 and ra > 1.0 → Python says 'agree' + # but rat < rdt → Clojure says 'disagree' + stats = { + 'pa': 0.6, 'pat': 1.0, 'ra': 1.2, 'rat': 0.5, + 'pd': 0.4, 'pdt': -0.5, 'rd': 0.8, 'rdt': 1.5, + 'agree_metric': 0.0, # placeholder + 'disagree_metric': 0.0, + } + + result = finalize_cmt_stats(stats) + + # Clojure: rat (0.5) < rdt (1.5) → 'disagree' + # Python: pa (0.6) > 0.5 and ra (1.2) > 1.0 → 
'agree' + check.equal(result['repful'], 'disagree', + f"repful should be 'disagree' when rat < rdt, got '{result['repful']}'") + + +# ============================================================================ +# D10 — Representative Comment Selection +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD10RepCommentSelection: + """ + D10: Python selects 3 agree + 2 disagree = 5 total + Clojure selects up to 5 total, agrees first, with beats-best-by-test logic + """ + + @pytest.mark.xfail(reason="D10: Different selection logic than Clojure") + def test_rep_comments_match_clojure(self, conv, clojure_blob, dataset_name): + """Selected representative comments per group should match Clojure.""" + clojure_repness = clojure_blob.get('repness', {}) + if not clojure_repness: + pytest.skip("No repness in Clojure blob") + + python_repness = conv.repness or {} + group_repness = python_repness.get('group_repness', {}) + + total_groups = 0 + matching_groups = 0 + + for gid_str, clj_entries in clojure_repness.items(): + gid = int(gid_str) + clj_tids = set(e['tid'] for e in clj_entries) + + py_entries = group_repness.get(gid, []) + py_tids = set(e.get('comment_id') for e in py_entries) + + total_groups += 1 + overlap = len(clj_tids & py_tids) + total_unique = len(clj_tids | py_tids) + + if clj_tids == py_tids: + matching_groups += 1 + + print(f"[{dataset_name}] Group {gid}: Clojure tids={sorted(clj_tids)}, Python tids={sorted(py_tids)}, overlap={overlap}/{total_unique}") + + check.equal(matching_groups, total_groups, + f"Only {matching_groups}/{total_groups} groups have matching rep comments") + + +# ============================================================================ +# D11 — Consensus Comment Selection +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD11ConsensusSelection: + """ + D11: Python uses ALL groups pa > 0.6, top 
2 overall + Clojure uses per-comment pa > 0.5, top 5 agree + 5 disagree with z-test scores + """ + + @pytest.mark.xfail(reason="D11: Different consensus selection logic than Clojure") + def test_consensus_matches_clojure(self, conv, clojure_blob, dataset_name): + """Consensus comments should match Clojure's selection.""" + clj_consensus = clojure_blob.get('consensus', {}) + if not clj_consensus: + pytest.skip("No consensus in Clojure blob") + + # Clojure consensus has 'agree' and 'disagree' keys + clj_agree_tids = set(e['tid'] for e in clj_consensus.get('agree', [])) + clj_disagree_tids = set(e['tid'] for e in clj_consensus.get('disagree', [])) + clj_all = clj_agree_tids | clj_disagree_tids + + py_consensus = conv.repness.get('consensus_comments', []) if conv.repness else [] + py_tids = set(c.get('comment_id') for c in py_consensus) + + print(f"[{dataset_name}] Consensus: Clojure agree={sorted(clj_agree_tids)}, disagree={sorted(clj_disagree_tids)}") + print(f"[{dataset_name}] Consensus: Python={sorted(py_tids)}") + + overlap = len(clj_all & py_tids) + print(f"[{dataset_name}] Consensus overlap: {overlap}/{len(clj_all)}") + + check.equal(py_tids, clj_all, + f"Consensus mismatch: Python={sorted(py_tids)}, Clojure={sorted(clj_all)}") + + +# ============================================================================ +# D12 — Comment Priorities +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD12CommentPriorities: + """ + D12: Comment priorities are not computed by Python. + Clojure computes priorities based on PCA extremity and importance. 
+ """ + + @pytest.mark.xfail(reason="D12: Comment priorities not implemented in Python") + def test_comment_priorities_exist(self, conv, clojure_blob, dataset_name): + """Python should produce comment-priorities matching Clojure.""" + clj_priorities = clojure_blob.get('comment-priorities', {}) + check.greater(len(clj_priorities), 0, + f"Clojure has {len(clj_priorities)} comment priorities") + + # Check that Python produces priorities + has_priorities = hasattr(conv, 'comment_priorities') and conv.comment_priorities + check.is_true(has_priorities, "Python should compute comment_priorities") + + if not has_priorities: + return + + py_priorities = conv.comment_priorities + # Compare rankings (Spearman correlation would be ideal, but check overlap first) + common_tids = set(str(k) for k in clj_priorities.keys()) & set(str(k) for k in py_priorities.keys()) + print(f"[{dataset_name}] Common priority tids: {len(common_tids)}/{len(clj_priorities)}") + check.greater(len(common_tids), 0, "Should have common priority tids") + + +# ============================================================================ +# D15 — Moderation Handling +# ============================================================================ + +@pytest.mark.clojure_comparison +class TestD15ModerationHandling: + """ + D15: Python removes moderated comments entirely from matrix. + Clojure zeros them out (keeps structure, sets values to 0). + """ + + def test_moderated_comments_zeroed_not_removed(self, conv, clojure_blob, dataset_name): + """ + Moderated comments should be zeroed out, not removed, if any exist. + + Note: This test only applies when the dataset has moderated comments. 
+ """ + # Check if Clojure blob has mod-out comments + mod_out = clojure_blob.get('mod-out', []) + if not mod_out: + pytest.skip(f"[{dataset_name}] No moderated comments in this dataset") + + # If there ARE moderated comments, check Python's handling + n_cols_python = len(conv.rating_mat.columns) + n_tids_clojure = len(clojure_blob.get('tids', [])) + + print(f"[{dataset_name}] Moderated comments: {len(mod_out)}") + print(f"[{dataset_name}] Python matrix columns: {n_cols_python}, Clojure tids: {n_tids_clojure}") + + # Clojure keeps all tids (zeroed for mod-out), Python removes them + check.equal(n_cols_python, n_tids_clojure, + f"Matrix columns differ: Python={n_cols_python}, Clojure={n_tids_clojure} (mod-out={len(mod_out)})") + + +# ============================================================================ +# Synthetic edge-case tests (not dataset-dependent) +# ============================================================================ + +class TestSyntheticEdgeCases: + """ + Synthetic tests with made-up data to verify specific formulas + independently of any real dataset. These document intent clearly + and prevent regressions. 
+ """ + + def test_pseudocount_beta_2_2_prior(self): + """With PSEUDO_COUNT=2.0, pa should use Beta(2,2) prior: (na+1)/(ns+2).""" + na, ns = 3, 4 + expected = (na + 1) / (ns + 2) # = 4/6 = 0.6667 + actual = (na + PSEUDO_COUNT / 2) / (ns + PSEUDO_COUNT) + check.almost_equal(actual, expected, abs=1e-10, + msg=f"With PSEUDO_COUNT={PSEUDO_COUNT}: got {actual}, expected {expected}") + + @pytest.mark.xfail(reason="D9: Z thresholds are two-tailed, target is one-tailed") + def test_z_thresholds_are_one_tailed(self): + """Z thresholds should be one-tailed: Z_90=1.2816, Z_95=1.6449.""" + check.almost_equal(Z_90, 1.2816, abs=0.001, + msg=f"Z_90={Z_90}, expected 1.2816 (one-tailed)") + check.almost_equal(Z_95, 1.6449, abs=0.001, + msg=f"Z_95={Z_95}, expected 1.6449 (one-tailed)") + + def test_clojure_prop_test_formula(self): + """Verify Clojure's proportion test formula: 2*sqrt(n+1)*((succ+1)/(n+1) - 0.5).""" + # Small n: 5 successes out of 8 trials + succ, n = 5, 8 + result = 2 * math.sqrt(n + 1) * ((succ + 1) / (n + 1) - 0.5) + # Manual: 2 * 3 * (6/9 - 0.5) = 6 * 0.1667 = 1.0 + expected = 2 * 3.0 * (6.0 / 9.0 - 0.5) + assert abs(result - expected) < 1e-10 + + def test_clojure_repness_metric_product(self): + """Verify Clojure's repness metric is a product: ra * rat * pa * pat.""" + ra, rat, pa, pat = 1.5, 2.0, 0.8, 3.0 + expected = ra * rat * pa * pat # = 7.2 + assert expected == pytest.approx(7.2) + + def test_clojure_repful_uses_rat_vs_rdt(self): + """Clojure determines repful by comparing rat vs rdt.""" + # rat > rdt → agree + assert (2.0 > 1.0) # rat=2.0, rdt=1.0 → agree + + # rat < rdt → disagree + assert (0.5 < 1.5) # rat=0.5, rdt=1.5 → disagree diff --git a/delphi/tests/test_legacy_clojure_regression.py b/delphi/tests/test_legacy_clojure_regression.py index f3dcef40b..6d72ec653 100644 --- a/delphi/tests/test_legacy_clojure_regression.py +++ b/delphi/tests/test_legacy_clojure_regression.py @@ -18,144 +18,81 @@ import pytest import pytest_check as check -import gc -from 
polismath.conversation.conversation import Conversation -from polismath.regression import get_dataset_files +from polismath.regression import get_dataset_files, get_blob_variants from polismath.regression.datasets import discover_datasets -from tests.common_utils import load_votes, load_comments, load_clojure_output -from conftest import _get_requested_datasets, make_dataset_params +from tests.common_utils import load_clojure_output +from conftest import _get_requested_datasets, make_dataset_params, parse_dataset_blob_id from polismath.regression.clojure_comparer import ( ClojureComparer, unfold_clojure_group_clusters, ) -def _get_clojure_datasets(include_local: bool, requested: Optional[set[str]] = None) -> list[str]: - """Get datasets that have Clojure math_blob for comparison. +def _get_clojure_dataset_blob_ids(include_local: bool, requested: Optional[set[str]] = None) -> list[str]: + """Get composite 'dataset-blob_type' IDs for all filled blobs. - Only requires votes, comments, and math_blob - does NOT require golden_snapshot. - Filters by requested datasets if specified. + Returns IDs like 'biodiversity-incremental', 'engage-incremental', 'engage-cold_start'. + Only includes blobs that have meaningful content (PCA, clusters, etc.). + Filters by dataset name if --datasets is specified. 
""" datasets = discover_datasets(include_local=include_local) - result = [ - name for name, info in datasets.items() - if info.has_votes and info.has_comments and info.has_clojure_reference - ] - # Filter by --datasets if specified - if requested: - result = [d for d in result if d in requested] + result = [] + for name, info in datasets.items(): + if not (info.has_votes and info.has_comments and info.has_clojure_reference): + continue + if requested and name not in requested: + continue + for blob_type in get_blob_variants(name): + result.append(f"{name}-{blob_type}") return result -# Module-level cache for conversation data - survives across fixture calls -_CONVERSATION_CACHE: dict = {} +# Module-level cache for blobs (keyed by composite ID) +_BLOB_CACHE: dict = {} def pytest_generate_tests(metafunc): - """Parametrize tests with clojure datasets at collection time.""" - if "dataset_name" in metafunc.fixturenames: + """Parametrize tests with clojure dataset+blob_type at collection time.""" + if "dataset_blob_id" in metafunc.fixturenames: include_local = metafunc.config.getoption("--include-local", default=False) requested = _get_requested_datasets(metafunc.config) - datasets = _get_clojure_datasets(include_local, requested) - # Add xdist_group marker to each parameter for parallel execution - params = make_dataset_params(datasets) - metafunc.parametrize("dataset_name", params, scope="class") - - -def _cleanup_previous_datasets(current_dataset: str): - """Clear cached datasets except the current one to manage memory.""" - global _CONVERSATION_CACHE - for ds in list(_CONVERSATION_CACHE.keys()): - if ds != current_dataset: - print(f"[{ds}] Cleaning up previous dataset...") - _CONVERSATION_CACHE.pop(ds, None) - Conversation._reset_conversion_cache() - gc.collect() + blob_ids = _get_clojure_dataset_blob_ids(include_local, requested) + params = make_dataset_params(blob_ids) + metafunc.parametrize("dataset_blob_id", params, scope="class") @pytest.fixture(scope="class") 
-def conversation_data(dataset_name): +def conversation_data(dataset_blob_id, get_or_compute_conversation): """ - Class-scoped fixture computed once per dataset. - Uses module-level cache to avoid recomputation. + Class-scoped fixture computed once per dataset+blob_type. + Reuses the Conversation across blob variants via the session-scoped cache. """ - global _CONVERSATION_CACHE - - # Clean up previous datasets to manage memory - _cleanup_previous_datasets(dataset_name) - - # Return cached data if available - if dataset_name in _CONVERSATION_CACHE: - return _CONVERSATION_CACHE[dataset_name] - - # Compute the data - - # Get dataset files using central configuration - dataset_files = get_dataset_files(dataset_name) - - # Load the Clojure output for comparison - clojure_output = load_clojure_output(dataset_files['math_blob']) - - # Create and compute conversation - votes = load_votes(dataset_files['votes']) - comments = load_comments(dataset_files['comments']) - - print(f"\n[{dataset_name}] Processing conversation with {len(votes['votes'])} votes and {len(comments['comments'])} comments") - conv = Conversation(dataset_name) - conv = conv.update_votes(votes) - - print(f"[{dataset_name}] Recomputing conversation analysis...") - conv = conv.recompute() - - # Extract key metrics for reporting - group_count = len(conv.group_clusters) - print(f"[{dataset_name}] Found {group_count} groups") - print(f"[{dataset_name}] Processed {conv.comment_count} comments") - print(f"[{dataset_name}] Found {conv.participant_count} participants") - - if conv.repness and 'comment_repness' in conv.repness: - print(f"[{dataset_name}] Calculated representativeness for {len(conv.repness['comment_repness'])} comments") - - # Print top representative comments for each group - if conv.repness and 'comment_repness' in conv.repness: - for group_id in range(group_count): - print(f"\n[{dataset_name}] Top representative comments for Group {group_id}:") - group_repness = [item for item in 
conv.repness['comment_repness'] if item['gid'] == group_id] - - # Sort by representativeness - group_repness.sort(key=lambda x: abs(x['repness']), reverse=True) - - # Print top 5 comments - for i, rep_item in enumerate(group_repness[:5]): - comment_id = rep_item['tid'] - # Get the comment text if available - comment_txt = next((c['txt'] for c in comments['comments'] if str(c['tid']) == str(comment_id)), 'Unknown') - print(f" {i+1}. Comment {comment_id} (Repness: {rep_item['repness']:.4f}): {comment_txt[:50]}...") - - # Save the Python conversion results for manual inspection - import os - import json - data_dir = dataset_files['data_dir'] - output_dir = os.path.join(os.path.dirname(data_dir), '.test_outputs', 'python_output', dataset_name) - os.makedirs(output_dir, exist_ok=True) - - output_path = os.path.join(output_dir, 'conversation_result.json') - with open(output_path, 'w') as f: - json.dump(conv.to_dict(), f, indent=2) - - print(f"[{dataset_name}] Saved results to {output_path}") - - # Cache for sharing across test methods - data = { - 'conv': conv, - 'clojure_output': clojure_output, + global _BLOB_CACHE + dataset_name, blob_type = parse_dataset_blob_id(dataset_blob_id) + + # Get or compute the conversation (shared across blob variants via session cache) + conv_data = get_or_compute_conversation(dataset_name) + + # Load the specific blob variant (cache per composite ID) + if dataset_blob_id not in _BLOB_CACHE: + # Evict blobs from other datasets + for bid in list(_BLOB_CACHE.keys()): + if not bid.startswith(dataset_name + '-'): + _BLOB_CACHE.pop(bid, None) + + dataset_files = get_dataset_files(dataset_name, blob_type=blob_type) + clojure_output = load_clojure_output(dataset_files['math_blob']) + print(f"[{dataset_name}] Loaded {blob_type} blob for Clojure comparison") + _BLOB_CACHE[dataset_blob_id] = clojure_output + + return { + 'conv': conv_data['conv'], + 'clojure_output': _BLOB_CACHE[dataset_blob_id], 'dataset_name': dataset_name, - 'comments': comments, 
+ 'blob_type': blob_type, + 'comments': conv_data['comments'], } - _CONVERSATION_CACHE[dataset_name] = data - - return data @pytest.mark.clojure_comparison @@ -389,3 +326,74 @@ def test_comment_priorities(self, conversation_data): check.equal(len(mismatches), 0, f"All comment priorities should match Clojure (got {len(mismatches)} mismatches out of {len(clojure_priorities)})") + + @pytest.mark.xfail(raises=AssertionError, strict=True, reason="D5/D6/D7/D10: z-values, metric, and selection logic differ") + def test_repness_matches_clojure(self, conversation_data): + """ + Test that representative comment selection matches Clojure. + + Compares selected comment sets and z-score values per group. + Requires D5 (prop test), D6 (two-prop test), D7 (metric), + and D10 (selection logic) to fully pass. + """ + conv = conversation_data['conv'] + clojure_output = conversation_data['clojure_output'] + dataset_name = conversation_data['dataset_name'] + + print(f"\n[{dataset_name}] Testing repness matches Clojure...") + + clj_repness = clojure_output.get('repness', {}) + check.is_true(bool(clj_repness), "Clojure output should have repness") + + py_repness = (conv.repness or {}).get('group_repness', {}) + check.is_true(bool(py_repness), "Python should have group_repness") + + if not clj_repness or not py_repness: + return + + set_mismatches = [] + value_mismatches = [] + + for gid_str, clj_entries in clj_repness.items(): + gid = int(gid_str) + + # Compare selected comment sets + clj_tids = set(e['tid'] for e in clj_entries) + py_entries = py_repness.get(gid, []) + py_tids = set(int(e['comment_id']) for e in py_entries) + + if clj_tids != py_tids: + set_mismatches.append( + f" g{gid}: clj={sorted(clj_tids)}, py={sorted(py_tids)}") + + # Compare z-values for shared comments + py_by_tid = {int(e['comment_id']): e for e in py_entries} + for clj_entry in clj_entries: + tid = clj_entry['tid'] + py_entry = py_by_tid.get(tid) + if py_entry is None: + continue + + clj_pat = 
clj_entry.get('p-test', 0) + py_pat = py_entry.get('pat', 0) + clj_rat = clj_entry.get('repness-test', 0) + py_rat = py_entry.get('rat', 0) + + if abs(clj_pat - py_pat) > 0.01 or abs(clj_rat - py_rat) > 0.01: + value_mismatches.append( + f" g{gid}/t{tid}: pat clj={clj_pat:.4f} py={py_pat:.4f}, " + f"rat clj={clj_rat:.4f} py={py_rat:.4f}") + + if set_mismatches: + print(f" Set mismatches ({len(set_mismatches)} groups):") + for m in set_mismatches[:10]: + print(m) + if value_mismatches: + print(f" Value mismatches ({len(value_mismatches)} comments):") + for m in value_mismatches[:10]: + print(m) + + check.equal(len(set_mismatches), 0, + f"{len(set_mismatches)} groups differ in selected rep comments") + check.equal(len(value_mismatches), 0, + f"{len(value_mismatches)} shared comments have z-value mismatches") diff --git a/delphi/tests/test_legacy_repness_comparison.py b/delphi/tests/test_legacy_repness_comparison.py index f9545cf5e..b106a3bb3 100644 --- a/delphi/tests/test_legacy_repness_comparison.py +++ b/delphi/tests/test_legacy_repness_comparison.py @@ -22,6 +22,7 @@ from polismath.pca_kmeans_rep.repness import conv_repness, participant_stats from common_utils import create_test_conversation from polismath.regression import get_dataset_files +from conftest import parse_dataset_blob_id import json logger = logging.getLogger(__name__) @@ -47,8 +48,12 @@ def log_warning(self): @pytest.fixture def clojure_results(self, dataset_name: str) -> Dict[str, Any]: - """Load Clojure reference results from file.""" - dataset_files = get_dataset_files(dataset_name) + """Load Clojure reference results from file. + + dataset_name is a composite ID like 'biodiversity-incremental' or 'engage-cold_start'. 
+ """ + ds_name, blob_type = parse_dataset_blob_id(dataset_name) + dataset_files = get_dataset_files(ds_name, blob_type=blob_type) json_path = dataset_files['math_blob'] if not os.path.exists(json_path): @@ -61,8 +66,9 @@ def clojure_results(self, dataset_name: str) -> Dict[str, Any]: @pytest.fixture def conversation(self, dataset_name: str): """Create conversation with PCA and clustering computed.""" - logger.debug(f"Creating conversation for {dataset_name}") - conv = create_test_conversation(dataset_name) + ds_name, _blob_type = parse_dataset_blob_id(dataset_name) + logger.debug(f"Creating conversation for {ds_name}") + conv = create_test_conversation(ds_name) logger.debug(f"Participants: {conv.participant_count}, Comments: {conv.comment_count}") @@ -210,7 +216,7 @@ def _compare_results(self, py_results: Dict[str, Any], clj_results: Dict[str, An return overall_match_rate, stats - @pytest.mark.use_discovered_datasets + @pytest.mark.use_discovered_datasets(use_blobs=True) def test_structural_compatibility(self, dataset_name: str, python_results, clojure_results): """Test that Python and Clojure results have compatible structure.""" logger.info(f"Testing structural compatibility for {dataset_name} dataset") @@ -227,52 +233,48 @@ def test_structural_compatibility(self, dataset_name: str, python_results, cloju else: logger.warning(f"No Clojure results available for {dataset_name}") - @pytest.mark.use_discovered_datasets - def test_comparison_visibility(self, dataset_name: str, python_results, clojure_results): - """ - Compare Python and Clojure results for visibility into differences. - - Note: This test does NOT assert on match rates, as implementations are - known to be very different. It reports statistics for manual inspection. - """ - logger.info(f"Comparing representativeness for {dataset_name} dataset") - - if not clojure_results: - logger.warning(f"No Clojure results available for {dataset_name}. 
Skipping comparison.") - pytest.skip(f"No Clojure results for {dataset_name}") - return - - # Perform comparison - match_rate, stats = self._compare_results(python_results, clojure_results) - - # Log comparison results (for visibility, not assertions) - logger.info(f"Comparison results for {dataset_name}:") - logger.info(f" - Overall: {stats['comment_matches']} / {stats['total_comments']} comments match") - logger.info(f" - Note: Python and Clojure implementations are known to be very different") - - logger.debug(f"Group match rates:") - for group_id, rate in stats['group_match_rates'].items(): - logger.debug(f" - Group {group_id}: {rate:.2f}") - - logger.debug(f"Consensus comments match rate: {stats['consensus_match_rate']:.2f}") - - # Log sample matching comments for inspection - if stats['top_matching_comments']: - logger.debug(f"Sample matching comments (first 3):") - for i, comment in enumerate(stats['top_matching_comments'][:3]): - cid = comment['comment_id'] - gid = comment['group_id'] - logger.debug(f" - Comment {cid} (Group {gid}):") - logger.debug(f" Clojure: Agree={comment['clojure']['agree']:.2f}, Disagree={comment['clojure']['disagree']:.2f}") - logger.debug(f" Python: Agree={comment['python']['agree']:.2f}, Disagree={comment['python']['disagree']:.2f}") - - # Log Python results summary - logger.debug(f"Python representativeness summary:") - for group_id, comments in python_results.get('group_repness', {}).items(): - if comments: - logger.debug(f" - Group {group_id}: {len(comments)} comments") - for i, cmt in enumerate(comments[:2]): # Show top 2 - logger.debug(f" Comment {i+1}: ID {cmt.get('comment_id')}, Type: {cmt.get('repful')}") - logger.debug(f" Agree: {cmt.get('pa', 0):.2f}, Disagree: {cmt.get('pd', 0):.2f}") - - logger.info(f"✓ Comparison completed for {dataset_name}") + @pytest.mark.use_discovered_datasets(use_blobs=True) + @pytest.mark.xfail(reason="D2/D3: different clustering → different groups → empty repness for some groups") + def 
test_python_covers_clojure_groups(self, dataset_name: str, python_results, clojure_results): + """Python should produce non-empty repness for every group Clojure has.""" + if not clojure_results or 'repness' not in clojure_results: + pytest.skip(f"No Clojure repness for {dataset_name}") + + clj_repness = clojure_results['repness'] + py_group_repness = python_results.get('group_repness', {}) + + for gid_str, clj_entries in clj_repness.items(): + if not isinstance(clj_entries, list) or not clj_entries: + continue + gid = int(gid_str) + py_entries = py_group_repness.get(gid, []) + assert len(py_entries) > 0, ( + f"Group {gid}: Clojure has {len(clj_entries)} rep comments, Python has none" + ) + + @pytest.mark.use_discovered_datasets(use_blobs=True) + @pytest.mark.xfail(reason="D5/D6/D10: z-value and selection logic differences") + def test_selected_comment_sets_match(self, dataset_name: str, python_results, clojure_results): + """Selected representative comment sets should match per group.""" + if not clojure_results or 'repness' not in clojure_results: + pytest.skip(f"No Clojure repness for {dataset_name}") + + clj_repness = clojure_results['repness'] + py_group_repness = python_results.get('group_repness', {}) + + mismatches = [] + for gid_str, clj_entries in clj_repness.items(): + if not isinstance(clj_entries, list): + continue + gid = int(gid_str) + clj_tids = set(int(e.get('tid', e.get('comment_id', 0))) for e in clj_entries) + py_entries = py_group_repness.get(gid, []) + py_tids = set(int(e['comment_id']) for e in py_entries) + + if clj_tids != py_tids: + mismatches.append( + f"Group {gid}: clj={sorted(clj_tids)}, py={sorted(py_tids)}") + + assert len(mismatches) == 0, ( + f"{len(mismatches)} groups differ:\n" + "\n".join(mismatches) + ) diff --git a/delphi/tests/test_old_format_repness.py b/delphi/tests/test_old_format_repness.py index 6679172a7..b0e70648d 100644 --- a/delphi/tests/test_old_format_repness.py +++ b/delphi/tests/test_old_format_repness.py @@ 
-80,8 +80,9 @@ def test_comment_stats(self): n_agree = 3 n_disagree = 1 n_votes = 4 - p_agree = (n_agree + 1.5/2) / (n_votes + 1.5) - p_disagree = (n_disagree + 1.5/2) / (n_votes + 1.5) + from polismath.pca_kmeans_rep.repness import PSEUDO_COUNT + p_agree = (n_agree + PSEUDO_COUNT/2) / (n_votes + PSEUDO_COUNT) + p_disagree = (n_disagree + PSEUDO_COUNT/2) / (n_votes + PSEUDO_COUNT) assert np.isclose(stats['pa'], p_agree) assert np.isclose(stats['pd'], p_disagree) diff --git a/delphi/tests/test_regression.py b/delphi/tests/test_regression.py index 743857064..df4823d50 100644 --- a/delphi/tests/test_regression.py +++ b/delphi/tests/test_regression.py @@ -13,12 +13,19 @@ pytest tests/test_regression.py --include-local # Include local datasets """ +import os + import pytest import numpy as np from polismath.regression import ConversationRecorder, ConversationComparer from polismath.regression.utils import load_golden_snapshot +_skip_golden = pytest.mark.skipif( + os.environ.get("SKIP_GOLDEN") == "1", + reason="Golden snapshot tests disabled (SKIP_GOLDEN=1)", +) + def _check_golden_exists(dataset_name: str): """ @@ -44,6 +51,7 @@ def _check_golden_exists(dataset_name: str): ) +@_skip_golden @pytest.mark.use_discovered_datasets def test_conversation_regression(dataset_name): """ @@ -91,6 +99,7 @@ def test_conversation_regression(dataset_name): ) +@_skip_golden @pytest.mark.use_discovered_datasets def test_conversation_stages_individually(dataset_name): """ diff --git a/delphi/tests/test_repness_unit.py b/delphi/tests/test_repness_unit.py index 69da76726..c2006e5d9 100644 --- a/delphi/tests/test_repness_unit.py +++ b/delphi/tests/test_repness_unit.py @@ -81,8 +81,9 @@ def test_comment_stats(self): n_agree = 3 n_disagree = 1 n_votes = 4 - p_agree = (n_agree + 1.5/2) / (n_votes + 1.5) - p_disagree = (n_disagree + 1.5/2) / (n_votes + 1.5) + from polismath.pca_kmeans_rep.repness import PSEUDO_COUNT + p_agree = (n_agree + PSEUDO_COUNT/2) / (n_votes + PSEUDO_COUNT) + p_disagree 
= (n_disagree + PSEUDO_COUNT/2) / (n_votes + PSEUDO_COUNT) assert np.isclose(stats['pa'], p_agree) assert np.isclose(stats['pd'], p_disagree)