diff --git a/delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md b/delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md index 1409f88cb..8c5d6797c 100644 --- a/delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md +++ b/delphi/docs/CLJ-PARITY-FIXES-JOURNAL.md @@ -440,8 +440,58 @@ significance filtering. Using different priors is intentional. ### What's Next -1. **PR 5 — Fix D6 (Two-proportion test)**: Add +1 pseudocount to all 4 inputs, change signature - from proportions to raw counts. +1. **PR 6 — Fix D7 (Repness metric)**: Change formula from `pa * (|pat| + |rat|)` to + `ra * rat * pa * pat` (Clojure product formula). + +--- + +## PR 5: Fix D6 — Two-Proportion Test Pseudocounts + +### TDD steps +1. **Baseline**: 1 failed (pakistan-incremental D2, pre-existing), 102 passed, 5 skipped, 143 xfailed, 2 xpassed +2. **Red**: Rewrote `TestD6TwoPropTest` with new signature `two_prop_test(succ_in, succ_out, pop_in, pop_out)` + and correct Clojure formula → 3 failures (TypeError: old function expects proportions) +3. **Fix**: Replaced both `two_prop_test` and `two_prop_test_vectorized` with Clojure formula: + add +1 to all 4 inputs (stats.clj:20), compute `pi1=(s+1)/(p+1)`, standard pooled z-test +4. **Green**: All 3 D6 formula tests pass, 4 blob comparison tests xfail (depend on D10) +5. **Full suite**: 4 regression failures, all in `rat`/`rdt`/`agree_metric`/`disagree_metric` — direct + downstream of the formula change. No unexpected fields affected. +6. **Re-recorded golden snapshots** for all 7 datasets (public + private) +7. 
**Final**: 1 failed (pakistan-incremental D2, pre-existing), 102 passed, 5 skipped, 143 xfailed, 2 xpassed + +### Changes +- `repness.py`: `two_prop_test(p1, n1, p2, n2)` → `two_prop_test(succ_in, succ_out, pop_in, pop_out)` + with +1 pseudocount on all 4 inputs, matching Clojure's `(map inc ...)` (stats.clj:20) +- `repness.py`: `two_prop_test_vectorized` — same signature change +- `repness.py`: Updated callers in `add_comparative_stats` and `compute_group_comment_stats_df` + to pass raw counts `(na, other_na, ns, other_ns)` instead of `(pa, ns, other_pa, other_ns)` +- `test_discrepancy_fixes.py`: Rewrote `TestD6TwoPropTest` with correct formula, 7 test cases, + edge cases, and regularization effect test +- `test_repness_unit.py`: Updated `test_two_prop_test`, `test_two_prop_test_vectorized`, + `test_two_prop_test_vectorized_edge_cases` for new signature +- `test_old_format_repness.py`: Updated `test_two_prop_test` for new signature + +### Key insight: existing test had wrong expected formula +The pre-existing D6 test computed expected values using `(succ+1)/(n+2)` — as if two pseudocounts +were added to the denominator. But Clojure's `(map inc ...)` adds +1 to each value independently, +giving `(succ+1)/(pop+1)`. The formula is a standard pooled z-test on the pseudocount-adjusted values, +not a Beta distribution posterior. 
+ +### Session 8 (2026-03-13) + +- Created branch `jc/clj-parity-d6-two-prop-test` on top of `jc/clj-parity-d5-prop-test` +- Read Clojure source (stats.clj:18-33, repness.clj:97-100) to verify formula and call sites +- Discovered the existing D6 test had wrong expected formula — fixed +- TDD cycle: red (3 TypeError failures) → fix → green (3 pass, 4 xfail) +- Updated all callers: both scalar (`add_comparative_stats`) and vectorized + (`compute_group_comment_stats_df`) now pass raw counts +- Full suite: 4 regression failures, all in rat/rdt/metric fields (expected) +- Re-recorded golden snapshots for all 7 datasets +- Final validation: 102 passed, 1 pre-existing failure (pakistan-incremental D2) + +### What's Next + +1. **PR 6 — Fix D7 (Repness metric)**: Change from `pa * (|pat| + |rat|)` to `ra * rat * pa * pat`. --- diff --git a/delphi/docs/PLAN_DISCREPANCY_FIXES.md b/delphi/docs/PLAN_DISCREPANCY_FIXES.md index 92cecbfbe..fdf014ee7 100644 --- a/delphi/docs/PLAN_DISCREPANCY_FIXES.md +++ b/delphi/docs/PLAN_DISCREPANCY_FIXES.md @@ -21,6 +21,7 @@ This plan's "PR N" labels map to actual GitHub PRs as follows: | (perf) | #2436 | Stack 10/10 | Speed up regression tests | | PR 3 (D9) | #2446 | — | Fix D9: z-score thresholds (one-tailed) | | PR 4 (D5) | #2448 | Stack 14/25 | Fix D5: proportion test formula | +| PR 5 (D6) | #2449 | Stack 15/25 | Fix D6: two-proportion test pseudocounts | Future fix PRs will be appended to the stack as they're created. 
@@ -467,7 +468,7 @@ By this point, we should have good test coverage from all the per-discrepancy te | D3 | K-smoother buffer | PR 10 | — | Fix | | D4 | Pseudocount formula | **PR 2** | **#2435** | **DONE** ✓ | | D5 | Proportion test | **PR 4** | — | **DONE** ✓ | -| D6 | Two-proportion test | PR 5 | — | Fix | +| D6 | Two-proportion test | **PR 5** | — | **DONE** ✓ | | D7 | Repness metric | PR 6 | — | Fix (with flag for old formula) | | D8 | Finalize cmt stats | PR 7 | — | Fix | | D9 | Z-score thresholds | **PR 3** | **#2446** | **DONE** ✓ | diff --git a/delphi/polismath/pca_kmeans_rep/repness.py b/delphi/polismath/pca_kmeans_rep/repness.py index 82f53556f..59ec17e07 100644 --- a/delphi/polismath/pca_kmeans_rep/repness.py +++ b/delphi/polismath/pca_kmeans_rep/repness.py @@ -95,33 +95,51 @@ def prop_test(succ: int, n: int) -> float: return 2 * math.sqrt(n_pc) * (succ_pc / n_pc - 0.5) -def two_prop_test(p1: float, n1: int, p2: float, n2: int) -> float: +def two_prop_test(succ_in: int, succ_out: int, pop_in: int, pop_out: int) -> float: """ - Two-proportion z-test. - + Two-proportion z-test with +1 pseudocount on all inputs. + + Matches Clojure's stats/two-prop-test (stats.clj:18-33): + (let [[succ-in succ-out pop-in pop-out] (map inc [succ-in succ-out pop-in pop-out]) + pi1 (/ succ-in pop-in) + pi2 (/ succ-out pop-out) + pi-hat (/ (+ succ-in succ-out) (+ pop-in pop-out))] + ...) + + The +1 on every count (pi = (succ+1)/(pop+1), not Laplace's (succ+1)/(pop+2)) regularizes the z-score for small + samples, preventing extreme values when group sizes are tiny. 
+ Args: - p1: First proportion - n1: Number of observations for first proportion - p2: Second proportion - n2: Number of observations for second proportion - + succ_in: Number of successes in the group (e.g., agrees) + succ_out: Number of successes outside the group + pop_in: Total votes in the group + pop_out: Total votes outside the group + Returns: - Z-score + Z-score (positive means group proportion > other proportion) """ - if n1 == 0 or n2 == 0: + if pop_in == 0 or pop_out == 0: return 0.0 - - # Pooled probability - p = (p1 * n1 + p2 * n2) / (n1 + n2) - - # Standard error - se = math.sqrt(p * (1 - p) * (1/n1 + 1/n2)) - - # Z-score calculation + + # Add +1 pseudocount to all four inputs (Clojure: map inc) + s1 = succ_in + 1 + s2 = succ_out + 1 + p1 = pop_in + 1 + p2 = pop_out + 1 + + pi1 = s1 / p1 + pi2 = s2 / p2 + pi_hat = (s1 + s2) / (p1 + p2) + + if pi_hat == 1.0: + # Clojure note (stats.clj:26-27): "this isn't quite right... could + # actually solve this using limits" — returning 0 for now, matching Clojure. 
+ return 0.0 + + se = math.sqrt(pi_hat * (1 - pi_hat) * (1/p1 + 1/p2)) if se == 0: return 0.0 - else: - return (p1 - p2) / se + return (pi1 - pi2) / se def comment_stats(votes: np.ndarray, group_members: List[int]) -> Dict[str, Any]: @@ -182,15 +200,17 @@ def add_comparative_stats(comment_stats: Dict[str, Any], result['ra'] = result['pa'] / other_stats['pa'] if other_stats['pa'] > 0 else 1.0 result['rd'] = result['pd'] / other_stats['pd'] if other_stats['pd'] > 0 else 1.0 - # Calculate representativeness tests + # Calculate representativeness tests — pass raw counts, matching Clojure's + # (stats/two-prop-test (:na in-stats) (sum :na rest-stats) + # (:ns in-stats) (sum :ns rest-stats)) (repness.clj:97-100) result['rat'] = two_prop_test( - result['pa'], result['ns'], - other_stats['pa'], other_stats['ns'] + result['na'], other_stats['na'], + result['ns'], other_stats['ns'] ) - + result['rdt'] = two_prop_test( - result['pd'], result['ns'], - other_stats['pd'], other_stats['ns'] + result['nd'], other_stats['nd'], + result['ns'], other_stats['ns'] ) return result @@ -493,30 +513,38 @@ def prop_test_vectorized(succ: pd.Series, n: pd.Series) -> pd.Series: return z -def two_prop_test_vectorized(p1: pd.Series, n1: pd.Series, - p2: pd.Series, n2: pd.Series) -> pd.Series: +def two_prop_test_vectorized(succ_in: pd.Series, succ_out: pd.Series, + pop_in: pd.Series, pop_out: pd.Series) -> pd.Series: """ - Vectorized two-proportion z-test. + Vectorized two-proportion z-test with +1 pseudocount on all inputs. + + Matches Clojure's stats/two-prop-test (stats.clj:18-33). + See two_prop_test() scalar version for formula details. 
Args: - p1: Series of first proportions - n1: Series of number of observations for first proportion - p2: Series of second proportions - n2: Series of number of observations for second proportion + succ_in: Series of success counts in the group + succ_out: Series of success counts outside the group + pop_in: Series of total vote counts in the group + pop_out: Series of total vote counts outside the group Returns: Series of z-scores """ - # Pooled probability - p_pooled = (p1 * n1 + p2 * n2) / (n1 + n2) + # Add +1 pseudocount to all four inputs (Clojure: map inc) + s1 = succ_in + 1 + s2 = succ_out + 1 + p1 = pop_in + 1 + p2 = pop_out + 1 - # Standard error - se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2)) + pi1 = s1 / p1 + pi2 = s2 / p2 + pi_hat = (s1 + s2) / (p1 + p2) - # Z-score calculation - z = (p1 - p2) / se + se = np.sqrt(pi_hat * (1 - pi_hat) * (1/p1 + 1/p2)) + z = (pi1 - pi2) / se - # Handle edge cases + # Handle edge cases: pop_in=0 or pop_out=0 → 0, pi_hat=1 → 0 + z = z.where((pop_in > 0) & (pop_out > 0), 0.0) z = z.fillna(0.0) z = z.replace([np.inf, -np.inf], 0.0) return z @@ -649,14 +677,16 @@ def compute_group_comment_stats_df(votes_long: pd.DataFrame, stats_df['ra'] = stats_df['ra'].replace([np.inf, -np.inf], 1.0).fillna(1.0) stats_df['rd'] = stats_df['rd'].replace([np.inf, -np.inf], 1.0).fillna(1.0) - # Compute representativeness tests (two-proportion z-test: group vs other) + # Compute representativeness tests — pass raw counts, matching Clojure's + # (stats/two-prop-test (:na in-stats) (sum :na rest-stats) + # (:ns in-stats) (sum :ns rest-stats)) (repness.clj:97-100) stats_df['rat'] = two_prop_test_vectorized( - stats_df['pa'], stats_df['ns'], - stats_df['other_pa'], stats_df['other_votes'] + stats_df['na'], stats_df['other_agree'], + stats_df['ns'], stats_df['other_votes'] ) stats_df['rdt'] = two_prop_test_vectorized( - stats_df['pd'], stats_df['ns'], - stats_df['other_pd'], stats_df['other_votes'] + stats_df['nd'], 
stats_df['other_disagree'], + stats_df['ns'], stats_df['other_votes'] ) # Compute metrics diff --git a/delphi/tests/test_discrepancy_fixes.py b/delphi/tests/test_discrepancy_fixes.py index 390d02ff4..ddcb2dc0d 100644 --- a/delphi/tests/test_discrepancy_fixes.py +++ b/delphi/tests/test_discrepancy_fixes.py @@ -793,30 +793,62 @@ def test_pat_values_match_clojure_blob(self, conv, clojure_blob, dataset_name): class TestD6TwoPropTest: """ D6: Python uses standard two-proportion z-test without pseudocounts. - Clojure adds +1 pseudocount to all 4 inputs (succ1, n1, succ2, n2). + Clojure adds +1 pseudocount to all 4 inputs (stats.clj:18-33): + (map inc [succ-in succ-out pop-in pop-out]) + pi1 = (succ-in+1)/(pop-in+1), pi2 = (succ-out+1)/(pop-out+1) + pi-hat = (succ-in+1 + succ-out+1) / (pop-in+1 + pop-out+1) """ - def test_two_prop_test_with_pseudocounts(self): - """two_prop_test should add +1 pseudocounts matching Clojure.""" - # With pseudocounts: (succ+1)/(n+2) for both groups - succ1, n1 = 10, 20 - succ2, n2 = 15, 30 - - # Clojure formula adds +1 to successes and +2 to trials - p1_clj = (succ1 + 1) / (n1 + 2) - p2_clj = (succ2 + 1) / (n2 + 2) - p_pooled_clj = (succ1 + succ2 + 2) / (n1 + n2 + 4) - se_clj = math.sqrt(p_pooled_clj * (1 - p_pooled_clj) * (1 / (n1 + 2) + 1 / (n2 + 2))) - expected = (p1_clj - p2_clj) / se_clj if se_clj > 0 else 0.0 - - # Python currently doesn't add pseudocounts - p1_py = succ1 / n1 - p2_py = succ2 / n2 - python_result = two_prop_test(p1_py, n1, p2_py, n2) + @staticmethod + def _clojure_two_prop_test(succ_in, succ_out, pop_in, pop_out): + """Reference implementation of Clojure's two-prop-test (stats.clj:18-33).""" + s1, s2, p1, p2 = succ_in + 1, succ_out + 1, pop_in + 1, pop_out + 1 + pi1 = s1 / p1 + pi2 = s2 / p2 + pi_hat = (s1 + s2) / (p1 + p2) + if pi_hat == 1: + return 0.0 + return (pi1 - pi2) / math.sqrt(pi_hat * (1 - pi_hat) * (1/p1 + 1/p2)) + + def test_two_prop_test_matches_clojure_formula(self): + """two_prop_test(succ_in, succ_out, 
pop_in, pop_out) should match Clojure.""" + # Test cases: (succ_in, succ_out, pop_in, pop_out) + test_cases = [ + (10, 15, 20, 30), # typical case + (0, 0, 10, 10), # no successes in either group + (5, 5, 10, 10), # identical groups + (10, 0, 10, 10), # all success in group, none outside + (1, 1, 1, 1), # minimal counts + (50, 20, 100, 200), # asymmetric sizes + (0, 10, 20, 30), # no success in group, some outside + ] - print(f"two_prop_test: Python={python_result:.4f}, Clojure(with pseudocounts)={expected:.4f}") - check.almost_equal(python_result, expected, abs=0.01, - msg=f"two_prop_test should include pseudocounts: Python={python_result:.4f}, expected={expected:.4f}") + for succ_in, succ_out, pop_in, pop_out in test_cases: + expected = self._clojure_two_prop_test(succ_in, succ_out, pop_in, pop_out) + result = two_prop_test(succ_in, succ_out, pop_in, pop_out) + check.almost_equal( + result, expected, abs=0.001, + msg=f"two_prop_test({succ_in},{succ_out},{pop_in},{pop_out}): " + f"got={result:.4f}, expected={expected:.4f}") + + def test_two_prop_test_edge_cases(self): + """Edge cases: n=0 should return 0, pi_hat=1 should return 0.""" + # n=0 cases + check.equal(two_prop_test(5, 5, 0, 10), 0.0) + check.equal(two_prop_test(5, 5, 10, 0), 0.0) + # Both zero + check.equal(two_prop_test(0, 0, 0, 0), 0.0) + + def test_two_prop_test_pseudocount_effect(self): + """Pseudocounts should shrink z-scores toward zero for small samples.""" + # With small n, the +1 pseudocount has a large effect + # succ_in=1, pop_in=2 vs succ_out=0, pop_out=2 → raw proportions 0.5 vs 0.0 + # With pseudocount: pi1=(1+1)/(2+1)=2/3, pi2=(0+1)/(2+1)=1/3 + result_small = two_prop_test(1, 0, 2, 2) + result_large = two_prop_test(100, 0, 200, 200) + # The large-sample z should be more extreme (less regularized) + check.greater(abs(result_large), abs(result_small), + "Large samples should produce more extreme z-scores than small ones") @pytest.mark.xfail(reason="D6/D10: two-prop test differs + no shared comments to 
compare") def test_rat_values_match_clojure_blob(self, conv, clojure_blob, dataset_name): @@ -1233,3 +1265,94 @@ def test_prop_test_matches_blob_p_test(self, clojure_blob, dataset_name): assert not mismatches, ( f"[{dataset_name}] {len(mismatches)}/{total} p-test mismatches:\n" + "\n".join(mismatches[:10])) + + +@pytest.mark.clojure_comparison +class TestD6BlobInjection: + """D6: Verify two_prop_test against real Clojure blob repness-test values. + + For each repness entry, reconstruct the two_prop_test inputs from + group-votes (group counts vs total-minus-group), compare to blob's + repness-test. + """ + + def test_two_prop_test_matches_blob_repness_test(self, clojure_blob, dataset_name): + """two_prop_test should match blob's repness-test for every repness entry.""" + repness = clojure_blob.get('repness', {}) + group_votes = clojure_blob.get('group-votes', {}) + if not repness or not group_votes: + pytest.skip(f"No repness or group-votes in blob for {dataset_name}") + + # Precompute total votes across ALL groups for each comment + all_group_votes = {} + for other_gid, other_gv_data in group_votes.items(): + for tid_str, counts in other_gv_data.get('votes', {}).items(): + if tid_str not in all_group_votes: + all_group_votes[tid_str] = {'A': 0, 'D': 0, 'S': 0} + all_group_votes[tid_str]['A'] += counts['A'] + all_group_votes[tid_str]['D'] += counts['D'] + all_group_votes[tid_str]['S'] += counts['S'] + + mismatches = [] + total = 0 + for gid, entries in repness.items(): + gv = group_votes.get(gid, {}).get('votes', {}) + for entry in entries: + tid_str = str(entry['tid']) + repful = entry['repful-for'] + expected_rt = entry['repness-test'] + + group_cv = gv.get(tid_str, {'A': 0, 'D': 0, 'S': 0}) + total_cv = all_group_votes.get(tid_str, {'A': 0, 'D': 0, 'S': 0}) + + if repful == 'agree': + succ_in = group_cv['A'] + succ_out = total_cv['A'] - group_cv['A'] + else: + succ_in = group_cv['D'] + succ_out = total_cv['D'] - group_cv['D'] + + pop_in = group_cv['S'] + pop_out 
= total_cv['S'] - group_cv['S'] + + actual = two_prop_test(succ_in, succ_out, pop_in, pop_out) + total += 1 + if abs(actual - expected_rt) > 1e-4: + mismatches.append( + f"group={gid} tid={entry['tid']} ({repful}): " + f"two_prop_test({succ_in},{succ_out},{pop_in},{pop_out})={actual:.6f}, " + f"blob repness-test={expected_rt:.6f}") + + assert not mismatches, ( + f"[{dataset_name}] {len(mismatches)}/{total} repness-test mismatches:\n" + + "\n".join(mismatches[:10])) + + +@pytest.mark.clojure_comparison +class TestD4BlobInjection: + """D4: Verify p-success (pseudocount formula) against blob values.""" + + def test_p_success_matches_blob(self, clojure_blob, dataset_name): + """(n_success + 1) / (n_trials + 2) should match blob's p-success.""" + repness = clojure_blob.get('repness', {}) + if not repness: + pytest.skip(f"No repness in blob for {dataset_name}") + + mismatches = [] + total = 0 + for gid, entries in repness.items(): + for entry in entries: + ns = entry['n-success'] + nt = entry['n-trials'] + expected = entry['p-success'] + actual = (ns + PSEUDO_COUNT / 2) / (nt + PSEUDO_COUNT) + total += 1 + if abs(actual - expected) > 1e-4: + mismatches.append( + f"group={gid} tid={entry['tid']}: " + f"pa=({ns}+1)/({nt}+2)={actual:.6f}, " + f"blob p-success={expected:.6f}") + + assert not mismatches, ( + f"[{dataset_name}] {len(mismatches)}/{total} p-success mismatches:\n" + + "\n".join(mismatches[:10])) diff --git a/delphi/tests/test_old_format_repness.py b/delphi/tests/test_old_format_repness.py index 113afc700..c3930b745 100644 --- a/delphi/tests/test_old_format_repness.py +++ b/delphi/tests/test_old_format_repness.py @@ -56,14 +56,17 @@ def test_prop_test(self): assert prop_test(0, 0) == 0.0 def test_two_prop_test(self): - """Test two-proportion z-test.""" - # Test cases - assert np.isclose(two_prop_test(0.7, 100, 0.5, 100), 2.9, atol=0.1) - assert np.isclose(two_prop_test(0.2, 50, 0.3, 50), -1.2, atol=0.1) - - # Edge cases - assert two_prop_test(0.5, 0, 0.5, 100) == 
0.0 - assert two_prop_test(0.5, 100, 0.5, 0) == 0.0 + """Test two-proportion z-test with +1 pseudocounts (Clojure parity).""" + # two_prop_test(succ_in, succ_out, pop_in, pop_out) — raw counts + # After +1: pi1=71/101≈0.703, pi2=51/101≈0.505, z≈2.88 + assert np.isclose(two_prop_test(70, 50, 100, 100), 2.88, atol=0.1) + + # Equal proportions → z ≈ 0 + assert np.isclose(two_prop_test(25, 25, 50, 50), 0.0, atol=0.1) + + # Edge cases: pop=0 → 0 + assert two_prop_test(5, 5, 0, 100) == 0.0 + assert two_prop_test(5, 5, 100, 0) == 0.0 class TestCommentStats: diff --git a/delphi/tests/test_repness_unit.py b/delphi/tests/test_repness_unit.py index 3efb7672a..3edda9f82 100644 --- a/delphi/tests/test_repness_unit.py +++ b/delphi/tests/test_repness_unit.py @@ -59,14 +59,20 @@ def test_prop_test(self): 2 * math.sqrt(2) * 0.5, atol=0.01) def test_two_prop_test(self): - """Test two-proportion z-test.""" - # Test cases - assert np.isclose(two_prop_test(0.7, 100, 0.5, 100), 2.9, atol=0.1) - assert np.isclose(two_prop_test(0.2, 50, 0.3, 50), -1.2, atol=0.1) - - # Edge cases - assert two_prop_test(0.5, 0, 0.5, 100) == 0.0 - assert two_prop_test(0.5, 100, 0.5, 0) == 0.0 + """Test two-proportion z-test with +1 pseudocounts (Clojure parity).""" + # two_prop_test(succ_in, succ_out, pop_in, pop_out) — raw counts + # Clojure adds +1 to all 4 inputs (stats.clj:20) + + # succ_in=70, succ_out=50, pop_in=100, pop_out=100 + # After +1: pi1=71/101≈0.703, pi2=51/101≈0.505, z≈2.88 + assert np.isclose(two_prop_test(70, 50, 100, 100), 2.88, atol=0.1) + + # Equal proportions → z ≈ 0 + assert np.isclose(two_prop_test(25, 25, 50, 50), 0.0, atol=0.1) + + # Edge cases: pop=0 → 0 + assert two_prop_test(5, 5, 0, 100) == 0.0 + assert two_prop_test(5, 5, 100, 0) == 0.0 class TestCommentStats: @@ -576,29 +582,30 @@ def test_prop_test_vectorized_edge_cases(self): assert not np.isnan(result.iloc[1]) # normal case def test_two_prop_test_vectorized(self): - """Test vectorized two-proportion z-test.""" - p1 = 
pd.Series([0.7, 0.2]) - n1 = pd.Series([100, 50]) - p2 = pd.Series([0.5, 0.3]) - n2 = pd.Series([100, 50]) + """Test vectorized two-proportion z-test with +1 pseudocounts.""" + # Now takes raw counts: (succ_in, succ_out, pop_in, pop_out) + succ_in = pd.Series([70, 10]) + succ_out = pd.Series([50, 15]) + pop_in = pd.Series([100, 50]) + pop_out = pd.Series([100, 50]) - result = two_prop_test_vectorized(p1, n1, p2, n2) + result = two_prop_test_vectorized(succ_in, succ_out, pop_in, pop_out) # Compare with scalar version - assert np.isclose(result.iloc[0], two_prop_test(0.7, 100, 0.5, 100), atol=0.01) - assert np.isclose(result.iloc[1], two_prop_test(0.2, 50, 0.3, 50), atol=0.01) + assert np.isclose(result.iloc[0], two_prop_test(70, 50, 100, 100), atol=0.01) + assert np.isclose(result.iloc[1], two_prop_test(10, 15, 50, 50), atol=0.01) def test_two_prop_test_vectorized_edge_cases(self): """Test vectorized two-prop test handles edge cases.""" - p1 = pd.Series([0.5, 0.7]) - n1 = pd.Series([0, 100]) # n1=0 should return 0 - p2 = pd.Series([0.5, 0.5]) - n2 = pd.Series([100, 0]) # n2=0 should return 0 + succ_in = pd.Series([5, 70]) + pop_in = pd.Series([0, 100]) # pop_in=0 should return 0 + succ_out = pd.Series([5, 50]) + pop_out = pd.Series([100, 0]) # pop_out=0 should return 0 - result = two_prop_test_vectorized(p1, n1, p2, n2) + result = two_prop_test_vectorized(succ_in, succ_out, pop_in, pop_out) - assert result.iloc[0] == 0.0 # n1=0 case - assert result.iloc[1] == 0.0 # n2=0 case + assert result.iloc[0] == 0.0 # pop_in=0 case + assert result.iloc[1] == 0.0 # pop_out=0 case def test_compute_group_comment_stats_df(self): """Test vectorized computation of group/comment statistics."""