diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 88fbad1..18fd2a3 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -23,7 +23,6 @@ jobs: uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - cache: 'pip' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/tableone.ipynb b/tableone.ipynb index b13eee6..3cc44f0 100644 --- a/tableone.ipynb +++ b/tableone.ipynb @@ -335,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "colab": {}, "colab_type": "code", @@ -347,12 +347,12 @@ "# Test for normality, multimodality (Hartigan's Dip Test), and far outliers (Tukey's test)\n", "\n", "# for versions >= 0.7.9\n", - "table1 = TableOne(data, dip_test=True, normal_test=True, tukey_test=True)" + "table1 = TableOne(data, dip_test=True, normal_test=True, tukey_test=True, show_histograms=True)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -388,6 +388,7 @@ " \n", " Missing\n", " Overall\n", + " Histogram\n", " \n", " \n", " \n", @@ -396,79 +397,93 @@ " \n", " \n", " 1000\n", + " \n", " \n", " \n", " Age, mean (SD)\n", " \n", " 0\n", " 65.0 (17.2)\n", + " ▂▂▃▄▆▇█▇\n", " \n", " \n", " SysABP, mean (SD)\n", " \n", " 291\n", " 114.3 (40.2)\n", + " ▂▁▁▃█▆▃▁\n", " \n", " \n", " Height, mean (SD)\n", " \n", " 475\n", " 170.1 (22.1)\n", + " ▁▃▅▇▇█▂▁\n", " \n", " \n", " Weight, mean (SD)\n", " \n", " 302\n", " 82.9 (23.8)\n", + " ▃▆█▆▄▃▁▁\n", " \n", " \n", " ICU, n (%)\n", " CCU\n", - " 0\n", + " \n", " 162 (16.2)\n", + " \n", " \n", " \n", " CSRU\n", " \n", " 202 (20.2)\n", + " \n", " \n", " \n", " MICU\n", " \n", " 380 (38.0)\n", + " \n", " \n", " \n", " SICU\n", " \n", " 256 (25.6)\n", + " \n", " \n", " \n", " MechVent, n (%)\n", " 0\n", - " 0\n", + " \n", " 540 (54.0)\n", + " \n", " \n", " \n", " 1\n", " \n", " 460 (46.0)\n", + " \n", " \n", " \n", " LOS, mean (SD)\n", " \n", " 0\n", " 14.2 (14.2)\n", + " █▆▃▁▁▁▁▁\n", " \n", " \n", " death, n (%)\n", " 0\n", - " 0\n", + " \n", " 864 (86.4)\n", + " \n", " \n", " \n", " 1\n", " \n", " 136 (13.6)\n", + " \n", " \n", " \n", "\n", @@ -478,21 +493,21 @@ " in: Height, LOS.
" ], "text/plain": [ - " Missing Overall\n", - "n 1000\n", - "Age, mean (SD) 0 65.0 (17.2)\n", - "SysABP, mean (SD) 291 114.3 (40.2)\n", - "Height, mean (SD) 475 170.1 (22.1)\n", - "Weight, mean (SD) 302 82.9 (23.8)\n", - "ICU, n (%) CCU 0 162 (16.2)\n", - " CSRU 202 (20.2)\n", - " MICU 380 (38.0)\n", - " SICU 256 (25.6)\n", - "MechVent, n (%) 0 0 540 (54.0)\n", - " 1 460 (46.0)\n", - "LOS, mean (SD) 0 14.2 (14.2)\n", - "death, n (%) 0 0 864 (86.4)\n", - " 1 136 (13.6)\n", + " Missing Overall Histogram\n", + "n 1000 \n", + "Age, mean (SD) 0 65.0 (17.2) ▂▂▃▄▆▇█▇\n", + "SysABP, mean (SD) 291 114.3 (40.2) ▂▁▁▃█▆▃▁\n", + "Height, mean (SD) 475 170.1 (22.1) ▁▃▅▇▇█▂▁\n", + "Weight, mean (SD) 302 82.9 (23.8) ▃▆█▆▄▃▁▁\n", + "ICU, n (%) CCU 162 (16.2) \n", + " CSRU 202 (20.2) \n", + " MICU 380 (38.0) \n", + " SICU 256 (25.6) \n", + "MechVent, n (%) 0 540 (54.0) \n", + " 1 460 (46.0) \n", + "LOS, mean (SD) 0 14.2 (14.2) █▆▃▁▁▁▁▁\n", + "death, n (%) 0 864 (86.4) \n", + " 1 136 (13.6) \n", "[1] Hartigan's Dip Test reports possible\n", " multimodal distributions for: Age, SysABP, Height, LOS.\n", "[2] Normality test reports non-normal\n", @@ -501,7 +516,7 @@ " in: Height, LOS." ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -513,7 +528,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -530,7 +545,7 @@ "pandas.core.frame.DataFrame" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -570,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -587,7 +602,7 @@ "(-30.0, 250.0)" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, @@ -622,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -673,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -761,7 +776,7 @@ " \n", " ICU, n (%)\n", " CCU\n", - " 0\n", + " \n", " 162 (16.2)\n", " 137 (15.9)\n", " 25 (18.4)\n", @@ -802,7 +817,7 @@ "SysABP, mean (SD) 291 114.3 (40.2) 115.4 (38.3) 107.6 (49.4)\n", "Height, mean [min,max] 475 170.1 [13.0,406.4] 170.3 [13.0,406.4] 168.5 [144.8,188.0]\n", "Weight, mean (SD) 302 82.9 (23.8) 83.0 (23.6) 82.3 (25.4)\n", - "ICU, n (%) CCU 0 162 (16.2) 137 (15.9) 25 (18.4)\n", + "ICU, n (%) CCU 162 (16.2) 137 (15.9) 25 (18.4)\n", " CSRU 202 (20.2) 194 (22.5) 8 (5.9)\n", " MICU 380 (38.0) 318 (36.8) 62 (45.6)\n", " SICU 256 (25.6) 215 (24.9) 41 (30.1)\n", @@ -814,7 +829,7 @@ " in: Height, SysABP." ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -895,7 +910,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": { "colab": {}, "colab_type": "code", @@ -910,7 +925,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -952,9 +967,9 @@ " Overall\n", " 0\n", " 1\n", + " SMD (0,1)\n", " P-Value\n", " Test\n", - " SMD (0,1)\n", " \n", " \n", " \n", @@ -976,9 +991,9 @@ " 68.0 [53.0,79.0]\n", " 66.0 [52.8,78.0]\n", " 75.0 [62.0,83.0]\n", + " 0.487\n", " <0.001\n", " Kruskal-Wallis\n", - " 0.487\n", " \n", " \n", " SysABP, mean (SD)\n", @@ -987,9 +1002,9 @@ " 114.3 (40.2)\n", " 115.4 (38.3)\n", " 107.6 (49.4)\n", - " 0.134\n", - " Two Sample T-test\n", " -0.176\n", + " 0.134\n", + " Welch’s T-test\n", " \n", " \n", " Height, mean (SD)\n", @@ -998,9 +1013,9 @@ " 170.1 (22.1)\n", " 170.3 (23.2)\n", " 168.5 (11.3)\n", - " 0.304\n", - " Two Sample T-test\n", " -0.099\n", + " 0.304\n", + " Welch’s T-test\n", " \n", " \n", " Weight, mean (SD)\n", @@ -1009,20 +1024,20 @@ " 82.9 (23.8)\n", " 83.0 (23.6)\n", " 82.3 (25.4)\n", - " 0.782\n", - " Two Sample T-test\n", " -0.031\n", + " 0.782\n", + " Welch’s T-test\n", " \n", " \n", " ICU, n (%)\n", " CCU\n", - " 0\n", + " \n", " 162 (16.2)\n", " 137 (15.9)\n", " 25 (18.4)\n", + " 0.490\n", " <0.001\n", " Chi-squared\n", - " 0.490\n", " \n", " \n", " CSRU\n", @@ -1062,17 +1077,17 @@ " in: Height, SysABP.
" ], "text/plain": [ - " Grouped by death \n", - " Missing Overall 0 1 P-Value Test SMD (0,1)\n", - "n 1000 864 136 \n", - "Age, median [Q1,Q3] 0 68.0 [53.0,79.0] 66.0 [52.8,78.0] 75.0 [62.0,83.0] <0.001 Kruskal-Wallis 0.487\n", - "SysABP, mean (SD) 291 114.3 (40.2) 115.4 (38.3) 107.6 (49.4) 0.134 Two Sample T-test -0.176\n", - "Height, mean (SD) 475 170.1 (22.1) 170.3 (23.2) 168.5 (11.3) 0.304 Two Sample T-test -0.099\n", - "Weight, mean (SD) 302 82.9 (23.8) 83.0 (23.6) 82.3 (25.4) 0.782 Two Sample T-test -0.031\n", - "ICU, n (%) CCU 0 162 (16.2) 137 (15.9) 25 (18.4) <0.001 Chi-squared 0.490\n", - " CSRU 202 (20.2) 194 (22.5) 8 (5.9) \n", - " MICU 380 (38.0) 318 (36.8) 62 (45.6) \n", - " SICU 256 (25.6) 215 (24.9) 41 (30.1) \n", + " Grouped by death \n", + " Missing Overall 0 1 SMD (0,1) P-Value Test\n", + "n 1000 864 136 \n", + "Age, median [Q1,Q3] 0 68.0 [53.0,79.0] 66.0 [52.8,78.0] 75.0 [62.0,83.0] 0.487 <0.001 Kruskal-Wallis\n", + "SysABP, mean (SD) 291 114.3 (40.2) 115.4 (38.3) 107.6 (49.4) -0.176 0.134 Welch’s T-test\n", + "Height, mean (SD) 475 170.1 (22.1) 170.3 (23.2) 168.5 (11.3) -0.099 0.304 Welch’s T-test\n", + "Weight, mean (SD) 302 82.9 (23.8) 83.0 (23.6) 82.3 (25.4) -0.031 0.782 Welch’s T-test\n", + "ICU, n (%) CCU 162 (16.2) 137 (15.9) 25 (18.4) 0.490 <0.001 Chi-squared\n", + " CSRU 202 (20.2) 194 (22.5) 8 (5.9) \n", + " MICU 380 (38.0) 318 (36.8) 62 (45.6) \n", + " SICU 256 (25.6) 215 (24.9) 41 (30.1) \n", "[1] Hartigan's Dip Test reports possible\n", " multimodal distributions for: Age, Height, SysABP.\n", "[2] Normality test reports non-normal\n", @@ -1081,7 +1096,7 @@ " in: Height, SysABP." ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1114,7 +1129,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1124,7 +1139,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -1157,7 +1172,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1176,7 +1191,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1253,7 +1268,7 @@ " 170.3 (23.2)\n", " 168.5 (11.3)\n", " 0.304\n", - " Two Sample T-test\n", + " Welch’s T-test\n", " \n", " \n", " Weight, mean (SD)\n", @@ -1263,12 +1278,12 @@ " 83.0 (23.6)\n", " 82.3 (25.4)\n", " 0.782\n", - " Two Sample T-test\n", + " Welch’s T-test\n", " \n", " \n", " ICU, n (%)\n", " CCU\n", - " 0\n", + " \n", " 162 (16.2)\n", " 137 (15.9)\n", " 25 (18.4)\n", @@ -1305,7 +1320,7 @@ " \n", " MechVent, n (%)\n", " 0\n", - " 0\n", + " \n", " 540 (54.0)\n", " 468 (54.2)\n", " 72 (52.9)\n", @@ -1329,30 +1344,30 @@ " 14.0 (13.5)\n", " 15.4 (17.7)\n", " 0.386\n", - " Two Sample T-test\n", + " Welch’s T-test\n", " \n", " \n", "\n", "
" ], "text/plain": [ - " Grouped by death \n", - " Missing Overall 0 1 P-Value Test\n", - "n 1000 864 136 \n", - "Age, mean (SD) 0 65.0 (17.2) 64.0 (17.4) 71.7 (14.0) <0.001 Custom test 1\n", - "SysABP, mean (SD) 291 114.3 (40.2) 115.4 (38.3) 107.6 (49.4) 0.012 Custom test 2\n", - "Height, mean (SD) 475 170.1 (22.1) 170.3 (23.2) 168.5 (11.3) 0.304 Two Sample T-test\n", - "Weight, mean (SD) 302 82.9 (23.8) 83.0 (23.6) 82.3 (25.4) 0.782 Two Sample T-test\n", - "ICU, n (%) CCU 0 162 (16.2) 137 (15.9) 25 (18.4) <0.001 Chi-squared\n", - " CSRU 202 (20.2) 194 (22.5) 8 (5.9) \n", - " MICU 380 (38.0) 318 (36.8) 62 (45.6) \n", - " SICU 256 (25.6) 215 (24.9) 41 (30.1) \n", - "MechVent, n (%) 0 0 540 (54.0) 468 (54.2) 72 (52.9) 0.862 Chi-squared\n", - " 1 460 (46.0) 396 (45.8) 64 (47.1) \n", - "LOS, mean (SD) 0 14.2 (14.2) 14.0 (13.5) 15.4 (17.7) 0.386 Two Sample T-test" + " Grouped by death \n", + " Missing Overall 0 1 P-Value Test\n", + "n 1000 864 136 \n", + "Age, mean (SD) 0 65.0 (17.2) 64.0 (17.4) 71.7 (14.0) <0.001 Custom test 1\n", + "SysABP, mean (SD) 291 114.3 (40.2) 115.4 (38.3) 107.6 (49.4) 0.012 Custom test 2\n", + "Height, mean (SD) 475 170.1 (22.1) 170.3 (23.2) 168.5 (11.3) 0.304 Welch’s T-test\n", + "Weight, mean (SD) 302 82.9 (23.8) 83.0 (23.6) 82.3 (25.4) 0.782 Welch’s T-test\n", + "ICU, n (%) CCU 162 (16.2) 137 (15.9) 25 (18.4) <0.001 Chi-squared\n", + " CSRU 202 (20.2) 194 (22.5) 8 (5.9) \n", + " MICU 380 (38.0) 318 (36.8) 62 (45.6) \n", + " SICU 256 (25.6) 215 (24.9) 41 (30.1) \n", + "MechVent, n (%) 0 540 (54.0) 468 (54.2) 72 (52.9) 0.862 Chi-squared\n", + " 1 460 (46.0) 396 (45.8) 64 (47.1) \n", + "LOS, mean (SD) 0 14.2 (14.2) 14.0 (13.5) 15.4 (17.7) 0.386 Welch’s T-test" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1392,7 +1407,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -1402,7 +1417,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -1412,7 +1427,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": { "scrolled": true }, @@ -1430,11 +1445,11 @@ " SysABP, mean (SD) & & 291 & 114.3 (40.2) & 115.4 (38.3) & 107.6 (49.4) \\\\\n", " Height, mean (SD) & & 475 & 170.1 (22.1) & 170.3 (23.2) & 168.5 (11.3) \\\\\n", " Weight, mean (SD) & & 302 & 82.9 (23.8) & 83.0 (23.6) & 82.3 (25.4) \\\\\n", - " ICU, n (\\%) & CCU & 0 & 162 (16.2) & 137 (15.9) & 25 (18.4) \\\\\n", + " ICU, n (\\%) & CCU & & 162 (16.2) & 137 (15.9) & 25 (18.4) \\\\\n", " & CSRU & & 202 (20.2) & 194 (22.5) & 8 (5.9) \\\\\n", " & MICU & & 380 (38.0) & 318 (36.8) & 62 (45.6) \\\\\n", " & SICU & & 256 (25.6) & 215 (24.9) & 41 (30.1) \\\\\n", - " MechVent, n (\\%) & 0 & 0 & 540 (54.0) & 468 (54.2) & 72 (52.9) \\\\\n", + " MechVent, n (\\%) & 0 & & 540 (54.0) & 468 (54.2) & 72 (52.9) \\\\\n", " & 1 & & 460 (46.0) & 396 (45.8) & 64 (47.1) \\\\\n", " LOS, mean (SD) & & 0 & 14.2 (14.2) & 14.0 (13.5) & 15.4 (17.7) \\\\\n", "\\hline\n", @@ -1448,7 +1463,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1462,11 +1477,11 @@ "| SysABP, mean (SD) | | 291 | 114.3 (40.2) | 115.4 (38.3) | 107.6 (49.4) |\n", "| Height, mean (SD) | | 475 | 170.1 (22.1) | 170.3 (23.2) | 168.5 (11.3) |\n", "| Weight, mean (SD) | | 302 | 82.9 (23.8) | 83.0 (23.6) | 82.3 (25.4) |\n", - "| ICU, n (%) | CCU | 0 | 162 (16.2) | 137 (15.9) | 25 (18.4) |\n", + "| ICU, n (%) | CCU | | 162 (16.2) | 137 (15.9) | 25 (18.4) |\n", "| | CSRU | | 202 (20.2) | 194 (22.5) | 8 (5.9) |\n", "| | MICU | | 380 (38.0) | 318 (36.8) | 62 (45.6) |\n", "| | SICU | | 256 (25.6) | 215 (24.9) | 41 (30.1) |\n", - "| MechVent, n (%) | 0 | 0 | 540 (54.0) | 468 (54.2) | 72 (52.9) |\n", + "| MechVent, n (%) | 0 | | 540 (54.0) | 468 (54.2) | 72 (52.9) |\n", "| | 1 | | 460 (46.0) | 396 (45.8) | 64 (47.1) |\n", "| LOS, mean (SD) | | 0 | 14.2 (14.2) | 14.0 (13.5) | 15.4 (17.7) |\n" ] @@ -1487,7 +1502,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1526,7 +1541,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.19" + "version": "3.9.21" } }, "nbformat": 4, diff --git a/tableone/formatting.py b/tableone/formatting.py index 4ca9c49..2933822 100644 --- a/tableone/formatting.py +++ b/tableone/formatting.py @@ -291,3 +291,40 @@ def reorder_columns(table, optional_columns, groupby, order, overall): table = table.reindex(cols, axis=1) return table + + +def generate_histograms(values, bins=8, clip=(1, 99)): + """ + Generate a mini histogram using unicode blocks. + + Parameters + ---------- + values : np.ndarray + Numeric values. + bins : int + Number of bins for the histogram. + clip : tuple of (int, int) or None, optional + If specified, clip values to the given (lower_percentile, upper_percentile). + For example, clip=(1, 99) clips to 1st and 99th percentiles. + If None, no clipping is applied. + + Returns + ------- + str + Unicode sparkline. + """ + if len(values) == 0: + return '' + + if clip is not None: + lower, upper = np.percentile(values, clip) + values = np.clip(values, lower, upper) + + hist, _ = np.histogram(values, bins=bins) + if hist.max() == 0: + return '' + + blocks = '▁▂▃▄▅▆▇█' + hist_normalized = np.floor((hist / hist.max()) * (len(blocks) - 1)).astype(int) + + return ''.join(blocks[i] for i in hist_normalized) diff --git a/tableone/tableone.py b/tableone/tableone.py index 59a0c27..45ca950 100644 --- a/tableone/tableone.py +++ b/tableone/tableone.py @@ -8,12 +8,13 @@ import numpy as np import pandas as pd from tabulate import tabulate +from typing import Tuple from tableone.deprecations import handle_deprecated_parameters from tableone.formatting import (docstring_copier, set_display_options, format_pvalues, format_smd_columns, apply_limits, sort_and_reindex, apply_order, mask_duplicate_values, create_row_labels, - reorder_columns) + reorder_columns, generate_histograms) from tableone.preprocessors import (ensure_list, detect_categorical, order_categorical, get_groups, handle_categorical_nulls) from tableone.statistics import Statistics @@ -171,6 +172,14 @@ class TableOne: include_null : bool, default: True Include None/Null values for categorical variables by treating them as a category level. + show_histograms : bool, default=False + Whether to include mini-histograms for continuous variables. + clip_histograms : tuple or None, default (1, 99) + If show_histograms=True, specify a (lower_percentile, upper_percentile) range to clip the + data before generating histograms. This reduces the influence of extreme outliers. + For example, (1, 99) clips to the 1st and 99th percentiles. + Set to None to disable clipping and use the full range of values. + Attributes @@ -210,23 +219,36 @@ def __init__(self, data: pd.DataFrame, continuous: Optional[list] = None, groupby: Optional[str] = None, nonnormal: Optional[list] = None, - min_max: Optional[list] = None, pval: Optional[bool] = False, - pval_adjust: Optional[str] = None, htest_name: bool = False, - pval_test_name: bool = False, htest: Optional[dict] = None, + min_max: Optional[list] = None, + pval: Optional[bool] = False, + pval_adjust: Optional[str] = None, + htest_name: bool = False, + pval_test_name: bool = False, + htest: Optional[dict] = None, isnull: Optional[bool] = None, missing: bool = True, - ddof: int = 1, labels: Optional[dict] = None, - rename: Optional[dict] = None, sort: Union[bool, str] = False, + show_histograms: bool = False, + clip_histograms: Optional[Tuple[int, int]] = (1, 99), + ddof: int = 1, + labels: Optional[dict] = None, + rename: Optional[dict] = None, + sort: Union[bool, str] = False, limit: Union[int, dict, None] = None, - order: Optional[dict] = None, remarks: bool = False, - label_suffix: bool = True, decimals: Union[int, dict] = 1, - smd: bool = False, overall: bool = True, - row_percent: bool = False, display_all: bool = False, - dip_test: bool = False, normal_test: bool = False, + order: Optional[dict] = None, + remarks: bool = False, + label_suffix: bool = True, + decimals: Union[int, dict] = 1, + smd: bool = False, + overall: bool = True, + row_percent: bool = False, + display_all: bool = False, + dip_test: bool = False, + normal_test: bool = False, tukey_test: bool = False, pval_threshold: Optional[float] = None, include_null: Optional[bool] = True, pval_digits: int = 3, - ttest_equal_var: bool = False) -> None: + ttest_equal_var: bool = False, + ) -> None: # Warn about deprecated parameters handle_deprecated_parameters(labels, isnull, pval_test_name, remarks) @@ -241,7 +263,8 @@ def __init__(self, data: pd.DataFrame, htest, missing, ddof, rename, sort, limit, order, label_suffix, decimals, smd, overall, row_percent, dip_test, normal_test, tukey_test, pval_threshold, - include_null, pval_digits, ttest_equal_var) + include_null, pval_digits, ttest_equal_var, + show_histograms, clip_histograms) # Initialize intermediate tables self.initialize_intermediate_tables() @@ -283,12 +306,14 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro htest, missing, ddof, rename, sort, limit, order, label_suffix, decimals, smd, overall, row_percent, dip_test, normal_test, tukey_test, pval_threshold, - include_null, pval_digits, ttest_equal_var): + include_null, pval_digits, ttest_equal_var, + show_histograms, clip_histograms): """ Initialize attributes. """ self._alt_labels = rename self._include_null = include_null + self._clip_histograms = clip_histograms self._columns = columns if columns else data.columns.to_list() # type: ignore self._categorical = detect_categorical(data[self._columns], groupby) if categorical is None else categorical if continuous: @@ -316,6 +341,7 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro self._pval_digits = pval_digits self._reserved_columns = ['Missing', 'P-Value', 'Test', 'P-Value (adjusted)', 'SMD', 'Overall'] self._row_percent = row_percent + self._show_histograms = show_histograms self._smd = smd self._sort = sort self._tukey_test = tukey_test @@ -659,10 +685,59 @@ def _create_tableone(self, data): table = self._combine_tables() optional_columns = ['P-Value', 'P-Value (adjusted)', 'Test'] + if self._show_histograms: + hist_cols = [ + f"{lvl} Histogram" for lvl in self._groupbylvls + ] if self._groupby else ["Histogram"] + hist_cols.append("Overall Histogram") if self._groupby else None + optional_columns += hist_cols + # ensure column headers are strings before reindexing table = table.reset_index().set_index(['variable', 'value']) # type: ignore table.columns = table.columns.values.astype(str) + # Add histograms + if self._show_histograms: + histogram_cols = {} + for v in self._columns: + if v in self._continuous: + histograms = [] + if self._groupby: + for lvl in self._groupbylvls: + lvl_values = data.loc[data[self._groupby] == lvl, v].dropna().values + histograms.append(generate_histograms(lvl_values, clip=self._clip_histograms)) + overall_values = data[v].dropna().values + histograms.append(generate_histograms(overall_values, clip=self._clip_histograms)) + else: + overall_values = data[v].dropna().values + histograms.append(generate_histograms(overall_values, clip=self._clip_histograms)) + histogram_cols[v] = histograms + + if histogram_cols: + new_cols = [] + if self._groupby: + new_cols = [f"{lvl} Histogram" for lvl in self._groupbylvls] + ['Overall Histogram'] + else: + new_cols = ['Histogram'] + + var_names = table.index.get_level_values(0) + var_values = table.index.get_level_values(1) + + for idx, colname in enumerate(new_cols): + histograms = [] + for var, val in zip(var_names, var_values): + # Only add histogram to the main summary row (val == '') + if val == '': + hists = histogram_cols.get(var, []) + if idx < len(hists): + histograms.append(hists[idx]) + else: + histograms.append('') + else: + # No histogram for sub-rows + histograms.append('') + table[colname] = histograms + table = sort_and_reindex(table, self._smd, self.smd_table, self._sort, self._columns) table = format_pvalues(table, self._pval, self._pval_adjust, self._pval_threshold, self._pval_digits) table = format_smd_columns(table, self._smd, self.smd_table) diff --git a/tests/test_histograms.py b/tests/test_histograms.py new file mode 100644 index 0000000..fe9d9ea --- /dev/null +++ b/tests/test_histograms.py @@ -0,0 +1,71 @@ +import numpy as np +import pandas as pd + +from tableone import TableOne +from tableone.formatting import generate_histograms + + +def test_generate_histograms_simple(): + # Simple case: clean data + x = np.linspace(0, 10, 100) + hist = generate_histograms(x) + assert isinstance(hist, str) + assert all(c in '▁▂▃▄▅▆▇█' for c in hist) + + +def test_generate_histograms_empty_array(): + # Edge case: empty array + x = np.array([]) + hist = generate_histograms(x) + assert isinstance(hist, str) + assert hist == '' + + +def test_clip_histogram_behavior(): + + # Create toy data: mostly normal values, plus strong outliers + rng = np.random.default_rng(seed=42) + normal_data = rng.normal(loc=50, scale=5, size=95) + outliers = np.array([300, 400, 500, 600, 1000]) # Big outliers + all_data = np.concatenate([normal_data, outliers]) + + df = pd.DataFrame({ + 'group': ['A'] * 50 + ['B'] * 50, + 'value': all_data + }) + + # No clipping + t1_noclip = TableOne(df, columns=['value'], groupby='group', continuous=['value'], + show_histograms=True, clip_histograms=None) + + # With clipping + t1_clip = TableOne(df, columns=['value'], groupby='group', continuous=['value'], + show_histograms=True, clip_histograms=(5, 95)) + + # Find the index for the summary row + main_row_idx = None + for idx in t1_noclip.tableone.index: + if idx[0].startswith('value') and idx[1] == '': + main_row_idx = idx + break + + assert main_row_idx is not None, "Could not find main summary row for 'value'." + + # Extract histograms + no_clip_hist = t1_noclip.tableone.loc[main_row_idx, ('Grouped by group', 'Overall Histogram')] + clip_hist = t1_clip.tableone.loc[main_row_idx, ('Grouped by group', 'Overall Histogram')] + + # They should be different + assert no_clip_hist != clip_hist + + # Histograms should not be empty + assert isinstance(no_clip_hist, str) and len(no_clip_hist) > 0 + assert isinstance(clip_hist, str) and len(clip_hist) > 0 + + +def test_histogram_unicode_characters_only(): + # Check that only expected unicode block characters are used + data = np.random.randn(100) + hist = generate_histograms(data) + block_chars = set('▁▂▃▄▅▆▇█') + assert set(hist).issubset(block_chars)