diff --git a/tableone/statistics.py b/tableone/statistics.py index 29389a2..fddadf3 100644 --- a/tableone/statistics.py +++ b/tableone/statistics.py @@ -114,8 +114,8 @@ def _p_test(self, v: str, is_categorical: bool, is_normal: bool, min_observed: int, - catlevels: list, - h_test: dict): + h_test: dict, + ttest_equal_var: bool): """ Compute P-Values. @@ -133,8 +133,6 @@ def _p_test(self, v: str, True if the variable is normally distributed. min_observed : int Minimum number of values across groups for the variable. - catlevels : list - Sorted list of levels for categorical variables. Returns ---------- @@ -165,9 +163,9 @@ def _p_test(self, v: str, # continuous if (is_continuous and is_normal and len(grouped_data) == 2 and min_observed >= 2): - ptest = 'Two Sample T-test' + ptest = 'Welch’s T-test' if not ttest_equal_var else 'Pooled T-test' test_stat, pval = stats.ttest_ind(*grouped_data.values(), - equal_var=False, + equal_var=ttest_equal_var, nan_policy="omit") elif is_continuous and is_normal: # normally distributed diff --git a/tableone/tableone.py b/tableone/tableone.py index d29923a..59a0c27 100644 --- a/tableone/tableone.py +++ b/tableone/tableone.py @@ -76,15 +76,15 @@ class TableOne: If the argument is set to None (or omitted), we attempt to detect continuous variables. Set to an empty list to indicate explicitly that there are no variables of this type to be included. - groupby : str, optional - Optional column for stratifying the final table (default: None). - nonnormal : list, optional - List of columns that contain non-normal variables (default: None). + groupby : str, default: None + Optional column for stratifying the final table. + nonnormal : list, default: None + List of columns that contain non-normal variables. min_max: list, optional List of variables that should report minimum and maximum, instead of standard deviation (for normal) or Q1-Q3 (for non-normal). - pval : bool, optional - Display computed P-Values (default: False). 
+ pval : bool, default: False + Display computed P-Values. pval_adjust : str, optional Method used to adjust P-Values for multiple testing. The P-values from the unadjusted table (default when pval=True) @@ -109,17 +109,21 @@ class TableOne: Threshold below which p-values are marked with an asterisk (*). For example, if set to 0.05, all p-values less than 0.05 will be displayed with a trailing asterisk (e.g., '0.012*'). - htest_name : bool, optional - Display a column with the names of hypothesis tests (default: False). + htest_name : bool, default: False + Display a column with the names of hypothesis tests. htest : dict, optional Dictionary of custom hypothesis tests. Keys are variable names and values are functions. Functions must take a list of Numpy Arrays as the input argument and must return a test result. e.g. htest = {'age': myfunc} - missing : bool, optional - Display a count of null values (default: True). - ddof : int, optional - Degrees of freedom for standard deviation calculations (default: 1). + ttest_equal_var : bool, default: False + Whether to assume equal population variances when performing two-sample + t-tests. Set to False (default) to use Welch’s t-test, which is more robust + to unequal variances. + missing : bool, default: True + Display a count of null values. + ddof : int, default: 1 + Degrees of freedom for standard deviation calculations. rename : dict, optional Dictionary of alternative names for variables. e.g. `rename = {'sex':'gender', 'trt':'treatment'}` @@ -135,9 +139,9 @@ class TableOne: order : dict, optional Specify an order for categorical variables. Key is the variable, value is a list of values in order. {e.g. 'sex': ['f', 'm', 'other']} - label_suffix : bool, optional + label_suffix : bool, default: True Append summary type (e.g. "mean (SD); median [Q1,Q3], n (%); ") to the - row label (default: True). + row label. decimals : int or dict, optional Number of decimal places to display. 
An integer applies the rule to all variables (default: 1). A dictionary (e.g. `decimals = {'age': 0)`) @@ -145,32 +149,28 @@ class TableOne: variables. For continuous variables, applies to all summary statistics (e.g. mean and standard deviation). For categorical variables, applies to percentage only. - overall : bool, optional + overall : bool, default: True If True, add an "overall" column to the table. Smd and p-value calculations are performed only using stratified columns. row_percent : bool, optional If True, compute "n (%)" percentages for categorical variables across "groupby" rows rather than columns. - display_all : bool, optional + display_all : bool, default: False If True, set pd. display_options to display all columns and rows. - (default: False) - dip_test : bool, optional + dip_test : bool, default: False Run Hartigan's Dip Test for multimodality. If variables are found to have multimodal distributions, a remark will be added below the Table 1. - (default: False) - normal_test : bool, optional + normal_test : bool, default: False Test the null hypothesis that a sample come from a normal distribution. Uses scipy.stats.normaltest. If variables are found to have non-normal distributions, a remark will be added below the Table 1. - (default: False) - tukey_test : bool, optional + tukey_test : bool, default: False Run Tukey's test for far outliers. If variables are found to have far outliers, a remark will be added below the Table 1. - (default: False) - include_null : bool, optional + include_null : bool, default: True Include None/Null values for categorical variables by treating them as a - category level. (default: True) + category level. 
Attributes @@ -225,7 +225,8 @@ def __init__(self, data: pd.DataFrame, tukey_test: bool = False, pval_threshold: Optional[float] = None, include_null: Optional[bool] = True, - pval_digits: int = 3) -> None: + pval_digits: int = 3, + ttest_equal_var: bool = False) -> None: # Warn about deprecated parameters handle_deprecated_parameters(labels, isnull, pval_test_name, remarks) @@ -240,7 +241,7 @@ def __init__(self, data: pd.DataFrame, htest, missing, ddof, rename, sort, limit, order, label_suffix, decimals, smd, overall, row_percent, dip_test, normal_test, tukey_test, pval_threshold, - include_null, pval_digits) + include_null, pval_digits, ttest_equal_var) # Initialize intermediate tables self.initialize_intermediate_tables() @@ -282,7 +283,7 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro htest, missing, ddof, rename, sort, limit, order, label_suffix, decimals, smd, overall, row_percent, dip_test, normal_test, tukey_test, pval_threshold, - include_null, pval_digits): + include_null, pval_digits, ttest_equal_var): """ Initialize attributes. 
""" @@ -299,6 +300,7 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro self._dip_test = dip_test self._groupby = groupby self._htest = htest + self._ttest_equal_var = ttest_equal_var self._isnull = missing self._label_suffix = label_suffix self._limit = limit @@ -359,7 +361,8 @@ def create_intermediate_tables(self, data): self.htest_table = self.tables.create_htest_table(data, self._continuous, self._categorical, self._nonnormal, self._groupby, self._groupbylvls, self._htest, - self._pval, self._pval_adjust) + self._pval, self._pval_adjust, + self._ttest_equal_var) # create overall tables if required if self._categorical and self._groupby and self._overall: diff --git a/tableone/tables.py b/tableone/tables.py index e4d9a61..2ec1974 100644 --- a/tableone/tables.py +++ b/tableone/tables.py @@ -31,7 +31,8 @@ def create_htest_table(self, data: pd.DataFrame, groupbylvls, htest, pval, - pval_adjust) -> pd.DataFrame: + pval_adjust, + ttest_equal_var) -> pd.DataFrame: """ Create a table containing P-Values for significance tests. Add features of the distributions and the P-Values to the dataframe. @@ -57,6 +58,9 @@ def create_htest_table(self, data: pd.DataFrame, df['nonnormal'] = np.where(df.index.isin(nonnormal), True, False) # list values for each variable, grouped by groupby levels + min_observed = 0 + catlevels = None + for v in df.index: is_continuous = df.loc[v]['continuous'] is_categorical = ~df.loc[v]['continuous'] @@ -89,7 +93,7 @@ def create_htest_table(self, data: pd.DataFrame, (df.loc[v, 'P-Value'], df.loc[v, 'Test'], warning_msg) = self.statistics._p_test(v, grouped_data, is_continuous, is_categorical, # type: ignore - is_normal, min_observed, catlevels, htest) # type: ignore + is_normal, min_observed, htest, ttest_equal_var) # type: ignore # TODO: Improve method for handling these warnings. # Write to logfile? 
diff --git a/tests/unit/test_tableone.py b/tests/unit/test_tableone.py index c9bc28a..12afb0a 100644 --- a/tests/unit/test_tableone.py +++ b/tests/unit/test_tableone.py @@ -1411,8 +1411,22 @@ def test_pval_digits_custom_formatting(): pval = t2.tableone['Grouped by group']['P-Value'].iloc[1] assert pval == '0.233*' - t3 = TableOne(df, columns=['y'], continuous=['y'], groupby='group', pval=True, pval_digits=1, pval_threshold=0.3) pval = t3.tableone['Grouped by group']['P-Value'].iloc[1] assert pval == '<0.1*' + + +def test_ttest_equal_var_flag(): + df = pd.DataFrame({ + 'group': ['A', 'A', 'A', 'B', 'B', 'B'], + 'x': [1.0, 2.0, 3.0, 20.0, 22.0, 24.0] + }) + + t1 = TableOne(df, columns=['x'], groupby='group', pval=True, ttest_equal_var=False, pval_digits=5) + pval_welch = t1.tableone[('Grouped by group', 'P-Value')].iloc[1] + assert pval_welch == "0.00065" + + t2 = TableOne(df, columns=['x'], groupby='group', pval=True, ttest_equal_var=True, pval_digits=5) + pval = t2.tableone[('Grouped by group', 'P-Value')].iloc[1] + assert pval == "0.00010"