Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions tableone/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ def _p_test(self, v: str,
is_categorical: bool,
is_normal: bool,
min_observed: int,
catlevels: list,
h_test: dict):
h_test: dict,
ttest_equal_var: bool):
"""
Compute P-Values.

Expand All @@ -133,8 +133,6 @@ def _p_test(self, v: str,
True if the variable is normally distributed.
min_observed : int
Minimum number of values across groups for the variable.
catlevels : list
Sorted list of levels for categorical variables.

Returns
----------
Expand Down Expand Up @@ -165,9 +163,9 @@ def _p_test(self, v: str,
# continuous
if (is_continuous and is_normal and len(grouped_data) == 2
and min_observed >= 2):
ptest = 'Two Sample T-test'
ptest = 'Welch’s T-test' if not ttest_equal_var else 'Pooled T-test'
test_stat, pval = stats.ttest_ind(*grouped_data.values(),
equal_var=False,
equal_var=ttest_equal_var,
nan_policy="omit")
elif is_continuous and is_normal:
# normally distributed
Expand Down
61 changes: 32 additions & 29 deletions tableone/tableone.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,15 @@ class TableOne:
If the argument is set to None (or omitted), we attempt to detect
continuous variables. Set to an empty list to indicate explicitly
that there are no variables of this type to be included.
groupby : str, optional
Optional column for stratifying the final table (default: None).
nonnormal : list, optional
List of columns that contain non-normal variables (default: None).
groupby : str, default: None
Optional column for stratifying the final table.
nonnormal : list, default: None
List of columns that contain non-normal variables.
min_max : list, optional
List of variables that should report minimum and maximum, instead of
standard deviation (for normal) or Q1-Q3 (for non-normal).
pval : bool, optional
Display computed P-Values (default: False).
pval : bool, default: False
Display computed P-Values.
pval_adjust : str, optional
Method used to adjust P-Values for multiple testing.
The P-values from the unadjusted table (default when pval=True)
Expand All @@ -109,17 +109,21 @@ class TableOne:
Threshold below which p-values are marked with an asterisk (*).
For example, if set to 0.05, all p-values less than 0.05 will be
displayed with a trailing asterisk (e.g., '0.012*').
htest_name : bool, optional
Display a column with the names of hypothesis tests (default: False).
htest_name : bool, default: False
Display a column with the names of hypothesis tests.
htest : dict, optional
Dictionary of custom hypothesis tests. Keys are variable names and
values are functions. Functions must take a list of Numpy Arrays as
the input argument and must return a test result.
e.g. htest = {'age': myfunc}
missing : bool, optional
Display a count of null values (default: True).
ddof : int, optional
Degrees of freedom for standard deviation calculations (default: 1).
ttest_equal_var : bool, default: False
Whether to assume equal population variances when performing two-sample
t-tests. Set to False (default) to use Welch's t-test, which is more robust
to unequal variances. Set to True to use the pooled t-test instead.
missing : bool, default: True
Display a count of null values.
ddof : int, default: 1
Degrees of freedom for standard deviation calculations.
rename : dict, optional
Dictionary of alternative names for variables.
e.g. `rename = {'sex':'gender', 'trt':'treatment'}`
Expand All @@ -135,42 +139,38 @@ class TableOne:
order : dict, optional
Specify an order for categorical variables. Key is the variable, value
is a list of values in order, e.g. {'sex': ['f', 'm', 'other']}
label_suffix : bool, optional
label_suffix : bool, default: True
Append summary type (e.g. "mean (SD); median [Q1,Q3], n (%); ") to the
row label (default: True).
row label.
decimals : int or dict, optional
Number of decimal places to display. An integer applies the rule to all
variables (default: 1). A dictionary (e.g. `decimals = {'age': 0}`)
applies the rule per variable, defaulting to 1 place for unspecified
variables. For continuous variables, applies to all summary statistics
(e.g. mean and standard deviation). For categorical variables, applies
to percentage only.
overall : bool, optional
overall : bool, default: True
If True, add an "overall" column to the table. SMD and p-value
calculations are performed only using stratified columns.
row_percent : bool, optional
If True, compute "n (%)" percentages for categorical variables across
"groupby" rows rather than columns.
display_all : bool, optional
display_all : bool, default: False
If True, set the pandas display options to show all columns and rows.
(default: False)
dip_test : bool, optional
dip_test : bool, default: False
Run Hartigan's Dip Test for multimodality. If variables are found to
have multimodal distributions, a remark will be added below the
Table 1.
(default: False)
normal_test : bool, optional
normal_test : bool, default: False
Test the null hypothesis that a sample comes from a normal distribution.
Uses scipy.stats.normaltest. If variables are found to have non-normal
distributions, a remark will be added below the Table 1.
(default: False)
tukey_test : bool, optional
tukey_test : bool, default: False
Run Tukey's test for far outliers. If variables are found to
have far outliers, a remark will be added below the Table 1.
(default: False)
include_null : bool, optional
include_null : bool, default: True
Include None/Null values for categorical variables by treating them as a
category level. (default: True)
category level.


Attributes
Expand Down Expand Up @@ -225,7 +225,8 @@ def __init__(self, data: pd.DataFrame,
tukey_test: bool = False,
pval_threshold: Optional[float] = None,
include_null: Optional[bool] = True,
pval_digits: int = 3) -> None:
pval_digits: int = 3,
ttest_equal_var: bool = False) -> None:

# Warn about deprecated parameters
handle_deprecated_parameters(labels, isnull, pval_test_name, remarks)
Expand All @@ -240,7 +241,7 @@ def __init__(self, data: pd.DataFrame,
htest, missing, ddof, rename, sort, limit, order,
label_suffix, decimals, smd, overall, row_percent,
dip_test, normal_test, tukey_test, pval_threshold,
include_null, pval_digits)
include_null, pval_digits, ttest_equal_var)

# Initialize intermediate tables
self.initialize_intermediate_tables()
Expand Down Expand Up @@ -282,7 +283,7 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro
htest, missing, ddof, rename, sort, limit, order,
label_suffix, decimals, smd, overall, row_percent,
dip_test, normal_test, tukey_test, pval_threshold,
include_null, pval_digits):
include_null, pval_digits, ttest_equal_var):
"""
Initialize attributes.
"""
Expand All @@ -299,6 +300,7 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro
self._dip_test = dip_test
self._groupby = groupby
self._htest = htest
self._ttest_equal_var = ttest_equal_var
self._isnull = missing
self._label_suffix = label_suffix
self._limit = limit
Expand Down Expand Up @@ -359,7 +361,8 @@ def create_intermediate_tables(self, data):
self.htest_table = self.tables.create_htest_table(data, self._continuous, self._categorical,
self._nonnormal, self._groupby,
self._groupbylvls, self._htest,
self._pval, self._pval_adjust)
self._pval, self._pval_adjust,
self._ttest_equal_var)

# create overall tables if required
if self._categorical and self._groupby and self._overall:
Expand Down
8 changes: 6 additions & 2 deletions tableone/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def create_htest_table(self, data: pd.DataFrame,
groupbylvls,
htest,
pval,
pval_adjust) -> pd.DataFrame:
pval_adjust,
ttest_equal_var) -> pd.DataFrame:
"""
Create a table containing P-Values for significance tests. Add features
of the distributions and the P-Values to the dataframe.
Expand All @@ -57,6 +58,9 @@ def create_htest_table(self, data: pd.DataFrame,
df['nonnormal'] = np.where(df.index.isin(nonnormal), True, False)

# list values for each variable, grouped by groupby levels
min_observed = 0
catlevels = None

for v in df.index:
is_continuous = df.loc[v]['continuous']
is_categorical = ~df.loc[v]['continuous']
Expand Down Expand Up @@ -89,7 +93,7 @@ def create_htest_table(self, data: pd.DataFrame,
(df.loc[v, 'P-Value'],
df.loc[v, 'Test'],
warning_msg) = self.statistics._p_test(v, grouped_data, is_continuous, is_categorical, # type: ignore
is_normal, min_observed, catlevels, htest) # type: ignore
is_normal, min_observed, htest, ttest_equal_var) # type: ignore

# TODO: Improve method for handling these warnings.
# Write to logfile?
Expand Down
16 changes: 15 additions & 1 deletion tests/unit/test_tableone.py
Original file line number Diff line number Diff line change
Expand Up @@ -1411,8 +1411,22 @@ def test_pval_digits_custom_formatting():
pval = t2.tableone['Grouped by group']['P-Value'].iloc[1]
assert pval == '0.233*'


t3 = TableOne(df, columns=['y'], continuous=['y'], groupby='group', pval=True, pval_digits=1,
pval_threshold=0.3)
pval = t3.tableone['Grouped by group']['P-Value'].iloc[1]
assert pval == '<0.1*'


def test_ttest_equal_var_flag():
    """
    Check that `ttest_equal_var` changes the reported two-sample p-value.

    The same two-group dataset is summarised once with the flag off and
    once with it on; each run must display its own expected p-value.
    """
    data = pd.DataFrame({
        'group': ['A'] * 3 + ['B'] * 3,
        'x': [1.0, 2.0, 3.0, 20.0, 22.0, 24.0],
    })

    # Flag value -> p-value string expected in the rendered table
    # (formatted to five decimal places via pval_digits=5).
    expected_by_flag = [
        (False, "0.00065"),
        (True, "0.00010"),
    ]

    for equal_var, expected_pval in expected_by_flag:
        table = TableOne(data, columns=['x'], groupby='group', pval=True,
                         ttest_equal_var=equal_var, pval_digits=5)
        shown_pval = table.tableone[('Grouped by group', 'P-Value')].iloc[1]
        assert shown_pval == expected_pval