import pandas as pd # The t-test is used, among other things, to assess whether two # «population» means of some attribute are the same, based on a # «sample» of each of the two populations. The test makes a few # assumptions, the most important being: # # 1. the attribute is normally distributed, # 2. the variances of the two samples are similar, # 3. the sample sizes are equal. # # The assumptions are not exact: small deviations only lead to small # inaccuracy in the result. Hence, we can set up some tolerances. # Implement a predicate ‹t_validate› that takes 2 sets of numbers, and # tolerance arguments as follows: # # • ‹normality› is the maximum p-value that we are willing to # accept for a normality test on the input data (use a # Shapiro-Wilk test to obtain the p-value), # • ‹variance› is the difference of variances that we are willing # to tolerate, and finally # • ‹relsize› is the relative size difference that we are willing # to accept (i.e. we accept the samples if their size difference # divided by their size average is less than ‹relsize›). def t_validate( s_1, s_2, normality, variance, relsize ): pass # Then implement a function ‹split› that takes: # # • ‹data›, a pandas data frame, # • ‹col›, the column to test, # • ‹split_col›, the column by which the data is split into two # disjoint sets, # • ‹split_val› if ‹None›, ‹split_col› must have exactly 2 values, # which are taken to be the sample sets to compare, otherwise # ‹split_val› is a number and ‹split_col› is numeric: then the # two sets are given by ‹data[split_col] < split_val› and # ‹data[split_col] >= split_val›. # # The result of ‹split› is two sets of numbers (in the form of # single-column data frames). def split( data, col, split_col, split_val = None ): pass # Finally implement ‹pvalue› which takes 2 samples (sets of numbers) # and produces a p-value indicating the likelihood that the means of # the corresponding populations are equal. def pvalue( s_1, s_2 ): pass # Note on typing: if you decide to use ‹scipy.stats›, you will need # to import it with ‹# type: ignore›, since ‹scipy› does not have # ‹mypy› stubs. def test_main() -> None: data = pd.read_csv( 'zz.stats.csv' ) x, y = split( data, 'bmi', 'sex' ) assert t_validate( x, y, 0.05, 5, .1 ) assert 0.08 < pvalue( x, y ) < 0.1 x, y = split( data, 'bmi', 'smoker' ) assert not t_validate( x, y, 0.05, 5, .1 ) assert t_validate( y, y, 0.05, 5, .1 ) assert not t_validate( x, x, 0.05, 5, .1 ) assert 0.99 < pvalue( y, y ) <= 1 x, y = split( data, 'bmi', 'age', 39 ) assert t_validate( y, y, 0.05, 5, .1 ) assert 0 < pvalue( x, y ) < 0.001 if __name__ == '__main__': test_main()