Source code for medstat.medstat

"""Main module."""
from typing import List

import pandas as pd
from scipy.stats import fisher_exact, chi2_contingency


def test_hypothesis(data: pd.DataFrame, expression_1: str, expression_2: str, threshold: float = 0.05):
    """
    Run a test of independence between ``expression_1`` and ``expression_2``.

    Each expression is either a column name, in which case every category of
    that column is considered, or a boolean test evaluated on the data frame.
    A chi-squared test is used when the smallest observed count of the
    contingency table is at least 10, otherwise a Fisher exact test is used.

    Args:
        data (pd.DataFrame): The data frame containing the data under study
        expression_1 (str): A column name or a boolean test
        expression_2 (str): A column name or a boolean test
        threshold (float): p-value threshold under which the test is
            considered significant

    Returns:
        Dict: Holds the contingency table (key ``'contengency_table'``), the
        name of the test used (``'test'``), the p-value (``'p-value'``) and
        whether the result is significant (``'significant'``). When the Fisher
        test raises a ``ValueError``, the dictionary instead holds an
        ``'error'`` message and has no ``'p-value'`` nor ``'significant'``
        entry.

    Examples:
        >>> medstat.test_hypothesis(data, 'sex', 'age < 30')
        {'contengency_table': age < 30  False  True  All
        sex
        Female       26    22   48
        Male         24     8   32
        All          50    30   80,
         'test': 'Fisher',
         'p-value': 0.06541995357625573,
         'significant': False}
    """
    table = __make_contingency_table(data, expression_1, expression_2)
    outcome = {'contengency_table': table}

    # The last row and the last column are the "All" margins; the tests only
    # want the observed counts.
    counts = table.iloc[:-1, :-1].values
    enough_observations = counts.min() >= 10

    if not enough_observations:
        outcome['test'] = "Fisher"
        try:
            _, p_value = fisher_exact(counts)
        except ValueError as e:
            # No p-value is available: hand back what is known so far.
            outcome["error"] = f"Fisher test cannot be used: {e}"
            return outcome
    else:
        outcome['test'] = "Chi-squared"
        _, p_value, _, _ = chi2_contingency(counts)

    outcome['p-value'] = p_value
    outcome['significant'] = True if p_value < threshold else False
    return outcome


def analyse_dataset(data: pd.DataFrame, hypothesis: List[tuple], threshold: float = 0.05, file=None):
    """
    Test the independence of several pairs of factors of a data set.

    For every pair of factors the appropriate test is run through
    ``test_hypothesis``. A text report of each test is printed on the screen
    and, when ``file`` is given, all the reports are also written to it.

    Args:
        data: The data set.
        hypothesis: List of 2-tuples containing the factors to test
        threshold (optional): p-value threshold under which the test result
            is considered significant
        file (optional): A file where to write a report of the results; it is
            opened in ``"w"`` mode.

    Returns:
        List: A list of dictionaries, one per test, containing the result,
        the contingency table etc. (see test_hypothesis output)

    Examples:
        >>> medstat.analyse_dataset(data, [('sex', 'age < 30'), ('sex', 'test_a')], file='report.txt')
        [{'contengency_table': age < 30  False  True  All
        sex
        Female       21    18   39
        Male         29    12   41
        All          50    30   80,
         'test': 'Chi-squared',
         'p-value': 0.18407215636751517,
         'significant': False},
         {'contengency_table': test_a  negative  positive  All
        sex
        Female        25        14   39
        Male          25        16   41
        All           50        30   80,
         'test': 'Chi-squared',
         'p-value': 0.9539453144224308,
         'significant': False}]
    """
    results = []
    reports = []
    for index, factors in enumerate(hypothesis):
        outcome = test_hypothesis(data, *factors, threshold)
        summary = __make_result_report(outcome, index)
        print(summary)
        results.append(outcome)
        reports.append(summary)

    if file is None:
        return results

    with open(file, "w") as f:
        f.writelines(reports)
    return results


def __make_result_report(result, i):
    """
    Format the outcome of one independence test as a printable text block.

    Args:
        result: Dictionary describing one test. It must hold the contingency
            table under ``'contengency_table'`` and the test name under
            ``'test'``, plus either ``'p-value'`` and ``'significant'`` or,
            when the test could not be run, an ``'error'`` message.
        i: Zero-based position of the test; it is shown one-based in the
            report header.

    Returns:
        str: The report, ending with a blank line.
    """
    table = result['contengency_table']
    parts = [
        "-" * 20 + f" Test {i + 1} " + "-" * 20 + "\n",
        f"Test independence between {table.index.name} and {table.columns.name}. \n",
        f"Use {result['test']} test.\n",
    ]

    error = result.get('error')
    if error is not None:
        # A failed test carries no 'p-value' / 'significant' entry: report the
        # failure instead of raising KeyError and aborting the whole analysis.
        parts.append(f"{error}\n")
    else:
        negation = "" if result['significant'] else "not "
        parts.append(f"Result is {negation}significant.\n")
        parts.append(f"p-value: {result['p-value']}\n")

    parts.append(f"Contingency table: \n {table} \n \n")
    return "".join(parts)


def __make_contingency_table(data, expression_1, expression_2):
    """
    Prepare the contingency table of two factors, margins included.

    Args:
        data: The data frame under study.
        expression_1: Column name or boolean test used for the rows.
        expression_2: Column name or boolean test used for the columns.

    Returns:
        pd.DataFrame: Cross tabulation of the two factors with an extra
        margin row and column; the index and the columns are named after the
        two expressions.
    """
    factors = []
    for expression in [expression_1, expression_2]:
        if expression in data.columns.values:
            # An existing column: every one of its categories is a level.
            factor = data[expression].astype('category')
        else:
            # Anything else is handed to DataFrame.eval as a boolean test.
            # NOTE(review): eval interprets the string as an expression; do
            # not pass strings coming from an untrusted source.
            factor = data.eval(expression)
            # Declare both outcomes as categories of the factor.
            factor = pd.Categorical(factor, categories=[False, True])
        factors.append(factor)

    row_factor, column_factor = factors
    contengency_table = pd.crosstab(row_factor, column_factor, dropna=False, margins=True)
    contengency_table.index.name = expression_1
    contengency_table.columns.name = expression_2
    return contengency_table