import pandas as pd
import numpy as np
import math
import scipy.stats as stats
Assume a 10-fold cross-validation was run for two different sets of parameters or features, and we obtained these scores for the two models:
BaseModelScores = [0.709202,0.675973,0.690961,0.692875,0.678119,0.699425,0.679891,0.691891,0.705739,0.702819]
OtherModelScores = [0.693766,0.668319,0.678609,0.680208,0.663592,0.682784,0.670627,0.683872,0.68519,0.692516]
We need to estimate whether the results observed on these samples also hold for the population.
alpha=0.05
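For context, per-fold scores like these can be obtained directly from scikit-learn. The sketch below is only an illustration with a hypothetical estimator and synthetic data, not the models that produced the scores above:
#Minimal sketch: one score per fold from 10-fold cross-validation (hypothetical model and data)
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
X, y = make_classification(n_samples=1000, random_state=0)  #placeholder dataset
base_model = LogisticRegression(max_iter=1000)  #placeholder estimator
fold_scores = cross_val_score(base_model, X, y, cv=10, scoring='roc_auc')  #array of 10 scores, one per fold
print(fold_scores)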
The data for a t-test should meet these requirements: the two groups are independent, the data are approximately normally distributed, and there are no extreme outliers.
Let's assume for now that the data in our experiment come from 2 independent groups and verify the 2 other conditions:
Normality can be verified with the Shapiro-Wilk test. The null hypothesis for the Shapiro-Wilk test is that the data are normally distributed. If the p-value is less than the chosen significance level, then the null hypothesis that the data are normally distributed is rejected. If the p-value is greater than the significance level, then the null hypothesis is not rejected.
print('BaseModelScores:')
shapiro_test = stats.shapiro(BaseModelScores)
print(shapiro_test)
if shapiro_test.pvalue < alpha:
    print('The null hypothesis that the data are normally distributed is rejected')
else:
    print('The data are normally distributed')
BaseModelScores: ShapiroResult(statistic=0.9325253963470459, pvalue=0.4731871783733368) The data are normally distributed
print('OtherModelScores:')
shapiro_test = stats.shapiro(OtherModelScores)
print(shapiro_test)
if shapiro_test.pvalue < alpha:
    print('The null hypothesis that the data are normally distributed is rejected')
else:
    print('The data are normally distributed')
OtherModelScores: ShapiroResult(statistic=0.9528518915176392, pvalue=0.7022936940193176) The data are normally distributed
A Z-score of zero represents a value that equals the mean. The further an observation's Z-score is from zero, the more unusual it is. A standard cut-off for identifying outliers is a Z-score of +/-3 or further from zero.
if sum(np.abs(stats.zscore(BaseModelScores))>3)>0:
    print('There are outliers in BaseModelScores')
else:
    print('No outliers in BaseModelScores')
No outliers in BaseModelScores
if sum(np.abs(stats.zscore(OtherModelScores))>3)>0:
    print('There are outliers in OtherModelScores')
else:
    print('No outliers in OtherModelScores')
No outliers in OtherModelScores
The null hypothesis is that the 2 models have identical scores (the average of the individual cross-validation scores). If the t-value is greater than the critical value obtained from Student's distribution, then the difference is significant; otherwise it isn't. The level of significance (p-value) corresponds to the risk indicated by the t-test table for the calculated t-value. A larger t-value indicates that the difference between the group means is large relative to the common variance, i.e. a more significant difference between the groups. https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html
t=stats.ttest_rel(BaseModelScores,OtherModelScores)
print(t)
if t.pvalue>=alpha:
    print('No difference between the models with %s significance level'%alpha)
else:
    print('There is a difference between models with %s significance level'%alpha)
Ttest_relResult(statistic=9.772287205694246, pvalue=4.333029637146347e-06) There is a difference between models with 0.05 significance level
From the test output it is not clear what the critical value obtained from Student's distribution is. Let's run the test manually. To get the critical value from the distribution we need the degrees of freedom.
#Paired t-test
diff=[y - x for y, x in zip(BaseModelScores, OtherModelScores)]
n = len(diff)
m = np.mean(diff)
#it's important to provide ddof=1 (delta degrees of freedom) in numpy var to calculate the variance with degrees of freedom n - 1.
v = np.var(diff,ddof=1)
t = m/math.sqrt(v/n)
print(t)
9.772287205694246
#degrees of freedom
df = n - 1
#Critical value for Two-tailed test from t distribution table:
critical_value=stats.t.ppf(q=1-alpha/2, df=df)
print('Critical value from Student`s distribution with significance level %s and degree of freedom %s is %s'%(alpha, df, critical_value))
Critical value from Student`s distribution with significance level 0.05 and degree of freedom 9 is 2.2621571627409915
#p-value - probability of getting a more extreme value - for two-sided test
p = 2*(1-stats.t.cdf(t, df))
print('p-value from Student`s distribution with significance level %s and degree of freedom %s is %s'%(alpha, df, p))
p-value from Student`s distribution with significance level 0.05 and degree of freedom 9 is 4.333029637093588e-06
The t-value (9.772287205694246) is greater than the critical value (2.2621571627409915) obtained from Student's distribution, so the difference is significant.
In fact, the data are not independent in K-fold cross-validation: the training sets of the different folds overlap, so the variance of the differences is underestimated. Nadeau and Bengio's correction accounts for this by inflating the variance term from 1/n to (1/n + n2/n1), where n1 is the size of the training set and n2 is the size of the validation set.
n2=89559
n1=806039
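If these sizes are not recorded explicitly, they can be approximated from the total number of samples and the number of folds; the sketch below only illustrates that relationship and recovers the values above:
#Approximate per-fold training/validation sizes for k-fold cross-validation
k = 10
N_total = n1 + n2                #full dataset size (here 806039 + 89559 = 895598)
n2_check = N_total // k          #validation set size per fold
n1_check = N_total - n2_check    #training set size per fold
print(n1_check, n2_check)        #matches n1 and n2 above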
#Corrected Paired t-test
diff=[y - x for y, x in zip(BaseModelScores, OtherModelScores)]
n = len(diff)
m = np.mean(diff)
#it's important to provide ddof=1 (delta degrees of freedom) in numpy var to calculate the variance with degrees of freedom n - 1.
v = np.var(diff,ddof=1)
t = m/math.sqrt(v*(1/n + n2/n1))
print(t)
6.725766889467009
The corrected t-value (6.725766889467009) is still greater than the critical value (2.2621571627409915) obtained from Student’s distribution and the difference is significant.
# Nadeau and Bengio corrected paired t-test
# https://link.springer.com/content/pdf/10.1023/A:1024068626366.pdf
# https://www.cs.waikato.ac.nz/~eibe/pubs/bouckaert_and_frank.pdf
import numpy as np
import math
import scipy.stats as stats
def corrected_paired_ttest(data1, data2, n_training_size_folds, n_test_size_folds, alpha):
    #corrected paired t-test (Nadeau-Bengio variance correction)
    diff = [y - x for y, x in zip(data1, data2)]
    n = len(diff)
    m = np.mean(diff)
    #it's important to provide ddof=1 (delta degrees of freedom) in numpy var to calculate the variance with degrees of freedom n - 1.
    v = np.var(diff, ddof=1)
    t = m/math.sqrt(v*(1/n + n_test_size_folds/n_training_size_folds))
    #degrees of freedom
    df = n - 1
    #critical value for a two-tailed test from the t distribution:
    critical_value = stats.t.ppf(q=1-alpha/2, df=df)
    #p-value - probability of getting a more extreme value - for a two-sided test
    pvalue = 2*(1-stats.t.cdf(t, df))
    return t, critical_value, pvalue
(c_t, critical_value, pvalue) = corrected_paired_ttest(BaseModelScores, OtherModelScores, n1, n2, alpha)
print('Corrected t-test value is %s , critical value is %s, p-value is %s'%(c_t, critical_value, pvalue))
if pvalue>=alpha:
    print('No difference between the models with %s significance level'%alpha)
else:
    print('There is a difference between models with %s significance level'%alpha)
Corrected t-test value is 6.725766889467009 , critical value is 2.2621571627409915, p-value is 8.598010400850953e-05 There is a difference between models with 0.05 significance level
The difference between the means of the model scores for the entire population lies within this confidence interval. If there is no difference, then the interval contains zero (0). If zero is NOT in the range of values, the difference is statistically significant.
diff=[y - x for y, x in zip(BaseModelScores, OtherModelScores)]
import scipy.stats as st
CI=st.t.interval(1-alpha, len(diff)-1, loc=np.mean(diff), scale=st.sem(diff))
CI
(0.009791778208024765, 0.015690621791975272)
import statsmodels.stats.api as sms
CI=sms.DescrStatsW(diff).tconfint_mean()
CI
(0.009791778208024765, 0.015690621791975272)
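For comparison, the same (uncorrected) interval can be computed by hand from the mean difference, its standard error, and the t critical value; this sketch reuses the quantities already defined above:
#Manual (uncorrected) confidence interval for the mean of the per-fold differences
m = np.mean(diff)
se = math.sqrt(np.var(diff, ddof=1)/len(diff))     #standard error of the mean difference
t_crit = stats.t.ppf(q=1-alpha/2, df=len(diff)-1)  #two-sided critical value
manual_CI = (m - t_crit*se, m + t_crit*se)
print(manual_CI)  #should match the intervals printed above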
if CI[0]<=0:
    print('No difference between the models with %s confidence level'%(1-alpha))
else:
    print('There is a difference between models with %s confidence level'%(1-alpha))
There is a difference between models with 0.95 confidence level
import scipy.stats as st
def corrected_confidence_interval(data1, data2, n1, n2, confidence=0.95):
    diff = [y - x for y, x in zip(data1, data2)]
    n = len(diff)
    m = np.mean(diff)
    v = np.var(diff, ddof=1)
    df = n - 1
    t = stats.t.ppf((1 + confidence)/2, df)
    lower = m - t * math.sqrt(v*(1/n + n2/n1))
    upper = m + t * math.sqrt(v*(1/n + n2/n1))
    return lower, upper
Corrected_CI = corrected_confidence_interval(BaseModelScores, OtherModelScores, n1, n2,1-alpha)
Corrected_CI
(0.00845580068188603, 0.017026599318114007)
if Corrected_CI[0]<=0:
    print('No difference between the models with %s confidence level'%(1-alpha))
else:
    print('There is a difference between models with %s confidence level'%(1-alpha))
There is a difference between models with 0.95 confidence level
data_dict = {}
data_dict['category'] = ['CI','Corrected CI']
data_dict['mean'] = [np.mean(diff),np.mean(diff)]
data_dict['lower'] = [CI[0],Corrected_CI[0]]
data_dict['upper'] = [CI[1],Corrected_CI[1]]
dataset = pd.DataFrame(data_dict)
import matplotlib.pyplot as plt
dim = np.arange(0, dataset['upper'].max() + dataset['upper'].max()/10, dataset['upper'].max()/10)
for lower, mean, upper, x in zip(dataset['lower'], dataset['mean'], dataset['upper'], range(len(dataset))):
    plt.plot((x, x), (lower, upper), '_-', markersize=20, color='blue')
    plt.plot(x, mean, 'o', color='red')
plt.xticks(range(len(dataset)), list(dataset['category']))
plt.yticks(dim)
plt.grid(axis='both')
plt.margins(x=2)
Zero is not in either confidence interval, which means there is a significant difference between the models. The corrected confidence interval is wider and closer to zero.