```
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np
%matplotlib inline
```

```
from scipy.stats import shapiro
from scipy.stats import normaltest
from scipy.stats import anderson
# Input: series has the sample whose distribution we want to test
# Output: gaussian boolean True if it is normal distribution and False otherwise.
def testNormality(series):
alpha = 0.05
gaussian = False
# only if the three tests give normal will be normal. If we find one that is not passed, then it is NOT normal.
# Shapiro-Wilk Test - for smaller data sets around thousands of records
print('length of series in Shapiro is: '+ str(len(series)))
stats1, p1 = shapiro(series)
print('Statistics Shapiro-Wilk Test =%.3f, p=%.3f' % (stats1, p1))
if p1 > alpha:
gaussian = True
print('Shapiro.Wilk says it is Normal '+ str(gaussian))
gaussian = False # because of intermediate printing, reinitialize
# D'Agostino and Pearson's Test
stats2, p2 = normaltest(series) #dataw.humid
print('Statistics D\'Agostino and Pearson\'s Test=%.3f, p=%.3f' % (stats2, p2))
if p2 > alpha:
gaussian = False
print('D\'Agostino and Pearson\'s says it is Normal '+ str(gaussian))
# Anderson-Darling Test
'''result = anderson(series)
print('Statistic: %.3f' % result.statistic)
for i in range(len(result.critical_values)):
sl, cv = result.significance_level[i], result.critical_values[i]
if result.statistic > result.critical_values[i]:
gaussian = False'''
return gaussian
```

```
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
# Input:
# series1 is the series with the set of measurements for every single worker in case A of controlled experiment
# series2 is the series with the set of measurements for every single worker in case B of controlled experiment
# gaussian is the boolean value indicating if the samples have passed the test of normality or not (True is apply parametric test)
# Output:
# stats of statistical test
# p-value
# acceptHo (True if we fail to reject it and False if we reject it)
# See also all tables for picking the tests (e.g., https://help.xlstat.com/customer/en/portal/articles/2062457-which-statistical-test-should-you-use-)
def compareTwoSamples(series1, series2, gaussian):
    """Compare two independent samples (H0: equal distribution; H1: different).

    Parameters:
        series1: measurements for every worker in case A of the controlled experiment.
        series2: measurements for every worker in case B of the controlled experiment.
        gaussian: True if the samples passed the normality test (apply parametric test).

    Returns:
        (stats, p, acceptH0): test statistic, p-value, and True if we fail to
        reject H0 at alpha = 0.05 (False if we reject it).

    See also the test-selection tables, e.g.
    https://help.xlstat.com/customer/en/portal/articles/2062457-which-statistical-test-should-you-use-
    """
    alpha = 0.05
    if gaussian:
        # Parametric: Student's t-test for two independent samples.
        stats, p = ttest_ind(series1, series2)
    else:
        # Non-parametric: Mann-Whitney U (Kruskal-Wallis is for more samples).
        # alternative='two-sided' is explicit because older scipy versions
        # defaulted to a one-sided test, silently halving the p-value.
        stats, p = mannwhitneyu(series1, series2, alternative='two-sided')
    print('Statistics=%.3f, p=%.3f' % (stats, p))
    # Hypothesis decision: fail to reject H0 when p exceeds alpha.
    acceptH0 = bool(p > alpha)
    print('The two samples have the same distribution (meanA = meanB): ' + str(acceptH0))
    return stats, p, acceptH0
```

```
#https://www.kaggle.com/uchayder/likes-shares-comments-and-mann-whitney-u-test
```

```
# Load the raw exports; paths are relative to the notebook's working directory.
# NOTE(review): column schemas ('pid', 'cid', 'gid', 'msg', ...) are assumed
# from how they are used below — confirm against the actual CSV headers.
posts = pd.read_csv('post.csv')
comments = pd.read_csv('comment.csv')
```

```
# Count comments per post, join the counts onto the posts, and keep the
# columns needed for the analysis.
com_count = comments.groupby('pid').count()['cid']
data = posts.join(com_count, on='pid', rsuffix='c')[['msg', 'likes', 'shares', 'cid', 'gid']]
data.columns = ['msg', 'likes', 'shares', 'comments', 'gid']
data['msg'].head()
# Message length per post. Vectorized .str.len() replaces the original
# row-wise apply(lambda ...) — identical result (NaN -> 'nan' -> 3 either
# way) but runs as a single C-level pass instead of one Python call per row.
data['msg_len'] = data['msg'].astype(str).str.len()
data.head()
```

```
# Map raw Facebook group ids to compact labels 1/2/3.
# Use []-assignment rather than attribute assignment (data.gid = ...):
# pandas discourages the latter because it can silently set an instance
# attribute instead of a column.
data['gid'] = data['gid'].map({117291968282998: 1, 25160801076: 2, 1443890352589739: 3})
```

```
# Posts with no comments get NaN from the join above; treat them as 0.
# Reassignment instead of inplace=True — pandas is moving away from the
# inplace API, and nothing else holds a reference to the old frame here.
data = data.fillna(0)
data.head()
```

```
# One sub-frame per mapped group id (gid became float after fillna).
group1, group2, group3 = (data[data['gid'] == g] for g in (1.0, 2.0, 3.0))
```

```
# Normality check of message length per group; the booleans drive the
# parametric vs non-parametric test choice in compareTwoSamples below.
norm_g1 = testNormality(group1['msg_len'])
norm_g2 = testNormality(group2['msg_len'])
norm_g3 = testNormality(group3['msg_len'])
```

```
# Log-scaled histogram of group 1 message lengths (visual normality check).
group1['msg_len'].hist(log=True)
```

```
# Log-scaled histogram of group 2 message lengths (visual normality check).
group2['msg_len'].hist(log=True)
```

```
# Log-scaled histogram of group 3 message lengths (visual normality check).
group3['msg_len'].hist(log=True)
```

```
# BUG FIX: multipletests was only imported in a LATER cell, so a fresh
# top-to-bottom run of this notebook raised NameError here. Import locally
# (from the stable statsmodels location) so this cell stands on its own.
from statsmodels.stats.multitest import multipletests

# G1 vs G2: parametric only if both samples passed the normality tests.
normal = norm_g1 and norm_g2
stats, p, acceptH0 = compareTwoSamples(group1['msg_len'], group2['msg_len'], normal)
# Holm correction (returns: reject flags, corrected p-values, Sidak alpha,
# Bonferroni alpha). With a single p-value the correction is a no-op.
corrected_p = multipletests(p, alpha=0.05, method='holm')
print(corrected_p)
print('p > 0.05 is accept so with this correction is False as in the blog')
```

```
# Tested my code: these samples give the same results as in the blog post — see Conclusion 4 for G1-G2 and G1-G3, but not for G2-G3 (note there was also an error in the blog's code!)
# https://www.kaggle.com/uchayder/likes-shares-comments-and-mann-whitney-u-test/notebook
# code of the correction package https://www.statsmodels.org/dev/_modules/statsmodels/stats/multitest.html
```

```
#!pip install statsmodels
# Import from the stable location; statsmodels.sandbox.stats.multicomp is a
# deprecated alias that re-exports from statsmodels.stats.multitest.
from statsmodels.stats.multitest import multipletests
# multipletests returns:
#   reject          : boolean array, True for hypotheses rejected at the given alpha
#   pvals_corrected : array of p-values corrected for multiple tests
#   alphacSidak     : float, corrected alpha for the Sidak method
#   alphacBonf      : float, corrected alpha for the Bonferroni method
```

```
# G2 vs G3: parametric test only when both groups passed the normality check.
normal = norm_g2 and norm_g3
stats, p, acceptH0 = compareTwoSamples(group2['msg_len'], group3['msg_len'], normal)
# Apply the Holm correction to the resulting p-value.
corrected_p = multipletests(p, alpha=0.05, method='holm')
print(corrected_p)
print('p > 0.05 is accept so with this correction is False as in the blog')
```

```
# G1 vs G3: parametric test only when both groups passed the normality check.
normal = norm_g1 and norm_g3
stats, p, acceptH0 = compareTwoSamples(group1['msg_len'], group3['msg_len'], normal)
# Apply the Holm correction to the resulting p-value.
corrected_p = multipletests(p, alpha=0.05, method='holm')
print(corrected_p)
print('p > 0.05 is accept so with this correction is False as in the blog')
```

```
# Unpack the full multipletests result (reject flags, corrected p-values,
# Sidak alpha, Bonferroni alpha).
# NOTE(review): `comparison` is not defined anywhere in this notebook, so this
# cell raises NameError as written — presumably left over from the referenced
# gist (next cell); confirm where `comparison.p_value` was meant to come from.
rejected, p_corrected, a1, a2 = multipletests(comparison.p_value,
alpha = 0.05,
method = 'holm')
```

```
# https://gist.github.com/naturale0/3915e2def589553e91dce99e69d138cc
```