This study explores the distribution of thanks. The first figure presents data on the number of thanks the average person receives. The second figure presents data on whether thanks are received evenly throughout the year or in clusters.

-- Gets all thanks received in a timeframe, together with each recipient's edit count

-- Select the wiki database to query; PROJECT is a placeholder for the database name.
use PROJECT;

-- One output row per thank: the recipient's edit count, the recipient's name (as stored in the log) and the time of the thank.
select B.user_editcount as Edit_Count, A.log_title as User_Name, A.log_timestamp as Thank_Time

-- A: thank log entries with TIME2 <= log_timestamp < TIME1 (TIME2 is the inclusive start, TIME1 the exclusive end).
from (select log_title, log_timestamp from logging_userindex where (log_action = 'thank' and log_type='thanks' and log_timestamp < timestamp(TIME1) and log_timestamp >= timestamp(TIME2))) as A

-- B: name and edit count of every registered user.
join (select user_editcount, user_name from user) as B

-- log_title is stored in page-title form (underscores instead of spaces) while user_name uses spaces,
-- so normalise before comparing; a plain equality silently drops every recipient whose name contains a space.
-- Sorting by edit count and then by name keeps each recipient's rows contiguous in the output.
on replace(A.log_title, '_', ' ') = B.user_name order by B.user_editcount, A.log_title

-- For this analysis: TIME1 = '2018-06-01' (exclusive end of the timeframe), TIME2 = '2017-06-01' (inclusive start); the query was run once for each PROJECT in itwiki_p, ptwiki_p, plwiki_p, fawiki_p, nlwiki_p

```
import csv
from datetime import datetime
import random
```

```
#pieces of the input filenames; each path is assembled as src + prefix + stem + suffix
src = '(1-4)-data/'
input_prefixes = ['It', 'Pt', 'Pl', 'Fa', 'Nl']
input_stem, input_suffix = 'Receivers', '.csv'
```

```
def make_files_lst(src=src, prefixes=input_prefixes, stem=input_stem, suffix=input_suffix):
    """Return one file path per prefix, each built as src + prefix + stem + suffix."""
    return [src + prefix + stem + suffix for prefix in prefixes]
```

```
#paths of all input CSVs, built from the default filename pieces
input_files = make_files_lst()
```

```
#settings for breaking the data into percentiles: the rows of a file are split into
#num_groups consecutive groups, and each sample is a [start, end] pair of fractions of those groups
num_groups = 10
samples = [[0, 0.2], [0.8, 1]]
#placeholders until find_group_size() stores the real values
file_size = group_size = remainder = -1
```

```
def find_group_size(input_file, n):
    """Count the data rows of a CSV file and derive the size of n equal groups.

    Nothing is returned; the results are stored in module-level globals:
    file_size (number of data rows), group_size (file_size divided by n,
    rounded down) and remainder (rows left over after n groups of group_size).
    """
    global file_size, group_size, remainder
    file_size = 0
    with open(input_file, 'rt', encoding = 'utf-8') as csvfile:
        #DictReader consumes the header line, so only data rows are counted
        file_size = sum(1 for _ in csv.DictReader(csvfile))
    group_size = int(file_size/n)
    remainder = file_size - group_size * n
```

```
#runs spatial_distribution_inner() on every sample of every input file
def examine_spatial_distribution(input_files=input_files, samples=samples, num_groups=num_groups, simple=False):
    """Return one result per (input file, sample) pair, files in the outer loop.

    Each sample is a pair of fractions; both ends are scaled by num_groups and
    rounded to give the first group index (inclusive) and the last (exclusive).
    """
    return [
        spatial_distribution_inner(
            input_file,
            sample,
            round(sample[0] * num_groups),
            round(sample[1] * num_groups),
            simple,
        )
        for input_file in input_files
        for sample in samples
    ]
```

```
#gathers thank statistics for one sample of one input file (two modes: simple=True/False)
def spatial_distribution_inner(input_file, sample, start_group, end_group, simple):
    """Analyse the rows of input_file that fall in groups start_group..end_group-1.

    The file's rows are walked in order and assigned to num_groups consecutive
    groups (module-level num_groups; sizes come from find_group_size, and the
    last `remainder` groups hold one extra row). Only rows inside the requested
    groups are tallied. The `sample` argument is not used here.

    A recipient is counted as new whenever User_Name differs from the previous
    tallied row, so this presumably relies on each recipient's rows being
    contiguous in the CSV -- confirm against how the file was exported.
    The month and day keys are the first 6 and 8 characters of Thank_Time
    (assumes a YYYYMMDD... timestamp -- confirm).

    Returns:
        simple=True:  [average thanks per person,
                       that average / average number of distinct months with a thank,
                       that average / average number of distinct days with a thank]
        simple=False: [find_average_difference(...) over months (12 spans),
                       find_average_difference(...) over days (365 spans)]
    """
    with open(input_file, 'r', encoding = 'utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        #refreshes the module-level group_size / remainder for this file
        find_group_size(input_file, num_groups)
        current_group_size = group_size
        row_in_group, group = 0, 0
        last_user = ''
        total_thanks = 0
        total_people = 0
        months_seen = {}
        days_seen = {}
        thanks_received = {}
        for row in reader:
            row_in_group += 1
            if start_group <= group < end_group:
                user = row['User_Name']
                if user != last_user:
                    #first tallied row of a new recipient: start fresh tallies
                    total_people += 1
                    last_user = user
                    months_seen[user] = set()
                    days_seen[user] = set()
                    thanks_received[user] = 0
                total_thanks += 1
                thanks_received[user] += 1
                #sets ignore repeats, so each month/day is recorded once per recipient
                stamp = row['Thank_Time']
                months_seen[user].add(stamp[:6])
                days_seen[user].add(stamp[:8])
            #move on to the next group once the current one is full
            if row_in_group == current_group_size:
                row_in_group = 0
                group += 1
                if group == num_groups - remainder:
                    #the remaining groups absorb the leftover rows, one extra row each
                    current_group_size += 1
    if simple:
        #simple mode: average number of thanks per person over the different timeframes
        avg_months = sum(len(spans) for spans in months_seen.values())*1.0/total_people
        avg_days = sum(len(spans) for spans in days_seen.values())*1.0/total_people
        avg_thanks = total_thanks*1.0/total_people
        return [avg_thanks, avg_thanks/avg_months, avg_thanks/avg_days]
    #complex mode: differences in distribution against randomized and controlled data sets
    return [find_average_difference(months_seen, thanks_received, 12), find_average_difference(days_seen, thanks_received, 365)]
```

```
#average, per person, of how the number of distinct timespans with a thank differs from a random
#distribution, for three cases: the actual data, a controlled (maximum-spread) distribution,
#and a second random distribution
def find_average_difference(timespans_by_person, thanks_by_person, num_spans):
    """Return [actual - random, controlled - random, random - random], each averaged over all people.

    timespans_by_person maps a username to the set of timespans in which that
    person received a thank; thanks_by_person maps a username to a thank count;
    num_spans is the number of timespans available to the simulations.
    """
    usernames = list(timespans_by_person)
    #population-wide baselines are generated first, then one more random draw per person
    random_population = generate_random_data_for_population(usernames, thanks_by_person, num_spans)
    controlled_population = generate_normalized_data_for_population(usernames, thanks_by_person, num_spans)
    actual_total = 0
    controlled_total = 0
    random_total = 0
    for username in usernames:
        baseline = generate_random_data_for_person(thanks_by_person[username], num_spans)
        actual_total += len(timespans_by_person[username]) - baseline
        random_total += random_population[username] - baseline
        controlled_total += controlled_population[username] - baseline
    population = len(usernames)
    return [actual_total / population, controlled_total / population, random_total / population]
```

```
#simulates one person: each of num_thanks thanks lands in a random timespan numbered 1..num_spans;
#returns how many distinct timespans were hit
def generate_random_data_for_person(num_thanks, num_spans):
    return len({random.randint(1, num_spans) for _ in range(num_thanks)})
```

```
#simulates every person: maps each username to the number of distinct timespans (months or days)
#hit when that person's thanks are placed at random
def generate_random_data_for_population(usernames, thanks_by_person, num_spans):
    return {
        username: generate_random_data_for_person(thanks_by_person[username], num_spans)
        for username in usernames
    }
```

```
#controlled (maximum-spread) case: every thank falls in a different timespan where possible,
#so each person covers min(thanks received, num_spans) timespans (months or days)
def generate_normalized_data_for_population(usernames, thanks_by_person, num_spans):
    return {username: min(thanks_by_person[username], num_spans) for username in usernames}
```

```
#run the analysis with the default arguments (simple=False) on every sample of every input file
data = examine_spatial_distribution()
```

```
#define table inputs for the non-simple analysis; the three 'Dif' columns follow the order
#returned by find_average_difference(): actual, controlled, random
columns = ['Language', 'Sample', 'Timeframe', 'Dif Actual', 'Dif Constant', 'Dif Random']
title = 'thanks-timeframe' #name (without extension) of the image file the table is saved to
#display names for the table rows, used by make_table()
languages = ['Italian', 'Portuguese', 'Polish', 'Farsi', 'Netherlandic']
```

```
#lays the results out as table rows: language and sample labels first, then the numbers
#(values are left unrounded here)
def make_table(input_files=input_files, samples=samples, data=data, simple=False):
    """Return a list of rows, one list per table row.

    simple=True:  one row per (language, sample): [language, sample, *data[i]].
    simple=False: two rows per (language, sample), one for months and one for
                  days: [language, sample, timeframe, *data[i][j]].

    Rows are generated from the module-level `languages` list and are matched
    to `data` by position; the input_files argument is not used here.
    """
    rows_per_result = 1 if simple else 2
    reformatted_data = []
    for language in languages:
        for position in range(len(samples)):
            #even-positioned samples are labelled as the bottom, odd ones as the top
            sample = 'Bottom 20%' if position % 2 == 0 else 'Top 20%'
            for _ in range(rows_per_result):
                reformatted_data.append([language, sample])
    for i, result in enumerate(data):
        if simple:
            #one row per result: the values go straight after the labels
            for value in result:
                reformatted_data[i].append(value)
            continue
        for j, stats in enumerate(result):
            #each result holds the month statistics first, then the day statistics
            row = reformatted_data[i*2 + j]
            row.append('Months' if j % 2 == 0 else 'Days')
            row.extend(stats)
    return reformatted_data
```

```
#turn the raw results into labelled table rows; this rebinds `data` from the raw results to the rows
data = make_table()
```

```
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
```

```
#draws the table rows as a matplotlib table, saves it to ../figures/<title>.png and displays it
def show_table(data=data, columns=columns, title=title, simple=False):
    """Render `data` (a list of rows) as a coloured table with `columns` as header.

    Numeric cells are rounded to 2 decimals in place, so the rows passed in are
    modified. Label columns are skipped: the first 2 when simple is true,
    otherwise the first 3.
    """
    fig, ax = plt.subplots()
    #hide axes
    ax.axis('off')
    ax.axis('tight')
    #styling -- alternate the two row colours, starting with '#ce7c5f' on the first row
    width = len(data[0])
    colors = [['#ce7c5f' if i % 2 == 0 else '#9691b7'] * width for i in range(len(data))]
    #styling -- round every float, leaving the leading label columns alone
    first_numeric = 2 if simple else 3
    for row in data:
        for j in range(first_numeric, len(row)):
            row[j] = round(row[j], 2)
    df = pd.DataFrame(data, columns=columns)
    table = ax.table(bbox=None, cellText=df.values, cellColours=colors, colColours=['#a06663']*len(columns), colLabels=df.columns, loc='center', cellLoc='center')
    #styling -- get rid of lines in table
    for cell in table.get_celld().values():
        cell.set_linewidth(0)
    fig.tight_layout()
    table.scale(2, 2)
    plt.savefig('../figures/'+title+'.png', bbox_inches='tight')
    plt.show()
```

```
#switch to simple mode (plain averages) for the calls below that pass simple=simple
simple = True
```

```
#re-run the analysis in simple mode: one list of three averages per (input file, sample)
avgs_data = examine_spatial_distribution(simple=simple)
```

```
#turn the simple-mode results into labelled table rows (data is passed explicitly, overriding the default)
avgs_data = make_table(data=avgs_data, simple=simple)
```

```
#define table inputs for the simple-mode table; these rebind the module-level names only --
#show_table()'s default arguments keep the values they had when the function was defined
columns = ['Language', 'Sample', 'Thanks in Year', 'Thanks in Month', 'Thanks in Day']
title = 'thanks-avgs' #name (without extension) of the image file the table is saved to
```

Note: Only people who received at least one thank during the timeframe are represented in the data.

```
#figure 1, simpler analysis: average thanks per person, with the simple-mode rows, columns and title passed explicitly
show_table(avgs_data, columns, title, simple=simple)
```

```
#figure 2, more complicated analysis: called with no arguments, so it uses the defaults captured when
#show_table() was defined (the non-simple rows, columns and title), not the names rebound since then
show_table()
```

Thanks appear to be more clustered than they would be if spread out over random days. In other words, people tend to receive thanks in clusters. Further analysis is needed to know exactly what these clusters look like.