Introduction

This is an exploration of the ratios of thanks given/editors and thankers/editors since the "beginning of time" (when the thanks feature was rolled out). The data is meant to give a sense of the thanks feature's usage in different communities.

SQL Queries

  • This is a Hive query (it would be too slow to run as a regular SQL query)

-- thanks_counts: aggregates the log rows with log_type = 'thanks' and log_action = 'thank' for one wiki,
--   with log_timestamp from 2013-06-01 (inclusive) up to 2018-06-01 (exclusive).
--   `var` is a placeholder rather than a column; see the note below the query.
-- revisions_counts: counts the distinct users (event_user_id) that created a revision on the same wiki
--   over the same period. NOTE(review): the alias num_revs therefore holds a number of users, not of revisions.
-- The final SELECT joins the two aggregates on the wiki name.
WITH thanks_counts AS ( SELECT COUNT(var) as num_thanks, wiki_db as wiki FROM wmf_raw.mediawiki_logging WHERE log_action ='thank' AND log_type='thanks' AND log_timestamp < '20180601000000' AND log_timestamp >= '20130601000000' AND wiki_db = 'bgwiki' AND snapshot = '2018-05' GROUP BY wiki_db ),

revisions_counts AS( SELECT COUNT(DISTINCT event_user_id) as num_revs, wiki_db as wiki FROM wmf.mediawiki_history WHERE event_entity = 'revision' AND event_type = 'create' AND event_timestamp < '2018-06-01 00:00:00' AND event_timestamp >= '2013-06-01 00:00:00' AND wiki_db = 'bgwiki' AND snapshot = '2018-05' GROUP BY wiki_db )

SELECT t.wiki, num_thanks, num_revs FROM thanks_counts t JOIN revisions_counts r ON (t.wiki = r.wiki) ;

Note: `var` in the query above is a placeholder. Use `*` to count thanks (num_thanks) and `DISTINCT log_user` to count thankers (num_thankers).

Note: We use all projects for this study

import csv

#define filenames
#src is a path prefix that is concatenated directly with the file names below (hence the trailing slash)
src = '(1-5)-data/'
#tab-separated inputs; the code below reads column 0 as the project name and divides column 1 by column 2
#(presumably thanks / editors for file1 and thankers / editors for file2 -- confirm against the data files)
file1 = 'num_thanks_by_num_editors.tsv'
file2 = 'num_thankers_by_num_editors.tsv'
#name of the csv file that order_file is meant to produce
output_file = 'projects_by_thankers_ratio.csv'
#creates the output_file, an ordering of the wiki projects by their thanker/editor ratio
def order_file(input_file, output_file):
    """Write the projects of input_file to output_file, sorted by ascending ratio.

    input_file is a tab-separated file whose first row is a header; every
    other row holds a project name, a numerator count and a denominator
    count. Blank rows are ignored. If a project appears more than once, only
    its last row is kept.

    output_file is written as a csv with the columns 'Project', 'Ratio'
    (numerator / denominator) and 'Thankers' (the numerator).

    Raises ValueError if a count is not an integer and ZeroDivisionError if a
    denominator is 0.
    """
    ratios_dict = {}
    with open(input_file) as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines, e.g. a trailing newline
                continue
            numerator = int(row[1])
            ratios_dict[row[0]] = (float(numerator) / int(row[2]), numerator)
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by an extra blank line on Windows
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['Project', 'Ratio', 'Thankers']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        ordered = sorted(ratios_dict.items(), key=lambda item: item[1][0])
        for project, (ratio, numerator) in ordered:
            writer.writerow({'Project': project, 'Ratio': ratio, 'Thankers': numerator})
#order_file(src+file2, src+output_file)
#The next six functions set up the data for the graphs below
#get a sampling of the data
def get_sample(input_file, sample):
    """Return the rows of the tab-separated input_file selected by sample.

    Rows are numbered from -1, so the first row of the file (the header) has
    index -1 and the first data row has index 0. A row is returned when its
    index is contained in sample; rows keep their order in the file.
    """
    with open(input_file) as tsvfile:
        numbered_rows = enumerate(csv.reader(tsvfile, delimiter='\t'), start=-1)
        return [row for index, row in numbered_rows if index in sample]
import random

#makes a random list of rows to be sampled and returns them
def get_random_sample(input_file, sample_size=10):
    """Return up to sample_size randomly chosen data rows of input_file.

    input_file is a tab-separated file whose first row is a header; the
    header is never part of the sample. The rows are returned in file order
    (see get_sample), each as a list of strings.

    When the file holds fewer than sample_size data rows, every data row is
    returned instead of raising ValueError; an empty or header-only file
    yields an empty list.
    """
    with open(input_file) as tsvfile:
        total_rows = sum(1 for _ in csv.reader(tsvfile, delimiter='\t'))
    # the first row is the header, so it does not count as a data row
    data_rows = max(total_rows - 1, 0)
    # random.sample raises ValueError when asked for more items than exist
    sample = random.sample(range(data_rows), min(sample_size, data_rows))
    return get_sample(input_file, sample)
#returns a random sample of the data (formatted as ratios rather than absolute numbers)
def get_formatted_sample(input_file):
    """Return a dict mapping each sampled row's first column to column 1 / column 2.

    The rows come from get_random_sample(input_file) with its default sample
    size; both counts are parsed as integers and divided as floats.
    """
    sampled_rows = get_random_sample(input_file)
    return {row[0]: int(row[1])*1.0/int(row[2]) for row in sampled_rows}
def thanks_ratios_generator():
    """Return a random sample of ratios read from the file1 dataset under src."""
    thanks_path = src + file1
    return get_formatted_sample(thanks_path)

def thankers_ratios_generator():
    """Return a random sample of ratios read from the file2 dataset under src."""
    thankers_path = src + file2
    return get_formatted_sample(thankers_path)
#reformat data once more so that it is easy to plot
def convert_to_plottable(frequency_dict):
    """Split frequency_dict into two parallel lists.

    Returns [keys, values], where values[i] is frequency_dict[keys[i]] and
    the keys appear in the dict's iteration order.
    """
    labels = list(frequency_dict)
    values = [frequency_dict[label] for label in labels]
    return [labels, values]
#run pip install plotly in terminal
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
import numpy as np
import math
#note: plotly requires you to set-up an account
def make_trace(x, y, color):
    """Return a plotly Bar trace of y against x.

    The bars are filled with color and outlined in white with a width of 2.5.
    """
    outline = Line(color='white', width=2.5)
    bar_marker = Marker(color=color, line=outline)
    return Bar(x=x, y=y, marker=bar_marker)

def make_layout(title, xlabel, rnge):
    """Return a plotly Layout titled title, without a legend.

    rnge is used as the y-axis range and xlabel as the x-axis title. Both
    the paper and the plot background are set to the same light grey.
    """
    background = 'rgb(233,233,233)'
    y_axis = YAxis(range=rnge, zeroline=False, gridcolor='white')
    x_axis = XAxis(title=xlabel)
    return Layout(
        title=title,
        showlegend=False,
        yaxis=y_axis,
        xaxis=x_axis,
        paper_bgcolor=background,
        plot_bgcolor=background,
    )
#makes the graph
def make_figure(wikis, frequencies, title, xlabel):
    """Return a plotly Figure with one bar per wiki.

    wikis and frequencies are parallel sequences: the bar labels and the
    bar heights. title and xlabel are passed on to the layout.

    The y axis runs from 0 to the largest frequency rounded to its leading
    digit plus one unit of that digit (0.29 -> 0.3 + 0.1 = 0.4), which leaves
    some headroom above the tallest bar. When the largest frequency is 0 the
    axis runs from 0 to 1.

    Raises ValueError if frequencies is empty.
    """
    data = Data([
        make_trace(wikis, frequencies, '#ffb3cc'),
    ])
    ymax = max(frequencies)
    if ymax == 0:
        # log10(0) is undefined (math.log10 raises ValueError), so fall back
        # to a fixed upper bound when every bar has height 0
        ymax = 1
    else:
        # sf = number of decimals that keeps exactly the leading digit of ymax
        sf = -int(math.floor(math.log10(abs(ymax))))
        ymax = round(ymax, sf) + 10 ** (-sf)
    layout = make_layout(title, xlabel, [0, ymax])
    return Figure(data=data, layout=layout)
import warnings
#turns off plotly deprecation warnings
#note: no category or module is given, so this silences every warning raised from here on, not only plotly's
warnings.filterwarnings('ignore')

Code to run for samples

(every time it's run it outputs a different sample of wiki projects)

#plot thanks/editors
#draw a fresh random sample of projects, then split it into parallel lists for plotting
thanks_ratios_dict = thanks_ratios_generator()
thanks_ratios = convert_to_plottable(thanks_ratios_dict)

title = "Thanks Frequencies Across Wikipedias <br> June 2013 - June 2018"
xlabel = "Projects"
#thanks_ratios[0] holds the project names (bar labels), thanks_ratios[1] the matching ratios (bar heights)
figure = make_figure(thanks_ratios[0], thanks_ratios[1], title, xlabel)

#presumably sends the figure to the plotly account under this filename and shows it inline -- confirm
py.iplot(figure, filename='thanks-frequencies-random-sample')
#plot thankers/editors
#draw a fresh random sample of projects, then split it into parallel lists for plotting
thankers_ratios_dict = thankers_ratios_generator()
thankers_ratios = convert_to_plottable(thankers_ratios_dict)

#note: title, xlabel and figure are module-level names that are reassigned here
title = "Thankers Frequencies Across Wikipedias <br> June 2013 - June 2018"
xlabel = "Projects"
#thankers_ratios[0] holds the project names (bar labels), thankers_ratios[1] the matching ratios (bar heights)
figure = make_figure(thankers_ratios[0], thankers_ratios[1], title, xlabel)

#presumably sends the figure to the plotly account under this filename and shows it inline -- confirm
py.iplot(figure, filename='thankers-frequencies-random-sample')
#computes a few metrics (standard deviation, mean)
def find_stats(input_file):
    """Return [std, mean] of the per-row ratios in the tab-separated input_file.

    The first row is treated as a header and skipped. For every other row
    the ratio is column 1 divided by column 2, both parsed as integers. The
    standard deviation is the one computed by np.std with its default
    arguments.
    """
    with open(input_file) as tsvfile:
        rows = csv.reader(tsvfile, delimiter='\t')
        next(rows, None)  # drop the header row
        ratios = [int(row[1])*1.0/int(row[2]) for row in rows]
    average = np.mean(ratios)
    spread = np.std(ratios)
    return [spread, average]
def compare_stats(src=src, file1=file1, file2=file2):
    """Print the mean and standard deviation of both ratio datasets.

    src is the path prefix; file1 is reported as the thanks-to-editors
    dataset and file2 as the thankers-to-editors dataset. Both figures are
    rounded to two decimals.
    """
    thanks_std, thanks_mean = find_stats(src + file1)
    print("The mean for the thanks-to-editors dataset is " + str(round(thanks_mean, 2)) + " and the standard deviation is " + str(round(thanks_std, 2)))
    thankers_std, thankers_mean = find_stats(src + file2)
    print("The mean for the thankers-to-editors dataset is " + str(round(thankers_mean, 2)) + " and the standard deviation is " + str(round(thankers_std, 2)))
def compare_coefficients_of_variation(src=src, file1=file1, file2=file2):
    """Print the coefficient of variation of both ratio datasets.

    src is the path prefix; file1 is reported as the thanks-to-editors
    dataset and file2 as the thankers-to-editors dataset.
    """
    thanks_cv = find_cv(src, file1)
    print("The coefficient of variation for the thanks-to-editors dataset is " + str(thanks_cv))
    thankers_cv = find_cv(src, file2)
    print("The coefficient of variation for the thankers-to-editors dataset is " + str(thankers_cv))
#coefficient of variation = the standard deviation as a percentage of the mean
def find_cv(src, file_name):
    """Return the coefficient of variation of the ratios in src + file_name.

    The value is the standard deviation times 100 divided by the mean, as
    reported by find_stats, rounded to two decimals.
    """
    std, mean = find_stats(src+file_name)
    return round(std * 100.0/mean, 2)

What follows is a metric for comparing the spreads of the two datasets:

compare_coefficients_of_variation()
The coefficient of variation for the thanks-to-editors dataset is 188.15
The coefficient of variation for the thankers-to-editors dataset is 72.49

Also, here are the means and standard deviations:

compare_stats()
The mean for the thanks-to-editors dataset is 0.29 and the standard deviation is 0.54
The mean for the thankers-to-editors dataset is 0.03 and the standard deviation is 0.02

Conclusion

There appears to be a fair amount of variation in both datasets; however, the thankers/editors ratios are decisively more consistent. This suggests that the number of thanks over the number of editors varies greatly even as the percentage of people involved in sending thanks does not.