Introduction

This notebook contains an analysis of the thanks given and received by different types of editors (those with high average edit counts vs. those with low average edit counts).

SQL Queries

Thanks by year

  • gets the number of thanks given/received in a year as well as the edit counts of the people responsible for them

-- Per-user edit counts and thanks counts for 2017-06-01 (inclusive) to 2018-06-01 (exclusive).
use nowiki_p;

-- Output columns: user ID, number of edits, number of thanks (0 when no thanks rows match the user).
select A.rev_user as ID, A.num_edits as Edits, coalesce(B.num_thanks, 0) as Thanks

-- C: id/name pairs from the user table.
from (select user_id, user_name from user) as C

-- A: number of revisions per rev_user inside the window; rev_user = 0 is skipped
-- (presumably anonymous edits -- confirm against the schema).
join (select rev_user, rev_user_text, count(rev_user) as num_edits from revision where rev_timestamp < timestamp('2018-06-01') and rev_timestamp >= timestamp('2017-06-01') and rev_user != 0 group by rev_user order by count(rev_user)) as A on A.rev_user = C.user_id

-- B: number of 'thank' log entries per log_user_text inside the same window.
-- The left join keeps users with no thanks; a thanks row matches when its name equals
-- either user.user_name or revision.rev_user_text.
left join (select log_user_text, count(log_user_text) as num_thanks from logging_userindex where log_action = 'thank' and log_type='thanks' and log_timestamp < timestamp('2018-06-01') and log_timestamp >= timestamp('2017-06-01') group by log_user_text) as B on B.log_user_text = C.user_name or B.log_user_text = A.rev_user_text

-- Ascending edit count; the Python percentile grouping consumes the rows in this order.
order by Edits;

Note: log_user_text is a username (not an ID), which could produce some inaccuracy. I decided to use log_user_text because there's no ID representation of log_title, and I wanted to be consistent between my thanks-given and thanks-received data.

Note: We use fourteen languages for this study (three more than the eleven-language sample from studies 1-4).

import csv
# Number of percentile groups on the x-axis; with 20 groups each group spans 5 percentiles.
num_groups = 20
# Sizing results that find_group_size() overwrites; -1 is only the initial placeholder.
# remainder holds the rows left over when file_size is not a multiple of the group count.
file_size = -1
group_size = -1
remainder = -1
def find_group_size(input_file, n):
    """Count the data rows of a CSV file and derive the percentile-group sizing.

    Nothing is returned. The results are stored in three module-level globals:

    * ``file_size``  -- number of data rows (the header row is not counted),
    * ``group_size`` -- ``int(file_size / n)``, the base number of rows per group,
    * ``remainder``  -- rows left over after filling ``n`` groups of ``group_size``.

    Args:
        input_file: path of a UTF-8 CSV file with a header row.
        n: number of groups to split the rows into.
    """
    global file_size, group_size, remainder
    file_size = 0
    with open(input_file, 'rt', encoding='utf-8') as handle:
        # Using the global as the loop target keeps a running row count.
        for file_size, _ in enumerate(csv.DictReader(handle), start=1):
            pass
    group_size = int(file_size / n)
    remainder = file_size - n * group_size
def get_header():
    """Return a fresh accumulator dict for one percentile group, with every total at zero."""
    return dict(num_edits=0, num_thanks=0, group_size=0)
#splits a row count into contiguous percentile-group slices
def _group_bounds(total, num_groups):
    """Yield ``(group_index, start, stop)`` slices splitting ``total`` rows into ``num_groups`` groups."""
    base, extra = divmod(total, num_groups)
    start = 0
    for index in range(num_groups):
        # The last `extra` groups each absorb one of the leftover rows.
        size = base + 1 if index >= num_groups - extra else base
        yield index, start, start + size
        start += size


#reformat (and consolidate) the data
def reformat_csv(input_files, output_file, num_groups, column_name):
    """Bucket the rows of each input CSV into percentile groups and write per-group averages.

    Every input file is split, in file order, into ``num_groups`` contiguous
    groups. When the row count is not a multiple of ``num_groups`` the last
    ``remainder`` groups each take one extra row. Group ``j`` of every file is
    accumulated into the same output group ``j``, so several files are
    consolidated into one output file.

    A file with fewer rows than ``num_groups`` fills only the last groups; its
    rows are no longer dropped, and it can no longer trigger an ``IndexError``
    when it is the first input. Groups that received no rows at all are left
    out of the output because they have no average.

    Args:
        input_files: paths of UTF-8 CSV files with integer ``Edits`` and
            ``Thanks`` columns.
        output_file: path of the CSV to write; its columns are ``edits_avg``
            and ``column_name``.
        num_groups: number of percentile groups.
        column_name: header of the second output column. With ``'ratio'`` the
            column holds average thanks divided by average edits; with any
            other value it holds average thanks.

    Raises:
        ZeroDivisionError: if ``num_groups`` is 0, or if ``column_name`` is
            ``'ratio'`` and a group's average edit count is 0.

    Side effects:
        Sets the module-level ``file_size``, ``group_size`` and ``remainder``
        to the sizing of the last input file processed.
    """
    global file_size, group_size, remainder
    #data contains data from all input files, consolidated into one output file at end
    data = []
    for input_file in input_files:
        # Read the file once; the row count and the grouping both come from this list.
        with open(input_file, 'rt', encoding='utf-8') as csvfile:
            rows = [(int(row['Edits']), int(row['Thanks']))
                    for row in csv.DictReader(csvfile)]
        file_size = len(rows)
        group_size, remainder = divmod(file_size, num_groups)

        for index, start, stop in _group_bounds(file_size, num_groups):
            if start == stop:
                continue
            # Grow on demand so any file, not just the first, can open a new group.
            while len(data) <= index:
                data.append({'num_edits': 0, 'num_thanks': 0, 'group_size': 0})
            chunk = rows[start:stop]
            group = data[index]
            group['num_edits'] += float(sum(edits for edits, _ in chunk))
            group['num_thanks'] += float(sum(thanks for _, thanks in chunk))
            group['group_size'] += stop - start

    # newline='' lets the csv module control line endings (no blank rows on Windows).
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['edits_avg', column_name])
        writer.writeheader()
        for group in data:
            if not group['group_size']:
                continue
            edits_avg = group['num_edits'] / group['group_size']
            value = group['num_thanks'] / group['group_size']
            #value is an average, can be a ratio based on what is passed
            if column_name == 'ratio':
                value /= edits_avg
            writer.writerow({'edits_avg': edits_avg, column_name: value})
#define filenames
# Language prefixes; each is combined with src and a stem to form a per-language file path.
languages = 'No Th Sv Ko Ar He Uk Fa Nl Pl Pt It Es De'.split()
# Exactly one src line should be active; it selects which data set is processed and plotted.
#src = '(1-6)-data/thanks-received/' #to plot thanks-received data
src = '(1-6)-data/thanks-given/' #to plot thanks-given data
# Suffix of the input files, and suffix of the ratio output files.
input_stem, output_stem = 'Data.csv', '-Ratios.csv'
#calls reformat_csv() on all languages separately
def make_csvs_by_language(column_name, output_stem, languages=languages, src=src, input_stem=input_stem, num_groups=None):
    """Write one grouped CSV per language by calling reformat_csv() on each input file.

    For every language prefix the input path is ``src + prefix + input_stem``
    and the output path is ``src + prefix + output_stem``.

    Args:
        column_name: name of the second output column, passed to reformat_csv().
        output_stem: suffix of each output file name.
        languages: language prefixes to process.
        src: directory prefix shared by the input and output paths.
        input_stem: suffix of each input file name.
        num_groups: number of percentile groups. ``None`` (the default) uses
            the module-level ``num_groups`` as it is at call time.
    """
    if num_groups is None:
        # The parameter shadows the module-level name, so read the global explicitly.
        num_groups = globals()['num_groups']
    for language in languages:
        # reformat_csv() sizes each file itself, so no separate sizing pass is needed.
        reformat_csv([src + language + input_stem], src + language + output_stem, num_groups, column_name)
#calls reformat_csv() on all languages together
def combine_csvs(column_name, output_stem, languages=languages, src=src, input_stem=input_stem, num_groups=num_groups):
    """Consolidate every language's input file into one grouped CSV.

    The input paths are ``src + prefix + input_stem`` for each language prefix
    and the single output path is ``src + output_stem``. Returns whatever
    reformat_csv() returns.
    """
    paths = [src + language + input_stem for language in languages]
    return reformat_csv(paths, src + output_stem, num_groups, column_name)
# The commented-out calls below (re)generate the CSV files read by the plotting code.
# Uncomment and run them when the output files need to be created.
#only run once
#make_csvs_by_language('ratio', output_stem)

#combine_csvs('ratio', 'All-Data'+output_stem)
# From here on output_stem names the absolute-average files instead of the ratio files.
output_stem = '-Absolute.csv'
#only run once
#make_csvs_by_language('avg', output_stem)

#combine_csvs('avg', 'All-Data'+output_stem)
import plotly.plotly as py
#run pip install plotly from terminal or this will not work
import plotly.tools as tls
from plotly.graph_objs import *
import numpy as np
import csv
#note: plotly requires you to set-up an account
def get_csv_data(filepath, col_id):
    """Read a CSV file, skip its first row, and return the requested cells as a NumPy array.

    Args:
        filepath: path of the CSV file to read.
        col_id: index of the column to collect. The special value -1 collects
            every cell of every row, flattened row by row.

    Returns:
        A NumPy array of the collected cells; the cells are kept as strings.
    """
    collected = []
    with open(filepath, 'r') as source:
        records = csv.reader(source)
        next(records, None)  # discard the header row, if there is one
        for record in records:
            if col_id == -1:
                collected.extend(record)
            else:
                collected.append(record[col_id])
    return np.array(collected)

# x-axis labels for the graphs; filled in place by make_percentiles().
percentiles = []
def make_percentiles(n):
    """Append ``n`` labels ("0th", then multiples of ``int(100 / n)``) to ``percentiles``.

    The module-level list is extended in place, so calling this again adds
    another ``n`` labels after the existing ones.
    """
    step = int(100 / n)
    percentiles.extend(str(step * k) + "th" for k in range(n))
        
# Fill `percentiles` with one label per percentile group.
make_percentiles(num_groups)

def make_trace(x, y, color, text):
    """Build a plotly Scatter trace drawn as lines plus markers.

    Args:
        x: x values of the trace.
        y: y values of the trace.
        color: marker color.
        text: text attached to the points.

    Returns:
        A Scatter whose markers use ``color`` and have a white outline of width 2.5.
    """
    outline = Line(color='white', width=2.5)
    marker = Marker(color=color, line=outline)
    return Scatter(x=x, y=y, text=text, marker=marker, mode='lines+markers')

def make_layout(title, ylabel, rnge):
    """Build the plotly Layout shared by the graphs.

    Args:
        title: graph title.
        ylabel: title of the y axis.
        rnge: range of the y axis.

    Returns:
        A Layout with the legend hidden, a y axis without a zero line and with
        white grid lines, and the same grey for the paper and plot backgrounds.
    """
    background = 'rgb(233,233,233)'
    y_axis = YAxis(title=ylabel, range=rnge, zeroline=False, gridcolor='white')
    return Layout(
        title=title,
        showlegend=False,
        yaxis=y_axis,
        paper_bgcolor=background,
        plot_bgcolor=background,
    )
# Common beginning of every graph title; a language name is appended after " in ".
title = (
    "Thanks to Edits Ratios<br>"
    " By Percentile Based on Edits (June 2017-June 2018) <br>"
    " in "
)
# Name under which the per-language graph is stored on plotly.
file_stem = "sample-ratio-graph" #do not change this or you will overload your (or my) plotly account
#make graph
def make_figures(languages, title, title_ending, file_stem, output_stem, ylabel):
    """Build one plotly figure per language from its grouped CSV file.

    For each prefix in ``languages`` the file ``src + prefix + output_stem`` is
    read. Column 1 supplies the y values, column 0 the point text, and the
    module-level ``percentiles`` the x values.

    Args:
        languages: file-name prefixes to plot (language codes or 'All-Data').
        title: beginning of each graph title.
        title_ending: text appended to the title after the prefix.
        file_stem: name stored alongside each figure.
        output_stem: suffix of the CSV files to read.
        ylabel: title of the y axis.

    Returns:
        A list of ``[figure, file_stem]`` pairs, one per prefix, in input order.
    """
    figures = []
    for language in languages:
        path = src + language + output_stem
        ratios = get_csv_data(path, 1)
        hover_text = get_csv_data(path, 0)
        data = Data([
            make_trace(percentiles, ratios, '#E3BA22', hover_text)
        ])
        # get_csv_data() returns strings, and max() on strings compares them
        # character by character ('9.5' > '10.2'). Convert to float first so
        # the axis range is based on the largest number.
        ymax = max(float(value) for value in ratios) * 1.1
        layout = make_layout(title + language + title_ending, ylabel, [0, ymax])
        figures.append([Figure(data=data, layout=layout), file_stem])
    return figures
import warnings
# NOTE: with no category or message argument this filter hides every warning, not only plotly's.
warnings.filterwarnings('ignore')
#turns off plotly deprecation warnings

Note: The data below will be for either thanks given or thanks received, depending on which `src` path is selected in the filename definitions near the top of the notebook.

# Build one ratio figure per language from the '-Ratios.csv' files.
figures = make_figures(languages, title, " Wikipedia", file_stem, '-Ratios.csv', "Ratio of Thanks Given to Edits")
# Index 4 picks the fifth entry of `languages`; change the index to view another language.
py.iplot(figures[4][0], filename=figures[4][1]) #see graph of a language individually
# Ratio figure for the consolidated 'All-Data' file.
figures = make_figures(['All-Data'], title, '', file_stem, '-Ratios.csv', "Ratio of Thanks Given to Edits")
py.iplot(figures[0][0], filename='amalg-ratio-graph') #see a graph of ratios (for combined data)
# Absolute-average figure for the consolidated 'All-Data' file, read from '-Absolute.csv'.
figures = make_figures(['All-Data'], "Thanks to Edits Averages<br> By Percentile Based on Edits (June 2017-June 2018) <br> in ", '', file_stem, '-Absolute.csv', "Thanks Given to Edits")
py.iplot(figures[0][0], filename='amalg-avgs-graph') #see a graph of absolute values (for combined data)
Conclusion:

Although the most active editors both send and receive the most thanks, they have the lowest thanks to edits ratios (they send or receive fewer thanks with respect to their edit count).