Introduction

These are the official results for the dropout-threshold project, which is thus far an analysis of editor dropout across the Korean, Hindi, and Finnish Wikipedias. The goal is to decide how long an editor must be inactive before they can be considered to no longer be part of the Wikipedia community. To determine this, we look at the probability that an editor returns in the following month after x months of inactivity, and check whether this behavior differs between more experienced and less experienced editors.

# Labels for the tenure groups, ordered from shortest to longest tenure. The
# trailing -1 is a sentinel rather than a label: make_figures() checks for it
# and leaves the tenure suffix off the chart title for that series.
num_editors_by_tenure_labels = [
    '<2 months',
    '2-6 months',
    '6-12 months',
    '1-2 years',
    '2-5 years',
    '5+ years',
    -1,
]

# Boundaries for the tenure groups (presumably in months, going by the labels
# above -- confirm against the code that assigns editors to groups).
b1, b2, b3, b4, b5 = 2, 6, 12, 24, 60

# Start date from which the month labels are generated.
start_month = 1
start_day = 1
start_year = 2017
# Number of monthly CSV files read per language and number of month labels.
timeframe = 13

# Per-language data directory and file stem; the files are addressed as
# <dir>/<stem><i>.csv for i in 1..timeframe.
hi_file_src = "dropout-threshold-data/hindi-data"
hi_output_file_stem = "Month"

ko_file_src = "dropout-threshold-data/korean-data"
ko_output_file_stem = "Month"

fi_file_src = "dropout-threshold-data/finnish-data"
fi_output_file_stem = "Month"

# Chart text and the base name under which the figures are saved.
title = "Retention Probabilities in Hi, Ko, Fi Wikipedias <br> (from January 2017)"
ylabel = "P(dropped editor returning next month)"
filename = 'retention-graph'

# Don't change the following w/o changing get_tenure_times() and
# correct_timestamp_format()
timestamp_format = "%Y%m%d"
def make_editors_by_tenure(n):
    """Return a new list of ``n`` zero counters.

    A non-positive ``n`` yields an empty list.
    """
    return [0] * n
# The tenure-group boundaries collected into one list, in ascending order.
boundaries = [b1, b2, b3, b4, b5]
# One zeroed counter per tenure group: the boundaries split tenure into
# len(boundaries) + 1 groups.
num_editors_by_tenure = make_editors_by_tenure(len(boundaries)+1)
def add_file_src(file_src, file_stem):
    """Return ``file_stem`` prefixed with ``file_src`` and a "/" separator."""
    return "/".join((file_src, file_stem))

# Prefix each language's file stem with its data directory. These lines rebind
# the *_output_file_stem names in place, so running them a second time would
# prepend the directory again.
hi_output_file_stem = add_file_src(hi_file_src, hi_output_file_stem)
ko_output_file_stem = add_file_src(ko_file_src, ko_output_file_stem)
fi_output_file_stem = add_file_src(fi_file_src, fi_output_file_stem)
import csv

def get_threshold_data(input_files):
    """Read the 'Edits' column of several CSV files into per-row series.

    Every file is opened as UTF-8 text and parsed with ``csv.DictReader``.
    Row ``j`` of file ``i`` becomes element ``i`` of series ``j``, so each
    returned series holds one integer per input file. The rows of the first
    file define how many series there are.

    A final series is appended that holds, for each file position, the sum
    over all the per-row series.

    Returns:
        A list of lists of ints: one list per CSV row, followed by the
        element-wise total.
    """
    series_per_row = []
    for file_index, path in enumerate(input_files):
        with open(path, 'rt', encoding='utf-8') as handle:
            for row_index, record in enumerate(csv.DictReader(handle)):
                if file_index == 0:
                    # The first file creates one series per row.
                    series_per_row.append([int(record['Edits'])])
                else:
                    series_per_row[row_index].append(int(record['Edits']))

    # Element-wise sum across all the row series.
    totals = []
    for series_index, series in enumerate(series_per_row):
        for position, value in enumerate(series):
            if series_index == 0:
                totals.append(value)
            else:
                totals[position] += value
    series_per_row.append(totals)
    return series_per_row

def make_files_lst(n, output_file_stem):
    """Return the file names ``<output_file_stem><i>.csv`` for i = 1..n."""
    return [output_file_stem + str(number) + ".csv" for number in range(1, n + 1)]
# Load each language's table from its <stem>1.csv .. <stem><timeframe>.csv
# files: one series per CSV row plus a final series of their totals. The values
# come from the 'Edits' column (presumably dropout counts, going by the
# *_drpts names -- confirm against the code that writes the CSVs).
ko_drpts = get_threshold_data(make_files_lst(timeframe, ko_output_file_stem))
hi_drpts = get_threshold_data(make_files_lst(timeframe, hi_output_file_stem))
fi_drpts = get_threshold_data(make_files_lst(timeframe, fi_output_file_stem))
import math

def generate_months(start_month, start_year, n):
    """Return ``n`` consecutive "month/year" labels.

    The first label is the month *after* ``start_month`` of ``start_year``;
    the year rolls over after December. Months are written without zero
    padding, e.g. "2/2017".
    """
    labels = []
    for offset in range(1, n + 1):
        position = start_month + offset
        # A remainder of 0 means December.
        month = position % 12 or 12
        year = start_year + math.floor((position - 1) / 12)
        labels.append("{}/{}".format(month, year))
    return labels
# Month labels for the charts; make_figures() plots against months[:-1].
months = generate_months(start_month, start_year, timeframe)
def amalgamate_dropout_data(data):
    """Combine the tables of several languages by element-wise addition.

    ``data`` is a list of tables, each a list of rows of numbers. The first
    table is copied, and every later table is added onto that copy position
    by position. The input tables are not modified.

    Returns:
        A new table with the shape of the first table in ``data``, or an
        empty list when ``data`` is empty.
    """
    combined = []
    for table_index, table in enumerate(data):
        for row_index, row in enumerate(table):
            if table_index == 0:
                # Seed the result with a copy of the first table's row.
                combined.append(list(row))
                continue
            for position, value in enumerate(row):
                combined[row_index][position] += value
    return combined
# Element-wise sum of the Korean, Hindi and Finnish tables.
drpts = amalgamate_dropout_data([ko_drpts, hi_drpts, fi_drpts])
def to_probabilities(drpts):
    """Convert a series of counts into step-to-step decrease fractions.

    For each consecutive pair ``(current, following)`` the result holds
    ``(current - following) / current``, i.e. the fraction of ``current``
    that is no longer present at the next step. Negative fractions (the count
    grew) are clamped to 0.

    A ``current`` count of 0 has no defined fraction; it is reported as 0.0
    instead of raising ``ZeroDivisionError``, so that one empty month does
    not abort the whole analysis.

    Returns:
        A list with ``len(drpts) - 1`` entries (empty when ``drpts`` has
        fewer than two elements).
    """
    drpts_p = []
    for current, following in zip(drpts, drpts[1:]):
        if current == 0:
            # Nothing to lose from an empty group: avoid dividing by zero.
            drpts_p.append(0.0)
            continue
        p = (current - following) * 1.0 / current
        if p < 0:
            p = 0
        drpts_p.append(p)
    return drpts_p
def convert_to_probabilities(drpts):
    """Apply ``to_probabilities`` to every series in ``drpts``.

    Returns a new list holding one converted series per input series, in the
    same order.
    """
    return [to_probabilities(series) for series in drpts]

# One series of month-to-month fractions per row of the combined table.
drpt_probabilities = convert_to_probabilities(drpts)
import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
import numpy as np
def make_trace(x, y, color):
    """Build a bar trace of ``y`` against ``x``.

    The bars are filled with ``color`` and outlined in white with a line
    width of 2.5.
    """
    outline = Line(color='white', width=2.5)
    bar_marker = Marker(color=color, line=outline)
    return Bar(x=x, y=y, marker=bar_marker)

def make_layout(title, ylabel, rnge):
    """Build the chart layout.

    Args:
        title: Chart title.
        ylabel: Title of the y axis.
        rnge: Value passed as the y axis ``range``.

    The legend is hidden, the y axis has no zero line and white grid lines,
    and both the paper and plot backgrounds are the same light grey.
    """
    background = 'rgb(233,233,233)'
    y_axis = YAxis(
        title=ylabel,
        range=rnge,
        zeroline=False,
        gridcolor='white',
    )
    return Layout(
        title=title,
        showlegend=False,
        yaxis=y_axis,
        paper_bgcolor=background,
        plot_bgcolor=background,
    )
def make_figures(drpt_probabilities):
    """Build one bar-chart figure per series in ``drpt_probabilities``.

    Series ``i`` is plotted against ``months[:-1]`` and titled with the
    module-level ``title`` plus the matching entry of
    ``num_editors_by_tenure_labels``; a label of -1 leaves the title without
    a tenure suffix. The y axis runs from 0 to the largest value of the
    series.

    Returns:
        A list of ``[figure, name]`` pairs, where ``name`` is the
        module-level ``filename`` followed by the 1-based series number.
    """
    built = []
    for position, series in enumerate(drpt_probabilities):
        bars = Data([
            make_trace(months[:-1], series, '#465573'),
        ])
        label = num_editors_by_tenure_labels[position]
        # -1 marks the series that gets no tenure suffix in its title.
        heading = title if label == -1 else title + "<br> Tenure " + label
        layout = make_layout(heading, ylabel, [0, max(series)])
        figure = Figure(data=bars, layout=layout)
        built.append([figure, filename + str(position + 1)])
    return built
# Build the figures, then render each one. Every entry of `figures` is a
# [figure, filename] pair. Indices 0-6 assume seven series: one per labelled
# tenure group plus the total series that get_threshold_data() appends last.
# The calls are kept as separate statements rather than a loop -- presumably
# each was its own notebook cell so that its output is displayed.
figures = make_figures(drpt_probabilities)
py.iplot(figures[0][0], filename=figures[0][1])
py.iplot(figures[1][0], filename=figures[1][1])
py.iplot(figures[2][0], filename=figures[2][1])
py.iplot(figures[3][0], filename=figures[3][1])
py.iplot(figures[4][0], filename=figures[4][1])
py.iplot(figures[5][0], filename=figures[5][1])
py.iplot(figures[6][0], filename=figures[6][1])

Markov Chain Analysis

Using the overall data (combined across both tenure groups and languages), I made a transition matrix with 13 states (Active Editor, 1 month gone, 2 months gone, ..., 11 months gone, Dropped Out), with Dropped Out being an absorbing state. This matrix can be used to compute the fundamental matrix $N = (I - Q)^{-1}$, where $Q$ is the transition matrix restricted to the non-absorbing states. Entry $N_{ij}$ is the expected number of times an editor currently in state $i$ will be in state $j$ before dropping out. Based on this, if an editor has not been active for 6 months, we can expect them to be in the Active Editor state 0.2 times. If an editor has not been active for 3 months, we can expect them to be in the Active Editor state 0.42 times, and if they have not been active for 11 months, 0.02 times.