import pandas as pd
from sqlalchemy import create_engine
import os
from datetime import datetime as dt, timedelta as td, date
import time

def utf_8_decode(colstr):
    return colstr.decode('utf-8')

wmfdate_fmt = '%Y%m%d%H%M%S'

def wmftimestamp(datestr):
    decoded = utf_8_decode(datestr)
    return dt.strptime(decoded, wmfdate_fmt)

def to_wmftimestamp(d):
    return d.strftime(wmfdate_fmt)
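
A quick round-trip check of these helpers (the byte string has the same shape the replica DB returns, e.g. b'20190213021136' in the query output below):

ts = wmftimestamp(b'20190213021136')
assert ts == dt(2019, 2, 13, 2, 11, 36)
assert to_wmftimestamp(ts) == '20190213021136'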

from scipy.stats import ttest_ind, chisquare

%pylab inline
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300

CACHE_ROOT = os.getenv("CACHE_DIR", 'cache')

def make_cached_df(cache_sub_dir):
    """a decorator who input is a direcotry under env's CACHE_DIR
    wraps a DataFrame returning function, and uses the repr of the arguments as a keys"""
    # decorator factory
    def cached_df(df_returning_fn):
        # decorator
        def get_with_cache(*args, **kwargs):
            # wrapping function
            cache_dir = os.path.join(CACHE_ROOT, cache_sub_dir)
            os.makedirs(cache_dir, exist_ok=True)
            all_args = args+tuple(kwargs.values())
            str_args = "_".join([a.__name__ if callable(a) else str(a) for a in all_args])
            str_args = str_args.replace('/', '___')
            fname = f'{str_args}'
            fname = fname[:255] # these can get long
            cache_key = os.path.join(cache_dir, fname)
            try:
                return pd.read_pickle(cache_key)
            except FileNotFoundError:
                df = df_returning_fn(*args, **kwargs)
                df.to_pickle(cache_key)
                return df

        return get_with_cache

    return cached_df
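A minimal usage sketch of the decorator (the function and directory names here are hypothetical): the first call computes and pickles the frame under cache/demo/, the second call with identical arguments reads it back.

@make_cached_df('demo')
def squares(n):
    return pd.DataFrame({'x': range(n), 'x_sq': [i * i for i in range(n)]})

df1 = squares(5)   # computes, then writes cache/demo/5
df2 = squares(5)   # served from the pickle
assert df1.equals(df2)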
constr = 'mysql+pymysql://{user}:{pwd}@{host}/DB?charset=utf8'.format(user=os.environ['MYSQL_USERNAME'],
                                                      pwd=os.environ['MYSQL_PASSWORD'],
                                                      host=os.environ['MYSQL_HOST'])
con = create_engine(constr, encoding='utf-8')

con.execute('use enwiki_p;')
experiment_start_date = date(2019, 2, 13)
experiment_end_date = date(2019, 5, 3)

hostbot_hist_sql = f"""select * from revision_userindex 
          join page
          on page.page_id=revision_userindex.rev_page
      where rev_actor = (select actor_id from actor where actor_name='HostBot')
          and rev_timestamp>={to_wmftimestamp(experiment_start_date)} and rev_timestamp<={to_wmftimestamp(experiment_end_date)}
          """

hostbot_user_sql_f = """select * from revision_userindex 
          join page
          on page.page_id=revision_userindex.rev_page
          join user
          on replace(page.page_title, '_', ' ')=user.user_name
      where rev_actor = (select actor_id from actor where actor_name='HostBot')
          and rev_timestamp>={0} and rev_timestamp<={1};"""


hostbot_comparison_sql = '''select user_id, actor_id from user
        join actor
        on actor_user = user_id
        where user_id > 35827517 and user_registration>=20190214000000 and user_registration<=20190512000000;'''

hostbot_user_sql = hostbot_user_sql_f.format('20190213000000', '20190503000000')
hostbot_user_sql_yesteryear = hostbot_user_sql_f.format('20180213000000', '20180503000000')
df = pd.read_sql(hostbot_hist_sql, con)
du = pd.read_sql(hostbot_user_sql, con)
duy = pd.read_sql(hostbot_user_sql_yesteryear, con)
du.head()
[du.head(): 5 rows × 42 columns — revision_userindex joined to page and user; timestamps arrive as byte strings, e.g. rev_timestamp b'20190213021136', user_registration b'20190213013202']

cu = pd.read_sql(hostbot_comparison_sql, con)
du['user_name'] = du['user_name'].apply(utf_8_decode)
du['user_registration'] = du['user_registration'].apply(wmftimestamp)
du['rev_timestamp'] = du['rev_timestamp'].apply(wmftimestamp)
du['day_invited'] = du['rev_timestamp'].apply(lambda d: d.date())
duy['user_name'] = duy['user_name'].apply(utf_8_decode)
duy['user_registration'] = duy['user_registration'].apply(wmftimestamp)
duy['rev_timestamp'] = duy['rev_timestamp'].apply(wmftimestamp)
duy['day_invited'] = duy['rev_timestamp'].apply(lambda d: d.date())
duy.columns
Index(['rev_id', 'rev_page', 'rev_text_id', 'rev_comment_id', 'rev_actor',
       'rev_timestamp', 'rev_minor_edit', 'rev_deleted', 'rev_len',
       'rev_parent_id', 'rev_sha1', 'rev_content_model', 'rev_content_format',
       'page_id', 'page_namespace', 'page_title', 'page_restrictions',
       'page_is_redirect', 'page_is_new', 'page_random', 'page_touched',
       'page_links_updated', 'page_latest', 'page_len', 'page_content_model',
       'page_lang', 'user_id', 'user_name', 'user_real_name', 'user_password',
       'user_newpassword', 'user_email', 'user_options', 'user_touched',
       'user_token', 'user_email_authenticated', 'user_email_token',
       'user_email_token_expires', 'user_registration', 'user_newpass_time',
       'user_editcount', 'user_password_expires', 'day_invited'],
      dtype='object')
duy.shape
(6562, 43)
du.shape
(13746, 43)
edit_titles = set(df['page_title'].values)
user_titles = set(du['page_title'].values)
len(edit_titles), len(user_titles)
(13753, 13746)
edit_titles.difference(user_titles)
{b'Maximilianklein/draft/2605:E000:1317:855B:F9B7:A9F3:F6BF:87A5',
 b'Maximilianklein/draft/Hjrohin',
 b'Maximilianklein/draft/Komadoe',
 b'Maximilianklein/draft/Lukemartens',
 b'Maximilianklein/draft/Premapandiri',
 b'Maximilianklein/draft/VEMBERDOM',
 b'Maximilianklein/draft/test_user'}
# A/B assignment: odd user_ids were invited by the heuristic HostBot, even by HostBot-AI
invited_by_hostbot = du[du['user_id'].apply(lambda x: x%2==1)]
invited_by_hostbot_ai = du[du['user_id'].apply(lambda x: x%2==0)]
len(invited_by_hostbot), len(invited_by_hostbot_ai)
(5605, 8141)
invited_by_hostbot_ai.groupby(by='day_invited').size().hist()
[histogram: distribution of daily AI-invite counts]
invited_by_hostbot.groupby(by='day_invited').size().hist()
[histogram: distribution of daily heuristic-invite counts]

adding in the overflow users

overflow_all = pd.read_csv('data/hostbot_overflow_candidates.csv', parse_dates=['user_registration','created_at'])
overflow = overflow_all[overflow_all['created_at'] > pd.Timestamp('2019-02-13')]
overflow.shape, overflow_all.shape #not that much in the end
((50, 7), (4281, 7))

define retention scores

  • retention scores used in the OpenSym "surviving new editor" paper
  • classic retention scores, but with different periods to mimic the real-world dynamics of the TeaHouse
    • trial --> registration-to-invite, survival --> after
    • trial 1 week, survival {1,3} weeks: a rough measure of the above ^^
    • trial 0 weeks, survival {1,3} weeks
      • (not a good survival metric, but it will include the initial spike) (if this does better than the others, that suggests we are finding editors who are "trendy": initially excited but who don't stick around long-term)
  • Nathan's backfilling retention measure
@make_cached_df('user_histories')
def get_user_history(user_id, user_registration, trial_period_weeks, surviving_period_weeks):
    con.execute('use enwiki_p;')

    user_edits_sql_f = """select rev_timestamp from revision_userindex 
          join page
          on page.page_id=revision_userindex.rev_page
      where rev_actor = (select actor_id from actor where actor_user={user_id})
          and rev_timestamp>={user_registration_date} and rev_timestamp<={survival_end_date}"""
    
    user_registration_date = to_wmftimestamp(user_registration)
    survival_end = user_registration + td(weeks=(trial_period_weeks+surviving_period_weeks))
    survival_end_date = to_wmftimestamp(survival_end)
    
    user_edits_sql = user_edits_sql_f.format(user_id=user_id,
                                            user_registration_date=user_registration_date,
                                            survival_end_date=survival_end_date)
    uf = pd.read_sql(user_edits_sql, con)
    uf['rev_timestamp'] = uf['rev_timestamp'].apply(wmftimestamp)
    return uf
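
A hypothetical call, to show the interface (user_id 12345 is a placeholder, not a real experiment subject): fetch a user's full 24-week history with no trial period; repeat calls hit the pickle cache.

hist = get_user_history(12345, dt(2019, 2, 14), trial_period_weeks=0,
                        surviving_period_weeks=24)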


def is_surviving(user_id, user_registration, trial_period_weeks, surviving_period_weeks, surviving_edits):
    """A user survives if they edit in the trial period and then edit again in the surviving period.
    Returns None if the user never activated (made no edits in the trial period),
    True if surviving, False if activated but non-surviving."""
    uf = get_user_history(user_id, user_registration, trial_period_weeks, surviving_period_weeks)

    survival_end = user_registration + td(weeks=(trial_period_weeks+surviving_period_weeks))
    trial_end = user_registration + td(weeks=trial_period_weeks)
    edits_in_trial = uf[uf['rev_timestamp'] <= trial_end]
    edits_in_survival = uf[(uf['rev_timestamp'] > trial_end) & (uf['rev_timestamp'] <= survival_end)]

    # first classify whether the user activated at all
    user_activated = (len(edits_in_trial) > 0) or (trial_period_weeks == 0)
    if not user_activated:
        return None
    # then classify whether the activated user survived
    return len(edits_in_survival) >= surviving_edits
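
To make the rule concrete, here is a hypothetical self-contained restatement of the same classification over an in-memory list of edit timestamps (no database round-trip); the data below is made up:

def classify_survival(edit_times, registration, trial_weeks, survival_weeks, min_edits):
    # same rule as is_surviving, but over a plain list of datetimes
    trial_end = registration + td(weeks=trial_weeks)
    survival_end = trial_end + td(weeks=survival_weeks)
    in_trial = [t for t in edit_times if t <= trial_end]
    in_survival = [t for t in edit_times if trial_end < t <= survival_end]
    if trial_weeks > 0 and not in_trial:
        return None  # never activated
    return len(in_survival) >= min_edits

reg = dt(2019, 2, 14)
edits = [reg + td(days=1), reg + td(days=30)]
# trial 4 weeks (ends day 28), survival 4 weeks (ends day 56): the day-30 edit counts
assert classify_survival(edits, reg, 4, 4, 1) is True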


    
def num_surviving_days(user_id, user_registration, window_weeks=24):
    """How many days after the user's registration was their last edit,
    within a consideration window of window_weeks after registration?"""
    highest_survival_weeks = window_weeks
    uf = get_user_history(user_id=user_id, user_registration=user_registration, 
                          trial_period_weeks=0, surviving_period_weeks=highest_survival_weeks)
    experiment_window_end = user_registration+td(days=7*highest_survival_weeks)
    edits_in_window = uf[uf['rev_timestamp']<experiment_window_end]['rev_timestamp']
    # no edits in the window (e.g. the user's data was deleted)
    if len(edits_in_window)==0:
        return None
    last_edit_in_window = edits_in_window.max()
    days_survived = (last_edit_in_window - user_registration).days
#     print(f'reg: {user_registration} | last edit {last_edit_in_window} | survived {days_survived}')
    return days_survived


def num_surviving_edits(user_id, user_registration):
    """How many edits did the user make during their first 24 weeks on the site?"""
    highest_survival_weeks = 24
    uf = get_user_history(user_id=user_id, user_registration=user_registration, 
                          trial_period_weeks=0, surviving_period_weeks=highest_survival_weeks)
    experiment_window_end = user_registration+td(days=7*highest_survival_weeks)
    edits_in_window = uf[uf['rev_timestamp']<experiment_window_end]['rev_timestamp']
    # if data is deleted
    if len(edits_in_window)==0:
        return None
    return len(edits_in_window)
du_short = du[:500].copy()
ov_short = overflow[:500].copy()
%%timeit -r 1 -n 1
du_short['survival_days'] = du_short.apply(lambda row: num_surviving_days(user_id=row['user_id'],
                                                                          user_registration=row['user_registration']),
                                                                                                axis=1)
3.02 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
du_short['survival_days'].plot(kind='hist')
[histogram: distribution of survival_days over the first 500 invitees]
%%timeit -r 1 -n 1
du_short['is_surviving_0_1_1'] = du_short.apply(lambda row: is_surviving(
                                                user_id=row['user_id'],
                                                user_registration=row['user_registration'],
                                                trial_period_weeks=0,
                                                surviving_period_weeks=1,
                                                surviving_edits=1), axis=1)
3.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
%%timeit -r 1 -n 1
ov_short['is_surviving_0_1_1'] = ov_short.apply(lambda row: is_surviving(
                                                user_id=row['user_id'],
                                                user_registration=row['user_registration'],
                                                trial_period_weeks=0,
                                                surviving_period_weeks=1,
                                                surviving_edits=1), axis=1)
3.15 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

looping it

# configs are named is_surviving_<trial weeks>_<survival weeks>_<min edits in survival>
retention_configs = {'is_surviving_3_1_1': {'trial_period_weeks':3,
                                           'surviving_period_weeks':1,
                                           'surviving_edits':1},
                     'is_surviving_4_4_1': {'trial_period_weeks':4,
                                           'surviving_period_weeks':4,
                                           'surviving_edits':1},
                     'is_surviving_8_16_1': {'trial_period_weeks':8,
                                           'surviving_period_weeks':16,
                                           'surviving_edits':1},
                     'is_surviving_3_1_5': {'trial_period_weeks':3,
                                           'surviving_period_weeks':1,
                                           'surviving_edits':5},
                     'is_surviving_4_4_5': {'trial_period_weeks':4,
                                           'surviving_period_weeks':4,
                                           'surviving_edits':5},
                     'is_surviving_8_16_5': {'trial_period_weeks':8,
                                           'surviving_period_weeks':16,
                                           'surviving_edits':5},
                     'is_surviving_1_1_1': {'trial_period_weeks':1,
                                           'surviving_period_weeks':1,
                                           'surviving_edits':1},
                     'is_surviving_1_3_1': {'trial_period_weeks':1,
                                           'surviving_period_weeks':3,
                                           'surviving_edits':1},
                    }             
for configname, rconfig in retention_configs.items():
    for targ_df in [du, overflow]:
        if configname in targ_df.columns:
            print(f'already done {configname}')
            continue
        s_time = time.time()
        targ_df[configname] = targ_df.apply(lambda row: is_surviving(
                                                    user_id=row['user_id'],
                                                    user_registration=row['user_registration'],
                                                    trial_period_weeks=rconfig['trial_period_weeks'],
                                                    surviving_period_weeks=rconfig['surviving_period_weeks'],
                                                    surviving_edits=rconfig['surviving_edits']), 
                                                    axis=1)
        e_time = time.time()
        print(f'finished {configname} in {e_time-s_time} seconds')
already done is_surviving_3_1_1
already done is_surviving_3_1_1
already done is_surviving_4_4_1
already done is_surviving_4_4_1
already done is_surviving_8_16_1
already done is_surviving_8_16_1
already done is_surviving_3_1_5
already done is_surviving_3_1_5
already done is_surviving_4_4_5
already done is_surviving_4_4_5
already done is_surviving_8_16_5
already done is_surviving_8_16_5
already done is_surviving_1_1_1
already done is_surviving_1_1_1
already done is_surviving_1_3_1
already done is_surviving_1_3_1
# column suffix is the window length in days (24 weeks = 168 days, etc.)
du['survival_days_168'] = du.apply(lambda row: num_surviving_days(user_id=row['user_id'],
                                                                  user_registration=row['user_registration'],
                                                                  window_weeks=24), axis=1)

du['survival_days_28'] = du.apply(lambda row: num_surviving_days(user_id=row['user_id'],
                                                                 user_registration=row['user_registration'],
                                                                 window_weeks=4), axis=1)

du['survival_days_56'] = du.apply(lambda row: num_surviving_days(user_id=row['user_id'],
                                                                 user_registration=row['user_registration'],
                                                                 window_weeks=8), axis=1)

du['survival_days_14'] = du.apply(lambda row: num_surviving_days(user_id=row['user_id'],
                                                                 user_registration=row['user_registration'],
                                                                 window_weeks=2), axis=1)

du['survival_edits'] = du.apply(lambda row: num_surviving_edits(user_id=row['user_id'],
                                                                user_registration=row['user_registration']),
                                axis=1)
du.to_pickle('cache/du_with_survival.pickle')
overflow.to_pickle('cache/overflow_with_survival.pickle')
du = pd.read_pickle('cache/du_with_survival.pickle')
overflow = pd.read_pickle('cache/overflow_with_survival.pickle')
ai_user_max_predictions = pd.read_csv('data/hostbot_ai_best_user_predictions.csv', index_col='user_name')
def split_invite_sets(du, balance_daily_sample=True, max_daily_invitees=None):
    """Split invitees into the heuristic arm (odd user_id) and the AI arm (even user_id),
    optionally balancing the two samples per invite day."""
    print(f'balancing samples? {balance_daily_sample}')
    if not balance_daily_sample:
        invited_by_hostbot = du[du['user_id'].apply(lambda x: x%2==1)]
        invited_by_hostbot_ai = du[du['user_id'].apply(lambda x: x%2==0)]
        print(f'After not balancing samples heuristic invitees: {len(invited_by_hostbot)}, ai invitees: {len(invited_by_hostbot_ai)}')
        return invited_by_hostbot, invited_by_hostbot_ai        
    
    else:
        # balance the samples by invites per day
        invited_by_hostbot_daily = []
        invited_by_hostbot_ai_daily = []
        
        for invite_day in du['day_invited'].unique():
#             print(invite_day)
            # find out how many heuristic invited, find out how many ai invited (print if it's ai that's smaller)
            du_day = du[du['day_invited']==invite_day]
            heur_day = du_day[du_day['user_id'].apply(lambda x: x%2==1)]
            ai_day = du_day[du_day['user_id'].apply(lambda x: x%2==0)]
            
            # first chop down both to max if set
            if max_daily_invitees:
                heur_day = heur_day.sample(n=max_daily_invitees, random_state=889) if len(heur_day)>=max_daily_invitees else heur_day
                ai_day = ai_day.join(ai_user_max_predictions, on='user_name', how='left')\
                            .sort_values('max(pred_min)', ascending=False)[:max_daily_invitees]
            
            len_heur_day = len(heur_day)
            len_ai_day = len(ai_day)
            # if heuristic invited more just cut it randomly
            if len_heur_day > len_ai_day:
                heur_day = heur_day.sample(n=len_ai_day, random_state=889)
            # if ai invited more cut it by best predicted
            if len_ai_day > len_heur_day:
                ai_day_pred = ai_day.join(ai_user_max_predictions, on='user_name', how='left') if 'max(pred_min)' not in ai_day.columns else ai_day
                ai_day_pred_sorted = ai_day_pred.sort_values('max(pred_min)', ascending=False)
                # take the best predicted users from that day
                ai_day = ai_day_pred_sorted[:len_heur_day]
            
            # append the daily samples
            invited_by_hostbot_daily.append(heur_day)
            invited_by_hostbot_ai_daily.append(ai_day)

        invited_by_hostbot = pd.concat(invited_by_hostbot_daily, sort=False)
        invited_by_hostbot_ai = pd.concat(invited_by_hostbot_ai_daily, sort=False)
        print(f'After balancing samples heuristic invitees: {len(invited_by_hostbot)}, ai invitees: {len(invited_by_hostbot_ai)}')
        return invited_by_hostbot, invited_by_hostbot_ai
invited_by_hostbot, invited_by_hostbot_ai = split_invite_sets(du, balance_daily_sample=True, max_daily_invitees=None)
balancing samples? True
After balancing samples heuristic invitees: 5485, ai invitees: 5485
test_results_t = {}
test_results_chi ={}
for configname, rconfig in retention_configs.items():
    heur_ret_per = invited_by_hostbot[configname].sum() /len(invited_by_hostbot)
    ai_ret_per = invited_by_hostbot_ai[configname].sum() / len(invited_by_hostbot_ai)
    overflow_ret_per = overflow[configname].sum() / len(overflow)
    ai_heur_improvement_per = ai_ret_per - heur_ret_per
    ai_control_improvement_per = ai_ret_per - overflow_ret_per
    
    nulls_hb = invited_by_hostbot[pd.isnull(invited_by_hostbot[configname])]
    nulls_hb_ai = invited_by_hostbot_ai[pd.isnull(invited_by_hostbot_ai[configname])]
    nulls_overflow = overflow[pd.isnull(overflow[configname])]
    print(f'dropping {len(nulls_hb)} from dataset from hostbot for {configname}')
    print(f'dropping {len(nulls_hb_ai)} from dataset from hostbot_ai for {configname}')
    print(f'dropping {len(nulls_overflow)} from dataset from overflow for {configname}')
    
    invited_by_hostbot_nn = invited_by_hostbot[pd.notnull(invited_by_hostbot[configname])]
    invited_by_hostbot_ai_nn = invited_by_hostbot_ai[pd.notnull(invited_by_hostbot_ai[configname])]
    overflow_nn = overflow[pd.notnull(overflow[configname])]

    
    ttest_res = ttest_ind(invited_by_hostbot_nn[configname], invited_by_hostbot_ai_nn[configname])
    tstat = ttest_res.statistic
    pval = ttest_res.pvalue
    
    ttest_overflow = ttest_ind(invited_by_hostbot_ai_nn[configname], overflow_nn[configname])
    tstat_overflow = ttest_overflow.statistic
    pval_overflow = ttest_overflow.pvalue
    
#     chi_sq_res = chisquare([invited_by_hostbot[configname], invited_by_hostbot_ai[configname]])
    
#     print(configname, heur_ret_per, ai_ret_per, tstat, pval)
    test_results_t[configname] = {**rconfig,
                                  'heuristic_retention_perc': heur_ret_per*100,
                                  'ai_retention_perc': ai_ret_per*100,
                                  'ai_control_retention_perc': overflow_ret_per*100,
                                  'ai_heur_per_diff': ai_heur_improvement_per*100,
                                  'ai_control_per_diff': ai_control_improvement_per*100,
                                  'ai_heur_t_stat': tstat,
                                  'ai_heur_p_value': pval,
                                  'ai_control_t_stat': tstat_overflow,
                                  'ai_control_p_value': pval_overflow,
                                  }
tf = pd.DataFrame.from_dict(test_results_t, orient='index')
pd.options.display.float_format = '{:,.3f}'.format
tf
tf[['ai_heur_per_diff','ai_heur_p_value']].to_html()
p=tf[['heuristic_retention_perc','ai_retention_perc']].plot(kind='bar', figsize=(8,4))
p.set_title("Comparison of Retention Rates between HostBot-Heuristic and HostBot-AI \n invites to TeaHouse")
p.set_ylabel("% invited editors retained")
handles, labels = p.get_legend_handles_labels()
p.legend(handles, ['HostBot-Heuristic','HostBot-AI'])
p.set_xlabel("Retention measure (*p<0.05)")
# p.set_xticklabels(['1 month, 1 edit*','1 month, 5 edits','2 months, 1 edit*','2 months, 5 edits*', '3 months, 1 edit', '3 months, 5 edits'])
plt.xticks(rotation=20)
survival_test = {}
for measure in ['survival_days_14','survival_days_28','survival_days_56','survival_days_168', 'survival_edits']:


    nulls_hb_soph = invited_by_hostbot[pd.isnull(invited_by_hostbot[measure])]
    nulls_hb_ai_soph = invited_by_hostbot_ai[pd.isnull(invited_by_hostbot_ai[measure])]
#     print(f'dropping {len(nulls_hb)} from dataset from hostbot for survival_days')
#     print(f'dropping {len(nulls_hb_ai)} from dataset from hostbot_ai for survival_days')

    invited_by_hostbot_nn = invited_by_hostbot[pd.notnull(invited_by_hostbot[measure])]
    invited_by_hostbot_ai_nn = invited_by_hostbot_ai[pd.notnull(invited_by_hostbot_ai[measure])]
    
    
    ## drop the top 1% as outliers
    heur_99 = invited_by_hostbot_nn[measure].quantile(q=0.99)
    ai_99 = invited_by_hostbot_ai_nn[measure].quantile(q=0.99)
#     print(f'Heuristic 99th percentile is {heur_99}. AI 99th percentile is {ai_99}')
    invited_by_hostbot_nn_99 = invited_by_hostbot_nn[invited_by_hostbot_nn[measure]<heur_99]
    invited_by_hostbot_ai_nn_99 =  invited_by_hostbot_ai_nn[invited_by_hostbot_ai_nn[measure]<ai_99]
    
    print(f'dropping 99th perc, len goes from {len(invited_by_hostbot_nn)} to {len(invited_by_hostbot_nn_99)}')
    print(f'dropping 99th perc, len goes from {len(invited_by_hostbot_ai_nn)} to {len(invited_by_hostbot_ai_nn_99)}')

    heur_mean_survival = invited_by_hostbot_nn_99[measure].mean()
    ai_mean_survival = invited_by_hostbot_ai_nn_99[measure].mean()
    ai_heur_improvement = ai_mean_survival - heur_mean_survival


    ttest_res_soph = ttest_ind(invited_by_hostbot_nn_99[measure], invited_by_hostbot_ai_nn_99[measure])
    tstat_soph = ttest_res_soph.statistic
    pval_soph = ttest_res_soph.pvalue

    survival_test[measure] = {'heur_measure': heur_mean_survival,
                              'ai_measure': ai_mean_survival,
                              'ai_heur_improvement': ai_heur_improvement,
                              'tstat': tstat_soph,
                              'pval': pval_soph}
dropping 99th perc, len goes from 5340 to 5108
dropping 99th perc, len goes from 4961 to 4763
dropping 99th perc, len goes from 5350 to 5185
dropping 99th perc, len goes from 4974 to 4783
dropping 99th perc, len goes from 5356 to 5251
dropping 99th perc, len goes from 4976 to 4882
dropping 99th perc, len goes from 5358 to 5300
dropping 99th perc, len goes from 4989 to 4930
dropping 99th perc, len goes from 5358 to 5304
dropping 99th perc, len goes from 4989 to 4939
soph_survival = pd.DataFrame.from_dict(survival_test, orient='index')
pd.options.display.float_format = '{:,.3f}'.format
soph_survival = soph_survival.loc[['survival_days_14', 'survival_days_28', 'survival_days_56', 'survival_days_168']]
soph_survival.to_html()
                   heur_measure  ai_measure  ai_heur_improvement   tstat   pval
survival_days_14          1.776       1.712               -0.064   0.976  0.329
survival_days_28          4.070       4.527                0.457  -3.014  0.003
survival_days_56          8.363      10.107                1.744  -5.595  0.000
survival_days_168        22.456      20.780               -1.676   2.121  0.034
p=soph_survival[['heur_measure','ai_measure']].plot(kind='bar', figsize=(7,2))
p.set_title("Comparison of Retention Days between HostBot-Heuristic and HostBot-AI \n invites to TeaHouse")
p.set_ylabel("average number of surviving days")
handles, labels = p.get_legend_handles_labels()
p.legend(handles, ['HostBot-Heuristic','HostBot-AI'], loc='upper center')
p.set_xlabel("Retention measure (*p<0.05)")
# p.set_xticklabels(['1 month, 1 edit*','1 month, 5 edits','2 months, 1 edit*','2 months, 5 edits*', '3 months, 1 edit', '3 months, 5 edits'])
plt.xticks(rotation=20)
[bar chart: average surviving days per measure, HostBot-Heuristic vs HostBot-AI]
#two binary series regression method
paired_binary_series = {}
for week_i in range(0,23):
    survival_start_range = week_i * 7
    survival_end_range = survival_start_range + 6
    heur_surviving = invited_by_hostbot[invited_by_hostbot['survival_days_168']>survival_end_range]
    heur_surviving_perc = len(heur_surviving) / len(invited_by_hostbot)
    ai_surviving = invited_by_hostbot_ai[invited_by_hostbot_ai['survival_days_168']>survival_end_range]
    ai_surviving_perc = len(ai_surviving) / len(invited_by_hostbot_ai)
    paired_binary_series[week_i] = {'heur_surviving_perc':heur_surviving_perc,
                                   'ai_surviving_perc':ai_surviving_perc}
    
survival3 = pd.DataFrame.from_dict(paired_binary_series).T
survival3
ai_surviving_perc heur_surviving_perc
0 0.341841 0.332543
1 0.302461 0.286600
2 0.269644 0.253965
3 0.244303 0.229717
4 0.207657 0.208751
5 0.184139 0.192525
6 0.162261 0.177211
7 0.142753 0.165360
8 0.120146 0.154057
9 0.105196 0.140930
10 0.092434 0.130720
11 0.084230 0.120328
12 0.073838 0.110301
13 0.066727 0.101185
14 0.060164 0.093163
15 0.056700 0.086964
16 0.050866 0.079672
17 0.045943 0.072744
18 0.039745 0.063446
19 0.033728 0.056153
20 0.028259 0.046855
21 0.022242 0.036645
22 0.014768 0.022789
s3=survival3[['heur_surviving_perc', 'ai_surviving_perc']].plot(kind='line', figsize=(6,2))
s3.set_title("Comparison of Retention Rates between HostBot-Heuristic and HostBot-AI \n invites to TeaHouse")
s3.set_xlabel("Weeks elapsed for invitees")
s3.set_ylabel("% of invitees surviving\n(>1 edit)")
handles, labels = s3.get_legend_handles_labels()
s3.legend(handles, ['HostBot-Heuristic', 'HostBot-AI'])
[line chart: weekly % of invitees surviving, HostBot-Heuristic vs HostBot-AI]

making account periods df

# from sklearn.linear_model import LogisticRegression
# # this logistic regression is wrong: it should be fit on a table where y is the
# # binary surviving outcome and X is the weeks, not on the aggregated percentages
# clf = LogisticRegression().fit(y=survival3['ai_surviving_perc'].values, X=survival3.index.values.astype(float).reshape(-1,1))
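
A hedged sketch of the correction that comment describes: one row per (user, week) with a binary survived indicator, fit on the individual outcomes rather than the aggregated percentages. This reshaping is my guess at the intended setup, not code that was run:

from sklearn.linear_model import LogisticRegression

rows = []
for week_i in range(0, 23):
    cutoff_day = week_i * 7 + 6
    for days in invited_by_hostbot_ai['survival_days_168'].fillna(-1):
        rows.append({'week': week_i, 'survived': int(days > cutoff_day)})
long_df = pd.DataFrame(rows)
clf = LogisticRegression().fit(X=long_df[['week']], y=long_df['survived'])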

Blog post

Wikimedia's ORES platform has long used AI to score revisions on their quality. However, there is no way to score a user rather than their individual edits, which is why I created a meta-classifier built on top of ORES to judge a whole session. One application of being able to judge users from their edits is that after just a few edits we can start to identify which users look strong and give them personalized mentoring. Such a bot already exists on Wikipedia, known as "HostBot", which each day invites recently joined editors to a mentoring forum called the "TeaHouse". In order not to overwhelm the human respondents at the TeaHouse, HostBot uses heuristics to pick a select few. I built an AI-powered version of HostBot to perform the same function, and with community blessing we conducted a 3-month A/B test between HostBot and HostBot-AI. Here are those results.

Classifier Measure

  • goodfaith scores of sessions: we take the minimum per session (this makes it strict). Is it influencing results? See the sketch below.
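
A sketch of the user-level score as I read it from the 'max(pred_min)' column joined in split_invite_sets (the per-edit scores below are hypothetical): per session take the minimum per-edit goodfaith prediction, then the user's best session wins.

sessions = [[0.91, 0.85, 0.88], [0.95, 0.40]]  # hypothetical per-edit goodfaith scores
pred_min = [min(s) for s in sessions]          # strict per-session score: [0.85, 0.40]
user_score = max(pred_min)                     # best session wins: 0.85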

Restricting Samples for Balance

  • Both heuristic and AI invite a variable number of users per day (maximum 150). I balanced the sample by restricting the number of included comparison users to match whichever method had the fewest invitees on that day. I restricted the heuristic arm by randomly downsampling, and the AI arm by selecting the top-n by our final measure (see the sketch after this list). This improved AI's performance in the comparison without hurting statistical significance.
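
A condensed sketch of that per-day balancing, restating split_invite_sets above (column names as in the analysis code):

# per invite day: cut the larger arm down to the smaller one;
# the heuristic arm randomly, the AI arm by keeping its best-predicted users
for invite_day in du['day_invited'].unique():
    day = du[du['day_invited'] == invite_day]
    heur = day[day['user_id'] % 2 == 1]
    ai = day[day['user_id'] % 2 == 0]
    n = min(len(heur), len(ai))
    heur = heur.sample(n=n, random_state=889)
    ai = ai.join(ai_user_max_predictions, on='user_name', how='left')\
           .sort_values('max(pred_min)', ascending=False)[:n]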

Retention measures

The number one concern for Wikipedia is to "retain" newcomers, that is, to have them continue editing after their initial joining; this is also known as "survival". The official retention measures from the Wikimedia Foundation are defined by a "trial period" (the initial spurt), a "survival period" (when they return), and how many edits in the survival period count as a return ([see a detailed explanation](https://meta.wikimedia.org/wiki/Research:Surviving_new_editor)). We looked at 10 retention measures for different parameter definitions of retention. Finally, for each measure, we conduct an independent t-test between the retention data (binary outcomes) of HostBot versus HostBot-AI, as sketched below. The table below outlines the results.
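
As a minimal sketch of the per-measure test (the retention outcomes below are hypothetical draws; the sample size matches the balanced arms above):

import numpy as np
from scipy.stats import ttest_ind

np.random.seed(889)
heur = np.random.binomial(1, 0.20, size=5485)  # hypothetical binary retention outcomes
ai = np.random.binomial(1, 0.23, size=5485)
res = ttest_ind(heur, ai)
print(res.statistic, res.pvalue)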

2015 Paper Measures

  • read each measure as: trial weeks after registration / survival window weeks / edits required.
                     ai_heur_per_diff  ai_heur_p_value
is_surviving_1_1_1             -1.076            0.467
is_surviving_1_3_1              0.912            0.026
is_surviving_3_1_1              1.750            0.000
is_surviving_3_1_5              0.620            0.053
is_surviving_4_4_1              2.935            0.000
is_surviving_4_4_5              2.425            0.000
is_surviving_8_16_1            -2.261            0.010
is_surviving_8_16_5            -1.459            0.026

[bar chart: retention rates per measure, HostBot-Heuristic vs HostBot-AI]

-- Interpretation: on the short- and medium-term measures AI does better, but the heuristic does better in the long term.

Survival in Window

  • how many days after the user's registration was their last edit, within a consideration window of window_weeks after registration

                       heur_measure  ai_measure  ai_heur_improvement   tstat   pval
    survival_days_14          1.776       1.712               -0.064   0.976  0.329
    survival_days_28          4.070       4.527                0.457  -3.014  0.003
    survival_days_56          8.363      10.107                1.744  -5.595  0.000
    survival_days_168        22.456      20.780               -1.676   2.121  0.034
    survival_edits           20.850      15.146               -5.704   8.730  0.000

[bar chart: average surviving days, HostBot-Heuristic vs HostBot-AI]

Survival % over Time

[line chart: % of invitees surviving over 23 weeks, HostBot-Heuristic vs HostBot-AI]

Notes:

  • I lowered the thresholds to try to invite more users because we were getting significantly fewer than the 150 per day. But HostBot was also inviting fewer users, so it may have been a drier period for new users. We can change the threshold of the AI to try to achieve a constant invite rate.