# cf. http://paws-public.wmflabs.org/paws-public/User:Jtmorgan/ds4ux/paws-cheatsheet.ipynb
import os 
"""
Your db login credentials are stored in os.environ. 
DO NOT print or run os.environ, or it will expose your credentials in the Notebook
"""
import pymysql
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates
import seaborn as sns
from collections import defaultdict
import collections
from datetime import datetime
# import data - source: see https://phabricator.wikimedia.org/T167471
df = pd.read_table('~/pagetriage backlog by autoconfirmed as of 2017-06-07.tsv', sep='\t') #,index_col=0
convertfetchalltstodt = lambda x: pd.to_datetime(str(x))
df['created_yyyymmdd'] = df['created_yyyymmdd'].apply(convertfetchalltstodt)
df.set_index('created_yyyymmdd', inplace = True)
df
autoconfirmed unreviewed_count
created_yyyymmdd
2004-01-15 1 1
2005-12-19 1 1
2008-01-26 1 1
2009-03-14 1 1
2010-08-28 1 1
2012-12-05 1 1
2013-05-18 1 1
2013-12-06 1 1
2014-07-08 1 1
2015-02-03 1 1
2015-05-30 1 1
2016-06-17 0 1
2016-09-26 0 1
2016-10-04 1 1
2016-10-14 0 1
2016-10-17 1 1
2016-10-25 1 1
2016-11-10 0 1
2016-11-14 1 1
2016-12-23 1 1
2016-12-25 1 3
2016-12-26 0 4
2016-12-26 1 10
2016-12-27 0 3
2016-12-27 1 50
2016-12-28 0 8
2016-12-28 1 73
2016-12-29 0 11
2016-12-29 1 156
2016-12-30 0 13
... ... ...
2017-05-24 0 34
2017-05-24 1 127
2017-05-25 0 15
2017-05-25 1 116
2017-05-26 0 36
2017-05-26 1 110
2017-05-27 0 23
2017-05-27 1 124
2017-05-28 0 15
2017-05-28 1 74
2017-05-29 0 31
2017-05-29 1 104
2017-05-30 0 28
2017-05-30 1 112
2017-05-31 0 33
2017-05-31 1 104
2017-06-01 0 28
2017-06-01 1 115
2017-06-02 0 33
2017-06-02 1 112
2017-06-03 0 13
2017-06-03 1 110
2017-06-04 0 12
2017-06-04 1 70
2017-06-05 0 46
2017-06-05 1 141
2017-06-06 0 51
2017-06-06 1 179
2017-06-07 0 27
2017-06-07 1 51

349 rows × 2 columns

df.index.rename('page creation date', inplace=True)
df.describe()
autoconfirmed unreviewed_count
count 349.000000 349.000000
mean 0.518625 62.011461
std 0.500370 54.253025
min 0.000000 1.000000
25% 0.000000 16.000000
50% 1.000000 34.000000
75% 1.000000 112.000000
max 1.000000 207.000000
df_pivot = df.pivot(columns='autoconfirmed', values='unreviewed_count')
df_pivot.columns = ['by users who were new (not autoconfirmed) when creating the page', 'by users who were autoconfirmed']
df_pivot
by users who were new (not autoconfirmed) when creating the page by users who were autoconfirmed
page creation date
2004-01-15 NaN 1.0
2005-12-19 NaN 1.0
2008-01-26 NaN 1.0
2009-03-14 NaN 1.0
2010-08-28 NaN 1.0
2012-12-05 NaN 1.0
2013-05-18 NaN 1.0
2013-12-06 NaN 1.0
2014-07-08 NaN 1.0
2015-02-03 NaN 1.0
2015-05-30 NaN 1.0
2016-06-17 1.0 NaN
2016-09-26 1.0 NaN
2016-10-04 NaN 1.0
2016-10-14 1.0 NaN
2016-10-17 NaN 1.0
2016-10-25 NaN 1.0
2016-11-10 1.0 NaN
2016-11-14 NaN 1.0
2016-12-23 NaN 1.0
2016-12-25 NaN 3.0
2016-12-26 4.0 10.0
2016-12-27 3.0 50.0
2016-12-28 8.0 73.0
2016-12-29 11.0 156.0
2016-12-30 13.0 102.0
2016-12-31 5.0 56.0
2017-01-01 8.0 69.0
2017-01-02 17.0 89.0
2017-01-03 13.0 137.0
... ... ...
2017-05-09 45.0 178.0
2017-05-10 34.0 160.0
2017-05-11 19.0 97.0
2017-05-12 30.0 132.0
2017-05-13 20.0 115.0
2017-05-14 32.0 148.0
2017-05-15 19.0 182.0
2017-05-16 31.0 154.0
2017-05-17 48.0 171.0
2017-05-18 32.0 113.0
2017-05-19 34.0 143.0
2017-05-20 31.0 136.0
2017-05-21 18.0 80.0
2017-05-22 18.0 81.0
2017-05-23 47.0 153.0
2017-05-24 34.0 127.0
2017-05-25 15.0 116.0
2017-05-26 36.0 110.0
2017-05-27 23.0 124.0
2017-05-28 15.0 74.0
2017-05-29 31.0 104.0
2017-05-30 28.0 112.0
2017-05-31 33.0 104.0
2017-06-01 28.0 115.0
2017-06-02 33.0 112.0
2017-06-03 13.0 110.0
2017-06-04 12.0 70.0
2017-06-05 46.0 141.0
2017-06-06 51.0 179.0
2017-06-07 27.0 51.0

185 rows × 2 columns

# drop some spurious and/or irrelevant data from before December 2016
df_pivot = df_pivot.drop(df_pivot[df_pivot.index < '2016-12-01'].index)
df_pivot
by users who were new (not autoconfirmed) when creating the page by users who were autoconfirmed
page creation date
2016-12-23 NaN 1.0
2016-12-25 NaN 3.0
2016-12-26 4.0 10.0
2016-12-27 3.0 50.0
2016-12-28 8.0 73.0
2016-12-29 11.0 156.0
2016-12-30 13.0 102.0
2016-12-31 5.0 56.0
2017-01-01 8.0 69.0
2017-01-02 17.0 89.0
2017-01-03 13.0 137.0
2017-01-04 15.0 96.0
2017-01-05 14.0 109.0
2017-01-06 16.0 107.0
2017-01-07 12.0 83.0
2017-01-08 14.0 125.0
2017-01-09 22.0 145.0
2017-01-10 14.0 63.0
2017-01-11 14.0 99.0
2017-01-12 23.0 97.0
2017-01-13 18.0 89.0
2017-01-14 6.0 70.0
2017-01-15 10.0 115.0
2017-01-16 13.0 91.0
2017-01-17 34.0 102.0
2017-01-18 18.0 120.0
2017-01-19 29.0 53.0
2017-01-20 9.0 24.0
2017-01-21 15.0 74.0
2017-01-22 6.0 32.0
... ... ...
2017-05-09 45.0 178.0
2017-05-10 34.0 160.0
2017-05-11 19.0 97.0
2017-05-12 30.0 132.0
2017-05-13 20.0 115.0
2017-05-14 32.0 148.0
2017-05-15 19.0 182.0
2017-05-16 31.0 154.0
2017-05-17 48.0 171.0
2017-05-18 32.0 113.0
2017-05-19 34.0 143.0
2017-05-20 31.0 136.0
2017-05-21 18.0 80.0
2017-05-22 18.0 81.0
2017-05-23 47.0 153.0
2017-05-24 34.0 127.0
2017-05-25 15.0 116.0
2017-05-26 36.0 110.0
2017-05-27 23.0 124.0
2017-05-28 15.0 74.0
2017-05-29 31.0 104.0
2017-05-30 28.0 112.0
2017-05-31 33.0 104.0
2017-06-01 28.0 115.0
2017-06-02 33.0 112.0
2017-06-03 13.0 110.0
2017-06-04 12.0 70.0
2017-06-05 46.0 141.0
2017-06-06 51.0 179.0
2017-06-07 27.0 51.0

166 rows × 2 columns

dfpxlabels = df_pivot
dfpxlabels['xlabel'] = np.nan
for i in dfpxlabels.index:
    if i.day == 1:
        dfpxlabels.loc[i, 'xlabel'] = i.strftime('%Y-%m-%d')
    else:
        dfpxlabels.loc[i, 'xlabel'] = ''
meinplot = df_pivot.plot(kind='bar', stacked=True, width=1, figsize=(16,8))
meinplot.set_xticklabels(dfpxlabels.xlabel, rotation=70)
plt.title('English Wikipedia new page review backlog', fontsize=24)
plt.ylabel('Number of pages created that day\nwhich were still unreviewed on June 7, 2017', fontsize=16)
plt.rc('legend', fontsize=16)
plt.rc('xtick', labelsize=16)
plt.rc('axes', labelsize=16)
plt.annotate(" older pages", xycoords='data', xy=(15,195), xytext=(40,197), fontsize = 15, arrowprops=dict(facecolor='black', shrink=0.05))
plt.annotate("newer pages ", xycoords='data', xy=(150,195), xytext=(105,197), fontsize = 15, arrowprops=dict(facecolor='black', shrink=0.05))
<matplotlib.text.Annotation at 0x7f2778aadcf8>
/srv/paws/lib/python3.4/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))
df_pivot['by users who were autoconfirmed'].sum()
18313.0
df_pivot['by users who were new (not autoconfirmed) when creating the page'].sum()
3310.0
# overall ratio of unreviewed articles by autoconfirmed users
df_pivot['by users who were autoconfirmed'].sum() / (
df_pivot['by users who were autoconfirmed'].sum() + 
df_pivot['by users who were new (not autoconfirmed) when creating the page'].sum() )
0.84692225870600746
df_acpercentage = (df_pivot['by users who were autoconfirmed'] / (df_pivot['by users who were autoconfirmed'] +df_pivot['by users who were new (not autoconfirmed) when creating the page'])).to_frame()
df_acpercentage
0
page creation date
2016-12-23 NaN
2016-12-25 NaN
2016-12-26 0.714286
2016-12-27 0.943396
2016-12-28 0.901235
2016-12-29 0.934132
2016-12-30 0.886957
2016-12-31 0.918033
2017-01-01 0.896104
2017-01-02 0.839623
2017-01-03 0.913333
2017-01-04 0.864865
2017-01-05 0.886179
2017-01-06 0.869919
2017-01-07 0.873684
2017-01-08 0.899281
2017-01-09 0.868263
2017-01-10 0.818182
2017-01-11 0.876106
2017-01-12 0.808333
2017-01-13 0.831776
2017-01-14 0.921053
2017-01-15 0.920000
2017-01-16 0.875000
2017-01-17 0.750000
2017-01-18 0.869565
2017-01-19 0.646341
2017-01-20 0.727273
2017-01-21 0.831461
2017-01-22 0.842105
... ...
2017-05-09 0.798206
2017-05-10 0.824742
2017-05-11 0.836207
2017-05-12 0.814815
2017-05-13 0.851852
2017-05-14 0.822222
2017-05-15 0.905473
2017-05-16 0.832432
2017-05-17 0.780822
2017-05-18 0.779310
2017-05-19 0.807910
2017-05-20 0.814371
2017-05-21 0.816327
2017-05-22 0.818182
2017-05-23 0.765000
2017-05-24 0.788820
2017-05-25 0.885496
2017-05-26 0.753425
2017-05-27 0.843537
2017-05-28 0.831461
2017-05-29 0.770370
2017-05-30 0.800000
2017-05-31 0.759124
2017-06-01 0.804196
2017-06-02 0.772414
2017-06-03 0.894309
2017-06-04 0.853659
2017-06-05 0.754011
2017-06-06 0.778261
2017-06-07 0.653846

166 rows × 1 columns

Scratchpad

type(df_acpercentage)
pandas.core.frame.DataFrame
df_acpercentage.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f277846ceb8>
/srv/paws/lib/python3.4/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))
df_acpercentage.columns[0]
0
sns.regplot(x="page creation date", y=0, data = df_acpercentage)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8543)()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
/srv/paws/lib/python3.4/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2133             try:
-> 2134                 return self._engine.get_loc(key)
   2135             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4363)()

KeyError: 'page creation date'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:8543)()

TypeError: an integer is required

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-48-e62974a433eb> in <module>()
----> 1 sns.regplot(x="page creation date", y=0, data = df_acpercentage)

/srv/paws/lib/python3.4/site-packages/seaborn/linearmodels.py in regplot(x, y, data, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, label, color, marker, scatter_kws, line_kws, ax)
    769                                  order, logistic, lowess, robust, logx,
    770                                  x_partial, y_partial, truncate, dropna,
--> 771                                  x_jitter, y_jitter, color, label)
    772 
    773     if ax is None:

/srv/paws/lib/python3.4/site-packages/seaborn/linearmodels.py in __init__(self, x, y, data, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, color, label)
    108         # Extract the data vals from the arguments or passed dataframe
    109         self.establish_variables(data, x=x, y=y, units=units,
--> 110                                  x_partial=x_partial, y_partial=y_partial)
    111 
    112         # Drop null observations

/srv/paws/lib/python3.4/site-packages/seaborn/linearmodels.py in establish_variables(self, data, **kws)
     53         for var, val in kws.items():
     54             if isinstance(val, string_types):
---> 55                 setattr(self, var, data[val])
     56             else:
     57                 setattr(self, var, val)

/srv/paws/lib/python3.4/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/srv/paws/lib/python3.4/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/srv/paws/lib/python3.4/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/srv/paws/lib/python3.4/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3541 
   3542             if not isnull(item):
-> 3543                 loc = self.items.get_loc(item)
   3544             else:
   3545                 indexer = np.arange(len(self.items))[isnull(self.items)]

/srv/paws/lib/python3.4/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4363)()

KeyError: 'page creation date'