import sys
from pprint import pprint
from math import log, exp
from collections import defaultdict

from paws.EpochFail import wikiquality

# Per-timestamp aggregates built from wikiquality.read_aq('en'). Each value
# holds the row count ('n'), the running sum of 'weighted_sum', the running
# sum of log('weighted_sum'), and a tally of rows per 'prediction' value.
monthly_wiki_quality = {}

for i, row in enumerate(wikiquality.read_aq('en')):
    month = row['timestamp']
    stats = monthly_wiki_quality.get(month)
    if stats is None:
        # First row seen for this timestamp: start from zeroed accumulators.
        stats = {
            'weighted_sum': 0.0,
            'weighted_log_sum': 0.0,
            'n': 0,
            'predictions': defaultdict(int),
        }
        monthly_wiki_quality[month] = stats

    stats['n'] += 1
    score = row['weighted_sum']
    stats['weighted_sum'] += score
    stats['weighted_log_sum'] += log(score)
    stats['predictions'][row['prediction']] += 1

    # Progress indicator on stderr: a dot every 10**5 rows and a line break
    # every 10**7 rows. 10**7 is a multiple of 10**5, so the line-break check
    # only needs to run when a dot has just been written.
    if i % 10**5 == 0:
        sys.stderr.write(".")
        if i % 10**7 == 0:
            sys.stderr.write("\n")
Requirement already satisfied (use --upgrade to upgrade): mysqltsv in /srv/paws/lib/python3.4/site-packages
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
{'page_id': 3204489, 'rev_id': 28847753, 'timestamp': '20051201000000', 'prediction': 'Stub', 'weighted_sum': 0.10413085831157592, 'title': 'Lost Moon'}
.
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
................
# Spot-check the aggregates stored under the '20160101000000' timestamp key.
pprint(monthly_wiki_quality['20160101000000'])
{'n': 5031375,
 'predictions': {'B': 168927,
                 'C': 735552,
                 'FA': 32139,
                 'GA': 134161,
                 'Start': 1648028,
                 'Stub': 2312568},
 'weighted_log_sum': -3506831.5456545097,
 'weighted_sum': 5154727.345714714}
# Largest per-timestamp row count ('n') found in monthly_wiki_quality.
max_n = max(monthly_wiki_quality.values(), key=lambda mq: mq['n'])['n']
# Bare expression so the notebook displays the value.
max_n
5206553
import mysqltsv

# Column order of the output TSV.
headers = ['month', 'possible_n', 'stub_n', 'start_n',
           'c_n', 'b_n', 'ga_n', 'fa_n', 'mean_weighted_sum',
           'geo_mean_weighted_sum']

# Write one row per key of monthly_wiki_quality to
# enwiki.monthly_wiki_quality.tsv.
with open("enwiki.monthly_wiki_quality.tsv", "w") as f:
    writer = mysqltsv.Writer(f, headers=headers)
    # Iterate over sorted keys rather than dict order so the file comes out
    # in the same row order on every run. The key shown in this notebook is
    # a 'YYYYMMDDHHMMSS' string, for which sorted order is chronological --
    # presumably all keys share that format; verify against read_aq().
    for month in sorted(monthly_wiki_quality):
        mq = monthly_wiki_quality[month]
        predictions = mq['predictions']
        # NOTE(review): both means below divide by max_n (also emitted as
        # 'possible_n'), not by this month's own row count mq['n'] --
        # confirm that this denominator is intended, in particular for the
        # geometric mean.
        row = {
            'month': month,
            'possible_n': max_n,
            'stub_n': predictions['Stub'],
            'start_n': predictions['Start'],
            'c_n': predictions['C'],
            'b_n': predictions['B'],
            'ga_n': predictions['GA'],
            'fa_n': predictions['FA'],
            'mean_weighted_sum': mq['weighted_sum'] / max_n,
            'geo_mean_weighted_sum': exp(mq['weighted_log_sum'] / max_n),
        }

        writer.write(row)