%matplotlib inline
import csv
import json
import time
import sys
from concurrent.futures import ThreadPoolExecutor

import requests
import mwapi
import mwtypes
import pandas
import seaborn
# Shared MediaWiki API session pointed at English Wikipedia; the user agent
# string identifies this client to the API.
session = mwapi.Session("https://en.wikipedia.org", user_agent="ahalfaker@wikimedia.org -- IWSC demo")
# Numeric weight assigned to each wp10 class label; used by score2sum to
# collapse a per-class probability distribution into a single number.
WEIGHTS = {
    'Stub': 1,
    'Start': 2,
    'C': 3,
    'B': 4,
    'GA': 5,
    'FA': 6,
}


def score2sum(score_doc):
    """Collapse a score document into one weighted sum.

    Args:
        score_doc: Mapping with a ``'probability'`` entry that maps class
            labels (keys of ``WEIGHTS``) to probabilities, or ``None``.

    Returns:
        The sum of ``WEIGHTS[label] * probability`` over every label in the
        document, or ``None`` when ``score_doc`` is ``None``.

    Raises:
        KeyError: If ``'probability'`` is missing or a label is not in
            ``WEIGHTS``.
    """
    if score_doc is None:
        return None
    probabilities = score_doc['probability']
    return sum(WEIGHTS[label] * p for label, p in probabilities.items())
def fetch_wp10_score(rev_id):
    """Fetch the ORES ``wp10`` score document for one enwiki revision.

    Args:
        rev_id: Revision ID to score; it is interpolated into the request
            URL and used (as a string) to look up the result.

    Returns:
        The ``score`` document found under
        ``['enwiki']['scores'][str(rev_id)]['wp10']`` in the JSON response,
        or ``None`` when the body is not JSON or does not have that shape.
    """
    response = requests.get('https://ores.wikimedia.org/v3/scores/enwiki/{0}/wp10'.format(rev_id))
    try:
        return response.json()['enwiki']['scores'][str(rev_id)]['wp10']['score']
    except (KeyError, TypeError, ValueError):
        # ValueError: the body could not be decoded as JSON.
        # KeyError / TypeError: the decoded document lacks the expected
        # nesting (e.g. ORES returned an error entry instead of a score).
        # Anything else (KeyboardInterrupt, SystemExit, programming errors)
        # is deliberately allowed to propagate instead of being swallowed.
        return None


def fetch_wp10_scores(rev_ids):
    """Fetch wp10 score documents for several revisions concurrently.

    Args:
        rev_ids: Iterable of revision IDs.

    Returns:
        A list with one result of ``fetch_wp10_score`` per revision ID, in
        the same order as ``rev_ids``.
    """
    # The context manager shuts the pool down once all requests finish, so
    # repeated calls do not leave idle worker threads behind. Results are
    # materialized inside the ``with`` because the pool is closed on exit.
    with ThreadPoolExecutor(max_workers=8) as executor:
        return list(executor.map(fetch_wp10_score, rev_ids))

def fetch_historical_scores(page_name):
    """Score every revision of a page, oldest first.

    Queries the revisions of ``page_name`` through the module-level
    ``session`` in batches of up to 100 (``rvdir="newer"``), scores each
    batch with ``fetch_wp10_scores`` and reduces every score document with
    ``score2sum``. One ``.`` per revision and a newline per batch are
    written to stderr as progress output.

    Args:
        page_name: Title passed as ``titles`` to the revisions query.

    Returns:
        A list of dicts with keys ``'rev_id'``, ``'timestamp'`` (an
        ``mwtypes.Timestamp``) and ``'weighted_sum'`` (``None`` when no
        score document was available).
    """
    revision_batches = session.get(
        action='query', prop='revisions', titles=page_name,
        rvprop=['ids', 'timestamp'], rvlimit=100, rvdir="newer",
        formatversion=2, continuation=True)

    records = []
    for batch in revision_batches:
        # Only one title is requested, so the first page entry is used.
        revisions = batch['query']['pages'][0]['revisions']
        scores = fetch_wp10_scores([rev['revid'] for rev in revisions])
        for rev, score in zip(revisions, scores):
            records.append({
                'rev_id': rev['revid'],
                'timestamp': mwtypes.Timestamp(rev['timestamp']),
                'weighted_sum': score2sum(score),
            })
            # Progress marker: one dot per processed revision.
            sys.stderr.write(".")
            sys.stderr.flush()
        sys.stderr.write("\n")

    return records
# Score every revision of "Ann Bishop (biologist)" and load the records into
# a DataFrame (columns: rev_id, timestamp, weighted_sum). Progress dots are
# written to stderr while the scores are fetched.
historical_scores = pandas.DataFrame(fetch_historical_scores("Ann Bishop (biologist)"))
....................................................................................................
..................................................................................
# Index the scores by revision time and plot the weighted sum over time.
# errors='ignore' is intentionally not passed: it made to_datetime hand back
# the unparsed input on any failure (hiding bad timestamps) and is deprecated
# in recent pandas releases. A parse failure now raises instead.
historical_scores['time'] = pandas.to_datetime(
    historical_scores.timestamp, format='%Y-%m-%dT%H:%M:%SZ')
historical_scores = historical_scores.set_index('time')
historical_scores['weighted_sum'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fcf53ce7fd0>
# Same pipeline for the page "Tiririca"; results go into a second DataFrame.
historical_scores2 = pandas.DataFrame(fetch_historical_scores("Tiririca"))
...................................................................
# Index the scores by revision time and plot the weighted sum over time.
# errors='ignore' is intentionally not passed: it made to_datetime hand back
# the unparsed input on any failure (hiding bad timestamps) and is deprecated
# in recent pandas releases. A parse failure now raises instead.
historical_scores2['time'] = pandas.to_datetime(
    historical_scores2.timestamp, format='%Y-%m-%dT%H:%M:%SZ')
historical_scores2 = historical_scores2.set_index('time')
historical_scores2['weighted_sum'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fcf847c15c0>
# Same pipeline for the page "José Sócrates".
# NOTE: this rebinds historical_scores2, so the frame previously built for
# "Tiririca" is no longer reachable under that name after this cell runs.
historical_scores2 = pandas.DataFrame(fetch_historical_scores("José Sócrates"))
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
.......................................................................
# Index the scores by revision time and plot the weighted sum over time.
# errors='ignore' is intentionally not passed: it made to_datetime hand back
# the unparsed input on any failure (hiding bad timestamps) and is deprecated
# in recent pandas releases. A parse failure now raises instead.
historical_scores2['time'] = pandas.to_datetime(
    historical_scores2.timestamp, format='%Y-%m-%dT%H:%M:%SZ')
historical_scores2 = historical_scores2.set_index('time')
historical_scores2['weighted_sum'].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fcf84532198>
# Bare expression: evaluates to the time-indexed frame (the notebook renders
# it as the table that follows).
historical_scores2
rev_id timestamp weighted_sum
time
2001-10-30 20:06:56 252432 2001-10-30T20:06:56Z 1.115514
2001-10-30 20:06:56 799511211 2001-10-30T20:06:56Z 1.100899
2001-10-30 20:16:35 799511212 2001-10-30T20:16:35Z 1.614397
2002-02-25 15:51:15 25241 2002-02-25T15:51:15Z 1.632437
2002-03-07 21:28:46 49193 2002-03-07T21:28:46Z 1.655393
2002-04-09 13:51:34 201174 2002-04-09T13:51:34Z 1.670223
2002-09-07 06:50:41 201421 2002-09-07T06:50:41Z 1.690010
2002-09-07 11:30:20 585968 2002-09-07T11:30:20Z 1.663532
2003-01-14 11:09:21 714423 2003-01-14T11:09:21Z 1.663532
2003-03-02 13:08:07 714504 2003-03-02T13:08:07Z 1.825366
2003-03-02 13:52:43 911386 2003-03-02T13:52:43Z 1.804457
2003-05-12 03:19:43 1306774 2003-05-12T03:19:43Z 1.867823
2003-08-18 05:12:45 1421330 2003-08-18T05:12:45Z 1.882077
2003-09-11 23:24:06 1421331 2003-09-11T23:24:06Z 1.844890
2003-09-11 23:24:24 1424461 2003-09-11T23:24:24Z 1.855906
2003-09-12 18:53:37 1895210 2003-09-12T18:53:37Z 1.882352
2003-12-07 15:36:38 1990977 2003-12-07T15:36:38Z 1.882230
2003-12-18 14:32:18 3460241 2003-12-18T14:32:18Z 1.881871
2004-05-05 16:12:37 3476899 2004-05-05T16:12:37Z 1.892044
2004-05-06 21:44:40 4529951 2004-05-06T21:44:40Z 1.876159
2004-07-09 16:35:41 4675353 2004-07-09T16:35:41Z 1.860340
2004-07-16 18:03:11 4795210 2004-07-16T18:03:11Z 1.943746
2004-07-23 02:39:57 4795244 2004-07-23T02:39:57Z 1.945271
2004-07-23 02:42:26 5112646 2004-07-23T02:42:26Z 1.943746
2004-08-10 03:52:50 5172652 2004-08-10T03:52:50Z 1.962299
2004-08-12 23:23:43 5173623 2004-08-12T23:23:43Z 2.458415
2004-08-13 00:36:05 5626055 2004-08-13T00:36:05Z 2.449890
2004-09-03 10:17:14 5672507 2004-09-03T10:17:14Z 2.470495
2004-09-05 15:42:25 5694830 2004-09-05T15:42:25Z 2.458415
2004-09-06 16:31:44 5694849 2004-09-06T16:31:44Z 2.462040
... ... ... ...
2018-10-25 09:23:28 865654057 2018-10-25T09:23:28Z 4.100183
2018-10-25 09:31:55 865654688 2018-10-25T09:31:55Z 4.103021
2018-10-31 15:58:53 866638738 2018-10-31T15:58:53Z 4.103021
2018-11-21 00:39:15 869885186 2018-11-21T00:39:15Z 4.094013
2018-11-28 06:30:22 870990072 2018-11-28T06:30:22Z 4.089659
2018-12-01 19:53:24 871537876 2018-12-01T19:53:24Z 4.088451
2018-12-19 21:13:42 874529733 2018-12-19T21:13:42Z 4.088451
2019-01-01 22:16:37 876362830 2019-01-01T22:16:37Z 4.088451
2019-01-23 01:54:45 879735229 2019-01-23T01:54:45Z 4.088451
2019-01-23 01:57:42 879735518 2019-01-23T01:57:42Z 4.088451
2019-01-23 01:59:08 879735652 2019-01-23T01:59:08Z 4.088451
2019-01-23 02:39:02 879739726 2019-01-23T02:39:02Z 4.088451
2019-01-24 15:53:26 879974818 2019-01-24T15:53:26Z 4.087522
2019-01-24 15:55:26 879975175 2019-01-24T15:55:26Z 4.087522
2019-02-01 18:36:37 881307345 2019-02-01T18:36:37Z 4.090414
2019-02-05 14:14:03 881894730 2019-02-05T14:14:03Z 4.093697
2019-02-05 14:32:34 881897324 2019-02-05T14:32:34Z 4.173099
2019-02-05 14:33:54 881897496 2019-02-05T14:33:54Z 4.173099
2019-02-05 14:34:41 881897593 2019-02-05T14:34:41Z 4.173420
2019-02-05 14:37:59 881898024 2019-02-05T14:37:59Z 4.173390
2019-02-15 17:24:44 883477821 2019-02-15T17:24:44Z 4.173390
2019-02-24 20:41:02 884913321 2019-02-24T20:41:02Z 4.193871
2019-02-24 20:43:07 884913666 2019-02-24T20:43:07Z 4.193871
2019-02-24 20:58:19 884915931 2019-02-24T20:58:19Z 4.194195
2019-02-24 22:25:13 884927637 2019-02-24T22:25:13Z 4.194195
2019-02-25 22:19:00 885090781 2019-02-25T22:19:00Z 4.194195
2019-02-25 22:20:24 885090955 2019-02-25T22:20:24Z 4.194195
2019-02-25 22:32:34 885092587 2019-02-25T22:32:34Z 4.194195
2019-02-25 22:33:32 885092702 2019-02-25T22:33:32Z 4.194195
2019-02-25 22:48:01 885094577 2019-02-25T22:48:01Z 4.194195

2106 rows × 3 columns