Comparing top read and trending articles

See the research report

Imports and inputs

import getpass
import glob
from IPython.display import display
import ipywidgets as widgets
import json
from operator import itemgetter, getitem
import os
from pprint import pprint
import requests
from random import choice, sample, seed, shuffle
from urllib import parse
import time
#get your username and password for the current PAWS session; used to edit testwiki

uname = os.environ['JPY_USER']
print(uname)
upass = getpass.getpass() #don't print me!
Jtmorgan
········
#input dates for the pageview API query

class date_input():
    def __init__(self, 
                 year = "", 
                 month = "", 
                 day = "",
                ):
        self.year = widgets.Text(description = 'Year (4 digit)',value = year)
        self.month = widgets.Text(description = 'Month (2 digit)',value = month)
        self.day = widgets.Text(description = 'Day (2 digit)',value = day)        
        self.year.on_submit(self.handle_submit)
        self.month.on_submit(self.handle_submit)
        self.day.on_submit(self.handle_submit)
        display(self.year, self.month, self.day)

    def handle_submit(self, text):
        self.v = text.value
        return self.v

print("enter the year, month and day above, then press return in any field")
f = date_input()
enter the year, month and day above, then press return in any field
print("Date for pageviews: " + "/".join([f.year.value, f.month.value, f.day.value]))
Date for pageviews: 2017/07/24

Generate the sample sets

Gather the current top 5 articles by pageviews and trending edits, and their metadata

def api_call(url):
    try:
        call = requests.get(url)
        response = call.json()
    except (requests.RequestException, ValueError): #network error or non-JSON response
        response = None
    return response

Top read articles from yesterday

#https://en.wikipedia.org/api/rest_v1/feed/featured/2017/04/25
endpoint = 'https://en.wikipedia.org/api/rest_v1/feed/featured/{year}/{month}/{day}'

params = {'year' : f.year.value,
            'month' : f.month.value,
            'day' : f.day.value
            }

response = api_call(endpoint.format(**params))

if response:
    top_read = {}
    for a in response['mostread']['articles']:
        title = a['normalizedtitle']
        top_read[title] = {}
        top_read[title]['rank'] = a['rank']
        if 'description' in a.keys():
            top_read[title]['description'] = a['description']
        else:
            top_read[title]['description'] = ''
        if 'originalimage' in a.keys():
            top_read[title]['image url'] = parse.unquote(a['originalimage']['source'])
        else:
            top_read[title]['image url'] = ''            
else:
    print("Error retrieving data from API")
def image_link_parsing(article_list):
    """
    Parse the image filename out of the image url,
    flag articles with non-Commons images, and
    fall back to a generic article icon if no image is available.
    """
    for k,v in article_list.items():
        if len(v['image url']) > 0:
            v['file name'] = v['image url'].split("/")[-1]
        else:
            v['file name'] = 'OOjs_UI_icon_article-rtl.svg'
        if 'commons' in v['image url']:
            v['on commons'] = True
        else:
            v['on commons'] = False

    return article_list
top_read = image_link_parsing(top_read)
def counting_things(article_list):
    print("how many items in the set?")
    print(len(article_list))
    print("\n")
    print("how many have an image url?")
    print(len([k for k,v in article_list.items() if v['image url']]))
    print("\n")
    print("how many have a non-commons image?")
    #note: this count also includes articles with no image at all, since those aren't flagged as on Commons
    print(len([k for k,v in article_list.items() if 'on commons' in v.keys() and not v['on commons']]))
counting_things(top_read)
how many items in the set?
39


how many have an image url?
33


how many have a non-commons image?
23
#sample
pprint(dict(list(top_read.items())[0:2]))
{'Battle of Dunkirk': {'description': 'important battle in the Second World '
                                      'War between the Allies and Germany',
                       'file name': 'Dunkirksoldier1.JPG',
                       'image url': 'https://upload.wikimedia.org/wikipedia/en/8/86/Dunkirksoldier1.JPG',
                       'on commons': False,
                       'rank': 8},
 'Planet of the Apes': {'description': 'science fiction media franchise',
                        'file name': 'Planet_of_the_Apes_(logo).svg',
                        'image url': 'https://upload.wikimedia.org/wikipedia/commons/7/75/Planet_of_the_Apes_(logo).svg',
                        'on commons': True,
                        'rank': 49}}
#save for later
timestr = time.strftime("%Y%m%d-%H%M%S")
with open('data/top_read_{}.json'.format(timestr), 'w') as fout:
    json.dump(top_read,fout,sort_keys = True)
print("last saved version: " + timestr)
last saved version: 20170725-142749

Trending articles

Update 7/17/2017: filters out anything with a trendiness score below 1, and anything with fewer than 5 editors.

#https://en.wikipedia.org/api/rest_v1/feed/trending/edits
endpoint = 'https://en.wikipedia.org/api/rest_v1/feed/trending/edits'

response = api_call(endpoint)

if response:
    trending = {}
    for a in response['pages']:
        if a['editors'] >= 5 and a['trendiness'] >= 1:
            title = a['normalizedtitle']
            trending[title] = {}
            trending[title]['rank'] = a['trendiness']
            if 'description' in a.keys():
                trending[title]['description'] = a['description']
            else:
                trending[title]['description'] = ''
            if 'originalimage' in a.keys():
                trending[title]['image url'] = parse.unquote(a['originalimage']['source'])
            else:
                trending[title]['image url'] = ''
        else:
            pass
else:
    print("Error retrieving data from API")
trending = image_link_parsing(trending)
counting_things(trending) # If there are at least 5 articles left, great. If not, throw an error (a guard sketch follows the counts below).
how many items in the set?
19


how many have an image url?
10


how many have a non-commons image?
12
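
The comment above says we should throw an error if fewer than 5 trending articles survive the filtering, but no check is actually run. A minimal sketch of that guard, assuming a hard stop is what we want here; the threshold constant and error message are illustrative:

#hypothetical guard: stop the notebook early if filtering left fewer
#than 5 trending articles to build a study page from
MIN_ARTICLES = 5 #assumed threshold; matches the 5-item lists built below

if len(trending) < MIN_ARTICLES:
    raise ValueError("Only {} trending articles left after filtering; need at least {}."
                     .format(len(trending), MIN_ARTICLES))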
#sample
pprint(dict(list(trending.items())[0:2]))
{'2017–18 United States network television schedule': {'description': '',
                                                       'file name': 'OOjs_UI_icon_article-rtl.svg',
                                                       'image url': '',
                                                       'on commons': False,
                                                       'rank': 3.0130928849244607},
 'Mitochondrial DNA depletion syndrome': {'description': '',
                                          'file name': 'Autosomal_recessive_-_en.svg',
                                          'image url': 'https://upload.wikimedia.org/wikipedia/commons/f/f1/Autosomal_recessive_-_en.svg',
                                          'on commons': True,
                                          'rank': 2.2671748363841844}}
#save for later
timestr = time.strftime("%Y%m%d-%H%M%S")
with open('data/trending_{}.json'.format(timestr), 'w') as fout:
    json.dump(trending,fout,sort_keys = True)
print("last saved version: " + timestr)
last saved version: 20170725-142749
#how much overlap?
for k in trending.keys():
    if k in top_read.keys():
        print(k)
Justice League (film)
Deaths in 2017
Anthony Scaramucci

Get the top five articles in each set

Convert the dicts into lists of tuples for ranking, then truncate each list to 5 items after filtering out any titles that appear in both top 5s

top_read_sorted = sorted(top_read.items(),key=lambda x:getitem(x[1],'rank'), reverse=False)
#sample
pprint(top_read_sorted[0:2])
[('Chester Bennington',
  {'description': 'American musician',
   'file name': 'Linkin_Park-Rock_im_Park_2014-_by_2eight_3SC0327.jpg',
   'image url': 'https://upload.wikimedia.org/wikipedia/commons/6/6d/Linkin_Park-Rock_im_Park_2014-_by_2eight_3SC0327.jpg',
   'on commons': True,
   'rank': 3}),
 ('Dunkirk (2017 film)',
  {'description': '2017 film by Christopher Nolan',
   'file name': 'Dunkirk_Film_poster.jpg',
   'image url': 'https://upload.wikimedia.org/wikipedia/en/1/15/Dunkirk_Film_poster.jpg',
   'on commons': False,
   'rank': 5})]
trending_sorted = sorted(trending.items(),key=lambda x:getitem(x[1],'rank'), reverse=False)
#sample
pprint(trending_sorted[0:2])
[('Mitochondrial DNA depletion syndrome',
  {'description': '',
   'file name': 'Autosomal_recessive_-_en.svg',
   'image url': 'https://upload.wikimedia.org/wikipedia/commons/f/f1/Autosomal_recessive_-_en.svg',
   'on commons': True,
   'rank': 2.2671748363841844}),
 ('2017–18 United States network television schedule',
  {'description': '',
   'file name': 'OOjs_UI_icon_article-rtl.svg',
   'image url': '',
   'on commons': False,
   'rank': 3.0130928849244607})]
def distinct_lists(my_lists):
    """
    Takes two lists and compares their top 5 items.
    If a title appears in both top 5s, choose one of the lists at random
    and remove the duplicate from it, so the next item in sequence moves up.
    Recheck the lists to see if there are any more dupes.
    """
    atop = my_lists[0][0:5]
    btop = my_lists[1][0:5]
    if not any(i in btop for i in atop):
        return atop, btop
    elif (len(my_lists[0]) > 5 and len(my_lists[1]) > 5):
        dupes = [x for x in atop if x in btop]
        item_to_remove = dupes[0]
        seed()
        list_to_prune = choice(my_lists)
        list_to_prune.remove(item_to_remove)
        return distinct_lists(my_lists)
    else:
        print("ran out of options, couldn't find five distinct items per list") #need to return something, so the script doesn't choke?
#input order matches output order
top5_distinct = distinct_lists(([x[0] for x in top_read_sorted], [x[0] for x in trending_sorted]))

Store top 5 titles for later reference

top5_read_and_trending = {'top read' : top5_distinct[0], 'trending' : top5_distinct[1]}
pprint(top5_read_and_trending)
{'top read': ['Chester Bennington',
              'Dunkirk (2017 film)',
              'Jordan Spieth',
              'John Heard (actor)',
              'Battle of Dunkirk'],
 'trending': ['Mitochondrial DNA depletion syndrome',
              '2017–18 United States network television schedule',
              'Lone Echo',
              'Maynooth University',
              'Patriarchy']}
timestr = time.strftime("%Y%m%d-%H%M%S")
print("last saved version: " + timestr)
with open('data/top5_read_and_trending_{}.json'.format(timestr), 'w') as fout:
    json.dump(top5_read_and_trending,fout,sort_keys = False)
last saved version: 20170725-142749

Build the pages on testwiki

Get the latest version of each datafile

with open(max(glob.glob('data/top5_read_and_trending*'), key=os.path.getctime)) as fin:
    top5 = json.load(fin)
    print(fin)

with open(max(glob.glob('data/top_read*'), key=os.path.getctime)) as fin:
    top_read = json.load(fin)
    print(fin)

with open(max(glob.glob('data/trending*'), key=os.path.getctime)) as fin:
    trending = json.load(fin)
    print(fin)
<_io.TextIOWrapper name='data/top5_read_and_trending_20170725-142749.json' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/top_read_20170725-142749.json' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/trending_20170725-142749.json' mode='r' encoding='UTF-8'>

Remove anything from the full datasets that isn't in the top 5 lists

def filter_by_titles(dict_to_filter, lookup_list):
    """Keep only the entries whose keys appear in lookup_list."""
    return {k:v for k,v in dict_to_filter.items() if k in lookup_list}
top5_read = filter_by_titles(top_read, top5['top read'])
top5_trending = filter_by_titles(trending, top5['trending'])

Convert the top 5 dicts back into lists of tuples sorted by rank

top5_read_sorted = sorted(top5_read.items(),key=lambda x:getitem(x[1],'rank'), reverse=False)
top5_trending_sorted = sorted(top5_trending.items(),key=lambda x:getitem(x[1],'rank'), reverse=False)

Create the study pages on test.wikipedia.org

page_template = """
<!-- {condition} -->

__NOEDITSECTION__
__NOTOC__

{{|cellpadding="5" style="margin-left: auto; margin-right: auto; border-spacing: 10px; background-color: transparent; border-top: solid 2px; border-bottom: solid 2px;"
|- style="border-top: solid 1px; vertical-align:top;"
| [[File:{image_1}|100px]]
| '''{title_1}'''<br/><span style="color:gray">{description_1}</span>
|- style="border-top: solid 1px; vertical-align:top;"
| [[File:{image_2}|100px]]
| '''{title_2}'''<br/><span style="color:gray">{description_2}</span>
|- style="border-top: solid 1px; vertical-align:top;"
| [[File:{image_3}|100px]]
| '''{title_3}'''<br/><span style="color:gray">{description_3}</span>
|- style="border-top: solid 1px; vertical-align:top;"
|[[File:{image_4}|100px]]
| '''{title_4}'''<br/><span style="color:gray">{description_4}</span>
|- style="border-top: solid 1px; vertical-align:top;"
| [[File:{image_5}|100px]]
| '''{title_5}'''<br/><span style="color:gray">{description_5}</span>
|}}

[[Category:Top_articles_user_study_Apr-May_2017]]
"""
def recommendation_section(condition, articles, output_template):
    """
    Take a condition parameter, a list of (ar_title, ar_data_dict) tuples, and a wikitext template.
    Format the template with the condition and the relevant info from each article.
    Articles with no image already carry the placeholder icon filename from image_link_parsing.
    """
    rec_params = {}
    rec_params['condition'] = condition
    for i, x in enumerate(articles):
        rec_params['title_' + str(i + 1)] = x[0]
        rec_params['description_' + str(i + 1)] = x[1]['description']
        rec_params['image_' + str(i + 1)] = x[1]['file name']

    formatted_output = output_template.format(**rec_params)

    return formatted_output
def login_request(baseurl, username, password):

    # Login request
    payload = {'action': 'query', 'format': 'json', 'utf8': '', 'meta': 'tokens', 'type': 'login'}
    r1 = requests.post(baseurl + 'api.php', data=payload)
    
    # login confirm
    login_token = r1.json()['query']['tokens']['logintoken']
    payload = {'action': 'login', 'format': 'json', 'utf8': '', 'lgname': username, 'lgpassword': password, 'lgtoken': login_token}
    r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
    
    return r2
def token_request(baseurl, r2):
    
    # get edit token
    params3 = '?format=json&action=query&meta=tokens&continue='
    r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
    edit_token = r3.json()['query']['tokens']['csrftoken']

    edit_cookie = r2.cookies.copy()
    edit_cookie.update(r3.cookies)
    
    return edit_token, edit_cookie
def publish(page_title, page_content, uname, upass):
    base_url = 'https://test.wikipedia.org/w/'

    login = login_request(base_url, uname, upass)
    edit_token, edit_cookie = token_request(base_url, login)

    summary = 'building test page for [[meta:Research:Comparing_most_read_and_trending_edits_for_Top_Articles_feature|Top articles user study]]'
    headers={'User-Agent' : 'TopArticles user study', 'From' : 'jmorgan@wikimedia.org'}

    # save action
    payload = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '', 'text': page_content,'summary': summary, 'title': page_title, 'token': edit_token}
    r4 = requests.post(base_url + 'api.php', data=payload, cookies=edit_cookie, headers=headers)
    print(r4.text)

    timestr = time.strftime("%Y%m%d-%H%M%S")
    print("Published page: " + page_title + " at " + timestr)
#format the page template
top5_read_output = recommendation_section('top read', top5_read_sorted, page_template)
top5_trending_output = recommendation_section('trending', top5_trending_sorted, page_template)
#should randomly assign which condition goes to which page (see the sketch after these cells)
publish("Top_articles_1", top5_read_output, uname, upass)
{"edit":{"result":"Success","pageid":95788,"title":"Top articles 1","contentmodel":"wikitext","oldrevid":323682,"newrevid":323697,"newtimestamp":"2017-07-25T14:27:52Z"}}
Published page: Top_articles_1 at 20170725-142752
#should randomly assign
publish("Top_articles_2", top5_trending_output, uname, upass)
{"edit":{"result":"Success","pageid":95789,"title":"Top articles 2","contentmodel":"wikitext","oldrevid":323681,"newrevid":323698,"newtimestamp":"2017-07-25T14:27:54Z"}}
Published page: Top_articles_2 at 20170725-142754
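
The two cells above always give the 'top read' condition to Top_articles_1 and 'trending' to Top_articles_2. A minimal sketch of the random assignment the comments call for, reusing publish(); the assignments variable and loop are illustrative:

#hypothetical random assignment of conditions to pages,
#so neither condition always lands on Top_articles_1
seed()
page_titles = ["Top_articles_1", "Top_articles_2"]
shuffle(page_titles) #seed and shuffle are imported from random above

assignments = dict(zip(page_titles, [top5_read_output, top5_trending_output]))
for page_title, page_content in assignments.items():
    publish(page_title, page_content, uname, upass)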

Identify any non-commons images

I will manually upload these to test.wikipedia.org

def non_commons_images(articles):
    """
    Take a list of (ar_title, ar_data_dict) tuples and
    print info on those with images that aren't from Wikimedia Commons.
    """
    for a in articles:
        if len(a[1]['image url']) > 0 and not a[1]['on commons']:
            print(a[0] + "\t" + a[1]['image url'])
non_commons_images(top5_read_sorted)
non_commons_images(top5_trending_sorted)
Dunkirk (2017 film)	https://upload.wikimedia.org/wikipedia/en/1/15/Dunkirk_Film_poster.jpg
Battle of Dunkirk	https://upload.wikimedia.org/wikipedia/en/8/86/Dunkirksoldier1.JPG
Maynooth University	https://upload.wikimedia.org/wikipedia/en/3/3c/NUIM_Symbol.svg
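
The upload step is done by hand so licensing and description templates can be added properly, but for reference, here is a rough sketch of how it could be automated with the API's upload action, reusing the login_request and token_request helpers above. It assumes the account has upload rights on testwiki; the function name and edit comment are illustrative:

#hypothetical sketch: push a non-Commons image to testwiki via action=upload
def upload_image(image_url, file_name, uname, upass):
    base_url = 'https://test.wikipedia.org/w/'
    login = login_request(base_url, uname, upass)
    edit_token, edit_cookie = token_request(base_url, login)

    #download the original file bytes from upload.wikimedia.org
    image_bytes = requests.get(image_url).content

    payload = {'action': 'upload', 'format': 'json', 'filename': file_name,
               'comment': 'image for Top articles user study', 'ignorewarnings': 1,
               'token': edit_token}
    files = {'file': (file_name, image_bytes)}
    r = requests.post(base_url + 'api.php', data=payload, files=files, cookies=edit_cookie)
    print(r.json())

#e.g. upload_image(top_read['Battle of Dunkirk']['image url'],
#                  top_read['Battle of Dunkirk']['file name'], uname, upass)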
timestr = time.strftime("%Y%m%d-%H%M%S")
print("last run: " + timestr)
last run: 20170725-142754