This analysis begins with exploratory SQL queries run directly against Labs DB, the live replicas of the Wikimedia databases available to developers:

https://wikitech.wikimedia.org/wiki/Help:Tool_Labs/Database

Luckily the process is at least less painful thanks to Yuvi Panda's excellent Quarry public querying interface:

https://meta.wikimedia.org/wiki/Research:Quarry
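
For anyone running these queries outside of PAWS or Quarry, here is a minimal sketch of talking to the replica directly with pymysql. The host and database names ("enwiki.labsdb", "enwiki_p") and the ~/replica.my.cnf credentials file follow the usual Tool Labs conventions and may need adjusting for your own setup.

import os
import pymysql

def run_query(query):
    # Connect to the enwiki replica using the credentials in ~/replica.my.cnf
    # (the Tool Labs convention); return each row as a dict keyed by column name.
    conn = pymysql.connect(
        host = "enwiki.labsdb",
        db = "enwiki_p",
        read_default_file = os.path.expanduser("~/replica.my.cnf"),
        charset = "utf8")
    try:
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            cursor.execute(query)
            return cursor.fetchall()
    finally:
        conn.close()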

from paws.YuviPanda.replicahelper import sql
import csv
import pandas as pd
import requests
import json
!ls /public/dumps
incr  lost+found  pagecounts-all-sites	pagecounts-raw	pageviews  public
!ls /public/dumps/pageviews
2015  2016
def get_DOIs(limit):
    # Join interwiki links to the page table to recover page titles;
    # page_namespace = 0 restricts the results to mainspace articles.
    query = """
        SELECT * FROM iwlinks
        JOIN page
        ON page_id = iwl_from
        WHERE iwl_prefix='doi' AND page_namespace=0"""
    if limit:
        query += """
        LIMIT {limit};""".format(limit = limit)
    for row in sql(query):
        yield {"page_title": row["page_title"], "DOI": row["iwl_title"]}
test = get_DOIs(5)
test
<generator object get_DOIs at 0x7f6c6403eee8>
for i in test:
    print(i)
{'page_title': 'Tore_Dybå', 'DOI': '/10.1109/52.877872'}
{'page_title': 'And_Still_I_Rise', 'DOI': '00933139'}
{'page_title': "Just_Give_Me_a_Cool_Drink_of_Water_'fore_I_Diiie", 'DOI': '00933139'}
{'page_title': 'Oh_Pray_My_Wings_Are_Gonna_Fit_Me_Well', 'DOI': '00933139'}
{'page_title': "Shaker,_Why_Don't_You_Sing?", 'DOI': '00933139'}
with open("DOIs.csv", "w", newline = "") as f:
    headers = ["DOI", "page_title"]
    writer = csv.DictWriter(f, delimiter = ',', fieldnames = headers)
    writer.writeheader()
    for row in get_DOIs(limit = None):
        writer.writerow(row)
!ls
DOIs.csv  wikipedia citations analysis.ipynb
!wc -l DOIs.csv
1094 DOIs.csv

Too bad! As you can see, there are only 1,093 DOI links in iwlinks (1,094 lines counting the CSV header). So we'll have to use dumps rather than querying directly with SQL. Luckily, Aaron Halfaker's mwcites generates these data nicely. Now to get the pageviews for the pages that have DOIs and other scholarly references...
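
As a point of reference, here is a rough, simplified sketch of how identifiers like DOIs get pulled out of an XML dump. It is a crude regex stand-in rather than the actual mwcites extractors, and it assumes the mwxml library and a local pages-articles dump.

import re
import mwxml

DOI_RE = re.compile(r"10\.\d{4,9}/[^\s|}\]>]+")  # rough DOI pattern, not mwcites' real logic

def iter_dois(dump_path):
    # Walk the dump page by page, keeping only mainspace articles,
    # and yield every string in the text that looks like a DOI.
    dump = mwxml.Dump.from_file(open(dump_path))
    for page in dump:
        if page.namespace != 0:
            continue
        for revision in page:
            for doi in DOI_RE.findall(revision.text or ""):
                yield {"page_title": page.title, "DOI": doi}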

citations = pd.read_csv("data/doi_isbn_pubmed_and_arxiv.enwiki20151102.tsv", delimiter = "\t")
citations.head()
citations.shape
(3643177, 6)
pages = citations.page_title.unique()
pages.shape
(877424,)
def get_pageviews(page_title, start, end):
    """Return the total pageviews for page_title between start and end (YYYYMMDD)."""
    url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
    params = "{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}".format(
        project = "en.wikipedia.org",
        access = "all-access",
        agent = "user",
        article = page_title,
        granularity = "daily",
        start = start,
        end = end)
    r = requests.get(url + params)
    if r.status_code == 200:
        data = json.loads(r.text)
        try:
            # sum the daily counts into a single total
            return sum(i["views"] for i in data["items"])
        except (KeyError, TypeError):
            return None
    # non-200 responses fall through and return None implicitly

test = get_pageviews("Donald_Trump", "20160101", "20161231")
test
75773451
from multiprocessing import Pool, cpu_count
import csv
import pandas as pd
import requests
import json
import sys

"""
Note: This will not work well from a Notebook because of multiprocessing bad behavior. 
It needs to be run from the commandline. So save this as get_pageviews.py and run it.
"""

POOL_SIZE = cpu_count() * 2
# apparently 2 * (your number of cores) = the sweet spot
# http://blog.miguelgrinberg.com/post/easy-web-scraping-with-python 

def get_pageviews(page_title):
    # NOTE: start and end are module-level globals assigned in __main__ below;
    # they are visible to the worker processes because the Pool forks after
    # they have been set.
    url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
    params = "{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}".format(
        project = "en.wikipedia.org",
        access = "all-access",
        agent = "user",
        article = page_title,
        granularity = "daily",
        start = start,
        end = end)
    r = requests.get(url + params)
    if r.status_code == 200:
        data = json.loads(r.text)
        try:
            return sum(i["views"] for i in data["items"])
        except (KeyError, TypeError):
            return None

if __name__ == "__main__":
    
    """
    Example use:  python get_pageviews.py data/doi_isbn_pubmed_and_arxiv.enwiki20151102.tsv 20150101 20151231
    """
    
    # LOAD the list of pages
    datafile, start, end = sys.argv[1:]
    pages = pd.read_csv(datafile, delimiter = "\t")
    pages = pages.page_title.unique().tolist()

    # get the page views in parallel
    pool = Pool(POOL_SIZE)
    views = pool.map(get_pageviews, pages)
    pool.close()
    pool.join()
    
    # save the results
    outfile_name = datafile[:-4] + "_pageviews.csv"
    pd.DataFrame({"page_title": pages, "views_2015": views}).to_csv(outfile_name)
pageviews = pd.read_csv("data/doi_isbn_pubmed_and_arxiv.enwiki20151102_pageviews.csv")
pageviews.head()
Unnamed: 0 page_title views_2015
0 0 Austrian legislative election, 1971 817.0
1 1 Fireflash 3110.0
2 2 De Havilland Firestreak 9749.0
3 3 Strix (mythology) 27448.0
4 4 Dehomag 6965.0
pageviews.sort_values(by = "views_2015", ascending = False)
Unnamed: 0 page_title views_2015
777992 777992 Star Wars 12452939.0
813000 813000 Pablo Escobar 11980439.0
143974 143974 Donald Trump 11946727.0
271485 271485 Islamic State of Iraq and the Levant 10395185.0
185001 185001 Facebook 9625879.0
36351 36351 Google 8934307.0
779822 779822 A. P. J. Abdul Kalam 8821933.0
404237 404237 Ronda Rousey 8634401.0
767697 767697 Spectre (2015 film) 8546488.0
767251 767251 Java (programming language) 8531996.0
308984 308984 Adele 7933856.0
66014 66014 Nuclear magnetic resonance spectroscopy 7876019.0
786971 786971 Eazy-E 7838878.0
121350 121350 United States 7782200.0
440160 440160 Stephen Hawking 6507073.0
236399 236399 Steve Jobs 6293880.0
750594 750594 Malware 6268762.0
289537 289537 Dr. Dre 6222292.0
771960 771960 India 6123231.0
835834 835834 Bernie Sanders 6077086.0
764266 764266 N.W.A 6014798.0
683825 683825 Ant-Man (film) 5809992.0
794521 794521 Ice Cube 5598534.0
857625 857625 Whitey Bulger 5575695.0
834251 834251 Barack Obama 5506250.0
859388 859388 Caitlyn Jenner 5268227.0
10900 10900 Ben Carson 5188184.0
790509 790509 Dwayne Johnson 5054026.0
778018 778018 World War II 5029378.0
380090 380090 Game of Thrones 4996959.0
... ... ... ...
864144 864144 Snooker world rankings 1990/1991 NaN
864156 864156 Snooker world rankings 1982/1983 NaN
864192 864192 Snooker world rankings 1983/1984 NaN
864199 864199 I/O bound NaN
864285 864285 Snooker world rankings 1984/1985 NaN
864498 864498 Face/Off NaN
864665 864665 19P/Borrelly NaN
864951 864951 Oxycodone/paracetamol NaN
866645 866645 ISO/IEC 15504 NaN
867709 867709 ISO/IEC 2022 NaN
868314 868314 Saunders-Roe SR.A/1 NaN
869698 869698 Where Do We Come From? What Are We? Where Are ... NaN
869837 869837 Do I Hear a Waltz? NaN
870303 870303 Fenfluramine/phentermine NaN
870613 870613 IBM System/4 Pi NaN
870801 870801 RPK-6 Vodopad/RPK-7 Veter NaN
870869 870869 /dev/zero NaN
871212 871212 Rover 200 / 25 NaN
871806 871806 Do You Like Horny Bunnies? NaN
871850 871850 What Is Art? NaN
872354 872354 Snooker world rankings 1985/1986 NaN
872371 872371 Snooker world rankings 1986/1987 NaN
872374 872374 Snooker world rankings 1987/1988 NaN
872391 872391 Snooker world rankings 1988/1989 NaN
872393 872393 Snooker world rankings 1989/1990 NaN
873680 873680 Mount Aspiring / Tititea NaN
873742 873742 How Long Is the Coast of Britain? Statistical ... NaN
875126 875126 DOS/360 and successors NaN
875480 875480 Chevrolet C/K NaN
876114 876114 Snooker world rankings 1976/1977 NaN

877424 rows × 3 columns
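
Note the NaN rows at the bottom: nearly all of them are titles containing a "/" (Face/Off, ISO/IEC 15504, /dev/zero, ...). The per-article REST endpoint treats an unencoded slash as a path separator, so those requests don't come back with status 200 and get_pageviews hands back None. A minimal sketch of a fix would be to percent-encode the title before building the URL:

from urllib.parse import quote

def encode_title(page_title):
    # Percent-encode everything, including "/", so the whole title stays a
    # single path segment in the REST URL, e.g. "Face/Off" -> "Face%2FOff".
    return quote(page_title, safe = "")

Passing encode_title(page_title) as the article component should recover views for those pages.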

citations = pd.merge(citations, pageviews, on = "page_title", how = "left")
citations.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-a39ff7ed9a42> in <module>()
----> 1 citations = pd.merge(citations, pageviews, on = "page_title", how = "left")
      2 citations.head()

NameError: name 'citations' is not defined
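
The NameError at the end is presumably just a stale kernel: citations was loaded in an earlier session, while the pageviews CSV was read into a fresh one. Re-running the earlier read_csv cell before the merge should clear it up, along these lines:

citations = pd.read_csv("data/doi_isbn_pubmed_and_arxiv.enwiki20151102.tsv", delimiter = "\t")
citations = pd.merge(citations, pageviews, on = "page_title", how = "left")
citations.head()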