# Makes the plots appear within the notebook
%matplotlib inline

# Two fundamental packages for doing data manipulation
import numpy as np                   # http://www.numpy.org/
import pandas as pd                  # http://pandas.pydata.org/

# Two related packages for plotting data
import matplotlib.pyplot as plt      # http://matplotlib.org/
import seaborn as sb                 # https://stanford.edu/~mwaskom/software/seaborn/

# Packages for requesting data over the web and parsing the JSON and HTML responses
import requests
import json
from bs4 import BeautifulSoup

# Packages for accessing the MySQL server (pymysql) and the local environment (os)
import pymysql                       # http://pymysql.readthedocs.io/en/latest/
import os                            # https://docs.python.org/3.4/library/os.html

# Packages for analyzing complex networks
import networkx as nx                # https://networkx.github.io/
import igraph as ig

# Set up the environment: draw plots on a white grid background and let DataFrames display more columns and rows
sb.set_style('whitegrid')
pd.options.display.max_columns = 100
pd.options.display.max_rows = 110
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-1-9a14d38e1bb8> in <module>()
     20 
     21 # Packages for analyzing complex networks
---> 22 import networkx as nx                # https://networkx.github.io/
     23 import igraph as ig
     24 

ImportError: No module named 'networkx'
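The import fails because networkx (and, further down, igraph) is not installed in this PAWS kernel. A minimal sketch of one way to install both into the user site from within the notebook, assuming pip is available for this kernel and that the kernel is restarted afterwards (python-igraph is the PyPI name for igraph):

# Install the missing network-analysis packages into the user site.
# Assumes pip is available for this kernel; restart the kernel after
# installation so the freshly installed packages can be imported.
!pip install --user networkx python-igraph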
def get_page_outlinks(page_title,redirects=1):
    # The parse API accepts titles containing spaces, so there is no need to replace them with underscores
    
    bad_titles = ['Special:','Wikipedia:','Help:','Template:','Category:','International Standard','Portal:','s:']
    
    # Get the response from the API for a query
    # After passing a page title, the API returns the HTML markup of the current article version within a JSON payload
    req = requests.get('https://en.wikipedia.org/w/api.php?action=parse&format=json&page={0}&redirects={1}&prop=text&disableeditsection=1&disabletoc=1'.format(page_title,redirects))
    
    # Read the response into JSON to parse and extract the HTML
    json_string = json.loads(req.text)
    
    # Initialize an empty list to store the links
    outlinks_list = [] 
    if 'parse' in json_string.keys():
        page_html = json_string['parse']['text']['*']

        # Parse the HTML into Beautiful Soup
        soup = BeautifulSoup(page_html,'lxml')

        # Delete tags associated with templates
        for tag in soup.find_all('tr'):
            tag.replace_with('')

        # For each paragraph tag, extract the titles within the links
        for para in soup.find_all('p'):
            for link in para.find_all('a'):
                if link.has_attr('title'):
                    title = link['title']
                    # Ignore links that aren't interesting
                    if all(bad not in title for bad in bad_titles):
                        outlinks_list.append(title)

        # For each unordered list, extract the titles within the child links
        for unordered_list in soup.find_all('ul'):
            for item in unordered_list.find_all('li'):
                for link in item.find_all('a'):
                    if link.has_attr('title'):
                        title = link['title']
                        # Ignore links that aren't interesting
                        if all(bad not in title for bad in bad_titles):
                            outlinks_list.append(title)

    return outlinks_list
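A quick sanity check of the function; 'Network science' is just an illustrative article title, and example_outlinks is not used elsewhere:

# Illustrative call: collect the outlinks of one article and peek at the results.
# Any English Wikipedia article title can be substituted for 'Network science'.
example_outlinks = get_page_outlinks('Network science')

print(len(example_outlinks))
example_outlinks[:10]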
def get_user_revision_counts(page_title,conn):
    """ Takes a page title and returns the number of revisions each user made.
      page_title = a string for the page title to get its revisions
      conn = a database connection
      
    Returns:
      A DataFrame with username, page title, edit count, and min/max timestamps
    """
    # In case you pass a page title with spaces in it, replace the spaces with underscores.
    # The encode/decode round-trip re-labels the UTF-8 bytes as latin1 so they pass through the replica connection's encoding intact
    page_title = page_title.replace(' ','_').encode('utf8').decode('latin1')
    
    # The MySQL query string used to retrieve the data. By line, it:
    ## selects username, page title, the count of edits, and the min/max timestamps
    ## from the "revision" table
    ## joined to the "page" table
    ## using the page_id and rev_page columns as keys
    ## limits the results to the given page title in the article namespace (0)
    ## groups the results by username
    ## and keeps only users who made more than one edit
    s = """
            SELECT 
                rev_user_text as user,
                page.page_title as page,
                COUNT(rev_user_text) as edits,
                MIN(rev_timestamp) as min_timestamp,
                MAX(rev_timestamp) as max_timestamp
            FROM 
                revision 
            JOIN 
                page ON page.page_id = revision.rev_page
            WHERE 
                page.page_title = "{0}" 
                AND page_namespace = 0
            GROUP BY
                rev_user_text
            HAVING
                edits > 1
        """.format(page_title)

    # Use the connection to run the query and return the results as a DataFrame
    _df = pd.read_sql_query(s,conn)
    
    # Some of the results have a "bytestring" format
    byte_columns = ['user','page','min_timestamp','max_timestamp']
    
    # For each column, convert it from bytestring to a utf8 string
    for col in byte_columns:
        _df[col] = _df[col].str.decode('utf8')
        
    # The timestamp columns are strings in MediaWiki's YYYYMMDDHHMMSS format.
    # Convert them to datetimes, then to days elapsed since Wikipedia launched (2001-01-01)
    try:
        _df['min_timestamp'] = pd.to_datetime(_df['min_timestamp'],format='%Y%m%d%H%M%S')
        _df['max_timestamp'] = pd.to_datetime(_df['max_timestamp'],format='%Y%m%d%H%M%S')
        _df['min_timestamp'] = _df['min_timestamp'].apply(lambda x:round((x-pd.Timestamp('2001-01-01'))/np.timedelta64(1,'D'),0))
        _df['max_timestamp'] = _df['max_timestamp'].apply(lambda x:round((x-pd.Timestamp('2001-01-01'))/np.timedelta64(1,'D'),0))
    except (ValueError, TypeError):
        pass
    
    # Return the data, with a clean index
    return _df

# Also define a cute little function that will return the sizes of the network
def bipartite_network_size(edgelist_df,user_col='user',page_col='page'):
    # Compute summary statistics: unique users, unique pages, and total edges
    users = len(edgelist_df[user_col].unique())
    pages = len(edgelist_df[page_col].unique())
    edges = len(edgelist_df)
    return users, pages, edges
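The next cell assumes an open database connection named conn and a page_title to analyze, neither of which is defined above. A minimal setup sketch, assuming the Wikimedia PAWS / Tool Labs environment where replica credentials live in ~/.replica.my.cnf and the English Wikipedia replica is reachable as enwiki.labsdb; adjust the host, credentials file, and example title for other setups:

# Open a connection to the English Wikipedia database replica.
# The host name and credentials path are assumptions about the PAWS / Tool Labs
# environment; substitute the values appropriate for your own setup.
conn = pymysql.connect(
    host='enwiki.labsdb',
    db='enwiki_p',
    read_default_file=os.path.expanduser('~/.replica.my.cnf'),
    charset='utf8'
)

# An example article to analyze; swap in any page title of interest
page_title = 'Network science'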
# Awaken the data connection
conn.ping()
conn.cursor().execute('use enwiki_p')

# Get the revisions
single_page_user_revision_counts = get_user_revision_counts(page_title,conn)

# Look at the first few rows
single_page_user_revision_counts.head()