Effect of Gender on Revert Patterns

Import modules and setup environment

Load up all the libraries we'll need to connect to the database, retrieve information for analysis, and visualizing results.

# Makes the plots appear within the notebook
%matplotlib inline

# Two fundamental packages for doing data manipulation
import numpy as np                   # http://www.numpy.org/
import pandas as pd                  # http://pandas.pydata.org/

# Two related packages for plotting data
import matplotlib.pyplot as plt      # http://matplotlib.org/
import seaborn as sb                 # https://stanford.edu/~mwaskom/software/seaborn/

# Two packages for accessing the MySQL server
import pymysql                       # http://pymysql.readthedocs.io/en/latest/
import os                            # https://docs.python.org/3.4/library/os.html
from timeit import timeit

# Setup the code environment to use plots with a white background and DataFrames show more columns and rows
sb.set_style('whitegrid')
pd.options.display.max_columns = 100
pd.options.display.max_rows = 110

Setup a connection to the English Wikipedia database

Retrieve the host name, username, and (encrypted) password associated with your account so you can login to the database.

host, user, password = os.environ['MYSQL_HOST'], os.environ['MYSQL_USERNAME'], os.environ['MYSQL_PASSWORD']

Connect to the system using the creditials from above, then select the database for the English Wikipedia.

conn = pymysql.connect(host=host,user=user,password=password,database='enwiki_p',connect_timeout=3600)
conn.cursor().execute('use enwiki_p');
# What tables can I use?
# pd.read_sql_query('show tables',conn)

Let's make a matching study comparing: (1) female users with gender hidden within the userprop but not in the category, and (2) female users identifying gender with the category and compare their revert patterns to men.

Lists of Female Editors

Female Editors (User Property)

def get_female_sample_user_property(conn, n):
    female_names = []
    # The following query returns all the female editors
    s = """
        SELECT 
            up_user
        FROM 
            user_properties
        WHERE
            user_properties.up_property = 'gender' AND user_properties.up_value = 'female'
        LIMIT
            {0}
        """.format(n)
    
    female_ids_up = pd.read_sql_query(s, conn)['up_user'].tolist()
    for i, person_id in enumerate(female_ids_up):
        # The following query, given the page_id returns the full names of the editors
        s = """
            SELECT 
                *
            FROM 
                user
            WHERE
                user.user_id = '{0}'
            """.format(person_id)
    
        person_name = pd.read_sql_query(s, conn)['user_name'].tolist()[0]
        female_names.append((person_id, person_name))

    return female_names
female_names_up = get_female_sample_user_property(conn, 100)
"There are {0} female Wikipedians in our sample.".format(len(female_names_up))
'There are 100 female Wikipedians in our sample.'

Self-identified Female Editors

def get_female_sample_category(conn, n):
    female_names = []
    # The following query returns all links (page_ids) from the Female_Wikipedians page
    s = """
        SELECT 
            cl_from
        FROM 
            categorylinks
        WHERE
            cl_to = 'Female_Wikipedians'
        LIMIT
            {0}
        """.format(n)
    
    female_ids_cat = pd.read_sql_query(s, conn)['cl_from'].tolist()
    for i, person_id in enumerate(female_ids_cat):
        # The following query, given the page_id returns the full names of the editors
        s = """
            SELECT 
                page_title
            FROM 
                page
            WHERE
                page.page_id = '{0}'
            """.format(person_id)
    
        person_name = pd.read_sql_query(s, conn)['page_title'].tolist()[0]
        female_names.append((person_id, person_name))

    return female_names
female_names_cat = get_female_sample_category(conn, 100)
"There are {0} self-identifying female Wikipedians in our sample.".format(len(female_names_cat))
'There are 100 self-identifying female Wikipedians in our sample.'

Match similar male and female editors

By matching men and women with similar: (1) start dates and (2) number of revisions try to estimate the effect of gender alone.

# The following query returns properties of a specific editor.
s = """
        SELECT 
            user_id, user_registration, user_editcount
        FROM 
            user
        WHERE
            user.user_id = {0}
    """.format(female_names_up[0][0])
    
df = pd.read_sql_query(s,conn)
user_registration = df['user_registration'].tolist()[0]
number_of_edits = df['user_editcount'].tolist()[0]
print("This female editor ({0}) was registered on {1} and has made {2} edits.".format(female_names_up[0], user_registration, number_of_edits))
This female editor ((94945, b'Triona')) was registered on b'20040819072202' and has made 8009 edits.
# Find male users who look like this female.
s = """
        SELECT
            *
        FROM 
            user
        JOIN
            user_properties ON user.user_id = user_properties.up_user
        WHERE
            user_properties.up_property = 'gender' AND user_properties.up_value = 'male' AND
            ABS(TRUNCATE(CONVERT(user.user_registration, INTEGER), -8) = TRUNCATE(CONVERT('{1}', INTEGER), -8))
        ORDER BY 
            ABS(user.user_editcount - {0})
        LIMIT
            1
    """.format(number_of_edits, user_registration.decode('utf-8'))
# Rounding year and log bin edit counts 
pd.read_sql_query(s,conn)
user_id user_name user_real_name user_password user_newpassword user_email user_options user_touched user_token user_email_authenticated user_email_token user_email_token_expires user_registration user_newpass_time user_editcount user_password_expires up_user up_property up_value
0 90542 b'Furius' b'' None None None None None None None None None b'20040820094651' None 7171 None 90542 b'gender' b'male'

Revisions for Female Editors

The following query counts all of the reverts of female editors. We'd want to do the same for male editors and compare.

# The following query returns the reverts of an editor
s = """
        SELECT
            *
        FROM 
            revision
        WHERE
            revision.rev_parent_id IN
            (
            SELECT 
                rev_id
            FROM 
                revision_userindex
            WHERE
                revision_userindex.rev_user = {0}
            )
    """.format(female_names_up[0][0])
    
pd.read_sql_query(s,conn)
---------------------------------------------------------------------------
OperationalError                          Traceback (most recent call last)
/srv/paws/lib/python3.4/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
   1400             else:
-> 1401                 cur.execute(*args)
   1402             return cur

/srv/paws/lib/python3.4/site-packages/pymysql/cursors.py in execute(self, query, args)
    165 
--> 166         result = self._query(query)
    167         self._executed = query

/srv/paws/lib/python3.4/site-packages/pymysql/cursors.py in _query(self, q)
    321         self._last_executed = q
--> 322         conn.query(q)
    323         self._do_get_result()

/srv/paws/lib/python3.4/site-packages/pymysql/connections.py in query(self, sql, unbuffered)
    851         self._execute_command(COMMAND.COM_QUERY, sql)
--> 852         self._affected_rows = self._read_query_result(unbuffered=unbuffered)
    853         return self._affected_rows

/srv/paws/lib/python3.4/site-packages/pymysql/connections.py in _read_query_result(self, unbuffered)
   1052             result = MySQLResult(self)
-> 1053             result.read()
   1054         self._result = result

/srv/paws/lib/python3.4/site-packages/pymysql/connections.py in read(self)
   1335         try:
-> 1336             first_packet = self.connection._read_packet()
   1337 

/srv/paws/lib/python3.4/site-packages/pymysql/connections.py in _read_packet(self, packet_type)
    982         while True:
--> 983             packet_header = self._read_bytes(4)
    984             if DEBUG: dump_packet(packet_header)

/srv/paws/lib/python3.4/site-packages/pymysql/connections.py in _read_bytes(self, num_bytes)
   1028             raise err.OperationalError(
-> 1029                 CR.CR_SERVER_LOST, "Lost connection to MySQL server during query")
   1030         return data

OperationalError: (2013, 'Lost connection to MySQL server during query')

During handling of the above exception, another exception occurred:

InterfaceError                            Traceback (most recent call last)
/srv/paws/lib/python3.4/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
   1404             try:
-> 1405                 self.con.rollback()
   1406             except Exception:  # pragma: no cover

/srv/paws/lib/python3.4/site-packages/pymysql/connections.py in rollback(self)
    787         """Roll back the current transaction"""
--> 788         self._execute_command(COMMAND.COM_QUERY, "ROLLBACK")
    789         self._read_ok_packet()

/srv/paws/lib/python3.4/site-packages/pymysql/connections.py in _execute_command(self, command, sql)
   1066         if not self._sock:
-> 1067             raise err.InterfaceError("(0, '')")
   1068 

InterfaceError: (0, '')

During handling of the above exception, another exception occurred:

DatabaseError                             Traceback (most recent call last)
<ipython-input-40-dfe8419e65e5> in <module>()
     17     """.format(female_names_up[0][0])
     18 
---> 19 pd.read_sql_query(s,conn)

/srv/paws/lib/python3.4/site-packages/pandas/io/sql.py in read_sql_query(sql, con, index_col, coerce_float, params, parse_dates, chunksize)
    329     return pandas_sql.read_query(
    330         sql, index_col=index_col, params=params, coerce_float=coerce_float,
--> 331         parse_dates=parse_dates, chunksize=chunksize)
    332 
    333 

/srv/paws/lib/python3.4/site-packages/pandas/io/sql.py in read_query(self, sql, index_col, coerce_float, params, parse_dates, chunksize)
   1434 
   1435         args = _convert_params(sql, params)
-> 1436         cursor = self.execute(*args)
   1437         columns = [col_desc[0] for col_desc in cursor.description]
   1438 

/srv/paws/lib/python3.4/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
   1407                 ex = DatabaseError("Execution failed on sql: %s\n%s\nunable"
   1408                                    " to rollback" % (args[0], exc))
-> 1409                 raise_with_traceback(ex)
   1410 
   1411             ex = DatabaseError(

/srv/paws/lib/python3.4/site-packages/pandas/compat/__init__.py in raise_with_traceback(exc, traceback)
    338         if traceback == Ellipsis:
    339             _, _, traceback = sys.exc_info()
--> 340         raise exc.with_traceback(traceback)
    341 else:
    342     # this version of raise is a syntax error in Python 3

/srv/paws/lib/python3.4/site-packages/pandas/io/sql.py in execute(self, *args, **kwargs)
   1403         except Exception as exc:
   1404             try:
-> 1405                 self.con.rollback()
   1406             except Exception:  # pragma: no cover
   1407                 ex = DatabaseError("Execution failed on sql: %s\n%s\nunable"

/srv/paws/lib/python3.4/site-packages/pymysql/connections.py in rollback(self)
    786     def rollback(self):
    787         """Roll back the current transaction"""
--> 788         self._execute_command(COMMAND.COM_QUERY, "ROLLBACK")
    789         self._read_ok_packet()
    790 

/srv/paws/lib/python3.4/site-packages/pymysql/connections.py in _execute_command(self, command, sql)
   1065     def _execute_command(self, command, sql):
   1066         if not self._sock:
-> 1067             raise err.InterfaceError("(0, '')")
   1068 
   1069         # If the last query was unbuffered, make sure it finishes before

DatabaseError: Execution failed on sql: 
        SELECT
            *
        FROM 
            revision
        WHERE
            revision.rev_parent_id IN
            (
            SELECT 
                rev_id
            FROM 
                revision_userindex
            WHERE
                revision_userindex.rev_user = 94945
            )
    
(2013, 'Lost connection to MySQL server during query')
unable to rollback