How many NPP pages marked for deletion are actually deleted?

Specifically, we want to know how many were deleted by someone other than the patroller. We assume that the judgement of a non-patroller admin is a 'gold standard' for whether the page patroller accurately assessed whether the page should be deleted.

We could refine this by focusing solely on pages that were marked for Speedy Deletion, which is the primary purpose of NPP: articles tagged for PROD or AfD are presumed to be on a 'slow track', and it's possible that such an article should be given a little more time to assert its worthiness. In that case, we would expect CSD-tagged articles to be deleted within a few days (CSD does not appear to be prone to large backlogs).

from datetime import datetime
import os
import pickle
import pymysql
def connect_to_db():
    """
    Connect to the public enwiki db using quarry.wmflabs.org credentials
    """
    conn = pymysql.connect(
        host=os.environ['MYSQL_HOST'],
        user=os.environ['MYSQL_USERNAME'],
        password=os.environ['MYSQL_PASSWORD'], #don't print me!
        database='enwiki_p',
        charset='utf8',
        use_unicode=True
    )
    return conn
conn = connect_to_db()
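
A quick way to confirm the credentials work before running anything heavy (a sanity check added here, not part of the original analysis):

#optional smoke test: run a trivial query to confirm the connection works
with conn.cursor() as cur:
    cur.execute('select 1;')
    print(cur.fetchall()) #expect ((1,),)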

Get all pages marked for deletion by NPP by month and year. We know that most of these pages are deleted because they are promospam or non-notable, so most of the deleted pages are likely to be in the public archive table (not revision deleted).

#all log actions for every page curated through page curation between November 2015 and June 2017
with conn.cursor() as cur:
    cur.execute('use enwiki_p;')
    cur.execute("""SELECT EXTRACT(YEAR FROM DATE_FORMAT(log_timestamp,'%Y%m%d%H%i%s')) AS `year`,
                          EXTRACT(MONTH FROM DATE_FORMAT(log_timestamp,'%Y%m%d%H%i%s')) AS `month`,
                          log_timestamp,
                          log_user,
                          log_title,
                          log_page,
                          log_action
                   FROM logging_logindex
                   WHERE log_type='pagetriage-curation'
                   AND log_timestamp BETWEEN 20151101000000 AND 20170701000000
                   ORDER BY `year` ASC, `month` ASC;""")
    results = cur.fetchall()
print(type(results[0][2]))
<class 'bytes'>
all_pc_actions = [list(x) for x in results]
def decode_bytestring(dataset):
    """
    Takes a list of lists,
    returns the same list with all bytestring fields
    decoded into UTF-8 strings
    """
    for d in dataset:
        for i, f in enumerate(d):
            if isinstance(f, bytes):
                d[i] = f.decode("utf8")
    return dataset
sliced = all_pc_actions[:5]
x = decode_bytestring(sliced)
all_pc_actions = decode_bytestring(all_pc_actions)
print(all_pc_actions[5])
[2015, 11, '20151101001203', 13821812, 'Ruga-Ruga', 48346791, 'reviewed']
#save for later

with open('NPP_all_actions',"wb") as fout:
    pickle.dump(all_pc_actions,fout)
#open saved version
with open("NPP_all_actions", "rb") as fin:
    all_pc_actions = pickle.load(fin)
def ts_to_dt(timestamp):
    """
    - Accepts a db-timestamp formatted string e.g. '20160621195536'
    - Returns a datetime object
    """
    return datetime.strptime(timestamp, "%Y%m%d%H%M%S")
def time_interval(ts1, ts2):
    """
    Accepts two datetimes
    First should be later, to yield a positive number
    Returns the number of days between them
    """
    return (ts1.date() - ts2.date()).days
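
A quick illustration of the two helpers (the second timestamp is made up for the example):

review = ts_to_dt('20151101050038') #a real review timestamp from the data below
deletion = ts_to_dt('20151104120000') #hypothetical follow-up deletion
print(time_interval(deletion, review)) #3 (days)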
def followup_log_actions(conn, page_id, log_date, log_type, action_type, log_namespace=0):
    """
    Takes: 
        - a database connection
        - the page id and the log date of a log action
        - the desired log type and action to check
        - and an optional namespace to filter by (default 0, articles)
    Returns:
        - relevant info from any matching records of the desired type
          (comparison against log_date is left to the caller)
    """

    with conn.cursor() as cur:
        cur.execute('use enwiki_p;')
        cur.execute("""select log_timestamp, log_user, log_title, log_namespace 
                        from logging_logindex 
                        where log_namespace = %s 
                        and log_type = %s 
                        and log_action = %s 
                        and log_page = %s;""",
                    (log_namespace, log_type, action_type, page_id))
        return cur.fetchall()
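
As an illustrative (unrun) call, checking for deletions of the Ruga-Ruga page printed above:

#did anyone delete page 48346791 (Ruga-Ruga, seen above)?
matches = followup_log_actions(conn, 48346791, '20151101001203', 'delete', 'delete')
print(matches)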

Deletion actions

marked_for_deletion = [x for x in all_pc_actions if x[6] == 'delete']
#save for later

with open('NPP_marked_for_deletion',"wb") as fout:
    pickle.dump(marked_for_deletion,fout)
#open saved version
with open("NPP_marked_for_deletion", "rb") as fin:
    marked_for_deletion = pickle.load(fin)
mslice = marked_for_deletion[:5]
mslice
[[2015, 11, '20151101050038', 15799856, 'Paula_&_Art', 48428858, 'delete'],
 [2015, 11, '20151101050352', 23039188, 'Anil_Dabhi', 48428862, 'delete'],
 [2015,
  11,
  '20151101064900',
  25061312,
  'Sangeeth_sakthikulangara',
  48429311,
  'delete'],
 [2015,
  11,
  '20151101064952',
  25061312,
  'Stanisław_Gronkowski',
  48429251,
  'delete'],
 [2015, 11, '20151101065235', 25061312, 'Kapil_Bhatt', 48429374, 'delete']]
#approach 2, use 'delete' 'delete' actions from the logging table. 
#If the action happened within 30 days of the deletion recommendation, we say that the deleting admin followed the rec
conn = connect_to_db()

deletions = [] #raw deletion log records, collected for later
for m in marked_for_deletion:
    review_date = ts_to_dt(m[2])
    with conn.cursor() as cur:
        try:
            page_id = m[5]
            cur.execute('use enwiki_p;')
            cur.execute("""select log_timestamp, log_user, log_title 
                            from logging_logindex 
                            where log_namespace = 0 
                            and log_type = 'delete' 
                            and log_action = 'delete' 
                            and log_page = {};""".format(page_id))
            res = cur.fetchall()
            deletions.extend(list(x) for x in res)
            if not res:
                m[6] = False #can't find a corresponding deletion action. assume it's still around.
            else:
                deleted_after = False
                for r in res:
                    deletion_date = ts_to_dt(r[0].decode("utf8"))
                    date_diff = time_interval(deletion_date, review_date)
                    if date_diff <= 30:
                        deleted_after = True
                m[6] = deleted_after #replace the 'delete' marker with the outcome
        except Exception:
            print(m) #note the problem row and keep going
#decode all bytestring fields in the raw deletion records
deletions = decode_bytestring(deletions)
#save for later

with open('NPP_deletions',"wb") as fout:
    pickle.dump(deletions,fout)
#open saved version
with open("NPP_deletions", "rb") as fin:
    deletions = pickle.load(fin)
mslice
[[2015, 12, '20151201002631', 1576422, 'EKasi+', 48689554, True],
 [2015, 12, '20151201010232', 1576422, 'Jessica_Forest', 48694826, True],
 [2015, 12, '20151201024724', 21119524, 'Sea_wort_grass', 48694930, True],
 [2015, 12, '20151201024846', 21119524, 'Interlex', 48694197, False],
 [2015, 12, '20151201024934', 21119524, 'Rifle_Paper_Co.', 48693944, True]]

todo

  • see if I can query the deletion log to see how many were deleted within a particular timespan
  • see if I can filter for deletion reason (a sketch follows this list)
  • check to make sure my counts match for deletion requests
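
On filtering for deletion reason: admins' deletion summaries usually cite the CSD criterion, so a rough first cut might look like the sketch below. This is untested here; it assumes the labs replica exposes a log_comment column and that the summary text names the criterion (e.g. G11 for unambiguous advertising).

#sketch: count deletions in the study window whose summary cites CSD G11
with conn.cursor() as cur:
    cur.execute('use enwiki_p;')
    cur.execute("""select count(*) from logging_logindex
                    where log_type = 'delete'
                    and log_action = 'delete'
                    and log_comment like '%G11%'
                    and log_timestamp between 20151101000000 and 20170701000000;""")
    print(cur.fetchall())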
marked_for_deletion[100:200]
[[2015,
  12,
  '20151202060535',
  17847210,
  'Benifit_of_unpollluted_environment',
  48706983,
  True],
 [2015,
  12,
  '20151202062243',
  17847210,
  'Exponential_Technologies_Institute',
  48707084,
  True],
 [2015, 12, '20151202062518', 17847210, 'Insilico_medicine', 48707105, True],
 [2015, 12, '20151202064006', 17847210, 'Ekdumranko', 48707150, True],
 [2015, 12, '20151202064500', 23748255, 'Sadiq_Ahmad_"Dawar"', 48707201, True],
 [2015, 12, '20151202064545', 23748255, 'Myles_Morgan_Crosby', 48707197, True],
 [2015, 12, '20151202065732', 12670366, 'Lovewar_(band)', 48706107, False],
 [2015, 12, '20151202074835', 17847210, 'KMPH_Trips', 48707506, True],
 [2015, 12, '20151202074901', 17847210, '@sohilkumar3', 48707507, True],
 [2015, 12, '20151202075344', 17847210, 'Priceraja', 48707445, True],
 [2015, 12, '20151202075452', 17847210, 'D.B._Singh', 48707525, False],
 [2015, 12, '20151202075529', 17847210, 'Atibul+rahaman', 48707521, True],
 [2015, 12, '20151202080500', 17847210, 'OGEZO', 48707573, True],
 [2015, 12, '20151202080521', 17847210, 'Ambalappad', 48707520, False],
 [2015, 12, '20151202081301', 17847210, 'Skimz', 48707586, True],
 [2015, 12, '20151202081352', 17847210, 'Gfhjnmbvcdfg', 48707610, True],
 [2015, 12, '20151202081419', 17847210, 'Gfhjnmbvcdfg', 48707610, True],
 [2015, 12, '20151202081541', 17847210, 'Dfghjdfghjknbdfg', 48707588, True],
 [2015, 12, '20151202081953', 17847210, 'Searchgfhjdfg', 48707633, True],
 [2015, 12, '20151202082028', 17847210, 'Bvgfhjnbvb', 48707620, True],
 [2015, 12, '20151202082140', 17847210, 'Fghjkfghj', 48707645, True],
 [2015, 12, '20151202082154', 17847210, 'Tech_evolutions', 48707649, True],
 [2015,
  12,
  '20151202082334',
  17847210,
  'Administrator_JohnCD_and_Materialscientist',
  48707657,
  True],
 [2015,
  12,
  '20151202082657',
  17847210,
  'Fuck_wikipedia_admins_with_their_fucking_hand_of_blocking',
  48707662,
  True],
 [2015,
  12,
  '20151202083147',
  17847210,
  'Catarman_National_High_School(Catarman)',
  48707678,
  False],
 [2015,
  12,
  '20151202083914',
  17847210,
  'Fuck_all_wikipedia_administrator_with_thei_fucking_hand_of_blockings_from_Block_account_Sadicology',
  48707710,
  True],
 [2015, 12, '20151202084119', 17847210, 'ADCOM_(company)', 48707722, True],
 [2015,
  12,
  '20151202084439',
  17847210,
  'Fuck_you_material_scientist_and_your_fucking_friend_JonhCD_from_me_Sadicology_a_Nigerian_wikipedian_Vandalizer',
  48707728,
  True],
 [2015,
  12,
  '20151202084711',
  17847210,
  'Sons_Of_Bastard_Money-Mongers_wikipedia_administrators',
  48707741,
  True],
 [2015,
  12,
  '20151202085037',
  17847210,
  'Ghjkbvfgh_sons_of_bastards_never_tired_of_making_such_types_of_thing_ever',
  48707749,
  True],
 [2015,
  12,
  '20151202085343',
  17847210,
  'Fuck_you_Ueutyi_and_your_fucking_contribution_son_of_bitch',
  48707768,
  True],
 [2015, 12, '20151202114144', 23753558, 'Bedroom_trends', 48708469, True],
 [2015,
  12,
  '20151202114457',
  23753558,
  'The_Media_Ambassador',
  48708412,
  True],
 [2015,
  12,
  '20151202114709',
  23753558,
  'Monitoramento_e_metas',
  48708401,
  True],
 [2015, 12, '20151202114737', 23753558, 'Замрзнување', 48708375, True],
 [2015, 12, '20151202133700', 23748255, 'Sadiq_Ahmad_"Dawar"', 48708702, True],
 [2015,
  12,
  '20151202140343',
  22089808,
  'C_BLEECH_gospel_hip_hop',
  48709131,
  True],
 [2015, 12, '20151202140419', 22089808, 'CHL_Hospital', 48708878, True],
 [2015,
  12,
  '20151202140518',
  22089808,
  'Jeffrey_Williams_(Bouhriz_D.M_Abdou)',
  48708668,
  True],
 [2015, 12, '20151202140857', 22089808, 'Lisita_lintre', 48708556, True],
 [2015,
  12,
  '20151202140945',
  22089808,
  'Herakhan_Vishwa_Mahadam',
  48708508,
  True],
 [2015, 12, '20151202141136', 22089808, 'ACTIVATE_Wits', 48708496, True],
 [2015, 12, '20151202141859', 22089808, 'Ahmad_shami', 48708244, True],
 [2015,
  12,
  '20151202142417',
  22089808,
  'The_Asylum_-_Short_Horror_Story',
  48707880,
  True],
 [2015, 12, '20151202142454', 4295, 'Hayden_gise', 48709194, True],
 [2015, 12, '20151202142512', 4295, 'PRAKASH_DAULAT_SAINDANE', 48709193, True],
 [2015,
  12,
  '20151202142637',
  22089808,
  'BABA_BALA_JI_FAQARSAR',
  48707734,
  True],
 [2015, 12, '20151202142854', 4295, 'Početna_strana', 48709188, True],
 [2015, 12, '20151202143322', 4295, 'PRAKASH_DAULAT_SAINDANE', 48709193, True],
 [2015, 12, '20151202154800', 9109237, 'Zouloo', 48709397, True],
 [2015, 12, '20151202172428', 26535374, 'VHS2', 48710292, True],
 [2015, 12, '20151202172632', 19343293, "Jewett's_Woods", 48710316, True],
 [2015, 12, '20151202174045', 1168880, 'Freeze_Cream', 48710156, True],
 [2015, 12, '20151202174907', 17847210, 'Geeta_Keeka', 48710477, True],
 [2015, 12, '20151202175530', 17847210, 'Arijus_Jurevičius', 48710516, True],
 [2015, 12, '20151202175644', 17847210, 'Son-d-lyte', 48710459, True],
 [2015, 12, '20151202180916', 17847210, 'TELELINGUISTIC', 48710616, True],
 [2015, 12, '20151202181907', 17847210, 'Surer@12', 48710680, True],
 [2015,
  12,
  '20151202181936',
  17847210,
  "Take_Your_Girlfriend's_Sister_to_Work_Day",
  48710678,
  True],
 [2015, 12, '20151202182457', 17847210, 'Pog_(animal)', 48710698, True],
 [2015,
  12,
  '20151202184231',
  9346035,
  'The_Difference_Between_Bulls_and_Goats_(For_Dummies)',
  48710853,
  True],
 [2015, 12, '20151202184307', 9346035, 'CATERWINGS', 48710792, True],
 [2015,
  12,
  '20151202184632',
  9346035,
  'Triton_Applied_Reef_Bioscience',
  48710596,
  True],
 [2015,
  12,
  '20151202184844',
  9346035,
  'Honorary_family_member',
  48710562,
  True],
 [2015, 12, '20151202185031', 9346035, 'Penelope_Pasta', 48710257, True],
 [2015,
  12,
  '20151202185457',
  9346035,
  'Iran_Traveling_Center',
  48709394,
  True],
 [2015, 12, '20151202185555', 9346035, 'Geetoshri_Roy', 48709110, True],
 [2015,
  12,
  '20151202190256',
  22604547,
  'Emily_Tent_International',
  48710916,
  True],
 [2015, 12, '20151202190753', 9346035, 'Katrina_Mathers', 48708250, False],
 [2015, 12, '20151202192931', 17847210, 'Nerissus_saegeri', 48711033, False],
 [2015, 12, '20151202193152', 17847210, 'Colin_DiMera', 48711260, True],
 [2015, 12, '20151202193359', 22604547, 'Infosocialism', 48711201, True],
 [2015, 12, '20151202194827', 17847210, 'Simon_chapleau', 48711455, True],
 [2015, 12, '20151202201010', 22665218, 'Oceanic_Abundance', 48711770, True],
 [2015, 12, '20151202201311', 17847210, 'Chader_Moto_Bou', 48711789, True],
 [2015, 12, '20151202203414', 22604547, 'Brandon_Barbee', 48712038, True],
 [2015, 12, '20151202205009', 14657398, 'Warhawk_Alley', 48711624, True],
 [2015, 12, '20151202211111', 20743205, 'Celina_rollon', 48712338, False],
 [2015, 12, '20151202211430', 20743205, 'Nahiri', 48712186, True],
 [2015, 12, '20151202211732', 20743205, 'Emin_Duraku_School', 48712141, True],
 [2015,
  12,
  '20151202212805',
  20743205,
  'Pratheeskumar_Sellathurai',
  48712508,
  True],
 [2015, 12, '20151202214547', 20743205, 'Jonathan_Forman', 48710781, True],
 [2015,
  12,
  '20151202215037',
  20743205,
  'Joseph_Robinson_(oboist)',
  48709807,
  False],
 [2015, 12, '20151202215217', 20743205, 'Ccavenue', 48709242, True],
 [2015, 12, '20151202215339', 20743205, 'Vancouver_Food', 48712727, True],
 [2015, 12, '20151202224509', 22604547, 'Baja_East_Surf_Shop', 48712945, True],
 [2015,
  12,
  '20151202225110',
  22604547,
  'Mixmastered_Acoustics',
  48712796,
  True],
 [2015,
  12,
  '20151202230238',
  17847210,
  'La_Boit_à_Merveilles',
  48713174,
  True],
 [2015, 12, '20151203010425', 24524350, 'Ambedkar_sena', 48714212, True],
 [2015, 12, '20151203010559', 24524350, 'Lawrence_Rigby', 48714201, True],
 [2015, 12, '20151203020310', 17847210, 'Lil_thony', 48714745, True],
 [2015, 12, '20151203020701', 17847210, 'Doc_events', 48714816, True],
 [2015, 12, '20151203024732', 22604547, 'Saint_Jizzle', 48715114, True],
 [2015, 12, '20151203024805', 22604547, 'Kathleen_N_Caputo', 48715091, True],
 [2015,
  12,
  '20151203025609',
  22604547,
  'Women_in_buttercream',
  48715129,
  True],
 [2015, 12, '20151203025801', 22604547, 'Bangladeshi_singer', 48715127, True],
 [2015, 12, '20151203031019', 17847210, 'Bogatyr_batallion', 48715254, False],
 [2015, 12, '20151203031808', 22604547, 'James_Eberhard', 48715350, True],
 [2015,
  12,
  '20151203032041',
  17847210,
  'GARTH_KEMP_NEWSCASTER_FANS',
  48715357,
  True],
 [2015, 12, '20151203033425', 17847210, 'Asaduzzaman_Chandan', 48715397, True]]
import pandas as pd
import numpy as np
headers = ["year","month","timestamp","log_user","log_title","log_page","deleted"]

df = pd.DataFrame(marked_for_deletion, columns=headers)
df.head()
year month timestamp log_user log_title log_page deleted
0 2015 12 20151201002631 1576422 EKasi+ 48689554 True
1 2015 12 20151201010232 1576422 Jessica_Forest 48694826 True
2 2015 12 20151201024724 21119524 Sea_wort_grass 48694930 True
3 2015 12 20151201024846 21119524 Interlex 48694197 False
4 2015 12 20151201024934 21119524 Rifle_Paper_Co. 48693944 True
pd.pivot_table(df,index=["year","month", "deleted"],values="log_page", aggfunc='count')
year  month  deleted
2015  12     False       438
             True       1539
2016  1      False       232
             True       1189
      2      False       218
             True       1151
      3      False       269
             True       1469
      4      False       271
             True       1553
      5      False       225
             True       1078
      6      False       167
             True        779
      7      False       176
             True        751
      8      False       256
             True       1277
      9      False       210
             True       1187
      10     False       215
             True       1113
      11     False       234
             True        887
      12     False       292
             True       1074
2017  1      False       242
             True       1016
      2      False       244
             True       1169
      3      False       254
             True        885
      4      False       173
             True        822
Name: log_page, dtype: int64
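
The same pivot can be expressed as a follow rate, i.e. the share of recommendations that ended in deletion (a small sketch, not in the original notebook):

#fraction of NPP deletion recommendations that led to an actual deletion, per month
follow_rate = df.groupby(["year", "month"])["deleted"].mean()
print(follow_rate)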
#stuff that's been around longer is more likely to have been deleted.
#how does that affect my analysis?
#select a single month (December 2016) and count how many deleted and not deleted
#seems to be working!
dec_deleted = 0
dec_not = 0

for m in marked_for_deletion:
    if (m[0] == 2016 and m[1] == 12):
        if m[6] == True:
            dec_deleted += 1
        elif m[6] == False:
            dec_not += 1

print(dec_deleted)
print(dec_not)
1074
292
deleted_by_month = pd.pivot_table(df,index=["year","month", "deleted"],values="log_page", aggfunc='count')
deleted_by_month.to_csv('deleted_by_month_log_delete.csv')
 
#OLD METHOD
conn = connect_to_db()
#if the page is listed at least once in the archive table
#with a creation date within 60 days of the review action, assume the page was deleted based on the review
for m in marked_for_deletion:
    review_date = ts_to_dt(m[2])
    with conn.cursor() as cur:
        try:
            title = conn.escape(m[4]) #escaping also quotes the title for the query below
            cur.execute('use enwiki_p;')
            cur.execute("""select ar_timestamp from archive 
                        where ar_namespace = 0 
                        and ar_deleted = 0 
                        and ar_title = {} 
                        and ar_parent_id = 0;""".format(title))
            res = cur.fetchall()
            if not res:
                m.append(False) #can't find a page with this name in the archive table; assume it's still around. will be some noise
            else:
                deleted_after = False
                for r in res:
                    creation_date = ts_to_dt(r[0].decode("utf8"))
                    date_diff = time_interval(review_date, creation_date)
                    if 0 <= date_diff <= 60:
                        deleted_after = True
                m.append(deleted_after)
        except Exception:
            print(m) #note the problem row and keep going
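
The last TODO item asks whether the two methods' counts match. A minimal sketch of such a check, assuming the two loops' flags were collected into parallel lists old_flags and new_flags (hypothetical names; the code above mutates marked_for_deletion in place rather than building these lists):

#hypothetical parallel flag lists, one boolean per reviewed page
old_flags = [True, False, True] #placeholder: archive-based flags
new_flags = [True, True, True]  #placeholder: logging-based flags
agreement = sum(a == b for a, b in zip(old_flags, new_flags)) / len(new_flags)
print("methods agree on {:.1%} of pages".format(agreement))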