Users in Wikipedia AND/OR Wikidata

%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt  
import matplotlib
import numpy as np
import urllib
from scipy.stats import pearsonr
import os
import seaborn as sns

Reading input data files

# Column names applied to both user tables. Both exports come from a
# "SELECT *" on the respective wiki's `user` table (queries noted below).
USER_COLUMNS = ['user_id','user_name','user_real_name','user_password','user_newpassword','user_email','user_options','user_touched','user_token','user_email_authenticated','user_email_token','user_email_token_expires','user_registration','user_newpass_time','user_editcount','user_password_exp']
DATA_DIR = '/Users/sarasua/Documents/Research/enwiki-wikidata'


def load_user_table(filename):
    """Read one tab-separated user export from DATA_DIR into a DataFrame.

    Parameters
    ----------
    filename : str
        File name of the export, relative to DATA_DIR.

    Returns
    -------
    pandas.DataFrame
        One row per account, with the columns named as in USER_COLUMNS.
    """
    path = os.path.join(DATA_DIR, filename)
    # Peek at the first field to see whether the export starts with a header
    # row. Without this check pandas always treats line 1 as the header, so a
    # header-less export silently loses its first record.
    # NOTE(review): the Wikidata frame appears to be missing its top account,
    # which is what prompted this check - confirm against the raw files.
    with open(path, encoding='utf-8-sig') as handle:
        first_field = handle.readline().split('\t', 1)[0].strip().strip('"')
    header_row = 0 if first_field == USER_COLUMNS[0] else None
    # low_memory=False lets pandas infer each dtype from the whole column,
    # which avoids the mixed-type DtypeWarning raised by chunked parsing.
    return pd.read_csv(path, delimiter="\t", header=header_row,
                       names=USER_COLUMNS, low_memory=False)


# obtained with query "SELECT * FROM enwiki_p.user ORDER BY user_editcount DESC" at WM Tool Forge (as of 23.02.2019)
enwiki_users = load_user_table('enwiki_users.tsv')
# obtained with query "SELECT * FROM wikidatawiki_p.user ORDER BY user_editcount DESC" at WM Tool Forge (as of 23.02.2019)
wikidatawiki_users = load_user_table('wikidata_users.tsv')
/Users/sarasua/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
# One-column frames holding just the account names of each wiki.
uwikipedia = pd.DataFrame(enwiki_users['user_name'])
uwikidata = pd.DataFrame(wikidatawiki_users['user_name'])

# Inner join on the single shared column: names present in both wikis.
# NOTE(review): a merge emits one row per matching pair of rows, so duplicate
# or missing (NaN) names on either side can inflate this count compared with
# an intersection of the distinct names - confirm on the data.
intersection = uwikipedia.merge(uwikidata, how='inner', on='user_name')
print(intersection.shape[0])
2508285
intersection.head()
user_name
0 AxelBoldt
1 Magnus Manske
2 Kpjas
3 General Wesc
4 Jimbo Wales
print(len(uwikipedia))
35887706
print(len(uwikidata))
3109705
# Distinct user names per wiki (building a set drops repeated names).
a = set(uwikipedia['user_name'])
b = set(uwikidata['user_name'])
# Report only the sizes: dumping the full set (tens of millions of names)
# exceeds the notebook's IOPub data rate limit and produces no usable output.
print(len(a), len(b))
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
# Names that occur in both wikis, computed on the distinct-name sets.
intersect = a & b
# NOTE(review): this can be lower than a row-level inner merge of the same
# columns, presumably because sets collapse duplicate names - to be confirmed.
print(len(intersect))
2508268
# Sanity check: a well-known account name should be present in both name sets.
probe_name = 'Jimbo Wales'
membership_checks = [
    (a, "Jimbo found in Wikipedia"),
    (b, "Jimbo found in Wikidata"),
]
for name_set, found_message in membership_checks:
    if probe_name in name_set:
        print(found_message)
Jimbo found in Wikipedia
Jimbo found in Wikidata
uwikipedia[uwikipedia['user_name']=="Jimbo Wales"]
user_name
22 Jimbo Wales
uwikidata[uwikidata['user_name']=="Jimbo Wales"]
user_name
125729 Jimbo Wales

Venn Diagrams

# https://pypi.org/project/matplotlib-venn/
!pip install matplotlib_venn
Requirement already satisfied: matplotlib_venn in /Users/sarasua/anaconda3/lib/python3.6/site-packages (0.11.5)
Requirement already satisfied: matplotlib in /Users/sarasua/anaconda3/lib/python3.6/site-packages (from matplotlib_venn) (2.1.0)
Requirement already satisfied: numpy in /Users/sarasua/anaconda3/lib/python3.6/site-packages (from matplotlib_venn) (1.13.3)
Requirement already satisfied: scipy in /Users/sarasua/anaconda3/lib/python3.6/site-packages (from matplotlib_venn) (0.19.1)
Requirement already satisfied: six>=1.10 in /Users/sarasua/anaconda3/lib/python3.6/site-packages (from matplotlib->matplotlib_venn) (1.11.0)
Requirement already satisfied: python-dateutil>=2.0 in /Users/sarasua/anaconda3/lib/python3.6/site-packages (from matplotlib->matplotlib_venn) (2.6.1)
Requirement already satisfied: pytz in /Users/sarasua/anaconda3/lib/python3.6/site-packages (from matplotlib->matplotlib_venn) (2017.2)
Requirement already satisfied: cycler>=0.10 in /Users/sarasua/anaconda3/lib/python3.6/site-packages (from matplotlib->matplotlib_venn) (0.10.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /Users/sarasua/anaconda3/lib/python3.6/site-packages (from matplotlib->matplotlib_venn) (2.2.0)
You are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
# Two-set Venn diagram of the Wikipedia (first) and Wikidata (second) name sets.
v = venn2([a, b])

# '11' is the region shared by both sets; its label shows the overlap count.
label = v.get_label_by_id('11')
label.set_fontsize(22)
label.set_family('serif')
# Shift the overlap count slightly to the right of its default position.
label.set_x(label.get_position()[0] + 0.1)

# 'A' and 'B' are the captions of the first and second set.
for set_id, caption in (('A', 'Wikipedia users'), ('B', 'Wikidata users')):
    v.get_label_by_id(set_id).set_text(caption)
    v.get_label_by_id(set_id).set_fontsize(22)

plt.show()
 
 

Final Visualization

def drawDiagram(enwiki_df,wikidatawiki_df,move_numbers):
    """Show a two-set Venn diagram of the user names found in each wiki.

    Parameters
    ----------
    enwiki_df : pandas.DataFrame
        Wikipedia accounts; only the ``user_name`` column is read.
    wikidatawiki_df : pandas.DataFrame
        Wikidata accounts; only the ``user_name`` column is read.
    move_numbers : bool
        When true, move the three count labels to fixed positions (left-only
        count to x=-0.55, overlap count to y=0.55, right-only count to
        x=0.85) so they stay readable when the regions are very unequal.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # venn2 takes the two sets directly; no intermediate one-column frames or
    # precomputed intersection are needed.
    a = set(enwiki_df['user_name'])
    b = set(wikidatawiki_df['user_name'])

    v = venn2([a, b])

    # 'A' / 'B' are the captions of the first / second set.
    v.get_label_by_id('A').set_text('Wikipedia users')
    v.get_label_by_id('B').set_text('Wikidata users')

    if move_numbers:
        # venn2 region ids: '10' = first set only, '11' = both sets,
        # '01' = second set only.
        label_moves = (
            ('10', 'set_x', -0.55),
            ('11', 'set_y', 0.55),
            ('01', 'set_x', 0.85),
        )
        for region_id, setter_name, coordinate in label_moves:
            region_label = v.get_label_by_id(region_id)
            # An empty region has no label object; skip it instead of
            # failing with AttributeError on None.
            if region_label is not None:
                getattr(region_label, setter_name)(coordinate)

    plt.show()
    
import seaborn as sns
# Distribution of Wikipedia edit counts: histogram only (kde=False disables
# the density curve).
sns.distplot(enwiki_users['user_editcount'],kde=False);
# Switch seaborn to the "whitegrid" style.
# NOTE(review): this is set after the histogram call, so it presumably only
# affects what is drawn afterwards - confirm.
sns.set(style="whitegrid")
# Box plot of the same edit counts; the returned axes is kept in `ax`.
ax = sns.boxplot(x=enwiki_users['user_editcount'],palette="Blues")
/Users/sarasua/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py:462: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  box_data = remove_na(group_data)
enwiki_users['user_editcount'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x119662d30>
# Distribution of Wikidata edit counts: histogram only (kde=False disables
# the density curve).
sns.distplot(wikidatawiki_users['user_editcount'],kde=False);
# Switch seaborn to the "whitegrid" style.
# NOTE(review): this is set after the histogram call, so it presumably only
# affects what is drawn afterwards - confirm.
sns.set(style="whitegrid")
# Box plot of the same edit counts; the returned axes is kept in `ax`.
ax = sns.boxplot(x=wikidatawiki_users['user_editcount'],palette="Blues")
/Users/sarasua/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py:462: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  box_data = remove_na(group_data)
wikidatawiki_users['user_editcount'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1186e55f8>
 
# Venn diagram over every account of both wikis (count labels repositioned).
drawDiagram(enwiki_users, wikidatawiki_users, True)

# Venn diagram restricted to the heaviest editors: accounts with at least
# one million edits on the respective wiki.
min_edits = 1000000
top_enwiki = enwiki_users.loc[enwiki_users['user_editcount'] >= min_edits]
top_wikidata = wikidatawiki_users.loc[wikidatawiki_users['user_editcount'] >= min_edits]

drawDiagram(top_enwiki, top_wikidata, False)
# Better to select TOPK in each and compare
# Sort by edit count in DESCENDING order so that head() really returns the
# most active accounts. sort_values is ascending by default, which would make
# head(1000) pick the 1000 accounts with the fewest edits.
enwiki_users_sortededit = enwiki_users.sort_values(by=['user_editcount'], ascending=False)
wikidatawiki_users_sortededit = wikidatawiki_users.sort_values(by=['user_editcount'], ascending=False)

# Top 1000 editors of each wiki, compared by user name.
top_wikipedia = enwiki_users_sortededit.head(1000)
top_wikidata = wikidatawiki_users_sortededit.head(1000)
drawDiagram(top_wikipedia, top_wikidata, False)
# Prepare set Bots and Humans and call drawDiagram

# read the list of bots in Wikidata - but I don't have such a list for Wikipedia, so as a rough estimate check whether the user name contains 'bot'
# bots = pd.read_csv('/Users/sarasua/Documents/Research/enwiki-wikidata/bots_marisa2018.csv')
# df.loc[df['column_name'].isin(some_values)]
import re
def isBot(row):
    """Heuristically flag an account as a bot from its user name.

    Parameters
    ----------
    row : object
        Anything exposing a ``user_name`` attribute, e.g. a DataFrame row
        passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    bool
        True when the user name contains "bot" in any letter case
        (so "Addbot", "KrBot" and "BotNinja" all match), False otherwise.
    """
    # TODO: better to get the real lists of bots and intersect - this
    # substring test also matches names like "Lobotomia", which is not a bot.
    # str() keeps non-string names (e.g. NaN for a missing name) from raising;
    # lower() makes the match case-insensitive.
    return "bot" in str(row.user_name).lower()
#bots_wikipedia = enwiki_users.apply(isBot,axis=1)
#enwiki_users_b = pd.concat([enwiki_users,bots_wikipedia],axis=1)
#enwiki_users_b.columns = ['user_id','user_name','user_real_name','user_password','user_newpassword','user_email','user_options','user_touched','user_token','user_email_authenticated','user_email_token','user_email_token_expires','user_registration','user_newpass_time','user_editcount','user_password_exp','is_bot']

#humans_wikipedia = enwiki_users_b[enwiki_users_b['is_bot']== False]
#bots_wikipedia = enwiki_users_b[enwiki_users_b['is_bot']==True]


#bots_wikidata = wikidatawiki_users.apply(isBot,axis=1)
#wikidatawiki_users_b = pd.concat([wikidatawiki_users,bots_wikidata],axis=1)
#wikidatawiki_users_b.columns = ['user_id','user_name','user_real_name','user_password','user_newpassword','user_email','user_options','user_touched','user_token','user_email_authenticated','user_email_token','user_email_token_expires','user_registration','user_newpass_time','user_editcount','user_password_exp','is_bot']

#humans_wikidata = wikidatawiki_users_b[wikidatawiki_users_b['is_bot']== False]
#bots_wikidata = wikidatawiki_users_b[wikidatawiki_users_b['is_bot']==True]
# Bot account lists, one tab-separated file per wiki; the bot user names are
# read from each file's 'Name' column.
bot_ids_wikipedia = pd.read_csv('/Users/sarasua/Documents/Research/enwiki-wikidata/bot_ids_wikipedia.tsv',delimiter="\t")
bot_ids_wikidata = pd.read_csv('/Users/sarasua/Documents/Research/enwiki-wikidata/bot_ids_wikidata.tsv',delimiter="\t")

# Wikipedia: flag accounts whose name is on the bot list, then split on the
# flag. Every account not on the list is treated as human.
wikipedia_is_bot = enwiki_users['user_name'].isin(bot_ids_wikipedia['Name'])
bots_wikipedia = enwiki_users.loc[wikipedia_is_bot]
humans_wikipedia = enwiki_users.loc[~wikipedia_is_bot]

# Wikidata: same split against the Wikidata bot list.
wikidata_is_bot = wikidatawiki_users['user_name'].isin(bot_ids_wikidata['Name'])
bots_wikidata = wikidatawiki_users.loc[wikidata_is_bot]
humans_wikidata = wikidatawiki_users.loc[~wikidata_is_bot]
bot_ids_wikidata['Name'].head()
0              SuccuBot
1           Edoderoobot
2    QuickStatementsBot
3             Emijrpbot
4                 KrBot
Name: Name, dtype: object
wikidatawiki_users[wikidatawiki_users['user_name']=="SuccuBot"]
user_id user_name user_real_name user_password user_newpassword user_email user_options user_touched user_token user_email_authenticated user_email_token user_email_token_expires user_registration user_newpass_time user_editcount user_password_exp
drawDiagram(humans_wikipedia, humans_wikidata, True)
print(len(bots_wikidata))
274
print(len(humans_wikidata))
3109431
 
drawDiagram(bots_wikipedia, bots_wikidata, True)
bots_wikidata.head()
user_id user_name user_real_name user_password user_newpassword user_email user_options user_touched user_token user_email_authenticated user_email_token user_email_token_expires user_registration user_newpass_time user_editcount user_password_exp
0 1554155 Edoderoobot NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20141224085828 NaN 40585180 NaN
1 2769139 QuickStatementsBot NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20161130140435 NaN 36074927 NaN
2 6811 Emijrpbot NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20121031211348 NaN 30621935 NaN
3 150965 KrBot NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20130310213208 NaN 28291672 NaN
4 1529466 BotNinja NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20141211203623 NaN 27244470 NaN
def exploreIntersection(enwiki_df, wikidatawiki_df):
    """Print the set of user names that occur in both data frames.

    Both arguments are DataFrames with a ``user_name`` column. Nothing is
    returned; the shared names are written to standard output as a set.
    """
    wikipedia_names = set(enwiki_df['user_name'])
    wikidata_names = set(wikidatawiki_df['user_name'])

    # Names registered under the same spelling on both wikis.
    print(wikipedia_names & wikidata_names)
exploreIntersection(bots_wikipedia,bots_wikidata)
{'Addbot', 'JVbot', 'Krdbot', 'RileyBot', 'JAnDbot', 'Maintenance script', 'VIAFbot', 'Dexbot', 'Pi bot', 'MediaWiki message delivery', 'JarBot', 'Emijrpbot', 'KasparBot', 'ListeriaBot', 'ProteinBoxBot', 'BotMultichillT', 'AudeBot', 'Cewbot', 'EranBot', 'MediaWiki default', 'Flow talk page manager', 'BotMultichill', 'KaldariBot', 'CommonsDelinker', 'JackieBot', 'Legobot', 'Hazard-Bot', 'Cyberbot I', 'CensusBot', 'Fluxbot', 'EmausBot', 'TohaomgBot', 'William Avery Bot', 'Chobot'}
enwiki_users[enwiki_users['user_name'] == "VIAFbot"]
user_id user_name user_real_name user_password user_newpassword user_email user_options user_touched user_token user_email_authenticated user_email_token user_email_token_expires user_registration user_newpass_time user_editcount user_password_exp
17095877 17215358 VIAFbot NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.012072e+13 NaN 254678 NaN
wikidatawiki_users[wikidatawiki_users['user_name'] == "VIAFbot"]
user_id user_name user_real_name user_password user_newpassword user_email user_options user_touched user_token user_email_authenticated user_email_token user_email_token_expires user_registration user_newpass_time user_editcount user_password_exp
60 39601 VIAFbot NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 20121128040408 NaN 2400743 NaN