import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np
import pymysql
from sqlalchemy import create_engine
%matplotlib inline
import requests
 
# Load the edit-count table and give its three columns explicit names.
# NOTE(review): assigning .columns overwrites whatever header read_csv
# inferred from the first line of the file — confirm the CSV has a header row.
data_editcount = pd.read_csv('quartile1_edits_21102018.csv')
data_editcount.columns=['user_name','automated_tool','edit_count']
# Preview the first rows.
data_editcount.head()
user_name automated_tool edit_count
0 187.252.28.97 f 1
1 Gustavo Santana f 3
2 61.100.196.106 f 1
3 150.176.165.5 f 1
4 Omnipotent Galaxy f 5
# Keep only the rows whose automated_tool flag is 't'.
data_t=data_editcount[data_editcount['automated_tool']=='t']
# Histogram of the numeric column(s) of that subset, with a log-scaled y-axis.
ax= data_t.plot.hist()
ax.set_yscale('log')
# Summary statistics of the subset.
data_t.describe()
edit_count
count 5.981000e+03
mean 1.528551e+04
std 2.080377e+05
min 1.000000e+00
25% 2.000000e+00
50% 1.100000e+01
75% 1.020000e+02
max 1.074339e+07
#data_t.quantile(0.1)

Building a table that maps each user to their type of user

def typeUser(row):
    """Placeholder for a row-wise user-type classifier; not implemented yet.

    The definition previously had no body, which is a syntax error. It is
    kept as an explicit stub so the name exists and fails loudly if called.

    Args:
        row: presumably one row of a DataFrame, as passed by
            ``DataFrame.apply(..., axis=1)`` — TODO confirm.

    Raises:
        NotImplementedError: always, until the classification logic is written.
    """
    raise NotImplementedError("typeUser has not been implemented yet")
# Number of rows per user_name in data_editcount.
sizes = data_editcount.groupby('user_name').size()
# head must be called: the bare attribute `sizes.head` only displays the
# bound method (and the whole Series repr) instead of the first rows.
sizes.head()
<bound method NDFrame.head of user_name
! Bikkit !                            1
!!Fabio de Mello.S!!                  1
!-ArtMaster 95                        1
!KrzysiekBu!                          1
!Silent                               1
!aoniug                               1
!dea4u                                1
!llu$!on!$t                           1
!xXx!MLG!PV$SE!M4S7R!xXx!             1
!болит                                1
$$$Marlon$$$30                        1
$1LENCE D00600D                       1
$200inaire                            1
$Charrow$                             1
$DATA                                 1
$andlo17                              1
$toic                                 1
$traight-$hoota                       2
%Pier%                                2
&quot;Coreytaylor&quot;               1
&quot;D&quot;                         1
&quot;The Phenomenal (Decay)&quot;    1
&quot;Wikipedia-Demokonto&quot;       1
'Inyan                                1
'heiM                                 1
(:Julien:)                            1
(Carlos Emanuel)                      1
(Peter James)                         1
(RT)                                  1
(boxed)                               1
                                     ..
헤스트                                   1
현대증권 시즌1                              1
형석                                    1
혜비니                                   1
호로조                                   1
홈살구                                   1
홍인제                                   1
화목한                                   2
환골탈태                                  1
황동기시대                                 1
황미영남준우                                1
황제숙종                                  1
황제펭귄                                  1
희조                                    1
(O.O)(o.o)(V.V)                       1
(董宗瑋)                                 1
103系西九条行き                             1
88を愛する男                               1
EZWARD                                1
Hsjs                                  1
KoZ                                   1
Ks                                    1
Takajo Y                              1
Tk2122z                               1
Tokumbig                              1
Tom                                   1
Vvvbw                                 1
¥156                                  1
🌈                                     1
😂                                     1
Length: 741081, dtype: int64>
# Turn the per-user sizes Series into a two-column frame:
# the index becomes 'user_name' and the values become 'count_typeedits'.
users_typeedits = sizes.rename_axis('user_name').reset_index(name='count_typeedits')
# Preview the first rows.
users_typeedits.head()
user_name count_typeedits
0 ! Bikkit ! 1
1 !!Fabio de Mello.S!! 1
2 !-ArtMaster 95 1
3 !KrzysiekBu! 1
4 !Silent 1
# Users that have more than one row in data_editcount.
# NOTE(review): the original indexed with the misspelled column
# 'county_typeedits' (a KeyError) and had no comparison; the `> 1`
# threshold is an assumption about the intended filter — confirm.
users_typeedits[users_typeedits['count_typeedits'] > 1]
 
# Load the two quality-dimension count tables (one CSV per bot group).
dataRegBots = pd.read_csv('qualitydims_regBots_OK24092018.csv')
dataNonRegBots = pd.read_csv('qualitydims_nonRegBots_OK24092018.csv')
# Preview the second table.
dataNonRegBots.head()
qualitydim-ugroup count
0 Completeness 2026
1 Accuracy 175
2 Reputation 5588
3 Ease Understanding 22262
4 Interlinking 12844
# Sort both tables by the quality-dimension column.
dataRegBots2 = dataRegBots.sort_values('qualitydim-ugroup')
dataNonRegBots2 = dataNonRegBots.sort_values('qualitydim-ugroup')
# Bar chart of 'count' per quality dimension, log-scaled y-axis.
dataRegBots2.plot.bar(
    x='qualitydim-ugroup',
    y='count',
    logy=True,
    title='Number of edits per quality dimension (Edits done by bots who requested permission)',
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f2efdbea8d0>
# Bar chart of 'count' per quality dimension (log-scaled y-axis) for the bots
# that did not request permission. The title's closing parenthesis was missing.
dataNonRegBots2.plot(kind='bar',x='qualitydim-ugroup', y='count',logy=True, title='Number of edits per quality dimension (Edits done by bots who did not request permission)')
<matplotlib.axes._subplots.AxesSubplot at 0x7f2efbb16d30>
def plotQD(fileName, titlePlot):
    """Draw a log-scale bar chart of column ``c`` per quality dimension ``q``.

    Args:
        fileName: Path of a CSV file with a ``q`` column (quality dimension,
            given as a code 1-5 or already as a label) and a ``c`` column
            (the value drawn on the y-axis).
        titlePlot: Title shown above the chart.
    """
    dimension_labels = {
        1: "Accuracy",
        2: "Completeness",
        3: "Ease Understanding",
        4: "Interlinking",
        5: "Reputation",
    }
    data = pd.read_csv(fileName)

    # Translate the codes in the label column only. Replacing across the
    # whole frame would also turn any value 1-5 in ``c`` into a string and
    # break the numeric, log-scaled y-axis.
    data['q'] = data['q'].replace(dimension_labels)

    data.plot(kind='bar', x='q', y='c', logy=True, title=titlePlot)
# One chart per input file, drawn in the order listed.
for csv_path, chart_title in (
    ('qualitydims_botsFlag.csv', 'Number of edits per quality dimension - Bots Permission & Flag'),
    ('qualitydims_botsFlagItems.csv', 'Number of edits per quality dimension - (On Items) Bots Permission & Flag'),
    ('qualitydims_botsNoFlag.csv', 'Number of edits per quality dimension - Bots Permission & No Flag'),
    ('qualitydims_botsNoFlagItems.csv', 'Number of edits per quality dimension - (On Items) Bots Permission & No Flag'),
):
    plotQD(csv_path, chart_title)
# Load the edit counts per (ugroup, username, automated_tool) and name the
# four columns explicitly.
# NOTE(review): assigning .columns overwrites whatever header read_csv
# inferred from the first line of the file — confirm the CSV has a header row.
data_humans = pd.read_csv('countedits_ugroupusernameautomated0512.csv')
data_humans.columns=['ugroup', 'username', 'automated_tool','count']
# Preview the first rows.
data_humans.head()
ugroup username automated_tool count
0 0 ° f 1868
1 0 ° t 4
2 0 -- f 231
3 0 -- t 2
4 0 -- -- -- f 2148
# Subset of data_humans whose automated_tool flag is 't'.
dT= data_humans[data_humans['automated_tool'] == 't']
# Summary statistics of the 'count' column for that subset.
dT['count'].describe()
count    5.981000e+03
mean     1.528551e+04
std      2.080377e+05
min      1.000000e+00
25%      2.000000e+00
50%      1.100000e+01
75%      1.020000e+02
max      1.074339e+07
Name: count, dtype: float64

Select these within the `t` subset (rows where automated_tool == 't')

def ugroupupdate(row):
    """Overwrite ``row['ugroup']`` based on ``row['count']`` and return the row.

    The group becomes 5 when the count is greater than 2, otherwise 4.
    The row is modified in place and the same object is returned, which is
    the shape ``DataFrame.apply(..., axis=1)`` expects from a row function.
    """
    row['ugroup'] = 5 if row['count'] > 2 else 4
    return row
# Run ugroupupdate on every row of dT (axis=1) and keep the result as dTupdated.
dTupdated = dT.apply(ugroupupdate,axis=1)
# Preview the first rows.
dTupdated.head()
ugroup username automated_tool count
1 5 ° t 4
3 4 -- t 2
5 5 -- -- -- t 141
81 5 아턴 t 9
169 5 カビル t 14
 
# How many rows ended up in each ugroup value.
dTupdated['ugroup'].value_counts()
5    4421
4    1560
Name: ugroup, dtype: int64
 
 
 
 
# TODO(review): unfinished filter on dTupdated — the inner selector was left
# empty, which is a SyntaxError. Disabled until the intended condition is known.
# dTupdated[dTupdated[]]
# Rows of dT whose count is greater than 2.
s = dT[dT['count']>2]
# Preview the first rows.
s.head()
ugroup username automated_tool count
1 0 ° t 4
5 0 -- -- -- t 141
81 0 아턴 t 9
169 0 カビル t 14
178 0 トトト t 4
# NOTE(review): bare attribute access — `apply` is never called here, so this
# line computes nothing; it looks like an unfinished cell.
s.apply