# (forked from https://paws.wmflabs.org/paws/user/HaeB/notebooks/Mobile%20app%20edits%20on%20Wikidata.ipynb because of https://phabricator.wikimedia.org/T163157)
# cf. http://paws-public.wmflabs.org/paws-public/User:Jtmorgan/ds4ux/paws-cheatsheet.ipynb
import os 
"""
Your db login credentials are stored in os.environ. 
DO NOT print or run os.environ, or it will expose your credentials in the Notebook
"""
import pymysql
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import collections

# cf. http://paws-public.wmflabs.org/paws-public/User:EpochFail/editquality/ipython/reverted_detection_demo.ipynb :
import sys, traceback
import mwreverts.api
import mwapi

import re

Calculate daily rate of app description edits for Oct-Dec 2017

# repeat from below
%%time
!date

#get all app edits

conn2 = pymysql.connect(
    host=os.environ['MYSQL_HOST'],
    user=os.environ['MYSQL_USERNAME'],
    password=os.environ['MYSQL_PASSWORD'],
    database='wikidatawiki_p',
    charset='utf8'
)

with conn2.cursor() as cur:
    cur.execute("""
    SELECT rev_id, rev_timestamp, rev_comment, rev_user_text FROM wikidatawiki_p.revision AS rev
    JOIN wikidatawiki_p.change_tag AS ct
    ON rev.rev_id = ct.ct_rev_id AND ct.ct_tag = "mobile app edit"
    ORDER BY rev_timestamp;""")
    appedits = cur.fetchall()
Tue Jan 16 14:17:35 UTC 2018
CPU times: user 1.71 s, sys: 112 ms, total: 1.82 s
Wall time: 40.9 s
#check result
print(len(appedits))
print(appedits[0])
print(appedits[-1])
print(appedits[1200:1209])
123028
(181261717, b'20141215123456', b'/* wbsetdescription-add:1|en */ A town in Minnesota, USA', b'208.54.5.139')
(621051068, b'20180116141412', b'/* wbsetdescription-add:1|zh-hant */ sex', b'45.64.241.106')
((420165760, b'20161225054632', b'/* wbsetdescription-set:1|en */ theorem that describes the structure of certain types of field extensions', b'HaeB'), (423257674, b'20170101202935', b'/* wbsetdescription-set:1|en */ Thissssss is a sandbox for testing changes to items. Please be gentle with it. Feel free to change anything on this page! For testing links, try adding ones to userpages.', b'Alva1234567890'), (423259471, b'20170101203544', b'/* wbsetdescription-set:1|en */ This is a sandbox for testing changes to items. Please be gentle with it. Feel free to change anything on this page! For testing links, try adding ones to userpages.', b'Alva1234567890'), (423889596, b'20170103205551', b'/* wbsetdescription-set:1|en */ species of fungus', b'74.111.100.186'), (423890267, b'20170103205700', b'/* wbsetdescription-set:1|en */ species of edible fungus', b'DBrant (WMF)'), (423891075, b'20170103205822', b'/* wbsetdescription-set:1|en */ species of edible mushrooms', b'DBrant (WMF)'), (423908849, b'20170103213655', b'/* wbsetdescription-set:1|en */ genus of extinct mammals', b'DBrant (WMF)'), (423926260, b'20170103221609', b'/* wbsetdescription-set:1|en */ extinct genus of mammals', b'68.40.255.173'), (424077989, b'20170104035752', b'/* wbsetdescription-add:1|en */ sex', b'41.113.247.183'))
daterange =  pd.date_range(start='2017-10-01', end='2017-12-31', freq='D')
%%time
# calculate daily edit rate per language for a range of dates
!date

# initialize dataframes to count reverts per day:
columns = ['all']  # refers to the number of edits


try:
    del(appeditsperday)
except NameError:
    pass
appeditsperday = pd.DataFrame(index=daterange, columns=columns)
appeditsperday = appeditsperday.fillna(0)

try:
    del(appeditsperdaylang)
except NameError:
    pass
appeditsperdaylang = pd.DataFrame(index=daterange, columns=columns)

for col in appeditsperdaylang:
    appeditsperdaylang[col] = [defaultdict(int) for i in range(len(appeditsperdaylang.index))]

try:
    del(appeditsperaction)
except NameError:
    pass
appeditsperaction = defaultdict(int)


# count edits per day and language:
for rev in appedits:
    day = pd.to_datetime(rev[1][:8].decode())
    
    if day in daterange:
        
        appeditsperday.loc[day,'all'] += 1
        
        if rev[2] != None: # this can occur for oversighted edits, 
                        # e.g. https://www.wikidata.org/wiki/Special:Contributions/105.67.0.113
          summaryparse = re.match(r'/\* wbsetdescription-([^\:]*)\:1\|([^\ ]*) ', rev[2].decode())
        
          lang = summaryparse.group(2) # two- or three-letter language code
          appeditsperdaylang.loc[day,'all'][lang] +=1
        
          act = summaryparse.group(1) # add, set or remove
          appeditsperaction[act] += 1
        
        
 
Tue Jan 16 14:50:43 UTC 2018
CPU times: user 29.4 s, sys: 48 ms, total: 29.5 s
Wall time: 30 s
#check result
appeditsperdaylang
all
2017-10-01 {'es': 63, 'pt': 54, 'sq': 2, 'ru': 17, 'zh-ha...
2017-10-02 {'ru': 55, 'pt': 34, 'es': 87, 'fr': 24, 'ml':...
2017-10-03 {'de': 186, 'es': 97, 'zh-hans': 69, 'pt': 45,...
2017-10-04 {'es': 79, 'fa': 21, 'id': 8, 'zh-hans': 3, 'p...
2017-10-05 {'es': 77, 'fr': 12, 'ko': 10, 'pt': 26, 'ru':...
2017-10-06 {'ru': 37, 'es': 64, 'fr': 16, 'sv': 15, 'az':...
2017-10-07 {'ar': 44, 'pt': 21, 'hi': 10, 'es': 41, 'he':...
2017-10-08 {'gn': 1, 'da': 2, 'pt': 23, 'ru': 20, 'es': 7...
2017-10-09 {'ru': 24, 'pt': 16, 'ps': 5, 'es': 57, 'ar': ...
2017-10-10 {'ja': 1, 'es': 58, 'ar': 23, 'id': 5, 'pt': 1...
2017-10-11 {'vi': 3, 'es': 69, 'fi': 12, 'pt': 12, 'bn': ...
2017-10-12 {'de': 136, 'pt': 28, 'id': 10, 'es': 75, 'ar'...
2017-10-13 {'pt': 30, 'es': 49, 'tr': 1, 'ml': 1, 'it': 5...
2017-10-14 {'fr': 23, 'hi': 3, 'es': 53, 'it': 121, 'el':...
2017-10-15 {'it': 49, 'es': 68, 'id': 6, 'pt': 17, 'zh': ...
2017-10-16 {'es': 56, 'fr': 13, 'fa': 28, 'ru': 23, 'kk':...
2017-10-17 {'pt': 23, 'id': 10, 'es': 80, 'it': 25, 'tr':...
2017-10-18 {'es': 74, 'pt': 16, 'fr': 18, 'id': 6, 'ar': ...
2017-10-19 {'de': 24, 'it': 25, 'ru': 20, 'ar': 42, 'sv':...
2017-10-20 {'es': 66, 'fr': 10, 'pt': 12, 'he': 21, 'de':...
2017-10-21 {'he': 23, 'es': 38, 'pt': 63, 'ko': 3, 'fr': ...
2017-10-22 {'es': 84, 'ar': 36, 'fr': 20, 'fa': 39, 'pt':...
2017-10-23 {'es': 62, 'fr': 17, 'id': 6, 'pt': 22, 'zh': ...
2017-10-24 {'ar': 35, 'nl': 8, 'fa': 21, 'pt': 17, 'es': ...
2017-10-25 {'es': 41, 'id': 83, 'pt': 48, 'ta': 3, 'kn': ...
2017-10-26 {'id': 26, 'es': 58, 'pt': 8, 'ru': 12, 'fa': ...
2017-10-27 {'bn': 3, 'pt': 23, 'ar': 31, 'kn': 1, 'ru': 2...
2017-10-28 {'es': 89, 'tr': 3, 'ar': 41, 'de': 123, 'ps':...
2017-10-29 {'ar': 23, 'ru': 29, 'pt': 30, 'es': 82, 'zh-h...
2017-10-30 {'de': 102, 'es': 68, 'ar': 37, 'ru': 48, 'nl'...
... ...
2017-12-02 {'ar': 28, 'es': 48, 'id': 2, 'it': 33, 'ja': ...
2017-12-03 {'he': 10, 'de': 63, 'nl': 9, 'es': 50, 'id': ...
2017-12-04 {'ru': 31, 'la': 1, 'pt': 25, 'es': 61, 'fa': ...
2017-12-05 {'id': 7, 'pt': 30, 'es': 106, 'fa': 19, 'ru':...
2017-12-06 {'es': 53, 'ar': 25, 'pt': 15, 'ru': 155, 'am'...
2017-12-07 {'ru': 31, 'fr': 28, 'de': 27, 'pl': 12, 'fi':...
2017-12-08 {'es': 58, 'cs': 5, 'no': 1, 'fr': 22, 'ta': 4...
2017-12-09 {'pt': 29, 'ru': 32, 'zh-hans': 3, 'hi': 13, '...
2017-12-10 {'es': 47, 'de': 182, 'it': 27, 'el': 2, 'pt':...
2017-12-11 {'pl': 30, 'es': 45, 'de': 80, 'eo': 1, 'it': ...
2017-12-12 {'fr': 16, 'ar': 29, 'es': 45, 'pt': 18, 'ru':...
2017-12-13 {'ar': 16, 'es': 58, 'tr': 1, 'de': 23, 'fr': ...
2017-12-14 {'es': 76, 'pt': 11, 'ar': 11, 'hr': 2, 'sv': ...
2017-12-15 {'de': 46, 'es': 39, 'ps': 1, 'it': 74, 'ur': ...
2017-12-16 {'es': 47, 'ja': 22, 'fa': 6, 'nl': 10, 'ru': ...
2017-12-17 {'it': 61, 'pl': 11, 'nl': 7, 'sv': 13, 'de': ...
2017-12-18 {'es': 48, 'pt': 24, 'id': 5, 'de': 29, 'it': ...
2017-12-19 {'it': 22, 'es': 43, 'fr': 13, 'fa': 18, 'uk':...
2017-12-20 {'ar': 25, 'fr': 15, 'ja': 8, 'es': 44, 'it': ...
2017-12-21 {'bo': 1, 'es': 41, 'bn': 4, 'de': 35, 'id': 2...
2017-12-22 {'it': 57, 'pt': 28, 'es': 27, 'bn': 2, 'nl': ...
2017-12-23 {'fa': 22, 'ar': 42, 'es': 44, 'zh-hans': 4, '...
2017-12-24 {'es': 63, 'fr': 33, 'pt': 11, 'ar': 44, 'id':...
2017-12-25 {'es': 51, 'bn': 5, 'de': 63, 'fa': 25, 'fr': ...
2017-12-26 {'ml': 123, 'sv': 115, 'pt': 14, 'fi': 66, 'ru...
2017-12-27 {'ro': 1, 'de': 52, 'bg': 1, 'it': 22, 'fr': 4...
2017-12-28 {'es': 40, 'ro': 2, 'fi': 5, 'ar': 26, 'fa': 1...
2017-12-29 {'ru': 17, 'es': 60, 'ar': 32, 'pt': 31, 'de':...
2017-12-30 {'pt': 94, 'ar': 20, 'it': 40, 'de': 64, 'ru':...
2017-12-31 {'it': 34, 'de': 100, 'nl': 8, 'fr': 69, 'pt':...

92 rows × 1 columns

appeditsperaction
defaultdict(int, {'add': 30801, 'remove': 151, 'set': 13596})
# save result just in case
appeditsperdaylang.to_pickle('alldescrevperday_2017-10-01..2017-12-31.pickle')
%%time
# For comparison, calculate rate for all "manual" description edits (including non-app edits, excluding bots and OAuth tools)

!date 

# get all (non-bot, non-OAuth) description edits for a period of time
    
# excludes:
# edits that change description together with other fields 
#     (e.g. https://www.wikidata.org/w/index.php?diff=468744965:  "wbsetlabeldescriptionaliases")
# edits from "autoEdit Update Descriptions" 
#     (e.g. https://www.wikidata.org/w/index.php?diff=468745165 : "wbeditentity-update")
# bot edits like https://www.wikidata.org/w/index.php?diff=468757331by by Emijrpbot 
#     or https://www.wikidata.org/w/index.php?diff=468758634 by Mr.Ibrahembot 
#     (these use "wbeditentity-update" instead of "wbsetdescription")
# "#quickstatements" edits (e.g. https://www.wikidata.org/w/index.php?diff=468756428 , 
#      or https://www.wikidata.org/w/index.php?diff=468758591 by Poulpybot)
#      looks like these may have a tag anyway ((Tag: Widar [1.4]" or "Tag: QuickStatements [1.1]")
#      but check edit summary too)
# edits made using an OAuth tool such as:
#    reCH (e.g. https://www.wikidata.org/w/index.php?limit=50&title=Special%3AContributions&contribs=user&target=%D4%B1%D5%B7%D5%A2%D5%B8%D5%BF&namespace=&tagfilter=&year=2017&month=2 
#    distributed Wikidata game and other Widar edits 
#       (e.g https://www.wikidata.org/w/index.php?diff=457815520 :
#       "The Distributed Game (23): Kaspar's Persondata game: Descriptions #distributed-game")
# edits from users whose names end in -bot or -Bot (crude bot check)

# should exclude: 
# bot edits in general (use bot flag from RC table?)


conn3 = pymysql.connect(
    host=os.environ['MYSQL_HOST'],
    user=os.environ['MYSQL_USERNAME'],
    password=os.environ['MYSQL_PASSWORD'],
    database='wikidatawiki_p',
    charset='utf8')

with conn3.cursor() as cur:
    cur.execute("""
    SELECT rev_id, rev_timestamp, rev_comment, rev_user_text, ct_tag
    FROM (
        SELECT rev_id, rev_timestamp, rev_comment, rev_user_text
        FROM wikidatawiki_p.revision 
        WHERE rev_timestamp >= 20171001000000
        AND rev_comment LIKE '/* wbsetdescription%'
        AND rev_comment NOT LIKE '%#quickstatements') AS rev
    LEFT JOIN (
        SELECT ct_rev_id, ct_tag FROM wikidatawiki_p.change_tag
        WHERE ct_tag LIKE 'OAuth%'
        OR ct_tag LIKE 'QuickStatements%') AS ct
    ON rev.rev_id = ct.ct_rev_id  
    GROUP BY rev_id
    HAVING ct_tag IS NULL
    AND rev_user_text NOT RLIKE '[Bb]ot$'
    ORDER BY rev_timestamp;""")
    alldescedits = cur.fetchall()

    # NB: The change_tag table can contain several rows for the same edit 
Tue Jan 16 14:55:40 UTC 2018
CPU times: user 14.6 s, sys: 1.23 s, total: 15.8 s
Wall time: 30min 38s
#check result
print(len(alldescedits))
print(alldescedits[0])
print(alldescedits[-1])
print(alldescedits[0:10])
639397
(570140478, b'20171001000004', b'/* wbsetdescription-set:1|en */ Camilo Carrasco', b'152.201.13.20', None)
(621061680, b'20180116145519', b'/* wbsetdescription-add:1|en */ extinct genus of Triassic archosaur within the family Erpetosuchidae', b'Dhx1', None)
((570140478, b'20171001000004', b'/* wbsetdescription-set:1|en */ Camilo Carrasco', b'152.201.13.20', None), (570140642, b'20171001000036', b'/* wbsetdescription-add:1|es */ Historia de Colombia', b'177.252.247.80', None), (570140905, b'20171001000127', b'/* wbsetdescription-add:1|pl */ rze\xc5\xbaby we Wroc\xc5\x82awiu', b'Yarl', None), (570141773, b'20171001000450', b'/* wbsetdescription-set:1|pt */ Primeira letra do Alfabeto Latino.', b'191.188.86.211', None), (570142191, b'20171001000609', b'/* wbsetdescription-add:1|en */ justice', b'Nikkimaria', None), (570143523, b'20171001001010', b'/* wbsetdescription-set:1|en */ poeta ecuatoriano', b'186.178.181.51', None), (570144324, b'20171001001236', b'/* wbsetdescription-add:1|ceb */ usa ka AM estasyon sa Mareco Broadcasting Network', b'DYUA-AM', None), (570144997, b'20171001001443', b'/* wbsetdescription-add:1|pl */ szko\xc5\x82a w Legnicy', b'Yarl', None), (570145001, b'20171001001444', b'/* wbsetdescription-remove:1|de */ Schloss in Polen', b'Yarl', None), (570145160, b'20171001001512', b'/* wbsetdescription-add:1|en */ Italian singer-songwriter', b'XanonymusX', None))
%%time
# calculate daily edit rate per language for a range of dates
!date

# initialize dataframes:
columns = ['all']  # refers to the number of edits


try:
    del(desceditsperday)
except NameError:
    pass
desceditsperday = pd.DataFrame(index=daterange, columns=columns)
desceditsperday = desceditsperday.fillna(0)

try:
    del(desceditsperdaylang)
except NameError:
    pass
desceditsperdaylang = pd.DataFrame(index=daterange, columns=columns)

for col in desceditsperdaylang:
    desceditsperdaylang[col] = [defaultdict(int) for i in range(len(desceditsperdaylang.index))]


try:
    del(desceditsperaction)
except NameError:
    pass
desceditsperaction = defaultdict(int)



# count edits per day and language:
for rev in alldescedits:
    day = pd.to_datetime(rev[1][:8].decode())
    
    if day in daterange:
        
        desceditsperday.loc[day,'all'] += 1
        
        summaryparse = re.match(r'/\* wbsetdescription-([^\:]*)\:1\|([^\ ]*) ', rev[2].decode())
        
        lang = summaryparse.group(2) # two- or three-letter language code
        desceditsperdaylang.loc[day,'all'][lang] +=1

        
        act = summaryparse.group(1) # add, set or remove
        desceditsperaction[act] += 1
        
Tue Jan 16 15:34:50 UTC 2018
CPU times: user 4min 48s, sys: 232 ms, total: 4min 48s
Wall time: 4min 49s
# check result 
desceditsperday
all
2017-10-01 3778
2017-10-02 3696
2017-10-03 4444
2017-10-04 3419
2017-10-05 3205
2017-10-06 3492
2017-10-07 3515
2017-10-08 4469
2017-10-09 4272
2017-10-10 3764
2017-10-11 3550
2017-10-12 3560
2017-10-13 3802
2017-10-14 4706
2017-10-15 4086
2017-10-16 3377
2017-10-17 3487
2017-10-18 4088
2017-10-19 3744
2017-10-20 3701
2017-10-21 3732
2017-10-22 4154
2017-10-23 4080
2017-10-24 3908
2017-10-25 3742
2017-10-26 4893
2017-10-27 4131
2017-10-28 4412
2017-10-29 4578
2017-10-30 3881
... ...
2017-12-02 20170
2017-12-03 20653
2017-12-04 20174
2017-12-05 8234
2017-12-06 3525
2017-12-07 3583
2017-12-08 3357
2017-12-09 4173
2017-12-10 8143
2017-12-11 21146
2017-12-12 20611
2017-12-13 20563
2017-12-14 8974
2017-12-15 3694
2017-12-16 4351
2017-12-17 4250
2017-12-18 4173
2017-12-19 4185
2017-12-20 4000
2017-12-21 4181
2017-12-22 3283
2017-12-23 4358
2017-12-24 3294
2017-12-25 3637
2017-12-26 4055
2017-12-27 4213
2017-12-28 4762
2017-12-29 5008
2017-12-30 3978
2017-12-31 4808

92 rows × 1 columns

# check result
desceditsperaction
defaultdict(int, {'add': 465911, 'remove': 11613, 'set': 88564})
# calculate overall rate and app percentage per language during timespan

# sum over all languages that occurred
langs = set()
allperlang = collections.Counter()
appperlang = collections.Counter()

for i in desceditsperdaylang.loc[daterange].index:
    langs = langs | set(desceditsperdaylang['all'][i].keys())
    allperlang = allperlang + collections.Counter(desceditsperdaylang['all'][i])
    appperlang = appperlang + collections.Counter(appeditsperdaylang['all'][i])
    
# combine into one dataframe
try:
    del(descperlang)
except NameError:
    pass
descperlang = pd.DataFrame(index=langs, columns = ['all', 'app', 'app_rate'])
descperlang[['all', 'app']] = descperlang[['all', 'app']].fillna(0)
descperlang[['app_rate']] = descperlang[['app_rate']].fillna(np.NaN)

for lang in langs:
    descperlang.loc[lang,'all'] = allperlang[lang]
    descperlang.loc[lang,'app'] = appperlang[lang]
    if allperlang[lang] > 0:
        descperlang.loc[lang,'app_rate'] = appperlang[lang] / allperlang[lang]
        

descperlang = descperlang.sort_values('app',ascending=False)
# result
descperlang
all app app_rate
de 77843 5824 0.074817
es 80156 5592 0.069764
it 13440 4351 0.323735
sv 14239 3082 0.216448
ru 19766 2986 0.151067
ar 5920 2881 0.486655
pt 3887 2181 0.561101
fr 79620 2062 0.025898
fa 2481 1996 0.804514
he 3916 1439 0.367467
fi 2520 1206 0.478571
pl 9495 830 0.087414
gl 2569 676 0.263137
ja 5575 662 0.118744
hi 1283 651 0.507405
cs 9972 628 0.062976
id 1149 617 0.536989
vi 727 507 0.697387
nl 7528 431 0.057253
ca 6388 394 0.061678
ko 1645 388 0.235866
ml 415 370 0.891566
az 403 364 0.903226
bn 5471 360 0.065801
zh-hans 447 309 0.691275
hu 10075 224 0.022233
uk 6216 198 0.031853
bg 993 193 0.194361
sr 964 179 0.185685
zh-hant 852 173 0.203052
... ... ... ...
pcd 11 0 0.000000
cbk-zam 2 0 0.000000
koi 1 0 0.000000
mt 3 0 0.000000
kk-cyrl 9 0 0.000000
tt-cyrl 24 0 0.000000
de-formal 1 0 0.000000
got 1 0 0.000000
myv 13 0 0.000000
zh-my 52 0 0.000000
srn 1 0 0.000000
gd 11 0 0.000000
gan-hant 1 0 0.000000
io 7 0 0.000000
aeb-arab 2 0 0.000000
aeb 1 0 0.000000
stq 3 0 0.000000
sei 1 0 0.000000
kaa 1 0 0.000000
ary 15 0 0.000000
bjn 12 0 0.000000
bi 3 0 0.000000
fj 2 0 0.000000
fy 52 0 0.000000
ee 2 0 0.000000
csb 2 0 0.000000
diq 1 0 0.000000
frp 5 0 0.000000
rup 1 0 0.000000
new 3 0 0.000000

297 rows × 3 columns

# result re-sorted by overall edits
descperlang.sort_values('all',ascending=False)
all app app_rate
en 135766 28 0.000206
es 80156 5592 0.069764
fr 79620 2062 0.025898
de 77843 5824 0.074817
ru 19766 2986 0.151067
sv 14239 3082 0.216448
it 13440 4351 0.323735
hu 10075 224 0.022233
cs 9972 628 0.062976
pl 9495 830 0.087414
nl 7528 431 0.057253
ca 6388 394 0.061678
uk 6216 198 0.031853
ar 5920 2881 0.486655
ja 5575 662 0.118744
bn 5471 360 0.065801
el 4223 111 0.026285
he 3916 1439 0.367467
pt 3887 2181 0.561101
tr 3534 137 0.038766
ro 3290 158 0.048024
eo 3179 10 0.003146
hy 3061 22 0.007187
da 2961 46 0.015535
zh 2617 113 0.043179
gl 2569 676 0.263137
nb 2552 96 0.037618
fi 2520 1206 0.478571
fa 2481 1996 0.804514
et 2404 15 0.006240
... ... ... ...
nov 1 0 0.000000
pnt 1 0 0.000000
lg 1 0 0.000000
tw 1 0 0.000000
crh-latn 1 0 0.000000
chy 1 0 0.000000
kea 1 0 0.000000
kv 1 0 0.000000
kg 1 0 0.000000
za 1 0 0.000000
cu 1 0 0.000000
vec 1 0 0.000000
iu 1 0 0.000000
ln 1 0 0.000000
vo 1 0 0.000000
ts 1 0 0.000000
vro 1 0 0.000000
nso 1 0 0.000000
kab 1 0 0.000000
pi 1 0 0.000000
sma 1 0 0.000000
dv 1 0 0.000000
eml 1 0 0.000000
roa-tara 1 0 0.000000
sn 1 0 0.000000
ff 1 0 0.000000
ki 1 0 0.000000
kw 1 0 0.000000
ie 1 0 0.000000
pdc 1 1 1.000000

297 rows × 3 columns

# result sorted by app rate
descperlang.sort_values('app_rate',ascending=False)
all app app_rate
pdc 1 1 1.000000
lrc 1 1 1.000000
dz 1 1 1.000000
ti 1 1 1.000000
als 1 1 1.000000
na 1 1 1.000000
simple 55 53 0.963636
az 403 364 0.903226
ml 415 370 0.891566
xmf 7 6 0.857143
lo 12 10 0.833333
am 22 18 0.818182
fa 2481 1996 0.804514
bo 5 4 0.800000
as 55 44 0.800000
co 25 19 0.760000
km 12 9 0.750000
tpi 4 3 0.750000
cdo 4 3 0.750000
kk 61 45 0.737705
ps 82 58 0.707317
vi 727 507 0.697387
zh-hans 447 309 0.691275
ha 3 2 0.666667
arc 3 2 0.666667
om 3 2 0.666667
pms 3 2 0.666667
ig 3 2 0.666667
bh 51 31 0.607843
sd 10 6 0.600000
... ... ... ...
tn 9 0 0.000000
nov 1 0 0.000000
mhr 8 0 0.000000
ug 2 0 0.000000
arq 8 0 0.000000
yue 272 0 0.000000
mdf 1 0 0.000000
kl 1 0 0.000000
sgs 1 0 0.000000
nan 2 0 0.000000
lad 9 0 0.000000
gom 1 0 0.000000
zu 135 0 0.000000
wa 2 0 0.000000
crh-latn 1 0 0.000000
chy 1 0 0.000000
kea 1 0 0.000000
kv 1 0 0.000000
vro 1 0 0.000000
ks 1 0 0.000000
sr-el 6 0 0.000000
ku-latn 6 0 0.000000
ace 26 0 0.000000
tw 1 0 0.000000
ko-kp 2 0 0.000000
pnt 1 0 0.000000
gv 1 0 0.000000
av 1 0 0.000000
bug 11 0 0.000000
new 3 0 0.000000

297 rows × 3 columns

descperlang['app'].sum()
44548
# only languages with at least 1000 total edits
descperlang.loc[descperlang['all']>=1000]
all app app_rate
de 77843 5824 0.074817
es 80156 5592 0.069764
it 13440 4351 0.323735
sv 14239 3082 0.216448
ru 19766 2986 0.151067
ar 5920 2881 0.486655
pt 3887 2181 0.561101
fr 79620 2062 0.025898
fa 2481 1996 0.804514
he 3916 1439 0.367467
fi 2520 1206 0.478571
pl 9495 830 0.087414
gl 2569 676 0.263137
ja 5575 662 0.118744
hi 1283 651 0.507405
cs 9972 628 0.062976
id 1149 617 0.536989
nl 7528 431 0.057253
ca 6388 394 0.061678
ko 1645 388 0.235866
bn 5471 360 0.065801
hu 10075 224 0.022233
uk 6216 198 0.031853
ro 3290 158 0.048024
tr 3534 137 0.038766
zh 2617 113 0.043179
el 4223 111 0.026285
nb 2552 96 0.037618
da 2961 46 0.015535
en 135766 28 0.000206
hy 3061 22 0.007187
ka 1970 19 0.009645
et 2404 15 0.006240
eo 3179 10 0.003146
be 1614 5 0.003098
lb 2391 0 0.000000
en-gb 1258 0 0.000000
# English was the only languages (or language variants) where the feature was not yet in production
# cf. https://meta.wikimedia.org/wiki/User_talk:Elitre_(WMF)/Wikidata_editing_announcement
prodlangs = set(descperlang.index.values).difference(['en'])
# Strictly speaking, we should also remove some obsure language variants that don't correspond 
# to an existing Wikipedia, e.g. 'aeb-arab'. 
# But we'll assume that the numbers of non-app description edits are small enough there to ignore 
# them in the calculation
# percentage of app edits among all manual description edits 
# for all language where the feature was in production during the entire timespan:
descperlang.loc[prodlangs]['app'].sum() / descperlang.loc[prodlangs]['all'].sum()
0.1034574109620238

Calculate daily rate of app description edits (until July 2017)

%%time
!date

#get all app edits

conn2 = pymysql.connect(
    host=os.environ['MYSQL_HOST'],
    user=os.environ['MYSQL_USERNAME'],
    password=os.environ['MYSQL_PASSWORD'],
    database='wikidatawiki_p',
    charset='utf8'
)

with conn2.cursor() as cur:
    cur.execute("""
    SELECT rev_id, rev_timestamp, rev_comment, rev_user_text FROM wikidatawiki_p.revision AS rev
    JOIN wikidatawiki_p.change_tag AS ct
    ON rev.rev_id = ct.ct_rev_id AND ct.ct_tag = "mobile app edit"
    ORDER BY rev_timestamp;""")
    appedits = cur.fetchall()
Tue Jul 18 23:47:03 UTC 2017
CPU times: user 668 ms, sys: 60 ms, total: 728 ms
Wall time: 25.2 s
#check result
print(len(appedits))
print(appedits[0])
print(appedits[-1])
print(appedits[1200:1209])
33448
(181261717, b'20141215123456', b'/* wbsetdescription-add:1|en */ A town in Minnesota, USA', b'208.54.5.139')
(523392028, b'20170718234640', b'/* wbsetdescription-add:1|es */ localidad del departamento de Tacuaremb\xc3\xb3, Uruguay', b'Pepe piton')
((420163512, b'20161225053744', b'/* wbsetdescription-add:1|en */ algebraic field extension where the automorphism group fixes precisely the base field', b'HaeB'), (420165760, b'20161225054632', b'/* wbsetdescription-set:1|en */ theorem that describes the structure of certain types of field extensions', b'HaeB'), (423257674, b'20170101202935', b'/* wbsetdescription-set:1|en */ Thissssss is a sandbox for testing changes to items. Please be gentle with it. Feel free to change anything on this page! For testing links, try adding ones to userpages.', b'Alva1234567890'), (423259471, b'20170101203544', b'/* wbsetdescription-set:1|en */ This is a sandbox for testing changes to items. Please be gentle with it. Feel free to change anything on this page! For testing links, try adding ones to userpages.', b'Alva1234567890'), (423889596, b'20170103205551', b'/* wbsetdescription-set:1|en */ species of fungus', b'74.111.100.186'), (423890267, b'20170103205700', b'/* wbsetdescription-set:1|en */ species of edible fungus', b'DBrant (WMF)'), (423891075, b'20170103205822', b'/* wbsetdescription-set:1|en */ species of edible mushrooms', b'DBrant (WMF)'), (423908849, b'20170103213655', b'/* wbsetdescription-set:1|en */ genus of extinct mammals', b'DBrant (WMF)'), (423926260, b'20170103221609', b'/* wbsetdescription-set:1|en */ extinct genus of mammals', b'68.40.255.173'))
daterange =  pd.date_range(start='2017-04-24', end='2017-07-17', freq='D')
%%time
# calculate daily edit rate per language for a range of dates
!date

# initialize dataframes to count reverts per day:
columns = ['all']  # refers to the number of edits


try:
    del(appeditsperday)
except NameError:
    pass
appeditsperday = pd.DataFrame(index=daterange, columns=columns)
appeditsperday = appeditsperday.fillna(0)

try:
    del(appeditsperdaylang)
except NameError:
    pass
appeditsperdaylang = pd.DataFrame(index=daterange, columns=columns)

for col in appeditsperdaylang:
    appeditsperdaylang[col] = [defaultdict(int) for i in range(len(appeditsperdaylang.index))]

try:
    del(appeditsperaction)
except NameError:
    pass
appeditsperaction = defaultdict(int)


# count edits per day and language:
for rev in appedits:
    day = pd.to_datetime(rev[1][:8].decode())
    
    if day in daterange:
        
        appeditsperday.loc[day,'all'] += 1
        
        # lang = rev[2].decode()[26:28] # this is a bit hacky and won't work with (e.g.) three-letter lang codes
        summaryparse = re.match(r'/\* wbsetdescription-([^\:]*)\:1\|([^\ ]*) ', rev[2].decode())
        
        lang = summaryparse.group(2) # two- or three-letter language code
        appeditsperdaylang.loc[day,'all'][lang] +=1
        
        act = summaryparse.group(1) # add, set or remove
        appeditsperaction[act] += 1
        
        
 
Thu Jul 20 02:32:58 UTC 2017
CPU times: user 29.2 s, sys: 276 ms, total: 29.5 s
Wall time: 29.8 s
#check result (after fix for removal actions)
appeditsperdaylang
all
2017-04-24 {'ru': 25, 'zh': 1, 'ca': 8, 'pl': 2, 'lv': 1,...
2017-04-25 {'bn': 2, 'ru': 15, 'pl': 10, 'am': 1, 'hu': 1...
2017-04-26 {'eu': 1, 'cs': 11, 'ko': 26, 'pl': 9, 'bn': 1...
2017-04-27 {'eu': 10, 'ca': 3, 'arz': 1, 'ru': 27, 'pl': ...
2017-04-28 {'eu': 1, 'ca': 4, 'ko': 29, 'pl': 7, 'gl': 1,...
2017-04-29 {'eu': 14, 'cs': 36, 'ru': 46, 'pl': 8, 'hu': ...
2017-04-30 {'eu': 18, 'cs': 101, 'ko': 4, 'pl': 17, 'da':...
2017-05-01 {'cs': 5, 'ca': 2, 'pl': 6, 'tl': 1, 'te': 1, ...
2017-05-02 {'eu': 2, 'cs': 7, 'ru': 43, 'pl': 28, 'gl': 3...
2017-05-03 {'cs': 4, 'ko': 6, 'pl': 8, 'da': 2, 'be': 1, ...
2017-05-04 {'eu': 1, 'cs': 49, 'ro': 1, 'ru': 19, 'pl': 1...
2017-05-05 {'cs': 12, 'ru': 15, 'pl': 38, 'id': 21, 'be':...
2017-05-06 {'diq': 1, 'bn': 3, 'pl': 28, 'hu': 4, 'az': 1...
2017-05-07 {'eu': 53, 'cs': 12, 'ru': 38, 'pl': 104, 'cdo...
2017-05-08 {'eu': 11, 'cs': 51, 'ro': 3, 'pl': 10, 'nb': ...
2017-05-09 {'eu': 1, 'cs': 12, 'ru': 34, 'pl': 10, 'hu': ...
2017-05-10 {'bn': 4, 'ro': 2, 'sn': 2, 'gl': 1, 'ko': 7, ...
2017-05-11 {'cs': 65, 'gu': 1, 'ko': 8, 'pl': 1, 'lv': 1,...
2017-05-12 {'cs': 5, 'ca': 6, 'pl': 8, 'kl': 1, 'hu': 1, ...
2017-05-13 {'jv': 2, 'cs': 69, 'ru': 8, 'pl': 13, 'en': 3...
2017-05-14 {'eu': 2, 'bn': 4, 'cs': 166, 'pl': 8, 'hu': 1...
2017-05-15 {'cs': 51, 'ro': 1, 'pl': 24, 'cdo': 19, 'az':...
2017-05-16 {'cs': 5, 'ko': 19, 'pl': 15, 'hu': 1, 'az': 1...
2017-05-17 {'eu': 1, 'cs': 9, 'ko': 14, 'pl': 5, 'se': 2,...
2017-05-18 {'cs': 145, 'bar': 1, 'io': 1, 'ro': 5, 'pl': ...
2017-05-19 {'bn': 4, 'bar': 1, 'ro': 4, 'pl': 6, 'cdo': 3...
2017-05-20 {'el': 1, 'cs': 12, 'io': 1, 'ru': 18, 'pl': 5...
2017-05-21 {'cs': 9, 'ko': 1, 'pl': 6, 'lv': 1, 'gl': 1, ...
2017-05-22 {'cs': 7, 'ru': 57, 'pl': 9, 'id': 8, 'te': 1,...
2017-05-23 {'bn': 1, 'cs': 11, 'pl': 3, 'kn': 1, 'sr': 2,...
... ...
2017-06-18 {'ru': 18, 'ro': 2, 'pl': 8, 'ky': 3, 'am': 1,...
2017-06-19 {'cs': 11, 'ko': 7, 'pl': 4, 'ar': 20, 'cy': 1...
2017-06-20 {'jv': 1, 'bn': 1, 'ro': 3, 'pl': 6, 'ru': 17,...
2017-06-21 {'cs': 8, 'ml': 3, 'zu': 1, 'ko': 3, 'kn': 2, ...
2017-06-22 {'cs': 17, 'ml': 1, 'pl': 10, 'lv': 1, 'hu': 5...
2017-06-23 {'cs': 5, 'ru': 23, 'pl': 3, 'sr': 5, 'fi': 5,...
2017-06-24 {'eu': 1, 'bn': 22, 'gu': 1, 'ru': 49, 'pl': 3...
2017-06-25 {'bn': 13, 'gu': 1, 'ko': 4, 'pl': 8, 'lv': 11...
2017-06-26 {'cs': 8, 'ca': 3, 'pl': 13, 'lv': 1, 'hu': 5,...
2017-06-27 {'cs': 8, 'gu': 1, 'ru': 62, 'pl': 9, 'hu': 1,...
2017-06-28 {'ca': 2, 'gu': 1, 'ta': 1, 'ml': 2, 'pl': 5, ...
2017-06-29 {'bn': 2, 'ko': 2, 'pl': 4, 'am': 1, 'te': 1, ...
2017-06-30 {'bn': 7, 'ro': 5, 'pl': 21, 'te': 1, 'hu': 1,...
2017-07-01 {'cs': 9, 'ml': 2, 'pl': 7, 'da': 2, 'hu': 2, ...
2017-07-02 {'bn': 3, 'ml': 5, 'pl': 11, 'kn': 1, 'ru': 31...
2017-07-03 {'cs': 12, 'gu': 1, 'ml': 5, 'pl': 3, 'tl': 1,...
2017-07-04 {'cs': 2, 'gu': 1, 'ko': 3, 'pl': 37, 'da': 1,...
2017-07-05 {'cs': 11, 'pt': 6, 'ko': 1, 'pl': 1, 'de': 9,...
2017-07-06 {'eu': 1, 'cs': 18, 'zh-hans': 5, 'ml': 1, 'pl...
2017-07-07 {'bn': 1, 'da': 1, 'tl': 3, 'zh-hans': 4, 'ko'...
2017-07-08 {'el': 2, 'cs': 5, 'tl': 1, 'zh-hans': 11, 'ca...
2017-07-09 {'bn': 4, 'bar': 1, 'zh-hans': 3, 'ace': 1, 'i...
2017-07-10 {'ca': 1, 'zh-hans': 3, 'ml': 3, 'pl': 9, 'de'...
2017-07-11 {'cs': 37, 'ta': 1, 'zh-hans': 4, 'ko': 5, 'pl...
2017-07-12 {'gan': 1, 'bn': 17, 'gu': 1, 'tl': 1, 'zh-han...
2017-07-13 {'bn': 1, 'zh-hans': 4, 'am': 1, 'te': 2, 'sr'...
2017-07-14 {'jv': 1, 'cs': 30, 'zh-hans': 9, 'ru': 41, 'p...
2017-07-15 {'el': 1, 'cs': 8, 'pt': 18, 'zh-hans': 2, 'ro...
2017-07-16 {'cs': 60, 'gu': 1, 'zh-hans': 5, 'ko': 8, 'pl...
2017-07-17 {'bn': 10, 'gu': 3, 'zh-hans': 5, 'cs': 45, 'p...

85 rows × 1 columns

appeditsperaction
defaultdict(int, {'add': 19072, 'remove': 68, 'set': 5143})
#check result
appeditsperdaylang
all
2017-04-24 {'ru': 25, 'zh': 1, 'ca': 8, 'pl': 2, 'lv': 1,...
2017-04-25 {'bn': 2, 'ru': 15, 'pl': 10, 'am': 1, 'hu': 1...
2017-04-26 {'eu': 1, 'cs': 11, 'ko': 26, 'pl': 8, 'hu': 1...
2017-04-27 {'eu': 10, 'ca': 3, 'pm': 1, 'ru': 27, 'pl': 1...
2017-04-28 {'eu': 1, 'ca': 4, 'ko': 29, 'pl': 7, 'gl': 1,...
2017-04-29 {'eu': 14, 'cs': 36, 'le': 1, 'ru': 46, 'pl': ...
2017-04-30 {'eu': 18, 'cs': 101, 'ko': 4, 'pl': 17, 'da':...
2017-05-01 {'ca': 2, 'ko': 25, 'pl': 6, 'tl': 1, 'te': 1,...
2017-05-02 {'eu': 2, 'cs': 7, 'ru': 43, 'pl': 28, 'gl': 3...
2017-05-03 {'cs': 4, 'ko': 6, 'pl': 8, 'da': 2, 'be': 1, ...
2017-05-04 {'eu': 1, 'cs': 49, 'ro': 1, 'ru': 19, 'pl': 1...
2017-05-05 {'cs': 12, 'ru': 15, 'pl': 38, 'id': 21, 'be':...
2017-05-06 {'cs': 4, 'bn': 3, 'pl': 28, 'hu': 4, 'az': 1,...
2017-05-07 {'eu': 53, 'cs': 12, 'ru': 38, 'pl': 104, 'hu'...
2017-05-08 {'eu': 11, 'cs': 51, 'ro': 3, 'pl': 10, 'nb': ...
2017-05-09 {'eu': 1, 'cs': 12, 'ru': 34, 'pl': 10, 'hu': ...
2017-05-10 {'bn': 4, 'ro': 2, 'sn': 2, 'gl': 1, 'ko': 7, ...
2017-05-11 {'cs': 65, 'gu': 1, 'ha': 4, 'ko': 8, 'pl': 1,...
2017-05-12 {'cs': 5, 'ru': 38, 'pl': 8, 'kl': 1, 'hu': 1,...
2017-05-13 {'jv': 2, 'cs': 69, 'ru': 8, 'pl': 13, 'gl': 1...
2017-05-14 {'eu': 2, 'bn': 4, 'cs': 166, 'pl': 8, 'hu': 1...
2017-05-15 {'cs': 51, 'ro': 1, 'pl': 24, 'hu': 1, 'az': 1...
2017-05-16 {'cs': 5, 'ko': 19, 'pl': 15, 'hu': 1, 'az': 1...
2017-05-17 {'eu': 1, 'cs': 9, 'ko': 14, 'pl': 5, 'se': 2,...
2017-05-18 {'cs': 145, 'io': 1, 'ro': 5, 'pl': 1, 'de': 1...
2017-05-19 {'bn': 4, 'ro': 4, 'pl': 6, 'da': 2, 'ba': 1, ...
2017-05-20 {'el': 1, 'cs': 12, 'io': 1, 'ru': 18, 'pl': 5...
2017-05-21 {'cs': 9, 'ko': 1, 'pl': 6, 'lv': 1, 'gl': 1, ...
2017-05-22 {'cs': 7, 'ru': 57, 'pl': 9, 'id': 8, 'te': 1,...
2017-05-23 {'bn': 1, 'cs': 11, 'pl': 3, 'kn': 1, 'si': 2,...
... ...
2017-06-18 {'ru': 17, 'ro': 2, 'pl': 8, 'ky': 3, 'am': 1,...
2017-06-19 {'cs': 11, 'ko': 7, 'pl': 4, 'ar': 20, 'cy': 1...
2017-06-20 {'jv': 1, 'bn': 1, 'ro': 3, 'pl': 6, 'ru': 17,...
2017-06-21 {'cs': 8, 'ml': 3, 'pl': 7, 'da': 1, 'tl': 2, ...
2017-06-22 {'cs': 17, 'ml': 1, 'pl': 10, 'lv': 1, 'hu': 5...
2017-06-23 {'cs': 5, 'ru': 23, 'pl': 3, 'sr': 5, 'fi': 5,...
2017-06-24 {'eu': 1, 'bn': 22, 'gu': 1, 'ru': 49, 'pl': 3...
2017-06-25 {'bn': 13, 'gu': 1, 'ko': 4, 'pl': 8, 'lv': 11...
2017-06-26 {'cs': 8, 'ca': 3, 'pl': 13, 'lv': 1, 'hu': 5,...
2017-06-27 {'cs': 8, 'gu': 1, 'ru': 62, 'pl': 9, 'kn': 3,...
2017-06-28 {'ca': 2, 'gu': 1, 'ta': 1, 'ml': 2, 'pl': 5, ...
2017-06-29 {'bn': 2, 'ko': 2, 'pl': 4, 'am': 1, 'te': 1, ...
2017-06-30 {'bn': 7, 'ro': 5, 'pl': 21, 'te': 1, 'hu': 1,...
2017-07-01 {'cs': 9, 'ml': 2, 'pl': 7, 'da': 2, 'hu': 2, ...
2017-07-02 {'bn': 3, 'ml': 5, 'pl': 11, 'kn': 1, 'ru': 30...
2017-07-03 {'cs': 12, 'gu': 1, 'ml': 5, 'pl': 3, 'tl': 1,...
2017-07-04 {'cs': 2, 'gu': 1, 'dt': 1, 'ko': 3, 'pl': 36,...
2017-07-05 {'cs': 11, 'pt': 6, 'ko': 1, 'pl': 1, 'de': 9,...
2017-07-06 {'eu': 1, 'cs': 18, 'ml': 1, 'pl': 5, 'de': 71...
2017-07-07 {'bn': 1, 'da': 1, 'tl': 3, 'ko': 1, 'pl': 7, ...
2017-07-08 {'pl': 3, 'cs': 5, 'tl': 1, 'ca': 2, 'fa': 23,...
2017-07-09 {'bn': 4, 'ml': 11, 'id': 6, 'kn': 2, 'ba': 1,...
2017-07-10 {'ca': 1, 'ml': 3, 'pl': 9, 'de': 99, 'ja': 11...
2017-07-11 {'cs': 37, 'ta': 1, 'ko': 5, 'pl': 6, 'de': 92...
2017-07-12 {'bn': 17, 'gu': 1, 'tl': 1, 'ru': 75, 'pl': 4...
2017-07-13 {'bn': 1, 'am': 1, 'te': 2, 'sr': 1, 'tk': 1, ...
2017-07-14 {'jv': 1, 'cs': 30, 'ru': 41, 'pl': 4, 'de': 8...
2017-07-15 {'cs': 8, 'ro': 5, 'pl': 8, 'de': 75, 'mr': 2,...
2017-07-16 {'cs': 60, 'gu': 1, 'ko': 8, 'pl': 15, 'lv': 3...
2017-07-17 {'bn': 10, 'gu': 3, 'cs': 45, 'pl': 5, 'lv': 2...

85 rows × 1 columns

# save result just in case
appeditsperdaylang.to_pickle('alldescrevperday_2017-04-24..2017-07-17.pickle')
%%time
# For comparison, calculate rate for all "manual" description edits (including non-app edits, excluding bots and OAuth tools)

!date 

# get all (non-bot, non-OAuth) description edits for a period of time
    
# excludes:
# edits that change description together with other fields 
#     (e.g. https://www.wikidata.org/w/index.php?diff=468744965:  "wbsetlabeldescriptionaliases")
# edits from "autoEdit Update Descriptions" 
#     (e.g. https://www.wikidata.org/w/index.php?diff=468745165 : "wbeditentity-update")
# bot edits like https://www.wikidata.org/w/index.php?diff=468757331by by Emijrpbot 
#     or https://www.wikidata.org/w/index.php?diff=468758634 by Mr.Ibrahembot 
#     (these use "wbeditentity-update" instead of "wbsetdescription")
# "#quickstatements" edits (e.g. https://www.wikidata.org/w/index.php?diff=468756428 , 
#      or https://www.wikidata.org/w/index.php?diff=468758591 by Poulpybot)
#      looks like these may have a tag anyway ((Tag: Widar [1.4]" or "Tag: QuickStatements [1.1]")
#      but check edit summary too)
# edits made using an OAuth tool such as:
#    reCH (e.g. https://www.wikidata.org/w/index.php?limit=50&title=Special%3AContributions&contribs=user&target=%D4%B1%D5%B7%D5%A2%D5%B8%D5%BF&namespace=&tagfilter=&year=2017&month=2 
#    distributed Wikidata game and other Widar edits 
#       (e.g https://www.wikidata.org/w/index.php?diff=457815520 :
#       "The Distributed Game (23): Kaspar's Persondata game: Descriptions #distributed-game")
# edits from users whose names end in -bot or -Bot

# should exclude: 
# bot edits in general (use bot flag from RC table?)


conn3 = pymysql.connect(
    host=os.environ['MYSQL_HOST'],
    user=os.environ['MYSQL_USERNAME'],
    password=os.environ['MYSQL_PASSWORD'],
    database='wikidatawiki_p',
    charset='utf8')

with conn3.cursor() as cur:
    cur.execute("""
    SELECT rev_id, rev_timestamp, rev_comment, rev_user_text, ct_tag
    FROM (
        SELECT rev_id, rev_timestamp, rev_comment, rev_user_text
        FROM wikidatawiki_p.revision 
        WHERE rev_timestamp >= 20170424000000
        AND rev_comment LIKE '/* wbsetdescription%'
        AND rev_comment NOT LIKE '%#quickstatements') AS rev
    LEFT JOIN (
        SELECT ct_rev_id, ct_tag FROM wikidatawiki_p.change_tag
        WHERE ct_tag LIKE 'OAuth%'
        OR ct_tag LIKE 'QuickStatements%') AS ct
    ON rev.rev_id = ct.ct_rev_id  
    GROUP BY rev_id
    HAVING ct_tag IS NULL
    AND rev_user_text NOT RLIKE '[Bb]ot$'
    ORDER BY rev_timestamp;""")
    alldescedits = cur.fetchall()

    # NB: The change_tag table can contain several rows for the same edit 
    # NB: crude bot check - exclude user names ending in -bot or Bot
Wed Jul 19 00:28:58 UTC 2017
CPU times: user 14.9 s, sys: 496 ms, total: 15.4 s
Wall time: 13min 10s
#check result
print(len(alldescedits))
print(alldescedits[0])
print(alldescedits[-1])
print(alldescedits[0:10])
1107130
(477756197, b'20170424000018', b"/* wbsetdescription-set:1|fr */ page d'homonymie de Wikimedia", b'Bob08', None)
(523405421, b'20170719002844', b'/* wbsetdescription-add:1|en */ album by Sonoko Kawai', b'Andreasmperu', None)
((477756197, b'20170424000018', b"/* wbsetdescription-set:1|fr */ page d'homonymie de Wikimedia", b'Bob08', None), (477756398, b'20170424000209', b"/* wbsetdescription-set:1|fr */ page d'homonymie de Wikimedia", b'Bob08', None), (477756634, b'20170424000412', b'/* wbsetdescription-set:1|en */ blogger', b'Trivialist', None), (477756686, b'20170424000436', b'/* wbsetdescription-add:1|de */ Art der Gattung Tometes', b'Tommy Kronkvist', None), (477756687, b'20170424000437', b'/* wbsetdescription-add:1|nl */ soort uit het geslacht Tometes', b'Tommy Kronkvist', None), (477756690, b'20170424000437', b'/* wbsetdescription-add:1|sv */ fiskart', b'Tommy Kronkvist', None), (477756698, b'20170424000441', b'/* wbsetdescription-add:1|nn */ canadisk skeisel\xc3\xb8par og sykkelryttar', b'Migrant', None), (477756865, b'20170424000532', b'/* wbsetdescription-set:1|en */ Austrian-German historian', b'K1812', None), (477757063, b'20170424000656', b'/* wbsetdescription-add:1|es */ identificador de un miembro de la Royal Flemish Academy of Belgium for Science and the Arts', b'Strakhov', None), (477757082, b'20170424000703', b'/* wbsetdescription-add:1|pl */ wie\xc5\x9b w obwodzie Irkuckim, Rosja', b'Tommy Jantarek', None))
%%time
# calculate daily edit rate per language for a range of dates
!date

# initialize dataframes to count reverts per day:
columns = ['all']  # refers to the number of edits


try:
    del(desceditsperday)
except NameError:
    pass
desceditsperday = pd.DataFrame(index=daterange, columns=columns)
desceditsperday = desceditsperday.fillna(0)

try:
    del(desceditsperdaylang)
except NameError:
    pass
desceditsperdaylang = pd.DataFrame(index=daterange, columns=columns)

for col in desceditsperdaylang:
    desceditsperdaylang[col] = [defaultdict(int) for i in range(len(desceditsperdaylang.index))]


try:
    del(desceditsperaction)
except NameError:
    pass
desceditsperaction = defaultdict(int)



# count edits per day and language:
for rev in alldescedits:
    day = pd.to_datetime(rev[1][:8].decode())
    
    if day in daterange:
        
        desceditsperday.loc[day,'all'] += 1
        
        # lang = rev[2].decode()[26:28] # this is a bit hacky and won't work with (e.g.) three-letter lang codes
        summaryparse = re.match(r'/\* wbsetdescription-([^\:]*)\:1\|([^\ ]*) ', rev[2].decode())
        
        lang = summaryparse.group(2) # two- or three-letter language code
        desceditsperdaylang.loc[day,'all'][lang] +=1

        
        act = summaryparse.group(1) # add, set or remove
        desceditsperaction[act] += 1
        
Thu Jul 20 02:42:54 UTC 2017
CPU times: user 20min 48s, sys: 6.75 s, total: 20min 55s
Wall time: 21min 2s
# check result 
desceditsperday
all
2017-04-24 2929
2017-04-25 3442
2017-04-26 3232
2017-04-27 3268
2017-04-28 2898
2017-04-29 3218
2017-04-30 3160
2017-05-01 4137
2017-05-02 3223
2017-05-03 3717
2017-05-04 3490
2017-05-05 3175
2017-05-06 3395
2017-05-07 3674
2017-05-08 4459
2017-05-09 4498
2017-05-10 5021
2017-05-11 3301
2017-05-12 3066
2017-05-13 3022
2017-05-14 3491
2017-05-15 3251
2017-05-16 3788
2017-05-17 4237
2017-05-18 2869
2017-05-19 3226
2017-05-20 3356
2017-05-21 3212
2017-05-22 2885
2017-05-23 2580
... ...
2017-06-18 3380
2017-06-19 3053
2017-06-20 3289
2017-06-21 3066
2017-06-22 30419
2017-06-23 36399
2017-06-24 17043
2017-06-25 3306
2017-06-26 3532
2017-06-27 3333
2017-06-28 3373
2017-06-29 3237
2017-06-30 3942
2017-07-01 4789
2017-07-02 4811
2017-07-03 3442
2017-07-04 3177
2017-07-05 3420
2017-07-06 3818
2017-07-07 3907
2017-07-08 3974
2017-07-09 4072
2017-07-10 3681
2017-07-11 3587
2017-07-12 3221
2017-07-13 3423
2017-07-14 3695
2017-07-15 4931
2017-07-16 3992
2017-07-17 3680

85 rows × 1 columns

# check result 
desceditsperdaylang
all
2017-04-24 {'bn': 1, 'he': 89, 'kn': 7, 'tt': 1, 'ka': 8,...
2017-04-25 {'bn': 5, 'it': 83, 'ka': 1, 'nn': 8, 'min': 1...
2017-04-26 {'bn': 9, 'bar': 2, 'zu': 2, 'arn': 1, 'et': 6...
2017-04-27 {'bn': 4, 'he': 105, 'ka': 7, 'la': 1, 'min': ...
2017-04-28 {'bn': 3, 'bar': 1, 'he': 49, 'am': 1, 'te': 1...
2017-04-29 {'bn': 4, 'he': 54, 'frr': 2, 'et': 4, 'la': 3...
2017-04-30 {'bn': 3, 'ilo': 1, 'it': 100, 'uz': 1, 'ka': ...
2017-05-01 {'bn': 11, 'bar': 5, 'ace': 2, 'ka': 14, 'la':...
2017-05-02 {'bn': 14, 'ilo': 5, 'yi': 3, 'ml': 1, 'id': 1...
2017-05-03 {'bn': 33, 'zh-hans': 3, 'ml': 2, 'mr': 1, 'te...
2017-05-04 {'bn': 7, 'bar': 3, 'ace': 4, 'kk-cyrl': 1, 'k...
2017-05-05 {'bn': 2, 'el': 38, 'uz': 1, 'et': 2, 'bh': 3,...
2017-05-06 {'bn': 48, 'bar': 4, 'uz': 1, 'tt': 1, 'et': 3...
2017-05-07 {'bn': 7, 'lt': 1, 'is': 3, 'arn': 1, 'da': 26...
2017-05-08 {'bn': 10, 'it': 95, 'tt': 1, 'ka': 45, 'la': ...
2017-05-09 {'bn': 6, 'ace': 1, 'kn': 2, 'ka': 41, 'nn': 1...
2017-05-10 {'bn': 11, 'bar': 1, 'zh-hans': 1, 'lt': 2, 'i...
2017-05-11 {'bn': 21, 'bar': 5, 'is': 1, 'arn': 1, 'vec':...
2017-05-12 {'eu': 1, 'ka': 3, 'zh-hans': 1, 'bn': 22, 'is...
2017-05-13 {'bn': 44, 'bar': 2, 'lt': 2, 'te': 2, 'hsb': ...
2017-05-14 {'bn': 16, 'bar': 3, 'tt': 1, 'ka': 18, 'la': ...
2017-05-15 {'bn': 5, 'bar': 1, 'it': 55, 'ka': 17, 'la': ...
2017-05-16 {'bn': 2, 'si': 1, 'ka': 2, 'la': 10, 'zh-sg':...
2017-05-17 {'bn': 1, 'bar': 2, 'kn': 3, 'si': 1, 'ka': 3,...
2017-05-18 {'bn': 7, 'bar': 1, 'ky': 2, 'lt': 4, 'ar': 19...
2017-05-19 {'bn': 10, 'bar': 1, 'ace': 1, 'kn': 1, 'it': ...
2017-05-20 {'gan': 1, 'bn': 3, 'bar': 1, 'he': 19, 'aeb-a...
2017-05-21 {'bn': 15, 'kk-cyrl': 1, 'kn': 1, 'ka': 25, 'n...
2017-05-22 {'bn': 1, 'bar': 1, 'yi': 3, 'ml': 3, 'is': 1,...
2017-05-23 {'bn': 5, 'yi': 1, 'ml': 1, 'kn': 1, 'da': 26,...
... ...
2017-06-18 {'bn': 6, 'ace': 2, 'is': 4, 'uz': 1, 'ka': 3,...
2017-06-19 {'bn': 8, 'ace': 1, 'it': 65, 'ka': 3, 'nn': 5...
2017-06-20 {'bn': 18, 'bar': 1, 'frr': 1, 'si': 1, 'ka': ...
2017-06-21 {'bn': 5, 'bar': 1, 'ilo': 2, 'zu': 2, 'kn': 2...
2017-06-22 {'bn': 15, 'lt': 1, 'sl': 2, 'id': 6, 'hsb': 1...
2017-06-23 {'bn': 23, 'uz': 1, 'ka': 3, 'nn': 14, 'min': ...
2017-06-24 {'bn': 31, 'ace': 1, 'kn': 3, 'uz': 1, 'et': 2...
2017-06-25 {'bn': 14, 'bar': 1, 'kn': 3, 'it': 98, 'uz': ...
2017-06-26 {'bn': 11, 'bar': 4, 'kn': 1, 'ka': 2, 'nn': 8...
2017-06-27 {'bn': 3, 'kn': 3, 'uz': 1, 'et': 4, 'nn': 10,...
2017-06-28 {'ru': 311, 'bar': 1, 'yi': 1, 'ml': 2, 'mr': ...
2017-06-29 {'bn': 5, 'ace': 1, 'kk-cyrl': 5, 'kn': 65, 'u...
2017-06-30 {'bn': 9, 'is': 1, 'id': 7, 'yue': 1, 'kn': 73...
2017-07-01 {'bn': 7, 'he': 50, 'kn': 788, 'it': 108, 'sa'...
2017-07-02 {'bn': 21, 'yi': 2, 'ml': 5, 'gl': 6, 'te': 11...
2017-07-03 {'eu': 49, 'tg-cyrl': 1, 'ml': 5, 'gl': 5, 'yu...
2017-07-04 {'bn': 8, 'ace': 1, 'arn': 1, 'it': 118, 'ka':...
2017-07-05 {'bn': 6, 'zh-hans': 2, 'gl': 5, 'yue': 2, 'kn...
2017-07-06 {'bn': 20, 'zh-hans': 6, 'ml': 1, 'is': 1, 'id...
2017-07-07 {'bn': 42, 'zh-hans': 5, 'cs': 67, 'sa': 13, '...
2017-07-08 {'bn': 2, 'bar': 1, 'ilo': 1, 'ace': 1, 'kn': ...
2017-07-09 {'be-tarask': 1, 'bn': 5, 'bar': 1, 'zh-hans':...
2017-07-10 {'ru': 150, 'bar': 25, 'arz': 1, 'zh-hans': 3,...
2017-07-11 {'bn': 9, 'bar': 2, 'zh-hans': 5, 'lt': 4, 'yi...
2017-07-12 {'gan': 1, 'bn': 17, 'tg-cyrl': 1, 'zh-hans': ...
2017-07-13 {'bn': 3, 'kn': 17, 'et': 1, 'nn': 1, 'om': 1,...
2017-07-14 {'bn': 1, 'kn': 14, 'et': 4, 'la': 3, 'sr-ec':...
2017-07-15 {'bn': 11, 'bar': 1, 'he': 37, 'frr': 1, 'kn':...
2017-07-16 {'bn': 10, 'bar': 2, 'ace': 2, 'uz': 1, 'tt': ...
2017-07-17 {'bn': 11, 'bar': 1, 'zh-hans': 5, 'af': 1, 's...

85 rows × 1 columns

# check result
desceditsperaction
defaultdict(int, {'add': 1027532, 'remove': 12105, 'set': 63432})
# calculate overall rate and app percentage per language during timespan

# sum over all languages that occurred
langs = set()
allperlang = collections.Counter()
appperlang = collections.Counter()

for i in desceditsperdaylang.loc[daterange].index:
    langs = langs | set(desceditsperdaylang['all'][i].keys())
    allperlang = allperlang + collections.Counter(desceditsperdaylang['all'][i])
    appperlang = appperlang + collections.Counter(appeditsperdaylang['all'][i])
    
# combine into one dataframe
try:
    del(descperlang)
except NameError:
    pass
descperlang = pd.DataFrame(index=langs, columns = ['all', 'app', 'app_rate'])
descperlang[['all', 'app']] = descperlang[['all', 'app']].fillna(0)
descperlang[['app_rate']] = descperlang[['app_rate']].fillna(np.NaN)

for lang in langs:
    descperlang.loc[lang,'all'] = allperlang[lang]
    descperlang.loc[lang,'app'] = appperlang[lang]
    if allperlang[lang] > 0:
        descperlang.loc[lang,'app_rate'] = appperlang[lang] / allperlang[lang]
        

descperlang = descperlang.sort_values('app',ascending=False)
# result
descperlang
all app app_rate
ru 22317 2938 0.131649
ar 77842 2075 0.026657
cs 10638 2029 0.190731
he 4741 1859 0.392111
es 17899 1341 0.074920
de 23763 1090 0.045870
fa 1382 1059 0.766281
it 8334 1004 0.120470
sv 5634 980 0.173944
pl 3577 893 0.249651
id 1434 749 0.522315
ko 1425 717 0.503158
fr 23316 717 0.030751
fi 2812 716 0.254623
pt 1711 447 0.261251
lv 947 442 0.466737
vi 643 408 0.634526
hi 3635 358 0.098487
bn 1004 288 0.286853
zh 1529 223 0.145847
sr 865 207 0.239306
hu 5779 207 0.035819
th 471 188 0.399151
nl 6504 181 0.027829
ja 3219 175 0.054365
uk 3342 174 0.052065
ro 730 167 0.228767
en 72803 163 0.002239
ca 4099 159 0.038790
ps 151 137 0.907285
... ... ... ...
sei 1 0 0.000000
gan-hans 1 0 0.000000
sje 27 0 0.000000
smj 163 0 0.000000
sma 2 0 0.000000
kk-latn 2 0 0.000000
olo 2 0 0.000000
tn 1 0 0.000000
be-tarask 266 0 0.000000
en-ca 992 0 0.000000
gan-hant 1 0 0.000000
ltg 2 0 0.000000
en-gb 2045 0 0.000000
lzh 2 0 0.000000
pcd 3 0 0.000000
aeb-arab 4 0 0.000000
ary 5 0 0.000000
atj 1 0 0.000000
sah 2 0 0.000000
pam 41 0 0.000000
jut 1 0 0.000000
tcy 116 0 0.000000
ln 1 0 0.000000
pap 2 0 0.000000
szl 2 0 0.000000
udm 1 0 0.000000
vep 9 0 0.000000
de-at 1022 0 0.000000
zh-cn 187 0 0.000000
kk-kz 1 0 0.000000

246 rows × 3 columns

# result re-sorted
descperlang.sort_values('all',ascending=False)
all app app_rate
ta 741651 102 0.000138
ar 77842 2075 0.026657
en 72803 163 0.002239
de 23763 1090 0.045870
fr 23316 717 0.030751
ru 22317 2938 0.131649
es 17899 1341 0.074920
cs 10638 2029 0.190731
it 8334 1004 0.120470
nl 6504 181 0.027829
hu 5779 207 0.035819
sv 5634 980 0.173944
el 5128 100 0.019501
he 4741 1859 0.392111
ca 4099 159 0.038790
nb 3836 7 0.001825
da 3831 25 0.006526
hi 3635 358 0.098487
pl 3577 893 0.249651
uk 3342 174 0.052065
ja 3219 175 0.054365
fi 2812 716 0.254623
kn 2649 41 0.015478
hy 2495 18 0.007214
eo 2352 46 0.019558
en-gb 2045 0 0.000000
nn 1771 0 0.000000
tl 1759 46 0.026151
pt 1711 447 0.261251
zh 1529 223 0.145847
... ... ... ...
ln 1 0 0.000000
atj 1 0 0.000000
jut 1 0 0.000000
azb 1 1 1.000000
gan-hans 1 0 0.000000
sei 1 0 0.000000
rw 1 1 1.000000
kl 1 1 1.000000
bo 1 1 1.000000
gom 1 1 1.000000
lez 1 1 1.000000
kg 1 0 0.000000
mhr 1 0 0.000000
wo 1 0 0.000000
crh 1 0 0.000000
ty 1 0 0.000000
vls 1 0 0.000000
kab 1 0 0.000000
bm 1 0 0.000000
ak 1 0 0.000000
rue 1 0 0.000000
dv 1 0 0.000000
ee 1 0 0.000000
qug 1 0 0.000000
arq 1 0 0.000000
os 1 0 0.000000
nl-informal 1 0 0.000000
xh 1 0 0.000000
kea 1 0 0.000000
kk-kz 1 0 0.000000

246 rows × 3 columns

# result re-sorted
descperlang.sort_values('app_rate',ascending=False)
all app app_rate
azb 1 1 1.000000
gom 1 1 1.000000
kl 1 1 1.000000
rw 1 1 1.000000
cdo 87 87 1.000000
pnb 2 2 1.000000
als 8 8 1.000000
tk 2 2 1.000000
vec 2 2 1.000000
ve 3 3 1.000000
lez 1 1 1.000000
ab 3 3 1.000000
xmf 13 13 1.000000
bo 1 1 1.000000
am 18 17 0.944444
simple 30 28 0.933333
kk 51 47 0.921569
my 49 45 0.918367
ps 151 137 0.907285
ku 10 9 0.900000
ang 9 8 0.888889
om 8 7 0.875000
az 155 135 0.870968
mr 114 96 0.842105
sd 12 10 0.833333
lo 45 37 0.822222
hak 5 4 0.800000
fa 1382 1059 0.766281
km 15 11 0.733333
ht 11 8 0.727273
... ... ... ...
de-ch 861 0 0.000000
frp 4 0 0.000000
mhr 1 0 0.000000
pt-br 679 0 0.000000
kk-cyrl 10 0 0.000000
gsw 63 0 0.000000
ee 1 0 0.000000
dv 1 0 0.000000
fit 2 0 0.000000
nn 1771 0 0.000000
zh-sg 57 0 0.000000
rue 1 0 0.000000
min 36 0 0.000000
ak 1 0 0.000000
bm 1 0 0.000000
kab 1 0 0.000000
map-bms 5 0 0.000000
vls 1 0 0.000000
ty 1 0 0.000000
dsb 10 0 0.000000
ig 2 0 0.000000
ki 2 0 0.000000
sr-el 6 0 0.000000
crh 1 0 0.000000
wo 1 0 0.000000
nds-nl 8 0 0.000000
ku-latn 9 0 0.000000
arn 26 0 0.000000
wa 23 0 0.000000
kk-kz 1 0 0.000000

246 rows × 3 columns

descperlang['app'].sum()
24283
# only languages with at least 1000 total edits
descperlang.loc[descperlang['all']>=1000]
all app app_rate
ru 22317 2938 0.131649
ar 77842 2075 0.026657
cs 10638 2029 0.190731
he 4741 1859 0.392111
es 17899 1341 0.074920
de 23763 1090 0.045870
fa 1382 1059 0.766281
it 8334 1004 0.120470
sv 5634 980 0.173944
pl 3577 893 0.249651
id 1434 749 0.522315
ko 1425 717 0.503158
fr 23316 717 0.030751
fi 2812 716 0.254623
pt 1711 447 0.261251
hi 3635 358 0.098487
bn 1004 288 0.286853
zh 1529 223 0.145847
hu 5779 207 0.035819
nl 6504 181 0.027829
ja 3219 175 0.054365
uk 3342 174 0.052065
en 72803 163 0.002239
ca 4099 159 0.038790
bg 1438 109 0.075800
ta 741651 102 0.000138
el 5128 100 0.019501
eo 2352 46 0.019558
tl 1759 46 0.026151
kn 2649 41 0.015478
da 3831 25 0.006526
hy 2495 18 0.007214
nb 3836 7 0.001825
sa 1179 1 0.000848
nn 1771 0 0.000000
en-gb 2045 0 0.000000
de-at 1022 0 0.000000
#all languages with description edits in this timeframe:
descperlang.index.values
array(['ru', 'ar', 'cs', 'he', 'es', 'de', 'fa', 'it', 'sv', 'pl', 'id',
       'ko', 'fr', 'fi', 'pt', 'lv', 'vi', 'hi', 'bn', 'zh', 'sr', 'hu',
       'th', 'nl', 'ja', 'uk', 'ro', 'en', 'ca', 'ps', 'az', 'eu', 'bg',
       'ta', 'el', 'mr', 'ms', 'cdo', 'zh-hant', 'ur', 'ml', 'gl',
       'zh-hans', 'hr', 'kk', 'eo', 'tl', 'my', 'kn', 'tr', 'lo', 'sk',
       'simple', 'da', 'te', 'bs', 'ceb', 'sl', 'ne', 'gu', 'hy', 'am',
       'jv', 'no', 'ka', 'la', 'si', 'sq', 'et', 'bh', 'af', 'xmf', 'sh',
       'mn', 'mk', 'km', 'lt', 'pa', 'mai', 'sd', 'ku', 'ang', 'as', 'ht',
       'yi', 'als', 'om', 'nb', 'uz', 'ga', 'ckb', 'bar', 'ky', 'or', 'be',
       'hak', 'cy', 'is', 'scn', 'an', 'ab', 'sw', 'ace', 'arz', 've',
       'sco', 'io', 'ie', 'sn', 'vec', 'tk', 'nap', 'tg', 'so', 'se',
       'pnb', 'ast', 'gan', 'oc', 'rw', 'kl', 'new', 'fo', 'azb', 'sa',
       'mzn', 'yo', 'bo', 'ti', 'srn', 'gom', 'hsb', 'war', 'pms', 'br',
       'lez', 'zu', 'dty', 'diq', 'frp', 'de-ch', 'fy', 'zh-mo', 'ext',
       'bug', 'zh-hk', 'lb', 'csb', 'sr-ec', 'ba', 'nds', 'rmy', 'tt',
       'stq', 'yue', 'vo', 'su', 'zh-tw', 'ce', 'kk-tr', 'qu', 'kg', 'mhr',
       'bho', 'pt-br', 'gsw', 'ilo', 'wa', 'arn', 'ku-latn', 'nds-nl',
       'wo', 'crh', 'sr-el', 'ki', 'ig', 'dsb', 'ty', 'vls', 'map-bms',
       'kab', 'bm', 'ak', 'min', 'rue', 'zh-sg', 'nn', 'fit', 'dv', 'ee',
       'kk-cyrl', 'qug', 'frr', 'ia', 'sc', 'bjn', 'arq', 'os', 'gd', 'ay',
       'tg-cyrl', 'gn', 'mt', 'lad', 'mi', 'nl-informal', 'myv', 'mg',
       'tg-latn', 'zh-my', 'xh', 'lmo', 'kea', 'cv', 'de-formal', 'ksh',
       'sei', 'gan-hans', 'sje', 'smj', 'sma', 'kk-latn', 'olo', 'tn',
       'be-tarask', 'en-ca', 'gan-hant', 'ltg', 'en-gb', 'lzh', 'pcd',
       'aeb-arab', 'ary', 'atj', 'sah', 'pam', 'jut', 'tcy', 'ln', 'pap',
       'szl', 'udm', 'vep', 'de-at', 'zh-cn', 'kk-kz'], dtype=object)
# remove all languages (and language variants) where the feature was not yet in production before July 5, 2017
# cf. https://meta.wikimedia.org/wiki/User_talk:Elitre_(WMF)/Wikidata_editing_announcement
prodlangs = set(descperlang.index.values).difference([
 'en', 
 'de',
 'de-at',
 'de-ch',
 'de-formal',
 'it', 'fr', 'es', 'ja', 'nl', 'pt', 'tr', 
 'zh',
 'zh-cn',
 'zh-hans',
 'zh-hant',
 'zh-hk',
 'zh-mo',
 'zh-my',
 'zh-sg',
 'zh-tw'])
# Strictly speaking, we should also remove some obsure language variants that don't correspond 
# to an existing Wikipedia, e.g. 'aeb-arab'. 
# But we'll assume that the numbers of non-app description edits are small enough there to ignore 
# them in the calculation
# check result:
prodlangs
{'ab',
 'ace',
 'aeb-arab',
 'af',
 'ak',
 'als',
 'am',
 'an',
 'ang',
 'ar',
 'arn',
 'arq',
 'ary',
 'arz',
 'as',
 'ast',
 'atj',
 'ay',
 'az',
 'azb',
 'ba',
 'bar',
 'be',
 'be-tarask',
 'bg',
 'bh',
 'bho',
 'bjn',
 'bm',
 'bn',
 'bo',
 'br',
 'bs',
 'bug',
 'ca',
 'cdo',
 'ce',
 'ceb',
 'ckb',
 'crh',
 'cs',
 'csb',
 'cv',
 'cy',
 'da',
 'diq',
 'dsb',
 'dty',
 'dv',
 'ee',
 'el',
 'en-ca',
 'en-gb',
 'eo',
 'et',
 'eu',
 'ext',
 'fa',
 'fi',
 'fit',
 'fo',
 'frp',
 'frr',
 'fy',
 'ga',
 'gan',
 'gan-hans',
 'gan-hant',
 'gd',
 'gl',
 'gn',
 'gom',
 'gsw',
 'gu',
 'hak',
 'he',
 'hi',
 'hr',
 'hsb',
 'ht',
 'hu',
 'hy',
 'ia',
 'id',
 'ie',
 'ig',
 'ilo',
 'io',
 'is',
 'jut',
 'jv',
 'ka',
 'kab',
 'kea',
 'kg',
 'ki',
 'kk',
 'kk-cyrl',
 'kk-kz',
 'kk-latn',
 'kk-tr',
 'kl',
 'km',
 'kn',
 'ko',
 'ksh',
 'ku',
 'ku-latn',
 'ky',
 'la',
 'lad',
 'lb',
 'lez',
 'lmo',
 'ln',
 'lo',
 'lt',
 'ltg',
 'lv',
 'lzh',
 'mai',
 'map-bms',
 'mg',
 'mhr',
 'mi',
 'min',
 'mk',
 'ml',
 'mn',
 'mr',
 'ms',
 'mt',
 'my',
 'myv',
 'mzn',
 'nap',
 'nb',
 'nds',
 'nds-nl',
 'ne',
 'new',
 'nl-informal',
 'nn',
 'no',
 'oc',
 'olo',
 'om',
 'or',
 'os',
 'pa',
 'pam',
 'pap',
 'pcd',
 'pl',
 'pms',
 'pnb',
 'ps',
 'pt-br',
 'qu',
 'qug',
 'rmy',
 'ro',
 'ru',
 'rue',
 'rw',
 'sa',
 'sah',
 'sc',
 'scn',
 'sco',
 'sd',
 'se',
 'sei',
 'sh',
 'si',
 'simple',
 'sje',
 'sk',
 'sl',
 'sma',
 'smj',
 'sn',
 'so',
 'sq',
 'sr',
 'sr-ec',
 'sr-el',
 'srn',
 'stq',
 'su',
 'sv',
 'sw',
 'szl',
 'ta',
 'tcy',
 'te',
 'tg',
 'tg-cyrl',
 'tg-latn',
 'th',
 'ti',
 'tk',
 'tl',
 'tn',
 'tt',
 'ty',
 'udm',
 'uk',
 'ur',
 'uz',
 've',
 'vec',
 'vep',
 'vi',
 'vls',
 'vo',
 'wa',
 'war',
 'wo',
 'xh',
 'xmf',
 'yi',
 'yo',
 'yue',
 'zu'}
descperlang.loc[prodlangs]
all app app_rate
gan 2 1 0.500000
bar 90 6 0.066667
ace 25 3 0.120000
kn 2649 41 0.015478
vec 2 2 1.000000
tt 29 0 0.000000
et 134 15 0.111940
rmy 8 0 0.000000
sr-ec 9 0 0.000000
om 8 7 0.875000
fy 57 0 0.000000
nds 179 0 0.000000
wa 23 0 0.000000
ki 2 0 0.000000
gsw 63 0 0.000000
ab 3 3 1.000000
als 8 8 1.000000
he 4741 1859 0.392111
dty 3 1 0.333333
ur 159 68 0.427673
bh 21 14 0.666667
qu 218 0 0.000000
sk 935 31 0.033155
ve 3 3 1.000000
bho 5 0 0.000000
is 165 3 0.018182
lv 947 442 0.466737
cdo 87 87 1.000000
ms 384 94 0.244792
ku-latn 9 0 0.000000
... ... ... ...
kg 1 0 0.000000
eu 552 123 0.222826
jv 61 17 0.278689
diq 2 1 0.500000
ty 1 0 0.000000
vls 1 0 0.000000
sw 176 3 0.017045
war 4 1 0.250000
mhr 1 0 0.000000
eo 2352 46 0.019558
bs 166 22 0.132530
mn 40 13 0.325000
pt-br 679 0 0.000000
kk-tr 2 0 0.000000
sr-el 6 0 0.000000
simple 30 28 0.933333
ang 9 8 0.888889
nb 3836 7 0.001825
th 471 188 0.399151
af 86 14 0.162791
ce 13 0 0.000000
lb 820 0 0.000000
pa 14 10 0.714286
stq 6 0 0.000000
arz 18 3 0.166667
frp 4 0 0.000000
vo 2 0 0.000000
lo 45 37 0.822222
mk 781 12 0.015365
mai 15 10 0.666667

225 rows × 3 columns

# percentage of app edits among all manual description edits 
# for all language where the feature was in production during the entire timespan:
descperlang.loc[prodlangs]['app'].sum() / descperlang.loc[prodlangs]['all'].sum()
0.019964951010104232
# same without Tamil (which during this timeframe saw 741651 overall description edits in the dataset, 
# or more than 9x as many as the second largest language - so we assume for now that most of these 
# are bot edits not caught by our bot detection method above
descperlang.loc[prodlangs.difference(['ta'])]['app'].sum() / descperlang.loc[prodlangs.difference(['ta'])]['all'].sum() 
0.093953619425702017
# same while removing Arabic (second-most edits, more than English)
descperlang.loc[prodlangs.difference(['ta','ar'])]['app'].sum() / descperlang.loc[prodlangs.difference(['ta','ar'])]['all'].sum() 
0.13728133658657624

Results from March 1 (first production rollout) on

# repeat from above, with change date range and variable names
daterange2 =  pd.date_range(start='2017-03-01', end='2017-07-17', freq='D')
%%time
# calculate daily edit rate per language for a range of dates
!date

# initialize dataframes to count reverts per day:
columns = ['all']  # refers to the number of edits


try:
    del(appeditsperday2)
except NameError:
    pass
appeditsperday2 = pd.DataFrame(index=daterange2, columns=columns)
appeditsperday2 = appeditsperday2.fillna(0)

try:
    del(appeditsperdaylang2)
except NameError:
    pass
appeditsperdaylang2 = pd.DataFrame(index=daterange2, columns=columns)

for col in appeditsperdaylang2:
    appeditsperdaylang2[col] = [defaultdict(int) for i in range(len(appeditsperdaylang2.index))]

try:
    del(appeditsperaction2)
except NameError:
    pass
appeditsperaction2 = defaultdict(int)


# count edits per day and language:
for rev in appedits:
    day = pd.to_datetime(rev[1][:8].decode())
    
    if day in daterange2:
        
        appeditsperday2.loc[day,'all'] += 1
        
        # lang = rev[2].decode()[26:28] # this is a bit hacky and won't work with (e.g.) three-letter lang codes
        summaryparse = re.match(r'/\* wbsetdescription-([^\:]*)\:1\|([^\ ]*) ', rev[2].decode())
        
        lang = summaryparse.group(2) # two- or three-letter language code
        appeditsperdaylang2.loc[day,'all'][lang] +=1
        
        act = summaryparse.group(1) # add, set or remove
        appeditsperaction2[act] += 1
Thu Jul 20 07:02:48 UTC 2017
CPU times: user 34.2 s, sys: 272 ms, total: 34.5 s
Wall time: 34.6 s
#check result
appeditsperdaylang2
all
2017-03-01 {'ru': 63, 'he': 45, 'ar': 3}
2017-03-02 {'ru': 92, 'hr': 4, 'ca': 2, 'de': 1, 'he': 27...
2017-03-03 {'ca': 2, 'hr': 5, 'he': 23, 'ar': 1, 'ru': 211}
2017-03-04 {'ru': 173, 'hr': 54, 'he': 26, 'ar': 4, 'ca': 1}
2017-03-05 {'ru': 122, 'he': 15, 'ar': 2, 'ca': 1}
2017-03-06 {'ru': 187, 'hr': 1, 'ca': 25, 'en': 2, 'de': ...
2017-03-07 {'ru': 118, 'he': 33, 'ca': 13, 'en': 5}
2017-03-08 {'ru': 73, 'he': 122, 'ar': 1, 'ca': 11, 'en': 1}
2017-03-09 {'ru': 52, 'hr': 1, 'ca': 26, 'en': 1, 'he': 1...
2017-03-10 {'ru': 36, 'hr': 15, 'he': 60, 'ar': 2, 'ca': ...
2017-03-11 {'ru': 82, 'hr': 1, 'he': 11, 'id': 1, 'ca': 55}
2017-03-12 {'ru': 29, 'hr': 1, 'he': 17, 'ar': 1, 'ca': 54}
2017-03-13 {'ru': 49, 'hr': 5, 'he': 27, 'ca': 17}
2017-03-14 {'ru': 64, 'he': 13, 'ca': 4}
2017-03-15 {'ru': 78, 'he': 16, 'ca': 3}
2017-03-16 {'ru': 31, 'he': 15, 'ca': 1}
2017-03-17 {'en': 1, 'ru': 24, 'he': 13, 'ca': 11}
2017-03-18 {'en': 2, 'ar': 2, 'he': 16, 'ca': 10, 'ru': 39}
2017-03-19 {'ru': 98, 'he': 13, 'ar': 1, 'ca': 22}
2017-03-20 {'ru': 100, 'he': 43, 'ar': 1, 'ca': 22}
2017-03-21 {'ru': 28, 'he': 35, 'ca': 10, 'en': 1}
2017-03-22 {'ru': 51, 'he': 13, 'ar': 8, 'ca': 1}
2017-03-23 {'ru': 40, 'he': 173, 'ca': 17}
2017-03-24 {'ru': 182, 'he': 386, 'ca': 17}
2017-03-25 {'en': 12, 'he': 210, 'ca': 11, 'ru': 54}
2017-03-26 {'en': 1, 'ru': 22, 'he': 65, 'ca': 12}
2017-03-27 {'ru': 17, 'he': 19, 'ca': 11}
2017-03-28 {'ru': 81, 'he': 28, 'ca': 7}
2017-03-29 {'ru': 42, 'he': 19, 'ca': 2}
2017-03-30 {'ru': 35, 'hr': 24, 'he': 19}
... ...
2017-06-18 {'ru': 18, 'ro': 2, 'pl': 8, 'ky': 3, 'am': 1,...
2017-06-19 {'cs': 11, 'ko': 7, 'pl': 4, 'ar': 20, 'cy': 1...
2017-06-20 {'jv': 1, 'bn': 1, 'ro': 3, 'pl': 6, 'ru': 17,...
2017-06-21 {'cs': 8, 'ml': 3, 'zu': 1, 'ko': 3, 'kn': 2, ...
2017-06-22 {'cs': 17, 'ml': 1, 'pl': 10, 'lv': 1, 'hu': 5...
2017-06-23 {'cs': 5, 'ru': 23, 'pl': 3, 'sr': 5, 'fi': 5,...
2017-06-24 {'eu': 1, 'bn': 22, 'gu': 1, 'ru': 49, 'pl': 3...
2017-06-25 {'bn': 13, 'gu': 1, 'ko': 4, 'pl': 8, 'lv': 11...
2017-06-26 {'cs': 8, 'ca': 3, 'pl': 13, 'lv': 1, 'hu': 5,...
2017-06-27 {'cs': 8, 'gu': 1, 'ru': 62, 'pl': 9, 'hu': 1,...
2017-06-28 {'ca': 2, 'gu': 1, 'ta': 1, 'ml': 2, 'pl': 5, ...
2017-06-29 {'bn': 2, 'ko': 2, 'pl': 4, 'am': 1, 'te': 1, ...
2017-06-30 {'bn': 7, 'ro': 5, 'pl': 21, 'te': 1, 'hu': 1,...
2017-07-01 {'cs': 9, 'ml': 2, 'pl': 7, 'da': 2, 'hu': 2, ...
2017-07-02 {'bn': 3, 'ml': 5, 'pl': 11, 'kn': 1, 'ru': 31...
2017-07-03 {'cs': 12, 'gu': 1, 'ml': 5, 'pl': 3, 'tl': 1,...
2017-07-04 {'cs': 2, 'gu': 1, 'ko': 3, 'pl': 37, 'da': 1,...
2017-07-05 {'cs': 11, 'pt': 6, 'ko': 1, 'pl': 1, 'de': 9,...
2017-07-06 {'eu': 1, 'cs': 18, 'zh-hans': 5, 'ml': 1, 'pl...
2017-07-07 {'bn': 1, 'da': 1, 'tl': 3, 'zh-hans': 4, 'ko'...
2017-07-08 {'el': 2, 'cs': 5, 'tl': 1, 'zh-hans': 11, 'ca...
2017-07-09 {'bn': 4, 'bar': 1, 'zh-hans': 3, 'ace': 1, 'i...
2017-07-10 {'ca': 1, 'zh-hans': 3, 'ml': 3, 'pl': 9, 'de'...
2017-07-11 {'cs': 37, 'ta': 1, 'zh-hans': 4, 'ko': 5, 'pl...
2017-07-12 {'gan': 1, 'bn': 17, 'gu': 1, 'tl': 1, 'zh-han...
2017-07-13 {'bn': 1, 'zh-hans': 4, 'am': 1, 'te': 2, 'sr'...
2017-07-14 {'jv': 1, 'cs': 30, 'zh-hans': 9, 'ru': 41, 'p...
2017-07-15 {'el': 1, 'cs': 8, 'pt': 18, 'zh-hans': 2, 'ro...
2017-07-16 {'cs': 60, 'gu': 1, 'zh-hans': 5, 'ko': 8, 'pl...
2017-07-17 {'bn': 10, 'gu': 3, 'zh-hans': 5, 'cs': 45, 'p...

139 rows × 1 columns

# check result
appeditsperaction2
defaultdict(int, {'add': 23847, 'remove': 73, 'set': 6504})
#check result
appeditsperday2
all
2017-03-01 111
2017-03-02 127
2017-03-03 242
2017-03-04 258
2017-03-05 140
2017-03-06 251
2017-03-07 169
2017-03-08 208
2017-03-09 117
2017-03-10 235
2017-03-11 150
2017-03-12 102
2017-03-13 98
2017-03-14 81
2017-03-15 97
2017-03-16 47
2017-03-17 49
2017-03-18 69
2017-03-19 134
2017-03-20 166
2017-03-21 74
2017-03-22 73
2017-03-23 230
2017-03-24 585
2017-03-25 287
2017-03-26 100
2017-03-27 47
2017-03-28 116
2017-03-29 63
2017-03-30 78
... ...
2017-06-18 203
2017-06-19 129
2017-06-20 181
2017-06-21 126
2017-06-22 142
2017-06-23 130
2017-06-24 201
2017-06-25 323
2017-06-26 310
2017-06-27 209
2017-06-28 138
2017-06-29 207
2017-06-30 181
2017-07-01 167
2017-07-02 246
2017-07-03 176
2017-07-04 194
2017-07-05 228
2017-07-06 658
2017-07-07 812
2017-07-08 673
2017-07-09 750
2017-07-10 529
2017-07-11 662
2017-07-12 568
2017-07-13 476
2017-07-14 503
2017-07-15 700
2017-07-16 722
2017-07-17 861

139 rows × 1 columns

# number of all app edits since March 1
appeditsperday2.sum()
all    30424
dtype: int64
%%time
# For comparison, calculate rate for all "manual" description edits (including non-app edits, excluding bots and OAuth tools)

!date 

# get all (non-bot, non-OAuth) description edits for a period of time
    
# excludes:
# edits that change description together with other fields 
#     (e.g. https://www.wikidata.org/w/index.php?diff=468744965:  "wbsetlabeldescriptionaliases")
# edits from "autoEdit Update Descriptions" 
#     (e.g. https://www.wikidata.org/w/index.php?diff=468745165 : "wbeditentity-update")
# bot edits like https://www.wikidata.org/w/index.php?diff=468757331by by Emijrpbot 
#     or https://www.wikidata.org/w/index.php?diff=468758634 by Mr.Ibrahembot 
#     (these use "wbeditentity-update" instead of "wbsetdescription")
# "#quickstatements" edits (e.g. https://www.wikidata.org/w/index.php?diff=468756428 , 
#      or https://www.wikidata.org/w/index.php?diff=468758591 by Poulpybot)
#      looks like these may have a tag anyway ((Tag: Widar [1.4]" or "Tag: QuickStatements [1.1]")
#      but check edit summary too)
# edits made using an OAuth tool such as:
#    reCH (e.g. https://www.wikidata.org/w/index.php?limit=50&title=Special%3AContributions&contribs=user&target=%D4%B1%D5%B7%D5%A2%D5%B8%D5%BF&namespace=&tagfilter=&year=2017&month=2 
#    distributed Wikidata game and other Widar edits 
#       (e.g https://www.wikidata.org/w/index.php?diff=457815520 :
#       "The Distributed Game (23): Kaspar's Persondata game: Descriptions #distributed-game")
# edits from users whose names end in -bot or -Bot

# should exclude: 
# bot edits in general (use bot flag from RC table?)
# probably other so far undetected automated edits, e.g. in Tamil

conn3 = pymysql.connect(
    host=os.environ['MYSQL_HOST'],
    user=os.environ['MYSQL_USERNAME'],
    password=os.environ['MYSQL_PASSWORD'],
    database='wikidatawiki_p',
    charset='utf8')

with conn3.cursor() as cur:
    cur.execute("""
    SELECT rev_id, rev_timestamp, rev_comment, rev_user_text, ct_tag
    FROM (
        SELECT rev_id, rev_timestamp, rev_comment, rev_user_text
        FROM wikidatawiki_p.revision 
        WHERE rev_timestamp >= 20170301000000
        AND rev_comment LIKE '/* wbsetdescription%'
        AND rev_comment NOT LIKE '%#quickstatements') AS rev
    LEFT JOIN (
        SELECT ct_rev_id, ct_tag FROM wikidatawiki_p.change_tag
        WHERE ct_tag LIKE 'OAuth%'
        OR ct_tag LIKE 'QuickStatements%') AS ct
    ON rev.rev_id = ct.ct_rev_id  
    GROUP BY rev_id
    HAVING ct_tag IS NULL
    AND rev_user_text NOT RLIKE '[Bb]ot$'
    ORDER BY rev_timestamp;""")
    alldescdits2 = cur.fetchall()

    # NB: The change_tag table can contain several rows for the same edit 
Thu Jul 20 07:10:04 UTC 2017
#fix typo, check result
alldescedits2 = alldescdits2
print(len(alldescedits2))
print(alldescedits2[0])
print(alldescedits2[-1])
print(alldescedits2[0:10])
1279627
(458445931, b'20170301000037', b'/* wbsetdescription-add:1|pl */ dos\xc5\x82. wolny Krym.', b'Lantuszka', None)
(524055539, b'20170720071000', b'/* wbsetdescription-add:1|he */ \xd7\xa2\xd7\x99\xd7\xaa\xd7\x95\xd7\xa0\xd7\x90\xd7\x99\xd7\xaa \xd7\x99\xd7\xa9\xd7\xa8\xd7\x90\xd7\x9c\xd7\x99\xd7\xaa', b'DGtal', None)
((458445931, b'20170301000037', b'/* wbsetdescription-add:1|pl */ dos\xc5\x82. wolny Krym.', b'Lantuszka', None), (458446046, b'20170301000051', b'/* wbsetdescription-set:1|en */ line simplification algorithm', b'Mwtoews', None), (458446204, b'20170301000110', b'/* wbsetdescription-add:1|fi */ 2017 alkanut Netflix-sarja', b'OneMember', None), (458446470, b'20170301000146', b'/* wbsetdescription-add:1|fi */ 2016 vuoden nuortenelokuva', b'OneMember', None), (458446828, b'20170301000232', b'/* wbsetdescription-add:1|fi */ Yhdysvaltalainen koko perheen tietokoneanimoitu komediaelokuva', b'OneMember', None), (458447036, b'20170301000255', b'/* wbsetdescription-add:1|pl */ \xc5\x9bwi\xc4\x99ty katolicki i prawos\xc5\x82awny', b'Birkeen', None), (458447151, b'20170301000312', b'/* wbsetdescription-set:1|fi */ Yhdysvaltalainen tilannekomedia', b'OneMember', None), (458447286, b'20170301000332', b'/* wbsetdescription-add:1|fi */ Yhdysvaltalainen televisiosarja', b'OneMember', None), (458447721, b'20170301000433', b'/* wbsetdescription-add:1|fi */ 2011 vuoden sotaelokuva', b'OneMember', None), (458447749, b'20170301000437', b'/* wbsetdescription-add:1|uk */ \xd0\xba\xd0\xbe\xd0\xbb\xd1\x83\xd0\xbc\xd0\xb1\xd1\x96\xd0\xb9\xd1\x81\xd1\x8c\xd0\xba\xd0\xb8\xd0\xb9 \xd1\x84\xd1\x83\xd1\x82\xd0\xb1\xd0\xbe\xd0\xbb\xd1\x96\xd1\x81\xd1\x82', b'Lexusuns', None))
%%time
# calculate daily edit rate per language for a range of dates
!date

# initialize dataframes to count reverts per day:
columns = ['all']  # refers to the number of edits


try:
    del(desceditsperday2)
except NameError:
    pass
desceditsperday2 = pd.DataFrame(index=daterange2, columns=columns)
desceditsperday2 = desceditsperday2.fillna(0)

try:
    del(desceditsperdaylang2)
except NameError:
    pass
desceditsperdaylang2 = pd.DataFrame(index=daterange2, columns=columns)

for col in desceditsperdaylang2:
    desceditsperdaylang2[col] = [defaultdict(int) for i in range(len(desceditsperdaylang2.index))]


try:
    del(desceditsperaction2)
except NameError:
    pass
desceditsperaction2 = defaultdict(int)



# count edits per day and language:
for rev in alldescedits2:
    day = pd.to_datetime(rev[1][:8].decode())
    
    if day in daterange2:
        
        desceditsperday2.loc[day,'all'] += 1
        
        # lang = rev[2].decode()[26:28] # this is a bit hacky and won't work with (e.g.) three-letter lang codes
        summaryparse = re.match(r'/\* wbsetdescription-([^\:]*)\:1\|([^\ ]*) ', rev[2].decode())
        
        lang = summaryparse.group(2) # two- or three-letter language code
        desceditsperdaylang2.loc[day,'all'][lang] +=1

        
        act = summaryparse.group(1) # add, set or remove
        desceditsperaction2[act] += 1
        
Thu Jul 20 08:06:00 UTC 2017
CPU times: user 24min 14s, sys: 11 s, total: 24min 25s
Wall time: 24min 32s
# check result 
desceditsperday2
all
2017-03-01 3636
2017-03-02 3656
2017-03-03 5035
2017-03-04 3451
2017-03-05 3473
2017-03-06 3140
2017-03-07 2955
2017-03-08 3211
2017-03-09 3427
2017-03-10 3346
2017-03-11 3704
2017-03-12 6296
2017-03-13 2839
2017-03-14 3210
2017-03-15 3214
2017-03-16 2584
2017-03-17 2914
2017-03-18 2661
2017-03-19 3215
2017-03-20 3272
2017-03-21 3035
2017-03-22 2616
2017-03-23 3088
2017-03-24 3790
2017-03-25 3281
2017-03-26 2719
2017-03-27 3032
2017-03-28 2517
2017-03-29 3039
2017-03-30 2603
... ...
2017-06-18 3380
2017-06-19 3052
2017-06-20 3289
2017-06-21 3066
2017-06-22 30419
2017-06-23 36399
2017-06-24 17043
2017-06-25 3306
2017-06-26 3532
2017-06-27 3333
2017-06-28 3373
2017-06-29 3237
2017-06-30 3942
2017-07-01 4789
2017-07-02 4811
2017-07-03 3442
2017-07-04 3177
2017-07-05 3419
2017-07-06 3818
2017-07-07 3907
2017-07-08 3973
2017-07-09 4072
2017-07-10 3681
2017-07-11 3587
2017-07-12 3221
2017-07-13 3423
2017-07-14 3695
2017-07-15 4931
2017-07-16 3992
2017-07-17 3678

139 rows × 1 columns

# check result 
desceditsperdaylang2
all
2017-03-01 {'bn': 3, 'bar': 1, 'zh-hans': 1, 'is': 1, 'ar...
2017-03-02 {'bn': 13, 'zh-hans': 1, 'yue': 5, 'zh-hant': ...
2017-03-03 {'bn': 19, 'zh-hans': 1, 'cs': 51, 'is': 1, 'k...
2017-03-04 {'bn': 17, 'bar': 2, 'zu': 1, 'kk-cyrl': 3, 'e...
2017-03-05 {'ru': 220, 'ka': 12, 'he': 47, 'zh-hans': 1, ...
2017-03-06 {'bn': 1, 'he': 67, 'it': 81, 'ka': 29, 'bh': ...
2017-03-07 {'bn': 35, 'is': 2, 'ar': 83, 'te': 3, 'da': 1...
2017-03-08 {'bn': 15, 'ka': 4, 'la': 1, 'min': 1, 'or': 6...
2017-03-09 {'bn': 6, 'bar': 1, 'tt': 4, 'et': 5, 'bh': 1,...
2017-03-10 {'bn': 7, 'lt': 4, 'yue': 3, 'zh-hant': 4, 'da...
2017-03-11 {'bn': 5, 'ilo': 1, 'zh-hans': 3, 'myv': 11, '...
2017-03-12 {'ru': 159, 'zh-hans': 1, 'bn': 5, 'ba': 1, 's...
2017-03-13 {'bn': 4, 'zh-hans': 2, 'zh-hk': 1, 'lt': 1, '...
2017-03-14 {'bn': 1, 'sr-el': 3, 'si': 1, 'ka': 36, 'nn':...
2017-03-15 {'bn': 9, 'zh-hant': 3, 'zh-hans': 1, 'ml': 2,...
2017-03-16 {'bn': 4, 'ml': 3, 'fa': 3, 'te': 4, 'da': 27,...
2017-03-17 {'bn': 30, 'frr': 1, 'it': 64, 'et': 1, 'la': ...
2017-03-18 {'bn': 5, 'ay': 4, 'da': 17, 'la': 2, 'et': 19...
2017-03-19 {'bn': 2, 'yi': 3, 'lt': 3, 'te': 1, 'zh-hant'...
2017-03-20 {'bn': 1, 'frr': 3, 'af': 1, 'el': 41, 'it': 8...
2017-03-21 {'bn': 18, 'bar': 1, 'tokipona': 1, 'it': 83, ...
2017-03-22 {'fo': 1, 'bn': 3, 'lt': 5, 'is': 3, 'te': 4, ...
2017-03-23 {'bn': 29, 'frr': 1, 'si': 1, 'ka': 4, 'la': 2...
2017-03-24 {'bn': 2, 'bar': 2, 'it': 70, 'ka': 31, 'nn': ...
2017-03-25 {'bn': 6, 'zh-hans': 3, 'lt': 2, 'zh-hk': 1, '...
2017-03-26 {'bn': 18, 'zh-hans': 2, 'lt': 2, 'is': 1, 'ck...
2017-03-27 {'bn': 2, 'bar': 2, 'he': 63, 'ka': 2, 'la': 1...
2017-03-28 {'bn': 3, 'bar': 1, 'he': 50, 'yi': 3, 'lt': 3...
2017-03-29 {'bn': 4, 'bar': 1, 'zh-hans': 20, 'zh-hk': 19...
2017-03-30 {'sc': 1, 'bn': 3, 'bar': 1, 'zh-hans': 1, 'ar...
... ...
2017-06-18 {'bn': 6, 'ace': 2, 'is': 4, 'uz': 1, 'ka': 3,...
2017-06-19 {'bn': 8, 'ace': 1, 'it': 65, 'ka': 3, 'nn': 5...
2017-06-20 {'bn': 18, 'bar': 1, 'frr': 1, 'si': 1, 'ka': ...
2017-06-21 {'bn': 5, 'bar': 1, 'ilo': 2, 'zu': 2, 'kn': 2...
2017-06-22 {'bn': 15, 'lt': 1, 'sl': 2, 'id': 6, 'hsb': 1...
2017-06-23 {'bn': 23, 'uz': 1, 'ka': 3, 'nn': 14, 'min': ...
2017-06-24 {'bn': 31, 'ace': 1, 'kn': 3, 'uz': 1, 'et': 2...
2017-06-25 {'bn': 14, 'bar': 1, 'kn': 3, 'it': 98, 'uz': ...
2017-06-26 {'bn': 11, 'bar': 4, 'kn': 1, 'ka': 2, 'nn': 8...
2017-06-27 {'bn': 3, 'kn': 3, 'uz': 1, 'et': 4, 'nn': 10,...
2017-06-28 {'ru': 311, 'bar': 1, 'yi': 1, 'ml': 2, 'mr': ...
2017-06-29 {'bn': 5, 'ace': 1, 'kk-cyrl': 5, 'kn': 65, 'u...
2017-06-30 {'bn': 9, 'is': 1, 'id': 7, 'yue': 1, 'kn': 73...
2017-07-01 {'bn': 7, 'he': 50, 'kn': 788, 'it': 108, 'sa'...
2017-07-02 {'bn': 21, 'yi': 2, 'ml': 5, 'gl': 6, 'te': 11...
2017-07-03 {'eu': 49, 'tg-cyrl': 1, 'ml': 5, 'gl': 5, 'yu...
2017-07-04 {'bn': 8, 'ace': 1, 'arn': 1, 'it': 118, 'ka':...
2017-07-05 {'bn': 6, 'zh-hans': 2, 'gl': 5, 'yue': 2, 'kn...
2017-07-06 {'bn': 20, 'zh-hans': 6, 'ml': 1, 'is': 1, 'id...
2017-07-07 {'bn': 42, 'zh-hans': 5, 'cs': 67, 'sa': 13, '...
2017-07-08 {'bn': 2, 'bar': 1, 'ilo': 1, 'ace': 1, 'kn': ...
2017-07-09 {'be-tarask': 1, 'bn': 5, 'bar': 1, 'zh-hans':...
2017-07-10 {'ru': 150, 'bar': 25, 'arz': 1, 'zh-hans': 3,...
2017-07-11 {'bn': 9, 'bar': 2, 'zh-hans': 5, 'lt': 4, 'yi...
2017-07-12 {'gan': 1, 'bn': 17, 'tg-cyrl': 1, 'zh-hans': ...
2017-07-13 {'bn': 3, 'kn': 17, 'et': 1, 'nn': 1, 'om': 1,...
2017-07-14 {'bn': 1, 'kn': 14, 'et': 4, 'la': 3, 'sr-ec':...
2017-07-15 {'bn': 11, 'bar': 1, 'he': 37, 'frr': 1, 'kn':...
2017-07-16 {'bn': 10, 'bar': 2, 'ace': 2, 'uz': 1, 'tt': ...
2017-07-17 {'bn': 11, 'bar': 1, 'zh-hans': 5, 'af': 1, 's...

139 rows × 1 columns

# check result 
desceditsperaction2
defaultdict(int, {'add': 1156551, 'remove': 18121, 'set': 96826})
# calculate overall rate and app percentage per language during timespan

# sum over all languages that occurred
langs = set()
allperlang2 = collections.Counter()
appperlang2 = collections.Counter()

for i in desceditsperdaylang2.loc[daterange2].index:
    langs = langs | set(desceditsperdaylang2['all'][i].keys())
    allperlang2 = allperlang2 + collections.Counter(desceditsperdaylang2['all'][i])
    appperlang2 = appperlang2 + collections.Counter(appeditsperdaylang2['all'][i])
    
# combine into one dataframe
try:
    del(descperlang2)
except NameError:
    pass
descperlang2 = pd.DataFrame(index=langs, columns = ['all', 'app', 'app_rate'])
descperlang2[['all', 'app']] = descperlang2[['all', 'app']].fillna(0)
descperlang2[['app_rate']] = descperlang2[['app_rate']].fillna(np.NaN)

for lang in langs:
    descperlang2.loc[lang,'all'] = allperlang2[lang]
    descperlang2.loc[lang,'app'] = appperlang2[lang]
    if allperlang2[lang] > 0:
        descperlang2.loc[lang,'app_rate'] = appperlang2[lang] / allperlang2[lang]
        

descperlang2 = descperlang2.sort_values('app',ascending=False)
# check result
descperlang2
all app app_rate
ru 34745 5936 0.170845
he 8396 3991 0.475345
ar 79066 2151 0.027205
cs 14399 2029 0.140913
es 26385 1341 0.050824
de 39504 1093 0.027668
fa 1555 1059 0.681029
it 13084 1004 0.076735
sv 7934 986 0.124275
pl 5988 899 0.150134
ca 6816 840 0.123239
id 1884 753 0.399682
ko 2128 723 0.339756
fi 5134 719 0.140047
fr 35887 717 0.019979
pt 2994 447 0.149299
lv 1324 442 0.333837
vi 921 409 0.444083
hi 3836 359 0.093587
bn 1550 288 0.185806
zh 2053 224 0.109109
hu 12291 209 0.017004
en 117702 208 0.001767
sr 1027 207 0.201558
th 592 194 0.327703
hr 674 193 0.286350
nl 11433 181 0.015831
ja 4907 175 0.035663
uk 5597 175 0.031267
ro 1241 168 0.135375
... ... ... ...
kk-latn 2 0 0.000000
aeb-arab 4 0 0.000000
en-gb 3656 0 0.000000
tg-latn 3 0 0.000000
ext 23 0 0.000000
ba 136 0 0.000000
bjn 8 0 0.000000
yue 179 0 0.000000
sc 25 0 0.000000
csb 3 0 0.000000
ia 282 0 0.000000
bug 2 0 0.000000
kab 1 0 0.000000
dv 2 0 0.000000
frr 16 0 0.000000
sei 2 0 0.000000
qug 1 0 0.000000
os 1 0 0.000000
sje 27 0 0.000000
kk-cyrl 18 0 0.000000
smj 163 0 0.000000
gan-hans 1 0 0.000000
sma 3 0 0.000000
ee 1 0 0.000000
ak 8 0 0.000000
tn 1 0 0.000000
be-tarask 641 0 0.000000
kk-kz 1 0 0.000000
gan-hant 1 0 0.000000
ltg 3 0 0.000000

253 rows × 3 columns

Results for March 1-31

# result for daterange =  pd.date_range(start='2017-03-01', end='2017-03-31', freq='D') : 
descperlang
all app app_rate
ru 6387 2296 0.359480
he 2413 1570 0.650642
ca 1736 503 0.289747
hr 188 113 0.601064
ar 925 51 0.055135
en 29096 26 0.000894
:1 3448 3 0.000870
de 10079 3 0.000298
id 256 1 0.003906
zh 611 0 0.000000
ga 9 0 0.000000
si 5 0 0.000000
pc 1 0 0.000000
uz 1 0 0.000000
ky 5 0 0.000000
ma 12 0 0.000000
pl 1440 0 0.000000
ay 64 0 0.000000
oc 10 0 0.000000
lb 320 0 0.000000
nl 2817 0 0.000000
ja 747 0 0.000000
et 199 0 0.000000
fr 6565 0 0.000000
bg 3902 0 0.000000
no 15 0 0.000000
kl 1 0 0.000000
pa 12 0 0.000000
io 2 0 0.000000
dv 1 0 0.000000
... ... ... ...
hu 3223 0 0.000000
ig 1 0 0.000000
ta 16 0 0.000000
yo 2 0 0.000000
lt 39 0 0.000000
az 3 0 0.000000
nd 5 0 0.000000
gs 6 0 0.000000
sr 120 0 0.000000
sv 1434 0 0.000000
sw 5 0 0.000000
ur 104 0 0.000000
th 49 0 0.000000
sk 1517 0 0.000000
my 99 0 0.000000
km 6 0 0.000000
mz 1 0 0.000000
sm 1 0 0.000000
qu 114 0 0.000000
ak 6 0 0.000000
ex 7 0 0.000000
as 22 0 0.000000
eo 541 0 0.000000
gd 34 0 0.000000
ck 36 0 0.000000
wa 2 0 0.000000
tr 696 0 0.000000
sa 3 0 0.000000
tl 55 0 0.000000
eu 155 0 0.000000

135 rows × 3 columns

# result re-sorted for daterange =  pd.date_range(start='2017-03-01', end='2017-03-31', freq='D') :
descperlang.sort_values('all',ascending=False)
all app app_rate
en 29096 26 0.000894
de 10079 3 0.000298
fr 6565 0 0.000000
ru 6387 2296 0.359480
es 4694 0 0.000000
bg 3902 0 0.000000
:1 3448 3 0.000870
hu 3223 0 0.000000
it 2863 0 0.000000
nl 2817 0 0.000000
he 2413 1570 0.650642
cs 1804 0 0.000000
fi 1776 0 0.000000
ca 1736 503 0.289747
da 1665 0 0.000000
sk 1517 0 0.000000
pl 1440 0 0.000000
sv 1434 0 0.000000
uk 1361 0 0.000000
el 1145 0 0.000000
ar 925 51 0.055135
hy 840 0 0.000000
pt 804 0 0.000000
ja 747 0 0.000000
tr 696 0 0.000000
zh 611 0 0.000000
eo 541 0 0.000000
be 512 0 0.000000
ko 425 0 0.000000
nb 398 0 0.000000
... ... ... ...
gn 3 0 0.000000
sa 3 0 0.000000
az 3 0 0.000000
cv 2 0 0.000000
wa 2 0 0.000000
tt 2 0 0.000000
ku 2 0 0.000000
yo 2 0 0.000000
jv 2 0 0.000000
lf 2 0 0.000000
io 2 0 0.000000
kn 2 0 0.000000
na 2 0 0.000000
ht 2 0 0.000000
pc 1 0 0.000000
ig 1 0 0.000000
ln 1 0 0.000000
mi 1 0 0.000000
dv 1 0 0.000000
ki 1 0 0.000000
ps 1 0 0.000000
lo 1 0 0.000000
to 1 0 0.000000
li 1 0 0.000000
uz 1 0 0.000000
nr 1 0 0.000000
sm 1 0 0.000000
mz 1 0 0.000000
ks 1 0 0.000000
kl 1 0 0.000000

135 rows × 3 columns

Scratchpad

# check ":1" issue

testdaterange =  pd.date_range(start='2017-04-24', end='2017-04-27', freq='D')

for rev in appedits:        
        lang = rev[2].decode()[26:28]
        if lang == ":1":
            print(rev)
            
(405227979, b'20161110141959', b'/* wbsetdescription-remove:1|en */ g', b'68.40.255.173')
(412637708, b'20161130165601', b'/* wbsetdescription-remove:1|en */ Australian cricketer', b'62.135.211.53')
(436214896, b'20170125235929', b'/* wbsetdescription-remove:1|en */ lake located in South Australia, Australia', b'62.135.211.53')
(440290762, b'20170131232705', b'/* wbsetdescription-remove:1|en */ \xd7\xaa\xd6\xb7\xd7\xa8\xd6\xb0\xd7\xa0\xd6\xb0\xd7\x92\xd7\x95\xd6\xb9\xd7\x9c  his is a\xf0\x9f\x94\x8aest \xd7\x9e\xd6\xb7\xd7\x99\xd6\xb4\xd7\x9d', b'50.250.225.50')
(440551146, b'20170201153436', b'/* wbsetdescription-remove:1|en */ gdyyd', b'RHo (WMF)')
(440580124, b'20170201174054', b'/* wbsetdescription-remove:1|en */ lake', b'RHo (WMF)')
(458871234, b'20170301184133', b'/* wbsetdescription-remove:1|ru */ \xd0\xbb\xd1\x8f', b'95.220.146.174')
(470656142, b'20170327051224', b'/* wbsetdescription-remove:1|ru */ \xd0\xbf\xd1\x80\xd0\xb8\xd0\xb5\xd0\xb7\xd0\xb6\xd0\xb0\xd0\xb9\xd1\x82\xd0\xb5 \xd0\xb2 \xd0\xbc\xd0\xb0\xd0\xb3\xd0\xb0\xd0\xb4\xd0\xb0\xd0\xbd', b'91.108.29.159')
(470690437, b'20170327105536', b'/* wbsetdescription-remove:1|ru */ \xd1\x88\xd0\xba\xd1\x83\xd1\x80\xd0\xb0\xd0\xbc \xd0\xbd\xd0\xb5 \xd1\x81\xd0\xbe\xd0\xb2\xd0\xb5\xd1\x82\xd1\x83\xd1\x8e', b'109.195.223.170')
(473676318, b'20170410074540', b'/* wbsetdescription-remove:1|ru */ \xd0\xa1\xd0\xb0\xd0\xbc\xd0\xbc\xd0\xb0\xd1\x8f \xd1\x81\xd0\xb5\xd0\xba\xd1\x81\xd1\x83\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xbd\xd0\xb0\xd1\x8f \xd0\xb4\xd0\xb5\xd0\xb2\xd1\x83\xd1\x88\xd0\xba\xd0\xb0', b'William montmorency')
(473825445, b'20170411011301', b'/* wbsetdescription-remove:1|ru */ .', b'80.83.235.126')
(479031026, b'20170426063822', b'/* wbsetdescription-remove:1|pl */ g\xc5\x82owa', b'2A00:F41:18AA:E5C6:3E4F:FBBB:59A:90F9')
(479249970, b'20170427070315', b'/* wbsetdescription-remove:1|zh */ \xe6\xb1\x9f\xe6\xb9\x96 \xe5\xb0\x87\xe8\xbb\x8d\xe6\xbe\xb3\xe5\x8d\x80\xe5\xb9\xb3\xe5\xae\x9a\xe5\xa4\xa9\xe4\xb8\x8b [\xe4\xb8\x89\xe8\x8b\xb1\xe4\xb9\x8b\xe4\xb8\x80]', b'119.236.175.72')
(479954128, b'20170429220359', b'/* wbsetdescription-remove:1|fi */ fryjgg', b'91.157.187.94')
(480390523, b'20170501003955', b'/* wbsetdescription-remove:1|ms */ Peristiwa Sebelum Penbunuhan Penghulu Imban Medan', b'113.210.196.23')
(480509206, b'20170501150522', b'/* wbsetdescription-remove:1|ru */ \xd0\x9d\xd0\xb8\xd0\xba\xd0\xbe\xd0\xbb\xd0\xb0\xcc\x81\xd0\xb9 \xd0\x93\xd1\x80\xd0\xb8\xd0\xb3\xd0\xbe\xcc\x81\xd1\x80\xd1\x8c\xd0\xb5\xd0\xb2\xd0\xb8\xd1\x87 \xd0\x9f\xd0\xb8\xd0\xb3\xd0\xbe\xd1\x80\xd0\xb5\xd0\xb2 (20 \xd0\xbc\xd0\xb0\xd1\x8f 1925, \xd0\x9a\xd1\x83\xd1\x80\xd1\x81\xd0\xba, \xd0\xa0\xd0\xa1\xd0\xa4\xd0\xa1\xd0\xa0, \xd0\xa1\xd0\xa1\xd0\xa1\xd0\xa0 \xe2\x80\x94 23 \xd0\xbc\xd0\xb0\xd1\x80\xd1\x82\xd0\xb0 1944, \xd0\xb1\xd0\xbb\xd0\xb8\xd0\xb7 \xd1\x81\xd0\xb5\xd0\xbb\xd0\xb0 \xd0\xa0\xd0\xbe\xd0\xbc\xd0\xb0\xd0\xbd\xd0\xbe\xd0\xb2\xd0\xba\xd0\xb0, \xd0\xa2\xd0\xb5\xd1\x80\xd0\xbd\xd0\xbe\xd0\xbf\xd0\xbe\xd0\xbb\xd1\x8c\xd1\x81\xd0\xba\xd0\xb8\xd0\xb9 \xd1\x80\xd0\xb0\xd0\xb9\xd0\xbe\xd0\xbd, \xd0\xa2\xd0\xb5...', b'46.72.251.237')
(480892288, b'20170503044842', b'/* wbsetdescription-remove:1|fa */ \xd9\x85\xdb\x8c\xd8\xae\xd9\x88\xd8\xa7\xd9\x85 \xd8\xb9\xda\xa9\xd8\xb3\xd9\x85\xd9\x88 \xd8\xaf\xd8\xb1 \xda\xaf\xd9\x88\xda\xaf\xd9\x84 \xd8\xa8\xd8\xb2\xd8\xa7\xd8\xb1\xd9\x85', b'5.202.116.165')
(481995660, b'20170505204430', b'/* wbsetdescription-remove:1|ar */ \xd8\xb2', b'41.250.222.17')
(481996386, b'20170505204725', b'/* wbsetdescription-remove:1|ar */ \xd8\xad\xd9\x85\xd8\xb2\xd8\xa9 \xd8\xa7\xd9\x84\xd8\xa8\xd9\x87\xd8\xac\xd8\xa7\xd9\x88\xd9\x8a', b'41.250.222.17')
(482523622, b'20170506120641', b"/* wbsetdescription-remove:1|zh */ '''\xe9\x9b\xbb\xe5\x8b\x95\xe5\x96\xae\xe8\xbc\xaa\xe8\xbb\x8a'''\xe3\x80\x8a\xe8\x8a\xb1\xe5\xbc\x8f\xe9\x9a\x8a\xe9\x9a\x8a\xe9\x95\xb7\xe3\x80\x8b \xe9\xa6\x99\xe6\xb8\xaf\xe7\xac\xac\xe4\xb8\x80\xe4\xbb\xa3\xe7\x8e\xa9\xe9\x9b\xbb\xe5\xad\x90\xe9\xab\x94\xe6\x84\x9f\xe8\xbb\x8a\xe7\x9a\x84\xe6\xbd\xae\xe4\xba\xba \xe5\x9c\xb0\xe4\xb8\x8b\xe8\x8a\xb1\xe5\xbc\x8f\xe6\xaf\x94\xe8\xb3\xbd\xef\xbc\x9a\xe4\xb9\x9d\xe9\xbe\x8d\xe5\x8d\x80\xe7\xac\xac\xe4\xb8\x80\xe5\x90\x8d\xe3\x80\x81\xe9\xa6\x99\xe6\xb8\xaf\xe5\x8d\x80\xe7\xac\xac\xe4\xb8\x80\xe5\x90\x8d\xe3\x80\x81\xe5\x85\xa8\xe9\xa6\x993\xe5\x90\x8d\xe5\x85\xa7\xe3\x80\x82 (\xe9\x9b\xbb\xe5\x8b\x95\xe8\xbb\x8a\xe6\xad\xa3\xe7\xa2\xba\xe5\x90\x8d\xe7\xa8\xb1\xe7\x82\xba \xe2\x80\x9c\xe9\xab\x94\xe6\x84\x9f\xe8\xbb\x8a\xe2\x80\x9d)", b'218.255.163.76')
(483606141, b'20170509064604', b'/* wbsetdescription-remove:1|zh */ \xe7\x94\xb1\xe6\x96\xaf\xe5\xa8\x81\xe5\xa4\xab\xe7\x89\xb9\xe4\xb8\x8e\xe6\xad\x8c\xe6\x9b\xb2\xe7\x9a\x84\xe5\x88\xb6\xe4\xbd\x9c\xe4\xba\xba\xe9\xa9\xac\xe5\x85\x8b\xe6\x96\xaf\xc2\xb7\xe9\xa9\xac\xe4\xb8\x81\xe5\x92\x8c\xe6\xad\x87\xe5\xb0\x94\xe8\xb4\x9d\xe5\x85\x8b\xe5\x85\xb1\xe5\x90\x8c\xe5\x88\x9b\xe4\xbd\x9c\xe7\x9a\x84\xe6\xad\x8c\xe6\x9b\xb2', b'Vitsuta')
(484501023, b'20170511164642', b'/* wbsetdescription-remove:1|fa */ \xd8\xb9\xd9\x84\xd8\xa7\xd9\x82\xd9\x85\xd9\x86\xd8\xaf\xd8\xa7\xd9\x86 \xd8\xb9\xd8\xb2\xdb\x8c\xd8\xb2\xdb\x8c \xda\xa9\xd9\x87 \xd8\xa8\xd9\x87 \xd8\xa2\xd8\xb4\xd9\xbe\xd8\xb2\xdb\x8c \xd9\x88 \xd8\xb4\xdb\x8c\xd8\xb1\xdb\x8c\xd9\x86\xdb\x8c \xd9\xbe\xd8\xb2\xdb\x8c \xd8\xb9\xd9\x84\xd8\xa7\xd9\x82\xd9\x87 \xd8\xaf\xd8\xa7\xd8\xb1\xdb\x8c\xd9\x86\xd8\x8c \xd8\xa7\xda\xaf\xd9\x87 \xd8\xb3\xd8\xa7\xda\xa9\xd9\x86 \xd8\xb4\xdb\x8c\xd8\xb1\xd8\xa7\xd8\xb2 \xd9\x87\xd8\xb3\xd8\xaa\xdb\x8c\xd9\x86 \xd9\x88 \xd8\xaf\xd9\x86\xd8\xa8\xd8\xa7\xd9\x84 \xdb\x8c\xd9\x87 \xd8\xa2\xd9\x85\xd9\x88\xd8\xb2\xd8\xb4\xda\xaf\xd8\xa7\xd9\x87 \xd8\xae\xd9\x88\xd8\xa8 \xd8\xa8\xd8\xb1\xd8\xa7\xdb\x8c \xd8\xa2\xd9\x85\xd9\x88\xd8\xb2\xd8\xb4 \xd9\x87\xd8\xb3\xd8\xaa\xdb\x8c\xd9\x86\xd8\x8c...', b'151.239.187.98')
(484690105, b'20170512034039', b'/* wbsetdescription-remove:1|th */ \xe0\xb8\x99\xe0\xb8\xb1\xe0\xb8\x81\xe0\xb8\xa3\xe0\xb9\x89\xe0\xb8\xad\xe0\xb8\x87\xe0\xb8\x9c\xe0\xb8\xb9\xe0\xb9\x89\xe0\xb9\x84\xe0\xb8\x94\xe0\xb9\x89\xe0\xb8\xa3\xe0\xb8\xb1\xe0\xb8\x9a\xe0\xb8\xa3\xe0\xb8\xb2\xe0\xb8\x87\xe0\xb8\xa7\xe0\xb8\xb1\xe0\xb8\xa5\xe0\xb8\x8a\xe0\xb8\x99\xe0\xb8\xb0\xe0\xb9\x80\xe0\xb8\xa5\xe0\xb8\xb4\xe0\xb8\xa8\xe0\xb8\x88\xe0\xb8\xb2\xe0\xb8\x81\xe0\xb8\xa3\xe0\xb8\xb2\xe0\xb8\xa2\xe0\xb8\x81\xe0\xb8\xb2\xe0\xb8\xa3 \xe0\xb8\x97\xe0\xb8\xa3\xe0\xb8\xb9 \xe0\xb8\xad\xe0\xb8\xb0\xe0\xb8\x84\xe0\xb8\xb2\xe0\xb9\x80\xe0\xb8\x94\xe0\xb8\xa1\xe0\xb8\xb5\xe0\xb9\x88 \xe0\xb9\x81\xe0\xb8\x9f\xe0\xb8\x99\xe0\xb9\x80\xe0\xb8\x97\xe0\xb9\x80\xe0\xb8\x8a\xe0\xb8\xb5\xe0\xb8\xa2 \xe0\xb8\x8b\xe0\xb8\xb5\xe0\xb8\x8b\xe0\xb8\xb1\xe0\xb9\x88\xe0\xb8\x99 6', b'NELLA32')
(485156225, b'20170513074806', b'/* wbsetdescription-remove:1|ro */ Titanic', b'188.27.73.150')
(486529516, b'20170515192145', b'/* wbsetdescription-remove:1|hr */ govno', b'93.180.101.84')
(486710158, b'20170516104439', b'/* wbsetdescription-remove:1|ps */ \xd8\xaa\xd8\xa7\xd9\x86\xd8\xa8\xd9\x87', b'Bloch khan')
(486727847, b'20170516120114', b'/* wbsetdescription-remove:1|vi */ M\xe1\xbb\x99t h\xe1\xba\xb1ng s\xe1\xbb\x91 trong to\xc3\xa1n h\xe1\xbb\x8dc, k\xc3\xad hi\xe1\xbb\x87u l\xc3\xa0: i', b'Davidmay')
(487087285, b'20170518012743', b'/* wbsetdescription-remove:1|ht */ Madichon se paw\xc3\xb2l malveyan ki swete pou mal\xc3\xa8 rive yon moun.', b'200.113.238.146')
(487116289, b'20170518090436', b'/* wbsetdescription-remove:1|zh */ \xe8\x84\xbe\xe6\xb0\x94\xe7\x81\xab\xe7\x88\x86\xe5\x88\xb0.....', b'2001:E68:5422:2F38:5594:1CA3:1E71:9CB8')
(487345963, b'20170519113025', b'/* wbsetdescription-remove:1|fa */ \xdb\x8c\xd8\xa7 \xd8\xb9\xd9\x84\xdb\x8c', b'2.190.88.78')
(487517070, b'20170520075439', b'/* wbsetdescription-remove:1|fi */ ranskalainen poliitikko"0', b'85.76.78.226')
(487555773, b'20170520143112', b'/* wbsetdescription-remove:1|ar */ \xd8\xa3\xd8\xb1\xd8\xb7\xd8\xba\xd8\xb1\xd9\x84 \xd9\x88\xd9\x84\xd8\xa7 \xd8\xb3\xd8\xa7\xd9\x84\xd9\x85 \xd9\x82\xd8\xaf\xd9\x87\xd9\x88 \xd8\xb9\xd9\x84\xd9\x89 \xd8\xa7\xd9\x84\xd8\xb7\xd9\x8a\xd8\xa8 \xd8\xb6\xd8\xa7\xd8\xb1\xd9\x8a\xf0\x9f\x93\xa3', b'51.235.30.110')
(488834919, b'20170523193930', b'/* wbsetdescription-remove:1|ca */ El jon barbe', b'95.131.169.252')
(490865730, b'20170526190136', b'/* wbsetdescription-remove:1|fa */ \xd8\xaa\xd9\x88\xd9\x84', b'2.186.86.87')
(491330641, b'20170528021912', b'/* wbsetdescription-remove:1|fa */ \xd8\xa7\xd9\x85\xd8\xa7\xd9\x85 \xd8\xac\xd9\x85\xd8\xb9\xd9\x87 \xd8\xb4\xd9\x87\xd8\xb1 \xd8\xa7\xd8\xb1\xd8\xaf\xd8\xa8\xdb\x8c\xd9\x84', b'176.65.184.162')
(491469901, b'20170528124536', b'/* wbsetdescription-remove:1|fa */ \xd8\xaf\xd8\xb1 \xd8\xa7\xdb\x8c\xd9\x86 \xd8\xaf\xd9\x88\xd8\xb1\xd8\xa7\xd9\x86 \xd9\xbe\xd8\xa7\xd8\xaf\xd8\xb4\xd8\xa7\xd9\x87\xd8\xa7\xd9\x86 \xd8\xb9\xd8\xa7\xd8\xaf\xd9\x84\xdb\x8c \xd8\xa8\xd9\x88\xd8\xaf\xd9\x86\xd8\xaf \xda\xa9\xd9\x87 \xd8\xa7\xd8\xb2 \xd8\xb1\xd9\x88\xdb\x8c \xd9\x86\xd9\x88\xd8\xb4\xd8\xaa\xd9\x87 \xd9\x87\xd8\xa7\xdb\x8c \xd8\xa8\xd8\xb4\xd9\x82\xd8\xa7\xd8\xa8\xdb\x8c \xd9\x81\xd9\x84\xd8\xb2\xdb\x8c \xd8\xaa\xd8\xb5\xd9\x85\xdb\x8c\xd9\x85\xd8\xa7\xd8\xaa \xd8\xaf\xd8\xb1\xd8\xb3\xd8\xaa \xd8\xb1\xd8\xa7 \xd9\x85\xdb\x8c\xda\xaf\xd8\xb1\xd9\x81\xd8\xaa\xd9\x86\xd8\xaf', b'188.229.66.45')
(491500426, b'20170528141433', b'/* wbsetdescription-remove:1|pl */ gatunek udomowionego gryzonia', b'Kruk974')
(492348306, b'20170530063929', b'/* wbsetdescription-remove:1|fa */ \xd8\xb4\xd8\xae\xd8\xb5\xd9\x8a\xd8\xaa \xd8\xad\xd8\xb3\xd9\x8a\xd9\x86 \xd8\xa7\xd9\x84\xd9\x86\xd8\xa7\xd8\xa6\xd9\x84\xd9\x8a', b'151.235.67.91')
(492885596, b'20170531155531', b'/* wbsetdescription-remove:1|sv */ jnl', b'213.141.67.89')
(493114213, b'20170601091554', b'/* wbsetdescription-remove:1|id */ Zz', b'Nan Aliyev')
(495405167, b'20170605062508', b'/* wbsetdescription-remove:1|ru */ \xd1\x82\xd0\xb8\xd0\xbf homo sapiens', b'212.26.236.57')
(498130211, b'20170609104537', b'/* wbsetdescription-remove:1|ru */ \xd1\x83\xd1\x87\xd1\x91\xd0\xbd\xd1\x8b\xd0\xb9 \xd0\xb8\xd0\xb7 \xd0\xa5\xd0\xbe\xd1\x80\xd0\xb5\xd0\xb7\xd0\xbc\xd0\xb0, \xd0\xb0\xd0\xb2\xd1\x82\xd0\xbe\xd1\x80 \xd0\xbc\xd0\xbd\xd0\xbe\xd0\xb3\xd0\xbe\xd1\x87\xd0\xb8\xd1\x81\xd0\xbb\xd0\xb5\xd0\xbd\xd0\xbd\xd1\x8b\xd1\x85 \xd0\xba\xd0\xb0\xd0\xbf\xd0\xb8\xd1\x82\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xbd\xd1\x8b\xd1\x85 \xd1\x82\xd1\x80\xd1\x83\xd0\xb4\xd0\xbe\xd0\xb2', b'213.87.123.52')
(498386320, b'20170609193408', b'/* wbsetdescription-remove:1|cs */ .', b'Lollypop M')
(498756084, b'20170610093830', b'/* wbsetdescription-remove:1|id */ Kartu Tanda Masuk Tol Elektronik', b'202.62.16.5')
(499759314, b'20170612061928', b'/* wbsetdescription-remove:1|ar */ \xd8\xa7\xd9\x84\xd8\xb9\xd8\xa8\xd8\xb1\xd9\x8a\xd8\xa9', b'51.235.177.246')
(500571612, b'20170613212518', b'/* wbsetdescription-remove:1|fa */ romi rain', b'Nami.kam')
(500822862, b'20170614143528', b'/* wbsetdescription-remove:1|fi */ Kyrp\xc3\xa4 on hyv\xc3\xa4\xc3\xa4', b'2001:14BB:42:51A5:DDC9:8C89:86D9:DE0C')
(501273284, b'20170615204606', b'/* wbsetdescription-remove:1|ru */ suka', b'\xd0\x90\xd1\x81\xd0\xb8\xd0\xbd\xd0\xbe\xd0\xbd\xd0\xb8\xd0\xba\xd1\x81')
(501847099, b'20170616235151', b'/* wbsetdescription-remove:1|ar */ \xd8\xa7\xd9\x84\xd8\xaf\xd9\x85\xd8\xa7\xd8\xb1 \xd8\xa7\xd9\x84\xd8\xb4\xd8\xa7\xd9\x85\xd9\x84', b'Khalid4kha')
(501959021, b'20170617042414', b'/* wbsetdescription-remove:1|mr */ handicap certificate', b'27.97.140.109')
(502328363, b'20170617144354', b'/* wbsetdescription-remove:1|ru */ \xd0\xbf\xd0\xbe\xd1\x80\xd0\xbd\xd0\xbe\xd0\xb7\xd0\xb2\xd0\xb5\xd0\xb7\xd0\xb4\xd0\xb0', b'Kukakku2000')
(502939544, b'20170618185856', b'/* wbsetdescription-remove:1|ru */ \xd1\x84\xd0\xbe\xd0\xbd\xd0\xb4 \xd0\xbf\xd1\x80\xd0\xb8 \xd0\xbe\xd0\xbe\xd0\xbd', b'Danya20096')
(503469185, b'20170619222444', b'/* wbsetdescription-remove:1|he */ \xd7\xa0\xd7\x99\xd7\xa6\xd7\x97\xd7\x95\xd7\x9f \xd7\x9e\xd7\xa8\xd7\x94\xd7\x99\xd7\x91 \xd7\x9c\xd7\xa8\xd7\x99\xd7\x90\xd7\x9c \xd7\x9e\xd7\x93\xd7\xa8\xd7\x99\xd7\x93, \xd7\x94\xd7\xa4\xd7\xa1\xd7\x93 \xd7\xa6\xd7\x95\xd7\xa8\xd7\x91 \xd7\x9c\xd7\x99\xd7\x95\xd7\x91\xd7\xa0\xd7\x98\xd7\x95\xd7\xa1', b'79.178.110.185')
(504127144, b'20170621110812', b'/* wbsetdescription-remove:1|pl */ warto\xc5\x9b\xc4\x87 samouzasadniaj\xc4\x85cs si\xc4\x99', b'178.235.41.93')
(504929014, b'20170622210557', b'/* wbsetdescription-remove:1|ar */ \xd9\x82\xd8\xb7\xd8\xb1', b'5.41.162.209')
(505916069, b'20170624101733', b'/* wbsetdescription-remove:1|hi */ The best president after Kalam sir till date.', b'47.9.189.207')
(507869856, b'20170627061947', b'/* wbsetdescription-remove:1|he */ \xd7\xaa\xd7\x9b', b'ALMOG111')
(508288436, b'20170627181945', b'/* wbsetdescription-remove:1|uk */ ,n9ijj99o989ooo99999898999k9p', b'46.211.113.133')
(511655345, b'20170702111033', b'/* wbsetdescription-remove:1|ru */ \xd0\xb1\xd1\x82\xd1\x80 80', b'Stormsnipe')
(512399880, b'20170703045304', b'/* wbsetdescription-remove:1|ar */ \xd9\x83\xd8\xa3\xd8\xb3 \xd8\xa7\xd9\x84\xd9\x82\xd8\xa7\xd8\xb1\xd8\xa7\xd8\xaa \xd8\xb1\xd9\x88\xd8\xb3\xd9\x8a\xd8\xa7 2018', b'5.156.18.229')
(512481700, b'20170703082832', b'/* wbsetdescription-remove:1|ru */ .', b'77.222.109.68')
(512585074, b'20170703123654', b'/* wbsetdescription-remove:1|bg */ \xd0\x9c\xd1\x8e\xd1\x81\xd1\x8e\xd0\xbb\xd0\xbc\xd0\xb0\xd0\xbd\xd1\x81\xd0\xba\xd0\xb8 \xd0\xbf\xd1\x80\xd0\xb0\xd0\xb7\xd0\xbd\xd0\xb8\xd0\xba', b'85.118.84.37')
(513078988, b'20170704123345', b'/* wbsetdescription-remove:1|pl */ Eukaryota', b'94.197.121.81')
(514473514, b'20170706124804', b'/* wbsetdescription-remove:1|ja */ \xe3\x82\xbf\xe3\x82\xaf\xe3\x83\x86\xe3\x82\xa3\xe3\x82\xab\xe3\x83\xab\xe3\x83\x9a\xe3\x83\xb3\xe3\x81\xae\xe8\xa3\xbd\xe9\x80\xa0\xe3\x82\x82\xe8\xa1\x8c\xe3\x81\xa3\xe3\x81\xa6\xe3\x81\x84\xe3\x82\x8b', b'211.125.107.166')
(514725332, b'20170706203540', b'/* wbsetdescription-remove:1|pt */ content://media/external/file/254655', b'177.89.197.91')
(514798741, b'20170706230725', b'/* wbsetdescription-remove:1|fa */ \xd8\xaa\xd8\xa7', b'5.121.123.42')
(514899882, b'20170707022435', b'/* wbsetdescription-remove:1|pt */ Este trabalho foi feito com Pietro MP e o jogo eletr\xc3\xb4nico FNAF foi criado por Scott Cawton', b'177.194.210.239')
(515041744, b'20170707075958', b'/* wbsetdescription-remove:1|hi */ \xe0\xa4\x9a\xe0\xa5\x80\xe0\xa4\xa8 \xe0\xa4\x9c\xe0\xa5\x88\xe0\xa4\xb8\xe0\xa5\x87 \xe0\xa4\x85\xe0\xa4\xb2\xe0\xa5\x8b\xe0\xa4\x95\xe0\xa4\xa4\xe0\xa4\xbe\xe0\xa4\x82\xe0\xa4\xa4\xe0\xa5\x8d\xe0\xa4\xb0\xe0\xa4\xbf\xe0\xa4\x95 \xe0\xa4\xa6\xe0\xa5\x87\xe0\xa4\xb6 \xe0\xa4\x95\xe0\xa4\xbe \xe0\xa4\xa6\xe0\xa5\x81\xe0\xa4\xa8\xe0\xa4\xbf\xe0\xa4\xaf\xe0\xa4\xbe \xe0\xa4\x95\xe0\xa5\x87 \xe0\xa4\xb8\xe0\xa4\xac\xe0\xa4\xb8\xe0\xa5\x87 \xe0\xa4\xac\xe0\xa4\xa1\xe0\xa4\xbc\xe0\xa5\x87 \xe0\xa4\xb2\xe0\xa5\x8b\xe0\xa4\x95\xe0\xa4\xa4\xe0\xa4\x82\xe0\xa4\xa4\xe0\xa5\x8d\xe0\xa4\xb0 \xe0\xa4\x95\xe0\xa5\x87 \xe0\xa4\x8a\xe0\xa4\xaa\xe0\xa4\xb0 \xe0\xa4\x95\xe0\xa4\xbf\xe0\xa4\xaf\xe0\xa4\xbe \xe0\xa4\x97\xe0\xa4\xaf\xe0\xa4\xbe \xe0\xa4\xb9\xe0\xa4\xae\xe0\xa4\xb2\xe0\xa4\xbe', b'Rkm iti')
(515152462, b'20170707125603', b'/* wbsetdescription-remove:1|ar */ p', b'109.74.195.83')
(515814108, b'20170708045033', b'/* wbsetdescription-remove:1|ja */ \xe3\x83\xad\xe3\x82\xb7\xe3\x82\xa2\xe7\x9a\x87\xe5\xb8\x9d', b'Maraoyazi')
(515819514, b'20170708050018', b'/* wbsetdescription-remove:1|ja */ \xe3\x83\xad\xe3\x83\x9e\xe3\x83\x8e\xe3\x83\x95\xe6\x9c\x9d\xe7\xac\xac13\xe4\xbb\xa3\xe3\x83\xad\xe3\x82\xb7\xe3\x82\xa2\xe7\x9a\x87\xe5\xb8\x9d', b'Maraoyazi')
(516913524, b'20170709193846', b'/* wbsetdescription-remove:1|fr */ 6', b'178.197.236.7')
(517236625, b'20170710132055', b'/* wbsetdescription-remove:1|zh */ \xe5\xbc\xb5\xe7\xbe\xbd\xe5\xb8\x8c', b'175.45.28.138')
(517490177, b'20170711060300', b'/* wbsetdescription-remove:1|ru */ \xd0\xb6\xd0\xb5\xd0\xbd\xd1\x89\xd0\xb8\xd0\xbd\xd0\xb0 \xd1\x81\xd0\xbe \xd1\x81\xd0\xb1\xd0\xbe\xd1\x81\xd0\xbe\xd0\xb1\xd0\xbd\xd0\xbe\xd1\x81\xd1\x82\xd1\x8f\xd0\xbc\xd0\xb8 \xd1\x81\xd0\xbf\xd0\xb0\xd1\x81\xd0\xb0\xd1\x82\xd1\x8c \xd0\xbc\xd0\xb8\xd1\x80', b'5.18.98.241')
(517881876, b'20170711222230', b'/* wbsetdescription-remove:1|ru */ \xd1\x88\xd1\x82', b'217.66.159.96')
(518124929, b'20170712110748', b'/* wbsetdescription-remove:1|pl */ Mucinska mafia', b'Kryty999')
(519202991, b'20170714060955', b'/* wbsetdescription-remove:1|hi */ \xe0\xa4\xb8\xe0\xa5\x87\xe0\xa4\xb5\xe0\xa4\xbe\xe0\xa4\xb2\xe0\xa4\xbe\xe0\xa4\xb2 \xe0\xa4\xac\xe0\xa4\xa1\xe0\xa4\x97\xe0\xa5\x81\xe0\xa4\x9c\xe0\xa4\xb0', b'Shubham bad')
(520821132, b'20170716073639', b'/* wbsetdescription-remove:1|zh */ \xe7\xb5\xb1\xe4\xba\x8c\xe5\x8c\x96\xe5\xb7\xa5', b'39.10.195.247')
(521474063, b'20170716191646', b'/* wbsetdescription-remove:1|ru */ \xd0\xb4\xd0\xb0\xd0\xb2\xd0\xb0\xd0\xb9 \xd0\xb1\xd1\x8b\xd1\x81\xd1\x82\xd1\x80\xd0\xb5\xd0\xb5 \xd0\xb5\xd0\xb3\xd0\xbe \xd0\xb2 \xd0\x9c\xd0\xb0\xd0\xbd\xd1\x87\xd0\xb5\xd1\x81\xd1\x82\xd0\xb5\xd1\x80 \xd0\xae\xd0\xbd\xd0\xb0\xd0\xb9\xd1\x82\xd0\xb5\xd0\xb4', b'BvBLA')
rev
(523392028,
 b'20170718234640',
 b'/* wbsetdescription-add:1|es */ localidad del departamento de Tacuaremb\xc3\xb3, Uruguay',
 b'Pepe piton')
rev[2].decode()
'/* wbsetdescription-add:1|es */ localidad del departamento de Tacuarembó, Uruguay'
lang = rev[2].decode()[26:28]
'es'
re.match(r'/\* wbsetdescription-([^\:]*)\:1\|([^\ ]*) ', rev[2].decode()).group(2)
'es'