import pandas as pd
import matplotlib.pyplot as plt
import datetime
import numpy as np
import pymysql
from sqlalchemy import create_engine
%matplotlib inline
import requests
# Load the published sample of labelled Wikidata items.
# The file holds one JSON record per line, hence lines=True.
wditems = pd.read_json(
    "/Users/sarasua/Downloads/wikidatawiki.labelings.5k.json",
    orient="records",
    lines=True,
    encoding="utf-8",
)
wditems.head()
auto_labeled autolabel claims item_quality page_len page_title rev_id strata timestamp
0 False {} {'P31': None} E 244 Q9339998 18870298 1024 2017-04-28 13:24:08.021190
1 False {} {'P31': None} E 485 Q27449075 390874717 1024 2017-05-04 11:01:37.170210
2 False {} {'P31': 'Q13417114'} E 891 Q1722071 321169678 1024 2017-04-17 15:18:24.166690
3 False {} {'P31': None} E 188 Q7924576 14031509 1024 2017-04-19 13:25:01.458270
4 False {} {'P31': None} E 338 Q25664377 355623343 1024 2017-04-15 20:16:25.217360
# Keep only the items whose quality class is 'A'.
aitems = wditems.loc[wditems["item_quality"].eq("A")]
aitems.head()
auto_labeled autolabel claims item_quality page_len page_title rev_id strata timestamp
3096 False {} {'P31': 'Q16521'} A 146159 Q11575 471056890 262144 2017-04-30 20:20:00.049620
3123 False {} {'P31': 'Q11424'} A 174149 Q110365 470323282 262144 2017-04-12 09:57:14.507070
3192 False {} {'P31': 'Q5'} A 148917 Q23505 468512153 262144 2017-04-24 14:35:08.470330
3204 False {} {'P31': 'Q6256'} A 215800 Q786 472965712 262144 2017-05-01 21:17:16.909150
3332 False {} {'P31': 'Q5119'} A 418973 Q1085 471997655 inf 2017-04-24 10:05:20.340270
# Number of 'A' items in the sample.
print(aitems.shape[0])
322
# Items with quality class 'E', kept as the comparison group.
eitems = wditems.loc[wditems["item_quality"].eq("E")]
eitems.head()
auto_labeled autolabel claims item_quality page_len page_title rev_id strata timestamp
0 False {} {'P31': None} E 244 Q9339998 18870298 1024 2017-04-28 13:24:08.021190
1 False {} {'P31': None} E 485 Q27449075 390874717 1024 2017-05-04 11:01:37.170210
2 False {} {'P31': 'Q13417114'} E 891 Q1722071 321169678 1024 2017-04-17 15:18:24.166690
3 False {} {'P31': None} E 188 Q7924576 14031509 1024 2017-04-19 13:25:01.458270
4 False {} {'P31': None} E 338 Q25664377 355623343 1024 2017-04-15 20:16:25.217360
# Number of 'E' items in the sample.
print(eitems.shape[0])
1470
# Compute some statistics
## Contributors & data content

Basic stats

import json
def getEditorsFromItem(row):
    """Count the anonymous and registered contributors of one Wikidata item.

    Calls the MediaWiki ``action=query&prop=contributors`` API and follows
    the ``continue`` tokens it returns. The API lists registered
    contributors in batches (10 per request unless ``pclimit`` is raised),
    so reading only the first response caps the registered count at 10.

    Args:
        row: One row of the items table (as passed by
            ``DataFrame.apply(..., axis=1)``). The item id, e.g. ``'Q11575'``,
            is read from the ``page_title`` label, falling back to
            position 5 when that label is not available.

    Returns:
        pandas.Series: ``[anoncontributors, registeredcontributors]``.
        An entry is ``None`` when the API did not report it; both are
        ``None`` when any request did not answer with HTTP 200.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If a response body is not valid JSON.
    """
    # Prefer the column label; integer lookups on a label-indexed Series
    # rely on a deprecated positional fallback.
    try:
        itemid = row["page_title"]
    except (KeyError, IndexError, TypeError):
        itemid = row[5]

    api_url = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'contributors',
        'titles': itemid,
        'pclimit': 'max',  # largest batch the API allows for this client
    }

    anoncontributors = None
    registeredcontributors = None

    # Response format (abridged):
    # {'continue': {'pccontinue': '13096|3582', 'continue': '||'},
    #  'query': {'pages': {'13096': {'pageid': 13096, 'ns': 0,
    #      'title': 'Q11575', 'anoncontributors': 17,
    #      'contributors': [{'userid': 24, 'name': 'Abián'}, ...]}}}}
    while True:
        response = requests.get(api_url, params=params, timeout=30)
        if response.status_code != 200:
            # A partial count would be misleading, so report nothing.
            anoncontributors = None
            registeredcontributors = None
            break

        payload = response.json()
        pages = payload.get('query', {}).get('pages', {})
        # A single title was requested, so only the first page is relevant.
        page = next(iter(pages.values()), {})

        if anoncontributors is None and 'anoncontributors' in page:
            anoncontributors = page['anoncontributors']
        if 'contributors' in page:
            # Registered users are listed individually with their userid.
            registeredcontributors = (
                (registeredcontributors or 0) + len(page['contributors'])
            )

        continuation = payload.get('continue')
        if not continuation:
            break
        # Send the continuation tokens back to fetch the next batch.
        params.update(continuation)

    return pd.Series([anoncontributors, registeredcontributors])
    
   
    
# One API lookup per 'A' item; each row becomes (anonymous, registered) counts.
editors = aitems.apply(getEditorsFromItem, axis="columns")
editors.columns = [
    "anoncontributors",
    "regcontributors",
]
editors.head()
anoncontributors regcontributors
3096 17.0 10.0
3123 18.0 10.0
3192 19.0 10.0
3204 41.0 10.0
3332 13.0 10.0
def getItemDescription(row):
    """Count labels, descriptions, claims and sitelinks of one Wikidata item.

    Downloads the item's JSON from ``Special:EntityData`` and measures the
    size of each top-level section of the entity.

    Args:
        row: One row of the items table (as passed by
            ``DataFrame.apply(..., axis=1)``). The item id, e.g. ``'Q42'``,
            is read from the ``page_title`` label, falling back to
            position 5 when that label is not available.

    Returns:
        pandas.Series: ``[labelscount, descriptionscount, claimscount,
        sitelinkscount]``. An entry is ``None`` when the section is absent;
        all are ``None`` when the request did not answer with HTTP 200 or
        no entity was returned.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # Prefer the column label; integer lookups on a label-indexed Series
    # rely on a deprecated positional fallback.
    try:
        itemid = row["page_title"]
    except (KeyError, IndexError, TypeError):
        itemid = row[5]

    # e.g. https://www.wikidata.org/wiki/Special:EntityData/Q42.json
    # The explicit .json suffix avoids depending on content negotiation.
    url = 'https://www.wikidata.org/wiki/Special:EntityData/' + itemid + '.json'
    response = requests.get(url, timeout=30)

    sections = ('labels', 'descriptions', 'claims', 'sitelinks')
    counts = [None] * len(sections)

    if response.status_code == 200:
        # Format: https://www.wikidata.org/wiki/Special:ApiSandbox#action=wbgetentities&format=json&ids=Q72
        entities = response.json().get('entities', {})
        entity = entities.get(itemid)
        if entity is None and entities:
            # The entity can come back under a different id than the one
            # requested (e.g. a redirected item); use the one returned.
            entity = next(iter(entities.values()))

        if entity is not None:
            # labels / descriptions are keyed by language, claims by
            # property id, sitelinks by site id; each count is the number
            # of keys in that section.
            counts = [
                len(entity[section]) if section in entity else None
                for section in sections
            ]

    return pd.Series(counts)
    
   
# One entity download per 'A' item; each row becomes the four section counts.
descriptions = aitems.apply(getItemDescription, axis="columns")
descriptions.columns = [
    "labelscount",
    "descriptionscount",
    "claimscount",
    "sitelinkscount",
]
descriptions.head()
labelscount descriptionscount claimscount sitelinkscount
3096 214 27 55 190
3123 43 24 87 38
3192 139 34 88 158
3204 203 30 105 205
3332 212 43 103 233
# Distribution of the number of labels per 'A' item.
descriptions.labelscount.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1172ea898>
# Compare contributor counts between the 'A' and the 'E' items.
# NOTE(review): the 'A' lookup repeats the requests already made for
# `editors`; consider reusing that frame to avoid a second download.
editors_a = aitems.apply(getEditorsFromItem, axis="columns")
editors_e = eitems.apply(getEditorsFromItem, axis="columns")

editors_a.columns = ["anoncontributors", "regcontributors"]
editors_e.columns = ["anoncontributors", "regcontributors"]

# Anonymous contributors per 'A' item.
editors_a.anoncontributors.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x116f420f0>
# Anonymous contributors per 'E' item.
editors_e.anoncontributors.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x1170732b0>
# Section counts for the 'A' items.
# NOTE(review): same call as the one that produced `descriptions`.
descriptions_a = aitems.apply(getItemDescription, axis="columns")
descriptions_a.columns = [
    "labelscount",
    "descriptionscount",
    "claimscount",
    "sitelinkscount",
]
# SPARQL via gastrodon to get the number of statements etc.
## classification
## description
## outgoing links