Visualization of Wikidata entity relationships

import numpy as np
import requests

import xml.etree.cElementTree as ET
import csv
import os
import os, json
from os import listdir
from os.path import isfile, join
import urllib
from SPARQLWrapper import SPARQLWrapper, JSON
import re
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
%matplotlib
try: 
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup
Using matplotlib backend: TkAgg
def queryGenerator(entityId,action):
    """Build the Wikidata API request URL for one id.

    The id may name any Wikidata record (item, property, ...). The base URL
    and the response format come from the module-level ``apiURL`` and
    ``resultFormat`` settings.

    entityId -- Wikidata id placed in the ``ids`` parameter
    action   -- API action name placed in the ``action`` parameter

    Returns the request URL as a string.
    """
    pieces = [apiURL, "action=", action, "&ids=", entityId, "&format=", resultFormat]
    return "".join(pieces)
def getQueryResult(entity,action):
    """Fetch one entity from the Wikidata API.

    entity -- Wikidata id to look up
    action -- API action name, passed through to queryGenerator

    Returns a tuple ``(payload, name)``: the decoded JSON response and the
    entity's English label. ``name`` is '' when the response carries no
    English label for the entity, the same fallback getName uses.
    """
    query = queryGenerator(entity,action)

    print("processing ",query)
    response = requests.get(query)
    # Decode the body once and reuse the result for both return values.
    payload = response.json()
    try:
        name = payload['entities'][entity]["labels"]["en"]["value"]
    except (KeyError, TypeError):
        # No English label, or the payload does not have the expected shape.
        name = ''
    return payload,name
    
def getName(entity,action):
    """Return the English label of a Wikidata id, caching it in ``myentities``.

    entity -- Wikidata id (item or property)
    action -- API action name, passed through to queryGenerator

    A cached label is returned without a request. Otherwise the API is
    queried; when the response holds no English label for the id, '' is
    returned. Either way the result is stored in ``myentities`` so the id is
    requested at most once.
    """
    if entity in myentities:
        return myentities[entity]

    query = queryGenerator(entity,action)
    data = requests.get(query)
    try:
        name = data.json()['entities'][entity]["labels"]["en"]["value"]
    except (KeyError, TypeError, ValueError):
        # KeyError/TypeError: label missing or payload shaped unexpectedly.
        # ValueError: presumably a body that is not valid JSON (see the
        # Requests documentation for Response.json).
        name = ''
    myentities[entity] = name
    return name
    
def getResult(entities,action):
    """Download and flatten every entity in ``entities``.

    Each id is fetched with getQueryResult and its JSON is folded into one
    shared dictionary by processJSON.

    entities -- iterable of Wikidata ids
    action   -- API action name, passed through to getQueryResult

    Returns the dictionary filled in by processJSON, keyed by entity id.
    """
    collected = {}
    for entityId in entities:
        # the label returned alongside the payload is not needed here
        payload, _ = getQueryResult(entityId,action)
        processJSON(payload,entityId,collected)
    return collected
def getSnakValues(value):
    """Extract the plain value carried by a Wikidata snak.

    value -- snak dictionary (a statement's "mainsnak" or a qualifier entry)

    Returns a tuple ``(r, rname)``.

    ``r`` depends on the snak's datatype:
      * wikibase-item    -> the item id
      * monolingualtext  -> the text
      * commonsMedia     -> URL of the file page on Wikimedia Commons
      * globe-coordinate -> ``(latitude, longitude)`` tuple
      * time, quantity   -> '' (deliberately not handled)
      * anything else    -> the raw ``datavalue`` value
    ``r`` is also '' for snaks whose snaktype is not "value" and for snaks
    that lack the expected keys (a message is printed in the latter case).

    ``rname`` is the name getName returns for a wikibase-item, looked up with
    the module-level ``action``; for every other datatype it is ''.
    """
    r = ""
    rname = ""
    # we do not care about novalue/somevalue, we only care about value
    if value["snaktype"] != "value":
        return (r,rname)

    try:
        datatype = value["datatype"]
        # time and quantity are skipped before touching "datavalue", so they
        # yield '' even when the snak carries no datavalue
        if datatype not in ("time", "quantity"):
            content = value["datavalue"]["value"]
            if datatype == "wikibase-item":
                r = content["id"]
                #get the name of the item
                rname = getName(r,action)
            elif datatype == "monolingualtext":
                r = content["text"]
            elif datatype == "commonsMedia":
                r = 'https://commons.wikimedia.org/wiki/File:'+content+'#file'
            elif datatype == "globe-coordinate":
                r = (content["latitude"],content["longitude"])
            else:
                # string, external-id and any datatype without special handling
                r = content
    except (KeyError, TypeError):
        # .get() because a missing "datavalue" is one of the ways to end up
        # here; indexing it again would raise from inside the handler.
        print('something wrong with value["datatype"]: ',value.get("datavalue"))

    return (r,rname)
def getLabels(v):
    """Return the English label from a ``labels`` section.

    v -- mapping of language code to an entry holding the label under "value"

    Returns the English label, or 0 when ``v`` has no usable English entry.
    """
    try:
        return v['en']['value']
    except (KeyError, IndexError, TypeError):
        # no 'en' entry, or v / the entry is not the expected mapping
        return 0

def getClaims(v):
    """Flatten a ``claims`` section into a list of (key, value) pairs.

    v -- mapping of property id to that property's list of statements

    For every statement the result contains, in this order:
      * one ``("property:mainValue:qualifierProperty", value)`` pair per
        qualifier snak that yields a non-empty value, then
      * one ``(property, value)`` pair when the main value is non-empty.
    ``value`` is the name getSnakValues reports for the snak when it has one,
    otherwise the raw value.

    The property and qualifier ids are passed through getName with the
    module-level ``action`` so their names end up in the ``myentities`` cache.
    """
    c = []
    for propKey,statements in v.items():
        # Called only for its caching side effect; the name itself is unused.
        getName(propKey,action)
        #for each claim statement
        for statement in statements:
            #claim values
            (r,rname) = getSnakValues(statement["mainsnak"])
            #claim qualifiers
            ##also do references------------>
            for qualProp,qualSnaks in statement.get("qualifiers", {}).items():
                # str() keeps the key buildable when the main value is not a
                # string (getSnakValues returns a tuple for coordinates).
                qkey = propKey+":"+str(r)+":"+qualProp
                for snak in qualSnaks:
                    # again only to cache the qualifier property's name
                    getName(snak["property"],action)
                    qval,qname = getSnakValues(snak)
                    qreturnname = qname if len(qname)!=0 else qval
                    if len(qreturnname)!=0:
                        c.append((qkey,qreturnname))
            returnname = rname if len(rname)!=0 else r
            if len(r)!=0:
                c.append((propKey,returnname))
    return c

def getDescriptions(v):
    """Return the English description from a ``descriptions`` section.

    v -- mapping of language code to an entry holding the text under "value"

    Returns the English description, or 0 when there is none. 0 is the same
    "missing" marker getLabels returns, so an entity without an English
    description no longer aborts processing.
    """
    try:
        return v['en']['value']
    except (KeyError, IndexError, TypeError):
        return 0

def getAliases(v):
    """Return the English aliases from an ``aliases`` section.

    v -- mapping of language code to that language's aliases

    In the Wikibase JSON format each language maps to a *list* of alias
    entries, so the English aliases are returned as a list of strings. A
    single mapping with a "value" key is still accepted and returns that
    value. Returns 0 when there are no English aliases.
    """
    try:
        english = v['en']
    except (KeyError, IndexError, TypeError):
        return 0

    if isinstance(english,list):
        names = [entry['value'] for entry in english
                 if isinstance(entry,dict) and 'value' in entry]
        return names if names else 0

    try:
        return english['value']
    except (KeyError, IndexError, TypeError):
        return 0

def getSitelinks(v):
    """Hand the ``sitelinks`` section back unchanged.

    Sitelinks get no special processing for now, so the very object that was
    passed in is returned.
    """
    sitelinks = v
    return sitelinks
One-layer-deep relationships
def processJSON(f,entity,mydict):
    """Flatten one entity of an API response into ``mydict[entity]``.

    f      -- decoded API response; the entity is read from f['entities']
    entity -- Wikidata id of the entity to flatten
    mydict -- dictionary to fill; any previous entry for ``entity`` is replaced

    Section handling:
      * labels       -> English label via getLabels, also cached in ``myentities``
      * claims       -> one entry per key returned by getClaims; a key that
                        occurs with several distinct values holds a list
      * descriptions -> getDescriptions
      * aliases      -> getAliases
      * sitelinks    -> skipped
      * anything else is copied as-is

    Returns ``mydict``.
    """
    record = {}
    mydict[entity] = record
    for section,content in f['entities'][entity].items():
        if section == 'labels':
            label = getLabels(content)
            record[section] = label
            myentities[entity] = label
        elif section == 'claims':
            for prop,val in getClaims(content):
                if prop not in record:
                    # first value for this key is stored bare
                    record[prop] = val
                    continue
                existing = record[prop]
                if not isinstance(existing,list):
                    # second value seen: promote the stored value to a list
                    existing = [existing]
                    record[prop] = existing
                if val not in existing:
                    existing.append(val)
        elif section == 'descriptions':
            record[section] = getDescriptions(content)
        elif section == 'aliases':
            record[section] = getAliases(content)
        elif section == 'sitelinks':
            # sitelinks are not handled for now
            pass
        else:
            record[section] = content

    return mydict
def unhash(values):
    """Flatten ``values`` by one level.

    Elements that are lists contribute their items; every other element
    (tuples included) is kept as a single item. Order is preserved.

    Returns a new list.
    """
    flattened = []
    for item in values:
        if isinstance(item,list):
            flattened.extend(item)
        else:
            flattened.append(item)
    return flattened
def intersection(l1,l2):
    """Return the distinct items present in both ``l1`` and ``l2``.

    The result is a list with no guaranteed order; all items must be hashable.
    """
    shared = set(l1).intersection(set(l2))
    return list(shared)

def union(l1,l2):
    """Return the distinct items present in ``l1``, ``l2`` or both.

    The result is a list with no guaranteed order; all items must be hashable.
    """
    combined = set(l1).union(set(l2))
    return list(combined)
    
def retreiveRelationship(data):
    """Find the values the first two entities have in common.

    data -- flattened entity records keyed by entity id

    The shared values are computed from the records of ``entities[0]`` and
    ``entities[1]`` (module-level list); falsy values are discarded. Every
    entity in ``entities`` is then scanned for those values.

    Returns a dict mapping each shared value to a list of ``(key, entityId)``
    tuples naming the record keys under which the value was found.
    """
    #entities[0], entities[1] relationship
    firstValues = unhash(data[entities[0]].values())
    secondValues = unhash(data[entities[1]].values())

    # keep only truthy shared values (drops '' and 0 placeholders)
    common = [item for item in intersection(firstValues,secondValues) if item]

    relations = {}
    #Add the common items and their relationship in to our relations dictionary
    for entity in entities:
        for prop,value in data[entity].items():
            # treat a single value like a one-element list
            candidates = value if isinstance(value,list) else [value]
            for candidate in candidates:
                if candidate in common:
                    relations.setdefault(candidate,[]).append((prop,entity))
    #handle the relationship in case of qualifiers
    #if something is a qualifier, add the prefix claim before that
    print("found relations")
    return relations
def generateGraphRelations(relations):
    """Turn the relations dictionary into parallel edge lists for plotting.

    relations -- dict mapping a shared value to a list of
                 ``(relation, entityId)`` tuples

    A plain relation yields one edge:
        entity --relation--> shared value
    A qualifier relation, written "property:mainValue:qualifierProperty",
    yields two edges:
        entity --property--> mainValue --qualifierProperty--> shared value

    Ids are replaced by their names from ``myentities`` when cached; anything
    that is not cached (including main values that are plain strings) is used
    unchanged.

    Returns ``(fromList, toList, relationList)``; position i of the three
    lists describes edge i.
    """
    def label(identifier):
        # one lookup with a fallback, so an uncached id cannot raise KeyError
        return myentities.get(identifier,identifier)

    fromList = []
    toList = []
    relationList = []
    for target,sources in relations.items():
        for relation,entity in sources:
            #qualifiers
            if ":" in relation:
                # The property ids at both ends never contain ':', but the
                # main value in the middle may (e.g. a URL), so peel the ends
                # off instead of splitting on every colon.
                prop,_,rest = relation.strip().partition(":")
                propvalue,_,qualifier = rest.rpartition(":")

                fromList.append(label(entity))
                relationList.append(label(prop))
                toList.append(label(propvalue))

                fromList.append(label(propvalue))
                relationList.append(label(qualifier))
                toList.append(target)
            #not a qualifier
            else:
                relationList.append(label(relation))
                fromList.append(label(entity))
                toList.append(target)
    return fromList,toList,relationList

Execute the cell below to generate relationships

# Global settings and state read by the functions above.
# myentities caches id -> name pairs (filled by getName and processJSON).
myentities = {}
entities = ["Q72","Q672290"] # Wikidata ids of the two entities to compare
apiURL = "https://www.wikidata.org/w/api.php?"
# action is also passed explicitly through the function calls
action = "wbgetentities"
resultFormat = "json"
# Download both entities and flatten them into one record per id.
data = getResult(entities,action)
#values = {}

# Find the values the entities share and where each one occurs.
relations = retreiveRelationship(data)

# Convert the relations into graph-friendly parallel edge lists.
fromList,toList,relationList = generateGraphRelations(relations)


# Node set for visualization: every name that appears at either end of an edge.
nodes = union(fromList,toList)
processing  https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q72&format=json
processing  https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q672290&format=json
['item', 'Canton of Zürich', 'Katzensee', 'Switzerland']
found relations
item [('type', 'Q72'), ('type', 'Q672290')]
Switzerland [('P31:Q51929311:P642', 'Q72'), ('P17', 'Q72'), ('P17', 'Q672290'), ('P205', 'Q672290')]
Canton of Zürich [('P131', 'Q72'), ('P1376', 'Q72'), ('P131', 'Q672290')]
Katzensee [('P206', 'Q72'), ('labels', 'Q672290'), ('P373', 'Q672290')]
# Debug dump of the relations, the three edge lists and the node list.
# The statements sit inside a string literal, so Python does not execute
# them; remove the surrounding quotes to enable the dump.
'''
print("------------")
for k,v in relations.items():
        print(k,v)
print("------------")
print(fromList)
print(toList)
print(relationList)
print(nodes)
'''
------------
item [('type', 'Q72'), ('type', 'Q672290')]
Switzerland [('P31:Q51929311:P642', 'Q72'), ('P17', 'Q72'), ('P17', 'Q672290'), ('P205', 'Q672290')]
Canton of Zürich [('P131', 'Q72'), ('P1376', 'Q72'), ('P131', 'Q672290')]
Katzensee [('P206', 'Q72'), ('labels', 'Q672290'), ('P373', 'Q672290')]
------------
['Zürich', 'Katzensee', 'Zürich', 'largest city', 'Zürich', 'Katzensee', 'Katzensee', 'Zürich', 'Zürich', 'Katzensee', 'Zürich', 'Katzensee', 'Katzensee']
['item', 'item', 'largest city', 'Switzerland', 'Switzerland', 'Switzerland', 'Switzerland', 'Canton of Zürich', 'Canton of Zürich', 'Canton of Zürich', 'Katzensee', 'Katzensee', 'Katzensee']
['type', 'type', 'instance of', 'of', 'country', 'country', 'basin country', 'located in the administrative territorial entity', 'capital of', 'located in the administrative territorial entity', 'located in or next to body of water', 'labels', 'Commons category']
['item', 'Zürich', 'Canton of Zürich', 'Katzensee', 'Switzerland', 'largest city']
# Default values that can be used instead of querying Wikidata.
# The string literal below only records a relations listing for reference;
# it is not executed.
'''
-----------
item [('type', 'Q72'), ('type', 'Q672290')]
Switzerland [('P31:Q51929311:P642', 'Q72'), ('P17', 'Q72'), ('P17', 'Q672290'), ('P205', 'Q672290')]
Canton of Zürich [('P131', 'Q72'), ('P1376', 'Q72'), ('P131', 'Q672290')]
Katzensee [('P206', 'Q72'), ('labels', 'Q672290'), ('P373', 'Q672290')]
------------
'''
# NOTE: these assignments rebind fromList, toList and relationList, replacing
# whatever generateGraphRelations produced earlier. Position i of the three
# lists describes edge i. graphnodes is the node list read by
# visualizeRelationship.
fromList = ['Zürich', 'Katzensee', 'Zürich', 'largest city', 'Zürich', 'Katzensee', 'Katzensee', 'Zürich', 'Zürich', 'Katzensee', 'Zürich', 'Katzensee', 'Katzensee']
toList = ['item', 'item', 'largest city', 'Switzerland', 'Switzerland', 'Switzerland', 'Switzerland', 'Canton of Zürich', 'Canton of Zürich', 'Canton of Zürich', 'Katzensee', 'Katzensee', 'Katzensee']
relationList = ['type', 'type', 'instance of', 'of', 'country', 'country', 'basin country', 'located in the administrative territorial entity', 'capital of', 'located in the administrative territorial entity', 'located in or next to body of water', 'labels', 'Commons category']
graphnodes = ['item', 'Zürich', 'Canton of Zürich', 'Katzensee', 'Switzerland', 'largest city']

Relationship visualization

def visualizeRelationship():
    """Draw the relationship graph, save it as relationshipGraph.png and show it.

    Reads the module-level ``graphnodes`` (node names) and the parallel lists
    ``fromList``, ``toList`` and ``relationList`` (position i describes edge
    i). Edges whose two ends are the same node are skipped. The graph is
    undirected, so several relations between the same two nodes share one
    edge; their names are stacked in that edge's label rather than only the
    last one being kept.
    """
    import networkx as nx
    import matplotlib.pyplot as plt

    plt.figure(figsize=(15,15))
    plt.subplot(111)

    # Step 1: Build up a graph
    G = nx.Graph()
    for n in graphnodes:
        G.add_node(n, alias=n)

    for src,dst,rel in zip(fromList,toList,relationList):
        #ignore self-loop nodes(not interesting)
        if src == dst:
            continue
        if not G.has_edge(src,dst):
            G.add_edge(src, dst, relationship=[])
        # collect every distinct relation name seen for this pair of nodes
        known = G[src][dst]['relationship']
        if rel not in known:
            known.append(rel)

    # Step 2: Draw the graph and suppress node labels (node id)
    pos = nx.spring_layout(G)
    # node size grows with the length of the node name;
    # linewidths (plural) is the keyword for the node border width
    nx.draw_networkx(G, pos, with_labels=False, node_color='b',
                     node_size=[len(v) * 500 for v in G.nodes()],
                     linewidths=0)

    # Step 3: Draw the graph with the specific node labels
    node_labels = nx.get_node_attributes(G, 'alias') # a dict of attributes keyed by node
    nx.draw_networkx_labels(G, pos, node_labels)

    nx.draw_networkx_edges(G, pos, alpha = 0.5, arrows=True)

    # one label per edge, listing all of its relation names
    edge_labels = {
        edge: "\n".join(str(name) for name in names)
        for edge,names in nx.get_edge_attributes(G, 'relationship').items()
    }
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)

    plt.savefig("relationshipGraph.png", format="PNG")
    plt.show()
visualizeRelationship()
import numpy as np
import pandas as pd
import holoviews as hv
import networkx as nx

hv.extension('bokeh')
%%opts Graph [tools=['hover']]

%opts Graph [width=400 height=400]

# Declare abstract edges
# NOTE(review): N and node_indices are assigned here but not read anywhere in
# the visible code — confirm whether a later cell needs them.
N = 8
node_indices = nodes
# edge i runs from source[i] to target[i]
source = fromList
target = toList

# axis ranges applied to the graph through redim.range below
padding = dict(x=(-1.2, 1.2), y=(-1.2, 1.2))
# black first, followed by the values of the 'Category40' colour cycle
colors = ['#000000']+hv.Cycle('Category40').values
plot_opts = dict(color_index='one', width=800, height=800, xaxis=None, yaxis=None, show_frame=False)
style_opts = dict(node_size=20, edge_line_width=1, cmap=colors)
# Graph built from the edge lists only; no explicit node data is passed.
simple_graph = hv.Graph(((source, target),)).redim.range(**padding).opts(style=style_opts,plot=plot_opts)
# bare expression — presumably so the notebook renders the graph as cell output
simple_graph