COVID-19 All Related articles (and tagged relation)

In this notebook we focus on maximum recall on articles related with COVID.

First we take all Wikidata Articles that links to a main COVID-19 pages COVID-19 (Q84263196) and the 2019–20 COVID-19 pandemic (Q81068910), and then we join both sets and do a final pass to get relationships between the seeds and all the resulting items.

In [23]:
#install dependencies 
!pip install SPARQLWrapper
!pip install -U pandas
Requirement already satisfied: SPARQLWrapper in /srv/paws/lib/python3.6/site-packages
Requirement already satisfied: rdflib>=4.0 in /srv/paws/lib/python3.6/site-packages (from SPARQLWrapper)
Requirement already satisfied: isodate in /srv/paws/lib/python3.6/site-packages (from rdflib>=4.0->SPARQLWrapper)
Requirement already satisfied: pyparsing in /srv/paws/lib/python3.6/site-packages (from rdflib>=4.0->SPARQLWrapper)
Requirement already satisfied: six in /srv/paws/lib/python3.6/site-packages (from isodate->rdflib>=4.0->SPARQLWrapper)
Requirement already up-to-date: pandas in /srv/paws/lib/python3.6/site-packages
Requirement already up-to-date: numpy>=1.13.3 in /srv/paws/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: python-dateutil>=2.6.1 in /srv/paws/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: pytz>=2017.2 in /srv/paws/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: six>=1.5 in /srv/paws/lib/python3.6/site-packages (from python-dateutil>=2.6.1->pandas)
In [24]:
import pandas as pd
now  = pd.Timestamp.now()
In [25]:
#getting seed based on what links to  coronavirus disease 2019 (Q84263196)  in Wikidata
import requests

whatLinks = []

# COVID -19
url = 'https://www.wikidata.org/w/api.php?action=query&format=json&list=backlinks&bltitle=Q84263196&bllimit=500&blnamespace=0'
response = requests.get(url=url).json()
whatLinks.extend(response['query']['backlinks'])

while 'continue' in response:
    url = url + '&blcontinue='+ response['continue']['blcontinue']
    response = requests.get(url=url).json()
    whatLinks.extend(response['query']['backlinks'])

QswhatLinks = [v['title'] for v in whatLinks]
QswhatLinks = set(QswhatLinks)
In [26]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
#https://w.wiki/KvX (Thanks User:Dipsacus_fullonum)
# All statements with item, property, value and rank with COVID-19 (Q84263196) as value for qualifier.

sparql.setQuery("""
SELECT ?item ?itemLabel ?property ?propertyLabel ?value ?valueLabel ?rank ?qualifier ?qualifierLabel
WHERE
{
  ?item ?claim ?statement.
  ?property wikibase:claim ?claim.
  ?property wikibase:statementProperty ?sprop.
  ?statement ?sprop ?value.
  ?statement wikibase:rank ?rank. 
  ?statement ?qprop wd:Q84263196. # COVID-19

  
  ?qualifier wikibase:qualifier ?qprop.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

allStatements = pd.io.json.json_normalize(results['results']['bindings'])
/srv/paws/lib/python3.6/site-packages/ipykernel_launcher.py:27: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead
In [27]:
allStatements['valueLabel.value'].value_counts()
Out[27]:
disease outbreak                 437
human                              7
treatment                          2
epidemiological surveillance       1
vaccine                            1
drug repositioning                 1
mascot character                   1
diagnostic test                    1
pandemic                           1
hierarchy of hazard controls       1
pneumonia                          1
moe anthropomorphic character      1
drug development                   1
2020-03-05T00:00:00Z               1
medical diagnosis                  1
Name: valueLabel.value, dtype: int64
In [28]:
# All truthy statements with COVID-19 (Q84263196) as value.
#https://w.wiki/KvZ (Thanks User:Dipsacus_fullonum)

sparql.setQuery("""
SELECT ?item ?itemLabel ?property ?propertyLabel
WHERE
{
  ?item ?claim wd:Q84263196.
  ?property wikibase:directClaim ?claim.
   SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

truthy = pd.io.json.json_normalize(results['results']['bindings'])
/srv/paws/lib/python3.6/site-packages/ipykernel_launcher.py:15: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead
  from ipykernel import kernelapp as app
In [7]:
truthyQ = [ link.split('/')[-1] for link in truthy['item.value'].tolist()]
allStatementsQ = [ link.split('/')[-1] for link in allStatements['item.value'].tolist()]
allSPARQL = set(truthyQ).union(set(allStatementsQ))
In [8]:
allSPARQL - QswhatLinks
Out[8]:
{'L253474-S1'}
In [9]:
QswhatLinks - allSPARQL 
Out[9]:
{'Q10304982', 'Q66777139', 'Q84420257', 'Q85110277', 'Q88019029', 'Q88870103'}
In [10]:
Q84263196AllItems  = allSPARQL.union(QswhatLinks)

2019–20 COVID-19 pandemic (Q81068910)

In [11]:
# 2019–20 COVID-19 pandemic 

url = 'https://www.wikidata.org/w/api.php?action=query&format=json&list=backlinks&bltitle=Q81068910&bllimit=500&blnamespace=0'
response = requests.get(url=url).json()
whatLinks.extend(response['query']['backlinks'])

while 'continue' in response:
    url = url + '&blcontinue='+ response['continue']['blcontinue']
    response = requests.get(url=url).json()
    whatLinks.extend(response['query']['backlinks'])

QswhatLinks2 = [v['title'] for v in whatLinks]
QswhatLinks2 = set(QswhatLinks)
In [12]:
#All truthy statements with 2019–20 COVID-19 pandemic (Q81068910) as value.
#https://w.wiki/Kvd (Thanks User:Dipsacus_fullonum)

sparql.setQuery("""
# 
SELECT ?item ?itemLabel ?property ?propertyLabel WHERE {
  ?item ?claim wd:Q81068910. #2019–20 COVID-19 pandemic
  ?property wikibase:directClaim ?claim.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

Q81068910 = pd.io.json.json_normalize(results['results']['bindings'])
/srv/paws/lib/python3.6/site-packages/ipykernel_launcher.py:17: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead
In [13]:
Q81068910SPARQL = [ link.split('/')[-1] for link in Q81068910['item.value'].tolist()]
Q81068910SPARQL = set(Q81068910SPARQL)
In [14]:
len(QswhatLinks2 - Q81068910SPARQL)
Out[14]:
1111
In [15]:
len(Q81068910SPARQL - QswhatLinks2)
Out[15]:
510
In [16]:
Q81068910All = Q81068910SPARQL.union(QswhatLinks2)
In [17]:
# Joining the full set from both seeds
Qs = Q81068910All.union(Q84263196AllItems)
In [18]:
## add seeds (by definition they were out)
Qs = Q81068910All.union({'Q81068910','Q84263196'})
In [19]:
len(Qs)
Out[19]:
2771
In [20]:
# Adding a third seed  SARS-CoV-2 (Q82069695)
url = 'https://www.wikidata.org/w/api.php?action=query&format=json&list=backlinks&bltitle=Q82069695&bllimit=500&blnamespace=0'
response = requests.get(url=url).json()
whatLinks.extend(response['query']['backlinks'])

while 'continue' in response:
    url = url + '&blcontinue='+ response['continue']['blcontinue']
    response = requests.get(url=url).json()
    whatLinks.extend(response['query']['backlinks'])

QswhatLinks3 = [v['title'] for v in whatLinks]
QswhatLinks3 = set(QswhatLinks)
In [21]:
Qs = Qs.union(QswhatLinks3) 
In [22]:
len(Qs)
Out[22]:
2771

Getting articles and relations

In [33]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
        
wikidata_query_base = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=aliases|claims|datatype|descriptions|info|labels|sitelinks|sitelinks/urls&ids=' 
itemsInfo = {}
c = 0
for items in chunks(list(Qs),50):
    c +=50
    if c%200 ==0: print(c,'items reviewed')
    url = wikidata_query_base  + '|'.join(items)
    itemsInfo.update(requests.get(url=url).json()['entities'])
200 items reviewed
400 items reviewed
600 items reviewed
800 items reviewed
1000 items reviewed
1200 items reviewed
1400 items reviewed
1600 items reviewed
1800 items reviewed
2000 items reviewed
2200 items reviewed
2400 items reviewed
2600 items reviewed
In [36]:
def getRelationships(claims,targetQs):
    '''
    This function receives a list of claims from a Wikidata Item, and a list of target Qs
    Iterating over the claims, looking for the target Qs and returning the pair Property and target Q
    For example, if it find relationship Part of (P31) of Q12323 (that is the target list)
    will return [(P31,Q3)]
    inputs:
    claims: object, result from wikidata queries like 
            'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&ids=Q5' 
    targetQs: list of str, where str are Q values 
    output:
        return a list of pairs (prop,target)
    '''
    pairs = []
    for prop, relationships in claims.items():
        for relationship in relationships:
            if 'mainsnak' in relationship:
                datatype = relationship['mainsnak'].get('datatype','')
                if datatype=='wikibase-item':
                    try: #found some cases without  id even for a wikibase-item datatype
                        Qfound = relationship['mainsnak']['datavalue']['value'].get('id','')
                        if Qfound in targetQs:
                            pairs.append([prop,targetQs[targetQs.index(Qfound)]])
                    except:
                        pass
    if not pairs:
        pairs.append(['unknown','unknown'])
    return pairs

def getValueIfWikidataItem(claim):
    '''
    this function return a list of values for a given claim, if those values point to a wikidata item
    datatype=='wikibase-item'
    input:
    claim: object
    output:
    wikidataItems: list of str
      '''
    output = []
    for relationship in claim:
        if 'mainsnak' in relationship:
            datatype = relationship['mainsnak'].get('datatype','')
            if datatype=='wikibase-item':
                Qfound = relationship['mainsnak']['datavalue']['value'].get('id','')
                output.append(Qfound)
    if not output:
        output.append('unknown')
    return output
    
In [74]:
pagesPerProject = {}
pagesPerProjectTable = {}
itemsInfoTable = {}
labelsEn = {}
for item,v in itemsInfo.items():
    itemsInfoTable[item] = {}
    try:
        itemsInfoTable[item]['item Label'] = v['labels']['en']['value']
    except:
        itemsInfoTable[item]['item Label'] = 'unknown '
    #checking if there are claims for that Q, if not claims we return an empty dict, to avoid errors
    claims = v.get('claims',{})
    if 'P31' in  claims: #getting part of to classify the item        
        itemsInfoTable[item]['Instace Of'] = getValueIfWikidataItem(claims.get('P31'))
    else:
        itemsInfoTable[item]['Instace Of'] = ['unknown']
    #find COVID-19 / COVID-19 pandemics relationships
    itemsInfoTable[item]['RelationTuple'] = getRelationships(claims,['Q81068910','Q84263196'])

    if 'sitelinks' in v:
        for wiki,data in v['sitelinks'].items():
            page = data['title']
            project ='%s.%s' %  (data['url'][8:].split('.')[0],data['url'][8:].split('.')[1]) #could be more elegant with regex           
            pagesPerProject[project] = pagesPerProject.get(project,[])
            pagesPerProject[project].append(page)
            article_link  = data['url']
            if project.split('.')[1] == 'wikipedia' or  project.split('.')[0] == 'commons': #iwlinks : https://meta.wikimedia.org/wiki/Help:Interwiki_linking
                projectcode = project.split('.')[0]
            else:
                projectcode = '%s:%s ' % (project.split('.')[1],project.split('.')[0])
            wikilink = '[[%s:%s|%s]]' % (projectcode,page,page)
            pagesPerProjectTable[article_link] = {'project':project,'page':page,'wikidataItem':item,'wikilink':wikilink}
            
            
itemsInfoTable = pd.DataFrame.from_dict(itemsInfoTable,orient='index')
In [75]:
pagesPerProjectTable = pd.DataFrame.from_dict(pagesPerProjectTable,orient='index')
In [76]:
#FINALVERSION Of pagesPerProjectTable

pagesPerProjectTable['url'] = pagesPerProjectTable.index
pagesPerProjectTable
Out[76]:
project page wikidataItem wikilink url
https://commons.wikimedia.org/wiki/Category:Thiago_Seyboth_Wild commons.wikimedia Category:Thiago Seyboth Wild Q50198692 [[commons:Category:Thiago Seyboth Wild|Categor... https://commons.wikimedia.org/wiki/Category:Th...
https://de.wikipedia.org/wiki/Thiago_Seyboth_Wild de.wikipedia Thiago Seyboth Wild Q50198692 [[de:Thiago Seyboth Wild|Thiago Seyboth Wild]] https://de.wikipedia.org/wiki/Thiago_Seyboth_Wild
https://en.wikipedia.org/wiki/Thiago_Seyboth_Wild en.wikipedia Thiago Seyboth Wild Q50198692 [[en:Thiago Seyboth Wild|Thiago Seyboth Wild]] https://en.wikipedia.org/wiki/Thiago_Seyboth_Wild
https://es.wikipedia.org/wiki/Thiago_Seyboth_Wild es.wikipedia Thiago Seyboth Wild Q50198692 [[es:Thiago Seyboth Wild|Thiago Seyboth Wild]] https://es.wikipedia.org/wiki/Thiago_Seyboth_Wild
https://fr.wikipedia.org/wiki/Thiago_Seyboth_Wild fr.wikipedia Thiago Seyboth Wild Q50198692 [[fr:Thiago Seyboth Wild|Thiago Seyboth Wild]] https://fr.wikipedia.org/wiki/Thiago_Seyboth_Wild
... ... ... ... ... ...
https://pt.wikipedia.org/wiki/Pandemia_de_COVID-19_no_Uzbequist%C3%A3o pt.wikipedia Pandemia de COVID-19 no Uzbequistão Q87755912 [[pt:Pandemia de COVID-19 no Uzbequistão|Pande... https://pt.wikipedia.org/wiki/Pandemia_de_COVI...
https://ta.wikipedia.org/wiki/2020_%E0%AE%89%E0%AE%9A%E0%AF%81%E0%AE%AA%E0%AF%86%E0%AE%95%E0%AF%8D%E0%AE%95%E0%AE%BF%E0%AE%9A%E0%AF%81%E0%AE%A4%E0%AE%BE%E0%AE%A9%E0%AE%BF%E0%AE%B2%E0%AF%8D_%E0%AE%95%E0%AF%8A%E0%AE%B0%E0%AF%8B%E0%AE%A9%E0%AE%BE%E0%AE%B5%E0%AF%88%E0%AE%B0%E0%AE%9A%E0%AF%81%E0%AE%A4%E0%AF%8D_%E0%AE%A4%E0%AF%8A%E0%AE%B1%E0%AF%8D%E0%AE%B1%E0%AF%81 ta.wikipedia 2020 உசுபெக்கிசுதானில் கொரோனாவைரசுத் தொற்று Q87755912 [[ta:2020 உசுபெக்கிசுதானில் கொரோனாவைரசுத் தொற்... https://ta.wikipedia.org/wiki/2020_%E0%AE%89%E...
https://tr.wikipedia.org/wiki/%C3%96zbekistan%27da_2020_koronavir%C3%BCs_pandemisi tr.wikipedia Özbekistan'da 2020 koronavirüs pandemisi Q87755912 [[tr:Özbekistan'da 2020 koronavirüs pandemisi|... https://tr.wikipedia.org/wiki/%C3%96zbekistan%...
https://uz.wikipedia.org/wiki/O%CA%BBzbekistonda_COVID-19_pandemiyasi uz.wikipedia Oʻzbekistonda COVID-19 pandemiyasi Q87755912 [[uz:Oʻzbekistonda COVID-19 pandemiyasi|Oʻzbek... https://uz.wikipedia.org/wiki/O%CA%BBzbekiston...
https://vi.wikipedia.org/wiki/%C4%90%E1%BA%A1i_d%E1%BB%8Bch_COVID-19_t%E1%BA%A1i_Uzbekistan vi.wikipedia Đại dịch COVID-19 tại Uzbekistan Q87755912 [[vi:Đại dịch COVID-19 tại Uzbekistan|Đại dịch... https://vi.wikipedia.org/wiki/%C4%90%E1%BA%A1i...

10538 rows × 5 columns

In [77]:
itemsInfoTable = itemsInfoTable.explode('Instace Of').explode('RelationTuple')
In [78]:
itemsInfoTable['connector'] = itemsInfoTable['RelationTuple'].apply(lambda x:x[0])
itemsInfoTable['connected To'] = itemsInfoTable['RelationTuple'].apply(lambda x:x[1])
itemsInfoTable.drop('RelationTuple',inplace=True,axis=1)
In [79]:
connectedToLabel = {'Q84263196':'COVID-19', 'Q81068910':'2019–20 COVID-19 pandemic'} 
itemsInfoTable['connected To Label'] = itemsInfoTable['connected To'].apply(lambda x:connectedToLabel.get(x))
In [80]:
## Getting labels for connector (properties)
Ps = list(itemsInfoTable['connector'].unique())
props = []
for P in Ps:
    props.append(requests.get('https://www.wikidata.org/w/api.php?action=wbgetentities&ids=%s&format=json' % P).json())
In [81]:
propLabels ={}
for P in props:
    if 'entities' in P:
        for Pid,data in P['entities'].items():
            tmplabel = data.get('labels').get('en',{})
            propLabels[Pid]= tmplabel.get('value','unknown')
propLabels = pd.DataFrame.from_dict(propLabels,orient='index',columns=['connector Label'])
propLabels['connector'] = propLabels.index
In [82]:
itemsInfoTable = itemsInfoTable.join(propLabels, on='connector',rsuffix='_tmp').drop('connector_tmp',axis=1)
In [83]:
itemsInfoTable['item_id'] = itemsInfoTable.index
In [84]:
itemsInfoTable
Out[84]:
item Label Instace Of connector connected To connected To Label connector Label item_id
Q50198692 Thiago Seyboth Wild Q5 P1050 Q84263196 COVID-19 medical condition Q50198692
Q88938156 2020 coronavirus pandemic in Swiss canton AR Q3241045 unknown unknown None NaN Q88938156
Q88976185 CORONAVIRUS IN PREGNANCY AND DELIVERY: RAPID R... Q13442814 P921 Q84263196 COVID-19 main subject Q88976185
Q88976185 CORONAVIRUS IN PREGNANCY AND DELIVERY: RAPID R... Q13442814 P921 Q81068910 2019–20 COVID-19 pandemic main subject Q88976185
Q88973815 Exploring the coronavirus epidemic using the n... Q13442814 P921 Q84263196 COVID-19 main subject Q88973815
... ... ... ... ... ... ... ...
Q87461608 Potential interventions for novel coronavirus ... Q13442814 P921 Q84263196 COVID-19 main subject Q87461608
Q87349559 Category:People with coronavirus disease 2019 Q4167836 P971 Q84263196 COVID-19 category combines topics Q87349559
Q87755912 2020 coronavirus pandemic in Uzbekistan Q3241045 unknown unknown None NaN Q87755912
Q88974700 Comparison of throat swabs and sputum specimen... Q13442814 P921 Q84263196 COVID-19 main subject Q88974700
Q88974700 Comparison of throat swabs and sputum specimen... Q13442814 P921 Q81068910 2019–20 COVID-19 pandemic main subject Q88974700

3984 rows × 7 columns

In [85]:
## Getting Instance of labels
instaceOfQs = list(itemsInfoTable['Instace Of'].unique())
print(len(instaceOfQs))
QiOf = [] # Q instace
for Q in instaceOfQs:
    QiOf.append(requests.get('https://www.wikidata.org/w/api.php?action=wbgetentities&ids=%s&format=json' % Q).json())
98
In [86]:
QiOfLabels ={}
for P in QiOf:
    if 'entities' in P:
        for Pid,data in P['entities'].items():
            tmplabel = data.get('labels').get('en',{})
            QiOfLabels[Pid]= tmplabel.get('value','unknown')
QiOfLabels = pd.DataFrame.from_dict(QiOfLabels,orient='index',columns=['Instace Of Label'])
QiOfLabels['Instace Of'] = QiOfLabels.index
In [87]:
#FINALVERSION Of Info Table
itemsInfoTable = itemsInfoTable.join(QiOfLabels, on='Instace Of',rsuffix='_tmp').drop('Instace Of_tmp',axis=1)
In [88]:
nonHumans  = itemsInfoTable[itemsInfoTable['Instace Of Label'] != 'human']
In [89]:
nonHumans
Out[89]:
item Label Instace Of connector connected To connected To Label connector Label item_id Instace Of Label
Q88938156 2020 coronavirus pandemic in Swiss canton AR Q3241045 unknown unknown None NaN Q88938156 disease outbreak
Q88976185 CORONAVIRUS IN PREGNANCY AND DELIVERY: RAPID R... Q13442814 P921 Q84263196 COVID-19 main subject Q88976185 scholarly article
Q88976185 CORONAVIRUS IN PREGNANCY AND DELIVERY: RAPID R... Q13442814 P921 Q81068910 2019–20 COVID-19 pandemic main subject Q88976185 scholarly article
Q88973815 Exploring the coronavirus epidemic using the n... Q13442814 P921 Q84263196 COVID-19 main subject Q88973815 scholarly article
Q88973815 Exploring the coronavirus epidemic using the n... Q13442814 P921 Q81068910 2019–20 COVID-19 pandemic main subject Q88973815 scholarly article
... ... ... ... ... ... ... ... ...
Q87461608 Potential interventions for novel coronavirus ... Q13442814 P921 Q84263196 COVID-19 main subject Q87461608 scholarly article
Q87349559 Category:People with coronavirus disease 2019 Q4167836 P971 Q84263196 COVID-19 category combines topics Q87349559 Wikimedia category
Q87755912 2020 coronavirus pandemic in Uzbekistan Q3241045 unknown unknown None NaN Q87755912 disease outbreak
Q88974700 Comparison of throat swabs and sputum specimen... Q13442814 P921 Q84263196 COVID-19 main subject Q88974700 scholarly article
Q88974700 Comparison of throat swabs and sputum specimen... Q13442814 P921 Q81068910 2019–20 COVID-19 pandemic main subject Q88974700 scholarly article

3184 rows × 8 columns

In [90]:
nonHumansPages = nonHumans.join(pagesPerProjectTable.set_index('wikidataItem'))
In [91]:
nonHumansPages
Out[91]:
item Label Instace Of connector connected To connected To Label connector Label item_id Instace Of Label project page wikilink url
Q103177 severe acute respiratory syndrome Q18123741 P1542 Q81068910 2019–20 COVID-19 pandemic has effect Q103177 infectious disease af.wikipedia Ernstige akute respiratoriese sindroom [[af:Ernstige akute respiratoriese sindroom|Er... https://af.wikipedia.org/wiki/Ernstige_akute_r...
Q103177 severe acute respiratory syndrome Q18123741 P1542 Q81068910 2019–20 COVID-19 pandemic has effect Q103177 infectious disease ar.wikipedia متلازمة تنفسية حادة وخيمة [[ar:متلازمة تنفسية حادة وخيمة|متلازمة تنفسية ... https://ar.wikipedia.org/wiki/%D9%85%D8%AA%D9%...
Q103177 severe acute respiratory syndrome Q18123741 P1542 Q81068910 2019–20 COVID-19 pandemic has effect Q103177 infectious disease ast.wikipedia Síndrome respiratoriu agudu grave [[ast:Síndrome respiratoriu agudu grave|Síndro... https://ast.wikipedia.org/wiki/S%C3%ADndrome_r...
Q103177 severe acute respiratory syndrome Q18123741 P1542 Q81068910 2019–20 COVID-19 pandemic has effect Q103177 infectious disease azb.wikipedia سارس [[azb:سارس|سارس]] https://azb.wikipedia.org/wiki/%D8%B3%D8%A7%D8...
Q103177 severe acute respiratory syndrome Q18123741 P1542 Q81068910 2019–20 COVID-19 pandemic has effect Q103177 infectious disease be.wikipedia Цяжкі востры рэспіраторны сіндром [[be:Цяжкі востры рэспіраторны сіндром|Цяжкі в... https://be.wikipedia.org/wiki/%D0%A6%D1%8F%D0%...
... ... ... ... ... ... ... ... ... ... ... ... ...
Q89368376 COVID-19 crisis 2020 at the GGD Hollands Midden Q3241045 unknown unknown None NaN Q89368376 disease outbreak NaN NaN NaN NaN
Q89368378 COVID-19 crisis 2020 at the GGD Rotterdam-Rijn... Q3241045 unknown unknown None NaN Q89368378 disease outbreak NaN NaN NaN NaN
Q89368379 COVID-19 crisis 2020 at the GGD Zuid-Holland-Zuid Q3241045 unknown unknown None NaN Q89368379 disease outbreak NaN NaN NaN NaN
Q89375395 Novel Coronavirus (2019-nCoV) Situation Report 74 Q88380217 P921 Q81068910 2019–20 COVID-19 pandemic main subject Q89375395 WHO situation report NaN NaN NaN NaN
Q89378072 2020 coronavirus pandemic in Washim district Q3241045 P361 Q81068910 2019–20 COVID-19 pandemic part of Q89378072 disease outbreak NaN NaN NaN NaN

10326 rows × 12 columns

In [181]:
nonHumansPages.to_pickle('pagesPerProjectNonHumans20200403.pickle')
In [92]:
import numpy as np
with open('pagesPerProjectNonHumans20200403.wikitext','w') as f:
    for project,data in nonHumansPages.groupby('project'):
        if project != np.nan:
            f.write('\n== %s == \n \n' % project )
            for wikilink,d  in data.groupby('wikilink'):                
                f.write('* %s (' % wikilink)
                for index,cause in d.iterrows():
                    if cause['connector Label'] == np.nan:
                        cause['connector Label'] == 'unknown'
                        cause['connected To Label'] == 'unknown'

                    output = '%s: %s, ' % (cause['connector Label'],cause['connected To Label'])
                output = output[0:-2]
                output += ')\n'
                f.write(output)
In [95]:
!pip  install xlwt
nonHumansPages[['project','page','url','wikilink']].drop_duplicates().to_excel('pagesPerProjectNonHumans20200403.xls')
Requirement already satisfied: xlwt in /srv/paws/lib/python3.6/site-packages
In [96]:
itemsInfoTablePages = itemsInfoTable.join(pagesPerProjectTable.set_index('wikidataItem'))
In [101]:
import numpy as np
with open('pagesPerProjectMethodologyMarch30-excuted.wikitext','w') as f:
    for project,data in nonHumansPages.groupby('project'):
        if project != np.nan:
            f.write('\n== %s == \n \n' % project )
            for wikilink,d  in data.groupby('wikilink'):                
                f.write('* %s (' % wikilink)
                for index,cause in d.iterrows():
                    if cause['connector'] != np.nan:
                        output = '%s: %s, ' % (cause['connector Label'],cause['connected To Label'])
                output = output[0:-2]
                output += ')\n'
                f.write(output)
In [ ]: