Python script to convert Wikidata item claims from one format to another

The Museum of Fine Arts, Boston changed the format of its unique object identifiers from a slug-number format (e.g. "naval-cutter-earl-howe-38422") to just the number itself ("38422"). Since the museum did not set up redirects from the old URLs, all the links from Wikidata were broken.

This script goes through and converts the P4625 statements from the old-style to the new-style IDs. The rough flow:

  • Do a SPARQL query to get all items that have a P4625 claim and put them in a table of qids and objectids
  • Create a new column called newid that holds the objectid without the slug (text) content
  • Iterate through the table and, for each item, replace the old claim with the new claim
%%time
from urllib.request import urlopen
import numpy as np
import pandas as pd
import requests

# Setup SPARQL queries
# Endpoint of the Wikidata Query Service; the SPARQL query is POSTed here
# and the results are requested as JSON.
wikidata_api_url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'

# Wikidata property for unique identifier for works of a GLAM org (e.g. Museum of Fine Arts, Boston)
# P4625 is the property whose values this notebook converts from the old
# slug-number form to the bare number.
glam_objectid    = 'P4625'
CPU times: user 1.17 s, sys: 276 ms, total: 1.45 s
Wall time: 1.28 s
## Grab all items in Wikidata with GLAM's ID

# One result row per (item, identifier value) pair carrying a P4625 statement.
query = '''
SELECT ?item ?objectid WHERE { 
  ?item wdt:P4625 ?objectid .
}
'''

# Fail loudly on an HTTP error (rate limiting, server-side query timeout, ...)
# instead of letting .json() choke on a non-JSON error body, and bound the
# wait so a stalled connection cannot hang the notebook forever.
# NOTE(review): Wikimedia asks API clients to send a descriptive User-Agent
# header; consider adding one that identifies this script and its operator.
response = requests.post(
    wikidata_api_url,
    data={'query': query, 'format': 'json'},
    timeout=120,
)
response.raise_for_status()
data = response.json()

# Flatten the SPARQL JSON bindings into plain records:
#   qid      - numeric part of the entity URI (".../entity/Q123" -> 123)
#   objectid - the identifier value, as a string
resultarray = [
    {
        'qid':   int(binding['item']['value'].replace('http://www.wikidata.org/entity/Q', '')),
        'objectid': str(binding['objectid']['value']),
    }
    for binding in data['results']['bindings']
]

# Turn results into a DataFrame
inwd_df = pd.DataFrame(resultarray)
inwd_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4060 entries, 0 to 4059
Data columns (total 2 columns):
objectid    4060 non-null object
qid         4060 non-null int64
dtypes: int64(1), object(1)
memory usage: 63.5+ KB
# Make copy of the DataFrame so the raw query results in inwd_df stay untouched
work_df = inwd_df.copy()

# Extract a newid (just numbers) from the old slug-style objectid
#  Use regex to turn naval-cutter-earl-howe-38422
#  into 38422
#  Values that are already bare numbers do not match and yield NaN.
#
# The new column is assigned by NAME. Relabelling every column positionally
# (work_df.columns = [...]) silently swaps the qid/objectid labels whenever
# the DataFrame does not list its columns in the assumed order, and that
# order differs between pandas versions for a DataFrame built from dicts.
work_df['newid'] = work_df['objectid'].str.extract(r'^.+-(\d+)$', expand=False)

# Test inspect some rows
# Should look like this (column order may differ)
# qid objectid newid
# 69884849 naval-cutter-earl-howe-38422 38422
# Cap the sample size so a small result set does not raise ValueError.
work_df.sample(min(10, len(work_df)))
objectid qid newid
1068 32889 20771853 NaN
936 35584 20778202 NaN
2636 33866 20634609 NaN
2706 35092 20778338 NaN
2763 265265 20785865 NaN
2334 32939 20553722 NaN
2349 36581 20774118 NaN
2762 23513 20786602 NaN
3975 551631 20786207 NaN
2106 31504 20777567 NaN
import pywikibot
from tqdm import tqdm_notebook as tqdm

### Use pywikibot or Quickstatements to actually replace old with new ids on Wikidata

site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()

# Iterate over all the entries.
# iterrows() is a generator with no length, so pass the row count explicitly
# to let the progress bar show how far along the run is.
for index, row in tqdm(work_df.iterrows(), total=len(work_df)):

    # If newid is NaN, that means an old style ID was not found, so skip
    if pd.isna(row['newid']):
        continue

    old_id = str(row['objectid'])   # eg. naval-cutter-earl-howe-38422
    new_id = str(row['newid'])      # eg. 38422

    # For Quickstatments - You could generate QS to make this change too
    #   Uncomment the three lines below if you want to do it that way
    # print ('-Q{}|{}|"{}"'.format(row['qid'],glam_objectid,row['objectid']))
    # print ('Q{}|{}|"{}"'.format(row['qid'],glam_objectid,row['newid']))
    # continue

    # Use pywikibot

    # Grab the Wikidata entry
    item = pywikibot.ItemPage(repo, "Q{}".format(row['qid']))
    item_dict = item.get()

    # Grab claims related to the GLAM object identifier. The item may have
    # been edited since the SPARQL query ran, so do not assume the property
    # is still present (a plain [] lookup would raise KeyError).
    clm_list = list(item_dict["claims"].get(glam_objectid, []))

    # If the item already carries the new-style ID, writing it again would
    # create a duplicate statement; in that case only drop the old one.
    has_new_id = any(clm.getTarget() == new_id for clm in clm_list)

    # Go through the claims to replace old style with new style
    for clm in clm_list:
        # Make sure we actually see the old-style claim
        if clm.getTarget() != old_id:
            continue

        if has_new_id:
            # New ID already present - just remove the stale old-style claim
            item.removeClaims(clm, bot=True, summary="remove old style ID")
        else:
            # Change the value of the existing statement in place. This is a
            # single edit, so a failure cannot leave the item with the old ID
            # removed and the new one never added (as separate remove + add
            # calls could); it should also leave any qualifiers/references on
            # the statement alone.
            clm.changeTarget(new_id, bot=True, summary="replace old ID with new style")
            has_new_id = True