import json
import re

import pywikibot
import requests
from pywikibot import pagegenerators
from pywikibot.exceptions import APIError

# Bot script: find Wikidata items whose GND identifier (P227) does not match
# the expected format, look the item up on VIAF via its VIAF identifier (P214)
# and replace or remove the invalid GND claim based on the GND link(s) VIAF
# reports for that cluster.

wikidata_site = pywikibot.Site('wikidata', 'wikidata')
repo = wikidata_site.data_repository()

# Selects every item having at least one P227 value that fails the format
# regex. The backslashes are quadrupled on purpose: Python halves them when
# parsing this (non-raw) literal, and the query text sent to the endpoint
# therefore contains "\\d", i.e. an escaped backslash inside the SPARQL string.
dataset_query = """SELECT DISTINCT ?item WHERE {
  ?item p:P227/ps:P227 ?value .
  FILTER(REGEX(?value, "^1[01]?\\\\d{7}[0-9X]|[47]\\\\d{6}-\\\\d|[1-9]\\\\d{0,7}-[0-9X]|3\\\\d{7}[0-9X]$") = false ) .
}"""

# Same expression as in the query above, used to re-check each claim locally.
# NOTE(review): '^' binds only to the first alternative and '$' only to the
# last one, so this is not a full-string match. It is deliberately kept
# identical to the SPARQL filter; if one is tightened, tighten both.
gndPattern = re.compile(r'^1[01]?\d{7}[0-9X]|[47]\d{6}-\d|[1-9]\d{0,7}-[0-9X]|3\d{7}[0-9X]$')
viafUrlPattern = 'https://viaf.org/viaf/{}/justlinks.json'
viafHeaders = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0' }
viafTimeout = 30  # seconds; keeps one unresponsive request from hanging the whole run

for Qitem in pagegenerators.WikidataSPARQLPageGenerator(dataset_query, site=wikidata_site):
    Qitem.get()  # make sure the entity data (and thus Qitem.claims) is loaded
    if not Qitem.claims or 'P227' not in Qitem.claims or 'P214' not in Qitem.claims:
        print('{}: no claims, or no P227 or no P214 claim'.format(Qitem.title()))
        continue

    # All GND values currently on the item, so that a resolved identifier
    # which is already present is not added a second time.
    allExistingGnds = [existing.getTarget().strip() for existing in Qitem.claims['P227']]

    # Iterate over a copy: claims may be removed from the item inside the loop.
    for claim in list(Qitem.claims['P227']):
        oldGnd = claim.getTarget().strip()
        if gndPattern.search(oldGnd):
            # This particular value is fine; only invalid ones are processed.
            print('{}: {} --- pass'.format(Qitem.title(), oldGnd))
            continue

        # Only an unambiguous (single) VIAF claim is used for the lookup.
        if len(Qitem.claims['P214']) != 1:
            print('{}: no VIAF claim found (GND {})'.format(Qitem.title(), oldGnd))
            continue
        viaf = Qitem.claims['P214'][0].getTarget().strip()

        try:
            r = requests.get(viafUrlPattern.format(viaf), headers=viafHeaders, timeout=viafTimeout)
        except requests.RequestException as e:
            print('{}: could not retrieve VIAF justlinks.json, request failed: {} (GND {}; VIAF {})'.format(Qitem.title(), e, oldGnd, viaf))
            continue
        if r.status_code != 200:
            print('{}: could not retrieve VIAF justlinks.json, status {} (GND {}; VIAF {})'.format(Qitem.title(), r.status_code, oldGnd, viaf))
            continue

        viafLinks = json.loads(r.text)
        if 'DNB' not in viafLinks:
            print('{}: no GND link found in VIAF justlinks.json (GND {}; VIAF {})'.format(Qitem.title(), oldGnd, viaf))
            continue

        # Set once the old claim has been removed, so that any further new
        # identifier is added as a fresh claim instead of editing a gone one.
        # NOTE(review): when VIAF lists several new GNDs and none is on the
        # item yet, each one overwrites the same claim in turn; confirm
        # whether that is intended.
        alreadyRemoved = False
        for newGndUrl in viafLinks['DNB']:
            # Drops the first 21 characters of the link; presumably the
            # 'http://d-nb.info/gnd/' prefix -- verify against VIAF output.
            newGnd = newGndUrl[21:]
            if not gndPattern.search(newGnd):
                print('{}: new GND identifier also invalid (GND {}; VIAF {}; newGND {})'.format(Qitem.title(), oldGnd, viaf, newGnd))
                continue

            print('{}: new GND found: {} (GND {}; VIAF {})'.format(Qitem.title(), newGnd, oldGnd, viaf))
            if newGnd in allExistingGnds:
                # The correct identifier is already on the item: just drop the invalid one.
                try:
                    Qitem.removeClaims([claim], summary='remove invalid GND identifier GND:{} (which per VIAF:{} resolves to GND:{}, already set in the item)'.format(oldGnd, viaf, newGnd))
                    alreadyRemoved = True
                    print('* Already found, removed oldGND only')
                except APIError as e:
                    print('* Cannot remove oldGND due to APIerror: {}'.format(e))
            else:
                try:
                    if alreadyRemoved:
                        newGndClaim = pywikibot.Claim(repo, 'P227')
                        newGndClaim.setTarget(newGnd)
                        Qitem.addClaim(newGndClaim, summary='add GND identifier GND:{} based on VIAF:{}'.format(newGnd, viaf))
                        print('* Add new claim, as the old one was already removed in a previous iteration')
                    else:
                        claim.changeTarget(value=newGnd, summary='resolve invalid GND identifier GND:{} to GND:{} per VIAF:{}'.format(oldGnd, newGnd, viaf))
                        print('* Not yet found, replaced oldGND by newGND')
                    # The item now carries newGnd; remember it so a later
                    # invalid claim resolving to the same value is only removed.
                    allExistingGnds.append(newGnd)
                except APIError as e:
                    print('* Cannot update oldGND to newGND due to APIerror: {}'.format(e))
print('Job done, all finished')
# Example output of a run:
#
# Q20853245: 4775939-2 --- pass
# Q20853245: no VIAF claim found (GND 970894228)
# Q7353829: no VIAF claim found (GND 00433910X)
# Q7353829: no VIAF claim found (GND 056396392)
# Job done, all finished