import json
import pywikibot
from pywikibot import pagegenerators
from pywikibot.data.api import APIError
import re
import requests

# Connect to Wikidata. Site() takes the site code and the family name; for
# Wikidata both are 'wikidata'.
wikidata_site = pywikibot.Site(code='wikidata', fam='wikidata')
# Repository object belonging to that site; the script hands it to
# pywikibot.Claim whenever it has to build a new statement.
repo = wikidata_site.data_repository()

dataset_query = """SELECT DISTINCT ?item WHERE {
  ?item p:P227/ps:P227 ?value .
  FILTER(REGEX(?value, "^1[01]?\\\\d{7}[0-9X]|[47]\\\\d{6}-\\\\d|[1-9]\\\\d{0,7}-[0-9X]|3\\\\d{7}[0-9X]$") = false ) .
}"""
gndPattern = re.compile('^1[01]?\d{7}[0-9X]|[47]\d{6}-\d|[1-9]\d{0,7}-[0-9X]|3\d{7}[0-9X]$')
viafUrlPattern = 'https://viaf.org/viaf/{}/justlinks.json'
viafHeaders = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0' }

# Main job: for every item returned by the SPARQL query, try to repair each
# malformed GND ID (P227) statement with the GND link(s) that VIAF publishes
# for the item's VIAF ID (P214). Progress and problems are only printed.
for Qitem in pagegenerators.WikidataSPARQLPageGenerator(dataset_query, site=wikidata_site):
    Qitem.get()

    if not Qitem.claims or 'P227' not in Qitem.claims or 'P214' not in Qitem.claims:
        print('{}: no claims, or no P227 or no P214 claim'.format(Qitem.title()))
        continue

    # Snapshot of the GND statements: statements may be added to the item
    # further down, and the loop must not depend on whether that alters the
    # live list while it is being iterated.
    gndClaims = list(Qitem.claims['P227'])

    # Every GND value the item is known to carry. Kept up to date with the
    # edits made below so the same identifier is never written twice.
    # Statements for which getTarget() returns None (no concrete value) are
    # left out.
    allExistingGnds = set()
    for claim2 in gndClaims:
        existingTarget = claim2.getTarget()
        if existingTarget is not None:
            allExistingGnds.add(existingTarget.strip())

    for claim in gndClaims:
        oldTarget = claim.getTarget()
        if oldTarget is None:
            print('{}: GND claim without a value, skipped'.format(Qitem.title()))
            continue
        oldGnd = oldTarget.strip()

        # fullmatch: the complete value has to be a well-formed GND ID; a
        # well-formed fragment inside a longer string does not count.
        if gndPattern.fullmatch(oldGnd):
            print('{}: {} --- pass'.format(Qitem.title(), oldGnd))
            continue

        # P214 is known to exist here (checked above), so this branch means
        # "ambiguous or empty", not "missing".
        viafClaims = Qitem.claims['P214']
        if len(viafClaims) != 1:
            print('{}: expected exactly one VIAF claim, found {} (GND {})'.format(Qitem.title(), len(viafClaims), oldGnd))
            continue

        viafTarget = viafClaims[0].getTarget()
        if viafTarget is None:
            print('{}: VIAF claim without a value (GND {})'.format(Qitem.title(), oldGnd))
            continue
        viaf = viafTarget.strip()

        # A timeout keeps one unresponsive request from stalling the whole
        # run; connection problems skip this claim instead of aborting.
        try:
            r = requests.get(viafUrlPattern.format(viaf), headers=viafHeaders, timeout=30)
        except requests.RequestException as e:
            print('{}: could not retrieve VIAF justlinks.json, request failed: {} (GND {}; VIAF {})'.format(Qitem.title(), e, oldGnd, viaf))
            continue
        if r.status_code != 200:
            print('{}: could not retrieve VIAF justlinks.json, status {} (GND {}; VIAF {})'.format(Qitem.title(), r.status_code, oldGnd, viaf))
            continue

        try:
            viafLinks = json.loads(r.text)
        except ValueError as e:
            print('{}: VIAF justlinks.json is not valid JSON: {} (GND {}; VIAF {})'.format(Qitem.title(), e, oldGnd, viaf))
            continue
        if not isinstance(viafLinks, dict) or 'DNB' not in viafLinks:
            print('{}: no GND link found in VIAF justlinks.json (GND {}; VIAF {})'.format(Qitem.title(), oldGnd, viaf))
            continue

        # The loop below expects a list of URL strings; a bare string is
        # wrapped so it is not iterated character by character.
        dnbLinks = viafLinks['DNB']
        if isinstance(dnbLinks, str):
            dnbLinks = [dnbLinks]

        # True once the invalid claim has been removed or retargeted. From
        # then on it must not be touched again: retargeting it a second time
        # would overwrite the first replacement, and removing it would delete
        # a value that has just been corrected.
        oldClaimHandled = False
        for newGndUrl in dnbLinks:
            # Last path segment of the link. For the 21-character prefix
            # 'http://d-nb.info/gnd/' this equals the former newGndUrl[21:],
            # and it also works for other prefixes (e.g. https).
            newGnd = newGndUrl.rsplit('/', 1)[-1]
            if not gndPattern.fullmatch(newGnd):
                print('{}: new GND identifier also invalid (GND {}; VIAF {}; newGND {})'.format(Qitem.title(), oldGnd, viaf, newGnd))
                continue

            print('{}: new GND found: {} (GND {}; VIAF {})'.format(Qitem.title(), newGnd, oldGnd, viaf))

            if newGnd in allExistingGnds:
                if oldClaimHandled:
                    print('* Already found, oldGND was already dealt with, nothing to do')
                    continue
                try:
                    Qitem.removeClaims([claim], summary='remove invalid GND identifier GND:{} (which per VIAF:{} resolves to GND:{}, already set in the item)'.format(oldGnd, viaf, newGnd))
                except APIError as e:
                    print('* Cannot remove oldGND of {} due to APIerror: {}'.format(Qitem.title(), e))
                else:
                    oldClaimHandled = True
                    print('* Already found, removed oldGND only')
            else:
                try:
                    if oldClaimHandled:
                        newGndClaim = pywikibot.Claim(repo, 'P227')
                        newGndClaim.setTarget(value=newGnd)
                        Qitem.addClaim(newGndClaim, summary='add GND identifier GND:{} based on VIAF:{}'.format(newGnd, viaf))
                        print('* Add new claim, as the old one was already removed or replaced in a previous iteration')
                    else:
                        claim.changeTarget(value=newGnd, summary='resolve invalid GND identifier GND:{} to GND:{} per VIAF:{}'.format(oldGnd, newGnd, viaf))
                        oldClaimHandled = True
                        print('* Not yet found, replaced oldGND by newGND')
                except APIError as e:
                    print('* Cannot update oldGND to newGND of {} due to APIerror: {}'.format(Qitem.title(), e))
                else:
                    # Remember the value so neither a later link nor a later
                    # claim of this item writes it a second time.
                    allExistingGnds.add(newGnd)

print('Job done, all finished')
# Output recorded from a run of the script above:
# Q20853245: 4775939-2 --- pass
# Q20853245: no VIAF claim found (GND 970894228)
# Q7353829: no VIAF claim found (GND 00433910X)
# Q7353829: no VIAF claim found (GND 056396392)
# Job done, all finished