import pywikibot
from pywikibot import pagegenerators as pg
import codecs #used in logfiles, unicoded strings
import sys, re
import datetime
import time
import urllib.request
from urllib.parse import quote
from collections import defaultdict

# Title of the wiki page that lists backlinks which must not be counted.
wheretoskip = "Wikipedia:Links naar doorverwijspagina's/skips"
# URL of the text file listing the disambiguation pages to examine.
sourcefromfile = "https://tools.wmflabs.org/multichill/queries2/nlwp/links_naar_doorverwijspaginas.txt"
# Per-language title of the report page this script reads and overwrites.
wikiurl = {"nl": "Wikipedia:Links_naar_doorverwijspagina%27s/data"}
# URL prefix of the "what links here" special page; a quoted title is appended.
linkstostr = "https://nl.wikipedia.org/w/index.php?title=Speciaal%3AVerwijzingenNaarHier&namespace=0&target="
template = "template:Dp"
# Title suffix that marks an explicitly named disambiguation page.
disamb_addition = " (doorverwijspagina)"
# Namespace ids whose pages are taken into account.
allowed_namespaces = [0]
# Minimum number of counted backlinks; pages with fewer are left out of the report.
treshold = 1
# Limit passed to the new-pages generator.
max_new_pages = 100

def getSkiplinks(site):
    """Read the skip list from the wiki page named by ``wheretoskip``.

    For every line of that page, the text from its first ``*`` to the end of
    the line is scanned for bracketed link titles.  When at least two titles
    are found, the first one becomes the key and the remaining ones are
    appended to the list stored under that key.

    Args:
        site: the Pywikibot site object used to load the page.

    Returns:
        A ``defaultdict(list)`` mapping a title to the titles listed after it
        on the same line.  It is empty when the page does not exist.
    """
    skiplinks = defaultdict(list)
    # Only the page load can fail with a missing-page error, so the try block
    # is limited to that call.
    try:
        skiplinkspage = pywikibot.Page(site, wheretoskip).get()
    # NOTE(review): newer Pywikibot releases appear to expose this exception
    # as pywikibot.exceptions.NoPageError - confirm against the installed
    # version before upgrading.
    except pywikibot.NoPage:
        return skiplinks

    # Raw strings: the patterns contain backslashes that are not valid Python
    # string escapes and would otherwise trigger invalid-escape warnings.
    linkre = re.compile(r"\[([^\[\|\]]*)[\]\|]")
    for line in re.findall(r"\*.*", skiplinkspage):
        titles = linkre.findall(line)
        if len(titles) > 1:
            skiplinks[titles[0]].extend(titles[1:])
    return skiplinks

def _iter_wikilinks(text):
    """Yield the non-empty text of every ``[[...]]`` span found in *text*."""
    pos = 0
    while True:
        opener = text.find('[[', pos)
        if opener == -1:
            return
        closer = text.find(']]', opener + 2)
        if closer == -1:
            return
        # A later '[[' in front of the closer restarts the link, so the
        # innermost opener wins (e.g. '[[[Foo]]' gives 'Foo').
        opener = text.rfind('[[', opener, closer)
        title = text[opener + 2:closer]
        # Empty spans such as '[[]]' carry no page title and are skipped.
        if title:
            yield title
        pos = closer + 2


def getlinksfromfile(filename):
    """Yield the link titles found in the document at URL *filename*.

    The document is downloaded completely, decoded as UTF-8 and then scanned
    for ``[[...]]`` spans; the text between the brackets is yielded in order
    of appearance.  A ``]]`` without a preceding ``[[`` and empty ``[[]]``
    spans yield nothing.

    Args:
        filename: a URL accepted by ``urllib.request.urlopen``.

    Yields:
        str: the text between ``[[`` and the next ``]]``.
    """
    # Read everything first so the connection is closed before the consumer
    # starts its own (possibly slow) work per yielded title.
    with urllib.request.urlopen(filename) as response:
        text = response.read().decode("utf-8")
    yield from _iter_wikilinks(text)

def getnewpages(site):
    """Yield titles of pages from the new-pages generator.

    Pages come from ``pg.NewpagesPageGenerator(site, 0, max_new_pages)``;
    only those whose namespace id is in ``allowed_namespaces`` are yielded.
    """
    candidates = pg.NewpagesPageGenerator(site, 0, max_new_pages)
    yield from (
        candidate.title()
        for candidate in candidates
        if candidate.namespace().id in allowed_namespaces
    )

def count_links(dppage):
    """Count the backlinks of *dppage* that are not excluded.

    Returns 0 right away when the page title contains ``disamb_addition``.
    Otherwise a backlink is counted unless it is outside
    ``allowed_namespaces``, is a redirect, is itself a disambiguation page,
    is the page whose title plus ``disamb_addition`` equals this page's
    title, or is listed for this page in the module-level ``skiplinks``.
    """
    if disamb_addition in dppage.title():
        return 0

    total = 0
    for source in dppage.backlinks():
        if source.namespace().id not in allowed_namespaces:
            continue
        if source.isRedirectPage() or source.isDisambig():
            continue
        if dppage.title() == source.title() + disamb_addition:
            continue
        if source.title() in skiplinks[dppage.title()]:
            continue
        total += 1
    return total

def process_one_disambiguation_page(site, pagetitle, result):
    """Record the backlink count of one disambiguation page in *result*.

    Titles containing ``disamb_addition`` are ignored.  For other titles the
    count from ``count_links`` is stored under the page's title, but only
    when it reaches ``treshold`` and the title is not yet present.
    """
    if disamb_addition in pagetitle:
        return

    # The title may be written with spaces or with underscores.
    target = pywikibot.Page(site, pagetitle)
    hits = count_links(target)
    if hits < treshold:
        return

    name = target.title()
    if name not in result:
        result[name] = hits

def process_one_regular_page(site, pagetitle, result):
    """Process every disambiguation page that the given page links to.

    Each linked page for which ``isDisambig()`` is true is handed to
    ``process_one_disambiguation_page`` together with *result*.
    """
    source = pywikibot.Page(site, pagetitle)
    disambig_targets = (
        target for target in source.linkedPages() if target.isDisambig()
    )
    for target in disambig_targets:
        process_one_disambiguation_page(site, target.title(), result)

# --- Script body: collect the counts and rewrite the report page. ---

# time.ctime() already returns the formatted timestamp; it must not be fed
# to strftime() as if it were a format string.
starttime = time.ctime()
print('Start: %s' % starttime)

site = pywikibot.Site('nl')
# Must stay a module-level name: count_links() reads it as a global.
skiplinks = getSkiplinks(site)
# Maps a disambiguation page title to its number of counted backlinks.
result = {}

# 1. Pages listed in the external text file (use the configured constant
#    instead of repeating the URL literal).
for link in getlinksfromfile(sourcefromfile):
    process_one_disambiguation_page(site, link, result)
# 2. Disambiguation pages linked from new pages.
for link in getnewpages(site):
    process_one_regular_page(site, link, result)
# 3. Pages linked from the current report page, so existing rows are refreshed.
process_one_regular_page(site, wikiurl['nl'], result)

# Build the wikitext from parts and join once instead of repeated '+='.
parts = [
    '{{verwijzing2|WP:LND/D}}\n',
    'Deze pagina wordt met regelmaat door een bot opnieuw gemaakt.\n',
    'Zie de geschiedenis van de pagina wanneer, en door welke bot.\n',
    'Als hier links zijn meegeteld die niet gerepareerd hoeven te worden, voeg ze dan toe op [[%s]].\n' % (wheretoskip,),
    '{| class="wikitable sortable"\n|-\n! Artikel !! Aantal !! Links \n',
]
for item, count in result.items():
    parts.append(
        '|-\n|[[%s]]||%s||[%s%s link]\n' % (item, count, linkstostr, quote(item))
    )
parts.append('|}')
stoptime = time.ctime()
parts.append('\n\n%s-%s' % (starttime, stoptime))
wikistr = ''.join(parts)

pywikibot.Page(site, wikiurl['nl']).put(wikistr, summary='#dp-update')
print('Klaar')
# Example console output of a run: Start: Tue Dec 11 15:20:50 2018