# these settings select the category and the snapshots that will be handled
# create duplicates of this cell for other category sets and run only the one you need

# rootcatnames defines the root category(ies), which must be flat at the moment
#rootcatnames = ['Category:Cultural heritage monuments in Austria with known IDs']
rootcatnames = ['Category:Cultural heritage monuments in Berlin with known ID']

#oldstateDate = '20161211'
#newstateDate = '20161229'
oldstateDate = '20161229'
#newstateDate = '20170425'
newstateDate = '20181215'
# rootcatnames defines the root category(ies), which must be flat at the moment
rootcatnames = ['Category:Natural monuments in Austria with known ID',
               'Category:Protected landscape elements in Austria with known ID',
               'Category:Protected caves in Austria with known ID',
               'Category:Nature parks in Austria with known ID',
               'Category:Nature reserves in Austria with known ID',
               'Category:National parks of Austria with known ID']

oldstateDate = '20161220'
newstateDate = '20170425'
# these settings select the category and the snapshots that will be handled
# create duplicates of this cell for other category sets and run only the one you need

# rootcatnames defines the root category(ies), which must be flat at the moment
rootcatnames = ['Category:Public art in Austria with known IDs']

oldstateDate = '20161211'
newstateDate = '20170425'
# For a category tree on commons count the number of images, rank them and save data

## TODO
# get id from template {{Denkmalgeschütztes Objekt Österreich|id}}, is used as sort key
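
## a possible sketch for the TODO above (not wired in anywhere yet): extract the id from the
## {{Denkmalgeschütztes Objekt Österreich|id}} template in a category's wikitext;
## the function name and the exact regex are assumptions
import re

def extractDenkmalId(cattext):
    '''return the id parameter of the first {{Denkmalgeschütztes Objekt Österreich}} template, or None'''
    m = re.search(r'\{\{Denkmalgeschütztes Objekt Österreich\s*\|\s*([^|}]+)', cattext)
    return m.group(1).strip() if m else None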

import pywikibot
import sys
import re
import math
import time
from datetime import date
from datetime import datetime
from enum import Enum
from pywikibot import pagegenerators

import json
import codecs

## preconditions

timestamp = datetime.now()
rootcatname = rootcatnames[0]

# json results
jsondata = 'data/' + rootcatname + timestamp.strftime(" %Y%m%d") + '.json'
# running the script twice a day will overwrite daily results!

log = 'data/' + rootcatname + timestamp.strftime(" %Y%m%d") + '.log'

logfile = codecs.open (log, "w", 'utf-8', buffering=1)

def cleanup():
    if (logfile):
        print ("see %s for log" % log)
        logfile.close()

import atexit
atexit.register(cleanup)

### BEGIN ###
start = time.time() # start stopwatch

dewiki   = pywikibot.Site('de')
wikidata = pywikibot.Site('wikidata', fam='wikidata')
repo     = wikidata.data_repository()
commons  = pywikibot.Site('commons', fam='commons')
popMax   = 80000 # skip municipalities with more than that many pop

dewikiMaxDepth = 4
commonsMaxDepth = 8
maxEntriesLogged = 500
def stopWatch(value):
    '''Log elapsed seconds as Hours:Minutes:Seconds'''

    Minutes, Seconds = divmod(int(value), 60)
    Hours, Minutes = divmod(Minutes, 60)
    logfile.write ('\ntime consumed: %d h %d m %d s\n' % (Hours, Minutes, Seconds))
    print('\ntime consumed: %d h %d m %d s\n' % (Hours, Minutes, Seconds))

# overall stats
catcount = 0
commonsFilesSum = 0

uniqueCats = set()
for cat in rootcatnames: # there can be more than 1 rootcat
    rootcat = pywikibot.Category(commons,cat) # source category
    
    # get all direct subcategories (depth 0); the '... with known ID' root categories are flat
    cats = pagegenerators.SubCategoriesPageGenerator(rootcat, 0)
    uniqueCats = uniqueCats.union(cats)
    #print (uniqueCats)

print ("%5d categories found\n" % (len(uniqueCats)))
logfile.write ("%5d categories found\n" % (len(uniqueCats)))

res = {} # a dict k=commoncat (as cat) and v=count
res2 = [] # array for json, string only
for page in sorted(uniqueCats):
    #if catInScope(page):
    catcount = catcount + 1
    pages = pagegenerators.CategorizedPageGenerator(page, commonsMaxDepth, namespaces = [6]) # files
    commonsFiles = len(set(pages))
    commonsFilesSum = commonsFilesSum+commonsFiles
    res [page] = commonsFiles
    logfile.write ("%5d unique images in %s\n" % (commonsFiles, page))
    #print (page.title())
    sys.stdout.write('.') # progress
    if (catcount % 1000 == 0):
        sys.stdout.write ("(%d)" % (catcount))
    
    ## stopper for test, toggle comment
    #if (catcount>=125): break

print ("\n\n")
logfile.write ('\n\n')

count = 0 # count all rows
rank  = 0 # care for ex aequo positions, count >= rank
lastVal = -1

resSortedByName = sorted(res) # sort asc. by name (secondary criterion)
resSortedByCount = sorted(resSortedByName, key=res.get, reverse=True) # sort desc. by # of files
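# note: Python's sort is stable, so categories with the same file count keep the alphabetical order from the first sort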

# assign rank on sorted list
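# e.g. file counts [9, 7, 7, 5] get ranks [1, 2, 2, 4] (ex aequo entries share a rank, the following rank is skipped)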
for c in resSortedByCount: # sort desc. by # of files, when same # of files sorted by name
    count = count + 1
    if (lastVal != -1):
        if res[c] != lastVal:
            lastVal = res[c]
            rank = count
    else:
        lastVal = res[c]
        rank = 1
    
    res2.append({'name':c.title(), 'files': res[c], 'rank':rank, 'pageid': c.pageid})
           
rootres = {'name':rootcatname, 'files': commonsFilesSum, 'rank':-1, 'pageid': rootcat.pageid} # note: with several root categories this mixes the first name with the last rootcat's pageid
data = {'date': timestamp.strftime("%Y%m%d"),
        'catcount': count,
        'version':'v2',
        'root': rootres,
        'categories': res2
       }
# Writing JSON data
with codecs.open(jsondata, 'w', 'utf-8') as f:
    json.dump(data, f)
    print ("see %s for json result data" % jsondata)

end = time.time() # stop stopwatch
stopWatch(end-start)
cleanup() # only after stopWatch has written to the log file
print ('finished')
 2164 categories found

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................(1000)........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................(2000)....................................................................................................................................................................


see data/Category:Cultural heritage monuments in Berlin with known ID 20181215.json for json result data

time consumed: 1 h 3 m 3 s

see data/Category:Cultural heritage monuments in Berlin with known ID 20181215.log for log
finished
# compare two different results from different dates and create wikitext
# or if there is only one version (old.json does not exist) create only wikitext

import pywikibot
import sys
import re
import math
import time
from datetime import date
from datetime import datetime
from enum import Enum
from pywikibot import pagegenerators

import json
import codecs

## preconditions
# rootcatname defines the root category, which must be flat at the moment
#rootcatname = 'Category:Cultural heritage monuments in Austria with known IDs'
# it is set in the configuration cell at the top

timestamp = datetime.now()
maxEntriesLogged = 500 # temp, remove after next run
rootcatname = rootcatnames[0]

# oldstateDate & newstateDate are also set in the configuration cell at the top

oldstateJson = 'data/' + rootcatname + ' ' + oldstateDate + '.json'
newstateJson = 'data/' + rootcatname + ' ' + newstateDate + '.json'
# depends on existence of files: result = 'data/' + rootcatname + ' ' + newstateDate + '-' + oldstateDate + '.wikitext'

import os

if (not os.path.isfile(newstateJson)):
    print ('>>> File %s does not exist, exiting' % newstateJson)
    assert(False) # quit() terminates the session
# Reading data back
with open(newstateJson, 'r') as f:
    new = json.load(f)

if (os.path.isfile(oldstateJson)):
    # compare with old stuff
    result = 'data/' + rootcatname + ' ' + newstateDate + '-' + oldstateDate + '.wikitext'
    # Reading data back
    with open(oldstateJson, 'r') as f:
        old = json.load(f)
    diffmode = True
else:
    print ('>>> File %s does not exist, no compare' % oldstateJson)
    # do a simple print without compare
    result = 'data/' + rootcatname + ' ' + newstateDate + '.wikitext'
    diffmode = False

print (result)
#data = {'date': timestamp.strftime("%Y%m%d"),
#        'catcount': count,
#        'version': version, # formatversion
#        'root': rootres,
#        'categories': res2
#       }
#       v1: res2[c.title()] = {'files': res[c], 'rank': rank, 'pageid': c.pageid}
#       v2: res2 is a list of {'name': c.title(), 'files': res[c], 'rank': rank, 'pageid': c.pageid}
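## illustrative example of the two formats (the values are made up):
##   v1: data['categories'] == {'Category:Example': {'files': 12, 'rank': 3, 'pageid': 4711}}
##   v2: data['categories'] == [{'name': 'Category:Example', 'files': 12, 'rank': 3, 'pageid': 4711}]
## data['root'] changes the same way (from a one-entry dict keyed by name to a flat dict with a 'name' key)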


def convertV1ToV2(data):
    if (data.get('version', 'v1') == 'v1'): #format 1
        for k in data['root'].keys():
            val = data['root'][k]
            val['name']= k
            data['root']=val
            break
        categories = []
        for k in data['categories'].keys():
            val = data['categories'][k]
            val['name']= k
            categories.append(val)
        data['categories']=categories
        data['version']='v2'
    return data

def checks(data, wikitextfile):
    # some plausibility checks and unwanted states: list categories with at most two files
    sortedByFiles = sorted(data, key=itemgetter('files'))
    lastFiles = -1 # for grouping
    count = 0 # entries in the current group
    for i in sortedByFiles:
        if (i['files'] > 2):
            if (count > 0):
                wikitextfile.write ('\n%d categories with cardinality %d\n\n' % (count, lastFiles))
            break # end condition
        if (lastFiles == -1):
            # header
            wikitextfile.write ('\n== remarks ==\n')
            # fall through
        if (lastFiles != i['files']): # group change
            if (count > 0):
                wikitextfile.write ('\n%d categories with cardinality %d\n\n' % (count, lastFiles))
            wikitextfile.write ('=== categories with cardinality %d ===\n' % i['files'])
            lastFiles = i['files']
            count = 0 # restart the per-group counter
            sep = ''
        wikitextfile.write ('%s[[:Commons:%s|%s]]\n' % (sep, i['name'], re.sub("Category:", "", i['name'])))
        sep   = '• ' # except for the first entry of a group
        count = count + 1

# convert format version v1 to v2 (if necessary)
if (diffmode):
    old = convertV1ToV2(old)
new = convertV1ToV2(new)

from operator import itemgetter, attrgetter
# sort new by rank and by name, access by order
new['categories'] = sorted(new['categories'], key=itemgetter('rank', 'name'))

# access old by pageid, sorting irrelevant
# convert to dict
if (diffmode):
    oldByPageid = {}
    for i in old['categories']:
        oldByPageid[i['pageid']] = i

def createSimpleWikitext(wikitextfile):
    # intro
    t = """\
Untersuchung der Anzahl der Bilder in den Kategorien innerhalb [[:Commons:%s|%s]]. Von insgesamt %d Kategorien werden hier die maximal %d mächtigsten
gelistet. Die Mächtigkeit einer Kategorie wird dabei rekursiv mit allen Unterkategorien (bis zur Tiefe %d) bestimmt.
Es wird keine Rücksicht darauf genommen, ob die Bilder das relevante Objekt zeigen (z.B. ein denkmalgeschütztes Objekt)
oder die Bilder nur im Kontext des Objektes aufgenommen wurden (z.B. Events im Rahmen des denkmalgeschützten Objekts).

Die Probe stammt vom %s (%d Kategorien).

Das Ziehen der Probe dauert etwas mehr als eine Stunde (gemessen an den denkmalgeschützten Objekten mit etwas mehr als 15000 Kategorien).
Änderungen an der Kategoriezuordnung von Bildern in diesem Zeitraum können zu leichten Inkonsistenzen bei 
den Zählern und der Reihenfolge führen, in Einzelfällen auch zu komplett falschen Ergebnissen
(z.B. Ein- / Ausfügen einer mächtigen Unterkategorie).

"""
    t = t % (new['root']['name'], re.sub("Category:", "", new['root']['name']), new['catcount'],
             maxEntriesLogged, commonsMaxDepth,
             datetime.strptime(new['date'], "%Y%m%d").strftime("%d %B %Y"), new['catcount'])
    wikitextfile.write (t)
    
    # header / 3 columns
    wikitextfile.write ('{|class="wikitable sortable toptextcells" style="font-size:85%;"\n!commonscat !! rank !! #files\n')
    # lines
    for i in new['categories']:
        nameWithoutNS = re.sub("Category:", "", i['name'])
        if (i['rank'] <= maxEntriesLogged):
            wikitextfile.write ('|-\n|[[:Commons:%s|%s]] || style="text-align:right"| %d || style="text-align:right"| %d\n'
                                % (i['name'], nameWithoutNS, i['rank'], i['files']))
        else: # rank > maxEntriesLogged
            wikitextfile.write ('|-\n| … || style="text-align:right"|%d || style="text-align:right"|%d \n'
                   % (i['rank'], i['files'])
                  )
            break
    # footer
    wikitextfile.write ('|-\n! ∑ !! &nbsp; !! style="text-align:right"|%d\n'
                        % (new['root']['files']))
    wikitextfile.write ('|}\n\n')

def createDiffWikitext(wikitextfile):
    # intro
    t = """\
Untersuchung der Anzahl der Bilder in den Kategorien innerhalb [[:Commons:%s|%s]] und
Vergleich mit einer vorher gezogenen Stichprobe. Von insgesamt %d Kategorien werden hier die maximal %d mächtigsten
gelistet. Die Mächtigkeit einer Kategorie wird dabei rekursiv mit allen Unterkategorien (bis zur Tiefe %d) bestimmt.
Es wird keine Rücksicht darauf genommen, ob die Bilder das relevante Objekt zeigen (z.B. ein denkmalgeschütztes Objekt)
oder die Bilder nur im Kontext des Objektes aufgenommen wurden (z.B. Events im Rahmen des denkmalgeschützten Objekts).

Verglichen werden hier die Proben vom %s (%d Kategorien) mit dem Stand vom %s (%d Kategorien).

Das Ziehen der Probe dauert etwas mehr als eine Stunde (gemessen an den denkmalgeschützten Objekten mit etwas mehr als 15000 Kategorien).
Änderungen an der Kategoriezuordnung von Bildern in diesem Zeitraum können zu leichten Inkonsistenzen bei 
den Zählern und der Reihenfolge führen, in Einzelfällen auch zu komplett falschen Ergebnissen
(z.B. Ein- / Ausfügen einer mächtigen Unterkategorie).

Die Zuordnung der Kategorien erfolgt über die pageid,
sollte also von Verschiebungen der Kategorien unberührt bleiben.

"""
    t = t % (new['root']['name'], re.sub("Category:", "", new['root']['name']), new['catcount'],
             maxEntriesLogged, commonsMaxDepth,
             datetime.strptime(new['date'], "%Y%m%d").strftime("%d %B %Y"), new['catcount'],
             datetime.strptime(old['date'], "%Y%m%d").strftime("%d %B %Y"), old['catcount'])
    wikitextfile.write (t)
    
    # header / 7 columns
    wikitextfile.write ('{|class="wikitable sortable toptextcells" style="font-size:85%;"\n!commonscat !! rank !! #files !! old rank !! Δrank !! #old-files !! Δ#files\n')
    # lines
    for i in new['categories']:
        nameWithoutNS = re.sub("Category:", "", i['name'])
        if (i['rank'] <= maxEntriesLogged):
            op = oldByPageid.get(i['pageid'])
            if (op):
                rankDiff =  op['rank'] - i['rank'] # rank is sorted asc., so old rank = 10 and new rank = 12 results in -2
                if (rankDiff > 0):
                    rankIcon = 'Green Arrow Up.svg'
                elif (rankDiff == 0):
                    rankIcon = 'Equal.svg'
                else:
                    rankIcon = 'Red Arrow Down.svg'
                filesDiff = i['files'] - op['files']
                if (filesDiff > 0):
                    filesIcon = 'Green Arrow Up.svg'
                elif (filesDiff == 0):
                    filesIcon = 'Equal.svg'
                else:
                    filesIcon = 'Red Arrow Down.svg'
                wikitextfile.write ('|-\n|[[:Commons:%s|%s]] || style="text-align:right"| %d || style="text-align:right"| %d '\
                                    '|| style="text-align:right"| %d'\
                                    '|| style="text-align:right" data-sort-value="%d"|[[Datei:%s|rahmenlos|10px]]&nbsp;%+d'\
                                    '|| style="text-align:right"| %d'\
                                    '|| style="text-align:right" data-sort-value="%d"|[[Datei:%s|rahmenlos|10px]]&nbsp;%+d\n'
                                    % (i['name'], nameWithoutNS, i['rank'], i['files'],
                                       op['rank'], rankDiff, rankIcon, rankDiff,
                                       op['files'], filesDiff, filesIcon, filesDiff))
            else:
                wikitextfile.write ('|-\n|[[:Commons:%s|%s]] || style="text-align:right"| %d || style="text-align:right"| %d || &nbsp;|| &nbsp;|| &nbsp;|| &nbsp;\n'
                                    % (i['name'], nameWithoutNS, i['rank'], i['files']))
        else: # rank > maxEntriesLogged
            wikitextfile.write ('|-\n| … || style="text-align:right"|%d || style="text-align:right"|%d '\
                                '|| &nbsp; || &nbsp; || &nbsp; || &nbsp;\n'
                   % (i['rank'], i['files'])
                  )
            break
    # footer
    wikitextfile.write ('|-\n! ∑ !! &nbsp; !! style="text-align:right"|%d !! &nbsp; !! &nbsp; !! style="text-align:right"|%d !! &nbsp;\n'
                        % (new['root']['files'], old['root']['files']))
    wikitextfile.write ('|}\n\n')

with open(result, 'w', encoding='utf-8') as wikitextfile:

    if (diffmode):
        createDiffWikitext(wikitextfile)
    else:
        createSimpleWikitext(wikitextfile)
    # do plausibility checks in both modes
    checks(new['categories'], wikitextfile)


print ("finished")
>>> File data/Category:Cultural heritage monuments in Berlin with known ID 20161229.json does not exist, no compare
data/Category:Cultural heritage monuments in Berlin with known ID 20181215.wikitext
finished
import pywikibot
import sys
import re
import math
import time
from datetime import date
from datetime import datetime
from enum import Enum
from pywikibot import pagegenerators

import json
import codecs

commons  = pywikibot.Site('commons', fam='commons')

query="""\
use commonswiki_p;
set @cat := 'Cultural_heritage_monuments_in_Austria_with_known_IDs';

SELECT cl_sortkey_prefix as id, (select page_title from page where page_id = cl_from) as title,
       cl_type as type, count(*)
   from categorylinks
   WHERE cl_to = @cat
   and cl_sortkey_prefix not in (select cl_sortkey_prefix from categorylinks where cl_type = 'subcat' and cl_to = @cat)
   group by id
   having count(*) > 3
   order by type asc # internal key seems not to match collation of values
         , 4 desc # count
   ;
"""

import os; x = os.system ("python scripts/listpages.py -mysqlquery:\""+query+"\"")
print (x)
#assert (False)
gen = pagegenerators.MySQLPageGenerator(query, site=commons)
for page in gen: # the generator yields pywikibot.Page objects, not the raw result rows
    print (page)
print ('finished')
512
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
/srv/paws/pwb/pywikibot/data/mysql.py in <module>()
     12 try:
---> 13     import oursql as mysqldb
     14 except ImportError:

ModuleNotFoundError: No module named 'oursql'

During handling of the above exception, another exception occurred:

ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-4-099e6f79f66c> in <module>()
     33 #assert (False)
     34 gen = pagegenerators.MySQLPageGenerator(query, site=commons)
---> 35 for (i, page, t, count) in gen:
     36     print (page)
     37 print ('finished')

/srv/paws/pwb/pywikibot/pagegenerators.py in MySQLPageGenerator(query, site, verbose)
   2663     @return: generator which yield pywikibot.Page
   2664     """
-> 2665     from pywikibot.data import mysql
   2666 
   2667     if site is None:

/srv/paws/pwb/pywikibot/data/mysql.py in <module>()
     13     import oursql as mysqldb
     14 except ImportError:
---> 15     import MySQLdb as mysqldb
     16 
     17 import pywikibot

ModuleNotFoundError: No module named 'MySQLdb'
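
## the MySQLPageGenerator run above fails because neither oursql nor MySQLdb is installed in
## this kernel; a possible workaround (untested here) would be to install pymysql and let it
## pose as MySQLdb before pywikibot.data.mysql gets imported:
import pymysql
pymysql.install_as_MySQLdb()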
# list the categories whose file count grew by more than 8 between the two snapshots
for i in new['categories']:
    try:
        diff =  i['files'] - oldByPageid[i['pageid']]['files']
        if ( diff > 8):
            print (i['name'], ":", i['rank'], ":", diff)
    except KeyError: # category not present in the old snapshot
        pass
Category:St. Stephen's Cathedral, Vienna : 1 : 63
Category:Palace and gardens of Schönbrunn : 4 : 23
Category:Heldenplatz, Vienna : 6 : 96
Category:Capuchin Church, Vienna : 7 : 36
Category:Maria-Theresien-Platz, Vienna : 13 : 13
Category:Stift Heiligenkreuz : 16 : 13
Category:Großglockner-Hochalpenstraße : 33 : 19
Category:Stallburg : 37 : 20
Category:Stift Millstatt : 51 : 19
Category:Schloss Hof : 52 : 24
Category:Main building of the Kunsthistorisches Museum : 56 : 13
Category:Laxenburg castles : 68 : 11
Category:Mödlinger Friedhof : 71 : 49
Category:Stift Griffen : 107 : 14
Category:Feuerhalle Simmering : 228 : 10
Category:Schloss Herberstein : 310 : 27
Category:Kloster Wernberg : 472 : 10
Category:Schloss Frauenstein, Carinthia : 824 : 17
Category:Theater in der Josefstadt : 1009 : 13
Category:Waisenhaus (Mödling) : 1009 : 17
Category:Stadtpfarrkirche Mariae Himmelfahrt, Vils : 1042 : 18
Category:Persenbeug Castle : 1124 : 17
Category:Schloss Weyer, Sankt Veit an der Glan : 1753 : 10
print (oldstateDate)