import glob

# Locate the Finnish Wikipedia (fiwiki) full-history dump from 2016-08-01.
# Earlier attempts looked for this file under the "dewiki" directory, which
# matched nothing (glob returned an empty list); only the fiwiki directory
# is searched now.
paths = glob.glob('/public/dumps/public/fiwiki/20160801/fiwiki-20160801-pages-meta-history.xml.bz2')
paths

import mwxml
import re

EXTS = ["png", "gif", "jpg", "jpeg"]
# Matches image links of the form [[<prefix>:<file>.<ext>|<options>]].
#   Group 1: namespace prefix (file, image, afbeelding or bestand)
#   Group 2: file name including its extension (read by extract_image_links)
#   Group 3: the extension alone
#   Group 4: optional "|..." parameter section (None when absent)
# The file name may not contain "|" so that caption text which happens to
# mention an extension is never absorbed into group 2, and the "." before
# the extension is escaped so it only matches a literal dot.  Matching is
# case-insensitive because wikitext commonly uses "File:" and ".JPG".
IMAGE_LINK_RE = re.compile(r"\[\[" +
                           r"(file|image|afbeelding|bestand):" +  # Group 1
                           r"([^\]|]+\.(" + "|".join(EXTS) + r"))" +  # Group 2 & 3
                           r"(\|[^\]]+)?" +  # Group 4
                           r"\]\]",
                           re.IGNORECASE)

def extract_image_links(text):
    """Lazily yield group 2 of every IMAGE_LINK_RE match found in *text*."""
    yield from (match.group(2) for match in IMAGE_LINK_RE.finditer(text))
    
def process_dump(dump, path):
    """Yield (revision id, timestamp, delta) for revisions that change the image-link count.

    For each page in *dump* the number of image links found in every
    revision's text is compared with the previous revision of the same page
    (the count starts at 0 for each page).  A tuple is yielded only when the
    difference is non-zero.

    *path* is not used in the body; the function is handed to ``mwxml.map``,
    which presumably supplies it -- confirm against the mwxml docs.
    """
    for page in dump:
        previous = 0
        for revision in page:
            # A revision's text may be empty/None; treat it as no text.
            wikitext = revision.text or ""
            current = sum(1 for _ in extract_image_links(wikitext))
            change = current - previous
            if change:
                yield revision.id, revision.timestamp, change
            previous = current
# Preview the output: print tab-separated rows and stop once the running
# row count exceeds 10.
count = 0
for count, (rev_id, rev_timestamp, delta) in enumerate(
        mwxml.map(process_dump, paths), start=1):
    print(rev_id, rev_timestamp, delta, sep="\t")
    if not count <= 10:
        break
import glob

# Collect every nlwiki 2016-08-01 full-history dump file; the "*" wildcard
# lets the pattern match more than one file.
paths = glob.glob(
    '/public/dumps/public/nlwiki/20160801/nlwiki-20160801-pages-meta-history*.xml.bz2'
)
paths