import glob
# Collect the bz2-compressed "pages-meta-history" XML files of the
# 2016-08-01 nlwiki dump; the list is empty if nothing matches the pattern.
paths = glob.glob('/public/dumps/public/nlwiki/20160801/nlwiki-20160801-pages-meta-history*.xml.bz2')

import mwxml
import re
import mwparserfromhell


def extract_refs(text):
    """Count the ``ref`` tags found in a piece of wikitext.

    Args:
        text: Wikitext to parse (a string; callers pass ``""`` for
            revisions without text).

    Returns:
        The number of tag nodes returned by ``filter_tags()`` whose tag
        name equals ``ref``, compared case-insensitively.
    """
    # Use the public ``tag`` property rather than the private ``_tag``
    # attribute. The tag name keeps its original spelling, so normalise it
    # before comparing; otherwise ``<REF>`` / ``<Ref>`` would be missed.
    return sum(
        1
        for node in mwparserfromhell.parse(text).filter_tags()
        if str(node.tag).strip().lower() == 'ref'
    )
    
def process_dump(dump, path):
    """Yield one record for every revision that changes a page's ref count.

    For each page in ``dump`` the revisions are visited in iteration order
    and the result of ``extract_refs`` is compared with that of the
    previous revision (starting from 0 for each page).

    Args:
        dump: Iterable of pages, each of which is an iterable of revisions.
        path: Not used by this function.

    Yields:
        Tuples ``(page id, page namespace, user id, revision id,
        revision timestamp, delta)`` where ``delta`` is the non-zero change
        in ref count. The user id is 0 when ``revision.user`` is falsy.
    """
    for page in dump:
        previous = 0
        for revision in page:
            current = extract_refs(revision.text or "")
            change = current - previous
            previous = current
            # Revisions that leave the count untouched produce no record.
            if change == 0:
                continue
            yield (
                revision.page.id,
                revision.page.namespace,
                revision.user.id if revision.user else 0,
                revision.id,
                revision.timestamp,
                change,
            )
# Run process_dump over every dump file via mwxml.map, echo each record to
# stdout (tab-separated) and write it to data_nl_ref_full.txt
# (comma-separated). The file is opened in 'w' mode, so any previous
# content is replaced.
count = 0
# The context manager guarantees the file is closed even if processing a
# dump raises part-way through.
with open('data_nl_ref_full.txt', 'w') as f:
    for rev_page, page_namespace, rev_user, rev_id, rev_timestamp, delta in mwxml.map(process_dump, paths):
        fields = [rev_page, page_namespace, rev_user, rev_id, rev_timestamp.strftime('%d-%m-%Y'), delta]
        print("\t".join(str(v) for v in fields))
        # NOTE(review): the newline is joined in as a final field, so every
        # line ends with ",\n" (a trailing comma). Kept byte-for-byte so
        # existing consumers of the file are not broken -- confirm before
        # changing.
        f.write(",".join(str(v) for v in fields + ['\n']))
        count += 1
        # Push buffered rows to the file every 500 records.
        if count % 500 == 0:
            f.flush()
 
 
import glob
# Collect the bz2-compressed "pages-meta-history" XML files of the
# 2016-09-01 nlwiki dump; the list is empty if nothing matches the pattern.
paths = glob.glob('/public/dumps/public/nlwiki/20160901/nlwiki-20160901-pages-meta-history*.xml.bz2')

import mwxml
import re
import mwparserfromhell


def extract_refs(text):
    """Count the ``ref`` tags found in a piece of wikitext.

    Args:
        text: Wikitext to parse (a string; callers pass ``""`` for
            revisions without text).

    Returns:
        The number of tag nodes returned by ``filter_tags()`` whose tag
        name equals ``ref``, compared case-insensitively.
    """
    # Use the public ``tag`` property rather than the private ``_tag``
    # attribute. The tag name keeps its original spelling, so normalise it
    # before comparing; otherwise ``<REF>`` / ``<Ref>`` would be missed.
    return sum(
        1
        for node in mwparserfromhell.parse(text).filter_tags()
        if str(node.tag).strip().lower() == 'ref'
    )
    
def process_dump(dump, path):
    """Yield one record for every revision that changes a page's ref count.

    For each page in ``dump`` the revisions are visited in iteration order
    and the result of ``extract_refs`` is compared with that of the
    previous revision (starting from 0 for each page).

    Args:
        dump: Iterable of pages, each of which is an iterable of revisions.
        path: Not used by this function.

    Yields:
        Tuples ``(page id, page namespace, user id, revision id,
        revision timestamp, delta)`` where ``delta`` is the non-zero change
        in ref count. The user id is 0 when ``revision.user`` is falsy.
    """
    for page in dump:
        previous = 0
        for revision in page:
            current = extract_refs(revision.text or "")
            change = current - previous
            previous = current
            # Revisions that leave the count untouched produce no record.
            if change == 0:
                continue
            yield (
                revision.page.id,
                revision.page.namespace,
                revision.user.id if revision.user else 0,
                revision.id,
                revision.timestamp,
                change,
            )
# Run process_dump over every dump file via mwxml.map and write each record
# to data_nl_ref_full.txt (comma-separated). The file is opened in 'w'
# mode, so any previous content is replaced.
count = 0
# The context manager guarantees the file is closed even if processing a
# dump raises part-way through.
with open('data_nl_ref_full.txt', 'w') as f:
    for rev_page, page_namespace, rev_user, rev_id, rev_timestamp, delta in mwxml.map(process_dump, paths):
        fields = [rev_page, page_namespace, rev_user, rev_id, rev_timestamp.strftime('%d-%m-%Y'), delta]
        # NOTE(review): the newline is joined in as a final field, so every
        # line ends with ",\n" (a trailing comma). Kept byte-for-byte so
        # existing consumers of the file are not broken -- confirm before
        # changing.
        f.write(",".join(str(v) for v in fields + ['\n']))
        count += 1
        # Push buffered rows to the file every 500 records.
        if count % 500 == 0:
            f.flush()
# NOTE(review): this looks like a leftover interactive check (a statement,
# an echo of its value, and the echoed output) -- confirm whether it can be
# removed. As written it rebinds `paths` to the 2016-08-01 file pattern.
paths = glob.glob('/public/dumps/public/nlwiki/20160801/nlwiki-20160801-pages-meta-history*.xml.bz2')
# The two bare expressions below have no effect when run as a script.
paths
[]