import glob
# Collect every bz2-compressed part of the dewiki 2016-09-01 full-history dump.
paths = glob.glob(
    '/public/dumps/public/dewiki/20160901/'
    'dewiki-20160901-pages-meta-history*.xml-*.bz2'
)

import mwxml
import re
import mwparserfromhell


def extract_refs(text):
    """Count the ``<ref>`` tags in a piece of wikitext.

    Args:
        text: Wikitext to inspect, as a string.

    Returns:
        The number of tag nodes reported by ``filter_tags()`` whose name
        is ``ref``, compared case-insensitively.
    """
    tags = mwparserfromhell.parse(text).filter_tags()
    # Read the public ``tag`` attribute instead of the private ``_tag``, and
    # normalise it so that spellings such as <REF> or <Ref> are not missed.
    return sum(1 for node in tags if str(node.tag).strip().lower() == 'ref')
    
def process_dump(dump, path):
    """Yield a record for each revision that changes a page's ref count.

    Written to be passed to ``mwxml.map``. ``path`` is accepted but not
    used by this function.

    Args:
        dump: Iterable of pages, each of which iterates over its revisions.
        path: Unused.

    Yields:
        Tuples of ``(page id, page namespace, user id, revision id,
        revision timestamp, delta)``, where ``delta`` is the non-zero
        difference in ref count relative to the previous revision of the
        same page. The user id is ``0`` when the revision has no user.
    """
    for page in dump:
        # Each page starts from a baseline of zero refs.
        previous = 0
        for revision in page:
            current = extract_refs(revision.text or "")
            change = current - previous
            previous = current
            if change == 0:
                continue

            if revision.user:
                user_id = revision.user.id
            else:
                user_id = 0

            yield (
                revision.page.id,
                revision.page.namespace,
                user_id,
                revision.id,
                revision.timestamp,
                change,
            )
# Stream the ref-count deltas of every dewiki revision to stdout (tab
# separated) and to data_de_ref_full.txt (comma separated).
count = 0
# The context manager closes the file (writing out anything still buffered)
# even if dump processing raises part-way through.
with open('data_de_ref_full.txt', 'w') as f:
    for rev_page, page_namespace, rev_user, rev_id, rev_timestamp, delta in mwxml.map(process_dump, paths):
        fields = [
            str(v)
            for v in (rev_page, page_namespace, rev_user, rev_id,
                      rev_timestamp.strftime('%d-%m-%Y'), delta)
        ]
        print("\t".join(fields))
        # Each line deliberately keeps the trailing comma before the newline
        # so the file layout stays identical to what earlier runs produced.
        f.write(",".join(fields) + ",\n")
        count += 1
        # Flush periodically so partial results survive an interrupted run.
        if count % 500 == 0:
            f.flush()
8042286	0	624607	126087793	03-01-2014	5
4964505	0	607738	68888926	06-01-2010	1
4964505	0	607738	73854748	02-05-2010	1
8042287	2	373504	126087805	03-01-2014	13
8042288	0	1614807	126087948	03-01-2014	5
8042288	0	459705	136108891	23-11-2014	1
8042288	0	96428	152718972	21-03-2016	1
8042288	0	217280	153325162	09-04-2016	1
8042289	0	1718302	126087986	03-01-2014	1
8042290	0	423759	126087998	03-01-2014	2
 
 
import glob
# Collect every bz2-compressed part of the enwiki 2016-09-01 full-history dump.
paths = glob.glob(
    '/public/dumps/public/enwiki/20160901/'
    'enwiki-20160901-pages-meta-history*.xml-*.bz2'
)

import mwxml
import re
import mwparserfromhell


def extract_refs(text):
    """Count the ``<ref>`` tags in a piece of wikitext.

    Args:
        text: Wikitext to inspect, as a string.

    Returns:
        The number of tag nodes reported by ``filter_tags()`` whose name
        is ``ref``, compared case-insensitively.
    """
    tags = mwparserfromhell.parse(text).filter_tags()
    # Read the public ``tag`` attribute instead of the private ``_tag``, and
    # normalise it so that spellings such as <REF> or <Ref> are not missed.
    return sum(1 for node in tags if str(node.tag).strip().lower() == 'ref')
    
def process_dump(dump, path):
    """Yield a record for each revision that changes a page's ref count.

    Written to be passed to ``mwxml.map``. ``path`` is accepted but not
    used by this function.

    Args:
        dump: Iterable of pages, each of which iterates over its revisions.
        path: Unused.

    Yields:
        Tuples of ``(page id, page namespace, user id, revision id,
        revision timestamp, delta)``, where ``delta`` is the non-zero
        difference in ref count relative to the previous revision of the
        same page. The user id is ``0`` when the revision has no user.
    """
    for page in dump:
        # Each page starts from a baseline of zero refs.
        seen_refs = 0
        for revision in page:
            now_refs = extract_refs(revision.text or "")
            diff = now_refs - seen_refs
            seen_refs = now_refs
            if diff == 0:
                continue

            if revision.user:
                editor_id = revision.user.id
            else:
                editor_id = 0

            yield (
                revision.page.id,
                revision.page.namespace,
                editor_id,
                revision.id,
                revision.timestamp,
                diff,
            )
# Stream the ref-count deltas of every enwiki revision to
# data_en_ref_full.txt (comma separated). Nothing is echoed to stdout here.
count = 0
# The context manager closes the file (writing out anything still buffered)
# even if dump processing raises part-way through.
with open('data_en_ref_full.txt', 'w') as f:
    for rev_page, page_namespace, rev_user, rev_id, rev_timestamp, delta in mwxml.map(process_dump, paths):
        fields = [
            str(v)
            for v in (rev_page, page_namespace, rev_user, rev_id,
                      rev_timestamp.strftime('%d-%m-%Y'), delta)
        ]
        # Each line deliberately keeps the trailing comma before the newline
        # so the file layout stays identical to what earlier runs produced.
        f.write(",".join(fields) + ",\n")
        count += 1
        # Flush periodically so partial results survive an interrupted run.
        if count % 500 == 0:
            f.flush()
# Exploratory check: glob for nlwiki 2016-08-01 full-history dump files.
# NOTE(review): this pattern ends in ".xml.bz2", unlike the "*.xml-*.bz2"
# patterns used for dewiki/enwiki -- confirm which file layout was intended.
paths = glob.glob('/public/dumps/public/nlwiki/20160801/nlwiki-20160801-pages-meta-history*.xml.bz2')
paths
# The bare `[]` below appears to be the recorded notebook output: no matches.
[]
# Re-point `paths` at the dewiki 2016-09-01 history parts and display them.
paths = glob.glob('/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history*.xml-*.bz2')
paths
['/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history4.xml-p004964502p005397066.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history4.xml-p008042286p009043989.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history3.xml-p003822171p004358274.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history1.xml-p000000001p000008196.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history3.xml-p003211964p003822169.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history1.xml-p000008197p000021172.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history1.xml-p000055572p000118657.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history4.xml-p005875665p006359556.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history2.xml-p000610687p000794601.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history4.xml-p007165977p008042283.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history1.xml-p000400461p000425449.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history2.xml-p000794602p001069677.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history1.xml-p000021173p000055571.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history2.xml-p000425451p000610686.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history1.xml-p000118658p000218045.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history3.xml-p002737370p003211957.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history1.xml-p000218046p000400460.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history4.xml-p009043990p009572983.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history3.xml-p004358276p004964500.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history2.xml-p001411674p001833566.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history2.xml-p001833569p001877043.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history4.xml-p006770728p007165976.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history3.xml-p001877047p002312677.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history3.xml-p002312678p002737368.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history4.xml-p006359557p006770727.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history2.xml-p001069678p001411671.bz2',
 '/public/dumps/public/dewiki/20160901/dewiki-20160901-pages-meta-history4.xml-p005397068p005875664.bz2']