import glob
# Collect every pages-meta-history dump file of the 2016-09-01 ptwiki dump.
dump_pattern = '/public/dumps/public/ptwiki/20160901/ptwiki-20160901-pages-meta-history*.xml.bz2'
paths = glob.glob(dump_pattern)

import mwxml
import re
import mwparserfromhell


def extract_refs(text):
    """Count the ``<ref>`` tags found in a piece of wikitext.

    Args:
        text: Wikitext to parse with ``mwparserfromhell``.

    Returns:
        The number of tag nodes returned by ``filter_tags()`` whose name
        is ``ref``. The name is compared case-insensitively and with
        surrounding whitespace stripped, so ``<REF>`` is counted too.
    """
    # Read the public ``tag`` attribute instead of the private ``_tag``
    # field, and normalise it to a plain lowercase string before comparing.
    return sum(
        1
        for node in mwparserfromhell.parse(text).filter_tags()
        if str(node.tag).strip().lower() == 'ref'
    )
    
def process_dump(dump, path):
    """Yield a record for every revision that changes a page's ref count.

    For each page in ``dump`` the revisions are walked in iteration order
    and the number of references (as reported by ``extract_refs``) is
    compared with the count of the previous revision of the same page.
    The count is treated as 0 before the first revision, and a revision
    without text is treated as empty text.

    Args:
        dump: Iterable of pages; each page is an iterable of revisions.
        path: Not used by this function (presumably required by the
            ``mwxml.map`` callback signature -- confirm).

    Yields:
        Tuples ``(page id, page namespace, user id, revision id,
        revision timestamp, delta)`` for revisions whose delta is
        non-zero. The user id is 0 when ``revision.user`` is falsy.
    """
    for page in dump:
        previous_refs = 0
        for revision in page:
            current_refs = extract_refs(revision.text or "")
            change = current_refs - previous_refs
            previous_refs = current_refs
            if change == 0:
                # Nothing to report when the number of refs is unchanged.
                continue
            yield (
                revision.page.id,
                revision.page.namespace,
                revision.user.id if revision.user else 0,
                revision.id,
                revision.timestamp,
                change,
            )
# Driver: stream the ref-count deltas produced by ``process_dump`` over all
# dump files, echo each row to stdout (tab-separated) and write it to
# 'data_pt_ref_full.txt' (comma-separated, one row per line).
#
# Only the first MAX_ROWS rows are emitted; this keeps the original cut-off
# (the loop used to stop once ``count > 100``, i.e. after 101 rows).
MAX_ROWS = 101

count = 0
# The context manager closes (and therefore flushes) the file even when the
# loop raises, so the previously unreachable periodic flush is not needed.
with open('data_pt_ref_full.txt', 'w') as f:
    for rev_page, page_namespace, rev_user, rev_id, rev_timestamp, delta in mwxml.map(process_dump, paths):
        fields = [
            str(v)
            for v in (
                rev_page,
                page_namespace,
                rev_user,
                rev_id,
                rev_timestamp.strftime('%d-%m-%Y'),
                delta,
            )
        ]
        print("\t".join(fields))
        # Append the newline after joining so that no stray trailing comma
        # is written at the end of each line.
        f.write(",".join(fields) + "\n")
        count += 1
        if count >= MAX_ROWS:
            break
2309292	0	5844	30353811	22-05-2012	6
2309300	0	106672	15994295	09-07-2009	3
2309309	0	553617	15994328	09-07-2009	2
2309309	0	553617	16077019	16-07-2009	1
2309309	0	511382	16824186	11-09-2009	1
158763	0	None	17590929	10-11-2009	1
158763	0	10425	18114815	22-12-2009	-1
158763	0	260498	18128358	23-12-2009	1
2309315	0	204901	32341467	24-09-2012	4
158763	0	None	18283836	05-01-2010	1
158763	0	260498	18283874	05-01-2010	-1
158763	0	10425	18399116	13-01-2010	-1
948320	0	932853	36400818	14-07-2013	1
2309317	0	None	25499145	31-05-2011	1
2309317	0	None	25654586	13-06-2011	1
2309317	0	None	25659633	13-06-2011	1
158763	0	260498	40108714	21-09-2014	1
158763	0	1068444	40510886	06-11-2014	1
2309317	0	None	33159509	03-12-2012	1
158763	0	260498	40905316	22-12-2014	-1
2309317	0	None	36707884	19-08-2013	1
158763	0	None	43076172	09-08-2015	1
158763	0	64385	43076177	09-08-2015	-1
948416	0	201391	5427997	25-03-2007	1
948416	0	201391	5428018	25-03-2007	-1
948416	0	372348	11259902	29-06-2008	2
948416	0	None	12656236	10-10-2008	-2
948416	0	446582	12656239	10-10-2008	2
2309317	0	490845	38976481	22-05-2014	1
2309317	0	787541	38977365	22-05-2014	1
2309317	0	490845	38978323	22-05-2014	-1
158774	0	293735	20621615	11-06-2010	3
158774	0	293735	20621777	11-06-2010	4
948416	0	328476	15143678	01-05-2009	12
948416	0	420082	15868387	29-06-2009	-1
158776	0	258117	9917389	26-03-2008	2
2309317	0	None	39949994	31-08-2014	-6
158776	0	72914	22639690	18-11-2010	2
2309317	0	353728	39950003	31-08-2014	6
2309317	0	None	39964278	01-09-2014	1
2309317	0	None	40739244	30-11-2014	1
2309317	0	490845	40745046	01-12-2014	-1
2309317	0	490845	40920861	24-12-2014	1
2309317	0	363551	41475151	03-03-2015	1
2309317	0	490845	41478682	04-03-2015	2
948416	0	716999	21301997	05-08-2010	-12
948416	0	716999	21302209	05-08-2010	4
2309317	0	None	42774853	04-07-2015	1
2309317	0	490845	42782205	05-07-2015	-1
948416	0	716999	21307740	05-08-2010	1
2309317	0	None	46195087	17-07-2016	-1
948416	0	716999	21307793	05-08-2010	5
2309317	0	490845	46199489	18-07-2016	1
948416	0	716999	21307866	05-08-2010	9
2309318	0	313497	15994476	09-07-2009	1
2309318	0	313497	17460120	30-10-2009	1
948416	0	716999	21308197	05-08-2010	-6
2309319	0	313497	17460045	30-10-2009	1
948416	0	716999	21308208	05-08-2010	6
948416	0	944380	29727778	15-04-2012	-2
948416	0	944380	29727813	15-04-2012	-4
948416	0	944380	29728337	15-04-2012	-14
948416	0	944380	29728364	15-04-2012	14
2309327	0	None	27136155	03-10-2011	1
2309327	0	None	27983989	15-12-2011	-1
158786	0	484966	35404847	14-04-2013	2
158788	0	15437	3027921	26-08-2006	1
158788	0	None	27840200	02-12-2011	-1
158788	0	446582	27840202	02-12-2011	1
158788	0	708269	43071111	09-08-2015	-1
158789	0	484966	23827410	05-02-2011	10
948496	1	11290	15334264	17-05-2009	1
2309336	0	None	36841743	06-09-2013	1
948942	0	353728	18597812	28-01-2010	1
220	0	151165	15688412	14-06-2009	19
949002	0	228067	22763809	28-11-2010	1
220	0	36855	16749861	06-09-2009	1
949002	0	None	45095141	16-03-2016	-1
949002	0	1294751	45095235	16-03-2016	1
949002	0	None	45603997	13-05-2016	-1
949002	0	62887	45604031	13-05-2016	1
220	0	None	17277525	15-10-2009	4
220	0	103227	17277825	15-10-2009	-4
220	0	623037	17333925	20-10-2009	-20
220	0	446582	17333936	20-10-2009	20
220	0	151165	19437573	26-03-2010	11
158806	0	109474	40094598	19-09-2014	5
949070	0	910910	42986197	29-07-2015	1
949070	0	910910	42986219	29-07-2015	1
2309342	0	584878	15994716	09-07-2009	6
2309342	0	71355	32022265	29-08-2012	-1
2309349	0	295099	45422386	23-04-2016	1
2309360	0	100858	40889590	19-12-2014	1
158822	0	709056	27930781	10-12-2011	1
158822	0	709056	27931860	10-12-2011	1
158822	0	709056	27933679	10-12-2011	1
158822	0	None	32728887	25-10-2012	1
158822	0	1416485	41320886	12-02-2015	1
2309374	0	100858	41090443	15-01-2015	1
158822	0	1730656	45217415	31-03-2016	2
2309378	0	5529	15997208	09-07-2009	1