import glob
import gzip
import os
import sys
from collections import Counter

import dateutil
import dateutil.parser  # `import dateutil` alone does not load the parser submodule
import mwxml
import pandas as pd
from sqlalchemy import create_engine

# Connection string for the Wikimedia analytics MySQL replicas.
# NOTE(review): the original line was truncated mid-.format(); the password
# and host are assumed to come from the environment like the username —
# confirm against the notebook this was exported from.
constr = 'mysql+pymysql://{user}:{pwd}@{host}'.format(
    user=os.environ['MYSQL_USERNAME'],
    pwd=os.environ['MYSQL_PASSWORD'],
    host=os.environ['MYSQL_HOST'])
con = create_engine(constr)

# Full stub-meta-history dump (revision metadata only, no article text).
# Dump.from_file() wants a file object and the dump is gzip-compressed, so
# open it with gzip (imported at the top; otherwise unused — evidence this
# was the intent). The original passed a bare path and had an extra ')'.
revisions = mwxml.Dump.from_file(
    gzip.open('/public/dumps/public/enwiki/20180601/enwiki-20180601-stub-meta-history.xml.gz'))
# NOTE(review): `revisions` is never used later in this file — kept only for
# parity with the original notebook cell.
# Users registered on BOTH ukwiki and enwiki with a moderate edit count
# (10..1000 on each wiki) — the candidate pool of bilingual editors.
_bilingual_users_sql = """select ukuser.user_id as uk_id,
enuser.user_id as en_id,
ukuser.user_name as username from ukwiki_p.user ukuser
join enwiki_p.user enuser ON ukuser.user_name = enuser.user_name
where ukuser.user_editcount between 10 and 1000 and enuser.user_editcount between 10 and 1000"""
users = pd.read_sql(_bilingual_users_sql, con)

# enwiki user ids as a set for O(1) membership tests in filt() below.
en_users = set(users['en_id'])
# For every ukwiki article linked (via Wikidata sitelinks) to an enwiki
# article: the Wikidata item, the ukwiki title and page id, the English
# title, and the timestamp of the first ukwiki revision (creation time).
# NOTE(review): the original SQL was truncated in several places (a missing
# column, a missing INNER JOIN keyword, and two joins with missing
# table/column names). The ukwiki_p.page / revision joins below are a
# reconstruction — confirm against the MediaWiki schema
# (page.page_id / page.page_title, revision.rev_page,
# wb_items_per_site.ips_item_id / ips_site_id / ips_site_page).
uk_originals = pd.read_sql("""
SELECT ips_item_id as wikidataId, ips_site_page AS uktitle, page_id as ukpage_id, english.entitle as entitle, min(ukwiki_p.revision.rev_timestamp) as uk_created
FROM wikidatawiki_p.wb_items_per_site
   INNER JOIN
  	(SELECT ips_item_id as enwikidata, ips_site_page as entitle FROM wikidatawiki_p.wb_items_per_site WHERE ips_site_id= 'enwiki')
     as english
     on ips_item_id=english.enwikidata
   INNER JOIN ukwiki_p.page ON page_title = ips_site_page
   INNER JOIN ukwiki_p.revision ON page_id = ukwiki_p.revision.rev_page
   WHERE ips_site_id= 'ukwiki'
   GROUP BY uktitle""", con)
# Index the join result by English title for O(1) lookups in filt()/map_().
# Titles come back from the MySQL replica as bytes, hence .decode().
# NOTE(review): dateutil.parser.parse() assumes uk_created arrives as a
# parseable string; if the driver returns bytes (b'20070423171441') this
# needs a .decode() first — confirm against the connector.
# NOTE(review): .timestamp() on a naive datetime uses the LOCAL timezone,
# while rev.timestamp.unix() below is UTC — confirm the host runs UTC.
uk_created = {}
ukpage_id = {}
for _, row in uk_originals.iterrows():  # single pass builds both tables
    entitle = row.entitle.decode()
    uk_created[entitle] = dateutil.parser.parse(row.uk_created).timestamp()
    ukpage_id[entitle] = row.ukpage_id
# e.g. uk_created['Biocybernetics'] -> 1177356881.0, ukpage_id -> 158009
def filt(pages, path):
    """Yield a summary row for each enwiki page with a ukwiki counterpart.

    pages -- iterable of mwxml.Page from a stub-history dump shard
    path  -- dump file path (unused; required by the mwxml.map interface)

    Yields (en_page_id, uk_page_id, title, en_created, uk_created,
    shared_authors) where shared_authors is the set of user ids that edited
    the enwiki page AND are in the bilingual `en_users` pool.

    NOTE(review): several expressions were missing in the original export;
    the two `continue` guards, the author-id set expression and the first
    yield element are reconstructions — confirm against the notebook.
    """
    for page in pages:
        # Only pages that also exist on ukwiki are interesting.
        if page.title not in uk_created:
            continue

        revs = list(page)
        # Earliest revision timestamp == page creation (unix seconds, UTC).
        en_created = min(rev.timestamp.unix() for rev in revs)
        #print(page.title, len(revs), en_created, uk_created[page.title])
        #if en_created < uk_created[page.title]:
        #    continue

        # Registered editors of this page (anonymous revisions have no user).
        authors = {rev.user.id for rev in revs if rev.user}
        #print(authors, authors & en_users)
        if not authors & en_users:
            continue

        yield (page.id, ukpage_id[page.title], page.title, en_created,
               uk_created[page.title], authors & en_users)
def map_(pages, path):
    """Yield one TSV-ready tuple per registered-user revision in namespace 0.

    pages -- iterable of mwxml.Page from a stub-history dump shard
    path  -- dump file path (unused; required by the mwxml.map interface)

    Yields (page_id, username, user_id, parent_rev_id, minor, comment,
    bytes, unix_timestamp) per revision.

    NOTE(review): the original export was missing several expressions; the
    namespace guard, the user-id guard, and the first/third/fourth tuple
    fields are reconstructions — confirm against the original notebook.
    """
    for page in pages:
        # Articles only (namespace 0); skip talk/user/etc. pages wholesale.
        if page.namespace != 0:
            continue
        for rev in page:
            # Skip anonymous edits: no user record, or no registered id.
            if not rev.user or not rev.user.id:
                continue
            yield (page.id, rev.user.text, rev.user.id, rev.parent_id or 0,
                   rev.minor, rev.comment, rev.bytes, rev.timestamp.unix())
# All shards of the 2018-11-20 enwiki stub-meta-history dump.
files = glob.glob('/public/dumps/public/enwiki/20181120/enwiki-20181120-stub-meta-history[1-9]*.xml.gz')

# Stream every qualifying revision row to TSV, one dump shard at a time.
# NOTE(review): the callable in the mwxml.map() call was missing in the
# original; map_ matches mwxml.map's (dump, path) processor signature —
# confirm it was not filt that was intended here.
with open('en-revisions.tsv', 'w', buffering=100) as f:
    for filename in files:
        for r in mwxml.map(map_, [filename]):
            f.write('\t'.join(map(str, r)) + '\n')