import mwxml
import gzip
import dateutil.parser
import glob
from collections import Counter
from sqlalchemy import create_engine
import sys, os
import pandas as pd

constr = 'mysql+pymysql://{user}:{pwd}@{host}'.format(user=os.environ['MYSQL_USERNAME'],
                                                      pwd=os.environ['MYSQL_PASSWORD'],
                                                      host=os.environ['MYSQL_HOST'])
con = create_engine(constr)
revisions = mwxml.Dump.from_file(gzip.open('/public/dumps/public/enwiki/20180601/enwiki-20180601-stub-meta-history.xml.gz'))
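# The cell that produced the Page/Timestamp output below is missing from this
# excerpt; a plausible reconstruction, assuming we simply peek at the first
# page and its first revision in the stub dump:
page = next(iter(revisions))
rev = next(iter(page))
page, (rev.timestamp, rev.timestamp.unix())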
    
Page(id=626, title='Auteur Theory Film', namespace=0, redirect='Auteur', restrictions=[]) (Timestamp('2001-03-08T00:27:40Z'), 984011260)
users = pd.read_sql("""SELECT ukuser.user_id AS uk_id,
       enuser.user_id AS en_id,
       ukuser.user_name AS username
FROM ukwiki_p.user ukuser
JOIN enwiki_p.user enuser ON ukuser.user_name = enuser.user_name
WHERE ukuser.user_editcount BETWEEN 10 AND 1000
  AND enuser.user_editcount BETWEEN 10 AND 1000""", con)
en_users = set(users['en_id'])
len(en_users)
2763
uk_originals = pd.read_sql("""
SELECT ips_item_id AS wikidataId,
       ips_site_page AS uktitle,
       ukwiki_p.page.page_id AS ukpage_id,
       english.entitle AS entitle,
       min(ukwiki_p.revision.rev_timestamp) AS uk_created
FROM wikidatawiki_p.wb_items_per_site
   INNER JOIN
     (SELECT ips_item_id AS enwikidata, ips_site_page AS entitle
      FROM wikidatawiki_p.wb_items_per_site
      WHERE ips_site_id = 'enwiki') AS english
     ON ips_item_id = english.enwikidata
   INNER JOIN ukwiki_p.page ON ukwiki_p.page.page_title = ips_site_page
   INNER JOIN ukwiki_p.revision ON ukwiki_p.page.page_id = ukwiki_p.revision.rev_page
WHERE ips_site_id = 'ukwiki'
GROUP BY uktitle""", con)
uk_originals
# Key both lookups by the decoded English title: when the Ukrainian article
# was created, and its ukwiki page id.
uk_created = {row.entitle.decode(): dateutil.parser.parse(row.uk_created).timestamp()
              for _, row in uk_originals.iterrows()}
ukpage_id = {row.entitle.decode(): row.ukpage_id
             for _, row in uk_originals.iterrows()}
uk_created['Biocybernetics'], ukpage_id['Biocybernetics']
(1177356881.0, 158009)
def filt(pages, path):
    """mwxml map function: yield English pages that also exist on ukwiki and
    were edited by at least one of the cross-wiki users in en_users."""
    for page in pages:
        if page.title not in uk_created:
            continue

        revs = list(page)
        en_created = min(rev.timestamp.unix() for rev in revs)
        #print(page.title, len(revs), en_created, uk_created[page.title])
        #if en_created < uk_created[page.title]:
        #    continue

        # Registered editors of this page who are also in the cross-wiki set.
        authors = {rev.user.id for rev in revs if rev.user}
        #print(authors, authors & en_users)
        if not authors & en_users:
            continue

        yield (page.id, ukpage_id[page.title], page.title, en_created,
               uk_created[page.title], authors & en_users)
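# filt is defined above but never run in this excerpt; a minimal sketch of how
# it could be driven with mwxml.map, mirroring the en-revisions.tsv loop below.
# The dump file pattern and the output name 'uk-candidates.tsv' are assumptions,
# not from the original.
with open('uk-candidates.tsv', 'w') as f:
    for r in mwxml.map(filt, glob.glob('/public/dumps/public/enwiki/20180601/enwiki-20180601-stub-meta-history*.xml.gz')):
        f.write('\t'.join(map(str, r)) + '\n')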
def map_(pages, path):
    """mwxml map function: yield one row per main-namespace revision with
    user, page, and revision metadata."""
    for page in pages:
        for rev in page:
            # Skip revisions with a deleted or missing user/page record.
            if not rev.user or not rev.page:
                continue

            if rev.page.namespace != 0:
                continue

            yield (rev.user.id, rev.user.text, rev.page.id,
                   rev.page.redirect or rev.page.title, rev.minor,
                   rev.comment, rev.bytes, rev.timestamp.unix())
files = glob.glob('/public/dumps/public/enwiki/20181120/enwiki-20181120-stub-meta-history[1-9]*.xml.gz')
#files = [gzip.open(f) for f in files]
# Stream every stub-meta-history part through map_ and write one TSV row per revision.
with open('en-revisions.tsv', 'w', buffering=100) as f:
    for filename in files:
        print(filename)
        for r in mwxml.map(map_, [filename]):
            f.write('\t'.join(map(str, r)) + '\n')
/public/dumps/public/enwiki/20181120/enwiki-20181120-stub-meta-history18.xml.gz
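# The TSV written above has no header row; a sketch of loading it back for
# analysis (the column names here are mine, matching the yield order in map_):
revs = pd.read_csv('en-revisions.tsv', sep='\t', header=None,
                   names=['user_id', 'user_text', 'page_id', 'page',
                          'minor', 'comment', 'bytes', 'timestamp'])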