# In PAWS
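# Pull page, link, and first-revision data from the Wikimedia database replicas
# and export the results as CSV files for later analysis.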

from sqlalchemy import create_engine
import os
import pandas as pd

constr = 'mysql+pymysql://{user}:{pwd}@{host}'.format(
    user=os.environ['MYSQL_USERNAME'],
    pwd=os.environ['MYSQL_PASSWORD'],
    host=os.environ['MYSQL_HOST'],
)
con = create_engine(constr)
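# For every Wikidata item that has both a ukwiki and an enwiki sitelink, fetch
# the two page titles, the enwiki page id, and the timestamp of the first
# enwiki revision.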
q_en_uk = """
SELECT  
    en.ips_item_id as id
    ,uk.ips_site_page as uk_page_title
    ,en.ips_site_page as en_page_title
    ,p.page_id as en_page_id
    ,cast(min(r.rev_timestamp) as DATETIME) as en_first_rev 
FROM 
    wikidatawiki_p.wb_items_per_site as uk
join wikidatawiki_p.wb_items_per_site as en
    on uk.ips_item_id = en.ips_item_id
    and uk.ips_site_id= 'ukwiki'  
    and en.ips_site_id = 'enwiki'  
join enwiki_p.page as p
    on p.page_title = en.ips_site_page
join enwiki_p.revision as r 
    on r.rev_page = p.page_id
    and r.rev_parent_id = 0
group by
    en.ips_item_id
    ,uk.ips_site_page
    ,en.ips_site_page
    ,p.page_id
"""
d_en_uk = pd.read_sql(q_en_uk, con)
d_en_uk['uk_page_title'] = d_en_uk['uk_page_title'].str.decode('utf-8')
d_en_uk['en_page_title'] = d_en_uk['en_page_title'].str.decode('utf-8')
print(d_en_uk.shape)
d_en_uk.head(5)
d_en_uk.to_csv("en_uk.csv")
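# Map ukwiki page titles (via their Wikidata sitelinks) to ukwiki page ids.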
q_uk = """
SELECT
    uk.ips_site_page as uk_page_title
    ,p.page_id as uk_page_id
FROM
    wikidatawiki_p.wb_items_per_site as uk
join ukwiki_p.page as p
    on p.page_title = uk.ips_site_page
    and uk.ips_site_id = 'ukwiki'
"""
d_uk = pd.read_sql(q_uk, con)
print(d_uk.shape)
d_uk.head(5)
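# Count incoming links for every ukwiki page title (link targets in pagelinks).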
q_uk_incoming_links = """
select 
    pl.pl_title as uk_page_title
    ,count(*) as uk_incoming_links
from 
    ukwiki_p.pagelinks as pl
group by 
    pl.pl_title
    """
d_uk_incoming_links = pd.read_sql(q_uk_incoming_links, con)
d_uk_incoming_links['uk_page_title'] = d_uk_incoming_links['uk_page_title'].str.decode('utf-8')
d_uk_incoming_links.to_csv("uk_incoming_links.csv")
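# Count outgoing links from every ukwiki page, grouped by source page id.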
q_uk_outgoing_links = """
select 
    pl.pl_from as uk_page_id
    ,count(*) as uk_outgoing_links
from 
    ukwiki_p.pagelinks as pl
group by 
    pl.pl_from
    """
d_uk_outgoing_links = pd.read_sql(q_uk_outgoing_links, con)
d_uk_outgoing_links.to_csv("uk_outgoing_links.csv")
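# All mainspace (namespace 0) enwiki pages: page id and title.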
q_en_names = """
select 
    p.page_id as en_page_id
    ,p.page_title as en_page_title
from 
    enwiki_p.page p 
where
    p.page_namespace=0
    """
df_en_names = pd.read_sql(q_en_names, con)
df_en_names['en_page_title'] = df_en_names['en_page_title'].str.decode('utf-8')
df_en_names.head()
df_en_names.to_csv("en.csv")
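# All mainspace (namespace 0) ukwiki pages: page id and title.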
q_uk = """
SELECT uk.page_id as uk_page_id,
       uk.page_title as uk_page_title
from ukwiki_p.page as uk
where page_namespace=0
"""
df_uk = pd.read_sql(q_uk, con)
df_uk['uk_page_title'] = df_uk['uk_page_title'].str.decode('utf-8')
df_uk.to_csv("uk.csv")
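# First revisions (rev_parent_id = 0) of enwiki pages, pulled in 15M-row
# batches via LIMIT/OFFSET.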
q_en_revisions3 = """
SELECT r.rev_page as en_page_id,
       cast(r.rev_timestamp as DATETIME) as en_first_edit
from 
    enwiki_p.revision as r 
where 
    r.rev_parent_id=0
LIMIT 15000000 OFFSET 30000000
"""
df_en_revisions3 = pd.read_sql(q_en_revisions3, con)
df_en_revisions3.to_csv("en_revisions3.csv")
df_en_revisions3.shape
df_en_revisions3.head()
# Sample output of df_en_revisions3.head():
#    en_page_id        en_first_edit
# 0    40064074  2013-07-25 05:39:02
# 1    40064075  2013-07-25 05:39:04
# 2    40064076  2013-07-25 05:39:07
# 3    40064077  2013-07-25 05:39:10
# 4    40064078  2013-07-25 05:39:13
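# Second 15M-row batch of first enwiki revisions (OFFSET 15000000).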
q_en_revisions2 = """
SELECT r.rev_page as en_page_id,
       cast(r.rev_timestamp as DATETIME) as en_first_edit
from 
    enwiki_p.revision as r 
where 
    r.rev_parent_id=0
LIMIT 15000000 OFFSET 15000000
"""
df_en_revisions2 = pd.read_sql(q_en_revisions2, con)
df_en_revisions2.to_csv("en_revisions2.csv")
df_en_revisions2.shape
# (15000000, 2)
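# A possible way to combine the exported CSVs into a single table (a sketch,
# not part of the run above; it assumes the files written earlier exist, and
# page titles may need normalizing (spaces vs. underscores) before merging).
en_uk = pd.read_csv("en_uk.csv", index_col=0)
uk_pages = pd.read_csv("uk.csv", index_col=0)
uk_in = pd.read_csv("uk_incoming_links.csv", index_col=0)
uk_out = pd.read_csv("uk_outgoing_links.csv", index_col=0)

# Attach per-page link counts to the ukwiki page table, then join onto the
# item-level en/uk table by Ukrainian page title.
uk_meta = (
    uk_pages
    .merge(uk_in, on="uk_page_title", how="left")
    .merge(uk_out, on="uk_page_id", how="left")
)
merged = en_uk.merge(uk_meta, on="uk_page_title", how="left")
print(merged.shape)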