#In PAWS
from sqlalchemy import create_engine
import sys, os
import pandas as pd
constr = 'mysql+pymysql://{user}:{pwd}@{host}'.format(user=os.environ['MYSQL_USERNAME'],pwd=os.environ['MYSQL_PASSWORD'],host=os.environ['MYSQL_HOST'])
con = create_engine(constr)
q_2 = """
SELECT  * FROM ukwiki_p.page p WHERE p.page_id = 1075363 limit 1000;
"""
d2 = pd.read_sql(q_2, con)
d2.head()
page_id page_namespace page_title page_restrictions page_counter page_is_redirect page_is_new page_random page_touched page_links_updated page_latest page_len page_no_title_convert page_content_model page_lang
0 1075363 0 b'\xd0\xa4\xd1\x80\xd0\xb0\xd0\xbd\xd0\xba\xd0... b'' 0 1 1 0.748187 b'20180226211853' None 7578259 64 0 b'wikitext' None
q_2 = """
SELECT  r.rev_page FROM ukwiki_p.revision r group by r.rev_page
"""
d2 = pd.read_sql(q_2, con)
d2.head()
rev_page
0 0
1 1
2 2
3 3
4 4
d2.shape
(2427824, 1)
q_2 = """
SELECT *  FROM ukwiki_p.revision r where r.rev_parent_id = 0
"""
d2 = pd.read_sql(q_2, con)
d2.head()
rev_id rev_page rev_text_id rev_comment_id rev_comment rev_user rev_user_text rev_timestamp rev_minor_edit rev_deleted rev_len rev_parent_id rev_sha1 rev_content_model rev_content_format
0 1 3 0.0 0.0 b'*' 0.0 b'flets-a-west-14-238.dsn.jp' b'20031227004500' 1 0 10773.0 0 b'r0sva25xkmcm3o6heygbwi7v6t2u6ux' None None
1 12 13 0.0 0.0 b'' 0.0 b'202.216.55.46' b'20040126110432' 0 0 1044.0 0 b'4gfap67wt1p6vwschint0oybigusmbl' None None
2 13 584 0.0 0.0 b'' 0.0 b'61.125.212.32' b'20040130024256' 0 0 545.0 0 b'6t9e0qzyq99kybmn6w62ymppgv50j89' None None
3 18 586 0.0 0.0 b'' 3.0 b'Yuri koval' b'20040130051203' 0 0 428.0 0 b'p0iff8txekqscwnrvwsttqjuokmkzsm' None None
4 33 587 0.0 0.0 b'' 3.0 b'Yuri koval' b'20040130051740' 0 0 61.0 0 b'onxb48vte83q1mgoipzfernx4bfsx2v' None None
d2.shape
(2428508, 15)
q_en_uk = """
SELECT  
    en.ips_item_id as id
    ,uk.ips_site_page as uk_page_title
    ,en.ips_site_page as en_page_title
FROM 
    wikidatawiki_p.wb_items_per_site as uk
left join wikidatawiki_p.wb_items_per_site as en
    on uk.ips_item_id = en.ips_item_id
    and en.ips_site_id = 'enwiki'  
where 
     uk.ips_site_id= 'ukwiki'  
"""
d_en_uk = pd.read_sql(q_en_uk, con)
d_en_uk['uk_page_title'] = d_en_uk['uk_page_title'].str.decode('utf-8')
d_en_uk['en_page_title_clear'] = d_en_uk['en_page_title'].str.decode('utf-8')
d_en_uk.to_csv("en_uk_titles.csv")