from sqlalchemy import create_engine
import sys, os
import pandas as pd

constr = 'mysql+pymysql://{user}:{pwd}@{host}'.format(user=os.environ['MYSQL_USERNAME'],
                                                      pwd=os.environ['MYSQL_PASSWORD'],
                                                      host=os.environ['MYSQL_HOST'])
con = create_engine(constr)
q_en_last_rev = """
SELECT en_revs.rev_page as en_id,
       cast(en_revs.rev_timestamp as DATETIME) as en_timestamp
       
from enwiki_p.revision as en_revs 
where en_revs.rev_parent_id=0
LIMIT 15000000 OFFSET 0
"""
df_en_last_rev = pd.read_sql(q_en_last_rev, con)
df_en_last_rev.to_csv("df_en_last_rev_1")
print(df_en_last_rev.shape)
df_en_last_rev.head(10)
(15000000, 2)
en_id en_timestamp
0 36469 2002-01-26 14:32:30
1 49738 2002-01-26 14:57:24
2 5030553 2002-01-26 15:09:12
3 36498 2002-01-26 15:18:53
4 36624 2002-01-26 15:19:00
5 34122 2002-01-26 15:35:42
6 36631 2002-01-26 16:00:54
7 36637 2002-01-26 16:11:43
8 36636 2002-01-26 16:12:46
9 36512 2002-01-26 16:26:55
q_en_last_rev = """
SELECT en_revs.rev_page as en_id,
       cast(en_revs.rev_timestamp as DATETIME) as en_timestamp
       
from enwiki_p.revision as en_revs 
where en_revs.rev_parent_id=0
LIMIT 15000000 OFFSET 15000000
"""
df_en_last_rev = pd.read_sql(q_en_last_rev, con)
df_en_last_rev.to_csv("df_en_last_rev_2")
print(df_en_last_rev.shape)
df_en_last_rev.head(10)
(15000000, 2)
en_id en_timestamp
0 21783471 2009-03-03 01:29:25
1 21783472 2009-03-03 01:29:35
2 21783473 2009-03-03 01:29:38
3 21783474 2009-03-03 01:29:39
4 21783476 2009-03-03 01:29:46
5 21783478 2009-03-03 01:29:59
6 21783480 2009-03-03 01:30:19
7 21783482 2009-03-03 01:30:28
8 21783483 2009-03-03 01:30:30
9 21783484 2009-03-03 01:30:31
q_en_last_rev = """
SELECT en_revs.rev_page as en_id,
       cast(en_revs.rev_timestamp as DATETIME) as en_timestamp
       
from enwiki_p.revision as en_revs 
where en_revs.rev_parent_id=0
LIMIT 15000000 OFFSET 30000000
"""
df_en_last_rev = pd.read_sql(q_en_last_rev, con)
df_en_last_rev.to_csv("df_en_last_rev_3")
print(df_en_last_rev.shape)
df_en_last_rev.head(10)
(15000000, 2)
en_id en_timestamp
0 40061011 2013-07-24 21:15:27
1 40061012 2013-07-24 21:15:28
2 40061013 2013-07-24 21:15:30
3 40061014 2013-07-24 21:15:47
4 40061016 2013-07-24 21:16:00
5 40061018 2013-07-24 21:16:26
6 40061019 2013-07-24 21:16:33
7 40061020 2013-07-24 21:16:35
8 40061021 2013-07-24 21:16:36
9 40061023 2013-07-24 21:16:52
q_en_page = """
SELECT en_page.page_id as en_id,
       en_page.page_title as en_title
       
from enwiki_p.page as en_page
where page_namespace=0
"""
df_en_page = pd.read_sql(q_en_page, con)
df_en_page.to_csv("df_en_page")
print(df_en_page.shape)
df_en_page.head(10)
(14019836, 2)
en_id en_title
0 5878274 b'!'
1 3632887 b'!!'
2 600744 b'!!!'
3 34443176 b'!!!Fuck_You!!!'
4 11011780 b'!!!Fuck_You!!!_And_Then_Some'
5 34443184 b'!!!Fuck_You!!!_and_Then_Some'
6 39401265 b'!!!_(Chk_Chk_Chk)'
7 2556962 b'!!!_(album)'
8 10065458 b'!!!_(band)'
9 55029148 b'!!!_(disambiguation)'
q_uk_page = """
SELECT uk_page.page_id as uk_id,
       uk_page.page_title as uk_title
from ukwiki_p.page as uk_page
where page_namespace=0
"""
df_uk_page = pd.read_sql(q_uk_page, con)
df_uk_page.to_csv("df_uk_page")
print(df_uk_page.shape)
df_uk_page.head(10)
(1245071, 2)
uk_id uk_title
0 829961 b'!'
1 2062457 b'!!'
2 425480 b'!!!'
3 2591631 b'!!!Fuck_You!!!_and_Then_Some'
4 2141483 b'!!_(\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5...
5 1767358 b'!Action_Pact'
6 848226 b'!Action_Pact!'
7 1767359 b'!T.O.O.H.'
8 425157 b'!T.O.O.H.!'
9 2112530 b'!_(\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xb1\xd0\xbe\...
q_uk_last_rev = """
SELECT uk_revs.rev_page as uk_id,
       cast(uk_revs.rev_timestamp as DATETIME) as uk_timestamp
       
from ukwiki_p.revision as uk_revs 
where uk_revs.rev_parent_id=0
"""
df_uk_last_rev = pd.read_sql(q_uk_last_rev, con)
df_uk_last_rev.to_csv("df_uk_last_rev")
print(df_uk_last_rev.shape)
df_uk_last_rev.head(10)
(2424900, 2)
uk_id uk_timestamp
0 3 2003-12-27 00:45:00
1 13 2004-01-26 11:04:32
2 584 2004-01-30 02:42:56
3 586 2004-01-30 05:12:03
4 587 2004-01-30 05:17:40
5 591 2004-01-30 09:07:51
6 590 2004-01-30 06:49:18
7 592 2004-01-30 09:49:55
8 50870 2004-01-30 10:40:05
9 588 2004-01-30 06:13:37
q_uk_lang_links = """
SELECT uk_links.ll_from as uk_id,
       uk_links.ll_title as en_title
       
from ukwiki_p.langlinks as uk_links
where uk_links.ll_lang='en'
"""
df_uk_lang_links = pd.read_sql(q_uk_lang_links, con)
df_uk_lang_links.to_csv("df_uk_lang_links")
print(df_uk_lang_links.shape)
df_uk_lang_links.head(10)
q_uk_lang_not_translated = """
SELECT  ips_item_id AS wikidata, ips_site_page AS title 
FROM wikidatawiki_p.wb_items_per_site 
WHERE ips_site_id= 'ukwiki' AND  
    ips_item_id  NOT IN (SELECT ips_item_id FROM wikidatawiki_p.wb_items_per_site WHERE ips_site_id= 'enwiki')
ORDER BY RAND()
LIMIT 5000

"""
df_uk_lang_not_translated = pd.read_sql(q_uk_lang_not_translated, con)
df_uk_lang_not_translated.to_csv("df_uk_lang_not_translated")
print(df_uk_lang_not_translated.shape)
df_uk_lang_not_translated.head(10)
(5000, 2)
wikidata title
0 4196688 b"\xd0\x86\xd0\xb2\xd0\xb0\xd0\xbd\xd1\x96\xd0...
1 32498793 b"\xd0\x9a\xd0\xb0\xd1\x82\xd0\xb5\xd0\xb3\xd0...
2 32504150 b'\xd0\x9a\xd0\xb0\xd1\x82\xd0\xb5\xd0\xb3\xd0...
3 1231827 b'\xd0\x9a\xd0\xbd\xd1\x8f\xd0\xb7\xd1\x94\xd0...
4 12103014 b'\xd0\x94\xd1\x83\xd1\x89\xd0\xb0\xd0\xba \xd...
5 12104452 b'\xd0\x96\xd0\xb5\xd1\x80\xd0\xbd\xd0\xbe\xd0...
6 19754678 b'\xd0\x93\xd0\xbe\xd1\x80\xd0\xbe\xd0\xb1\xd1...
7 32480537 b'\xd0\x9a\xd0\xb0\xd1\x82\xd0\xb5\xd0\xb3\xd0...
8 1459833 b'\xd0\x9f\xd0\xb5\xd1\x80\xd0\xb5\xd0\xb4\xd0...
9 12105558 b'\xd0\x97\xd0\xb0\xd0\xba\xd1\x80\xd1\x96\xd0...