# In PAWS
from sqlalchemy import create_engine
import sys, os
import pandas as pd
constr = 'mysql+pymysql://{user}:{pwd}@{host}'.format(
    user=os.environ['MYSQL_USERNAME'],
    pwd=os.environ['MYSQL_PASSWORD'],
    host=os.environ['MYSQL_HOST'],
)
con = create_engine(constr)
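Before running the heavier queries below, the connection can be checked with a trivial statement (a minimal sanity-check sketch, not part of the original collection):

# Quick sanity check of the replica connection created above.
print(pd.read_sql("SELECT 1 AS ok", con))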
#q3 = "select  pl.*, p.* from ukwiki_p.pagelinks as pl left join ukwiki_p.page as p on p.page_title = pl.pl_title where p.page_title is Null "
#q4 = "SELECT  ips_item_id AS wikidata, ips_site_page AS title FROM wikidatawiki_p.wb_items_per_site WHERE ips_site_id= 'ukwiki' AND  ips_item_id  NOT IN (SELECT ips_item_id FROM wikidatawiki_p.wb_items_per_site WHERE ips_site_id= 'enwiki') LIMIT 1000000;"
#df3 = pd.read_sql(q3, con)

Revisions data collection

q_uk_revisions = """
select 
    r.rev_page as uk_page_id
    ,cast(min(r.rev_timestamp) as DATETIME) as uk_first_edit 
    ,cast(max(r.rev_timestamp) as DATETIME) as uk_last_edit
    ,count(*) as uk_revisions_count
    ,sum(r.rev_minor_edit) as uk_minor_revisions
    ,sum(case when r.rev_deleted = 1 then 1 else 0 end) as uk_deleted_revisions
from 
    ukwiki_p.revision as r 
group by 
    r.rev_page 
"""
df_uk_revisions = pd.read_sql(q_uk_revisions, con)
df_uk_revisions.to_csv("uk_revisions.csv")
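MediaWiki stores rev_timestamp as a 14-character binary string in yyyymmddhhmmss form, which is why the query casts it to DATETIME. If the raw strings were fetched instead, the same conversion could be done on the pandas side (a sketch with a hypothetical example value):

# Parsing a raw MediaWiki timestamp in pandas instead of casting in SQL.
raw = pd.Series([b"20190101123456"])  # hypothetical raw binary(14) value
print(pd.to_datetime(raw.str.decode("utf-8"), format="%Y%m%d%H%M%S"))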
q_uk_ll = """
select 
    ll.ll_from as page_id 
    ,count(*) as langlinks_count 
from 
    ukwiki_p.langlinks as ll 
group by
    ll.ll_from 
    """
df_uk_ll = pd.read_sql(q_uk_ll, con)
df_uk_ll.to_csv("ukpages_langlinks.csv")
print(df_uk_ll.shape)
df_uk_ll.head(5)
(820222, 2)
page_id langlinks_count
0 1 4
1 3 46
2 13 226
3 584 178
4 585 67
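The langlinks counts are keyed by the same page ids as the revision statistics above, so the two frames can be merged directly; a left merge keeps pages without any interlanguage links (a sketch of a possible combination, not run in the original notebook):

# Merge revision stats with langlinks counts; pages with no langlinks get 0.
uk_pages = df_uk_revisions.merge(
    df_uk_ll, how="left", left_on="uk_page_id", right_on="page_id"
)
uk_pages["langlinks_count"] = uk_pages["langlinks_count"].fillna(0).astype(int)
print(uk_pages.shape)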
# q_uk_all_links = """
# select 
#     pl.pl_from as from_id
#     ,p.page_id as to_id
#     ,case when p.page_title is null then 'red' else 'blue' end as link_type
# from 
#     ukwiki_p.pagelinks as pl 
# left join ukwiki_p.page p 
#     on p.page_title = pl.pl_title
#     """
# df_uk_all_links = pd.read_sql(q_uk_all_links, con)
# df_uk_all_links.to_csv("ukpages_all_links.csv")
# df_uk_all_links.head(5)
# q_uk_links_agg = """
# select 
#     pl.pl_from as from_id
#     ,case when p.page_title is null then 'red' else 'blue' end as link_type
#     ,count(*) 
# from 
#     ukwiki_p.pagelinks as pl
# left join ukwiki_p.page p 
#     on p.page_title = pl.pl_title 
# group by 
#     pl.pl_from
#     ,case when p.page_title is null then 'red' else 'blue' end
#     """
# df_uk_links_agg = pd.read_sql(q_uk_links_agg, con)
# df_uk_links_agg.to_csv("ukpages_all_links_agg.csv")
# q5 = "select pl.pl_from as from_id, p1.page_title as from_title, p.page_id as to_id, p.page_title as to_title, case when p.page_title is null then 'red' else 'blue' end as link_type from ukwiki_p.pagelinks as pl left join ukwiki_p.page p on p.page_title = pl.pl_title join ukwiki_p.page p1 on p1.page_id = pl.pl_from"
q_uk_names = """
select 
    p.page_id as uk_page_id
    ,p.page_title as uk_page_title
from 
    ukwiki_p.page p 
    """
df_uk_names = pd.read_sql(q_uk_names, con)
df_uk_names.to_csv("uk.csv")
print(df_uk_names.shape)
df_uk_names.head(5)
(2416375, 2)
uk_page_id uk_page_title
0 829961 b'!'
1 2062457 b'!!'
2 425480 b'!!!'
3 2591631 b'!!!Fuck_You!!!_and_Then_Some'
4 2141483 b'!!_(\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5...
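The replica returns page_title as raw bytes, hence the b'...' values above. Decoding to UTF-8 makes the titles readable and usable as join keys (a small sketch):

# Decode byte titles returned by the replica into UTF-8 strings.
df_uk_names["uk_page_title"] = df_uk_names["uk_page_title"].str.decode("utf-8")
df_uk_names.head(5)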
q_uk_outlinks_agg = """
select 
    pl.pl_from as from_id
    ,count(*) as all_out_links
from 
    ukwiki_p.pagelinks as pl
group by 
    pl.pl_from
    """
df_uk_outlinks_agg = pd.read_sql(q_uk_outlinks_agg, con)
df_uk_outlinks_agg.to_csv("uk_outcoming_links.csv")
print(df_uk_outlinks_agg.shape)
df_uk_outlinks_agg.head(5)
(2227676, 2)
from_id all_out_links
0 1 18
1 2 1
2 3 298
3 4 69
4 6 1
q_uk_inlinks_agg = """
select 
    pl.pl_title as page_title
    ,count(*) as all_in_links
from 
    ukwiki_p.pagelinks as pl
group by 
    pl.pl_title
    """
df_uk_inlinks_agg = pd.read_sql(q_uk_inlinks_agg, con)
df_uk_inlinks_agg.to_csv("uk_incoming_links.csv")
print(df_uk_inlinks_agg.shape)
df_uk_inlinks_agg.head(5)
(3849906, 2)
page_title all_in_links
0 b'!' 67
1 b'!!' 42
2 b'!!!' 8
3 b'!!!Fuck_You!!!_and_Then_Some' 3
4 b'!!!_(\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xb1\xd0\xb... 1
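pagelinks stores the link target as a title rather than a page id, so the incoming-link counts above are keyed by title and also cover red links (targets with no existing page). They can be mapped onto page ids through df_uk_names (a sketch; assumes the titles were decoded to strings as above):

# Decode the byte titles here as well, then join on title to attach page ids.
df_uk_inlinks_agg["page_title"] = df_uk_inlinks_agg["page_title"].str.decode("utf-8")
uk_inlinks_by_id = df_uk_names.merge(
    df_uk_inlinks_agg, how="left",
    left_on="uk_page_title", right_on="page_title",
)
print(uk_inlinks_by_id.shape)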
# q_uk_blue_links_agg = """
# select 
#     pl.pl_from as from_id
#     ,count(*) as blue_links 
# from 
#     ukwiki_p.pagelinks  as pl
# join 
# (
#     select
#         p.page_title
#     from
#         ukwiki_p.page as p 
# ) p on p.page_title = pl.pl_title 
# group by 
#     pl.pl_from
# """
# df_uk_blue_links_agg = pd.read_sql(q_uk_blue_links_agg, con)
# df_uk_blue_links_agg.to_csv("ukpages_blue_links_agg.csv")
q_uk_blue_trans_links_agg = """
select 
    pl.pl_from as from_id
    ,count(*) as blue_links 
from 
    ukwiki_p.pagelinks as pl
join ukwiki_p.page p 
    on p.page_id = pl.pl_from
join
(
SELECT  
    ips_item_id AS wikidata
    , ips_site_page AS title 
FROM 
    wikidatawiki_p.wb_items_per_site 
WHERE 
    ips_site_id= 'ukwiki' 
    AND  ips_item_id IN (SELECT ips_item_id FROM wikidatawiki_p.wb_items_per_site WHERE ips_site_id= 'enwiki')
) en_titles
    on p.page_title = en_titles.title
group by 
    pl.pl_from
"""
df_uk_blue_trans_links_agg = pd.read_sql(q_uk_blue_trans_links_agg, con)
df_uk_blue_trans_links_agg.to_csv("uk_outcoming_links_translated.csv")
print(df_uk_blue_trans_links_agg.shape)
df_uk_blue_trans_links_agg.head(5)
(261460, 2)
from_id blue_links
0 13 204
1 584 372
2 585 29
3 587 14
4 588 419
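These per-page counts can be put next to the total outgoing-link counts collected earlier by merging on from_id, for example to express them as a share of all outgoing links (a sketch of one possible combination, not run in the original notebook):

# Compare the two outgoing-link counts per page.
links = df_uk_outlinks_agg.merge(df_uk_blue_trans_links_agg, how="left", on="from_id")
links["blue_links"] = links["blue_links"].fillna(0).astype(int)
links["blue_share"] = links["blue_links"] / links["all_out_links"]
print(links.head(5))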
q_uk_langs = """

SELECT  
    uk.ips_site_page AS page_title 
    ,count(*) as translations
FROM 
    wikidatawiki_p.wb_items_per_site as uk
join wikidatawiki_p.wb_items_per_site as en
    on en.ips_item_id = uk.ips_item_id
where
    uk.ips_site_id = 'ukwiki'
    and en.ips_site_id <> 'ukwiki'
group by 
    uk.ips_site_page
"""
df_uk_langs = pd.read_sql(q_uk_langs, con)
df_uk_langs.to_csv("ukpages_langs.csv")