import pandas as pd
# Load the Ukrainian/English title table and rewrite spaces in the Ukrainian
# titles as underscores; 'uk_page_title' is the join key used by the merges
# further down, so it has to be in the same form as the other tables.
uk_titles_df = pd.read_csv('en_uk_titles.csv')
uk_titles_df['uk_page_title'] = uk_titles_df['uk_page_title'].str.replace(' ', '_', regex=False)
# uk_df is consumed by the merge that builds uk_titles_uk_df. With this load
# commented out, running the file top to bottom fails with NameError.
# NOTE(review): presumably uk.csv carries uk_page_title and uk_page_id - confirm.
uk_df = pd.read_csv('uk.csv')
uk_titles_df.head()
Unnamed: 0 id uk_page_title en_page_title en_page_title_clear
0 0 NaN !_(альбом_С.К.А.Й.) NaN NaN
1 1 4540205.0 !_(альбом) b'! (The Dismemberment Plan album)' ! (The Dismemberment Plan album)
2 2 404000.0 !_(значення) b'! (disambiguation)' ! (disambiguation)
3 3 353153.0 !!_(значення) b'!!' !!
4 4 371.0 !!! b'!!!' !!!
# Left-join uk_df onto the title table by Ukrainian title, discard the two
# suffixed "Unnamed: 0" columns that come out of the join, save the result,
# and preview the first 20 rows.
uk_titles_uk_df = uk_titles_df.merge(uk_df, how='left', on='uk_page_title')
leftover_index_cols = ["Unnamed: 0_x", "Unnamed: 0_y"]
merge1 = uk_titles_uk_df.drop(leftover_index_cols, axis=1)
merge1.to_csv('uk_plus_titles.csv')
merge1.head(20)
id uk_page_title en_page_title uk_page_id
0 NaN !_(альбом_С.К.А.Й.) NaN 829565.0
1 4540205.0 !_(альбом) b'! (The Dismemberment Plan album)' 2112530.0
2 404000.0 !_(значення) b'! (disambiguation)' 1366003.0
3 353153.0 !!_(значення) b'!!' 2141483.0
4 371.0 !!! b'!!!' 425480.0
5 343686.0 !Action_Pact! b'!Action Pact!' 848226.0
6 1622767.0 !T.O.O.H.! b'!T.O.O.H.!' 425157.0
7 NaN !ФЕСТ NaN 432740.0
8 NaN !Чидро NaN 590602.0
9 1384337.0 $_(значення) b'$ (disambiguation)' 1369614.0
10 NaN $uicideboy$ NaN 2610007.0
11 NaN &RQ NaN 156371.0
12 149404.0 '03_Bonnie_&_Clyde b"'03 Bonnie & Clyde" 425479.0
13 1075363.0 '39 b"'39" 2014472.0
14 4540381.0 '50s_on_5 b"'50s on 5" 1624538.0
15 4540383.0 '60s_on_6 b"'60s on 6" 1625253.0
16 425605.0 '74_Jailbreak b"'74 Jailbreak" 226044.0
17 891050.0 '92_Tour_EP b"'92 Tour EP" 1707500.0
18 1047737.0 '98_Live_Meltdown b"'98 Live Meltdown" 1702515.0
19 3612456.0 'Allelujah!_Don't_Bend!_Ascend! b"'Allelujah! Don't Bend! Ascend!" 2144196.0
# Sanity check: print every title whose string form contains the given name.
needle = '!!!Fuck_You!!!_and_Then_Some'
matching_titles = [
    title
    for title in uk_titles_df['uk_page_title'].values
    if needle in str(title)  # str() so non-string cells (e.g. NaN) are searchable too
]
for title in matching_titles:
    print(title)
# Attach the incoming-link count to each row by Ukrainian title (left join, so
# rows without a match keep NaN), drop the "Unnamed: 0" column that came in
# with the links table, save the result, and preview the links table itself.
uk_incoming_links_df = pd.read_csv('uk_incoming_links.csv')
merge2_df = (
    merge1
    .merge(uk_incoming_links_df, how='left', on='uk_page_title')
    .drop('Unnamed: 0', axis=1)
)
merge2_df.to_csv('uk_plus_titles_plus_incominglinks.csv')
uk_incoming_links_df.head(20)
Unnamed: 0 uk_page_title uk_incoming_links
0 0 ! 67
1 1 !! 42
2 2 !!! 8
3 3 !!!Fuck_You!!!_and_Then_Some 3
4 4 !!!_(альбом) 1
5 5 !!/документація 2
6 6 !!/пісочниця 2
7 7 !!/тести 2
8 8 !!vdtfo8d!! 1
9 9 !!zaq 1
10 10 !!і 1
11 11 !( 28
12 12 !(( 26
13 13 !((/документація 2
14 14 !((/пісочниця 1
15 15 !((/тести 1
16 16 !(/Документація 2
17 17 !(/документація 2
18 18 !(/пісочниця 1
19 19 !(/тести 1
# Preview the first 20 rows of the titles + incoming-links table.
merge2_df.head(n=20)
id uk_page_title en_page_title uk_page_id uk_incoming_links
0 NaN !_(альбом_С.К.А.Й.) NaN 829565.0 31.0
1 4540205.0 !_(альбом) b'! (The Dismemberment Plan album)' 2112530.0 NaN
2 404000.0 !_(значення) b'! (disambiguation)' 1366003.0 1.0
3 353153.0 !!_(значення) b'!!' 2141483.0 NaN
4 371.0 !!! b'!!!' 425480.0 8.0
5 343686.0 !Action_Pact! b'!Action Pact!' 848226.0 4.0
6 1622767.0 !T.O.O.H.! b'!T.O.O.H.!' 425157.0 4.0
7 NaN !ФЕСТ NaN 432740.0 31.0
8 NaN !Чидро NaN 590602.0 4.0
9 1384337.0 $_(значення) b'$ (disambiguation)' 1369614.0 1.0
10 NaN $uicideboy$ NaN 2610007.0 1.0
11 NaN &RQ NaN 156371.0 52.0
12 149404.0 '03_Bonnie_&_Clyde b"'03 Bonnie & Clyde" 425479.0 17.0
13 1075363.0 '39 b"'39" 2014472.0 111.0
14 4540381.0 '50s_on_5 b"'50s on 5" 1624538.0 2.0
15 4540383.0 '60s_on_6 b"'60s on 6" 1625253.0 2.0
16 425605.0 '74_Jailbreak b"'74 Jailbreak" 226044.0 49.0
17 891050.0 '92_Tour_EP b"'92 Tour EP" 1707500.0 47.0
18 1047737.0 '98_Live_Meltdown b"'98 Live Meltdown" 1702515.0 2.0
19 3612456.0 'Allelujah!_Don't_Bend!_Ascend! b"'Allelujah! Don't Bend! Ascend!" 2144196.0 9.0
# Load the language-link counts table and preview its first five rows.
langlinks_path = 'uk_langlinks.csv'
langlinks_df = pd.read_csv(langlinks_path)
langlinks_df.head(n=5)
Unnamed: 0 uk_page_id uk_langlinks_count
0 0 1 4
1 1 3 46
2 2 13 226
3 3 584 178
4 4 585 67
# Left-join the language-link counts by page id, then drop the "Unnamed: 0"
# column that came in with langlinks_df.
merge3_df = merge2_df.merge(langlinks_df, how='left', on='uk_page_id')
merge3_df = merge3_df.drop('Unnamed: 0', axis=1)
merge3_df.head(n=5)
id uk_page_title en_page_title uk_page_id uk_incoming_links uk_langlinks_count
0 NaN !_(альбом_С.К.А.Й.) NaN 829565.0 31.0 NaN
1 4540205.0 !_(альбом) b'! (The Dismemberment Plan album)' 2112530.0 NaN 12.0
2 404000.0 !_(значення) b'! (disambiguation)' 1366003.0 1.0 16.0
3 353153.0 !!_(значення) b'!!' 2141483.0 NaN 17.0
4 371.0 !!! b'!!!' 425480.0 8.0 25.0
# Load the outgoing-link table, left-join it by page id onto the running
# table, drop the "Unnamed: 0" column it brought along, and save the result.
outcominglinks_df = pd.read_csv('uk_outcoming_links.csv')
outcominglinks_df.head(n=5)
merge4_df = merge3_df.merge(outcominglinks_df, how='left', on='uk_page_id')
merge4_df = merge4_df.drop('Unnamed: 0', axis=1)
merge4_df.to_csv('merged_titles_id_incoming_uklanglinks_outcoming_links.csv')
merge4_df.head(n=5)
id uk_page_title en_page_title uk_page_id uk_incoming_links uk_langlinks_count outcoming_links
0 NaN !_(альбом_С.К.А.Й.) NaN 829565.0 31.0 NaN 40.0
1 4540205.0 !_(альбом) b'! (The Dismemberment Plan album)' 2112530.0 NaN 12.0 12.0
2 404000.0 !_(значення) b'! (disambiguation)' 1366003.0 1.0 16.0 32.0
3 353153.0 !!_(значення) b'!!' 2141483.0 NaN 17.0 18.0
4 371.0 !!! b'!!!' 425480.0 8.0 25.0 36.0