import pandas as pd
# Load the Ukrainian/English page-title table (columns: id, uk_page_title,
# en_page_title; the titles are stored as b'...' literal text) and preview it.
df_en_uk_titles = pd.read_csv(filepath_or_buffer='en_uk_titles.csv')
df_en_uk_titles.iloc[:20]
Unnamed: 0 id uk_page_title en_page_title
0 0 NaN b'! (\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xb1\xd0\xbe\... NaN
1 1 4540205.0 b'! (\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xb1\xd0\xbe\... b'! (The Dismemberment Plan album)'
2 2 404000.0 b'! (\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5\... b'! (disambiguation)'
3 3 353153.0 b'!! (\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5... b'!!'
4 4 371.0 b'!!!' b'!!!'
5 5 343686.0 b'!Action Pact!' b'!Action Pact!'
6 6 1622767.0 b'!T.O.O.H.!' b'!T.O.O.H.!'
7 7 NaN b'!\xd0\xa4\xd0\x95\xd0\xa1\xd0\xa2' NaN
8 8 NaN b'!\xd0\xa7\xd0\xb8\xd0\xb4\xd1\x80\xd0\xbe' NaN
9 9 1384337.0 b'$ (\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5\... b'$ (disambiguation)'
10 10 NaN b'$uicideboy$' NaN
11 11 NaN b'&RQ' NaN
12 12 149404.0 b"'03 Bonnie & Clyde" b"'03 Bonnie & Clyde"
13 13 1075363.0 b"'39" b"'39"
14 14 4540381.0 b"'50s on 5" b"'50s on 5"
15 15 4540383.0 b"'60s on 6" b"'60s on 6"
16 16 425605.0 b"'74 Jailbreak" b"'74 Jailbreak"
17 17 891050.0 b"'92 Tour EP" b"'92 Tour EP"
18 18 1047737.0 b"'98 Live Meltdown" b"'98 Live Meltdown"
19 19 3612456.0 b"'Allelujah! Don't Bend! Ascend!" b"'Allelujah! Don't Bend! Ascend!"
# Load the per-page 'outcoming_links_translated' counts, keyed by uk_page_id,
# and preview the first dozen rows.
df_uk_outcoming_links_translated = pd.read_csv(
    filepath_or_buffer='uk_outcoming_links_translated.csv',
)
df_uk_outcoming_links_translated.iloc[:12]
Unnamed: 0 uk_page_id outcoming_links_translated
0 0 13 204
1 1 584 372
2 2 585 29
3 3 587 14
4 4 588 419
5 5 590 26
6 6 591 198
7 7 592 389
8 8 600 67
9 9 601 181
10 10 602 28
11 11 603 319
# Load per-page revision statistics (first/last edit timestamps and revision
# counters), keyed by uk_page_id, and preview the first dozen rows.
df_uk_revisions = pd.read_csv(filepath_or_buffer='uk_revisions.csv')
df_uk_revisions.iloc[:12]
Unnamed: 0 uk_page_id uk_first_edit uk_last_edit uk_revisions_count uk_minor_revisions_count uk_deleted_revisions
0 0 0 2008-11-02 13:02:01 2010-10-01 13:33:05 2 0.0 0.0
1 1 1 2003-10-20 12:27:01 2006-10-10 06:46:53 4 0.0 0.0
2 2 2 2003-12-23 09:53:53 2011-07-06 06:56:28 6 4.0 0.0
3 3 3 2003-12-27 00:45:00 2018-03-30 15:15:50 408 218.0 0.0
4 4 4 2003-10-20 12:19:03 2018-07-06 09:55:31 1489 385.0 2.0
5 5 6 2003-10-15 10:51:10 2004-08-01 07:40:52 6 0.0 0.0
6 6 9 2004-01-25 14:56:26 2009-03-30 05:10:25 4 2.0 0.0
7 7 13 2004-01-26 11:04:32 2018-05-07 08:33:18 340 188.0 5.0
8 8 584 2004-01-30 02:42:56 2018-06-04 19:49:52 621 242.0 0.0
9 9 585 2004-01-30 04:52:04 2017-03-01 17:01:44 66 43.0 0.0
10 10 586 2004-01-30 05:12:03 2017-08-23 16:53:36 52 25.0 0.0
11 11 587 2004-01-30 05:17:40 2017-01-13 22:29:53 97 79.0 0.0
# Left-join revision statistics onto the link counts by page id, so every row
# of the link table is kept even when no revision record matches.
merged1 = df_uk_outcoming_links_translated.merge(
    df_uk_revisions,
    on='uk_page_id',
    how='left',
)
# Both inputs carry a saved index column named 'Unnamed: 0'; after the merge
# they appear with the default _x/_y suffixes and are of no further use.
merged1 = merged1.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'])
merged1.iloc[:12]
uk_page_id outcoming_links_translated uk_first_edit uk_last_edit uk_revisions_count uk_minor_revisions_count uk_deleted_revisions
0 13 204 2004-01-26 11:04:32 2018-05-07 08:33:18 340 188.0 5.0
1 584 372 2004-01-30 02:42:56 2018-06-04 19:49:52 621 242.0 0.0
2 585 29 2004-01-30 04:52:04 2017-03-01 17:01:44 66 43.0 0.0
3 587 14 2004-01-30 05:17:40 2017-01-13 22:29:53 97 79.0 0.0
4 588 419 2004-01-30 06:13:37 2018-06-15 11:41:38 387 213.0 0.0
5 590 26 2004-01-30 06:49:18 2018-02-23 12:59:50 110 90.0 0.0
6 591 198 2004-01-30 09:07:51 2018-07-10 08:19:11 258 133.0 0.0
7 592 389 2004-01-30 09:49:55 2018-07-06 11:09:53 450 207.0 2.0
8 600 67 2004-01-31 05:41:49 2018-06-22 16:22:12 208 96.0 0.0
9 601 181 2004-01-31 06:44:15 2017-12-20 21:07:37 107 59.0 0.0
10 602 28 2004-01-31 07:23:33 2018-02-06 09:15:12 64 48.0 0.0
11 603 319 2004-01-31 07:23:59 2018-07-04 01:08:53 40 22.0 0.0
# Persist the links + revisions table (the DataFrame index is written too).
merged1.to_csv(path_or_buf='merged_outcoming_translated_and_revisions.csv')

# Load the per-title translation counts (uk_page_title, uk_translations_count)
# and preview the first dozen rows.
df_uk_translations = pd.read_csv(filepath_or_buffer='uk_translations.csv')
df_uk_translations.iloc[:12]
Unnamed: 0 uk_page_title uk_translations_count
0 0 b'\xd0\x92\xd1\x81\xd0\xb5\xd1\x81\xd0\xb2\xd1... 103
1 1 b'\xd0\x97\xd0\xb5\xd0\xbc\xd0\xbb\xd1\x8f' 145
2 2 b'\xd0\x96\xd0\xb8\xd1\x82\xd1\x82\xd1\x8f' 105
3 3 b'\xd0\xa1\xd0\xbc\xd0\xb5\xd1\x80\xd1\x82\xd1... 96
4 4 b'\xd0\x9b\xd1\x8e\xd0\xb4\xd0\xb8\xd0\xbd\xd0... 121
5 5 b'\xd0\xa9\xd0\xb0\xd1\x81\xd1\x82\xd1\x8f' 76
6 6 b'\xd0\xa2\xd1\x80\xd0\xb8\xd1\x81\xd0\xba\xd0... 40
7 7 b'\xd0\x90\xd1\x84\xd1\x80\xd0\xb8\xd0\xba\xd0... 160
8 8 b'\xd0\x9a\xd0\xb0\xd0\xbd\xd0\xb0\xd0\xb4\xd0... 158
9 9 b'\xd0\xaf\xd0\xbf\xd0\xbe\xd0\xbd\xd1\x96\xd1... 155
10 10 b'\xd0\x9f\xd1\x96\xd0\xb2\xd0\xb4\xd0\xb5\xd0... 137
11 11 b'\xd0\x9d\xd0\xb5\xd1\x87\xd0\xb5\xd1\x81\xd0... 15
# Left-join the English title and id onto each translated page by its
# Ukrainian title, keeping every row of the translations table.
merged2 = df_uk_translations.merge(
    df_en_uk_titles,
    on='uk_page_title',
    how='left',
)
# Discard the two saved index columns, which the merge suffixed with _x/_y.
merged2 = merged2.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y'])
merged2.iloc[:12]
uk_page_title uk_translations_count id en_page_title
0 b'\xd0\x92\xd1\x81\xd0\xb5\xd1\x81\xd0\xb2\xd1... 103 1.0 b'Universe'
1 b'\xd0\x97\xd0\xb5\xd0\xbc\xd0\xbb\xd1\x8f' 145 2.0 b'Earth'
2 b'\xd0\x96\xd0\xb8\xd1\x82\xd1\x82\xd1\x8f' 105 3.0 b'Life'
3 b'\xd0\xa1\xd0\xbc\xd0\xb5\xd1\x80\xd1\x82\xd1... 96 4.0 b'Death'
4 b'\xd0\x9b\xd1\x8e\xd0\xb4\xd0\xb8\xd0\xbd\xd0... 121 5.0 b'Human'
5 b'\xd0\xa9\xd0\xb0\xd1\x81\xd1\x82\xd1\x8f' 76 8.0 b'Happiness'
6 b'\xd0\xa2\xd1\x80\xd0\xb8\xd1\x81\xd0\xba\xd0... 40 13.0 b'Triskaidekaphobia'
7 b'\xd0\x90\xd1\x84\xd1\x80\xd0\xb8\xd0\xba\xd0... 160 15.0 b'Africa'
8 b'\xd0\x9a\xd0\xb0\xd0\xbd\xd0\xb0\xd0\xb4\xd0... 158 16.0 b'Canada'
9 b'\xd0\xaf\xd0\xbf\xd0\xbe\xd0\xbd\xd1\x96\xd1... 155 17.0 b'Japan'
10 b'\xd0\x9f\xd1\x96\xd0\xb2\xd0\xb4\xd0\xb5\xd0... 137 18.0 b'South America'
11 b'\xd0\x9d\xd0\xb5\xd1\x87\xd0\xb5\xd1\x81\xd0... 15 19.0 b'Cheating'
# Persist the translations + titles table (the DataFrame index is written too).
merged2.to_csv(path_or_buf='merged_translations_and_titles.csv')

# Quick look at the links + revisions table again.
merged1.iloc[:2]
uk_page_id outcoming_links_translated uk_first_edit uk_last_edit uk_revisions_count uk_minor_revisions_count uk_deleted_revisions
0 13 204 2004-01-26 11:04:32 2018-05-07 08:33:18 340 188.0 5.0
1 584 372 2004-01-30 02:42:56 2018-06-04 19:49:52 621 242.0 0.0
# Quick look at the translations + titles table.
merged2.iloc[:2]
uk_page_title uk_translations_count id en_page_title
0 b'\xd0\x92\xd1\x81\xd0\xb5\xd1\x81\xd0\xb2\xd1... 103 1.0 b'Universe'
1 b'\xd0\x97\xd0\xb5\xd0\xbc\xd0\xbb\xd1\x8f' 145 2.0 b'Earth'
# Load the title table that also carries uk_page_id, discarding the saved
# index column straight away, then preview the first two rows.
df_uk_and_titles = pd.read_csv(
    filepath_or_buffer='uk_plus_titles.csv',
).drop(columns=['Unnamed: 0'])
df_uk_and_titles.iloc[:2]
id uk_page_title en_page_title uk_page_id
0 NaN b'! (\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xb1\xd0\xbe\... NaN NaN
1 4540205.0 b'! (\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xb1\xd0\xbe\... b'! (The Dismemberment Plan album)' NaN
# Attach the English title, its id and uk_page_id to every translated page,
# matching on the Ukrainian title (left join: all translation rows are kept).
# The saved index column 'Unnamed: 0' that df_uk_translations inherited from
# its CSV is dropped on a copy first; otherwise it leaks into merged3, into
# both CSVs written below and into the final table as a meaningless column.
# This mirrors the cleanup already done for merged1 and merged2.
merged3 = pd.merge(
    df_uk_translations.drop(columns=['Unnamed: 0']),
    df_uk_and_titles,
    how='left',
    on='uk_page_title',
)
merged3.to_csv('merged_translations_and_titles_and_uk.csv')

# Combine the per-page link/revision statistics with the title/translation
# data by page id. Left join: every row of merged1 is kept; pages without a
# matching uk_page_id in merged3 get NaN in the title/translation columns.
merged = pd.merge(merged1, merged3, how='left', on='uk_page_id')
merged.to_csv('merged_together.csv')
merged.head(2)
uk_page_id outcoming_links_translated uk_first_edit uk_last_edit uk_revisions_count uk_minor_revisions_count uk_deleted_revisions Unnamed: 0 uk_page_title uk_translations_count id en_page_title
0 13 204 2004-01-26 11:04:32 2018-05-07 08:33:18 340 188.0 5.0 936.0 b'\xd0\x93\xd0\xb5\xd0\xbe\xd0\xb3\xd1\x80\xd0... 140.0 1071.0 b'Geography'
1 584 372 2004-01-30 02:42:56 2018-06-04 19:49:52 621 242.0 0.0 6692.0 b'\xd0\x90\xd1\x82\xd0\xbe\xd0\xbc' 117.0 9121.0 b'Atom'
# List the column labels of the combined table.
merged.keys()
Index(['uk_page_id', 'outcoming_links_translated', 'uk_first_edit',
       'uk_last_edit', 'uk_revisions_count', 'uk_minor_revisions_count',
       'uk_deleted_revisions', 'Unnamed: 0', 'uk_page_title',
       'uk_translations_count', 'id', 'en_page_title'],
      dtype='object')
# Inspect a larger sample of the title table to see how often uk_page_id and
# the English title are missing.
df_uk_and_titles.iloc[:50]
id uk_page_title en_page_title uk_page_id
0 NaN b'! (\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xb1\xd0\xbe\... NaN NaN
1 4540205.0 b'! (\xd0\xb0\xd0\xbb\xd1\x8c\xd0\xb1\xd0\xbe\... b'! (The Dismemberment Plan album)' NaN
2 404000.0 b'! (\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5\... b'! (disambiguation)' NaN
3 353153.0 b'!! (\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5... b'!!' NaN
4 371.0 b'!!!' b'!!!' 425480.0
5 343686.0 b'!Action Pact!' b'!Action Pact!' NaN
6 1622767.0 b'!T.O.O.H.!' b'!T.O.O.H.!' 425157.0
7 NaN b'!\xd0\xa4\xd0\x95\xd0\xa1\xd0\xa2' NaN 432740.0
8 NaN b'!\xd0\xa7\xd0\xb8\xd0\xb4\xd1\x80\xd0\xbe' NaN 590602.0
9 1384337.0 b'$ (\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb5\... b'$ (disambiguation)' NaN
10 NaN b'$uicideboy$' NaN 2610007.0
11 NaN b'&RQ' NaN 156371.0
12 149404.0 b"'03 Bonnie & Clyde" b"'03 Bonnie & Clyde" NaN
13 1075363.0 b"'39" b"'39" 2014472.0
14 4540381.0 b"'50s on 5" b"'50s on 5" NaN
15 4540383.0 b"'60s on 6" b"'60s on 6" NaN
16 425605.0 b"'74 Jailbreak" b"'74 Jailbreak" NaN
17 891050.0 b"'92 Tour EP" b"'92 Tour EP" NaN
18 1047737.0 b"'98 Live Meltdown" b"'98 Live Meltdown" NaN
19 3612456.0 b"'Allelujah! Don't Bend! Ascend!" b"'Allelujah! Don't Bend! Ascend!" NaN
20 NaN b"'Cause You Are Young" NaN NaN
21 NaN b"'Cichlasoma'" NaN 357578.0
22 5527115.0 b"'Cichlasoma' salvini" b"Salvin's cichlid" NaN
23 NaN b"'Lac Motion" NaN NaN
24 205891.0 b"'O sole mio" b'\xe2\x80\x99O sole mio' NaN
25 4540515.0 b"'Round About Midnight at the Cafe Bohemia" b"'Round About Midnight at the Cafe Bohemia" NaN
26 4540516.0 b"'Round Midnight (\xd0\xb0\xd0\xbb\xd1\x8c\xd... b"'Round Midnight (1963 Betty Carter album)" NaN
27 NaN b"'\xd0\x90\xd0\xb1\xd0\xb4 \xd0\x90\xd0\xbb\x... NaN NaN
28 399787.0 b'( ) (\xd0\xb7\xd0\xbd\xd0\xb0\xd1\x87\xd0\xb... b'( ) (disambiguation)' NaN
29 NaN b'(10003) 1971 UD1' NaN NaN
30 NaN b'(100048) 1991 TE14' NaN NaN
31 1627099.0 b'(11435) 1931 UB' b'(11435) 1931 UB' NaN
32 2221416.0 b'(11436) 1969 QR' b'(11436) 1969 QR' NaN
33 1318410.0 b'(115485) 2003 UR19' b'(115485) 2003 UR19' NaN
34 2805321.0 b'(120132) 2003 FY128' b'(120132) 2003 FY128' NaN
35 1944739.0 b'(120178) 2003 OP32' b'(120178) 2003 OP32' NaN
36 3273524.0 b'(121514) 1999 UJ7' b'(121514) 1999 UJ7' NaN
37 1901921.0 b'(128621) 2004 RD' b'(128621) 2004 RD' NaN
38 1627255.0 b'(129066) 2004 VY28' b'(129066) 2004 VY28' NaN
39 1627114.0 b'(131323) 2001 GE11' b'(131323) 2001 GE11' NaN
40 1627149.0 b'(134010) 2004 VW28' b'(134010) 2004 VW28' NaN
41 1627061.0 b'(143052) 2002 WY2' b'(143052) 2002 WY2' NaN
42 709674.0 b'(14321) 1978 VT9' b'(14321) 1978 VT9' NaN
43 711729.0 b'(14323) 1979 MV1' b'(14323) 1979 MV1' NaN
44 709762.0 b'(14324) 1979 MK6' b'(14324) 1979 MK6' NaN
45 709843.0 b'(14325) 1979 MM6' b'(14325) 1979 MM6' NaN
46 1627216.0 b'(145166) 2005 JL' b'(145166) 2005 JL' NaN
47 1901874.0 b'(147735) 2005 NE' b'(147735) 2005 NE' NaN
48 1901220.0 b'(147799) 2005 RA34' b'(147799) 2005 RA34' NaN
49 1901314.0 b'(149450) 2003 CE14' b'(149450) 2003 CE14' NaN