Imports

In [38]:
from sqlalchemy import create_engine
import sys, os
import pandas as pd

Create SQLAlchemy Engine

In [2]:
constr = 'mysql+pymysql://{user}:{pwd}@{host}'.format(
    user=os.environ['MYSQL_USERNAME'],
    pwd=os.environ['MYSQL_PASSWORD'],
    host=os.environ['MYSQL_HOST']
)
engine = create_engine(constr)

Describe Tables

page

has a row for each page.

  • namespace=0 is for main pages
  • namespace=6 is for file pages
  • namespace=14 is for category pages
In [3]:
pd.read_sql('''
select page_id, page_namespace, page_title 
from enwiki_p.page 
where page_namespace=0
limit 10
offset 1000
''', engine)
Out[3]:
page_id page_namespace page_title
0 37515680 0 b'"Hot_Ice"_Hilda'
1 17612192 0 b'"Hot_Stuff"_Eddie_Gilbert'
2 2641607 0 b'"Hotshot"_Johnny_Devine'
3 29828002 0 b'"House,_M.D."'
4 37248992 0 b'"House_Hippo"'
5 31746200 0 b'"How\'s_my_driving?"_sign'
6 55123262 0 b'"How_You_do_That"'
7 41723702 0 b'"How_to_Be_a_Woman"'
8 25420739 0 b'"Howling"_Jack_Smith'
9 54175440 0 b'"Hoyoyo!"_Space_Adventure'
In [4]:
pd.read_sql('''
select page_id, page_namespace, page_title 
from enwiki_p.page 
where page_namespace=6
limit 10
offset 1000
''', engine)
Out[4]:
page_id page_namespace page_title
0 50385347 6 b'"The_Middle_Watch"_(1940).jpg'
1 41856004 6 b'"The_Mind_Benders"_(1963).jpg'
2 45092997 6 b'"The_Mind_of_Mr._J.G._Reader".jpg'
3 45261686 6 b'"The_Mind_of_Mr._Reeder"_(1939).jpg'
4 45430994 6 b'"The_Mind_of_Mr._Soames"_(1970).jpg'
5 45266363 6 b'"The_Missing_Million"_(1942).jpg'
6 45257436 6 b'"The_Missing_People"_(1940).jpg'
7 58874668 6 b'"The_Morning"_Paining_in_Space_by_Yervand_Ko...
8 45334178 6 b'"The_Music_Box_Kid"_(1960).jpg'
9 43232438 6 b'"The_Mutations".jpeg'
In [5]:
pd.read_sql('''
select page_id, page_namespace, page_title 
from enwiki_p.page 
where page_namespace=14
limit 10
offset 1000
''', engine)
Out[5]:
page_id page_namespace page_title
0 942353 14 b'1031'
1 62220874 14 b'1031_beginnings'
2 1310033 14 b'1031_births'
3 1459349 14 b'1031_deaths'
4 7475112 14 b'1031_disestablishments'
5 46784919 14 b'1031_disestablishments_in_Europe'
6 62262861 14 b'1031_endings'
7 19304010 14 b'1031_establishments'
8 46784950 14 b'1031_establishments_in_Europe'
9 47062838 14 b'1031_in_Europe'

cl_from: is the page_id to which the category is being assigned. this page_id can represent a main page, a file page, or a category page.

cl_to: is the page_title of a category page. parent/child relationships between categories are inidcated by rows in which the cl_from page_id refers to a category page.

cl_type is one of [page, file, subcat] and indicates the type of page cl_from refers to.

Types of edges

  • file -> category
  • page -> category
  • category -> category

For example, the first row below indicates that

In [6]:
pd.read_sql('''
select cl_from, cl_to, cl_type 
from enwiki_p.categorylinks
where cl_type = 'file'
limit 10
offset 1000
''', engine)
Out[6]:
cl_from cl_to cl_type
0 4878781 b'1940_comics_images' b'file'
1 7962163 b'1940_comics_images' b'file'
2 8156862 b'1940_comics_images' b'file'
3 11082103 b'1940_comics_images' b'file'
4 14322623 b'1940_comics_images' b'file'
5 15209336 b'1940_comics_images' b'file'
6 20590511 b'1940_comics_images' b'file'
7 20590755 b'1940_comics_images' b'file'
8 36162043 b'1940s_United_States_bomber_aircraft' b'file'
9 15919649 b'1940s_comics_images' b'file'
In [7]:
pd.read_sql('''
select cl_from, cl_to, cl_type 
from enwiki_p.categorylinks
where cl_type = 'page'
limit 10
offset 1000
''', engine)
Out[7]:
cl_from cl_to cl_type
0 28990260 b'"C-Class"_Wikipedia_books' b'page'
1 29011033 b'"C-Class"_Wikipedia_books' b'page'
2 29012950 b'"C-Class"_Wikipedia_books' b'page'
3 29022099 b'"C-Class"_Wikipedia_books' b'page'
4 29093935 b'"C-Class"_Wikipedia_books' b'page'
5 29110575 b'"C-Class"_Wikipedia_books' b'page'
6 29250310 b'"C-Class"_Wikipedia_books' b'page'
7 29320975 b'"C-Class"_Wikipedia_books' b'page'
8 29322320 b'"C-Class"_Wikipedia_books' b'page'
9 29325788 b'"C-Class"_Wikipedia_books' b'page'
In [8]:
pd.read_sql('''
select cl_from, cl_to, cl_type 
from enwiki_p.categorylinks
where cl_type = 'subcat'
limit 10
''', engine)
Out[8]:
cl_from cl_to cl_type
0 19328819 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
1 19857745 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
2 39389003 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
3 48581424 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
4 50531174 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
5 56547044 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
6 2534343 b'"Weird_Al"_Yankovic' b'subcat'
7 2890926 b'"Weird_Al"_Yankovic' b'subcat'
8 23594509 b'"Weird_Al"_Yankovic' b'subcat'
9 23594513 b'"Weird_Al"_Yankovic' b'subcat'

category

In [9]:
pd.read_sql('select * from enwiki_p.category limit 10', engine)
Out[9]:
cat_id cat_title cat_pages cat_subcats cat_files
0 2 b'Unprintworthy_redirects' 1409133 18 0
1 3 b'Computer_storage_devices' 89 10 0
2 7 b'Unknown-importance_Animation_articles' 326 23 0
3 8 b'Low-importance_Animation_articles' 13288 23 0
4 9 b'Vietnam_stubs' 335 9 0
5 10 b'Rivers_of_Vietnam' 10 3 0
6 11 b'Quang_Binh_Province' 16 4 0
7 12 b'All_articles_with_unsourced_statements' 372690 0 0
8 14 b'Wikipedia_articles_needing_clarification' 161 159 0
9 15 b'Articles_needing_additional_references_from_... 1523 0 0
In [10]:
pd.read_sql("select * from enwiki_p.category where cat_title='Anarchism' limit 10", engine)
Out[10]:
cat_id cat_title cat_pages cat_subcats cat_files
0 1659 b'Anarchism' 51 19 1

page_props

pp_page is a page_id. rows that have pp_propname = 'hiddencat' are categories that are not shown at the bottom of pages. these are typically only interesting for wikipedia editors or admins.

In [11]:
pd.read_sql('select * from enwiki_p.page_props limit 10', engine)
Out[11]:
pp_page pp_propname pp_value pp_sortkey
0 12 b'wikibase-badge-Q17437798' b'1' 1.0
1 12 b'wikibase-shortdesc' b'Political philosophy that advocates self-gov... NaN
2 12 b'wikibase_item' b'Q6199' NaN
3 25 b'page_image_free' b'Autism-stacking-cans_2nd_edit.jpg' NaN
4 25 b'wikibase-badge-Q17437796' b'1' 1.0
5 25 b'wikibase-shortdesc' b'Neurodevelopmental disorder involving proble... NaN
6 25 b'wikibase_item' b'Q38404' NaN
7 39 b'page_image_free' b'Albedo-e_hg.svg' NaN
8 39 b'wikibase_item' b'Q101038' NaN
9 290 b'page_image_free' b'A_cursiva.gif' NaN
In [12]:
pd.read_sql("""
select pp_page, pp_propname, page_title
from enwiki_p.page_props pp
join enwiki_p.page p on pp_page = p.page_id
where pp_propname = 'hiddencat'
limit 10""", engine)
Out[12]:
pp_page pp_propname page_title
0 747593 b'hiddencat' b'Possible_copyright_violations'
1 869361 b'hiddencat' b'Articles_to_be_split'
2 869371 b'hiddencat' b'Articles_to_be_merged'
3 885354 b'hiddencat' b'Accuracy_disputes'
4 885355 b'hiddencat' b'NPOV_disputes'
5 936375 b'hiddencat' b'Self-contradictory_articles'
6 1056245 b'hiddencat' b'1911_Britannica_articles_needing_updates'
7 1065461 b'hiddencat' b'Articles_to_be_expanded'
8 1114786 b'hiddencat' b'Festival_stubs'
9 1125232 b'hiddencat' b'Redirects_with_possibilities'

SQL Examples

List some main pages

In [13]:
pd.read_sql('''
select page_id, page_namespace, page_title
from enwiki_p.page 
where page_namespace=0 and page_is_redirect=0
order by page_id
limit 10''', engine)
Out[13]:
page_id page_namespace page_title
0 12 0 b'Anarchism'
1 25 0 b'Autism'
2 39 0 b'Albedo'
3 290 0 b'A'
4 303 0 b'Alabama'
5 305 0 b'Achilles'
6 307 0 b'Abraham_Lincoln'
7 308 0 b'Aristotle'
8 309 0 b'An_American_in_Paris'
9 316 0 b'Academy_Award_for_Best_Production_Design'

Start at a main page and find categories

In [16]:
pd.read_sql('''
select 
  cl_from as cl_from_id,
  p1.page_namespace as cl_from_ns,
  p1.page_title as cl_from_title,
  p2.page_id as cl_to_id,
  cl_to as cl_to_title, 
  cl_type,
  coalesce(pp.pp_propname, 'vizcat')='hiddencat' as hidden,
  c.cat_id,
  c.cat_pages,
  c.cat_subcats,
  c.cat_files
from enwiki_p.categorylinks as cl
inner join enwiki_p.page as p1
  on p1.page_id = cl_from
inner join enwiki_p.page as p2
  on p2.page_title = cl_to
left join enwiki_p.page_props as pp
  on p2.page_id = pp.pp_page
  and pp.pp_propname = 'hiddencat'
join enwiki_p.category as c
  on c.cat_title = cl_to
where 
  cl_from = 12 and 
  p2.page_namespace = 14
''', engine)
Out[16]:
cl_from_id cl_from_ns cl_from_title cl_to_id cl_to_title cl_type hidden cat_id cat_pages cat_subcats cat_files
0 12 0 b'Anarchism' 24815679 b'All_articles_lacking_reliable_references' b'page' 1 811130 73842 0 0
1 12 0 b'Anarchism' 9329647 b'All_articles_with_unsourced_statements' b'page' 1 12 372689 0 0
2 12 0 b'Anarchism' 780754 b'Anarchism' b'page' 0 1659 51 19 1
3 12 0 b'Anarchism' 22737025 b'Anti-capitalism' b'page' 0 518370 73 14 1
4 12 0 b'Anarchism' 7252754 b'Anti-fascism' b'page' 0 50218 33 6 0
5 12 0 b'Anarchism' 39696359 b'Articles_containing_French-language_text' b'page' 1 30959666 21945 1 0
6 12 0 b'Anarchism' 39705246 b'Articles_containing_Spanish-language_text' b'page' 1 30960077 16642 0 0
7 12 0 b'Anarchism' 60094242 b'Articles_lacking_reliable_references_from_Ma... b'page' 1 247617437 1138 0 0
8 12 0 b'Anarchism' 44259556 b'Articles_prone_to_spam_from_November_2014' b'page' 1 219224733 44 0 0
9 12 0 b'Anarchism' 56024289 b'Articles_with_short_description' b'page' 1 247193040 1390577 1 0
10 12 0 b'Anarchism' 61175942 b'Articles_with_unsourced_statements_from_July... b'page' 1 247721752 6311 0 0
11 12 0 b'Anarchism' 60094118 b'Articles_with_unsourced_statements_from_Marc... b'page' 1 247617302 6166 0 0
12 12 0 b'Anarchism' 3050197 b'Economic_ideologies' b'page' 0 116765 127 7 0
13 12 0 b'Anarchism' 15899799 b'Far-left_politics' b'page' 0 128115 85 12 0
14 12 0 b'Anarchism' 27529113 b'Good_articles' b'page' 1 27470393 30308 1 0
15 12 0 b'Anarchism' 33152009 b'Libertarian_socialism' b'page' 0 9892682 38 8 0
16 12 0 b'Anarchism' 1543366 b'Libertarianism' b'page' 0 1642 33 14 0
17 12 0 b'Anarchism' 21722732 b'Political_culture' b'page' 0 375591 57 18 0
18 12 0 b'Anarchism' 6243762 b'Political_ideologies' b'page' 0 249779 215 65 0
19 12 0 b'Anarchism' 717474 b'Political_movements' b'page' 0 249793 142 22 0
20 12 0 b'Anarchism' 23079219 b'Social_theories' b'page' 0 704944 132 9 0
21 12 0 b'Anarchism' 41506360 b'Use_British_English_from_January_2014' b'page' 1 196713806 2941 0 0
22 12 0 b'Anarchism' 57795375 b'Use_dmy_dates_from_July_2018' b'page' 1 247373412 11635 0 0
23 12 0 b'Anarchism' 38547981 b'Wikipedia_articles_with_BNF_identifiers' b'page' 1 169959383 218495 0 0
24 12 0 b'Anarchism' 38547977 b'Wikipedia_articles_with_GND_identifiers' b'page' 1 169959072 305503 0 0
25 12 0 b'Anarchism' 57707714 b'Wikipedia_articles_with_HDS_identifiers' b'page' 1 247364008 4999 0 0
26 12 0 b'Anarchism' 38547913 b'Wikipedia_articles_with_LCCN_identifiers' b'page' 1 169945358 467768 0 0
27 12 0 b'Anarchism' 57707725 b'Wikipedia_articles_with_NKC_identifiers' b'page' 1 247364030 124772 0 0
28 12 0 b'Anarchism' 20815170 b'Wikipedia_indefinitely_move-protected_pages' b'page' 1 588287 4646 8 47
In [ ]:
 
In [17]:
pd.read_sql('''
select 
  cl_from as cl_from_id,
  p1.page_namespace as cl_from_ns,
  p1.page_title as cl_from_title,
  p2.page_id as cl_to_id,
  cl_to as cl_to_title, 
  cl_type,
  coalesce(pp.pp_propname, 'vizcat')='hiddencat' as hidden,
  c.cat_id,
  c.cat_pages,
  c.cat_subcats,
  c.cat_files
from enwiki_p.categorylinks as cl
inner join enwiki_p.page as p1
  on p1.page_id = cl_from
inner join enwiki_p.page as p2
  on p2.page_title = cl_to
left join enwiki_p.page_props as pp
  on p2.page_id = pp.pp_page
  and pp.pp_propname = 'hiddencat'
join enwiki_p.category as c
  on c.cat_title = cl_to
where 
  cl_from = 780754 and 
  p2.page_namespace = 14
''', engine)
Out[17]:
cl_from_id cl_from_ns cl_from_title cl_to_id cl_to_title cl_type hidden cat_id cat_pages cat_subcats cat_files
0 780754 14 b'Anarchism' 22737025 b'Anti-capitalism' b'subcat' 0 518370 73 14 1
1 780754 14 b'Anarchism' 59055138 b'Commons_category_link_is_on_Wikidata' b'subcat' 1 247490234 668081 300968 0
2 780754 14 b'Anarchism' 3050197 b'Economic_ideologies' b'subcat' 0 116765 127 7 0
3 780754 14 b'Anarchism' 15899799 b'Far-left_politics' b'subcat' 0 128115 85 12 0
4 780754 14 b'Anarchism' 2764307 b'Philosophical_schools_and_traditions' b'subcat' 0 246567 66 21 0
5 780754 14 b'Anarchism' 21722732 b'Political_culture' b'subcat' 0 375591 57 18 0
6 780754 14 b'Anarchism' 6243762 b'Political_ideologies' b'subcat' 0 249779 215 65 0
7 780754 14 b'Anarchism' 717474 b'Political_movements' b'subcat' 0 249793 142 22 0
8 780754 14 b'Anarchism' 1013657 b'Political_philosophy' b'subcat' 0 250478 206 47 0
9 780754 14 b'Anarchism' 23772928 b'Statelessness' b'subcat' 0 742734 23 4 0
10 780754 14 b'Anarchism' 56294120 b'Wikipedia_categories_named_after_ideologies' b'subcat' 1 247226417 477 477 0
In [ ]:
 
In [39]:
sql_query = '''
SELECT 
  cl_from AS cl_from_id,
  p1.page_namespace AS cl_from_ns,
  p1.page_title AS cl_from_title,
  p2.page_id AS cl_to_id,
  cl_to AS cl_to_title, 
  cl_type,
  coalesce(pp.pp_propname, 'vizcat')='hiddencat' AS hidden,
  c.cat_id,
  c.cat_pages,
  c.cat_subcats,
  c.cat_files
FROM enwiki_p.categorylinks AS cl
JOIN enwiki_p.page AS p1
  ON p1.page_id = cl_from
JOIN enwiki_p.page AS p2
  ON p2.page_title = cl_to
LEFT JOIN enwiki_p.page_props AS pp
  ON p2.page_id = pp.pp_page
  AND pp.pp_propname = 'hiddencat'
JOIN enwiki_p.category AS c
  ON c.cat_title = cl_to
WHERE 
  p2.page_namespace = 14 AND 
  coalesce(pp.pp_propname, 'vizcat')='hiddencat' = 0 AND 
  p1.page_namespace IN (0, 6, 14)
limit 100000
'''
In [40]:
df_iter = pd.read_sql_query(sql_query, engine, chunksize=30)
ERROR:sqlalchemy.pool.impl.QueuePool:Exception during reset or similar
Traceback (most recent call last):
  File "/srv/paws/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 680, in _finalize_fairy
    fairy._reset(pool)
  File "/srv/paws/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 867, in _reset
    pool._dialect.do_rollback(self)
  File "/srv/paws/lib/python3.6/site-packages/sqlalchemy/dialects/mysql/base.py", line 2227, in do_rollback
    dbapi_connection.rollback()
  File "/srv/paws/lib/python3.6/site-packages/pymysql/connections.py", line 430, in rollback
    self._read_ok_packet()
  File "/srv/paws/lib/python3.6/site-packages/pymysql/connections.py", line 394, in _read_ok_packet
    pkt = self._read_packet()
  File "/srv/paws/lib/python3.6/site-packages/pymysql/connections.py", line 657, in _read_packet
    packet_header = self._read_bytes(4)
  File "/srv/paws/lib/python3.6/site-packages/pymysql/connections.py", line 707, in _read_bytes
    CR.CR_SERVER_LOST, "Lost connection to MySQL server during query")
pymysql.err.OperationalError: (2013, 'Lost connection to MySQL server during query')
In [32]:
df = next(df_iter)
In [35]:
df.iloc[0]['cl_from_title']
Out[35]:
b'GA-Class_Animation_articles_of_Unknown-importance'
In [36]:
df
Out[36]:
cl_from_id cl_from_ns cl_from_title cl_to_id cl_to_title cl_type hidden cat_id cat_pages cat_subcats cat_files
0 30178463 14 b'GA-Class_Animation_articles_of_Unknown-impor... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
1 30179900 14 b'List-Class_Animation_articles_of_Unknown-imp... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
2 30535345 14 b'NA-Class_Animation_articles_of_Unknown-impor... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
3 30530274 14 b'Portal-Class_Animation_articles_of_Unknown-i... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
4 30530352 14 b'Project-Class_Animation_articles_of_Unknown-... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
5 30535258 14 b'Redirect-Class_Animation_articles_of_Unknown... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
6 30180014 14 b'Start-Class_Animation_articles_of_Unknown-im... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
7 30179911 14 b'Stub-Class_Animation_articles_of_Unknown-imp... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
8 30535354 14 b'Template-Class_Animation_articles_of_Unknown... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
9 30179836 14 b'Unassessed_Animation_articles_of_Unknown-imp... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
10 30941582 14 b'Unknown-importance_Animation_articles_by_wor... 16374159 b'Unknown-importance_Animation_articles' b'subcat' 0 7 326 23 0
11 30179716 14 b'A-Class_Animation_articles_of_Low-importance' 16374148 b'Low-importance_Animation_articles' b'subcat' 0 8 13288 23 0
12 30180086 14 b'B-Class_Animation_articles_of_Low-importance' 16374148 b'Low-importance_Animation_articles' b'subcat' 0 8 13288 23 0
13 30182085 14 b'Book-Class_Animation_articles_of_Low-importa... 16374148 b'Low-importance_Animation_articles' b'subcat' 0 8 13288 23 0
14 30178480 14 b'C-Class_Animation_articles_of_Low-importance' 16374148 b'Low-importance_Animation_articles' b'subcat' 0 8 13288 23 0
15 30182191 14 b'Category-Class_Animation_articles_of_Low-imp... 16374148 b'Low-importance_Animation_articles' b'subcat' 0 8 13288 23 0
16 30182057 14 b'Disambig-Class_Animation_articles_of_Low-imp... 16374148 b'Low-importance_Animation_articles' b'subcat' 0 8 13288 23 0
17 44076999 14 b'Draft-Class_Animation_articles_of_Low-import... 16374148 b'Low-importance_Animation_articles' b'subcat' 0 8 13288 23 0
18 30179697 14 b'FA-Class_Animation_articles_of_Low-importance' 16374148 b'Low-importance_Animation_articles' b'subcat' 0 8 13288 23 0
19 30183717 14 b'File-Class_Animation_articles_of_Low-importa... 16374148 b'Low-importance_Animation_articles' b'subcat' 0 8 13288 23 0

Iterate rows

In [22]:
!ls
category-sql_demo.ipynb  qwikidata_demo.ipynb
geeq-sql_demo.ipynb	 sql_demo.ipynb
In [41]:
!df -h
Filesystem                                                                        Size  Used Avail Use% Mounted on
overlay                                                                            60G   22G   36G  38% /
tmpfs                                                                             4.0G     0  4.0G   0% /dev
tmpfs                                                                             4.0G     0  4.0G   0% /sys/fs/cgroup
nfs-tools-project.svc.eqiad.wmnet:/project/tools/project/paws/userhomes/43975115  8.0T  5.2T  2.4T  69% /home/paws
/dev/vda3                                                                          19G  3.3G   15G  19% /public/dumps
/dev/mapper/vd-second--local--disk                                                 60G   22G   36G  38% /etc/hostname
shm                                                                                64M     0   64M   0% /dev/shm
labstore1007.wikimedia.org:/dumps                                                  97T   64T   29T  70% /mnt/nfs/dumps-labstore1007.wikimedia.org
tmpfs                                                                             4.0G     0  4.0G   0% /sys/firmware
In [43]:
!ls /home/paws
category-sql_demo.ipynb  qwikidata_demo.ipynb
geeq-sql_demo.ipynb	 sql_demo.ipynb
In [ ]: