Imports

In [1]:
from sqlalchemy import create_engine
from sqlalchemy import MetaData
from sqlalchemy import dialects, PrimaryKeyConstraint, ForeignKeyConstraint
import sys, os
import pandas as pd

Create SQLAlchemy Engine

In [2]:
constr = 'mysql+pymysql://{user}:{pwd}@{host}'.format(
    user=os.environ['MYSQL_USERNAME'],
    pwd=os.environ['MYSQL_PASSWORD'],
    host=os.environ['MYSQL_HOST']
)
engine = create_engine(constr)

Describe Tables

page

The page table has a row for each page. The page_namespace column distinguishes the kinds of pages used in this notebook (a small lookup sketch follows this list):

  • namespace=0 is for main (article) pages
  • namespace=6 is for file pages
  • namespace=14 is for category pages
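
As a small sketch (not one of the original cells), these namespace numbers can be kept in a plain dict and used to label query results; the NAMESPACE_NAMES name is our own, not part of the schema:

NAMESPACE_NAMES = {
    0: 'main',       # ordinary article pages
    6: 'file',       # file description pages
    14: 'category',  # category pages
}

df = pd.read_sql(
    'select page_id, page_namespace, page_title from enwiki_p.page limit 5',
    engine,
)
df['namespace_name'] = df['page_namespace'].map(NAMESPACE_NAMES)
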
In [3]:
pd.read_sql('''
select page_id, page_namespace, page_title 
from enwiki_p.page 
where page_namespace=0
limit 10
offset 1000
''', engine)
Out[3]:
page_id page_namespace page_title
0 37515680 0 b'"Hot_Ice"_Hilda'
1 17612192 0 b'"Hot_Stuff"_Eddie_Gilbert'
2 2641607 0 b'"Hotshot"_Johnny_Devine'
3 29828002 0 b'"House,_M.D."'
4 37248992 0 b'"House_Hippo"'
5 31746200 0 b'"How\'s_my_driving?"_sign'
6 55123262 0 b'"How_You_do_That"'
7 41723702 0 b'"How_to_Be_a_Woman"'
8 25420739 0 b'"Howling"_Jack_Smith'
9 54175440 0 b'"Hoyoyo!"_Space_Adventure'
In [4]:
pd.read_sql('''
select page_id, page_namespace, page_title 
from enwiki_p.page 
where page_namespace=6
limit 10
offset 1000
''', engine)
Out[4]:
page_id page_namespace page_title
0 50385347 6 b'"The_Middle_Watch"_(1940).jpg'
1 41856004 6 b'"The_Mind_Benders"_(1963).jpg'
2 45092997 6 b'"The_Mind_of_Mr._J.G._Reader".jpg'
3 45261686 6 b'"The_Mind_of_Mr._Reeder"_(1939).jpg'
4 45430994 6 b'"The_Mind_of_Mr._Soames"_(1970).jpg'
5 45266363 6 b'"The_Missing_Million"_(1942).jpg'
6 45257436 6 b'"The_Missing_People"_(1940).jpg'
7 58874668 6 b'"The_Morning"_Paining_in_Space_by_Yervand_Ko...
8 45334178 6 b'"The_Music_Box_Kid"_(1960).jpg'
9 43232438 6 b'"The_Mutations".jpeg'
In [5]:
pd.read_sql('''
select page_id, page_namespace, page_title 
from enwiki_p.page 
where page_namespace=14
limit 10
offset 1000
''', engine)
Out[5]:
page_id page_namespace page_title
0 942353 14 b'1031'
1 62220874 14 b'1031_beginnings'
2 1310033 14 b'1031_births'
3 1459349 14 b'1031_deaths'
4 7475112 14 b'1031_disestablishments'
5 46784919 14 b'1031_disestablishments_in_Europe'
6 62262861 14 b'1031_endings'
7 19304010 14 b'1031_establishments'
8 46784950 14 b'1031_establishments_in_Europe'
9 47062838 14 b'1031_in_Europe'

categorylinks

cl_from is the page_id of the page to which the category is being assigned. This page_id can refer to a main page, a file page, or a category page.

cl_to is the page_title of a category page. Parent/child relationships between categories are indicated by rows in which the cl_from page_id refers to a category page.

cl_type is one of [page, file, subcat] and indicates the type of page that cl_from refers to.

Types of edges

  • file -> category
  • page -> category
  • category -> category

For example, the first row of the 'file' query below indicates that the file page with page_id 4878781 (Blue_Beetle_radio.jpg, looked up by id at the end of this notebook) is a member of the category 1940_comics_images. A sketch that samples all three edge types in a single query follows the three example queries below.

In [6]:
pd.read_sql('''
select cl_from, cl_to, cl_type 
from enwiki_p.categorylinks
where cl_type = 'file'
limit 10
offset 1000
''', engine)
Out[6]:
cl_from cl_to cl_type
0 4878781 b'1940_comics_images' b'file'
1 7962163 b'1940_comics_images' b'file'
2 8156862 b'1940_comics_images' b'file'
3 11082103 b'1940_comics_images' b'file'
4 14322623 b'1940_comics_images' b'file'
5 15209336 b'1940_comics_images' b'file'
6 20590511 b'1940_comics_images' b'file'
7 20590755 b'1940_comics_images' b'file'
8 36162043 b'1940s_United_States_bomber_aircraft' b'file'
9 15919649 b'1940s_comics_images' b'file'
In [7]:
pd.read_sql('''
select cl_from, cl_to, cl_type 
from enwiki_p.categorylinks
where cl_type = 'page'
limit 10
offset 1000
''', engine)
Out[7]:
cl_from cl_to cl_type
0 28990260 b'"C-Class"_Wikipedia_books' b'page'
1 29011033 b'"C-Class"_Wikipedia_books' b'page'
2 29012950 b'"C-Class"_Wikipedia_books' b'page'
3 29022099 b'"C-Class"_Wikipedia_books' b'page'
4 29093935 b'"C-Class"_Wikipedia_books' b'page'
5 29110575 b'"C-Class"_Wikipedia_books' b'page'
6 29250310 b'"C-Class"_Wikipedia_books' b'page'
7 29320975 b'"C-Class"_Wikipedia_books' b'page'
8 29322320 b'"C-Class"_Wikipedia_books' b'page'
9 29325788 b'"C-Class"_Wikipedia_books' b'page'
In [8]:
pd.read_sql('''
select cl_from, cl_to, cl_type 
from enwiki_p.categorylinks
where cl_type = 'subcat'
limit 10
''', engine)
Out[8]:
cl_from cl_to cl_type
0 19328819 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
1 19857745 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
2 39389003 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
3 48581424 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
4 50531174 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
5 56547044 b'"Part_of_a_series_on"_sidebar_templates' b'subcat'
6 2534343 b'"Weird_Al"_Yankovic' b'subcat'
7 2890926 b'"Weird_Al"_Yankovic' b'subcat'
8 23594509 b'"Weird_Al"_Yankovic' b'subcat'
9 23594513 b'"Weird_Al"_Yankovic' b'subcat'

category

In [9]:
pd.read_sql('select * from enwiki_p.category limit 10', engine)
Out[9]:
cat_id cat_title cat_pages cat_subcats cat_files
0 2 b'Unprintworthy_redirects' 1409132 18 0
1 3 b'Computer_storage_devices' 89 10 0
2 7 b'Unknown-importance_Animation_articles' 326 23 0
3 8 b'Low-importance_Animation_articles' 13288 23 0
4 9 b'Vietnam_stubs' 335 9 0
5 10 b'Rivers_of_Vietnam' 10 3 0
6 11 b'Quang_Binh_Province' 16 4 0
7 12 b'All_articles_with_unsourced_statements' 372691 0 0
8 14 b'Wikipedia_articles_needing_clarification' 161 159 0
9 15 b'Articles_needing_additional_references_from_... 1523 0 0
In [10]:
pd.read_sql("select * from enwiki_p.category where cat_title='Anarchism' limit 10", engine)
Out[10]:
cat_id cat_title cat_pages cat_subcats cat_files
0 1659 b'Anarchism' 51 19 1

page_props

pp_page is a page_id. Rows with pp_propname = 'hiddencat' mark category pages that are not shown in the category list at the bottom of articles; these hidden categories are typically only of interest to Wikipedia editors or admins. A short Python sketch for collecting them into a set follows the example queries below.

In [11]:
pd.read_sql('select * from enwiki_p.page_props limit 10', engine)
Out[11]:
pp_page pp_propname pp_value pp_sortkey
0 12 b'wikibase-badge-Q17437798' b'1' 1.0
1 12 b'wikibase-shortdesc' b'Political philosophy that advocates self-gov... NaN
2 12 b'wikibase_item' b'Q6199' NaN
3 25 b'page_image_free' b'Autism-stacking-cans_2nd_edit.jpg' NaN
4 25 b'wikibase-badge-Q17437796' b'1' 1.0
5 25 b'wikibase-shortdesc' b'Neurodevelopmental disorder involving proble... NaN
6 25 b'wikibase_item' b'Q38404' NaN
7 39 b'page_image_free' b'Albedo-e_hg.svg' NaN
8 39 b'wikibase_item' b'Q101038' NaN
9 290 b'page_image_free' b'A_cursiva.gif' NaN
In [12]:
pd.read_sql("""
select pp_page, pp_propname, page_title
from enwiki_p.page_props pp
join enwiki_p.page p on pp_page = p.page_id
where pp_propname = 'hiddencat'
limit 10""", engine)
Out[12]:
pp_page pp_propname page_title
0 747593 b'hiddencat' b'Possible_copyright_violations'
1 869361 b'hiddencat' b'Articles_to_be_split'
2 869371 b'hiddencat' b'Articles_to_be_merged'
3 885354 b'hiddencat' b'Accuracy_disputes'
4 885355 b'hiddencat' b'NPOV_disputes'
5 936375 b'hiddencat' b'Self-contradictory_articles'
6 1056245 b'hiddencat' b'1911_Britannica_articles_needing_updates'
7 1065461 b'hiddencat' b'Articles_to_be_expanded'
8 1114786 b'hiddencat' b'Festival_stubs'
9 1125232 b'hiddencat' b'Redirects_with_possibilities'
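
As a sketch (not one of the original cells), the hidden-category titles can be pulled into a Python set; any row whose cl_to is in this set can then be dropped when listing a page's categories. The limit is only there to keep the sample small:

hidden = pd.read_sql("""
select p.page_title
from enwiki_p.page_props pp
join enwiki_p.page p on pp.pp_page = p.page_id
where pp.pp_propname = 'hiddencat'
limit 1000""", engine)

# page_title values are bytes, matching the cl_to column of categorylinks.
hidden_titles = set(hidden['page_title'])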

SQL Examples

List some main pages

In [13]:
pd.read_sql('''
select page_id, page_namespace, page_title
from enwiki_p.page 
where page_namespace=0 and page_is_redirect=0
order by page_id
limit 10''', engine)
Out[13]:
page_id page_namespace page_title
0 12 0 b'Anarchism'
1 25 0 b'Autism'
2 39 0 b'Albedo'
3 290 0 b'A'
4 303 0 b'Alabama'
5 305 0 b'Achilles'
6 307 0 b'Abraham_Lincoln'
7 308 0 b'Aristotle'
8 309 0 b'An_American_in_Paris'
9 316 0 b'Academy_Award_for_Best_Production_Design'

Start at a main page and find categories

In [14]:
pd.read_sql('''
select 
  cl_from as cl_from_id,
  p1.page_title as cl_from_title,
  p2.page_id as cl_to_id,
  cl_to as cl_to_title, 
  cl_type,
  coalesce(pp.pp_propname, 'vizcat')='hiddencat' as hidden,
  c.cat_id,
  c.cat_pages,
  c.cat_subcats,
  c.cat_files
from enwiki_p.categorylinks as cl
inner join enwiki_p.page as p1
  on p1.page_id = cl_from
inner join enwiki_p.page as p2
  on p2.page_title = cl_to
left join enwiki_p.page_props as pp
  on p2.page_id = pp.pp_page
  and pp.pp_propname = 'hiddencat'
join enwiki_p.category as c
  on c.cat_title = cl_to
where 
  cl_from = 12 and 
  p2.page_namespace = 14
''', engine)
Out[14]:
cl_from_id cl_from_title cl_to_id cl_to_title cl_type hidden cat_id cat_pages cat_subcats cat_files
0 12 b'Anarchism' 24815679 b'All_articles_lacking_reliable_references' b'page' 1 811130 73843 0 0
1 12 b'Anarchism' 9329647 b'All_articles_with_unsourced_statements' b'page' 1 12 372691 0 0
2 12 b'Anarchism' 780754 b'Anarchism' b'page' 0 1659 51 19 1
3 12 b'Anarchism' 22737025 b'Anti-capitalism' b'page' 0 518370 73 14 1
4 12 b'Anarchism' 7252754 b'Anti-fascism' b'page' 0 50218 33 6 0
5 12 b'Anarchism' 39696359 b'Articles_containing_French-language_text' b'page' 1 30959666 21945 1 0
6 12 b'Anarchism' 39705246 b'Articles_containing_Spanish-language_text' b'page' 1 30960077 16642 0 0
7 12 b'Anarchism' 60094242 b'Articles_lacking_reliable_references_from_Ma... b'page' 1 247617437 1138 0 0
8 12 b'Anarchism' 44259556 b'Articles_prone_to_spam_from_November_2014' b'page' 1 219224733 44 0 0
9 12 b'Anarchism' 56024289 b'Articles_with_short_description' b'page' 1 247193040 1390563 1 0
10 12 b'Anarchism' 61175942 b'Articles_with_unsourced_statements_from_July... b'page' 1 247721752 6311 0 0
11 12 b'Anarchism' 60094118 b'Articles_with_unsourced_statements_from_Marc... b'page' 1 247617302 6166 0 0
12 12 b'Anarchism' 3050197 b'Economic_ideologies' b'page' 0 116765 127 7 0
13 12 b'Anarchism' 15899799 b'Far-left_politics' b'page' 0 128115 85 12 0
14 12 b'Anarchism' 27529113 b'Good_articles' b'page' 1 27470393 30308 1 0
15 12 b'Anarchism' 33152009 b'Libertarian_socialism' b'page' 0 9892682 38 8 0
16 12 b'Anarchism' 1543366 b'Libertarianism' b'page' 0 1642 33 14 0
17 12 b'Anarchism' 21722732 b'Political_culture' b'page' 0 375591 57 18 0
18 12 b'Anarchism' 6243762 b'Political_ideologies' b'page' 0 249779 215 65 0
19 12 b'Anarchism' 717474 b'Political_movements' b'page' 0 249793 142 22 0
20 12 b'Anarchism' 23079219 b'Social_theories' b'page' 0 704944 132 9 0
21 12 b'Anarchism' 41506360 b'Use_British_English_from_January_2014' b'page' 1 196713806 2941 0 0
22 12 b'Anarchism' 57795375 b'Use_dmy_dates_from_July_2018' b'page' 1 247373412 11635 0 0
23 12 b'Anarchism' 38547981 b'Wikipedia_articles_with_BNF_identifiers' b'page' 1 169959383 218494 0 0
24 12 b'Anarchism' 38547977 b'Wikipedia_articles_with_GND_identifiers' b'page' 1 169959072 305503 0 0
25 12 b'Anarchism' 57707714 b'Wikipedia_articles_with_HDS_identifiers' b'page' 1 247364008 4999 0 0
26 12 b'Anarchism' 38547913 b'Wikipedia_articles_with_LCCN_identifiers' b'page' 1 169945358 467768 0 0
27 12 b'Anarchism' 57707725 b'Wikipedia_articles_with_NKC_identifiers' b'page' 1 247364030 124772 0 0
28 12 b'Anarchism' 20815170 b'Wikipedia_indefinitely_move-protected_pages' b'page' 1 588287 4646 8 47
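
Because parent/child category relationships are just categorylinks rows whose cl_from is itself a category page, the same pattern walks one level up the category graph. A minimal sketch, starting from page_id 780754 (the Anarchism category page seen as cl_to_id in the output above):

pd.read_sql('''
select cl_from, cl_to, cl_type
from enwiki_p.categorylinks
where cl_from = 780754 and cl_type = 'subcat'
''', engine)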

Install setuptools and graphql_compiler

In [15]:
!pip install --upgrade setuptools
Requirement already up-to-date: setuptools in /srv/paws/lib/python3.6/site-packages
In [16]:
!pip install git+https://github.com/kensho-technologies/graphql-compiler
Collecting git+https://github.com/kensho-technologies/graphql-compiler
  Cloning https://github.com/kensho-technologies/graphql-compiler to /tmp/pip-p_9sl3fh-build
  Requirement already satisfied (use --upgrade to upgrade): graphql-compiler==1.11.0 from git+https://github.com/kensho-technologies/graphql-compiler in /srv/paws/lib/python3.6/site-packages
Requirement already satisfied: arrow<1,>=0.15.0 in /srv/paws/lib/python3.6/site-packages (from graphql-compiler==1.11.0)
Requirement already satisfied: frozendict<2,>=1.2 in /srv/paws/lib/python3.6/site-packages (from graphql-compiler==1.11.0)
Requirement already satisfied: funcy<2,>=1.7.3 in /srv/paws/lib/python3.6/site-packages (from graphql-compiler==1.11.0)
Requirement already satisfied: graphql-core<3,>=2.1 in /srv/paws/lib/python3.6/site-packages (from graphql-compiler==1.11.0)
Requirement already satisfied: pytz>=2017.2 in /srv/paws/lib/python3.6/site-packages (from graphql-compiler==1.11.0)
Requirement already satisfied: six>=1.10.0 in /srv/paws/lib/python3.6/site-packages (from graphql-compiler==1.11.0)
Requirement already satisfied: sqlalchemy<2,>=1.3.0 in /srv/paws/lib/python3.6/site-packages (from graphql-compiler==1.11.0)
Requirement already satisfied: python-dateutil in /srv/paws/lib/python3.6/site-packages (from arrow<1,>=0.15.0->graphql-compiler==1.11.0)
Requirement already satisfied: promise>=2.1 in /srv/paws/lib/python3.6/site-packages (from graphql-core<3,>=2.1->graphql-compiler==1.11.0)
Requirement already satisfied: rx<3,>=1.6 in /srv/paws/lib/python3.6/site-packages (from graphql-core<3,>=2.1->graphql-compiler==1.11.0)
In [17]:
import graphql_compiler
from graphql_compiler import get_sqlalchemy_schema_info
from graphql import print_schema
from graphql_compiler import graphql_to_sql

Reflect the Schema from the MariaDB server

In [18]:
metadata = MetaData(bind=engine)
In [19]:
from sqlalchemy import Table, Column, Integer, String, ForeignKey
from graphql_compiler.schema_generation.sqlalchemy.edge_descriptors import DirectEdgeDescriptor
In [20]:
page = Table(
    'page', 
    metadata, 
    Column('page_id', Integer, primary_key=True),
    autoload=True, 
    autoload_with=engine,
    schema='enwiki_p'
)
page
Out[20]:
Table('page', MetaData(bind=Engine(mysql+pymysql://Gabrielaltay:***@10.110.98.159)), Column('page_id', Integer(), table=<page>, primary_key=True, nullable=False), Column('page_namespace', INTEGER(display_width=11), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260be8d0>, for_update=False)), Column('page_title', VARBINARY(length=255), table=<page>), Column('page_restrictions', TINYBLOB(), table=<page>), Column('page_is_redirect', TINYINT(display_width=1, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260bea20>, for_update=False)), Column('page_is_new', TINYINT(display_width=1, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260befd0>, for_update=False)), Column('page_random', DOUBLE(asdecimal=True, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260d40b8>, for_update=False)), Column('page_touched', VARBINARY(length=14), table=<page>), Column('page_links_updated', VARBINARY(length=14), table=<page>), Column('page_latest', INTEGER(display_width=8, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260d41d0>, for_update=False)), Column('page_len', INTEGER(display_width=8, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260d4278>, for_update=False)), Column('page_content_model', VARBINARY(length=32), table=<page>), Column('page_lang', VARBINARY(length=35), table=<page>), schema='enwiki_p')
In [21]:
category = Table(
    'category', 
    metadata, 
    Column('cat_id', Integer, primary_key=True),
    Column('cat_title', String, ForeignKey('page.page_title')),
    autoload=True, 
    autoload_with=engine,
    schema='enwiki_p'
)
category
Out[21]:
Table('category', MetaData(bind=Engine(mysql+pymysql://Gabrielaltay:***@10.110.98.159)), Column('cat_id', Integer(), table=<category>, primary_key=True, nullable=False), Column('cat_title', String(), ForeignKey('page.page_title'), table=<category>), Column('cat_pages', INTEGER(display_width=11), table=<category>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260be7b8>, for_update=False)), Column('cat_subcats', INTEGER(display_width=11), table=<category>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260d4f28>, for_update=False)), Column('cat_files', INTEGER(display_width=11), table=<category>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260e1128>, for_update=False)), schema='enwiki_p')
In [22]:
categorylinks = Table(
    'categorylinks', 
    metadata, 
    Column('cl_from', Integer, ForeignKey('page.page_id'), primary_key=True),
    Column('cl_to', String, ForeignKey('page.page_title'), primary_key=True),
    Column('cl_type', String),
    autoload=True, 
    autoload_with=engine,
    schema='enwiki_p'
)
categorylinks
Out[22]:
Table('categorylinks', MetaData(bind=Engine(mysql+pymysql://Gabrielaltay:***@10.110.98.159)), Column('cl_from', Integer(), ForeignKey('page.page_id'), table=<categorylinks>, primary_key=True, nullable=False), Column('cl_to', String(), ForeignKey('page.page_title'), table=<categorylinks>, primary_key=True, nullable=False), Column('cl_sortkey', VARBINARY(length=230), table=<categorylinks>), Column('cl_timestamp', TIMESTAMP(), table=<categorylinks>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260e8320>, for_update=False)), Column('cl_sortkey_prefix', VARBINARY(length=255), table=<categorylinks>), Column('cl_collation', VARBINARY(length=32), table=<categorylinks>), Column('cl_type', String(), table=<categorylinks>), schema='enwiki_p')
In [23]:
page_props = Table(
    'page_props', 
    metadata, 
    Column('pp_page', Integer, ForeignKey('page.page_id'), primary_key=True),
    Column('pp_propname', String, primary_key=True),
    autoload=True, 
    autoload_with=engine,
    schema='enwiki_p'
)
page_props
Out[23]:
Table('page_props', MetaData(bind=Engine(mysql+pymysql://Gabrielaltay:***@10.110.98.159)), Column('pp_page', Integer(), ForeignKey('page.page_id'), table=<page_props>, primary_key=True, nullable=False), Column('pp_propname', String(), table=<page_props>, primary_key=True, nullable=False), Column('pp_value', BLOB(), table=<page_props>), Column('pp_sortkey', FLOAT(), table=<page_props>), schema='enwiki_p')
In [24]:
#metadata.reflect(schema='enwiki_p', views=True)
In [25]:
metadata.tables.keys()
Out[25]:
dict_keys(['enwiki_p.page', 'enwiki_p.category', 'enwiki_p.categorylinks', 'enwiki_p.page_props'])

Take a peek at the table objects

Note that the reflected views do not provide primary keys or foreign keys, because the tables we are working with are views. The keys that appear below are only there because we declared them explicitly when constructing the Table objects above; a quick inspection sketch follows.
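
A minimal check (not one of the original cells), using only attributes the Table objects above already have:

# Primary-key columns that are present only because we declared them
# explicitly on the categorylinks Table above.
print(metadata.tables['enwiki_p.categorylinks'].primary_key.columns.keys())

# Foreign keys declared explicitly above (cl_from -> page.page_id,
# cl_to -> page.page_title).
print(metadata.tables['enwiki_p.categorylinks'].foreign_keys)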

In [26]:
metadata.tables['enwiki_p.page']
Out[26]:
Table('page', MetaData(bind=Engine(mysql+pymysql://Gabrielaltay:***@10.110.98.159)), Column('page_id', Integer(), table=<page>, primary_key=True, nullable=False), Column('page_namespace', INTEGER(display_width=11), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260be8d0>, for_update=False)), Column('page_title', VARBINARY(length=255), table=<page>), Column('page_restrictions', TINYBLOB(), table=<page>), Column('page_is_redirect', TINYINT(display_width=1, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260bea20>, for_update=False)), Column('page_is_new', TINYINT(display_width=1, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260befd0>, for_update=False)), Column('page_random', DOUBLE(asdecimal=True, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260d40b8>, for_update=False)), Column('page_touched', VARBINARY(length=14), table=<page>), Column('page_links_updated', VARBINARY(length=14), table=<page>), Column('page_latest', INTEGER(display_width=8, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260d41d0>, for_update=False)), Column('page_len', INTEGER(display_width=8, unsigned=True), table=<page>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260d4278>, for_update=False)), Column('page_content_model', VARBINARY(length=32), table=<page>), Column('page_lang', VARBINARY(length=35), table=<page>), schema='enwiki_p')
In [27]:
metadata.tables['enwiki_p.category']
Out[27]:
Table('category', MetaData(bind=Engine(mysql+pymysql://Gabrielaltay:***@10.110.98.159)), Column('cat_id', Integer(), table=<category>, primary_key=True, nullable=False), Column('cat_title', String(), ForeignKey('page.page_title'), table=<category>), Column('cat_pages', INTEGER(display_width=11), table=<category>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260be7b8>, for_update=False)), Column('cat_subcats', INTEGER(display_width=11), table=<category>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260d4f28>, for_update=False)), Column('cat_files', INTEGER(display_width=11), table=<category>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260e1128>, for_update=False)), schema='enwiki_p')
In [28]:
metadata.tables['enwiki_p.categorylinks']
Out[28]:
Table('categorylinks', MetaData(bind=Engine(mysql+pymysql://Gabrielaltay:***@10.110.98.159)), Column('cl_from', Integer(), ForeignKey('page.page_id'), table=<categorylinks>, primary_key=True, nullable=False), Column('cl_to', String(), ForeignKey('page.page_title'), table=<categorylinks>, primary_key=True, nullable=False), Column('cl_sortkey', VARBINARY(length=230), table=<categorylinks>), Column('cl_timestamp', TIMESTAMP(), table=<categorylinks>, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x7feb260e8320>, for_update=False)), Column('cl_sortkey_prefix', VARBINARY(length=255), table=<categorylinks>), Column('cl_collation', VARBINARY(length=32), table=<categorylinks>), Column('cl_type', String(), table=<categorylinks>), schema='enwiki_p')
In [29]:
metadata.tables['enwiki_p.page_props']
Out[29]:
Table('page_props', MetaData(bind=Engine(mysql+pymysql://Gabrielaltay:***@10.110.98.159)), Column('pp_page', Integer(), ForeignKey('page.page_id'), table=<page_props>, primary_key=True, nullable=False), Column('pp_propname', String(), table=<page_props>, primary_key=True, nullable=False), Column('pp_value', BLOB(), table=<page_props>), Column('pp_sortkey', FLOAT(), table=<page_props>), schema='enwiki_p')

Define direct edges

In [31]:
# Note: this edge descriptor is defined here but is not passed to
# get_sqlalchemy_schema_info below (an empty dict is passed instead),
# so the generated GraphQL schema contains no edge fields.
direct_edges = {
    'Page_InCategory': DirectEdgeDescriptor(
        from_vertex='enwiki_p.categorylinks',
        from_column='cl_from',
        to_vertex='enwiki_p.categorylinks',
        to_column='cl_to'
    )
}

Create vertex name to table mapping

In [32]:
vertex_name_to_table = {
    'page': metadata.tables['enwiki_p.page'],
    'category': metadata.tables['enwiki_p.category'],
    'categorylinks': metadata.tables['enwiki_p.categorylinks'],
    'page_props': metadata.tables['enwiki_p.page_props'],
}

Create graphql_compiler schema info

In [33]:
sql_schema_info = get_sqlalchemy_schema_info(
    vertex_name_to_table, {}, dialects.mysql.dialect())
/srv/paws/lib/python3.6/site-packages/graphql_compiler/schema_generation/sqlalchemy/scalar_type_mapper.py:178: UserWarning: Ignoring column "page_restrictions" with unsupported SQL datatype: TINYBLOB
  .format(column_name, type(column_type).__name__))
/srv/paws/lib/python3.6/site-packages/graphql_compiler/schema_generation/sqlalchemy/scalar_type_mapper.py:178: UserWarning: Ignoring column "pp_value" with unsupported SQL datatype: BLOB
  .format(column_name, type(column_type).__name__))
In [34]:
print(print_schema(sql_schema_info.schema))
schema {
  query: RootSchemaQuery
}

directive @filter(op_name: String!, value: [String!]) on FIELD | INLINE_FRAGMENT

directive @tag(tag_name: String!) on FIELD

directive @output(out_name: String!) on FIELD

directive @output_source on FIELD

directive @optional on FIELD

directive @recurse(depth: Int!) on FIELD

directive @fold on FIELD

scalar DateTime

type RootSchemaQuery {
  category: [category]
  categorylinks: [categorylinks]
  page: [page]
  page_props: [page_props]
}

type category {
  _x_count: Int
  cat_files: Int
  cat_id: Int
  cat_pages: Int
  cat_subcats: Int
  cat_title: String
}

type categorylinks {
  _x_count: Int
  cl_collation: String
  cl_from: Int
  cl_sortkey: String
  cl_sortkey_prefix: String
  cl_timestamp: DateTime
  cl_to: String
  cl_type: String
}

type page {
  _x_count: Int
  page_content_model: String
  page_id: Int
  page_is_new: Int
  page_is_redirect: Int
  page_lang: String
  page_latest: Int
  page_len: Int
  page_links_updated: String
  page_namespace: Int
  page_random: Float
  page_title: String
  page_touched: String
}

type page_props {
  _x_count: Int
  pp_page: Int
  pp_propname: String
  pp_sortkey: Float
}

In [35]:
graphql_query = """                                                                                                                                            
{                                                                                                                                                              
  page {                                                                                                                                       
    page_title @output(out_name: "page_title") 
    # @filter(op_name: ">", value:["$pt_min"])                                                         
  }                                                                                                                                                           
}                                                                                                                                                              
"""                                                                                                                                                            
graphql_args = {
    # "pt_min": "b"
}                                                                                                                                                              
                                                                                                                                                               
compilation_result = graphql_to_sql(sql_schema_info, graphql_query, graphql_args)                                                                              
In [36]:
str_qry = str(compilation_result.query.compile())
In [37]:
print(str_qry)
SELECT page_1.page_title AS page_title 
FROM enwiki_p.page AS page_1
In [38]:
res = engine.execute(
    str_qry + " limit 10", 
    #pt_min="b",
)
In [39]:
res.fetchall()
Out[39]:
[(b'!',),
 (b'!!',),
 (b'!!!',),
 (b'!!!Fuck_You!!!',),
 (b'!!!Fuck_You!!!_And_Then_Some',),
 (b'!!!Fuck_You!!!_and_Then_Some',),
 (b'!!!_(!!!_album)',),
 (b'!!!_(American_band)',),
 (b'!!!_(Chk_Chk_Chk)',),
 (b'!!!_(album)',)]
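
To see the commented-out filter in action, the same query can be recompiled with the @filter directive enabled and the pt_min parameter supplied. A sketch that only compiles and prints the SQL (it does not execute it):

graphql_query_filtered = """
{
  page {
    page_title @output(out_name: "page_title")
               @filter(op_name: ">", value: ["$pt_min"])
  }
}
"""
filtered_result = graphql_to_sql(
    sql_schema_info, graphql_query_filtered, {"pt_min": "b"})

# The compiled statement now carries a WHERE clause bound to the
# pt_min parameter.
print(filtered_result.query.compile())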
In [40]:
engine.execute("select * from enwiki_p.page where page_title = 'Computer_storage_devices' limit 10").fetchall()
Out[40]:
[(895945, 14, b'Computer_storage_devices', b'', 0, 0, 0.473992695321, b'20191030013441', b'20191030024418', 913055482, 206, b'wikitext', None),
 (1225869, 15, b'Computer_storage_devices', b'', 0, 0, 0.989071707217, b'20160814195210', b'20140825165013', 478600727, 493, b'wikitext', None),
 (42371130, 0, b'Computer_storage_devices', b'', 1, 0, 0.599298894658, b'20191109071437', b'20190703083157', 602293536, 63, b'wikitext', None)]
In [41]:
engine.execute("SELECT VERSION();").fetchall()
Out[41]:
[('10.1.42-MariaDB',)]
In [42]:
engine.execute("select * from enwiki_p.page where page_id=14124672").fetchall()
Out[42]:
[(14124672, 6, b'!!!_-_!!!_album_cover.jpg', b'', 0, 0, 0.369205268908, b'20191111111759', b'20191111165600', 801939430, 1287, b'wikitext', None)]
In [43]:
engine.execute("select * from enwiki_p.page where page_namespace=6 limit 10").fetchall()
Out[43]:
[(49816251, 6, b'!!!_(Chk_Chk_Chk)_-_One_Girl_One_Boy_cover_art.jpg', b'', 0, 0, 0.525450626227, b'20191111112439', b'20191112001440', 819212828, 571, b'wikitext', None),
 (14124672, 6, b'!!!_-_!!!_album_cover.jpg', b'', 0, 0, 0.369205268908, b'20191111111759', b'20191111165600', 801939430, 1287, b'wikitext', None),
 (59465719, 6, b'!!e!VBQQ!mM_$(KGrHqEOKi8E03iU,-u!BNP3+G6Mqw_1.jpg', b'', 1, 1, 0.770836454707, b'20190516113002', b'20181222215843', 874974667, 88, b'wikitext', None),
 (51956677, 6, b'!0_Trombones_Like_2_Pianos.jpg', b'', 0, 0, 0.584178402679, b'20191111112455', b'20191112001440', 761365147, 588, b'wikitext', None),
 (60432898, 6, b'!Haunu.ogg', b'', 0, 0, 0.425876150404, b'20191030132342', b'20191030165938', 891623471, 874, b'wikitext', None),
 (35907496, 6, b'!Hero_(album).jpg', b'', 0, 0, 0.478076758525, b'20191111112233', b'20191111201658', 808066152, 407, b'wikitext', None),
 (33662256, 6, b'!Women_Art_Revolution_(documentary_film)_poster_art.jpg', b'', 0, 0, 0.076640717253, b'20191111111929', b'20191111161705', 821220027, 962, b'wikitext', None),
 (62014251, 6, b'!_(The_Song_Formely_Known_As)_by_Regurgitator.png', b'', 0, 0, 0.429478722035, b'20191111114750', b'20191111111748', 921699931, 425, b'wikitext', None),
 (33004262, 6, b'""Motor-Cycle"_LP_cover-Lotti_Golden.jpg', b'', 0, 0, 0.215892726423, b'20191111112159', b'20191112001507', 821872130, 552, b'wikitext', None),
 (47123102, 6, b'"...And_Mother_Makes_Five".jpg', b'', 0, 0, 0.476580930384, b'20191111111729', b'20191111114604', 686812952, 972, b'wikitext', None)]
In [44]:
engine.execute("select * from enwiki_p.page where page_id=4878781").fetchall()
Out[44]:
[(4878781, 6, b'Blue_Beetle_radio.jpg', b'', 0, 0, 0.070720384761, b'20191111111657', b'20191111112908', 831761532, 813, b'wikitext', None)]