Replica Databases Helper

This is a importable notebook that provides simple helpers for performing queries on the labsdb replica databases from PAWS. It is stateful and designed to be easy to use in an interactive setup.

Import behavior

As soon as you import this notebook, it sets up a simple mysql connection pool to the replica. No actual connections are made until you try to use any of the functions.

It provides the following functions:

sql(query, *args, dbname=None)

This runs the given query (with passed in parameters) on the currently selected db. It returns an iterator

use_db(dbname)

import os
from sqlalchemy import create_engine
from sqlalchemy.sql import text
host = os.environ['MYSQL_HOST']
user = os.environ['MYSQL_USERNAME']
password = os.environ['MYSQL_PASSWORD']
connection_string = 'mysql+pymysql://{user}:{password}@{host}/?charset=utf8mb4'.format(
    user=user,
    password=password,
    host=host
)
engine = create_engine(connection_string, pool_recycle=300)
cur_db = 'enwiki_p'
def use_db(db):
    global cur_db
    if not db.endswith('_p'):
        db = db + '_p'
    cur_db = db
def pre_process_param(param):
    if type(param) == list:
        return tuple(param)
    return param
def sql(query_str, params={}, db=None):
    """
    Run the given SQL query with given 
    """
    def encode_if_necessary(b):
        if type(b) is bytes:
            return b.decode('utf8')
        return b
    
    if db is None:
        db = cur_db
    if type(db) != list:
        db = [db]
    query = text(query_str)
    con = engine.connect()
    for d in db:
        con.execute('use %s' % d)
        for k in params:
            params[k] = pre_process_param(params[k])
        for r in con.execute(query, **params):
            d = dict(r)
            # This is unfortunately necessary since I can not seem to get sqlalchemy
            # nor pymysql to co-erce all strings to unicode values. This might be
            # because our dbs don't have proper encoding set.
            unicode_d = {k: encode_if_necessary(v) for k, v in d.items()}
            yield unicode_d