import pymysql
import os
import pandas as pd
import sqlite3
# Short alias for the process environment; the MySQL host and credentials
# are read from it, and a missing variable raises KeyError.
env = os.environ

# Local SQLite database kept in the file 'my.db'.
db = sqlite3.connect('my.db')

# Connection to the 'enwiki_p' MySQL database.
conn = pymysql.connect(host=env['MYSQL_HOST'],
                       user=env['MYSQL_USERNAME'],
                       password=env['MYSQL_PASSWORD'],
                       database='enwiki_p',
                       charset='utf8')

# Display the MySQL host name taken from the environment.
host = env['MYSQL_HOST']
host
'10.97.130.38'
# Collect the IDs of all pages linked to the category
# 'All_WikiProject_Medicine_articles'.
#
# The category title itself contains underscores, and `_` is a
# single-character wildcard inside a LIKE pattern, so LIKE would also accept
# titles that merely have the same length and letters with arbitrary
# characters at the underscore positions. The pattern has no `%`, i.e. an
# exact match is what is wanted, so the title is compared with `=`.
sql_pages = """
SELECT DISTINCT cl_from AS page_id
FROM categorylinks
WHERE cl_to = 'All_WikiProject_Medicine_articles'
ORDER BY page_id
"""
# Pre-initialised so `pages` is defined even if the query below fails.
pages = []
with conn.cursor() as cur:
    # Select the database explicitly before running the query.
    cur.execute('use enwiki_p')
    cur.execute(sql_pages)
    # The query selects a single column; keep only that value from each row.
    pages = [row[0] for row in cur.fetchall()]
# Preview the first five page IDs.
pages[:5]
[30974924, 30999381, 31000036, 793, 2013]
src_files = !ls /mnt/nfs/dumps-labstore1007.wikimedia.org/xmldatadumps/public/enwiki/latest/*pages-articles*.bz2
src_files[1]
'/mnt/nfs/dumps-labstore1007.wikimedia.org/xmldatadumps/public/enwiki/latest/enwiki-latest-pages-articles11.xml-p3046517p3926861.bz2'
os.system('cp {} my.bz2'.format(src_files[1]))
0
# List the working directory in long format with human-readable file sizes.
!ls -lh
total 2.5G
-rw-r--r-- 1 tools.paws tools.paws 368M Oct 26 19:18 my.bz2
-rw-r--r-- 1 tools.paws tools.paws    0 Oct 26 18:58 my.db
-rw-r--r-- 1 tools.paws tools.paws 2.1G Oct 26 19:16 my.xml.bz2
-rw-r--r-- 1 tools.paws tools.paws 3.1K Oct 26 19:19 Test.ipynb
-rw-r--r-- 1 tools.paws tools.paws   72 Oct 26 18:47 Untitled.ipynb
# Decompress my.xml.bz2 with bzip2.
# NOTE(review): the copy step earlier in this file writes my.bz2, not
# my.xml.bz2, and the directory listing above shows the two as different
# files (368M vs 2.1G) — confirm this is the archive meant to be
# decompressed.
!bzip2 -d my.xml.bz2