!pip install mysqltsv
Collecting mysqltsv
  Using cached mysqltsv-0.0.7.tar.gz
Installing collected packages: mysqltsv
  Running setup.py install for mysqltsv ... - \ done
Successfully installed mysqltsv-0.0.7
!pip install mwparserfromhell
Collecting mwparserfromhell
  Using cached mwparserfromhell-0.4.3.tar.gz
Installing collected packages: mwparserfromhell
  Running setup.py install for mwparserfromhell ... - \ | / - \ | done
Successfully installed mwparserfromhell-0.4.3
import sys
import mwxml
import mysqltsv
import mwparserfromhell as mwp
dump_files = !ls /public/dumps/public/dewiki/20161001/dewiki-20161001-pages-articles*.xml-*.bz2
dump_files
['/public/dumps/public/dewiki/20161001/dewiki-20161001-pages-articles1.xml-p000000001p000425449.bz2',
 '/public/dumps/public/dewiki/20161001/dewiki-20161001-pages-articles2.xml-p000425451p001877043.bz2',
 '/public/dumps/public/dewiki/20161001/dewiki-20161001-pages-articles3.xml-p001877053p004964499.bz2',
 '/public/dumps/public/dewiki/20161001/dewiki-20161001-pages-articles4.xml-p004964505p009606033.bz2']
!touch dewiki_20161001_headings_2.tsv
!ls
First PAWS Notebook.ipynb	frwiki_20161001_headings_3.tsv
chunking.ipynb			frwiki_section_frequency.ipynb
chunking_analysis.ipynb		generate_rows_to_skip.ipynb
chunks_it_wp.csv		generate_sections_file.ipynb
de_wiki.ipynb			it_wp.ipynb
dewiki_20161001_headings.tsv	it_wp_articles_only.ipynb
dewiki_20161001_headings_2.tsv	it_wp_indices_to_skip
en_wiki.ipynb			it_wp_indices_to_skip_2
enwiki_20161001_headings.tsv	itwiki_20161001_headings.tsv
eswiki_20161001_headings.tsv	itwiki_20161001_headings_2.tsv
eswiki_20161001_headings_2.tsv	itwiki_20161001_headings_3.tsv
eswiki_20161001_headings_3.tsv	itwiki_20161001_headings_3_noWS.tsv
eswiki_20161001_headings_4.tsv	itwiki_analysis.ipynb
eswiki_analysis.ipynb		itwiki_section_frequency.ipynb
fr_wiki.ipynb			sample_file.tsv
frwiki_20161001_headings.tsv	testing_article_only_import.ipynb
frwiki_20161001_headings_2.tsv	testing_article_only_import_2.ipynb
def process_dump(dump, path):
    """Yield one row per section heading in every revision of a dump.

    Parameters
    ----------
    dump : mwxml.Dump
        Iterable of pages, each page an iterable of revisions
        (the contract mwxml.map passes to its callback).
    path : str
        Path of the dump file being processed (required by mwxml.map's
        callback signature; unused here).

    Yields
    ------
    tuple
        (page_id, page_title, page_namespace, heading_level, heading_text)
    """
    for page in dump:
        for revision in page:
            try:
                wikicode = mwp.parse(revision.text or "")
                headings = list(wikicode.filter_headings())
            except Exception as e:
                sys.stderr.write("Failed to parse text: " + str(e) + "\n")
                # Bug fix: the original fell through here, so `headings` was
                # either undefined (NameError on the first failing revision)
                # or stale from the previous revision, duplicating its rows.
                continue

            for heading in headings:
                yield (page.id, page.title, page.namespace,
                       heading.level, str(heading.title))
# Stream headings from every dump chunk into one TSV.
# Fix: the original opened the file handle without ever closing it, so the
# tail of the buffered output could be lost if the kernel died or restarted;
# a context manager guarantees flush + close when the loop finishes.
with open("dewiki_20161001_headings_2.tsv", "w") as headings_file:
    output = mysqltsv.Writer(
        headings_file,
        headers=["page_id", "page_title", "page_ns",
                 "heading_level", "heading_text"])

    # mwxml.map fans process_dump out across the dump files and merges
    # the yielded rows back into a single iterator.
    for page_id, page_title, page_ns, heading_level, heading_text in \
            mwxml.map(process_dump, dump_files):
        output.write([page_id, page_title, page_ns,
                      heading_level, heading_text])
type(output)
mysqltsv.writer.Writer
import pandas as pd
# Load a previously generated headings TSV for analysis; header=0 takes the
# first row (written by mysqltsv.Writer above) as the column names.
# NOTE(review): mysqltsv uses its own escaping for tabs/newlines — presumably
# compatible with pandas' default TSV parsing, but worth confirming on rows
# whose heading text contains tabs.
eswiki = pd.read_csv("eswiki_20161001_headings.tsv", header=0, sep='\t')