!pip install mysqltsv
Collecting mysqltsv
  Using cached mysqltsv-0.0.7.tar.gz
Installing collected packages: mysqltsv
  Running setup.py install for mysqltsv ... - \ done
Successfully installed mysqltsv-0.0.7
!pip install mwparserfromhell
Collecting mwparserfromhell
  Using cached mwparserfromhell-0.4.3.tar.gz
Installing collected packages: mwparserfromhell
  Running setup.py install for mwparserfromhell ... - \ | / - \ | done
Successfully installed mwparserfromhell-0.4.3
import sys
import mwxml
import mysqltsv
import mwparserfromhell as mwp
# Shell out to `ls` (IPython `!` syntax) and capture the paths matching the
# eswiki 2016-11-01 "pages-articles" multi-part bz2 dump glob.
dump_files = !ls /public/dumps/public/eswiki/20161101/eswiki-20161101-pages-articles*.xml-*.bz2
# Echo the captured paths as the cell result.
dump_files
['/public/dumps/public/eswiki/20161101/eswiki-20161101-pages-articles1.xml-p000000005p000229076.bz2',
 '/public/dumps/public/eswiki/20161101/eswiki-20161101-pages-articles2.xml-p000229079p001083449.bz2',
 '/public/dumps/public/eswiki/20161101/eswiki-20161101-pages-articles3.xml-p001083458p003407509.bz2',
 '/public/dumps/public/eswiki/20161101/eswiki-20161101-pages-articles4.xml-p003407510p007641562.bz2']
# Create the output TSV file up front if it does not exist yet.
!touch eswiki_20161101_headings.tsv
def process_dump(dump, path):
    """Yield one row per section heading found in main-namespace pages.

    Args:
        dump: Iterable of page objects; each page exposes ``namespace``,
            ``redirect``, ``id`` and ``title`` and is itself iterable over
            revisions that expose ``text``.
        path: Path of the dump file being processed. Unused here; it is part
            of the signature because this function is handed to
            ``mwxml.map`` as its processor callback.

    Yields:
        Tuples of ``(page_id, page_title, page_namespace, heading_level,
        heading_text)``.

    Side effects:
        Writes a message to stderr for every revision whose text fails to
        parse, and prints the number of non-redirect namespace-0 pages once
        the dump has been fully consumed.
    """
    number_of_articles = 0
    for page in dump:
        if page.namespace != 0:
            continue

        # Redirects are left out of the article count, but their revisions
        # are still scanned for headings (same as before).
        if page.redirect is None:
            number_of_articles += 1

        for revision in page:
            try:
                wikicode = mwp.parse(revision.text or "")
                headings = list(wikicode.filter_headings())
            except Exception as e:
                sys.stderr.write("Failed to parse text: " + str(e) + "\n")
                # Skip this revision: without this, `headings` would be
                # unbound (NameError) or would still hold the previous
                # revision's headings, which would then be emitted again
                # under the wrong page.
                continue

            for heading in headings:
                yield (
                    page.id,
                    page.title,
                    page.namespace,
                    heading.level,
                    str(heading.title),
                )
    print("total articles: " + str(number_of_articles))
# Stream every heading row produced from the dump files into one TSV file.
# The file is opened in a `with` block so it is flushed and closed even if
# processing fails part-way; the encoding is pinned to UTF-8 because Spanish
# titles and headings contain non-ASCII characters.
with open("eswiki_20161101_headings.tsv", "w", encoding="utf-8") as output_file:
    output = mysqltsv.Writer(
        output_file,
        headers=["page_id", "page_title", "page_ns", "heading_level", "heading_text"])

    for page_id, page_title, page_ns, heading_level, heading_text in mwxml.map(process_dump, dump_files):
        output.write([page_id, page_title, page_ns, heading_level, heading_text])
Failed to parse text: This is a bug and should be reported. Info: C tokenizer exited with non-empty token stack.
total articles: 67027
total articles: 176261
total articles: 307253
Failed to parse text: This is a bug and should be reported. Info: C tokenizer exited with non-empty token stack.
Failed to parse text: This is a bug and should be reported. Info: C tokenizer exited with non-empty token stack.
total articles: 699666