!pip install mysqltsv
Collecting mysqltsv
  Using cached mysqltsv-0.0.7.tar.gz
Installing collected packages: mysqltsv
  Running setup.py install for mysqltsv ... - \ done
Successfully installed mysqltsv-0.0.7
!pip install mwparserfromhell
Collecting mwparserfromhell
  Using cached mwparserfromhell-0.4.3.tar.gz
Installing collected packages: mwparserfromhell
  Running setup.py install for mwparserfromhell ... - \ | / - \ | done
Successfully installed mwparserfromhell-0.4.3
import sys
import mwxml
import mysqltsv
import mwparserfromhell as mwp
dump_files = !ls /public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles*.xml-*.bz2
dump_files
['/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles1.xml-p000000010p000030302.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles10.xml-p002336425p003046511.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles11.xml-p003046517p003926861.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles12.xml-p003926864p005040435.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles13.xml-p005040438p006197593.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles14.xml-p006197599p007744799.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles15.xml-p007744803p009518046.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles16.xml-p009518059p011539266.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles17.xml-p011539268p013693066.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles18.xml-p013693075p016120541.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles19.xml-p016120548p018754723.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles2.xml-p000030304p000088444.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles20.xml-p018754736p021222156.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles21.xml-p021222161p023927980.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles22.xml-p023927984p026823658.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles23.xml-p026823661p030503448.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles24.xml-p030503454p033952815.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles25.xml-p033952817p038067198.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles26.xml-p038067204p042663461.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles27.xml-p042663464p052158770.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles3.xml-p000088445p000200507.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles4.xml-p000200511p000352689.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles5.xml-p000352690p000565312.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles6.xml-p000565314p000892912.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles7.xml-p000892914p001268691.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles8.xml-p001268693p001791079.bz2',
 '/public/dumps/public/enwiki/20161101/enwiki-20161101-pages-articles9.xml-p001791081p002336422.bz2']
!touch enwiki_20161101_headings.tsv
def process_dump(dump, path):
    """Yield one row per section heading found in main-namespace pages.

    Intended as the per-file callback handed to ``mwxml.map``.

    Args:
        dump: Iterable of page objects. Each page exposes ``namespace``,
            ``redirect``, ``id`` and ``title`` and is itself iterable
            over revisions that carry a ``text`` attribute.
        path: Path of the dump file being processed. Unused here, but
            part of the callback signature.

    Yields:
        Tuples ``(page_id, page_title, page_ns, heading_level,
        heading_text)``, one for each heading of each revision that
        parsed successfully.

    Side effects:
        Writes a message to stderr for every revision whose text fails
        to parse, and prints the number of non-redirect namespace-0
        pages once the dump is exhausted.
    """
    number_of_articles = 0
    for page in dump:
        # Only the main (article) namespace is of interest.
        if page.namespace != 0:
            continue

        # Redirects are still scanned for headings below, but are not
        # counted as articles.
        if page.redirect is None:
            number_of_articles += 1

        for revision in page:
            try:
                wikicode = mwp.parse(revision.text or "")
                headings = list(wikicode.filter_headings())
            except Exception as e:
                # Best effort: the parser can fail on malformed wikitext.
                # Log and skip this revision. Falling through would either
                # hit an unbound ``headings`` or re-emit the previous
                # revision's headings under this page's id and title.
                sys.stderr.write("Failed to parse text: " + str(e) + "\n")
                continue

            for heading in headings:
                yield (page.id, page.title, page.namespace,
                       heading.level, str(heading.title))
    print("total articles: " + str(number_of_articles))
# Stream every heading row produced by process_dump across all dump files
# into a single TSV. The ``with`` block guarantees the file is flushed and
# closed even if processing is interrupted part-way through.
# UTF-8 is set explicitly because page titles and heading text are arbitrary
# Unicode and the platform default encoding is locale-dependent.
with open("enwiki_20161101_headings.tsv", "w", encoding="utf-8") as headings_file:
    output = mysqltsv.Writer(
        headings_file,
        headers=["page_id", "page_title", "page_ns", "heading_level", "heading_text"])

    for page_id, page_title, page_ns, heading_level, heading_text in mwxml.map(process_dump, dump_files):
        output.write([page_id, page_title, page_ns, heading_level, heading_text])
total articles: 15066
Failed to parse text: This is a bug and should be reported. Info: C tokenizer exited with non-empty token stack.
total articles: 123840
total articles: 134101
total articles: 146550
total articles: 148890
total articles: 163271
total articles: 166299
total articles: 167168
total articles: 26891
total articles: 236992
total articles: 235247
total articles: 221422
total articles: 231606
Failed to parse text: This is a bug and should be reported. Info: C tokenizer exited with non-empty token stack.
total articles: 275305
total articles: 268970
total articles: 309746
total articles: 303636
total articles: 65589
total articles: 362288
total articles: 49160
total articles: 65132
total articles: 424371
total articles: 78429
total articles: 91076
total articles: 105547
total articles: 107511
total articles: 751285