!pip install mysqltsv
Collecting mysqltsv
  Using cached mysqltsv-0.0.7.tar.gz
Installing collected packages: mysqltsv
  Running setup.py install for mysqltsv ... - \ done
Successfully installed mysqltsv-0.0.7
!pip install mwparserfromhell
Collecting mwparserfromhell
  Using cached mwparserfromhell-0.4.3.tar.gz
Installing collected packages: mwparserfromhell
  Running setup.py install for mwparserfromhell ... - \ | / - \ | done
Successfully installed mwparserfromhell-0.4.3
import sys
import mwxml
import mysqltsv
import mwparserfromhell as mwp
dump_files = !ls /public/dumps/public/dewiki/20161101/dewiki-20161101-pages-articles*.xml-*.bz2
dump_files
['/public/dumps/public/dewiki/20161101/dewiki-20161101-pages-articles1.xml-p000000001p000425449.bz2',
 '/public/dumps/public/dewiki/20161101/dewiki-20161101-pages-articles2.xml-p000425451p001877043.bz2',
 '/public/dumps/public/dewiki/20161101/dewiki-20161101-pages-articles3.xml-p001877053p004964499.bz2',
 '/public/dumps/public/dewiki/20161101/dewiki-20161101-pages-articles4.xml-p004964505p009642338.bz2']
!touch dewiki_20161101_headings.tsv
def process_dump(dump, path):
    """Yield one row per section heading found in main-namespace pages.

    Written as a per-dump callback for ``mwxml.map``.

    Args:
        dump: Iterable of page objects. Each page exposes ``namespace``,
            ``redirect``, ``id`` and ``title`` and is itself iterable
            over revision objects that carry a ``text`` attribute.
        path: Not used here; kept because the function is handed to
            ``mwxml.map`` as its callback (presumably called with the
            dump and its file path -- confirm against the mwxml docs).

    Yields:
        Tuples ``(page_id, page_title, page_ns, heading_level,
        heading_text)``.

    Side effects:
        Writes a message to stderr for every revision that fails to
        parse, and prints the number of non-redirect namespace-0 pages
        once the dump has been fully consumed.
    """
    number_of_articles = 0
    for page in dump:
        if page.namespace != 0:
            continue

        # Redirects are left out of the article count, but their revisions
        # are still scanned for headings below.
        if page.redirect is None:
            number_of_articles += 1

        for revision in page:
            try:
                wikicode = mwp.parse(revision.text or "")
                headings = list(wikicode.filter_headings())
            except Exception as e:
                sys.stderr.write("Failed to parse text: " + str(e) + "\n")
                # Skip this revision. Falling through would either hit an
                # unbound ``headings`` or re-emit the previous revision's
                # headings under this page's id and title.
                continue

            for heading in headings:
                yield page.id, page.title, page.namespace, heading.level, str(heading.title)
    print("total articles: " + str(number_of_articles))
# Stream every heading row produced by process_dump over all dump files into
# a single TSV file.
# The context manager makes sure the file is flushed and closed even if
# processing fails part-way; the encoding is explicit so that non-ASCII
# titles and headings do not depend on the kernel's locale.
with open("dewiki_20161101_headings.tsv", "w", encoding="utf-8") as tsv_file:
    output = mysqltsv.Writer(
        tsv_file,
        headers=["page_id", "page_title", "page_ns", "heading_level", "heading_text"])

    for page_id, page_title, page_ns, heading_level, heading_text in mwxml.map(process_dump, dump_files):
        output.write([page_id, page_title, page_ns, heading_level, heading_text])
total articles: 154182
total articles: 333054
total articles: 548767
total articles: 957195