!pip install mysqltsv
Collecting mysqltsv
  Using cached mysqltsv-0.0.7.tar.gz
Installing collected packages: mysqltsv
  Running setup.py install for mysqltsv ... - \ done
Successfully installed mysqltsv-0.0.7
!pip install mwparserfromhell
Collecting mwparserfromhell
  Using cached mwparserfromhell-0.4.3.tar.gz
Installing collected packages: mwparserfromhell
  Running setup.py install for mwparserfromhell ... - \ | / - \ | done
Successfully installed mwparserfromhell-0.4.3
import sys
import mwxml
import mysqltsv
import mwparserfromhell as mwp
# Use the shell (IPython `!` capture) to list the frwiki 2016-11-01
# "pages-articles" dump parts (*.xml-*.bz2) and keep the paths in dump_files.
dump_files = !ls /public/dumps/public/frwiki/20161101/frwiki-20161101-pages-articles*.xml-*.bz2
# Echo the captured paths as the cell's output for a quick sanity check.
dump_files
['/public/dumps/public/frwiki/20161101/frwiki-20161101-pages-articles1.xml-p000000003p000412300.bz2',
 '/public/dumps/public/frwiki/20161101/frwiki-20161101-pages-articles2.xml-p000412304p001647873.bz2',
 '/public/dumps/public/frwiki/20161101/frwiki-20161101-pages-articles3.xml-p001647919p004419859.bz2',
 '/public/dumps/public/frwiki/20161101/frwiki-20161101-pages-articles4.xml-p004419860p010395851.bz2']
def process_dump(dump, path):
    """Yield one row per section heading found in main-namespace pages.

    Used as the per-dump-file worker handed to ``mwxml.map``.

    Args:
        dump: Iterable of pages. Each page exposes ``id``, ``title``,
            ``namespace`` and ``redirect`` and iterates over its revisions;
            each revision exposes ``text``.
        path: Path of the dump file being processed. Unused here; kept
            because the function is called with two arguments.

    Yields:
        Tuples ``(page_id, page_title, page_namespace, heading_level,
        heading_text)``, one per heading of every revision that parsed
        successfully.

    Side effects:
        Writes a "Failed to parse text" line to stderr for each revision
        whose wikitext cannot be parsed, and prints the number of
        non-redirect namespace-0 pages once the dump is exhausted.
    """
    number_of_articles = 0
    for page in dump:
        # Only the main (article) namespace is of interest.
        if page.namespace != 0:
            continue

        # Redirects are still scanned for headings below, but they are not
        # counted as articles.
        if page.redirect is None:
            number_of_articles += 1

        for revision in page:
            try:
                wikicode = mwp.parse(revision.text or "")
                headings = list(wikicode.filter_headings())
            except Exception as e:
                # Deliberately best-effort: one unparsable revision must not
                # abort a multi-hour dump run.
                sys.stderr.write("Failed to parse text: " + str(e) + "\n")
                # Skip this revision. Falling through would either hit an
                # unbound `headings` or re-emit the previous revision's
                # headings under this page's id.
                continue

            for heading in headings:
                yield page.id, page.title, page.namespace, heading.level, str(heading.title)
    print("total articles: " + str(number_of_articles))
!touch frwiki_20161101_headings.tsv
# Stream every heading row from all dump files into one TSV file.
# The `with` block guarantees the file is flushed and closed even if dump
# processing fails part-way. The explicit UTF-8 encoding keeps the output
# independent of the machine's locale (titles and headings are French text).
with open("frwiki_20161101_headings.tsv", "w", encoding="utf-8") as output_file:
    output = mysqltsv.Writer(
        output_file,
        headers=["page_id", "page_title", "page_ns", "heading_level", "heading_text"])

    for page_id, page_title, page_ns, heading_level, heading_text in mwxml.map(process_dump, dump_files):
        output.write([page_id, page_title, page_ns, heading_level, heading_text])
total articles: 158795
total articles: 323148
total articles: 406124
Failed to parse text: This is a bug and should be reported. Info: C tokenizer exited with BAD_ROUTE.
total articles: 920951