!pip install mysqltsv
Collecting mysqltsv
  Using cached mysqltsv-0.0.7.tar.gz
Installing collected packages: mysqltsv
  Running setup.py install for mysqltsv ... - \ done
Successfully installed mysqltsv-0.0.7
!pip install mwparserfromhell
Collecting mwparserfromhell
  Using cached mwparserfromhell-0.4.3.tar.gz
Installing collected packages: mwparserfromhell
  Running setup.py install for mwparserfromhell ... - \ | / - \ | done
Successfully installed mwparserfromhell-0.4.3
import sys
import mwxml
import mysqltsv
import mwparserfromhell as mwp
# IPython shell capture (`!` syntax, not plain Python): store the output of
# `ls` for the itwiki 2016-11-01 pages-articles bz2 dump parts in `dump_files`.
dump_files = !ls /public/dumps/public/itwiki/20161101/itwiki-20161101-pages-articles*.xml-*.bz2
# Bare expression so the notebook displays the captured listing.
dump_files
['/public/dumps/public/itwiki/20161101/itwiki-20161101-pages-articles1.xml-p000000002p000442893.bz2',
 '/public/dumps/public/itwiki/20161101/itwiki-20161101-pages-articles2.xml-p000442894p001492776.bz2',
 '/public/dumps/public/itwiki/20161101/itwiki-20161101-pages-articles3.xml-p001492777p002874472.bz2',
 '/public/dumps/public/itwiki/20161101/itwiki-20161101-pages-articles4.xml-p002874482p006139713.bz2']
def process_dump(dump, path):
    """Yield one row per section heading found in main-namespace pages.

    Used as the per-dump callback passed to ``mwxml.map``.

    Args:
        dump: Iterable of page objects. Each page must expose ``namespace``,
            ``redirect``, ``id`` and ``title`` and be iterable over revisions
            that carry a ``text`` attribute (``None`` is treated as "").
        path: Not used by this function; kept because the caller supplies it
            (presumably the dump file path -- confirm against mwxml docs).

    Yields:
        Tuples ``(page.id, page.title, page.namespace, heading.level,
        str(heading.title))`` for every heading of every revision of pages
        in namespace 0. Redirect pages are still scanned for headings; they
        are only excluded from the article count.

    Side effects:
        Writes a message to stderr for each revision whose text cannot be
        parsed, and prints the number of non-redirect namespace-0 pages once
        the dump has been fully consumed.
    """
    number_of_articles = 0
    for page in dump:
        # Only the main (article) namespace is of interest.
        if page.namespace != 0:
            continue

        if page.redirect is None:
            number_of_articles += 1

        for revision in page:
            try:
                wikicode = mwp.parse(revision.text or "")
                headings = list(wikicode.filter_headings())
            except Exception as e:
                # Best effort: report the failure and skip this revision.
                # Without the `continue`, the loop below would either hit an
                # unbound `headings` (first revision) or re-emit the headings
                # of the previously parsed revision under this page's id.
                sys.stderr.write("Failed to parse text: " + str(e) + "\n")
                continue

            for heading in headings:
                yield (
                    page.id,
                    page.title,
                    page.namespace,
                    heading.level,
                    str(heading.title),
                )
    print("total articles: " + str(number_of_articles))
!touch itwiki_20161101_headings.tsv
# Stream every heading row produced by process_dump over all dump files into
# a TSV file. The file is opened in a `with` block so it is flushed and
# closed even if processing fails part-way; the original left the handle
# open, which can leave the tail of the output unwritten.
# UTF-8 is set explicitly because page titles and headings contain
# non-ASCII characters and the platform default encoding may differ.
with open("itwiki_20161101_headings.tsv", "w", encoding="utf-8") as headings_file:
    output = mysqltsv.Writer(
        headings_file,
        headers=["page_id", "page_title", "page_ns", "heading_level", "heading_text"])

    for page_id, page_title, page_ns, heading_level, heading_text in mwxml.map(process_dump, dump_files):
        output.write([page_id, page_title, page_ns, heading_level, heading_text])
total articles: 146924
total articles: 222907
Failed to parse text: This is a bug and should be reported. Info: C tokenizer exited with non-empty token stack.
total articles: 299029
Namespace id conflict detected.  <title>=Wikipedia:Pagine da cancellare/Log/2015 gennaio 26, <namespace>=0, mapped_namespace=4
total articles: 640789