# Using some code from Aaron and Zareen;
# cf. https://meta.wikimedia.org/wiki/Research:Investigate_frequency_of_section_titles_in_5_large_Wikipedias
# e.g. https://paws-public.wmflabs.org/paws-public/45876923/italian/generate_it_headers_file.ipynb
!pip install mysqltsv
Collecting mysqltsv
  Using cached https://files.pythonhosted.org/packages/57/53/0951afa1db49c654dc85892a5f68704b1656b59471eac6848368f543bc0b/mysqltsv-0.0.7.tar.gz
Building wheels for collected packages: mysqltsv
  Running setup.py bdist_wheel for mysqltsv ... error
  Complete output from command /srv/paws/bin/python3.6 -u -c "import setuptools, tokenize;__file__='/tmp/pip-build-wf5zoua6/mysqltsv/setup.py';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n');f.close();exec(compile(code, __file__, 'exec'))" bdist_wheel -d /tmp/tmpfsxvcffspip-wheel- --python-tag cp36:
  usage: -c [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...]
     or: -c --help [cmd1 cmd2 ...]
     or: -c --help-commands
     or: -c cmd --help
  
  error: invalid command 'bdist_wheel'
  
  ----------------------------------------
  Failed building wheel for mysqltsv
  Running setup.py clean for mysqltsv
Failed to build mysqltsv
Installing collected packages: mysqltsv
  Running setup.py install for mysqltsv ... done
Successfully installed mysqltsv-0.0.7
!pip install mwparserfromhell
Requirement already satisfied: mwparserfromhell in /srv/paws/lib/python3.6/site-packages
import sys
import mwxml
import mwparserfromhell as mwp
import collections

Bavarian Wikipedia

# Collect the path(s) of the bz2-compressed pages-articles file(s) of the
# 2019-03-01 barwiki dump; IPython's `!` capture stores the lines printed by
# `ls` in dump_files.
dump_files = !ls /public/dumps/public/barwiki/20190301/barwiki-20190301-pages-articles.xml*.bz2
# Echo the captured paths as the cell output.
dump_files
['/public/dumps/public/barwiki/20190301/barwiki-20190301-pages-articles.xml.bz2']
# as a check, replicate process from https://paws-public.wmflabs.org/paws-public/45876923/italian/generate_it_headers_file.ipynb :
import mysqltsv
def process_dump(dump, path):
    """Yield one row per section heading found in main-namespace pages.

    Written as a per-file callback for ``mwxml.map``, which supplies the
    ``dump`` and ``path`` arguments.

    Args:
        dump: Iterable of pages. Each page must expose ``namespace``,
            ``redirect``, ``id`` and ``title`` and be iterable over its
            revisions; each revision must expose ``text``.
        path: Path of the dump file being processed (unused here).

    Yields:
        Tuples ``(page.id, page.title, page.namespace, heading.level,
        str(heading.title))``, one per heading of every revision.

    Side effects:
        Writes a message to stderr for every revision that fails to parse
        and prints the number of non-redirect articles once the dump is
        exhausted.
    """
    number_of_articles = 0
    for page in dump:
        if page.namespace != 0:
            continue

        # Redirects are left out of the article count, but their revisions
        # are still scanned for headings.
        if page.redirect is None:
            number_of_articles += 1

        for revision in page:
            try:
                wikicode = mwp.parse(revision.text or "")
                headings = list(wikicode.filter_headings())
            except Exception as e:
                sys.stderr.write("Failed to parse text: " + str(e) + "\n")
                # Skip this revision: without this, `headings` would be
                # unbound or still hold the previous revision's headings,
                # which would be emitted again under the wrong page.
                continue

            for heading in headings:
                yield page.id, page.title, page.namespace, heading.level, str(heading.title)
    print("total articles: " + str(number_of_articles))
!touch barwiki_20190301_headings.tsv
output = mysqltsv.Writer(
    open("barwiki_20190301_headings.tsv", "w"),
    headers=["page_id", "page_title", "page_ns", "heading_level", "heading_text"])

for page_id, page_title, page_ns, heading_level, heading_text in mwxml.map(process_dump, dump_files):
    output.write([page_id, page_title, page_ns, heading_level, heading_text])
total articles: 27833
def get_levels_from_dump(dump, path):
    """Yield, for each non-redirect main-namespace page, its set of heading levels.

    Written as a per-file callback for ``mwxml.map``, which supplies the
    ``dump`` and ``path`` arguments.

    Args:
        dump: Iterable of pages. Each page must expose ``namespace`` and
            ``redirect`` and be iterable over its revisions; each revision
            must expose ``text``.
        path: Path of the dump file being processed (unused here).

    Yields:
        One ``set`` per counted page holding the distinct ``heading.level``
        values found in its revisions; the set is empty when the page has
        no headings or none of its revisions could be parsed.

    Side effects:
        Writes a message to stderr for every revision that fails to parse
        and prints the number of counted pages once the dump is exhausted.
    """
    number_of_articles = 0

    for page in dump:
        # Only main-namespace pages that are not redirects count as articles.
        if page.namespace != 0 or page.redirect is not None:
            continue

        number_of_articles += 1
        levels_in_page = set()
        for revision in page:
            try:
                wikicode = mwp.parse(revision.text or "")
                headings = list(wikicode.filter_headings())
            except Exception as e:
                sys.stderr.write("Failed to parse text: " + str(e) + "\n")
                # Skip this revision: without this, `headings` would be
                # unbound or still hold headings of an earlier revision
                # (possibly of another page) and corrupt this page's levels.
                continue

            levels_in_page.update(heading.level for heading in headings)
        yield levels_in_page

    print("articles in dump: " + str(number_of_articles))
%%time
!date
total_number_of_articles = 0
total_level_freq = collections.Counter()

for levels_in_page in mwxml.map( get_levels_from_dump, dump_files):
    
    for level in levels_in_page:
        total_level_freq[level]+=1
    
    total_number_of_articles +=1

print("total number of articles: " + str(total_number_of_articles))

for level in total_level_freq:
        print('Level '+str(level)+': '+str(total_level_freq[level])+'  = ',str(round(100*total_level_freq[level]/total_number_of_articles,3))+'%')
Fri Mar 15 13:56:52 UTC 2019
articles in dump: 27833
total number of articles: 27833
Level 2: 24446  =  87.831%
Level 3: 2888  =  10.376%
Level 4: 262  =  0.941%
Level 5: 17  =  0.061%
Level 1: 7  =  0.025%
Level 6: 2  =  0.007%
CPU times: user 2min 30s, sys: 436 ms, total: 2min 30s
Wall time: 2min 31s
total_number_of_articles
27833

Italian Wikipedia

# Collect the paths of the bz2-compressed, numbered pages-articles parts of
# the 2019-03-01 itwiki dump; `[0-9]` matches a single-digit part number.
# IPython's `!` capture stores the lines printed by `ls` in dump_files.
dump_files = !ls /public/dumps/public/itwiki/20190301/itwiki-20190301-pages-articles[0-9].xml-*.bz2
# Echo the captured paths as the cell output.
dump_files
['/public/dumps/public/itwiki/20190301/itwiki-20190301-pages-articles1.xml-p2p277087.bz2',
 '/public/dumps/public/itwiki/20190301/itwiki-20190301-pages-articles2.xml-p277092p1057872.bz2',
 '/public/dumps/public/itwiki/20190301/itwiki-20190301-pages-articles3.xml-p1057873p1959895.bz2',
 '/public/dumps/public/itwiki/20190301/itwiki-20190301-pages-articles4.xml-p1959896p3147928.bz2',
 '/public/dumps/public/itwiki/20190301/itwiki-20190301-pages-articles5.xml-p3147954p4647954.bz2',
 '/public/dumps/public/itwiki/20190301/itwiki-20190301-pages-articles5.xml-p4647954p5013198.bz2',
 '/public/dumps/public/itwiki/20190301/itwiki-20190301-pages-articles6.xml-p5013200p6513200.bz2',
 '/public/dumps/public/itwiki/20190301/itwiki-20190301-pages-articles6.xml-p6513200p7845922.bz2']
%%time
!date
total_number_of_articles = 0
total_level_freq = collections.Counter()

for levels_in_page in mwxml.map( get_levels_from_dump, dump_files):
    
    for level in levels_in_page:
        total_level_freq[level]+=1
    
    total_number_of_articles +=1

print("total number of articles: " + str(total_number_of_articles))

for level in total_level_freq:
        print('... with Level '+str(level)+'s: '+str(total_level_freq[level])+'  = ',str(round(100*total_level_freq[level]/total_number_of_articles,3))+'%')
Fri Mar 15 14:44:20 UTC 2019
articles in dump: 117948
articles in dump: 149846
articles in dump: 207081
articles in dump: 74697
Namespace id conflict detected.  <title>=Wikipedia:Pagine da cancellare/Log/2015 gennaio 26, <namespace>=0, mapped_namespace=4
articles in dump: 251039
articles in dump: 137311
articles in dump: 259868
articles in dump: 312714
total number of articles: 1510504
... with Level 2s: 1417437  =  93.839%
... with Level 3s: 473253  =  31.331%
... with Level 4s: 92182  =  6.103%
... with Level 5s: 8402  =  0.556%
... with Level 6s: 588  =  0.039%
... with Level 1s: 12  =  0.001%
CPU times: user 2min 6s, sys: 15.4 s, total: 2min 22s
Wall time: 1h 24min 11s

English Wikipedia

# Collect the paths of the bz2-compressed, numbered pages-articles parts of
# the 2019-03-01 enwiki dump; the pattern requires a digit right after
# "pages-articles", so only the numbered split files are listed.
# IPython's `!` capture stores the lines printed by `ls` in dump_files.
dump_files = !ls /public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles[0-9]*.xml-*.bz2
# Echo the captured paths as the cell output.
dump_files
['/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles10.xml-p2336425p3046511.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles11.xml-p3046517p3926861.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles12.xml-p3926864p5040435.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles13.xml-p5040438p6197593.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles14.xml-p6197599p7697599.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles14.xml-p7697599p7744799.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles15.xml-p7744803p9244803.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles15.xml-p9244803p9518046.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles16.xml-p11018059p11539266.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles16.xml-p9518059p11018059.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles17.xml-p11539268p13039268.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles17.xml-p13039268p13693066.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles18.xml-p13693075p15193075.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles18.xml-p15193075p16120541.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles19.xml-p16120548p17620548.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles19.xml-p17620548p18754723.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles1.xml-p10p30302.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles20.xml-p18754736p20254736.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles20.xml-p20254736p21222156.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles21.xml-p21222161p22722161.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles21.xml-p22722161p23927980.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles22.xml-p23927984p25427984.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles22.xml-p25427984p26823658.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles23.xml-p26823661p28323661.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles23.xml-p28323661p29823661.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles23.xml-p29823661p30503448.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles24.xml-p30503454p32003454.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles24.xml-p32003454p33503454.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles24.xml-p33503454p33952815.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles25.xml-p33952817p35452817.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles25.xml-p35452817p36952817.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles25.xml-p36952817p38067198.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles26.xml-p38067204p39567204.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles26.xml-p39567204p41067204.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles26.xml-p41067204p42567204.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles26.xml-p42567204p42663461.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p42663464p44163464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p44163464p45663464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p45663464p47163464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p47163464p48663464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p48663464p50163464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p50163464p51663464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p51663464p53163464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p53163464p54663464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p54663464p56163464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p56163464p57663464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p57663464p59163464.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles27.xml-p59163464p60106664.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles2.xml-p30304p88444.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles3.xml-p88445p200507.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles4.xml-p200511p352689.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles5.xml-p352690p565312.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles6.xml-p565314p892912.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles7.xml-p892914p1268691.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles8.xml-p1268693p1791079.bz2',
 '/public/dumps/public/enwiki/20190301/enwiki-20190301-pages-articles9.xml-p1791081p2336422.bz2']
%%time
!date
total_number_of_articles = 0
total_level_freq = collections.Counter()

for levels_in_page in mwxml.map( get_levels_from_dump, dump_files):
    
    for level in levels_in_page:
        total_level_freq[level]+=1
    
    total_number_of_articles +=1

print("total number of articles: " + str(total_number_of_articles))

for level in total_level_freq:
        print('... with Level '+str(level)+'s: '+str(total_level_freq[level])+'  = ',str(round(100*total_level_freq[level]/total_number_of_articles,3))+'%')
Fri Mar 15 18:19:20 UTC 2019
articles in dump: 122859
articles in dump: 132931
articles in dump: 147472
articles in dump: 4544
articles in dump: 144981
articles in dump: 24259
articles in dump: 46236
articles in dump: 157114
articles in dump: 140186
articles in dump: 118978
articles in dump: 62504
articles in dump: 172269
articles in dump: 97694
articles in dump: 15032
articles in dump: 135281
articles in dump: 107331
articles in dump: 111924
articles in dump: 85848
articles in dump: 143569
articles in dump: 154845
articles in dump: 118437
articles in dump: 143363
articles in dump: 122574
articles in dump: 122367
articles in dump: 45653
articles in dump: 138703
articles in dump: 39311
articles in dump: 131755
articles in dump: 113380
articles in dump: 129520
articles in dump: 150955
articles in dump: 95657
articles in dump: 8393
articles in dump: 131296
articles in dump: 140500
articles in dump: 140660
articles in dump: 90481
articles in dump: 139710
articles in dump: 123146
articles in dump: 115287
articles in dump: 109209
articles in dump: 121640
articles in dump: 111683
articles in dump: 120126
articles in dump: 102943
articles in dump: 101919
articles in dump: 62426
articles in dump: 133547
articles in dump: 26817
articles in dump: 49076
articles in dump: 65502
articles in dump: 64915
articles in dump: 78110
articles in dump: 90707
articles in dump: 104890
articles in dump: 106609
total number of articles: 5817124
... with Level 2s: 5500191  =  94.552%
... with Level 3s: 1282941  =  22.055%
... with Level 4s: 172667  =  2.968%
... with Level 5s: 13084  =  0.225%
... with Level 6s: 763  =  0.013%
... with Level 1s: 443  =  0.008%
CPU times: user 7min 17s, sys: 1min, total: 8min 17s
Wall time: 5h 28min 9s