German Wikipedia Heading Frequency

This notebook cannot run in PAWS as it exceeds the current memory limitations (1GB).

This notebook sorts German Wikipedia section headings by the number of articles they appear in, as part of this research project.

import numpy as np
import pandas as pd
# Read the headings file in chunks of 100,000 rows to conserve memory;
# explicit narrow dtypes keep the integer columns well below pandas' int64 default.
# https://stackoverflow.com/questions/25962114/how-to-read-a-6-gb-csv-file-with-pandas
tp = pd.read_csv(
    'dewiki_20161101_headings.tsv',
    sep='\t',
    header=0,
    dtype={
        'page_id': np.int32,
        'page_title': object,
        'page_ns': np.int16,
        'heading_level': np.int8,
        'heading_text': object,
    },
    iterator=True,
    chunksize=100000,
)
# Concatenate all chunks into a single DataFrame.
# pd.concat accepts the chunk iterator directly — no need to
# materialize an intermediate list of chunks first.
de_DF = pd.concat(tp)
de_DF.head()
# Remove leading and trailing whitespace from the heading_text column.
# The original used pd.core.strings.str_strip, a private pandas internal that
# was removed in later pandas releases; the public .str accessor is the
# supported, equivalent API.
de_DF['heading_text'] = de_DF['heading_text'].str.strip()
# Denominator for the percentage column — presumably the total number of
# articles in the German Wikipedia 2016-11-01 dump (TODO: confirm against
# the dump statistics).
TOTAL_DE_ARTICLES = 1993198
# Group by heading_text and count the number of distinct page_titles each
# heading appears in, sorted in descending order. nunique(dropna=False) is
# the idiomatic (and faster) equivalent of apply(lambda x: len(x.unique())),
# which also counted NaN as a distinct value. Returns a pandas Series.
article_count = (
    de_DF.groupby('heading_text')['page_title']
         .nunique(dropna=False)
         .sort_values(ascending=False)
)
# Turn the Series into a DataFrame with named columns.
de_article_count_DF = pd.DataFrame({'section_title': article_count.index,
                                    'number_of_articles': article_count.values})
# Add a column for the percentage of all articles that each heading appears in.
de_article_count_DF['article_percentage'] = (
    de_article_count_DF['number_of_articles'] / TOTAL_DE_ARTICLES
) * 100
# Allow pandas to display up to 100 rows at once.
pd.set_option('display.max_rows', 100)
# Round the percentage column to 2 decimal places and show the top 100 headings.
top_headings = de_article_count_DF.round({'article_percentage': 2})
top_headings.head(100)