# Install the plotting and data-handling packages into the library at ~/R.
install.packages("ggplot2", lib = "~/R")
install.packages("data.table", lib = "~/R")

# Attach the packages from that same library.
library(ggplot2, lib.loc = "~/R")
library(labeling, lib.loc = "~/R")
library(data.table, lib.loc = "~/R")

# A large scipen penalty favours fixed over scientific notation when numbers
# are printed; digits controls how many significant digits are shown.
options(scipen = 100, digits = 4)
# Prepare one group's monthly article-class counts for comparison and plotting.
#
# Args:
#   month_data: data.frame or data.table with one row per snapshot and the
#     columns `timestamp`, `fa_n`, `ga_n`, `b_n`, `c_n`, `start_n`, `stub_n`,
#     plus the weighted sum under either the misspelled name `weighed_sum` or
#     the corrected name `weighted_sum`.
#   group: label written to the new `group` column on every row.
#
# Returns:
#   `month_data` without `timestamp`/`weighed_sum`, and with the added columns
#   `weighted_sum`, `group`, `month` (Date), `avg_weighted_sum` and one
#   `<class>_prop` column per article class.
#
# Stops with an error when a required column is missing.
clean_up_months <- function(month_data, group) {
    # Fix the misspelled column name, but only when it is actually present.
    # Renaming unconditionally would assign NULL to `weighted_sum` and thereby
    # delete a column that is already named correctly.
    if ("weighed_sum" %in% names(month_data)) {
        month_data$weighted_sum <- month_data$weighed_sum
        month_data$weighed_sum <- NULL
    }

    required_columns <- c("timestamp", "weighted_sum", "fa_n", "ga_n", "b_n",
                          "c_n", "start_n", "stub_n")
    missing_columns <- setdiff(required_columns, names(month_data))
    if (length(missing_columns) > 0) {
        stop("month_data is missing required column(s): ",
             paste(missing_columns, collapse = ", "), call. = FALSE)
    }

    # Set group
    month_data$group <- group

    # Add month column and remove timestamp. Numeric timestamps are rendered
    # with sprintf() so the digits never come out in scientific notation,
    # whatever the global `scipen` option happens to be.
    raw_timestamp <- month_data[["timestamp"]]
    timestamp_text <- if (is.numeric(raw_timestamp)) {
        sprintf("%.0f", raw_timestamp)
    } else {
        as.character(raw_timestamp)
    }
    month_data$month <- as.Date(timestamp_text, format = "%Y%m%d000000")
    month_data$timestamp <- NULL

    # Total article count per row; max_n is the largest total over all rows.
    # Every row is scaled by this single value, not by its own row total.
    # NOTE(review): confirm that the overall maximum (rather than each
    # month's own count) is the intended denominator.
    articles_per_row <- month_data[["fa_n"]] + month_data[["ga_n"]] +
        month_data[["b_n"]] + month_data[["c_n"]] +
        month_data[["start_n"]] + month_data[["stub_n"]]
    max_n <- max(articles_per_row)

    month_data$avg_weighted_sum <- (month_data[["weighted_sum"]] + 1) / max_n
    month_data$fa_prop <- month_data[["fa_n"]] / max_n
    month_data$ga_prop <- month_data[["ga_n"]] / max_n
    month_data$b_prop <- month_data[["b_n"]] / max_n
    month_data$c_prop <- month_data[["c_n"]] / max_n
    month_data$start_prop <- month_data[["start_n"]] / max_n
    month_data$stub_prop <- month_data[["stub_n"]] / max_n

    # Return our cleaned up data
    month_data
}
# Download the monthly CSV export for each group from quarry.wmflabs.org and
# run it through clean_up_months(), tagging every row with the group's name.
all_wiki <- clean_up_months(
    data.table(read.csv(url(
        "https://quarry.wmflabs.org/run/193625/output/0/csv?download=true"
    ))),
    "all_wiki"
)
women_writers <- clean_up_months(
    data.table(read.csv(url(
        "https://quarry.wmflabs.org/run/197183/output/0/csv?download=true"
    ))),
    "women_writers"
)

# Pair the two groups up month by month. Columns shared by both tables (other
# than the `month` key) get the suffix of the table they came from.
women_writers_vs_all <- merge(all_wiki, women_writers, by = "month",
                              suffixes = c(".all_wiki", ".women_writers"))

# Positive values mean the women-writers score is above the all-wiki score.
women_writers_vs_all$avg_weighted_sum_diff <- with(women_writers_vs_all,
    avg_weighted_sum.women_writers - avg_weighted_sum.all_wiki)

# Label each month by the sign of the difference. ifelse() is vectorised, so
# the result is always a character vector of the right length (an NA
# difference yields NA instead of aborting the script).
women_writers_vs_all$direction <- ifelse(
    women_writers_vs_all$avg_weighted_sum_diff > 0, "surplus", "gap")
# Bar chart of the monthly difference in average weighted sum between the
# women-writers group and all of the wiki, coloured by direction.
# The data source is `women_writers_vs_all`, the merged table that carries
# the `avg_weighted_sum_diff` and `direction` columns.
plot <- ggplot(women_writers_vs_all,
        aes(x = month, y = avg_weighted_sum_diff, fill = direction)) +
    theme_bw() +
    theme(legend.position = "none") +
    geom_bar(stat = "identity") +
    scale_y_continuous("Avg. weighted sum difference",
                       limits = c(-0.14, 0.45)) +
    # One tick every second year, labelled with the year only.
    scale_x_date(breaks = as.Date(paste0(seq(2002, 2017, 2), "-01-01")),
                 labels = as.character(seq(2002, 2017, 2)))

# Show the plot on the current device, then write the same plot to an SVG.
print(plot)
svg("avg_weighted_sum_difference.women_writers.svg", height = 5, width = 7)
print(plot)
dev.off()
# Reshape the per-class proportion columns into long form.
#
# Args:
#   month_data: table with `month`, `group` and the columns `fa_prop`,
#     `ga_prop`, `b_prop`, `c_prop`, `start_prop`, `stub_prop`.
#
# Returns:
#   A table with columns `month`, `group`, `prop` and `class`, holding one
#   copy of the input rows per article class (FA rows first, Stub rows last).
#   `class` is an ordered factor running Stub < Start < C < B < GA < FA.
normalize_class_props <- function(month_data) {
    # Class label -> name of the column holding that class's proportion.
    prop_column_of <- c(FA = "fa_prop", GA = "ga_prop", B = "b_prop",
                        C = "c_prop", Start = "start_prop", Stub = "stub_prop")

    per_class <- lapply(names(prop_column_of), function(class_label) {
        data.table(month = month_data[["month"]],
                   group = month_data[["group"]],
                   prop = month_data[[prop_column_of[[class_label]]]],
                   class = class_label)
    })

    stacked <- do.call(rbind, per_class)
    stacked$class <- ordered(stacked$class,
                             levels = c("Stub", "Start", "C", "B", "GA", "FA"))
    stacked
}
# Line chart of the GA and FA proportions over time, one panel per class in a
# single row, with one line per group.
plot <- ggplot(
    rbind(normalize_class_props(women_writers),
          normalize_class_props(all_wiki))[class %in% c("GA", "FA"), ],
    aes(x = month, y = prop, color = group, linetype = group)) +
    theme_bw() +
    facet_wrap(~class, nrow = 1) +
    geom_line()

# Show the plot on the current device, then write the same plot to an SVG.
print(plot)
svg("article_class_prop.women_writers.ga_fa.svg", height = 5, width = 7)
print(plot)
dev.off()
# Line chart of the Stub, Start, C and B proportions over time, one panel per
# class in a single row, with one line per group.
plot <- ggplot(
    rbind(normalize_class_props(women_writers),
          normalize_class_props(all_wiki))[
              class %in% c("Stub", "Start", "C", "B"), ],
    aes(x = month, y = prop, color = group, linetype = group)) +
    theme_bw() +
    facet_wrap(~class, nrow = 1) +
    geom_line()

# Show the plot on the current device, then write the same plot to an SVG.
print(plot)
svg("article_class_prop.women_writers.stub_start_c_b.svg", height = 5, width = 7)
print(plot)
dev.off()