install.packages("ggplot2", "~/R")
install.packages("data.table", "~/R")
library(ggplot2, lib.loc="~/R")
library(labeling, lib.loc="~/R")
library(data.table, lib.loc="~/R")
# Session-wide number formatting: the large `scipen` penalty makes R prefer
# fixed over scientific notation, and `digits` asks for 4 significant digits
# when numbers are printed.
options(scipen = 100, digits = 4)
# Clean up one group's monthly article-class counts and derive per-month
# statistics from them.
#
# Args:
#   month_data: data.frame or data.table with one row per month and the
#     columns `timestamp`, `weighed_sum` (sic), `fa_n`, `ga_n`, `b_n`, `c_n`,
#     `start_n` and `stub_n`.
#   group: label written to the new `group` column of every row.
#
# Returns:
#   `month_data` with `weighed_sum` renamed to `weighted_sum`, `timestamp`
#   replaced by a Date column `month`, and the added columns `group`,
#   `avg_weighted_sum` and one `<class>_prop` column per class count.
clean_up_months <- function(month_data, group) {
  # Fix column name
  month_data$weighted_sum <- month_data$weighed_sum
  month_data$weighed_sum <- NULL

  # Set group
  month_data$group <- group

  # Add month column and remove timestamp.
  # Plain numeric timestamps are rendered digit-for-digit: as.character() can
  # switch a 14-digit value to scientific notation (depending on the R version
  # and the global `scipen` option), which would not match the format below
  # and would silently turn every month into NA.
  stamp <- month_data$timestamp
  stamp_text <- if (is.numeric(stamp) && !is.object(stamp)) {
    sprintf("%.0f", stamp)
  } else {
    as.character(stamp)
  }
  month_data$month <- as.Date(stamp_text, format = "%Y%m%d000000")
  month_data$timestamp <- NULL

  # Largest total article count seen in any single month; used as the common
  # denominator for every month below.
  # NOTE(review): dividing by the peak total (not each month's own total) and
  # the `+ 1` offset are kept exactly as in the original -- confirm intended.
  max_n <- with(month_data, max(fa_n + ga_n + b_n + c_n + start_n + stub_n))
  month_data$avg_weighted_sum <- (month_data$weighted_sum + 1) / max_n
  month_data$fa_prop <- month_data$fa_n / max_n
  month_data$ga_prop <- month_data$ga_n / max_n
  month_data$b_prop <- month_data$b_n / max_n
  month_data$c_prop <- month_data$c_n / max_n
  month_data$start_prop <- month_data$start_n / max_n
  month_data$stub_prop <- month_data$stub_n / max_n

  # Return our cleaned up data
  month_data
}
# Download the monthly class counts for each group from its Quarry CSV export
# and run them through clean_up_months(), tagging rows with the group name.
all_wiki <- clean_up_months(
  data.table(read.csv(url(
    "https://quarry.wmflabs.org/run/193625/output/0/csv?download=true"))),
  "all_wiki")
women_writers <- clean_up_months(
  data.table(read.csv(url(
    "https://quarry.wmflabs.org/run/197183/output/0/csv?download=true"))),
  "women_writers")

# Pair the two groups up month by month; columns present in both tables get a
# suffix naming the group they came from.
women_writers_vs_all <- merge(all_wiki, women_writers, by = "month",
                              suffixes = c(".all_wiki", ".women_writers"))

# Positive values mean the women writers group scores above all of the wiki.
women_writers_vs_all$avg_weighted_sum_diff <- with(
  women_writers_vs_all,
  avg_weighted_sum.women_writers - avg_weighted_sum.all_wiki)

# Vectorized labelling: unlike sapply() with a scalar `if`, this yields a
# zero-length vector for an empty table and NA (not an error) for a missing
# difference.
women_writers_vs_all$direction <- ifelse(
  women_writers_vs_all$avg_weighted_sum_diff > 0, "surplus", "gap")
# Bar chart of the monthly difference in average weighted sum between the two
# groups, with bars filled by `direction` and the legend hidden.
# The data source is the merged table built above; the previous name
# `merged_months` is not defined anywhere in this script.
plot <- ggplot(women_writers_vs_all,
               aes(x = month, y = avg_weighted_sum_diff, fill = direction)) +
  theme_bw() +
  theme(legend.position = "none") +
  geom_bar(stat = "identity") +
  scale_y_continuous("Avg. weighted sum difference", limits = c(-0.14, 0.45)) +
  # One tick every second year, labelled with the year only.
  scale_x_date(breaks = as.Date(paste0(seq(2002, 2017, 2), "-01-01")),
               labels = as.character(seq(2002, 2017, 2)))

# Show the figure on the active device, then write the same figure to SVG.
print(plot)
svg("avg_weighted_sum_difference.women_writers.svg", height = 5, width = 7)
print(plot)
dev.off()
# Reshape the per-class proportion columns of a cleaned month table into long
# format.
#
# Args:
#   month_data: table with the columns `month`, `group`, `fa_prop`, `ga_prop`,
#     `b_prop`, `c_prop`, `start_prop` and `stub_prop`.
#
# Returns:
#   A data.table with columns `month`, `group`, `prop` and `class`, holding
#   one copy of the input rows per class (stacked in the order FA, GA, B, C,
#   Start, Stub). `class` is an ordered factor running from "Stub" up to "FA".
normalize_class_props <- function(month_data) {
  # Class label -> name of the column holding that class's proportion,
  # listed in the order the per-class tables are stacked.
  prop_columns <- c(FA = "fa_prop", GA = "ga_prop", B = "b_prop",
                    C = "c_prop", Start = "start_prop", Stub = "stub_prop")

  per_class <- lapply(names(prop_columns), function(label) {
    data.table(month = month_data$month,
               group = month_data$group,
               prop = month_data[[prop_columns[[label]]]],
               class = label)
  })
  long_props <- do.call(rbind, per_class)

  long_props$class <- ordered(
    long_props$class,
    levels = c("Stub", "Start", "C", "B", "GA", "FA"))
  long_props
}
# Long-format class proportions for both groups, built once and shared by the
# two figures below instead of being recomputed for each of them.
class_props <- rbind(normalize_class_props(women_writers),
                     normalize_class_props(all_wiki))

# Draw the proportion-over-time lines for the given classes (one facet per
# class, one line per group), show the figure on the active device and write
# it to an SVG file.
#
# Args:
#   props: long-format table with columns `month`, `group`, `prop`, `class`.
#   classes: character vector of class labels to include.
#   svg_file: path of the SVG file to write.
#
# Returns:
#   The ggplot object, invisibly.
plot_class_props <- function(props, classes, svg_file) {
  class_plot <- ggplot(
    props[class %in% classes, ],
    aes(x = month, y = prop, color = group, linetype = group)) +
    theme_bw() +
    facet_wrap(~class, nrow = 1) +
    geom_line()

  print(class_plot)

  svg(svg_file, height = 5, width = 7)
  # Close the SVG device even if rendering fails, so no device is left open.
  on.exit(dev.off(), add = TRUE)
  print(class_plot)

  invisible(class_plot)
}

# Higher-quality classes.
plot <- plot_class_props(
  class_props, c("GA", "FA"),
  "article_class_prop.women_writers.ga_fa.svg")

# Lower-quality classes.
plot <- plot_class_props(
  class_props, c("Stub", "Start", "C", "B"),
  "article_class_prop.women_writers.stub_start_c_b.svg")