library(RMySQL)
# Open the MySQL connection used by every query below.
mydb <- dbConnect(
  MySQL(),
  host = "paws-db",
  dbname = "datasets_p",
  user = "cscw",
  password = "cscw"
)
# Output: Loading required package: DBI
# Select the working schema. dbSendQuery() hands back a result handle even for
# a statement that yields no rows; release it immediately so it does not stay
# open on the connection.
invisible(dbClearResult(dbSendQuery(mydb, "use datasets_p;")))

# Load the first 100000 rows of enwiki_sessions_20150801 into a data frame.
results <- dbSendQuery(
  mydb,
  "select * from enwiki_sessions_20150801 limit 100000"
)
data <- dbFetch(results, n = -1)  # n = -1 retrieves every pending row
# All rows are in memory now, so free the result set held for this query.
invisible(dbClearResult(results))

# Ask the notebook front end to render plots as PNG images.
options(jupyter.plot_mimetypes = "image/png")
# Output: <MySQLResult:(355,0,0)>
# Parse the 14-digit YYYYMMDDHHMMSS `end`/`start` stamps into POSIXct.
# The time zone is given explicitly so the result does not depend on the
# machine's local zone: in a zone that observes DST, a stamp inside the
# spring-forward gap may not parse cleanly, and a session spanning a clock
# change would come out an hour off.
# NOTE(review): UTC is assumed for these stamps - confirm against the table.
data$cend <- as.POSIXct(data$end, format = "%Y%m%d%H%M%S", tz = "UTC")
data$cstart <- as.POSIXct(data$start, format = "%Y%m%d%H%M%S", tz = "UTC")

# Session duration, pinned to seconds. Plain `cend - cstart` lets difftime
# choose its units from the data (secs, mins, hours or days), so the numeric
# value used below could change meaning on a different sample.
data$length <- difftime(data$cend, data$cstart, units = "secs")
hist(as.numeric(data$length))

# Durations on a log10 scale. Zero-length sessions map to -Inf, which hist()
# leaves out because it only bins finite values.
data$loglength <- log10(as.numeric(data$length))
hist(as.numeric(data$loglength))
#' Most frequent value of a vector.
#'
#' @param x A vector.
#' @return The element of `x` that occurs most often. When several values are
#'   tied, the one that appears first in `x` is returned.
Mode <- function(x) {
  distinct <- unique(x)
  # Count how many times each distinct value occurs, in first-seen order.
  counts <- tabulate(match(x, distinct))
  distinct[which.max(counts)]
}
# Find the single most common session start stamp and display it.
mode_start <- Mode(data$start)
mode_start

# Drop every session that begins at that modal stamp. Reusing the computed
# value (rather than pasting it in as a literal) keeps this filter correct
# when the table or the row limit changes.
clean_data <- subset(data, start != mode_start)
# Output: '20020225154311'
# Keep only sessions with a strictly positive duration. The filter starts from
# `clean_data` so that the exclusion applied when that frame was built carries
# through; filtering `data` here would silently bring the excluded rows back.
pos_clean_data <- subset(clean_data, length > 0)

# Events per unit of session length (per second when `length` is in seconds).
pos_clean_data$eps <- pos_clean_data$events / as.numeric(pos_clean_data$length)

# Show the sessions that average more than one event per unit of length.
subset(pos_clean_data, eps > 1)
# Output:
#       user_text         start          end            index events cend                cstart              length    loglength eps
# 22926 198.54.202.xxx    20011229191045 20011229191047     2      3 2001-12-29 19:10:47 2001-12-29 19:10:45    2 secs 0.30103   1.5
# 34493 0                 20020225154311 20020225155115     6    718 2002-02-25 15:51:15 2002-02-25 15:43:11  484 secs 2.684845  1.483471
# 34518 Conversion script 20020225132133 20020225155115    50  27846 2002-02-25 15:51:15 2002-02-25 13:21:33 8982 secs 3.953373  3.1002
# 49478 24.188.241.171    20020523131707 20020523131708    10      2 2002-05-23 13:17:08 2002-05-23 13:17:07    1 secs 0         2
# 52138 217.162.214.72    20020607201700 20020607201701    15      2 2002-06-07 20:17:01 2002-06-07 20:17:00    1 secs 0         2
# 54017 AxelBoldt         20020616162553 20020616162554   973      2 2002-06-16 16:25:54 2002-06-16 16:25:53    1 secs 0         2
# 56757 Zoe               20020701003429 20020701003430   392      2 2002-07-01 00:34:30 2002-07-01 00:34:29    1 secs 0         2
# Radius chosen so that each circle's area scales with its event count.
pos_clean_data[["radius"]] <- sqrt(pos_clean_data[["events"]] / pi)

# Ten-step blue-to-red ramp; every session takes the shade of the bin that its
# log(eps) falls into when the observed range is cut into ten intervals.
rbPal <- colorRampPalette(c("blue", "red"))
pos_clean_data[["color"]] <- rbPal(10)[
  cut(log(pos_clean_data[["eps"]]), breaks = 10, labels = FALSE)
]

# Bubble chart: session start on x, log10 duration on y, one circle per
# session, filled with its eps shade and outlined in white.
with(
  pos_clean_data,
  symbols(
    cstart, loglength,
    circles = radius, inches = 0.35,
    fg = "white", bg = color,
    xlab = "Time", ylab = "Log Length of Session"
  )
)
# Keep only the per-session measures needed for aggregation. Columns are
# picked by name rather than by position so the selection stays correct if
# columns are added to, or reordered in, `pos_clean_data`.
subset_clean_data <- pos_clean_data[
  c("user_text", "events", "length", "loglength", "eps")
]

# Show the first row before and after the column selection.
pos_clean_data[1, ]
subset_clean_data[1, ]
# Output:
#   user_text          start          end            index events cend                cstart              length   loglength eps         radius    color
# 1 eiffel.demon.co.uk 20010116200833 20010116201251     0      2 2001-01-16 20:12:51 2001-01-16 20:08:33 258 secs 2.41162   0.007751938 0.7978846 #5500AA
#
#   user_text          events length   loglength eps
# 1 eiffel.demon.co.uk      2 258 secs 2.41162   0.007751938
# Per-user mean of every measure column (columns 2 to 5 of the trimmed frame).
agg_sub_clean_data <- aggregate(
  subset_clean_data[2:5],
  by = list(user_text = subset_clean_data[["user_text"]]),
  FUN = mean
)

# Scatter of each user's mean session length against their mean event count.
plot(
  agg_sub_clean_data$events, agg_sub_clean_data$length,
  xlab = "events", ylab = "length"
)
# Order sessions by user and then by session index. The sort keys must come
# from the frame being reordered: keys taken from a different frame have a
# different number of rows and would pick the wrong (or missing) rows.
sort_clean_data <- pos_clean_data[
  order(pos_clean_data$user_text, pos_clean_data$index),
]

# Seconds between the starts of consecutive sessions. The parsed `cstart`
# times are used because the raw `start` stamps are YYYYMMDDHHMMSS digit
# strings, whose numeric difference is not a duration (two stamps one second
# apart across a minute boundary differ by 41).
sort_clean_data$etime <- c(0, diff(as.numeric(sort_clean_data$cstart)))
# The first retained session of each user has no predecessor, so its gap is
# reset to 0. Rows are grouped by user after the sort, so the first occurrence
# of each user_text marks that row even when the user's index-0 session was
# filtered out earlier.
sort_clean_data$etime[!duplicated(sort_clean_data$user_text)] <- 0

# Keep the per-session measures plus the new gap column, selected by name so
# the result does not depend on column positions.
sub_sort_clean_data <- sort_clean_data[
  c("user_text", "events", "length", "loglength", "eps", "etime")
]
sub_sort_clean_data[1, ]
# Output:
#      user_text    events length    loglength eps         etime
# 7450 66.81.79.xxx      3 1734 secs 3.239049  0.001730104     0
# Per-user standard deviation of every measure column (columns 2 to 6).
# sd() of a single observation is NA, so one-session users come out as NA.
var_sub_sort_clean_data <- aggregate(
  sub_sort_clean_data[2:6],
  by = list(user_text = sub_sort_clean_data[["user_text"]]),
  FUN = sd
)

# Mean events per user against the spread of that user's inter-session gaps.
plot(agg_sub_clean_data$events ~ var_sub_sort_clean_data$etime)

# Inspect the first 100 users.
var_sub_sort_clean_data[seq_len(100), ]
user_texteventslengthloglengtheps
1NANANANA
2(NANANANA
3-- April20.62509151073884119.715352630850.5581248720135630.0220687447502561
4-- SodiumNANANANA
5045.74993742991442929.729341849480.5793906460782910.0964130890925773
61 Lucky Texan8.485281374238574770.849452665640.8577876783581040.00334537882582565
710.0.0.60, 217.135.223.1160581.2417741353420.7897541953030870.0192082542675262
810.152NANANANA
910.165NANANANA
1010.18NANANANA
1110.1941.4142135623731770.039284712150.3371877929946980.0008639037179331
1210.22.201.350.707106781186548405.8792924010780.2401619635755910.00304620748507539
1310.22.201.xxx01861.105048082990.5718740852682840.00246807340424123
1410.229NANANANA
1510.246NANANANA
1610.261.32287565553231098.442242956410.4100250080830050.00277700594911328
1710.583.535533905932742794.485999249240.3311828542346290.000132210194871789
1810.61NANANANA
1910.7NANANANA
2011.1057.079250629387561993.296925091430.5330946900335670.00985994203661602
2111.1742.12132034355964718.4204896855320.5177821038271280.0032869049528697
2211.234NANANANA
2311.246NANANANA
2411.56NANANANA
2511.74NANANANA
2611.940.707106781186548214.2533546995240.04719418458859610.000553343032069497
2712.103.198.26NANANANA
2812.108.192.170NANANANA
2912.109.52.200NANANANA
3012.11.183.2543.414953112694932864.547579170810.616834844357420.00815797960262994
31
7112.224.175.2NANANANA
7212.224.189.70NANANANA
7312.224.52.69NANANANA
7412.225.153.241NANANANA
7512.225.170.29NANANANA
7612.227.149.12NANANANA
7712.227.2.390.7071067811865481931.815726201650.7216834022474630.00689741300081249
7812.228.20.xxxNANANANA
7912.228.61.600603.8691911333120.2808477917806150.0014843007125285
8012.228.96.215NANANANA
8112.229.226.1000.707106781186548298.3990616607230.5202657910948430.0107833244244879
8212.229.234.41NANANANA
8312.229.252.xxxNANANANA
8412.23.133.6NANANANA
8512.230.119.982.828427124746192361.029542381880.4846883137581380.00713791973003588
8612.230.139.45NANANANA
8712.230.209.2050.8164965809277261200.178424235330.6070484829114170.00843817776332748
8812.230.209.xxx1.06904496764971370.398319311370.413430208422840.00270563738928451
8912.230.7.1610.707106781186548217.7888886054570.1967785318494950.0018398652885195
9012.231.15.30NANANANA
9112.231.224.10NANANANA
9212.232.19.38NANANANA
9312.233.245.186NANANANA
9412.233.248.xxxNANANANA
9512.233.57.93NANANANA
9612.233.98.1615.613922784671882092.17046273990.7086103306027010.0237077825085268
9712.234.138.1571.41421356237311634.83087810330.9548710650373640.0119257984828218
9812.234.193.26NANANANA
9912.234.193.99NANANANA
10012.234.196.121NANANANA