setwd("/Users/lotze/workspace/sfjourney_analysis") # install.packages(c('RColorBrewer','rgdal','rjags','zipcode','segmented','boot','earth')) require('earth') require('segmented') registrations = read.csv("cleaned_registrations.csv", stringsAsFactors=FALSE) registrations = registrations[order(registrations$signup_timestamp), ] ##### Section 2 ##### # add: total registrations + attendees # analysis 1: arrival curve of registrations registrations$num_reg = (1:nrow(registrations)) # add: labels for timestamps timeplot <- function(y_column, df=registrations, num_marks=5, format="%m/%d", ...) { at_ts = pretty(df$signup_timestamp, n=num_marks) at_labels = strftime(as.POSIXct(at_ts, origin = "1970-01-01"), format=format) plot(df[, "signup_timestamp"], df[, y_column], xlab="Date", xaxt="n", ...) axis(1, at=at_ts, labels=at_labels) } # should probably generalize a little to a function like this tsplot <- function(timestamps, yvalues, num_marks=5, format="%m/%d", ...) { at_ts = pretty(timestamps, n=num_marks) at_labels = strftime(as.POSIXct(at_ts, origin = "1970-01-01"), format=format) plot(timestamps, yvalues, xlab="Date", xaxt="n", ...) axis(1, at=at_ts, labels=at_labels) } # how many people signed up over time? svg("signups.svg", width=8, height=6) timeplot("num_reg", type="l", ylab="Total registered", main="Cumulative Registrations Over Time") dev.off() # huge surge on 10/10, when we emailed the list first # starts picking up more and more as the date approaches, especially after 11/3 # (11/9 was Journey) # daily pattern during the hour -- slowdown 10PM - 9AM last_few_hours = registrations[registrations$signup_timestamp > 1383462326, ] svg("signups_hourly.svg", width=8, height=6) timeplot("num_reg", df=last_few_hours, format="%d %H:00", type="l", ylab="Total registered", num_marks=20) dev.off() # probably segmenting is appropriate segmented_model <- segmented(glm(num_reg ~ signup_timestamp, data=registrations), seg.Z= ~ signup_timestamp, psi=c(as.numeric(as.POSIXct('2013-10-10')), as.numeric(as.POSIXct('2013-11-3'))) ) timeplot("num_reg", type="l", ylab="Total registered") points(registrations$signup_timestamp, predict(segmented_model, newdata=registrations), type="l", col=2) summary(segmented_model) # maybe MARS? multi-adaptive regresison splines? # Ha! Yes, comes up with basically the same segmentation -- and points out the increase two weeks before! svg("mars_segments.svg", width=8, height=6) mars = earth(num_reg ~ signup_timestamp, data=registrations) timeplot("num_reg", type="l", ylab="Total registered") points(registrations$signup_timestamp, predict(mars, newdata=registrations), type="l", col=2, lwd=2) dev.off() # part 2: simulation of expected numbers; confidence intervals? 
# part 2: simulation of expected numbers; confidence intervals?

# predicting ultimate signups, given data so far
# use Holt-Winters smoothing and projection with prediction intervals; dangerous
# because extrapolating the trend component far ahead can run away
registrations$signup_date = strftime(as.POSIXct(registrations$signup_timestamp, origin="1970-01-01"),
                                     format="%m/%d")

# daily bins: truncate the first and last timestamps to day boundaries
min_day = as.numeric(strptime(strftime(as.POSIXct(min(registrations$signup_timestamp), origin="1970-01-01"),
                                       format="%Y-%m-%d"), format="%Y-%m-%d"))
max_day = as.numeric(strptime(strftime(as.POSIXct(max(registrations$signup_timestamp), origin="1970-01-01"),
                                       format="%Y-%m-%d"), format="%Y-%m-%d"))
one_day <- 86400
all_days = seq(min_day, max_day, one_day)
my_empty = rep(NA, length(all_days))
daily_predictions <- data.frame(day = strftime(as.POSIXct(all_days, origin="1970-01-01"), "%m/%d"),
                                actual=my_empty, predicted_total=my_empty,
                                lower=my_empty, upper=my_empty)
for (day_ts in seq(min_day + 3*one_day, max_day, one_day)) {
  days = seq(min_day, day_ts, one_day)
  by_day = as.numeric(table(cut(registrations$signup_timestamp[registrations$signup_timestamp <= day_ts],
                                breaks=days)))
  if ((day_ts - min_day)/one_day > 14) {
    # at least two full weeks of data: fit a weekly additive season
    by_day_ts = ts(by_day, frequency=7)
    hw <- HoltWinters(by_day_ts, seasonal="additive")
  } else {
    # not enough data for a seasonal fit: level and trend only
    by_day_ts = ts(by_day, frequency=1)
    hw <- HoltWinters(by_day_ts, gamma=FALSE)
  }
  index = round((day_ts - min_day) / one_day)
  if (max_day - day_ts > one_day/2) {
    predictions = predict(hw, (max_day - day_ts)/one_day, prediction.interval=TRUE)
    # clamp the fit and lower bound at zero; negative daily signups are impossible
    fit = pmax(c(by_day, predictions[,"fit"]), 0)
    upper = c(by_day, predictions[,"upr"])
    lower = pmax(c(by_day, predictions[,"lwr"]), 0)
  } else {
    fit = by_day
    upper = by_day
    lower = by_day
  }
  daily_predictions[index, c("actual", "predicted_total", "lower", "upper")] =
    c(sum(by_day), sum(fit), sum(lower), sum(upper))
}
# Hm; exponential smoothing is not great at predicting...at least on a daily level...
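# How does the running Holt-Winters forecast of the eventual total evolve?
# A quick visual check (a sketch, assuming the loop above has populated
# daily_predictions; the first and last rows stay NA by construction).
complete = !is.na(daily_predictions$predicted_total)
tsplot(all_days[complete], daily_predictions$predicted_total[complete],
       type="l", col=3, ylab="Predicted total registrations")
abline(h=nrow(registrations), lty=2)  # the actual final count, for reference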
# try hourly, on a 24-hour "season"
min_hour = as.numeric(strptime(strftime(as.POSIXct(min(registrations$signup_timestamp), origin="1970-01-01"),
                                        format="%Y-%m-%d %H"), format="%Y-%m-%d %H"))
max_hour = as.numeric(strptime(strftime(as.POSIXct(max(registrations$signup_timestamp), origin="1970-01-01"),
                                        format="%Y-%m-%d %H"), format="%Y-%m-%d %H"))
one_hour <- 3600
all_hours = seq(min_hour, max_hour, one_hour)
my_empty = rep(NA, length(all_hours))
hourly_predictions <- data.frame(hour = strftime(as.POSIXct(all_hours, origin="1970-01-01"), "%m/%d %H:00"),
                                 actual=my_empty, predicted_total=my_empty,
                                 lower=my_empty, upper=my_empty)
for (hour_ts in seq(min_hour + 3*one_hour, max_hour, one_hour)) {
  hours = seq(min_hour, hour_ts, one_hour)
  by_hour = as.numeric(table(cut(registrations$signup_timestamp[registrations$signup_timestamp <= hour_ts],
                                 breaks=hours)))
  if ((hour_ts - min_hour)/one_hour > 48) {
    # at least two full days of data: fit a daily (24-hour) additive season
    by_hour_ts = ts(by_hour, frequency=24)
    hw <- HoltWinters(by_hour_ts, seasonal="additive")
  } else {
    by_hour_ts = ts(by_hour, frequency=1)
    hw <- HoltWinters(by_hour_ts, gamma=FALSE)
  }
  index = round((hour_ts - min_hour) / one_hour)
  if (max_hour - hour_ts > one_hour/2) {
    predictions = predict(hw, (max_hour - hour_ts)/one_hour, prediction.interval=TRUE)
    fit = pmax(c(by_hour, predictions[,"fit"]), 0)
    upper = c(by_hour, predictions[,"upr"])
    lower = pmax(c(by_hour, predictions[,"lwr"]), 0)
  } else {
    fit = by_hour
    upper = by_hour
    lower = by_hour
  }
  hourly_predictions[index, c("actual", "predicted_total", "lower", "upper")] =
    c(sum(by_hour), sum(fit), sum(lower), sum(upper))
}
# that's still pretty terrible
# really we should encode some more knowledge in here -- specifically, the pattern
# of increasing signups in the run-up to the event

# get an "expected rate changes" model based on our overall regression -- we'd need
# more example events to see what variance we can expect
date_breaks = c(as.numeric(as.POSIXct('2013-10-10')),
                as.numeric(as.POSIXct('2013-10-11')),
                as.numeric(as.POSIXct('2013-10-28')),
                as.numeric(as.POSIXct('2013-11-03')))
segmented_model <- segmented(glm(num_reg ~ signup_timestamp, data=registrations),
                             seg.Z = ~ signup_timestamp,
                             psi = date_breaks)
coefficients = coef(summary(segmented_model))
# the slope after each breakpoint is the base slope plus the cumulative U terms
expected_rates = coefficients["signup_timestamp", "Estimate"] +
  cumsum(coefficients[sprintf("U%s.signup_timestamp", 1:4), "Estimate"])
# we will really just use the rate changes at two weeks and one week before the
# event, so drop the change right after the first email blast
expected_rate_changes = diff(expected_rates)[-1]
expected_rate_change_timepoints = date_breaks[-2:-1] + 86400
timeplot("num_reg", type="l", ylab="Total registered")
points(registrations$signup_timestamp, predict(segmented_model, newdata=registrations),
       type="l", col=2)
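# The fitted rates are per second, which is hard to read; a quick conversion
# (a sketch) to registrations per day, alongside the change dates we kept:
round(expected_rates * 86400, 1)
strftime(as.POSIXct(expected_rate_change_timepoints, origin="1970-01-01"), format="%m/%d")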
# Okay -- let's say we had a 3-part segmented model (after initial registration)
# and re-estimated the rates each day; how might our predictions have changed over time?
one_day <- 86400
min_hour = date_breaks[2] + 2*one_day
last_reg <- max(registrations$signup_timestamp)
max_hour = as.numeric(strptime(strftime(as.POSIXct(last_reg, origin="1970-01-01"),
                                        format="%Y-%m-%d"), format="%Y-%m-%d"))
all_hours = seq(min_hour, max_hour, one_day)
my_empty = rep(NA, length(all_hours))
daily_predictions <- data.frame(ts = all_hours,
                                hour = strftime(as.POSIXct(all_hours, origin="1970-01-01"), "%m/%d"),
                                actual=my_empty, predicted_total=my_empty,
                                lower=my_empty, upper=my_empty)

# Project total registrations at max_hour from the data available at hour_ts.
# Probably a better way to do this, but break into three cases: more than two
# weeks before the event; one-two weeks; within the final week. Relies on the
# globals training_data, latest_rate, upper_rate, lower_rate,
# expected_rate_changes, and expected_rate_change_timepoints set in the
# backtest loop below.
# NOTE: stage_index is currently unused; the stage is inferred from hour_ts.
segmented_prediction <- function(hour_ts, max_hour, stage_index=NULL) {
  current_total = training_data$num_reg[nrow(training_data)]
  # the same piecewise projection applies to the point estimate and both bounds,
  # just with different rates, so factor it out
  project <- function(rate) {
    if (hour_ts <= expected_rate_change_timepoints[1]) {
      # more than two weeks out: up to three rate regimes may still lie ahead
      current_total +
        (min(max_hour, expected_rate_change_timepoints[1]) - hour_ts) * rate +
        I(max_hour >= expected_rate_change_timepoints[1]) *
          (min(max_hour, expected_rate_change_timepoints[2]) - expected_rate_change_timepoints[1]) *
          max(rate + expected_rate_changes[1], 0) +
        I(max_hour >= expected_rate_change_timepoints[2]) *
          (max_hour - expected_rate_change_timepoints[2]) *
          max(rate + sum(expected_rate_changes), 0)
    } else if (hour_ts <= expected_rate_change_timepoints[2]) {
      # between two weeks and one week out: one rate change remains
      current_total +
        (min(max_hour, expected_rate_change_timepoints[2]) - hour_ts) * rate +
        I(max_hour >= expected_rate_change_timepoints[2]) *
          (max_hour - expected_rate_change_timepoints[2]) *
          max(rate + expected_rate_changes[2], 0)
    } else {
      # within the final week: project the current rate straight out
      current_total + (max_hour - hour_ts) * rate
    }
  }
  return(c(project(latest_rate), project(lower_rate), project(upper_rate)))
}

for (hour_index in seq(1, length(all_hours))) {
  hour_ts = all_hours[hour_index]
  training_data <- subset(registrations, signup_timestamp < hour_ts)
  culled_date_breaks = date_breaks[date_breaks + one_day < hour_ts]
  segmented_model <- segmented(glm(num_reg ~ signup_timestamp, data=training_data),
                               seg.Z = ~ signup_timestamp,
                               psi = culled_date_breaks)
  # drop the latest breakpoint while any break term is non-significant, and refit
  coefficients <- coef(summary(segmented_model))
  coefficient_rows <- grepl("^U", rownames(coefficients))
  while (sum(coefficients[coefficient_rows, "Pr(>|t|)"] >= 0.01) > 0) {
    culled_date_breaks <- culled_date_breaks[-length(culled_date_breaks)]
    segmented_model <- segmented(glm(num_reg ~ signup_timestamp, data=training_data),
                                 seg.Z = ~ signup_timestamp,
                                 psi = culled_date_breaks)
    coefficients <- coef(summary(segmented_model))
    coefficient_rows <- grepl("^U", rownames(coefficients))
  }

  # extract the current rate: the base slope plus all break adjustments
  coefficients = coef(summary(segmented_model))
  breaks = summary(segmented_model)$psi
  latest_rate = sum(coefficients[2:nrow(coefficients), "Estimate"])
  # crude standard error for the combined rate: sum the slope-term standard errors
  # (this overstates the error relative to properly combining the variances)
  latest_rate_std = sum(coefficients[coefficients[,"Estimate"] != 0, "Std. Error"]) -
    coefficients["(Intercept)", "Std. Error"]
  upper_rate = max(latest_rate + 1.96 * latest_rate_std, 0)
  lower_rate = max(latest_rate - 1.96 * latest_rate_std, 0)
  latest_rate = max(latest_rate, 0)

  # which "stage" are we in? long slow growth (1), two weeks prior (2), or the
  # preceding week (3), based on the number of segments in the segmented model
  stage_index = sum(grepl("^U", rownames(coef(summary(segmented_model))))) - 1

  predicted = segmented_prediction(hour_ts, last_reg, stage_index)
  # print(sprintf("At index %d, time %d (vs %d), rate %f", hour_index, hour_ts, expected_rate_change_timepoints[1], latest_rate))
  # print(coefficients)
  # print(predicted)
  # future_indices = seq(hour_index, length(all_hours), by=1)
  # future = t(sapply(future_indices, function(i) {
  #   segmented_prediction(hour_ts, all_hours[i])
  # }))
  # points(all_hours[future_indices], future[,1], type="l", col=hour_index)
  daily_predictions[hour_index, c("actual", "predicted_total", "lower", "upper")] =
    c(nrow(training_data), predicted)
}

svg("predicted_registrations.svg", width=8, height=6)
timeplot("num_reg", type="l", ylab="Total registered",
         ylim=c(0, max(daily_predictions$upper)),
         main="Predicted Registrations Over Time")
points(registrations$signup_timestamp, predict(segmented_model, newdata=registrations),
       type="l", col=2)
points(daily_predictions$ts, daily_predictions$predicted_total, type="l", col=3)
points(daily_predictions$ts, daily_predictions$lower, type="l", col=3, lty=2)
points(daily_predictions$ts, daily_predictions$upper, type="l", col=3, lty=2)
legend("left", legend=c("running forecast", "post-hoc segmented model", "actual"),
       fill=c(3, 2, 1))
dev.off()
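# How far off was the running forecast, day by day? A rough post-hoc check
# (a sketch): compare each day's predicted eventual total to the true total.
final_total = nrow(registrations)
summary(daily_predictions$predicted_total - final_total)
# and how often the true total fell inside the running prediction interval
mean(daily_predictions$lower <= final_total & final_total <= daily_predictions$upper)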