setwd("/Users/lotze/workspace/sfjourney_analysis") # install.packages(c('RColorBrewer','rgdal','rjags','zipcode','segmented','boot','earth')) require('earth') require('segmented') registrations = read.csv("cleaned_registrations.csv", stringsAsFactors=FALSE) registrations = registrations[order(registrations$signup_timestamp), ] ##### Section 2 ##### # add: total registrations + attendees # analysis 1: arrival curve of registrations registrations$num_reg = (1:nrow(registrations)) # add: labels for timestamps timeplot <- function(y_column, df=registrations, num_marks=5, format="%m/%d", ...) { at_ts = pretty(df$signup_timestamp, n=num_marks) at_labels = strftime(as.POSIXct(at_ts, origin = "1970-01-01"), format=format) plot(df[, "signup_timestamp"], df[, y_column], xlab="Date", xaxt="n", ...) axis(1, at=at_ts, labels=at_labels) } # should probably generalize a little to a function like this tsplot <- function(timestamps, yvalues, num_marks=5, format="%m/%d", ...) { at_ts = pretty(timestamps, n=num_marks) at_labels = strftime(as.POSIXct(at_ts, origin = "1970-01-01"), format=format) plot(timestamps, yvalues, xlab="Date", xaxt="n", ...) axis(1, at=at_ts, labels=at_labels) } # how many people signed up over time? svg("signups.svg", width=8, height=6) timeplot("num_reg", type="l", ylab="Total registered", main="Cumulative Registrations Over Time") dev.off() # huge surge on 10/10, when we emailed the list first # starts picking up more and more as the date approaches, especially after 11/3 # (11/9 was Journey) # daily pattern during the hour -- slowdown 10PM - 9AM last_few_hours = registrations[registrations$signup_timestamp > 1383462326, ] svg("signups_hourly.svg", width=8, height=6) timeplot("num_reg", df=last_few_hours, format="%d %H:00", type="l", ylab="Total registered", num_marks=20) dev.off() # probably segmenting is appropriate segmented_model <- segmented(glm(num_reg ~ signup_timestamp, data=registrations), seg.Z= ~ signup_timestamp, psi=c(as.numeric(as.POSIXct('2013-10-10')), as.numeric(as.POSIXct('2013-11-3'))) ) timeplot("num_reg", type="l", ylab="Total registered") points(registrations$signup_timestamp, predict(segmented_model, newdata=registrations), type="l", col=2) summary(segmented_model) # maybe MARS? multi-adaptive regresison splines? # Ha! Yes, comes up with basically the same segmentation -- and points out the increase two weeks before! svg("mars_segments.svg", width=8, height=6) mars = earth(num_reg ~ signup_timestamp, data=registrations) timeplot("num_reg", type="l", ylab="Total registered") points(registrations$signup_timestamp, predict(mars, newdata=registrations), type="l", col=2, lwd=2) dev.off() # part 2: simulation of expected numbers; confidence intervals? 
# part 2: simulation of expected numbers; confidence intervals?

# predicting ultimate signups, given data so far
# use Holt-Winters smoothing and projection with prediction intervals; dangerous
# because extrapolating the trend component far ahead can run away
registrations$signup_date = strftime(as.POSIXct(registrations$signup_timestamp, origin="1970-01-01"),
                                     format="%m/%d")

# daily bins: truncate the first and last timestamps to day boundaries
min_day = as.numeric(strptime(strftime(as.POSIXct(min(registrations$signup_timestamp), origin="1970-01-01"),
                                       format="%Y-%m-%d"), format="%Y-%m-%d"))
max_day = as.numeric(strptime(strftime(as.POSIXct(max(registrations$signup_timestamp), origin="1970-01-01"),
                                       format="%Y-%m-%d"), format="%Y-%m-%d"))
one_day <- 86400
all_days = seq(min_day, max_day, one_day)
my_empty = rep(NA, length(all_days))
daily_predictions <- data.frame(day = strftime(as.POSIXct(all_days, origin="1970-01-01"), "%m/%d"),
                                actual=my_empty, predicted_total=my_empty,
                                lower=my_empty, upper=my_empty)
for (day_ts in seq(min_day + 3*one_day, max_day, one_day)) {
  days = seq(min_day, day_ts, one_day)
  by_day = as.numeric(table(cut(registrations$signup_timestamp[registrations$signup_timestamp <= day_ts],
                                breaks=days)))
  if ((day_ts - min_day)/one_day > 14) {
    # at least two full weeks of data: fit a weekly additive season
    by_day_ts = ts(by_day, frequency=7)
    hw <- HoltWinters(by_day_ts, seasonal="additive")
  } else {
    # not enough data for a seasonal fit: level and trend only
    by_day_ts = ts(by_day, frequency=1)
    hw <- HoltWinters(by_day_ts, gamma=FALSE)
  }
  index = round((day_ts - min_day) / one_day)
  if (max_day - day_ts > one_day/2) {
    predictions = predict(hw, (max_day - day_ts)/one_day, prediction.interval=TRUE)
    # clamp the fit and lower bound at zero; negative daily signups are impossible
    fit = pmax(c(by_day, predictions[,"fit"]), 0)
    upper = c(by_day, predictions[,"upr"])
    lower = pmax(c(by_day, predictions[,"lwr"]), 0)
  } else {
    fit = by_day
    upper = by_day
    lower = by_day
  }
  daily_predictions[index, c("actual", "predicted_total", "lower", "upper")] =
    c(sum(by_day), sum(fit), sum(lower), sum(upper))
}
# Hm; exponential smoothing is not great at predicting...at least on a daily level...
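# How does the running Holt-Winters forecast of the eventual total evolve?
# A quick visual check (a sketch, assuming the loop above has populated
# daily_predictions; the first and last rows stay NA by construction).
complete = !is.na(daily_predictions$predicted_total)
tsplot(all_days[complete], daily_predictions$predicted_total[complete],
       type="l", col=3, ylab="Predicted total registrations")
abline(h=nrow(registrations), lty=2)  # the actual final count, for reference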
# try hourly, on a 24-hour "season"
min_hour = as.numeric(strptime(strftime(as.POSIXct(min(registrations$signup_timestamp), origin="1970-01-01"),
                                        format="%Y-%m-%d %H"), format="%Y-%m-%d %H"))
max_hour = as.numeric(strptime(strftime(as.POSIXct(max(registrations$signup_timestamp), origin="1970-01-01"),
                                        format="%Y-%m-%d %H"), format="%Y-%m-%d %H"))
one_hour <- 3600
all_hours = seq(min_hour, max_hour, one_hour)
my_empty = rep(NA, length(all_hours))
hourly_predictions <- data.frame(hour = strftime(as.POSIXct(all_hours, origin="1970-01-01"), "%m/%d %H:00"),
                                 actual=my_empty, predicted_total=my_empty,
                                 lower=my_empty, upper=my_empty)
for (hour_ts in seq(min_hour + 3*one_hour, max_hour, one_hour)) {
  hours = seq(min_hour, hour_ts, one_hour)
  by_hour = as.numeric(table(cut(registrations$signup_timestamp[registrations$signup_timestamp <= hour_ts],
                                 breaks=hours)))
  if ((hour_ts - min_hour)/one_hour > 48) {
    # at least two full days of data: fit a daily (24-hour) additive season
    by_hour_ts = ts(by_hour, frequency=24)
    hw <- HoltWinters(by_hour_ts, seasonal="additive")
  } else {
    by_hour_ts = ts(by_hour, frequency=1)
    hw <- HoltWinters(by_hour_ts, gamma=FALSE)
  }
  index = round((hour_ts - min_hour) / one_hour)
  if (max_hour - hour_ts > one_hour/2) {
    predictions = predict(hw, (max_hour - hour_ts)/one_hour, prediction.interval=TRUE)
    fit = pmax(c(by_hour, predictions[,"fit"]), 0)
    upper = c(by_hour, predictions[,"upr"])
    lower = pmax(c(by_hour, predictions[,"lwr"]), 0)
  } else {
    fit = by_hour
    upper = by_hour
    lower = by_hour
  }
  hourly_predictions[index, c("actual", "predicted_total", "lower", "upper")] =
    c(sum(by_hour), sum(fit), sum(lower), sum(upper))
}
# that's still pretty terrible
# really we should encode some more knowledge in here -- specifically, the pattern
# of increasing signups in the run-up to the event

# get an "expected rate changes" model based on our overall regression -- we'd need
# more example events to see what variance we can expect
date_breaks = c(as.numeric(as.POSIXct('2013-10-10')),
                as.numeric(as.POSIXct('2013-10-11')),
                as.numeric(as.POSIXct('2013-10-28')),
                as.numeric(as.POSIXct('2013-11-03')))
segmented_model <- segmented(glm(num_reg ~ signup_timestamp, data=registrations),
                             seg.Z = ~ signup_timestamp,
                             psi = date_breaks)
coefficients = coef(summary(segmented_model))
# the slope after each breakpoint is the base slope plus the cumulative U terms
expected_rates = coefficients["signup_timestamp", "Estimate"] +
  cumsum(coefficients[sprintf("U%s.signup_timestamp", 1:4), "Estimate"])
# we will really just use the rate changes at two weeks and one week before the
# event, so drop the change right after the first email blast
expected_rate_changes = diff(expected_rates)[-1]
expected_rate_change_timepoints = date_breaks[-2:-1] + 86400
timeplot("num_reg", type="l", ylab="Total registered")
points(registrations$signup_timestamp, predict(segmented_model, newdata=registrations),
       type="l", col=2)
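# The fitted rates are per second, which is hard to read; a quick conversion
# (a sketch) to registrations per day, alongside the change dates we kept:
round(expected_rates * 86400, 1)
strftime(as.POSIXct(expected_rate_change_timepoints, origin="1970-01-01"), format="%m/%d")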
# Okay -- let's say we had a 3-part segmented model (after initial registration)
# and re-estimated the rates each day; how might our predictions have changed over time?
one_day <- 86400
min_hour = date_breaks[2] + 2*one_day
last_reg <- max(registrations$signup_timestamp)
max_hour = as.numeric(strptime(strftime(as.POSIXct(last_reg, origin="1970-01-01"),
                                        format="%Y-%m-%d"), format="%Y-%m-%d"))
all_hours = seq(min_hour, max_hour, one_day)
my_empty = rep(NA, length(all_hours))
daily_predictions <- data.frame(ts = all_hours,
                                hour = strftime(as.POSIXct(all_hours, origin="1970-01-01"), "%m/%d"),
                                actual=my_empty, predicted_total=my_empty,
                                lower=my_empty, upper=my_empty)

# Project total registrations at max_hour from the data available at hour_ts.
# Probably a better way to do this, but break into three cases: more than two
# weeks before the event; one-two weeks; within the final week. Relies on the
# globals training_data, latest_rate, upper_rate, lower_rate,
# expected_rate_changes, and expected_rate_change_timepoints set in the
# backtest loop below.
# NOTE: stage_index is currently unused; the stage is inferred from hour_ts.
segmented_prediction <- function(hour_ts, max_hour, stage_index=NULL) {
  current_total = training_data$num_reg[nrow(training_data)]
  # the same piecewise projection applies to the point estimate and both bounds,
  # just with different rates, so factor it out
  project <- function(rate) {
    if (hour_ts <= expected_rate_change_timepoints[1]) {
      # more than two weeks out: up to three rate regimes may still lie ahead
      current_total +
        (min(max_hour, expected_rate_change_timepoints[1]) - hour_ts) * rate +
        I(max_hour >= expected_rate_change_timepoints[1]) *
          (min(max_hour, expected_rate_change_timepoints[2]) - expected_rate_change_timepoints[1]) *
          max(rate + expected_rate_changes[1], 0) +
        I(max_hour >= expected_rate_change_timepoints[2]) *
          (max_hour - expected_rate_change_timepoints[2]) *
          max(rate + sum(expected_rate_changes), 0)
    } else if (hour_ts <= expected_rate_change_timepoints[2]) {
      # between two weeks and one week out: one rate change remains
      current_total +
        (min(max_hour, expected_rate_change_timepoints[2]) - hour_ts) * rate +
        I(max_hour >= expected_rate_change_timepoints[2]) *
          (max_hour - expected_rate_change_timepoints[2]) *
          max(rate + expected_rate_changes[2], 0)
    } else {
      # within the final week: project the current rate straight out
      current_total + (max_hour - hour_ts) * rate
    }
  }
  return(c(project(latest_rate), project(lower_rate), project(upper_rate)))
}

for (hour_index in seq(1, length(all_hours))) {
  hour_ts = all_hours[hour_index]
  training_data <- subset(registrations, signup_timestamp < hour_ts)
  culled_date_breaks = date_breaks[date_breaks + one_day < hour_ts]
  segmented_model <- segmented(glm(num_reg ~ signup_timestamp, data=training_data),
                               seg.Z = ~ signup_timestamp,
                               psi = culled_date_breaks)
  # drop the latest breakpoint while any break term is non-significant, and refit
  coefficients <- coef(summary(segmented_model))
  coefficient_rows <- grepl("^U", rownames(coefficients))
  while (sum(coefficients[coefficient_rows, "Pr(>|t|)"] >= 0.01) > 0) {
    culled_date_breaks <- culled_date_breaks[-length(culled_date_breaks)]
    segmented_model <- segmented(glm(num_reg ~ signup_timestamp, data=training_data),
                                 seg.Z = ~ signup_timestamp,
                                 psi = culled_date_breaks)
    coefficients <- coef(summary(segmented_model))
    coefficient_rows <- grepl("^U", rownames(coefficients))
  }

  # extract the current rate: the base slope plus all break adjustments
  coefficients = coef(summary(segmented_model))
  breaks = summary(segmented_model)$psi
  latest_rate = sum(coefficients[2:nrow(coefficients), "Estimate"])
  # crude standard error for the combined rate: sum the slope-term standard errors
  # (this overstates the error relative to properly combining the variances)
  latest_rate_std = sum(coefficients[coefficients[,"Estimate"] != 0, "Std. Error"]) -
    coefficients["(Intercept)", "Std. Error"]
  upper_rate = max(latest_rate + 1.96 * latest_rate_std, 0)
  lower_rate = max(latest_rate - 1.96 * latest_rate_std, 0)
  latest_rate = max(latest_rate, 0)

  # which "stage" are we in? long slow growth (1), two weeks prior (2), or the
  # preceding week (3), based on the number of segments in the segmented model
  stage_index = sum(grepl("^U", rownames(coef(summary(segmented_model))))) - 1

  predicted = segmented_prediction(hour_ts, last_reg, stage_index)
  # print(sprintf("At index %d, time %d (vs %d), rate %f", hour_index, hour_ts, expected_rate_change_timepoints[1], latest_rate))
  # print(coefficients)
  # print(predicted)
  # future_indices = seq(hour_index, length(all_hours), by=1)
  # future = t(sapply(future_indices, function(i) {
  #   segmented_prediction(hour_ts, all_hours[i])
  # }))
  # points(all_hours[future_indices], future[,1], type="l", col=hour_index)
  daily_predictions[hour_index, c("actual", "predicted_total", "lower", "upper")] =
    c(nrow(training_data), predicted)
}

svg("predicted_registrations.svg", width=8, height=6)
timeplot("num_reg", type="l", ylab="Total registered",
         ylim=c(0, max(daily_predictions$upper)),
         main="Predicted Registrations Over Time")
points(registrations$signup_timestamp, predict(segmented_model, newdata=registrations),
       type="l", col=2)
points(daily_predictions$ts, daily_predictions$predicted_total, type="l", col=3)
points(daily_predictions$ts, daily_predictions$lower, type="l", col=3, lty=2)
points(daily_predictions$ts, daily_predictions$upper, type="l", col=3, lty=2)
legend("left", legend=c("running forecast", "post-hoc segmented model", "actual"),
       fill=c(3, 2, 1))
dev.off()
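# How far off was the running forecast, day by day? A rough post-hoc check
# (a sketch): compare each day's predicted eventual total to the true total.
final_total = nrow(registrations)
summary(daily_predictions$predicted_total - final_total)
# and how often the true total fell inside the running prediction interval
mean(daily_predictions$lower <= final_total & final_total <= daily_predictions$upper)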