# Exploratory analysis of registrations: running attendance totals, age and
# gender distributions, and the authentication services used to sign up.
# Reads cleaned_registrations.csv and writes several SVG plots.

# NOTE(review): hard-coded, machine-specific working directory. It is kept
# because every relative path below (input CSV, output SVGs) resolves against
# it; prefer running the script from the project root and dropping this call.
setwd("/Users/lotze/workspace/sfjourney_analysis")

# library() stops immediately when the package is missing, whereas require()
# would only warn and return FALSE.
library(RColorBrewer)

registrations <- read.csv("cleaned_registrations.csv", stringsAsFactors = FALSE)
registrations <- registrations[order(registrations$signup_timestamp), ]

##### Section 1 #####

# Running totals in sign-up order. seq_len() stays correct for an empty frame,
# where 1:nrow() would produce c(1, 0).
registrations$num_reg <- seq_len(nrow(registrations))
registrations$num_attended <- cumsum(registrations$attended)
registrations$pct_attended <- registrations$num_attended / registrations$num_reg

# as.logical() + which() select the attended rows whether the flag was read as
# logical or as 0/1, and skip rows whose flag is missing instead of returning
# all-NA rows.
attended <- registrations[which(as.logical(registrations$attended)), ]
attended$num_reg <- seq_len(nrow(attended))

# histogram of signup ages
svg("age_histogram.svg", width = 8, height = 6)
hist(
  registrations$age,
  xlim = c(15, 44),
  breaks = seq(-0.5, 100.5),  # one bin per whole year of age
  xlab = "age",
  main = "Histogram of Registrations by Age"
)
dev.off()

summary(registrations$age)
quantile(registrations$age, prob = c(0.01, 0.025, 0.05, 0.95, 0.975, 0.99))

# unused heatmap of gender/age
ages <- seq(0, max(registrations$age), by = 1)

# NOTE(review): `male` is only ever compared against 0.5 here; presumably it is
# a probability-like score. Rows with a missing score stay "undetermined".
# A score of exactly 0.5 is labelled "F" here but is counted in neither
# num_males nor num_females below -- confirm which is intended.
registrations$gender <- "undetermined"
registrations$gender[registrations$male > 0.5] <- "M"
registrations$gender[registrations$male <= 0.5] <- "F"

# Count registrations per age for each gender. Tabulating a factor whose levels
# are `ages` yields one count (possibly 0) per age, already aligned with `ages`,
# and avoids temporaries named M / F (F would mask the FALSE alias).
by_age <- data.frame(
  age = ages,
  M = as.numeric(table(factor(
    registrations$age[registrations$gender == "M"], levels = ages
  ))),
  F = as.numeric(table(factor(
    registrations$age[registrations$gender == "F"], levels = ages
  )))
)
by_age$total <- by_age$F + by_age$M
# Label rows by age so the heatmap rows show ages rather than row indices.
rownames(by_age) <- by_age$age

heatmap(
  as.matrix(by_age[, c("M", "F")] / sum(by_age$total)),
  Rowv = NA,
  Colv = NA,
  scale = "none"
)

# barplot of gender/age
svg("age_gender_barplot.svg", width = 8, height = 6)
main_ages <- by_age[by_age$age >= 14 & by_age$age <= 50, ]
barplot(
  t(as.matrix(main_ages[, c("M", "F")])),
  names.arg = main_ages$age,
  beside = TRUE,
  legend = TRUE,
  xlab = "age",
  ylab = "registrations",
  main = "Registrations by Age and Gender"
)
dev.off()

# gender comparison: about 1.5 (3 men to every 2 women)
num_males <- sum(registrations$male > 0.5, na.rm = TRUE)
num_females <- sum(registrations$male < 0.5, na.rm = TRUE)
num_females / (num_males + num_females)

# The title is derived from the counts so it cannot drift from the data.
pct_male <- 100 * num_males / (num_males + num_females)
svg("why_is_this_a_plot.svg", width = 8, height = 6)
barplot(
  as.matrix(c(num_males, num_females)),
  beside = TRUE,
  legend = c("Male", "Female"),
  ylab = "registered",
  main = sprintf("%.0f%% Male, %.0f%% Female", pct_male, 100 - pct_male)
)
dev.off()

# interestingly, there does seem to be a slight age difference, with males
# skewing older
age_probs <- c(0.01, 0.025, 0.25, 0.5, 0.75, 0.975, 0.99)
# na.rm = TRUE because a missing `male` score yields an NA age in the subset.
quantile(registrations$age[registrations$male > 0.5], prob = age_probs,
         na.rm = TRUE)
quantile(registrations$age[registrations$male < 0.5], prob = age_probs,
         na.rm = TRUE)
hist(registrations$age[registrations$male > 0.5], xlim = c(15, 44), breaks = 100)
hist(registrations$age[registrations$male < 0.5], xlim = c(15, 44), breaks = 100)

# interesting note: breakdown of registration methods
# Adds one logical column per service (is_facebook, is_google, is_twitter) that
# is TRUE when the service name occurs in `services`, and prints its share.
for (method in c("facebook", "google", "twitter")) {
  flag_col <- sprintf("is_%s", method)
  registrations[[flag_col]] <- grepl(method, registrations$services)
  print(sprintf(
    "%s: %0.2f%%",
    method,
    100 * sum(registrations[[flag_col]]) / nrow(registrations)
  ))
}
summary(registrations[, c("is_facebook", "is_google", "is_twitter")])
# wow! more people authenticated with facebook than google!
# Bar chart of the share of registrations that used each authentication
# service. Relies on the logical is_facebook / is_google / is_twitter columns
# of `registrations`.
svg("authentication_bars.svg", width = 8, height = 6)
# vapply() pins the result type (one integer count per column); sapply()'s
# return type depends on its input.
reg_percents <- 100 * vapply(
  registrations[, c("is_facebook", "is_google", "is_twitter")],
  sum,
  integer(1)
) / nrow(registrations)
barplot(
  reg_percents,
  beside = TRUE,
  ylab = "percent registrations using method",
  names.arg = c("facebook", "google", "twitter"),
  main = "Authentication method Frequency"
)
dev.off()

# and breakdown of email domains
# Domains ordered from most to least frequent, expressed as percentages.
domain_table <- table(registrations$domain)
domain_table <- rev(sort(domain_table))
domain_names <- names(domain_table)
domain_table <- 100 * domain_table / sum(domain_table)
names(domain_table) <- domain_names
# head() returns at most n entries; indexing with 1:n would pad with NA when
# there are fewer than n distinct values.
head(domain_table, 30)
# yep; gmail is king of email; yahoo and hotmail still alive; a few local
# universities
svg("email_bars.svg", width = 8, height = 6)
barplot(
  head(domain_table, 6),
  ylab = "percent registrations",
  main = "Email Domain Frequency"
)
dev.off()

# and breakdown of area codes
area_code_table <- rev(sort(table(registrations$area_code)))
head(area_code_table, 20)
# 415 and 510, sure -- but a lot of diversity (maybe not surprisingly)

# Keep the ten most frequent area codes as their own factor levels and lump
# every other value (including a missing area code) into "small".
top_area_codes <- names(head(area_code_table, 10))
registrations$area_code_factor <- registrations$area_code
registrations$area_code_factor[
  !(registrations$area_code %in% top_area_codes)
] <- "small"
registrations$area_code_factor <- as.factor(registrations$area_code_factor)