wehrley · wjholst · Jan 25, 2016 · Jan 26, 2016 · Jan 29, 2016
diff --git a/.RData b/.RData
diff --git a/.Rhistory b/.Rhistory
@@ -0,0 +1,375 @@
+ggmap(chicago)
+??ggmap
+library(XLConnect)
+library(XLConnect)
+ff <-readWorksheetFromFile("FFDraft2015.xlsx")
+ff <-readWorksheetFromFile("FFDraft2015.xlsx",sheet=1)
+ff
+?<<-
+x=matrix[1:12,3,4]
+x<-matrix[1:12,3,4]
+x<-matrix(1:12,3,4)
+x
+x(3,1)
+x[3,1]
+x[2,]
+x[,2]
+?[]
+x[i=3,j=1]
+x[j=3,i=1]
+x<-matrix(1:24,3,4,2)
+x
+x[1,]
+x[2,]
+x[,1]
+x[,3]
+x[,,1]
+?matrix
+getwd()
+dir()
+x = rnorm(100,mean=rep(1:5),each=20,sd=.2)
+x = rnorm(100,mean=rep(1:5,each=20),sd=.2)
+x
+y = rnorm(100,mean=rep(1:4,each=25),sd=.15)
+x = rnorm(100,mean=rep(1:5,each=20),sd=.2)
+dataframe=data.frame(x,y)
+plot (x,y)
+distxy = dist(dataframe)
+distxy
+hcl = hclust(distxy)
+plot(hcl)
+set.seed (1234)
+x = rnorm(50,mean=rep(1:5,each=10),sd=.2)
+y = rnorm(60,mean=rep(1:4,each=15),sd=.25)
+plot(x,y)
+plot(x=x,y=y)
+set.seed (1234)
+x = rnorm(60,mean=rep(1:5,each=10),sd=.2)
+y = rnorm(60,mean=rep(1:4,each=15),sd=.25)
+plot(x,y)
+distxy=dist(x,y)
+df = data.frame(x,y)
+distxy=dist(x,y)
+distxy=dist(df)
+hc = hclust(df)
+hc = hclust(distxy)
+plot (hc)
+str(hc)
+?hclust
+plot(hc,hang=-1)
+plot(hc,hang=3)
+plot(hc,hang=5)
+plot(hc,hang=10)
+plot(hc,hang=-5)
+?plot
+pie(hc)
+pie(xydist)
+pie(distxy)
+pie(hc)
+pie(x)
+pie(y)
+pie(c(1,2,3,4))
+pie(c(1,2,3,4,2))
+set.seed (1234)
+x = rnorm(60,mean=rep(1:5,each=10),sd=.2)
+y = rnorm(60,mean=rep(1:4,each=15),sd=.25)
+df = data.frame(x,y)
+distxy=dist(df)
+hc = hclust(distxy)
+plot(hc)
+setwd("~/GitHub/Titanic")
+install.packages("Amelia")
+setwd("~/GitHub/Titanic")
+install.packages("corrgram")
+install.packages("Hmisc")
+install.packages("stringr")
+install.packages("plyr")
+?revalue
+library (plyr)
+install.packages("plyr")
+library(plyr)
+library(parse)
+R.Version()
+?corrgram
+library(corrgram)
+?corrgram
+corrgram
+train.raw <- readData(Titanic.path, train.data.file,
+train.column.types, missing.types)
+df.train <- train.raw
+readData <- function(path.name, file.name, column.types, missing.types) {
+read.csv( url( paste(path.name, file.name, sep="") ),
+colClasses=column.types,
+na.strings=missing.types )
+}
+train.raw <- readData(Titanic.path, train.data.file,
+train.column.types, missing.types)
+Titanic.path <- "https://raw.github.com/wehrley/Kaggle_Titanic/master/"
+train.data.file <- "train.csv"
+test.data.file <- "test.csv"
+missing.types <- c("NA", "")
+train.column.types <- c('integer',   # PassengerId
+'factor',    # Survived
+'factor',    # Pclass
+'character', # Name
+'factor',    # Sex
+'numeric',   # Age
+'integer',   # SibSp
+'integer',   # Parch
+'character', # Ticket
+'numeric',   # Fare
+'character', # Cabin
+'factor'     # Embarked
+)
+test.column.types <- train.column.types[-2]     # # no Survived
+train.raw <- readData(Titanic.path, train.data.file,
+train.column.types, missing.types)
+df.train <- train.raw
+test.raw <- readData(Titanic.path, test.data.file,
+test.column.types, missing.types)
+df.train$Title <- factor(df.train$Title,
+c("Capt","Col","Major","Sir","Lady","Rev",
+"Dr","Don","Jonkheer","the Countess","Mrs",
+"Ms","Mr","Mme","Mlle","Miss","Master"))
+getTitle <- function(data) {
+title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
+title.comma.end <- title.dot.start
++ attr(title.dot.start, "match.length")-1
+data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1)
+return (data$Title)
+}
+df.train$Title <- getTitle(df.train)
+unique(df.train$Title)
+df.train$Title
+names(df.train)
+df.train$Title <- getTitle(df.train)
+df.train$Title
+boxplot(df.train$Age ~ df.train$Survived,
+main="Passenger Fate by Age",
+xlab="Survived", ylab="Age")
+getTitle <- function(data) {
+title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
+title.comma.end <- title.dot.start
++ attr(title.dot.start, "match.length")-1
+data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1)
+return (data$Title)
+}
+df.train$Title <- getTitle(df.train)
+df.train$Title
+df.train
+getTitle <- function(data) {
+title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
+title.comma.end <- title.dot.start
++ attr(title.dot.start, "match.length")-1
+data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1)
+data$Title
+return (data$Title)
+}
+df.train$Title <- getTitle(df.train)
+x = <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE)
+x <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE)
+x
+str(df.train)
+barplot(table(df.train$Survived),
+names.arg = c("Perished", "Survived"),
+main="Survived (passenger fate)", col="black")
+getTitle(df.train)
+getTitle(df.train$Name)
+gegetTitle
+getTitle
+title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
+title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE)
+title.comma.end <- title.dot.start
++ attr(title.dot.start, "match.length")-1
+Title <- substr(df.train$Name, title.dot.start+2, title.comma.end-1)
+Title
+Title <- substr(df.train$Name, title.dot.start+2, 10)
+Title
+?substr
+Title <- substr(df.train$Name, title.dot.start+2, title.dot.start+2+title.comma.end-1)
+Title
+head(title.dot.start,5)
+head(title.comma.end,5)
+title.comma.end <- title.dot.start
++ attr(title.dot.start, "match.length")-1
+data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1)
+data$title
+Title <- substr(data$Name, title.dot.start+2, title.comma.end-1)
+Title <- substr(df.train$Name, title.dot.start+2, title.comma.end-1)
+Title
+Title <- substr(df.train$Name, title.comma.end-1, title.dot.start - 2)
+Title
+Title <- substr(df.train$Name, title.comma.end-1, title.dot.start - 2)
+head(title.comma.end)
+head(title.dot.start)
+title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE)
+head(title.dot.start)
+title.comma.end <- title.dot.start
++ attr(title.dot.start, "match.length")-1
+head(title.comma.end,5)
+attr(title.dot.start, "match.length")
+head(attr(title.dot.start, "match.length"),5)
+title.comma.end <- title.dot.start +
+attr(title.dot.start, "match.length")
+head(title.comma.end)
+Title <- substr(df.train$Name, title.dot.start+2, title.comma.end-1)
+Title
+getTitle <- function(data) {
+title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
+title.comma.end <- title.dot.start +
+attr(title.dot.start, "match.length")-1
+data$Title <- substr(data$Name,title.dot.start+2,title.comma.end-)
+return (data$Title)
+}
+getTitle <- function(data) {
+title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
+title.comma.end <- title.dot.start +
+attr(title.dot.start, "match.length")-1
+data$Title <- substr(data$Name,title.dot.start+2,
+title.comma.end-1)
+return (data$Title)
+}
+df.train$Title <- getTitle(df.train)
+unique(df.train$Title)
+install.packages("caret")
+library(caret)
+?createDataPartition
+df.train$Survived
+training.rows <- createDataPartition(
+df.train, p = 0.8, list = FALSE)
+featureEngrg <- function(data) {
+## Using Fate ILO Survived because term is shorter and just sounds good
+data$Fate <- data$Survived
+## Revaluing Fate factor to ease assessment of confusion matrices later
+data$Fate <- revalue(data$Fate, c("1" = "Survived", "0" = "Perished"))
+## Boat.dibs attempts to capture the "women and children first"
+## policy in one feature.  Assuming all females plus males under 15
+## got "dibs' on access to a lifeboat
+data$Boat.dibs <- "No"
+data$Boat.dibs[which(data$Sex == "female" | data$Age < 15)] <- "Yes"
+data$Boat.dibs <- as.factor(data$Boat.dibs)
+## Family consolidates siblings and spouses (SibSp) plus
+## parents and children (Parch) into one feature
+data$Family <- data$SibSp + data$Parch
+## Fare.pp attempts to adjust group purchases by size of family
+data$Fare.pp <- data$Fare/(data$Family + 1)
+## Giving the traveling class feature a new look
+data$Class <- data$Pclass
+data$Class <- revalue(data$Class,
+c("1"="First", "2"="Second", "3"="Third"))
+## First character in Cabin number represents the Deck
+data$Deck <- substring(data$Cabin, 1, 1)
+data$Deck[ which( is.na(data$Deck ))] <- "UNK"
+data$Deck <- as.factor(data$Deck)
+## Odd-numbered cabins were reportedly on the port side of the ship
+## Even-numbered cabins assigned Side="starboard"
+data$cabin.last.digit <- str_sub(data$Cabin, -1)
+data$Side <- "UNK"
+data$Side[which(isEven(data$cabin.last.digit))] <- "port"
+data$Side[which(isOdd(data$cabin.last.digit))] <- "starboard"
+data$Side <- as.factor(data$Side)
+data$cabin.last.digit <- NULL
+return (data)
+}
+## add remaining features to training data frame
+df.train <- featureEngrg(df.train)
+isEven <- function(x) x %in% c("0","2","4","6","8")
+## test a character as an ODD single digit
+isOdd <- function(x) x %in% c("1","3","5","7","9")
+featureEngrg <- function(data) {
+## Using Fate ILO Survived because term is shorter and just sounds good
+data$Fate <- data$Survived
+## Revaluing Fate factor to ease assessment of confusion matrices later
+data$Fate <- revalue(data$Fate, c("1" = "Survived", "0" = "Perished"))
+## Boat.dibs attempts to capture the "women and children first"
+## policy in one feature.  Assuming all females plus males under 15
+## got "dibs' on access to a lifeboat
+data$Boat.dibs <- "No"
+data$Boat.dibs[which(data$Sex == "female" | data$Age < 15)] <- "Yes"
+data$Boat.dibs <- as.factor(data$Boat.dibs)
+## Family consolidates siblings and spouses (SibSp) plus
+## parents and children (Parch) into one feature
+data$Family <- data$SibSp + data$Parch
+## Fare.pp attempts to adjust group purchases by size of family
+data$Fare.pp <- data$Fare/(data$Family + 1)
+## Giving the traveling class feature a new look
+data$Class <- data$Pclass
+data$Class <- revalue(data$Class,
+c("1"="First", "2"="Second", "3"="Third"))
+## First character in Cabin number represents the Deck
+data$Deck <- substring(data$Cabin, 1, 1)
+data$Deck[ which( is.na(data$Deck ))] <- "UNK"
+data$Deck <- as.factor(data$Deck)
+## Odd-numbered cabins were reportedly on the port side of the ship
+## Even-numbered cabins assigned Side="starboard"
+data$cabin.last.digit <- str_sub(data$Cabin, -1)
+data$Side <- "UNK"
+data$Side[which(isEven(data$cabin.last.digit))] <- "port"
+data$Side[which(isOdd(data$cabin.last.digit))] <- "starboard"
+data$Side <- as.factor(data$Side)
+data$cabin.last.digit <- NULL
+return (data)
+}
+## add remaining features to training data frame
+df.train <- featureEngrg(df.train)
+install.packages("stringr")
+library (stringr)
+featureEngrg <- function(data) {
+## Using Fate ILO Survived because term is shorter and just sounds good
+data$Fate <- data$Survived
+## Revaluing Fate factor to ease assessment of confusion matrices later
+data$Fate <- revalue(data$Fate, c("1" = "Survived", "0" = "Perished"))
+## Boat.dibs attempts to capture the "women and children first"
+## policy in one feature.  Assuming all females plus males under 15
+## got "dibs' on access to a lifeboat
+data$Boat.dibs <- "No"
+data$Boat.dibs[which(data$Sex == "female" | data$Age < 15)] <- "Yes"
+data$Boat.dibs <- as.factor(data$Boat.dibs)
+## Family consolidates siblings and spouses (SibSp) plus
+## parents and children (Parch) into one feature
+data$Family <- data$SibSp + data$Parch
+## Fare.pp attempts to adjust group purchases by size of family
+data$Fare.pp <- data$Fare/(data$Family + 1)
+## Giving the traveling class feature a new look
+data$Class <- data$Pclass
+data$Class <- revalue(data$Class,
+c("1"="First", "2"="Second", "3"="Third"))
+## First character in Cabin number represents the Deck
+data$Deck <- substring(data$Cabin, 1, 1)
+data$Deck[ which( is.na(data$Deck ))] <- "UNK"
+data$Deck <- as.factor(data$Deck)
+## Odd-numbered cabins were reportedly on the port side of the ship
+## Even-numbered cabins assigned Side="starboard"
+data$cabin.last.digit <- str_sub(data$Cabin, -1)
+data$Side <- "UNK"
+data$Side[which(isEven(data$cabin.last.digit))] <- "port"
+data$Side[which(isOdd(data$cabin.last.digit))] <- "starboard"
+data$Side <- as.factor(data$Side)
+data$cabin.last.digit <- NULL
+return (data)
+}
+## add remaining features to training data frame
+df.train <- featureEngrg(df.train)
+train.keeps <- c("Fate", "Sex", "Boat.dibs", "Age", "Title",
+"Class", "Deck", "Side", "Fare", "Fare.pp",
+"Embarked", "Family")
+df.train.munged <- df.train[train.keeps]
+str(df.train.munged)
+set.seed(23)
+training.rows <- createDataPartition(
+df.train.munged$Survived, p = 0.8, list = FALSE)
+df.train.munged$Survived
+df.train.munged$
+df.train.munged
+df.train.munged
+str(df.train.munged)
+training.rows <- createDataPartition(
+df.train.munged$Fate, p = 0.8, list = FALSE)
+training.rows
+install.packages("pROC")
+library (pROC)
+library (pROC)
+install.packages("randomForest")
+library(randomForest)
+install.packages("ada")
+library(ada)
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,17 @@
+# Auto detect text files and perform LF normalization
+* text=auto
+
+# Custom for Visual Studio
+*.cs     diff=csharp
+
+# Standard to msysgit
+*.doc	 diff=astextplain
+*.DOC	 diff=astextplain
+*.docx diff=astextplain
+*.DOCX diff=astextplain
+*.dot  diff=astextplain
+*.DOT  diff=astextplain
+*.pdf  diff=astextplain
+*.PDF	 diff=astextplain
+*.rtf	 diff=astextplain
+*.RTF	 diff=astextplain