Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .RData
Binary file not shown.
375 changes: 375 additions & 0 deletions .Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,375 @@
# Interactive scratch session: a ggmap call, an Excel import with XLConnect,
# then practice with matrix construction and indexing.
ggmap(chicago)
??ggmap
library(XLConnect)
library(XLConnect)
# The first read omits the sheet; the retry on the next line passes sheet=1.
ff <-readWorksheetFromFile("FFDraft2015.xlsx")
ff <-readWorksheetFromFile("FFDraft2015.xlsx",sheet=1)
ff
?<<-
# `matrix[...]` indexes the function object instead of calling it; the
# parenthesised call on the third line below is the working form.
x=matrix[1:12,3,4]
x<-matrix[1:12,3,4]
x<-matrix(1:12,3,4)
x
# x(3,1) treats x as a function; x[3,1] is the element lookup.
x(3,1)
x[3,1]
x[2,]
x[,2]
?[]
x[i=3,j=1]
x[j=3,i=1]
# NOTE(review): the fourth positional argument of matrix() is `byrow`, so this
# presumably still builds a 3x4 matrix rather than a 3-D array -- confirm
# whether array() was intended, given the three-index x[,,1] attempt below.
x<-matrix(1:24,3,4,2)
x
x[1,]
x[2,]
x[,1]
x[,3]
x[,,1]
?matrix
getwd()
dir()
# Hierarchical-clustering experiments on simulated 2-D points: draw x and y
# from normals with stepped means, compute pairwise distances, cluster, plot.

# In the first call `each=20` sits outside rep() and is therefore passed to
# rnorm(); the next line moves it inside rep().
x = rnorm(100,mean=rep(1:5),each=20,sd=.2)
x = rnorm(100,mean=rep(1:5,each=20),sd=.2)
x
y = rnorm(100,mean=rep(1:4,each=25),sd=.15)
x = rnorm(100,mean=rep(1:5,each=20),sd=.2)
dataframe=data.frame(x,y)
plot (x,y)
distxy = dist(dataframe)
distxy
hcl = hclust(distxy)
plot(hcl)
# Second attempt with a fixed seed. Here x has 50 draws and y has 60, so the
# vectors differ in length; the retry below draws 60 of each.
set.seed (1234)
x = rnorm(50,mean=rep(1:5,each=10),sd=.2)
y = rnorm(60,mean=rep(1:4,each=15),sd=.25)
plot(x,y)
plot(x=x,y=y)
set.seed (1234)
# rep(1:5,each=10) has 50 entries for 60 draws, so the mean vector is recycled.
x = rnorm(60,mean=rep(1:5,each=10),sd=.2)
y = rnorm(60,mean=rep(1:4,each=15),sd=.25)
plot(x,y)
# dist() takes the data as its first argument (its second is `method`), and
# hclust() takes a dist object; the dist(df) / hclust(distxy) lines are the
# forms that match those signatures.
distxy=dist(x,y)
df = data.frame(x,y)
distxy=dist(x,y)
distxy=dist(df)
hc = hclust(df)
hc = hclust(distxy)
plot (hc)
str(hc)
?hclust
# Trying different `hang` values for the dendrogram labels.
plot(hc,hang=-1)
plot(hc,hang=3)
plot(hc,hang=5)
plot(hc,hang=10)
plot(hc,hang=-5)
?plot
# Trying pie() on assorted objects.
# NOTE(review): `xydist` is never assigned in the visible history (the
# distance object is `distxy`) -- presumably a typo.
pie(hc)
pie(xydist)
pie(distxy)
pie(hc)
pie(x)
pie(y)
pie(c(1,2,3,4))
pie(c(1,2,3,4,2))
# Clean re-run of the seeded simulation -> distance -> clustering -> plot.
set.seed (1234)
x = rnorm(60,mean=rep(1:5,each=10),sd=.2)
y = rnorm(60,mean=rep(1:4,each=15),sd=.25)
df = data.frame(x,y)
distxy=dist(df)
hc = hclust(distxy)
plot(hc)
# Titanic project setup: switch to the project directory and install/load the
# packages used later in the session.
setwd("~/GitHub/Titanic")
install.packages("Amelia")
setwd("~/GitHub/Titanic")
install.packages("corrgram")
install.packages("Hmisc")
install.packages("stringr")
install.packages("plyr")
?revalue
library (plyr)
install.packages("plyr")
library(plyr)
# NOTE(review): `parse` is not a package installed anywhere in this history --
# presumably a mistaken library() call.
library(parse)
R.Version()
?corrgram
library(corrgram)
?corrgram
corrgram
# First load attempt. In this history readData() and its arguments
# (Titanic.path, train.data.file, train.column.types, missing.types) are only
# defined further down, so this call precedes their definitions.
train.raw <- readData(Titanic.path, train.data.file,
train.column.types, missing.types)
df.train <- train.raw
## Read a CSV file from a URL into a data frame.
##
## Args:
##   path.name:     URL prefix; concatenated with `file.name` with no separator,
##                  so it must already end in "/" when one is needed.
##   file.name:     file name appended to `path.name`.
##   column.types:  passed to read.csv() as `colClasses`.
##   missing.types: passed to read.csv() as `na.strings`.
## Returns:
##   The data frame produced by read.csv().
readData <- function(path.name, file.name, column.types, missing.types) {
  csv.location <- paste0(path.name, file.name)
  read.csv(url(csv.location),
           colClasses = column.types,
           na.strings = missing.types)
}
# Second load attempt; the path, file-name and column-type values it uses are
# assigned on the lines that follow it in this history.
train.raw <- readData(Titanic.path, train.data.file,
train.column.types, missing.types)
# Location and names of the train/test CSV files.
Titanic.path <- "https://raw.github.com/wehrley/Kaggle_Titanic/master/"
train.data.file <- "train.csv"
test.data.file <- "test.csv"
# Strings that read.csv() should treat as missing values.
missing.types <- c("NA", "")
# Column classes for the training file, one per column in file order.
train.column.types <- c('integer', # PassengerId
'factor', # Survived
'factor', # Pclass
'character', # Name
'factor', # Sex
'numeric', # Age
'integer', # SibSp
'integer', # Parch
'character', # Ticket
'numeric', # Fare
'character', # Cabin
'factor' # Embarked
)
# Same classes for the test file, minus the second entry (Survived).
test.column.types <- train.column.types[-2] # # no Survived
# Load with everything defined; df.train is the working copy of the raw data.
train.raw <- readData(Titanic.path, train.data.file,
train.column.types, missing.types)
df.train <- train.raw
test.raw <- readData(Titanic.path, test.data.file,
test.column.types, missing.types)
# Convert Title to a factor with an explicit level order.
# NOTE(review): in this history the Title column is first assigned by the
# getTitle() call further down, so it may not exist yet at this point.
df.train$Title <- factor(df.train$Title,
c("Capt","Col","Major","Sir","Lady","Rev",
"Dr","Don","Jonkheer","the Countess","Mrs",
"Ms","Mr","Mme","Mlle","Miss","Master"))
## Extract the title (e.g. "Mr", "Miss") from each passenger name.
##
## The title is the text between ", " and the next "." in names shaped like
## "Surname, Title. Given names".
##
## Args:
##   data: data frame with a `Name` column.
## Returns:
##   Character vector with one title per name; "" where the pattern is absent.
getTitle <- function(data) {
  ## Despite its name, this holds the position of the comma that opens
  ## ", Title." (third argument TRUE = ignore.case).
  title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
  ## Position of the closing ".". The `+` has to end the first line: when it
  ## starts the continuation line, R treats the assignment as complete and
  ## discards the offset, which made every title come out empty.
  title.comma.end <- title.dot.start +
    attr(title.dot.start, "match.length") - 1
  ## Skip ", " at the front and stop just before the ".".
  data$Title <- substr(data$Name, title.dot.start + 2, title.comma.end - 1)
  data$Title
}
# Apply getTitle() to the training data and inspect the resulting column,
# then re-run it and look again.
df.train$Title <- getTitle(df.train)
unique(df.train$Title)
df.train$Title
names(df.train)
df.train$Title <- getTitle(df.train)
df.train$Title
# Box plot of Age split by the Survived factor.
boxplot(df.train$Age ~ df.train$Survived,
main="Passenger Fate by Age",
xlab="Survived", ylab="Age")
## Pull the title (such as "Mr" or "Mrs") out of every passenger name.
##
## Names are matched against ", Title." and the text between the ", " and
## the "." is returned.
##
## Args:
##   data: data frame with a `Name` column.
## Returns:
##   Character vector of titles, "" for names that do not match.
getTitle <- function(data) {
  ## Start of the ", Title." match, i.e. the comma (TRUE = ignore.case).
  title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
  ## End of the match, i.e. the ".". The trailing `+` keeps the expression
  ## open across the line break; with a leading `+` the second line was a
  ## separate, discarded statement and the end equalled the start.
  title.comma.end <- title.dot.start +
    attr(title.dot.start, "match.length") - 1
  data$Title <- substr(data$Name, title.dot.start + 2, title.comma.end - 1)
  data$Title
}
# Re-apply getTitle() and print the Title column, then the whole data frame,
# to inspect the result.
df.train$Title <- getTitle(df.train)
df.train$Title
df.train
## Extract each passenger's title ("Mr", "Miss", ...) from the Name column.
##
## Args:
##   data: data frame with a `Name` column of the form
##         "Surname, Title. Given names".
## Returns:
##   Character vector of titles; "" for names without a ", Title." part.
getTitle <- function(data) {
  ## Comma position of the ", Title." match (TRUE = ignore.case).
  title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
  ## Position of the closing ".". The `+` must finish the first line, or the
  ## match-length offset becomes a stand-alone statement and is dropped.
  title.comma.end <- title.dot.start +
    attr(title.dot.start, "match.length") - 1
  data$Title <- substr(data$Name, title.dot.start + 2, title.comma.end - 1)
  ## The bare `data$Title` debugging expression was removed: inside a
  ## function it evaluates to nothing visible.
  data$Title
}
# Step-by-step debugging of the title extraction at the console.
df.train$Title <- getTitle(df.train)
# `= <-` is not valid syntax; the next line is the corrected assignment.
x = <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE)
x <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE)
x
str(df.train)
barplot(table(df.train$Survived),
names.arg = c("Perished", "Survived"),
main="Survived (passenger fate)", col="black")
getTitle(df.train)
# getTitle() reads data$Name, so it expects the data frame rather than the
# Name vector itself.
getTitle(df.train$Name)
gegetTitle
getTitle
# Running the function body by hand. `data` only exists as a parameter inside
# getTitle(), so the first line is redone with df.train.
title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE)
# The assignment ends at the line break; the `+ attr(...)` line below it is a
# separate expression, so title.comma.end just equals title.dot.start here.
title.comma.end <- title.dot.start
+ attr(title.dot.start, "match.length")-1
Title <- substr(df.train$Name, title.dot.start+2, title.comma.end-1)
Title
# Trying other stop positions for substr().
Title <- substr(df.train$Name, title.dot.start+2, 10)
Title
?substr
Title <- substr(df.train$Name, title.dot.start+2, title.dot.start+2+title.comma.end-1)
Title
head(title.dot.start,5)
head(title.comma.end,5)
title.comma.end <- title.dot.start
+ attr(title.dot.start, "match.length")-1
data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1)
data$title
Title <- substr(data$Name, title.dot.start+2, title.comma.end-1)
Title <- substr(df.train$Name, title.dot.start+2, title.comma.end-1)
Title
Title <- substr(df.train$Name, title.comma.end-1, title.dot.start - 2)
Title
Title <- substr(df.train$Name, title.comma.end-1, title.dot.start - 2)
head(title.comma.end)
head(title.dot.start)
title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE)
head(title.dot.start)
title.comma.end <- title.dot.start
+ attr(title.dot.start, "match.length")-1
head(title.comma.end,5)
attr(title.dot.start, "match.length")
head(attr(title.dot.start, "match.length"),5)
# With the `+` at the end of the first line the two lines form one
# expression. This version has no trailing -1, so the end position is one
# further right than in the getTitle() definitions.
title.comma.end <- title.dot.start +
attr(title.dot.start, "match.length")
head(title.comma.end)
Title <- substr(df.train$Name, title.dot.start+2, title.comma.end-1)
Title
## Extract the title (e.g. "Mr", "Dr") from each passenger name.
##
## Args:
##   data: data frame with a `Name` column shaped like
##         "Surname, Title. Given names".
## Returns:
##   Character vector of titles; "" where no ", Title." is found.
getTitle <- function(data) {
  ## Comma position of the ", Title." match (TRUE = ignore.case).
  title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
  ## Position of the "." that closes the match.
  title.comma.end <- title.dot.start +
    attr(title.dot.start, "match.length") - 1
  ## Stop one character before the "." -- the original line ended in
  ## `title.comma.end-)`, a subtraction with no right operand.
  data$Title <- substr(data$Name, title.dot.start + 2, title.comma.end - 1)
  data$Title
}
## Return the title embedded in each passenger name.
##
## A name such as "Braund, Mr. Owen Harris" is matched against ", Title."
## (letters and spaces, case-insensitive) and the text between the ", " and
## the "." is kept.
##
## Args:
##   data: data frame with a `Name` column.
## Returns:
##   Character vector of titles, "" for names that do not match.
getTitle <- function(data) {
  hit <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, ignore.case = TRUE)
  ## The match starts at the comma, so the title begins two characters later
  ## and ends one character before the final "." of the match.
  first.char <- hit + 2
  last.char <- hit + attr(hit, "match.length") - 2
  data$Title <- substr(data$Name, first.char, last.char)
  data$Title
}
# Apply the working getTitle() and list the distinct titles found.
df.train$Title <- getTitle(df.train)
unique(df.train$Title)
install.packages("caret")
library(caret)
?createDataPartition
df.train$Survived
# First partition attempt: the whole data frame is passed as the first
# argument. The later calls in this history pass a single column instead.
training.rows <- createDataPartition(
df.train, p = 0.8, list = FALSE)
## Add the engineered features to a passenger data frame.
##
## Appends Fate, Boat.dibs, Family, Fare.pp, Class, Deck and Side (in that
## order) and returns the augmented data frame. Needs revalue(), str_sub(),
## isEven() and isOdd() to be available when it is called.
featureEngrg <- function(data) {
  ## Fate is Survived with readable level names, which makes confusion
  ## matrices easier to read later
  data$Fate <- revalue(data$Survived, c("1" = "Survived", "0" = "Perished"))

  ## Boat.dibs folds "women and children first" into one flag: every female,
  ## plus anyone younger than 15, is assumed to have had "dibs" on a lifeboat
  had.dibs <- which(data$Sex == "female" | data$Age < 15)
  data$Boat.dibs <- "No"
  data$Boat.dibs[had.dibs] <- "Yes"
  data$Boat.dibs <- as.factor(data$Boat.dibs)

  ## Family = siblings/spouses (SibSp) + parents/children (Parch)
  data$Family <- data$SibSp + data$Parch

  ## Fare.pp spreads the fare over the passenger plus their family members
  data$Fare.pp <- data$Fare / (data$Family + 1)

  ## Class is Pclass with named levels
  data$Class <- revalue(data$Pclass,
                        c("1" = "First", "2" = "Second", "3" = "Third"))

  ## Deck is the first character of the cabin number; missing becomes "UNK"
  deck <- substring(data$Cabin, 1, 1)
  deck[which(is.na(deck))] <- "UNK"
  data$Deck <- as.factor(deck)

  ## Side comes from the last character of the cabin number: an even digit
  ## is labelled "port", an odd digit "starboard", anything else stays "UNK".
  ## The last character lives in a scratch column that is dropped right away.
  data$cabin.last.digit <- str_sub(data$Cabin, -1)
  port.rows <- which(isEven(data$cabin.last.digit))
  starboard.rows <- which(isOdd(data$cabin.last.digit))
  data$cabin.last.digit <- NULL
  data$Side <- "UNK"
  data$Side[port.rows] <- "port"
  data$Side[starboard.rows] <- "starboard"
  data$Side <- as.factor(data$Side)

  data
}
## add remaining features to training data frame
## NOTE(review): in this history isEven()/isOdd() are defined only after this
## call and stringr is loaded later still, so this first call presumably failed.
df.train <- featureEngrg(df.train)
## test a character as an EVEN single digit
isEven <- function(x) {
  even.digits <- c("0", "2", "4", "6", "8")
  x %in% even.digits
}
## test a character as an ODD single digit
isOdd <- function(x) {
  odd.digits <- c("1", "3", "5", "7", "9")
  x %in% odd.digits
}
## Add the engineered features to a passenger data frame.
##
## Appends Fate, Boat.dibs, Family, Fare.pp, Class, Deck and Side (in that
## order) and returns the augmented data frame. Needs revalue(), str_sub(),
## isEven() and isOdd() to be available when it is called.
featureEngrg <- function(data) {
  ## Fate is Survived with readable level names, which makes confusion
  ## matrices easier to read later
  data$Fate <- revalue(data$Survived, c("1" = "Survived", "0" = "Perished"))

  ## Boat.dibs folds "women and children first" into one flag: every female,
  ## plus anyone younger than 15, is assumed to have had "dibs" on a lifeboat
  had.dibs <- which(data$Sex == "female" | data$Age < 15)
  data$Boat.dibs <- "No"
  data$Boat.dibs[had.dibs] <- "Yes"
  data$Boat.dibs <- as.factor(data$Boat.dibs)

  ## Family = siblings/spouses (SibSp) + parents/children (Parch)
  data$Family <- data$SibSp + data$Parch

  ## Fare.pp spreads the fare over the passenger plus their family members
  data$Fare.pp <- data$Fare / (data$Family + 1)

  ## Class is Pclass with named levels
  data$Class <- revalue(data$Pclass,
                        c("1" = "First", "2" = "Second", "3" = "Third"))

  ## Deck is the first character of the cabin number; missing becomes "UNK"
  deck <- substring(data$Cabin, 1, 1)
  deck[which(is.na(deck))] <- "UNK"
  data$Deck <- as.factor(deck)

  ## Side comes from the last character of the cabin number: an even digit
  ## is labelled "port", an odd digit "starboard", anything else stays "UNK".
  ## The last character lives in a scratch column that is dropped right away.
  data$cabin.last.digit <- str_sub(data$Cabin, -1)
  port.rows <- which(isEven(data$cabin.last.digit))
  starboard.rows <- which(isOdd(data$cabin.last.digit))
  data$cabin.last.digit <- NULL
  data$Side <- "UNK"
  data$Side[port.rows] <- "port"
  data$Side[starboard.rows] <- "starboard"
  data$Side <- as.factor(data$Side)

  data
}
## add remaining features to training data frame
df.train <- featureEngrg(df.train)
## featureEngrg() calls str_sub(), so stringr is (re)installed and attached
## here before the function is defined and run once more.
install.packages("stringr")
library (stringr)
## Add the engineered features to a passenger data frame.
##
## Appends Fate, Boat.dibs, Family, Fare.pp, Class, Deck and Side (in that
## order) and returns the augmented data frame. Needs revalue(), str_sub(),
## isEven() and isOdd() to be available when it is called.
featureEngrg <- function(data) {
  ## Fate is Survived with readable level names, which makes confusion
  ## matrices easier to read later
  data$Fate <- revalue(data$Survived, c("1" = "Survived", "0" = "Perished"))

  ## Boat.dibs folds "women and children first" into one flag: every female,
  ## plus anyone younger than 15, is assumed to have had "dibs" on a lifeboat
  had.dibs <- which(data$Sex == "female" | data$Age < 15)
  data$Boat.dibs <- "No"
  data$Boat.dibs[had.dibs] <- "Yes"
  data$Boat.dibs <- as.factor(data$Boat.dibs)

  ## Family = siblings/spouses (SibSp) + parents/children (Parch)
  data$Family <- data$SibSp + data$Parch

  ## Fare.pp spreads the fare over the passenger plus their family members
  data$Fare.pp <- data$Fare / (data$Family + 1)

  ## Class is Pclass with named levels
  data$Class <- revalue(data$Pclass,
                        c("1" = "First", "2" = "Second", "3" = "Third"))

  ## Deck is the first character of the cabin number; missing becomes "UNK"
  deck <- substring(data$Cabin, 1, 1)
  deck[which(is.na(deck))] <- "UNK"
  data$Deck <- as.factor(deck)

  ## Side comes from the last character of the cabin number: an even digit
  ## is labelled "port", an odd digit "starboard", anything else stays "UNK".
  ## The last character lives in a scratch column that is dropped right away.
  data$cabin.last.digit <- str_sub(data$Cabin, -1)
  port.rows <- which(isEven(data$cabin.last.digit))
  starboard.rows <- which(isOdd(data$cabin.last.digit))
  data$cabin.last.digit <- NULL
  data$Side <- "UNK"
  data$Side[port.rows] <- "port"
  data$Side[starboard.rows] <- "starboard"
  data$Side <- as.factor(data$Side)

  data
}
## add remaining features to training data frame
df.train <- featureEngrg(df.train)
## Columns kept for modelling; Survived is not among them (Fate takes its place).
train.keeps <- c("Fate", "Sex", "Boat.dibs", "Age", "Title",
"Class", "Deck", "Side", "Fare", "Fare.pp",
"Embarked", "Family")
df.train.munged <- df.train[train.keeps]
str(df.train.munged)
set.seed(23)
## This attempt partitions on Survived, which train.keeps dropped from
## df.train.munged; the retry further down uses Fate instead.
training.rows <- createDataPartition(
df.train.munged$Survived, p = 0.8, list = FALSE)
df.train.munged$Survived
df.train.munged$
df.train.munged
df.train.munged
str(df.train.munged)
## Partition attempt on the Fate column with p = 0.8.
## NOTE(review): set.seed(23) is not repeated before this call, so the split
## presumably does not start from the seeded state -- confirm if
## reproducibility matters.
training.rows <- createDataPartition(
df.train.munged$Fate, p = 0.8, list = FALSE)
training.rows
## Install and attach the packages for the modelling steps.
install.packages("pROC")
library (pROC)
library (pROC)
install.packages("randomForest")
library(randomForest)
install.packages("ada")
library(ada)
17 changes: 17 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Auto detect text files and perform LF normalization
* text=auto

# Custom for Visual Studio
*.cs diff=csharp

# Standard to msysgit
*.doc diff=astextplain
*.DOC diff=astextplain
*.docx diff=astextplain
*.DOCX diff=astextplain
*.dot diff=astextplain
*.DOT diff=astextplain
*.pdf diff=astextplain
*.PDF diff=astextplain
*.rtf diff=astextplain
*.RTF diff=astextplain
Loading