diff --git a/.RData b/.RData new file mode 100644 index 0000000..fd2b768 Binary files /dev/null and b/.RData differ diff --git a/.Rhistory b/.Rhistory new file mode 100644 index 0000000..82e1a23 --- /dev/null +++ b/.Rhistory @@ -0,0 +1,375 @@ +ggmap(chicago) +??ggmap +library(XLConnect) +library(XLConnect) +ff <-readWorksheetFromFile("FFDraft2015.xlsx") +ff <-readWorksheetFromFile("FFDraft2015.xlsx",sheet=1) +ff +?<<- +x=matrix[1:12,3,4] +x<-matrix[1:12,3,4] +x<-matrix(1:12,3,4) +x +x(3,1) +x[3,1] +x[2,] +x[,2] +?[] +x[i=3,j=1] +x[j=3,i=1] +x<-matrix(1:24,3,4,2) +x +x[1,] +x[2,] +x[,1] +x[,3] +x[,,1] +?matrix +getwd() +dir() +x = rnorm(100,mean=rep(1:5),each=20,sd=.2) +x = rnorm(100,mean=rep(1:5,each=20),sd=.2) +x +y = rnorm(100,mean=rep(1:4,each=25),sd=.15) +x = rnorm(100,mean=rep(1:5,each=20),sd=.2) +dataframe=data.frame(x,y) +plot (x,y) +distxy = dist(dataframe) +distxy +hcl = hclust(distxy) +plot(hcl) +set.seed (1234) +x = rnorm(50,mean=rep(1:5,each=10),sd=.2) +y = rnorm(60,mean=rep(1:4,each=15),sd=.25) +plot(x,y) +plot(x=x,y=y) +set.seed (1234) +x = rnorm(60,mean=rep(1:5,each=10),sd=.2) +y = rnorm(60,mean=rep(1:4,each=15),sd=.25) +plot(x,y) +distxy=dist(x,y) +df = data.frame(x,y) +distxy=dist(x,y) +distxy=dist(df) +hc = hclust(df) +hc = hclust(distxy) +plot (hc) +str(hc) +?hclust +plot(hc,hang=-1) +plot(hc,hang=3) +plot(hc,hang=5) +plot(hc,hang=10) +plot(hc,hang=-5) +?plot +pie(hc) +pie(xydist) +pie(distxy) +pie(hc) +pie(x) +pie(y) +pie(c(1,2,3,4)) +pie(c(1,2,3,4,2)) +set.seed (1234) +x = rnorm(60,mean=rep(1:5,each=10),sd=.2) +y = rnorm(60,mean=rep(1:4,each=15),sd=.25) +df = data.frame(x,y) +distxy=dist(df) +hc = hclust(distxy) +plot(hc) +setwd("~/GitHub/Titanic") +install.packages("Amelia") +setwd("~/GitHub/Titanic") +install.packages("corrgram") +install.packages("Hmisc") +install.packages("stringr") +install.packages("plyr") +?revalue +library (plyr) +install.packages("plyr") +library(plyr) +library(parse) +R.Version() +?corrgram +library(corrgram) 
+?corrgram +corrgram +train.raw <- readData(Titanic.path, train.data.file, +train.column.types, missing.types) +df.train <- train.raw +readData <- function(path.name, file.name, column.types, missing.types) { +read.csv( url( paste(path.name, file.name, sep="") ), +colClasses=column.types, +na.strings=missing.types ) +} +train.raw <- readData(Titanic.path, train.data.file, +train.column.types, missing.types) +Titanic.path <- "https://raw.github.com/wehrley/Kaggle_Titanic/master/" +train.data.file <- "train.csv" +test.data.file <- "test.csv" +missing.types <- c("NA", "") +train.column.types <- c('integer', # PassengerId +'factor', # Survived +'factor', # Pclass +'character', # Name +'factor', # Sex +'numeric', # Age +'integer', # SibSp +'integer', # Parch +'character', # Ticket +'numeric', # Fare +'character', # Cabin +'factor' # Embarked +) +test.column.types <- train.column.types[-2] # # no Survived +train.raw <- readData(Titanic.path, train.data.file, +train.column.types, missing.types) +df.train <- train.raw +test.raw <- readData(Titanic.path, test.data.file, +test.column.types, missing.types) +df.train$Title <- factor(df.train$Title, +c("Capt","Col","Major","Sir","Lady","Rev", +"Dr","Don","Jonkheer","the Countess","Mrs", +"Ms","Mr","Mme","Mlle","Miss","Master")) +getTitle <- function(data) { +title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE) +title.comma.end <- title.dot.start ++ attr(title.dot.start, "match.length")-1 +data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1) +return (data$Title) +} +df.train$Title <- getTitle(df.train) +unique(df.train$Title) +df.train$Title +names(df.train) +df.train$Title <- getTitle(df.train) +df.train$Title +boxplot(df.train$Age ~ df.train$Survived, +main="Passenger Fate by Age", +xlab="Survived", ylab="Age") +getTitle <- function(data) { +title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE) +title.comma.end <- title.dot.start ++ attr(title.dot.start, "match.length")-1 +data$Title 
<- substr(data$Name, title.dot.start+2, title.comma.end-1) +return (data$Title) +} +df.train$Title <- getTitle(df.train) +df.train$Title +df.train +getTitle <- function(data) { +title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE) +title.comma.end <- title.dot.start ++ attr(title.dot.start, "match.length")-1 +data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1) +data$Title +return (data$Title) +} +df.train$Title <- getTitle(df.train) +x = <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE) +x <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE) +x +str(df.train) +barplot(table(df.train$Survived), +names.arg = c("Perished", "Survived"), +main="Survived (passenger fate)", col="black") +getTitle(df.train) +getTitle(df.train$Name) +gegetTitle +getTitle +title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE) +title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE) +title.comma.end <- title.dot.start ++ attr(title.dot.start, "match.length")-1 +Title <- substr(df.train$Name, title.dot.start+2, title.comma.end-1) +Title +Title <- substr(df.train$Name, title.dot.start+2, 10) +Title +?substr +Title <- substr(df.train$Name, title.dot.start+2, title.dot.start+2+title.comma.end-1) +Title +head(title.dot.start,5) +head(title.comma.end,5) +title.comma.end <- title.dot.start ++ attr(title.dot.start, "match.length")-1 +data$Title <- substr(data$Name, title.dot.start+2, title.comma.end-1) +data$title +Title <- substr(data$Name, title.dot.start+2, title.comma.end-1) +Title <- substr(df.train$Name, title.dot.start+2, title.comma.end-1) +Title +Title <- substr(df.train$Name, title.comma.end-1, title.dot.start - 2) +Title +Title <- substr(df.train$Name, title.comma.end-1, title.dot.start - 2) +head(title.comma.end) +head(title.dot.start) +title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", df.train$Name, TRUE) +head(title.dot.start) +title.comma.end <- title.dot.start ++ attr(title.dot.start, "match.length")-1 
+head(title.comma.end,5) +attr(title.dot.start, "match.length") +head(attr(title.dot.start, "match.length"),5) +title.comma.end <- title.dot.start + +attr(title.dot.start, "match.length") +head(title.comma.end) +Title <- substr(df.train$Name, title.dot.start+2, title.comma.end-1) +Title +getTitle <- function(data) { +title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE) +title.comma.end <- title.dot.start + +attr(title.dot.start, "match.length")-1 +data$Title <- substr(data$Name,title.dot.start+2,title.comma.end-) +return (data$Title) +} +getTitle <- function(data) { +title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE) +title.comma.end <- title.dot.start + +attr(title.dot.start, "match.length")-1 +data$Title <- substr(data$Name,title.dot.start+2, +title.comma.end-1) +return (data$Title) +} +df.train$Title <- getTitle(df.train) +unique(df.train$Title) +install.packages("caret") +library(caret) +?createDataPartition +df.train$Survived +training.rows <- createDataPartition( +df.train, p = 0.8, list = FALSE) +featureEngrg <- function(data) { +## Using Fate ILO Survived because term is shorter and just sounds good +data$Fate <- data$Survived +## Revaluing Fate factor to ease assessment of confusion matrices later +data$Fate <- revalue(data$Fate, c("1" = "Survived", "0" = "Perished")) +## Boat.dibs attempts to capture the "women and children first" +## policy in one feature. 
Assuming all females plus males under 15 +## got "dibs' on access to a lifeboat +data$Boat.dibs <- "No" +data$Boat.dibs[which(data$Sex == "female" | data$Age < 15)] <- "Yes" +data$Boat.dibs <- as.factor(data$Boat.dibs) +## Family consolidates siblings and spouses (SibSp) plus +## parents and children (Parch) into one feature +data$Family <- data$SibSp + data$Parch +## Fare.pp attempts to adjust group purchases by size of family +data$Fare.pp <- data$Fare/(data$Family + 1) +## Giving the traveling class feature a new look +data$Class <- data$Pclass +data$Class <- revalue(data$Class, +c("1"="First", "2"="Second", "3"="Third")) +## First character in Cabin number represents the Deck +data$Deck <- substring(data$Cabin, 1, 1) +data$Deck[ which( is.na(data$Deck ))] <- "UNK" +data$Deck <- as.factor(data$Deck) +## Odd-numbered cabins were reportedly on the port side of the ship +## Even-numbered cabins assigned Side="starboard" +data$cabin.last.digit <- str_sub(data$Cabin, -1) +data$Side <- "UNK" +data$Side[which(isEven(data$cabin.last.digit))] <- "port" +data$Side[which(isOdd(data$cabin.last.digit))] <- "starboard" +data$Side <- as.factor(data$Side) +data$cabin.last.digit <- NULL +return (data) +} +## add remaining features to training data frame +df.train <- featureEngrg(df.train) +isEven <- function(x) x %in% c("0","2","4","6","8") +## test a character as an ODD single digit +isOdd <- function(x) x %in% c("1","3","5","7","9") +featureEngrg <- function(data) { +## Using Fate ILO Survived because term is shorter and just sounds good +data$Fate <- data$Survived +## Revaluing Fate factor to ease assessment of confusion matrices later +data$Fate <- revalue(data$Fate, c("1" = "Survived", "0" = "Perished")) +## Boat.dibs attempts to capture the "women and children first" +## policy in one feature. 
Assuming all females plus males under 15 +## got "dibs' on access to a lifeboat +data$Boat.dibs <- "No" +data$Boat.dibs[which(data$Sex == "female" | data$Age < 15)] <- "Yes" +data$Boat.dibs <- as.factor(data$Boat.dibs) +## Family consolidates siblings and spouses (SibSp) plus +## parents and children (Parch) into one feature +data$Family <- data$SibSp + data$Parch +## Fare.pp attempts to adjust group purchases by size of family +data$Fare.pp <- data$Fare/(data$Family + 1) +## Giving the traveling class feature a new look +data$Class <- data$Pclass +data$Class <- revalue(data$Class, +c("1"="First", "2"="Second", "3"="Third")) +## First character in Cabin number represents the Deck +data$Deck <- substring(data$Cabin, 1, 1) +data$Deck[ which( is.na(data$Deck ))] <- "UNK" +data$Deck <- as.factor(data$Deck) +## Odd-numbered cabins were reportedly on the port side of the ship +## Even-numbered cabins assigned Side="starboard" +data$cabin.last.digit <- str_sub(data$Cabin, -1) +data$Side <- "UNK" +data$Side[which(isEven(data$cabin.last.digit))] <- "port" +data$Side[which(isOdd(data$cabin.last.digit))] <- "starboard" +data$Side <- as.factor(data$Side) +data$cabin.last.digit <- NULL +return (data) +} +## add remaining features to training data frame +df.train <- featureEngrg(df.train) +install.packages("stringr") +library (stringr) +featureEngrg <- function(data) { +## Using Fate ILO Survived because term is shorter and just sounds good +data$Fate <- data$Survived +## Revaluing Fate factor to ease assessment of confusion matrices later +data$Fate <- revalue(data$Fate, c("1" = "Survived", "0" = "Perished")) +## Boat.dibs attempts to capture the "women and children first" +## policy in one feature. 
Assuming all females plus males under 15 +## got "dibs' on access to a lifeboat +data$Boat.dibs <- "No" +data$Boat.dibs[which(data$Sex == "female" | data$Age < 15)] <- "Yes" +data$Boat.dibs <- as.factor(data$Boat.dibs) +## Family consolidates siblings and spouses (SibSp) plus +## parents and children (Parch) into one feature +data$Family <- data$SibSp + data$Parch +## Fare.pp attempts to adjust group purchases by size of family +data$Fare.pp <- data$Fare/(data$Family + 1) +## Giving the traveling class feature a new look +data$Class <- data$Pclass +data$Class <- revalue(data$Class, +c("1"="First", "2"="Second", "3"="Third")) +## First character in Cabin number represents the Deck +data$Deck <- substring(data$Cabin, 1, 1) +data$Deck[ which( is.na(data$Deck ))] <- "UNK" +data$Deck <- as.factor(data$Deck) +## Odd-numbered cabins were reportedly on the port side of the ship +## Even-numbered cabins assigned Side="starboard" +data$cabin.last.digit <- str_sub(data$Cabin, -1) +data$Side <- "UNK" +data$Side[which(isEven(data$cabin.last.digit))] <- "port" +data$Side[which(isOdd(data$cabin.last.digit))] <- "starboard" +data$Side <- as.factor(data$Side) +data$cabin.last.digit <- NULL +return (data) +} +## add remaining features to training data frame +df.train <- featureEngrg(df.train) +train.keeps <- c("Fate", "Sex", "Boat.dibs", "Age", "Title", +"Class", "Deck", "Side", "Fare", "Fare.pp", +"Embarked", "Family") +df.train.munged <- df.train[train.keeps] +str(df.train.munged) +set.seed(23) +training.rows <- createDataPartition( +df.train.munged$Survived, p = 0.8, list = FALSE) +df.train.munged$Survived +df.train.munged$ +df.train.munged +df.train.munged +str(df.train.munged) +training.rows <- createDataPartition( +df.train.munged$Fate, p = 0.8, list = FALSE) +training.rows +install.packages("pROC") +library (pROC) +library (pROC) +install.packages("randomForest") +library(randomForest) +install.packages("ada") +library(ada) diff --git a/.gitattributes b/.gitattributes new 
file mode 100644 index 0000000..bdb0cab --- /dev/null +++ b/.gitattributes @@ -0,0 +1,17 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Custom for Visual Studio +*.cs diff=csharp + +# Standard to msysgit +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cd2946a --- /dev/null +++ b/.gitignore @@ -0,0 +1,47 @@ +# Windows image file caches +Thumbs.db +ehthumbs.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# ========================= +# Operating System Files +# ========================= + +# OSX +# ========================= + +.DS_Store +.AppleDouble +.LSOverride + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk diff --git a/README.md b/README.md new file mode 100644 index 0000000..c3128c3 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# wehrley.github.io + +A sample R markdown file for analyzing the Kaggle Titianic data. diff --git a/TitanicAnalysis.Rmd b/TitanicAnalysis.Rmd new file mode 100644 index 0000000..337212c --- /dev/null +++ b/TitanicAnalysis.Rmd @@ -0,0 +1,1153 @@ +#### Titanic Survival Prediction +*One* Approach to Deriving a Model +---------------------------- +###Disclaimer +The following describes an approach I took to the [Titanic survival prediction challenge](http://www.kaggle.com/c/titanic-gettingStarted) presented by [Kaggle](http://www.kaggle.com). 
By no means does this approach represent a comprehensive, exhaustive pursuit of the best model for predicting Titanic passenger survival based on data provided by Kaggle. The intent here is merely to demonstrate utilization of some of the tools of the data science trade. + +### Use of R +This entire analysis, [soup to nuts](http://en.wikipedia.org/wiki/Soup_to_nuts), will utilize the [R software environment](http://www.r-project.org/). Many would rightfully argue that there are better tools for portions (e.g. [munging](http://en.wikipedia.org/wiki/Data_wrangling)) of the journey to a fitted model. I sought to demonstrate here that, for those who do tend to lean on R or who wish to learn R, an all-R solution is possible. + +V1 - results were .77 prior to LR changes + +### Background +One could hypothesize from stories of the Titanic's sinking that a passenger's survival was heavily dependent upon two factors: +1. Recognition of the possibility that the ship could sink +2. Access to a lifeboat + +[According to Wikipedia](http://en.wikipedia.org/wiki/RMS_Titanic), the Titanic reportedly struck an iceberg at 11:40 pm ship's time. The majority of its 2,224 passengers and crew had likely retired to their respective cabins for the evening by that time. Those on the upper decks had a shorter journey to the lifeboats, and possibly access to more timely and accurate information about the impending threat. Thus, any data relating to one's location on the ship could prove helpful to survival predictions. Below is a cross-section of the Titanic: +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uY2hQd3lfcE82ZEU) + +The Titanic was designed to carry 32 lifeboats, but this number was reduced to 20 (enough for about 1,180 people) for its maiden voyage -- likely a cost-cutting measure influenced by perceptions that the additional boats would clutter the deck of a ship deemed "unsinkable." 
Given that constraint, it is not surprising that a disproportionate number of men were apparently left aboard because of a [women and children first](http://en.wikipedia.org/wiki/Women_and_children_first) protocol followed by some of the officers overseeing the loading of lifeboats with passengers. + +### Getting the Data Into R +Kaggle packaged the data for the Titanic challenge into two csv-format files: +- **train.csv** (data containing attributes and known outcomes [survived or perished] for a subset of the passengers) +- **test.csv** (data containing attributes *without* outcomes for a subset of passengers) + +I've reviewed a *lot* of code containing approaches to sourcing data from a csv file with R. The majority seem to go no further than a simple read.csv function, mostly devoid of options, coded separately for each file being read. Later, the user often finds her/himself manually doing a number of tasks that could have been handled within the read function call. I chose to get as much out of ``` read.csv``` as I could in the context of a re-usable custom function. +```{r} +setwd("~/GitHub/Titanic") +readData <- function(path.name, file.name, column.types, missing.types) { + read.csv( url( paste(path.name, file.name, sep="") ), + colClasses=column.types, + na.strings=missing.types ) +} +``` +I've pushed the [Titanic csv files](https://github.com/wehrley/Kaggle_Titanic) to my GitHub account so that I can access the data from anywhere and, more importantly, demonstrate here the reading of data from a web source. 
Here are the arguments I will pass into this custom file reading function for the train.csv file: +```{r} + +Titanic.path <- "https://raw.github.com/wehrley/Kaggle_Titanic/master/" +train.data.file <- "train.csv" +test.data.file <- "test.csv" +missing.types <- c("NA", "") +train.column.types <- c('integer', # PassengerId + 'factor', # Survived + 'factor', # Pclass + 'character', # Name + 'factor', # Sex + 'numeric', # Age + 'integer', # SibSp + 'integer', # Parch + 'character', # Ticket + 'numeric', # Fare + 'character', # Cabin + 'factor' # Embarked +) +test.column.types <- train.column.types[-2] # # no Survived column in test.csv +``` +Specifying missing types up front should make the data munging process a bit easier, and while I may have to change the class type for a data frame column or two along the way, I've specified class definitions in a manner which should be most conducive to modeling later. This leaves me with much cleaner code for reading the csv files. +```{r} +train.raw <- readData(Titanic.path, train.data.file, + train.column.types, missing.types) +df.train <- train.raw + +test.raw <- readData(Titanic.path, test.data.file, + test.column.types, missing.types) +df.infer <- test.raw +``` +### Data Munging +Josh Wills, senior director of data science at [Cloudera](http://www.cloudera.com/), described himself as a *data janitor* in [this interview](http://www.technologyreview.com/news/513866/in-a-data-deluge-companies-seek-to-fill-a-new-role/) from spring 2013. My experience in analytics projects over the years has certainly confirmed that data preparation accounts for the bulk of the effort. While some consider the process of getting data into an analysis-ready form as a sort of necessary evil, I've often derived value from "getting one's hands dirty" and acquiring a granular view of the available data. Sometimes my greatest insights have come from this phase, often referred to as data pre-processing. 
+ +Let's start with a look at missing data in the training set. I'll use the ```missmap ``` function from the [Amelia package](http://cran.r-project.org/web/packages/Amelia/) to display those. +```{r} +library(Amelia) +## map missing data by provided feature +require(Amelia) +missmap(df.train, main="Titanic Training Data - Missings Map", + col=c("yellow", "black"), legend=FALSE) +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uNjVrTFFLQ3ljeDA) + +Roughly 20 percent of the Age data is missing, and well above 70 percent of the passengers cannot be linked to a specific cabin number. While the proportion of Age "missings" is likely small enough for reasonable replacement with some form of [imputation](http://en.wikipedia.org/wiki/Imputation_%28statistics%29), the cabin missings seem too extensive to make reliable imputation possible. Nevertheless, *some* data could be better than *zero* data, so I'll look at cabin numbers later to see how we can put them to use. + +Before we start filling in missing data, let's see what can be learned from the data we have. Putting some simple data visualization tools to work can take us a long way toward understanding what might influence the outcome we're trying to predict -- in this case, whether or not a passenger survived. 
Below is some code and the graphs they produced: +```{r} +barplot(table(df.train$Survived), + names.arg = c("Perished", "Survived"), + main="Survived (passenger fate)", col="black") +barplot(table(df.train$Pclass), + names.arg = c("first", "second", "third"), + main="Pclass (passenger traveling class)", col="firebrick") +barplot(table(df.train$Sex), main="Sex (gender)", col="darkviolet") +hist(df.train$Age, main="Age", xlab = NULL, col="brown") +barplot(table(df.train$SibSp), main="SibSp (siblings + spouse aboard)", + col="darkblue") +barplot(table(df.train$Parch), main="Parch (parents + kids aboard)", + col="gray50") +hist(df.train$Fare, main="Fare (fee paid for ticket[s])", xlab = NULL, + col="darkgreen") +barplot(table(df.train$Embarked), + names.arg = c("Cherbourg", "Queenstown", "Southampton"), + main="Embarked (port of embarkation)", col="sienna") +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uZlQ4S0ttU0dmeTA) +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6ueFBOX3F4S1hTa0k) + +Note the dominant categories in the first three graphs: +* more passengers perished than survived +* about twice as many passengers in 3rd class than in either 1st or 2nd +* male passengers far outnumbered females + +Perhaps these are the first clues that the two themes discussed earlier -- women and children first policy, and location on the ship -- could dictate the feature set. +Although the fact that Southampton was the port of embarkation for most passengers doesn't make for a very balanced *Embarked* factor, it might mean something in the final analysis. + +Mosaic plots offer an interesting -- and arguably under-utilized -- way to summarize data. The [vcd package](http://cran.r-project.org/web/packages/vcd/index.html) includes the ``` mosaicplot``` function for creating those. The following mosaic suggests that traveling class did influence the odds of a passenger's survival. 
+```{r} +mosaicplot(df.train$Pclass ~ df.train$Survived, + main="Passenger Fate by Traveling Class", shade=FALSE, + color=TRUE, xlab="Pclass", ylab="Survived") +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6ucVdZYmJMNnotaWs) + +Do you recall the earlier bar graph showing that some 2/3 of passengers were males? That is taken into account by the width of the two rectangles labeled "male" in the mosaic below. Now look at the *height* of the leftmost light gray rectangle [representing the proportion of females who survived] and compare it to the much shorter light gray rectange [representing proportion of males who survived]. Gender should certainly prove to be a prominent feature in the final model. +```{r} +mosaicplot(df.train$Sex ~ df.train$Survived, + main="Passenger Fate by Gender", shade=FALSE, color=TRUE, + xlab="Sex", ylab="Survived") +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uWkpVaTRMRGlNSms) + +Is it possible that "survival of the fittest" dictated the fate of passengers in certain parts of the ship? Perhaps, though it isn't apparent at first glance from the boxplot of Age by Survival. +```{r} +boxplot(df.train$Age ~ df.train$Survived, + main="Passenger Fate by Age", + xlab="Survived", ylab="Age") +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uQVZ1dThUdkFJV1U) + +While passenger survival didn't vary as much across the three ports of embarkation as it did between genders and traveling classes, perhaps the *Embarked* feature will prove useful at some point. +```{r} +mosaicplot(df.train$Embarked ~ df.train$Survived, + main="Passenger Fate by Port of Embarkation", + shade=FALSE, color=TRUE, xlab="Embarked", ylab="Survived") +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uaVdXbmdYeXNRVzA) + +Just one more graph, then we'll get to those missing ages. 
The [corrgram package](http://cran.r-project.org/web/packages/corrgram/) is the source of a function for creating what is sometimes referred to as a correlogram. The one shown below confirms a couple of observations already made -- namely, that survival odds drop with class, and age may not prove to be a significant predictor. Given that the upper class ranks tend to be represented by an older demographic, an inverse correlation between age and traveling class is to be expected. Although fare and class are closely related, it might be worth throwing the **Fare** feature into the mix as another way to define a passenger's location on the ship. +```{r} +library(corrgram) +require(corrgram) +library(plyr) # for revalue +library(ada) # for ada (later) +corrgram.data <- df.train +## change features of factor type to numeric type for inclusion on correlogram +corrgram.data$Survived <- as.numeric(corrgram.data$Survived) +corrgram.data$Pclass <- as.numeric(corrgram.data$Pclass) +corrgram.data$Embarked <- revalue(corrgram.data$Embarked, + c("C" = 1, "Q" = 2, "S" = 3)) +## generate correlogram +corrgram.vars <- c("Survived", "Pclass", "Sex", "Age", + "SibSp", "Parch", "Fare", "Embarked") +corrgram(corrgram.data[,corrgram.vars], order=FALSE, + lower.panel=panel.ellipse, upper.panel=panel.pie, + text.panel=panel.txt, main="Titanic Training Data") + +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uTm96MlFFZDU2ZFU) + +Time to tackle those missing ages. A common approach to this type of situation is to replacing the missings with the average of the available values. In this case, that would mean replacing 177 missing **Age** values with 29.7. +```{r} +summary(df.train$Age) +# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's +# 0.42 20.12 28.00 29.70 38.00 80.00 177 +``` +Taking that approach would be fine if only a small fraction of the ages were missing. 
However, with missings accounting for 20 percent of all **Age** data in a relatively small data set (<900 records), one could justify a search for a more refined method of imputation. Let's peek again at the list of currently available features: +```{r} +names(df.train) +# [1] "PassengerId" "Survived" "Pclass" "Name" "Sex" "Age" +# [7] "SibSp" "Parch" "Ticket" "Fare" "Cabin" +# "Embarked" +``` +**PassengerId** is merely a record number, and we already know that splitting the ages solely by **Survived** doesn't reveal much. A boxplot of ages by passenger traveling class looks interesting... +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uMVdXTmlQM2s5WHM) + +This makes intuitive sense: Passengers in the upper classes (first and second) would tend to be wealthier, and in that period of U.S. history, acquiring wealth usually required a good deal of time (no dot-com kings in their 20s were aboard the Titanic on her maiden voyage). There are no missing values in **Pclass**, so we could replace the missing age for, say, a third class passenger with the average or median of the available ages for those in ``` Pclass="3"```. Doing so would be an improvement over assigning 29.7 to all **Age** missings. +## function for extracting honorific (i.e. title) from the Name feature +```{r} +getTitle <- function(data) { + title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE) + title.comma.end <- title.dot.start + + attr(title.dot.start, "match.length")-1 + data$Title <- substr(data$Name,title.dot.start+2, + title.comma.end-1) + return (data$Title) +} +``` +Inspection of the next feature -- **Name** -- reveals what could be an even better approach... +```{r} +df.train$Title <- getTitle(df.train) +unique(df.train$Title) +# [1] "Braund, Mr. Owen Harris" +# [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" +# [3] "Heikkinen, Miss. Laina" +# [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)" +# [5] "Allen, Mr. William Henry" +# [6] "Moran, Mr. 
James" +# [7] "McCarthy, Mr. Timothy J" +# [8] "Palsson, Master. Gosta Leonard" +# [9] "Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)" +#[10] "Nasser, Mrs. Nicholas (Adele Achem)" +``` +Notice the titles -- Mr., Mrs., Miss., Master. -- following each of the surnames. The [Wikipedia entry](http://en.wikipedia.org/wiki/Master_%28form_of_address%29) for the [English honorific](http://en.wikipedia.org/wiki/English_honorific) "Master" explains that, +> By the late 19th century, etiquette dictated that men be addressed as Mister, and boys as Master." + +The title "Miss" should help with differentiation betweeen younger and older females. Also, note the way the title appears in the name: The format "Surname, Title. Firstname..." is consistent in **Name** across all records. I used that pattern to create a custom function which employs a regular expression and the ``` regexpr``` function to extract the title from each name: +```{r} +## function for extracting honorific (i.e. title) from the Name feature +getTitle <- function(data) { + title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE) + title.comma.end <- title.dot.start + + attr(title.dot.start, "match.length")-1 + data$Title <- substr(data$Name,title.dot.start+2, + title.comma.end-1) + return (data$Title) +} +``` +Let's fetch the titles, given them their own column in the **df.train** data frame, and look at the uniques. 
+```{r} +# ensure there are no null titles +df.train$Title = "Mr" +df.train$Title[df.train$Sex=="female"]="Mrs" +df.train$Title[as.numeric(df.train$Age) < 24 & df.train$Sex =="female"]="Miss" + + +df.train$Title <- getTitle(df.train) +unique(df.train$Title) +# [1] "Mr" "Mrs" "Miss" "Master" "Don" "Rev" +# [7] "Dr" "Mme" "Ms" "Major" "Lady" "Sir" +#[13] "Mlle" "Col" "Capt" "the Countess" "Jonkheer" +str(df.train) + + +``` +To identify the titles which have at least one record with an age missing, I'll use the ``` bystats``` function from the [Hmisc package](http://cran.r-project.org/web/packages/Hmisc/index.html). +```{r} +options(digits=2) + +require(Hmisc) +bystats(df.train$Age, df.train$Title, + fun=function(x)c(Mean=mean(x),Median=median(x))) +# N Missing Mean Median +# Capt 1 0 70.0 70.0 +# Col 2 0 58.0 58.0 +# Don 1 0 40.0 40.0 +# Dr 6 1 42.0 46.5 +# Jonkheer 1 0 38.0 38.0 +# Lady 1 0 48.0 48.0 +# Major 2 0 48.5 48.5 +# Master 36 4 4.6 3.5 +# Miss 146 36 21.8 21.0 +# Mlle 2 0 24.0 24.0 +# Mme 1 0 24.0 24.0 +# Mr 398 119 32.4 30.0 +# Mrs 108 17 35.9 35.0 +# Ms 1 0 28.0 28.0 +# Rev 6 0 43.2 46.5 +# Sir 1 0 49.0 49.0 +# the Countess 1 0 33.0 33.0 +# ALL 714 177 29.7 28.0 +``` +Now I can assign the titles with at least one missing **Age** value to a list... +```{r} +## list of titles with missing Age value(s) requiring imputation +titles.na.train <- c("Dr", "Master", "Mrs", "Miss", "Mr") +``` +...then pass that list to the following custom function I created for imputing the missing ages: +```{r} +imputeMedian <- function(impute.var, filter.var, var.levels) { + for (v in var.levels) { + impute.var[ which( filter.var == v)] <- impute(impute.var[ + which( filter.var == v)]) + } + return (impute.var) +} +``` +I apply the ``` impute``` function from the Hmisc package on a per-title basis to assign the median of the available ages to the missing age(s). 
For example, the single record with a missing **Age** value and ``` Title="Dr"``` will be assigned the median of the ages from the 6 records with ``` Title="Dr"``` which *do* have age data. +```{r} +df.train$Age[which(df.train$Title=="Dr")] +#[1] 44 54 23 32 50 NA 49 +``` +After doing the age imputations, I check the **Age** data and find that the function seems to have done its job. +```{r} +df.train$Age <- imputeMedian( + df.train$Age, df.train$Title, + titles.na.train) +df.train$Age[which(df.train$Title=="Dr")] +#[1] 44.0 54.0 23.0 32.0 50.0 46.5 49.0 +summary(df.train$Age) +# Min. 1st Qu. Median Mean 3rd Qu. Max. +# 0.42 21.00 30.00 29.39 35.00 80.00 +``` +You may recall that the ``` Embarked``` feature also had at least one missing value. A summary of that data... +```{r} +summary(df.train$Embarked) + +``` +...reveals just two missings. It should be fine to replace those missings with "S", the most common value. +```{r} +df.train$Embarked[which(is.na(df.train$Embarked))] <- 'S' +``` +While there are no missing Fare values, a summary does show at least one ``` Fare=0```... +```{r} +summary(df.train$Fare) +``` +(That exceptionally high fare of $512.30 suggests that some tickets were purchased in groups. We'll address that later.) +A zero fare might have been assigned to a baby. However, a closer look at records where ``` Fare = 0``` suggests otherwise... 
+```{r}
+subset(df.train, Fare < 7)[order(subset(df.train, Fare < 7)$Fare,
+                          subset(df.train, Fare < 7)$Pclass),
+                          c("Age", "Title", "Pclass", "Fare")]
+# Age Title Pclass Fare
+# 264 40 Mr 1 0.0
+# 634 30 Mr 1 0.0
+# 807 39 Mr 1 0.0
+# 816 30 Mr 1 0.0
+# 823 38 Noble 1 0.0
+# 278 30 Mr 2 0.0
+# 414 30 Mr 2 0.0
+# 467 30 Mr 2 0.0
+# 482 30 Mr 2 0.0
+# 675 30 Mr 2 0.0
+# 733 30 Mr 2 0.0
+# 180 36 Mr 3 0.0
+# 272 25 Mr 3 0.0
+# 303 19 Mr 3 0.0
+# 598 49 Mr 3 0.0
+# 379 20 Mr 3 4.0
+# 873 33 Mr 1 5.0
+# 327 61 Mr 3 6.2
+# 844 34 Mr 3 6.4
+# 819 43 Mr 3 6.4
+# 203 34 Mr 3 6.5
+# 372 18 Mr 3 6.5
+# 144 19 Mr 3 6.8
+# 655 18 Miss 3 6.8
+# 412 30 Mr 3 6.9
+# 826 30 Mr 3 7.0
+# 130 45 Mr 3 7.0
+# 805 27 Mr 3 7.0
+```
+The jump in fares from 0 to the 4-7 range suggests errors. I replaced the zero **Fare** values with the median fare from the respective passenger class using the imputeMedian function introduced earlier.
+```{r}
+## impute missings on Fare feature with median fare by Pclass
+df.train$Fare[ which( df.train$Fare == 0 )] <- NA
+df.train$Fare <- imputeMedian(df.train$Fare, df.train$Pclass,
+                              as.numeric(levels(df.train$Pclass)))
+```
+I see the titles as more than merely a guide for imputation of missing ages. A passenger's title can reflect gender, his/her position on the ship (officers & royalty), and access to a lifeboat (where "Master" superseded "Mr"). Making the effort to get the **Title** feature model-ready seems worthwhile.
+
+Recall from the ``` bystats``` results above that the training data contains 17 different titles. We already know that "Master" and "Mr" should separate the males into roughly two groups by age. The following script...
+```{r}
+df.train$Title <- factor(df.train$Title,
+                         c("Capt","Col","Major","Sir","Lady","Rev",
+                         "Dr","Don","Jonkheer","the Countess","Mrs",
+                         "Ms","Mr","Mme","Mlle","Miss","Master"))
+boxplot(df.train$Age ~ df.train$Title,
+        main="Passenger Age by Title", xlab="Title", ylab="Age",
+        y=range(df.train$Age, na.rm=TRUE))
+```
+...produces [this boxplot](https://drive.google.com/file/d/0B-yx9UUIpB6ubEZ5NU5WSFo1U0E/edit?usp=sharing) (too wide for display here) showing passenger age by title, including shading which illustrates the manner in which I consolidated the titles. I created and applied a custom function for revaluing the titles, then reclassified **Title** to a factor type, as follows:
+```{r}
+## function for assigning a new title value to old title(s)
+changeTitles <- function(data, old.titles, new.title) {
+  for (honorific in old.titles) {
+    data$Title[ which( data$Title == honorific)] <- new.title
+  }
+  return (data$Title)
+}
+## Title consolidation
+#df.train.tmp = df.train
+#df.train=df.train.tmp
+write.table (df.train,"dftraintmp.csv")
+
+# Replaced the Noble with Sir, because it is an existing factor
+
+df.train$Title <- changeTitles(
+  df.train,
+  c("Capt", "Col", "Don", "Dr",
+    "Jonkheer", "Lady", "Major",
+    "Rev", "Sir"),
+  "Sir")
+df.train$Title <- changeTitles(df.train, c("the Countess", "Ms"),
+                               "Mrs")
+df.train$Title <- changeTitles(df.train, c("Mlle", "Mme"), "Miss")
+df.train$Title <- as.factor(df.train$Title)
+```
+I assigned the Countess of Rothes, a woman in first class and the sole passenger with a "Countess" title, to the "Mrs" group. In retrospect, I could have placed her under the "Noble" umbrella. Given that 91 of the 94 female first-class passengers in the training set survived, I was willing to live with that choice.
+
+All of the work done designing the new **Title** column can be considered a part of **feature engineering**.
The other features I chose to add are generated using custom function ``` featureEngrg```, which can be applied to both the training data in **df.train** and the Kaggle-provided test data in **df.infer**. +```{r} +library(stringr) +library(plyr) +require(plyr) # for the revalue function +require(stringr) # for the str_sub function + +## test a character as an EVEN single digit +isEven <- function(x) x %in% c("0","2","4","6","8") +## test a character as an ODD single digit +isOdd <- function(x) x %in% c("1","3","5","7","9") + +## function to add features to training or test data frames +featureEngrg <- function(data) { + ## Using Fate ILO Survived because term is shorter and just sounds good + data$Fate <- data$Survived + ## Revaluing Fate factor to ease assessment of confusion matrices later + data$Fate <- revalue(data$Fate, c("1" = "Survived", "0" = "Perished")) + ## Boat.dibs attempts to capture the "women and children first" + ## policy in one feature. Assuming all females plus males under 15 + ## got "dibs' on access to a lifeboat + data$Boat.dibs <- "No" + data$Boat.dibs[which(data$Sex == "female" | data$Age < 15)] <- "Yes" + data$Boat.dibs <- as.factor(data$Boat.dibs) + ## Family consolidates siblings and spouses (SibSp) plus + ## parents and children (Parch) into one feature + data$Family <- data$SibSp + data$Parch + ## Fare.pp attempts to adjust group purchases by size of family + data$Fare.pp <- data$Fare/(data$Family + 1) + ## Giving the traveling class feature a new look + data$Class <- data$Pclass + data$Class <- revalue(data$Class, + c("1"="First", "2"="Second", "3"="Third")) + ## First character in Cabin number represents the Deck + data$Deck <- substring(data$Cabin, 1, 1) + data$Deck[ which( is.na(data$Deck ))] <- "UNK" + data$Deck <- as.factor(data$Deck) + ## Odd-numbered cabins were reportedly on the port side of the ship + ## Even-numbered cabins assigned Side="starboard" + data$cabin.last.digit <- str_sub(data$Cabin, -1) + data$Side <- "UNK" + 
data$Side[which(isEven(data$cabin.last.digit))] <- "port" + data$Side[which(isOdd(data$cabin.last.digit))] <- "starboard" + data$Side <- as.factor(data$Side) + data$cabin.last.digit <- NULL + return (data) +} + +## add remaining features to training data frame +df.train <- featureEngrg(df.train) +``` +Some color on the features I've added: +* **Boat.dibs** - assumes all females plus males under 15 get "dibs' on access to a lifeboat. Filtering by **Title="Master"** was considered, but the highest age in the training data for males addressed as "Master" was just 12, and I wanted to account for male teens with **Title="Mr"** who could pass for a child. +* **Deck** - levels are as shown in the Titanic cross-section displayed previously. Cabin data provided for just 23 percent of training data records, so it's tough to give this one much emphasis. +* **Side** - subject to the same concern (dearth of data) expressed for Deck + +I finish the data munging process by paring down the data frame to the columns I will use in model building. +```{r} +train.keeps <- c("Fate", "Sex", "Boat.dibs", "Age", "Title", + "Class", "Deck", "Side", "Fare", "Fare.pp", + "Embarked", "Family") +df.train.munged <- df.train[train.keeps] +str(df.train.munged) + +``` + +## Fitting a Model + +Later, I will be conducting the predictive modeling effort using the ``` caret``` package. Created by Max Kuhn of Pfizer Global R&D, ```caret```provides a unified interface for modeling & prediction, and streamlines the model tuning process using resampling. [The package](http://cran.r-project.org/web/packages/caret/index.html) includes a ``` createDataPartition``` function for splitting data into a training set and a test set (sometimes referred to as a *validation* set) via [stratified random sampling](http://www.investopedia.com/terms/stratified_random_sampling.asp). 
In [this presentation](https://drive.google.com/file/d/0B-yx9UUIpB6uaGVQVmhveGNjNDQ/edit?usp=sharing), Kuhn delivered the best explanation I've seen of the decision on how to "spend" the available training data. His conclusion: +> Statistically, the best course of action would be to use all the data for model building and use statistical methods to get good estimates of error. From a non-statistical perspective, many consumers of these models emphasize the need for an untouched set of samples to evaluate performance. + +I selected an 80/20 split for training data and testing data. The code: +```{r} +library(caret) +## split training data into train batch and test batch +set.seed(23) +training.rows <- createDataPartition( + df.train.munged$Fate, p = 0.8, list = FALSE) +train.batch <- df.train.munged[training.rows, ] +test.batch <- df.train.munged[-training.rows, ] + +# temporary fix for missing value +#test.batch[36,5]="Mr" + +``` +Before I go pouring features into the popular Random Forest method, I'm going to give one of the simplest classification methods a crack at the Titanic prediction challenge. Logistic regression, which surfaced about 70 years ago, has been used extensively in multiple fields. I'll start simple by passing essentially the features provided in the raw training data (remember that we combined ``` SibSp``` and ``` Parch``` to form ``` Family```) through the R function for fitting general linearized models. When entering the model formula, I typically have a habit of listing the features in an order roughly corresponding to what I initially believe their importance will be. In this case, I've ordered them roughly by the two main themes I discussed earlier (women & children first policy and location on the ship). By setting the argument ``` family``` to ``` binomial``` with a ``` logit``` link, I'm asking ``` glm( )``` to produce a logistic regression. 
+ +```{r} +Titanic.logit.1 <- glm(Fate ~ Sex + Class + Age + Family + + Embarked+ Fare, data = train.batch, family=binomial("logit")) +``` +To assess this first model and the various binary logistic regressions that will appear in its wake, we will use the [chi-square](http://en.wikipedia.org/wiki/Chi-squared_test) statistic, which is basically a measure of the *goodness of fit* of observed values to expected values. The bigger the difference (or *deviance*) of the observed values from the expected values, the poorer the fit of the model. The *null deviance* shows how well passenger survival is predicted by a "null" model using only a constant ([grand mean](http://en.wikipedia.org/wiki/Grand_mean)). As we adjust the model's formula by adding and/or removing variables, we'll look for those changes which prompt a drop in the *residual deviance*, indicating an improvement in fit. +```{r} +Titanic.logit.1 + +#Call: glm(formula = Fate ~ Sex + Class + Age + Family + Embarked + +# Fare, family = binomial("logit"), data = train.batch) + +#Coefficients: +#(Intercept) Sexmale ClassSecond ClassThird Age Family +# 4.1991007 -2.7367328 -0.9333119 -2.0678612 -0.0441754 -0.2871471 +# EmbarkedQ EmbarkedS Fare +# 0.0003177 -0.4913073 0.0052758 + +#Degrees of Freedom: 713 Total (i.e. Null); 705 Residual +#Null Deviance: 950.9 +#Residual Deviance: 618.7 AIC: 636.7 +``` +The deviance was reduced by 332.2 points on 713-705=8 degrees of freedom (DF), a significant reduction... +```{r} +1 - pchisq(332.2, df=8) +#[1] 0 +``` +In other words, the model put forth is significantly different from the null model. Overall, the model appears to have performed well -- but I'm willing to bet that we could improve on that residual deviance with a different combination of features. Calling ``` anova()```, an extractor function, generates the results of the analysis. 
+```{r} +anova(Titanic.logit.1, test="Chisq") +#Analysis of Deviance Table + +#Model: binomial, link: logit + +#Response: Fate + +#Terms added sequentially (first to last) + +# Df Deviance Resid. Df Resid. Dev Pr(>Chi) +#NULL 713 950.86 +#Sex 1 218.443 712 732.42 < 2.2e-16 *** +#Class 2 72.191 710 660.23 < 2.2e-16 *** +#Age 1 19.971 709 640.26 7.862e-06 *** +#Family 1 13.135 708 627.12 0.0002899 *** +#Embarked 2 5.608 706 621.52 0.0605668 . +#Fare 1 2.855 705 618.66 0.0911186 . +#--- +#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +``` +Notice how the **Sex** and **Class** features accounted for the lion's share of the reduction in the deviance, providing some support to our hypotheses about life boat access and location on ship. Since **Fare** isn't doing much for us, let's see if the **Fare.pp** we created fares any better (pun intended). +```{r} +Titanic.logit.2 <- glm(Fate ~ Sex + Class + Age + Family + Embarked + Fare.pp, data = train.batch, family=binomial("logit")) +anova(Titanic.logit.2, test="Chisq") +#Analysis of Deviance Table + +#Model: binomial, link: logit + +#Response: Fate + +#Terms added sequentially (first to last) + +# Df Deviance Resid. Df Resid. Dev Pr(>Chi) +#NULL 713 950.86 +#Sex 1 218.443 712 732.42 < 2.2e-16 *** +#Class 2 72.191 710 660.23 < 2.2e-16 *** +#Age 1 19.971 709 640.26 7.862e-06 *** +#Family 1 13.135 708 627.12 0.0002899 *** +#Embarked 2 5.608 706 621.52 0.0605668 . +#Fare.pp 1 1.312 705 620.21 0.2521103 +#--- +#Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +``` +Hmm, that was no help. Dropping fares altogether and passing a slightly slimmer formula through the ``` glm()``` function will give us a new baseline for model improvement. 
+```{r} +glm(Fate ~ Sex + Class + Age + Family + Embarked, + data = train.batch, family=binomial("logit")) + +#Call: glm(formula = Fate ~ Sex + Class + Age + Family + Embarked, family = #binomial("logit"), data = train.batch) + +#Coefficients: +#(Intercept) Sexmale ClassSecond ClassThird Age Family +# 4.60950 -2.74715 -1.19354 -2.38903 -0.04466 -0.24416 +# EmbarkedQ EmbarkedS +# -0.03949 -0.55186 + +#Degrees of Freedom: 713 Total (i.e. Null); 706 Residual +#Null Deviance: 950.9 +#Residual Deviance: 621.5 AIC: 637.5 +``` +Time to shift the model fitting to a higher gear. Henceforth, I'm going to use the ``` train``` function in Kuhn's ``` caret``` package to fit binary logistic regression models, as well as models built using other methods. + +Modeling taken to an extreme on a training data set can leave you with a model which *very* accurately maps the training data, but does not generalize well to new samples. This phenomenon, commonly referred to as *overfitting*, can be addressed by resampling the training samples in a way which approximates the fitted model's performance on future data. I'm going to use a form of resampling known as 10-fold cross-validation (CV), repeated 3 times. + +Later, I plan to compare the fitted logit model to other model types using the receiver operating characteristic (ROC) curve. The ``` twoClassSummary``` function in ``` caret``` can calculate the figures I'll need for that if I give it class probabilities predicted by the logistic regression model. + +All of these things I want -- 3x 10-fold CV, estimation of class probabilities, metrics from ``` twoClassSummary``` -- can be passed through the ``` trainControl``` function. 
+```{r} +## Define control function to handle optional arguments for train function +## Models to be assessed based on largest absolute area under ROC curve +cv.ctrl <- trainControl(method = "repeatedcv", repeats = 3, + summaryFunction = twoClassSummary, + classProbs = TRUE) +``` +Below is the ``` train``` function call using the same formula (sans Fare) that we recently passed through ``` glm``` function. I use the ``` metric``` argument to tell ``` train``` to optimize the model by maximizing the area under the ROC curve (AUC). ``` summary()```, another extractor function, is called to generate regression coefficients with standard errors and a z-test, plus the residual deviance metric we were watching earlier. +```{r} +#install.packages("pROC") + +library (pROC) +set.seed(35) +glm.tune.1 <- train(Fate ~ Sex + Class + Age + Family + Embarked, + data = train.batch, + method = "glm", + metric = "ROC", + trControl = cv.ctrl) +glm.tune.1 +#714 samples +# 11 predictors +# 2 classes: 'Perished', 'Survived' + +#No pre-processing +#Resampling: Cross-Validation (10 fold, repeated 3 times) + +#Summary of sample sizes: 642, 643, 643, 642, 643, 642, ... + +#Resampling results + +# ROC Sens Spec ROC SD Sens SD Spec SD +# 0.856 0.855 0.698 0.0433 0.071 0.0852 + +summary(glm.tune.1) + +#Call: +# NULL +# +# Deviance Residuals: +# Min 1Q Median 3Q Max +# -2.2206 -0.5945 -0.4131 0.6208 2.5031 +# +# Coefficients: +# Estimate Std. Error z value Pr(>|z|) +# (Intercept) 4.60950 0.50032 9.213 < 2e-16 *** +# Sexmale -2.74715 0.22441 -12.242 < 2e-16 *** +# ClassSecond -1.19354 0.30646 -3.895 9.84e-05 *** +# ClassThird -2.38903 0.28411 -8.409 < 2e-16 *** +# Age -0.04466 0.00908 -4.918 8.73e-07 *** +# Family -0.24416 0.07787 -3.136 0.00171 ** +# EmbarkedQ -0.03949 0.42227 -0.094 0.92549 +# EmbarkedS -0.55186 0.26154 -2.110 0.03485 * +# --- +# Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +# +# (Dispersion parameter for binomial family taken to be 1) +# +# Null deviance: 950.86 on 713 degrees of freedom +# Residual deviance: 621.52 on 706 degrees of freedom +# AIC: 637.52 +# +# Number of Fisher Scoring iterations: 5 +``` +This is as good a time as any to introduce the concept of *class compression*. Think of it as collapsing particular levels on a categorical variable. One of the earlier bar graphs showed about 70 percent of the Titanic's passengers boarded the ship at Southampton. I'm going to use **Embarked** and the ``` I()``` function, which inhibits interpretation & conversion of R objects, to create a new 2-level factor *within* the model formula. This factor is valued TRUE if a passenger's port of origin was Southampton ("S"), or FALSE otherwise. +```{r} + set.seed(35) + glm.tune.2 <- train(Fate ~ Sex + Class + Age + Family + I(Embarked=="S"), + data = train.batch, method = "glm", + metric = "ROC", trControl = cv.ctrl) + summary(glm.tune.2) + +# Call: +# NULL +# +# Deviance Residuals: +# Min 1Q Median 3Q Max +# -2.2165 -0.5935 -0.4127 0.6230 2.5039 +# +# Coefficients: +# Estimate Std. Error z value Pr(>|z|) +# (Intercept) 4.599379 0.488154 9.422 < 2e-16 *** +# Sexmale -2.745061 0.223226 -12.297 < 2e-16 *** +# ClassSecond -1.196456 0.304837 -3.925 8.68e-05 *** +# ClassThird -2.395542 0.275479 -8.696 < 2e-16 *** +# Age -0.044652 0.009076 -4.920 8.66e-07 *** +# Family -0.243642 0.077633 -3.138 0.0017 ** +# `I(Embarked == "S")TRUE` -0.539793 0.227551 -2.372 0.0177 * +# --- +# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +# +# (Dispersion parameter for binomial family taken to be 1) +# +# Null deviance: 950.86 on 713 degrees of freedom +# Residual deviance: 621.53 on 707 degrees of freedom +# AIC: 635.53 +# +# Number of Fisher Scoring iterations: 5 +``` +As I discussed earlier, the **Title** feature addresses more than one theme. 
For that reason, I believe it has real potential to improve this model. Besides, I put a good chunk of effort into it, so why not give it a go? +```{r} + set.seed(35) + glm.tune.3 <- train(Fate ~ Sex + Class + Title + Age + + Family + I(Embarked=="S"), + data = train.batch, method = "glm", + metric = "ROC", trControl = cv.ctrl) + summary(glm.tune.3) + +# Coefficients: +# Estimate Std. Error z value Pr(>|z|) +# (Intercept) 19.98972 623.55577 0.032 0.974426 +# Sexmale -15.28525 623.55543 -0.025 0.980443 +# TitleMiss -15.57857 623.55565 -0.025 0.980068 +# TitleMr -3.04656 0.57156 -5.330 9.81e-08 *** +# TitleMrs -14.53106 623.55571 -0.023 0.981408 +# TitleNoble -3.13799 0.82733 -3.793 0.000149 *** +# Age -0.03695 0.01065 -3.471 0.000518 *** +# Family -0.43025 0.08915 -4.826 1.39e-06 *** +# ClassSecond -1.43867 0.33135 -4.342 1.41e-05 *** +# ClassThird -2.54556 0.29641 -8.588 < 2e-16 *** +# `I(Embarked == "S")TRUE` -0.55423 0.23509 -2.358 0.018395 * +# --- +# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +# +# (Dispersion parameter for binomial family taken to be 1) +# +# Null deviance: 950.86 on 713 degrees of freedom +# Residual deviance: 588.32 on 703 degrees of freedom +# AIC: 610.32 +# +# Number of Fisher Scoring iterations: 13 +``` +Nice! That gave us our first material decline in the residual deviance. Since the **Title** feature seems to give us everything that **Age** did (and more), I'm going to drop **Age** from the formula. I will also collapse the titles “Miss” and “Mrs” and leave a duo of Title-related factors which should represent the “women and children first” theme well. 
+```{r} +set.seed(35) +# subst Sir for Noble +glm.tune.4 <- train(Fate ~ Class + I(Title=="Mr") + I(Title=="Sir") + + Age + Family + I(Embarked=="S"), + data = train.batch, method = "glm", + metric = "ROC", trControl = cv.ctrl) +summary(glm.tune.4) + +# Call: +# NULL +# +# Deviance Residuals: +# Min 1Q Median 3Q Max +# -2.4813 -0.5646 -0.3840 0.6026 2.4523 +# +# Coefficients: +# Estimate Std. Error z value Pr(>|z|) +# (Intercept) 4.348090 0.479097 9.076 < 2e-16 *** +# ClassSecond -1.320352 0.318842 -4.141 3.46e-05 *** +# ClassThird -2.372211 0.284693 -8.333 < 2e-16 *** +# `I(Title == "Mr")TRUE` -3.238061 0.253776 -12.760 < 2e-16 *** +# `I(Title == "Noble")TRUE` -2.616810 0.619869 -4.222 2.43e-05 *** +# Age -0.026335 0.009127 -2.885 0.00391 ** +# Family -0.434170 0.084179 -5.158 2.50e-07 *** +# `I(Embarked == "S")TRUE` -0.508882 0.232502 -2.189 0.02862 * +# --- +# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +# +# (Dispersion parameter for binomial family taken to be 1) +# +# Null deviance: 950.86 on 713 degrees of freedom +# Residual deviance: 593.28 on 706 degrees of freedom +# AIC: 609.28 +# +# Number of Fisher Scoring iterations: 5 +``` +Remember that there were a lot of male passengers in third class. Given the “women and children first” policy already mentioned plus reports that the Titanic's internal layout was confusing (I recall reading that one crew member claimed it took him two weeks to become comfortable with finding his way around the ship), to say that “grown men in the lower decks had it tough” is such a gross understatement that I hesitated to put it in type. A feature reflecting those third-class men might make a further dent in that residual deviance. Indeed, it does... 
+```{r9} + set.seed(35) + glm.tune.5 <- train(Fate ~ Class + I(Title=="Mr") + I(Title=="Sir") + + Age + Family + I(Embarked=="S") + + I(Title=="Mr"&Class=="Third"), + data = train.batch, + method = "glm", metric = "ROC", + trControl = cv.ctrl) + +summary(glm.tune.5) +# +# Call: +# NULL +# +# Deviance Residuals: +# Min 1Q Median 3Q Max +# -3.0703 -0.5859 -0.3947 0.3725 2.4811 +# +# Coefficients: +# Estimate Std. Error z value Pr(>|z|) +# (Intercept) 6.33818 0.72561 8.735 < 2e-16 *** +# ClassSecond -2.19222 0.48004 -4.567 4.95e-06 *** +# ClassThird -4.65442 0.60918 -7.641 2.16e-14 *** +# `I(Title == "Mr")TRUE` -5.20467 0.54771 -9.503 < 2e-16 *** +# `I(Title == "Noble")TRUE` -4.07411 0.77141 -5.281 1.28e-07 *** +# Age -0.03268 0.01023 -3.194 0.00140 ** +# Family -0.40503 0.08971 -4.515 6.34e-06 *** +# `I(Embarked == "S")TRUE` -0.59956 0.23065 -2.599 0.00934 ** +# `I(Title == "Mr" 3.00867 0.60761 4.952 7.36e-07 *** +# & Class == "Third")TRUE` +# --- +# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +# +# (Dispersion parameter for binomial family taken to be 1) +# +# Null deviance: 950.86 on 713 degrees of freedom +# Residual deviance: 561.11 on 705 degrees of freedom +# AIC: 579.11 +# +# Number of Fisher Scoring iterations: 6 +``` + +Unfortunately, the other features did not contribute to further deviance compression. Taking a different approach to representing the “women and children first” policy didn't bear fruit (removing the title references in the formula and adding Boat.dibs produced a residual deviance of 565 -- no better than what we already have, using a new feature which some may find confusing). Given that Deck and Side combined (a) shaved just a few points off of the deviance, and (b) were derived from such a small subset of the training data, I decided to withdraw them from consideration. + +#### Other Models +Logistic regression is certainly not the only binary classification model available. 
There are plenty more –- perhaps too many for some data scientists to digest. For purpose of illustration, I'll simply take the logistic regression model formula from **glm.tune.1** and pass it through ``` train()``` for each of three other model types, with one new twist: tuning variables specific to each model. + +First up is [boosting](http://en.wikipedia.org/wiki/AdaBoost). I can instruct ``` train``` to fit a *stochastic boosting* model for the binary response **Fate** using the ``` ada```package and a range of values for each of three tuning parameters. Concretely, when fitting a model using ``` train``` with ``` method=”ada”```, one has three levers to tweak: ``` iter``` (number of boosting iterations, default=50), ``` maxdepth``` (depth of trees), and ``` nu``` (shrinkage parameter, default=1). Create a data frame with these three variables as column names and one row per tuning variable combination, and you're good to go. Here is just one example of a tuning grid for ``` ada```: +```{r10} +## note the dot preceding each variable +ada.grid <- expand.grid(.iter = c(50, 100), + .maxdepth = c(4, 8), + .nu = c(0.1, 1)) +``` +Specify ``` method=”ada”``` and ``` tuneGrid=ada.grid``` in ``` train```, and away we go... + +```{r cache=TRUE} + +set.seed(35) + +ada.tune <- train(Fate ~ Sex + Class + Age + Family + Embarked, + data = train.batch, + method = "ada", + metric = "ROC", + tuneGrid = ada.grid, + trControl = cv.ctrl) + + +``` +The model output shows that, given the **train.batch** data and 8 combinations of tuning variables tested, the optimal model had an ROC of 0.871. The tuning parameter values used to build that model were ``` iter = 100```, ``` maxdepth = 4```, and ``` nu = 0.1```. + +```{r12} +ada.tune +# 714 samples +# 11 predictors +# 2 classes: 'Perished', 'Survived' +# +# No pre-processing +# Resampling: Cross-Validation (10 fold, repeated 3 times) +# +# Summary of sample sizes: 642, 643, 643, 642, 642, 643, ... 
+# +# Resampling results across tuning parameters: +# +# iter maxdepth nu ROC Sens Spec ROC SD Sens SD Spec SD +# 50 4 0.1 0.869 0.931 0.666 0.061 0.046 0.0784 +# 50 4 1 0.855 0.907 0.703 0.0572 0.046 0.09 +# 50 8 0.1 0.864 0.919 0.685 0.0571 0.0457 0.085 +# 50 8 1 0.846 0.88 0.716 0.0559 0.0482 0.0944 +# 100 4 0.1 0.871 0.923 0.679 0.0609 0.0449 0.0829 +# 100 4 1 0.855 0.896 0.707 0.0559 0.0552 0.0884 +# 100 8 0.1 0.867 0.919 0.7 0.0597 0.0429 0.0767 +# 100 8 1 0.837 0.879 0.709 0.0646 0.0561 0.0908 +# +# ROC was used to select the optimal model using the largest value. +# The final values used for the model were iter = 100, maxdepth = 4 and nu = 0.1. + +# doesnt work plot(ada.tune) ## ada accuracy profile + +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uLWhoNGJXVlJzLUU) + +Time to give the popular **Random Forest** (RF) model a shot at the Titanic challenge. The number of randomly pre-selected predictor variables for each node, designated ``` mtry```, is the sole parameter available for tuning an RF with ``` train```. Since the number of features is so small, there really isn't much scope for tuning ``` mtry``` in this case. Nevertheless, I'll demonstrate here how it can be done. Let's have ``` mtry=2``` and ```mtry=3``` duke it out over the Titanic data. +```{r cache=TRUE} +library(randomForest) +rf.grid <- data.frame(.mtry = c(2, 3)) +set.seed(35) +rf.tune <- train(Fate ~ Sex + Class + Age + Family + Embarked, + data = train.batch, + method = "rf", + metric = "ROC", + tuneGrid = rf.grid, + trControl = cv.ctrl) +``` +[Strobl et al](http://www.ncbi.nlm.nih.gov/pubmed/19968396) suggested setting ``` mtry``` at the square root of the number of variables. In this case, that would be ``` mtry = 2```, which did produce the better RF model. 
+```{r} + +rf.tune +# 714 samples +# 11 predictors +# 2 classes: 'Perished', 'Survived' +# +# No pre-processing +# Resampling: Cross-Validation (10 fold, repeated 3 times) +# +# Summary of sample sizes: 643, 643, 643, 642, 643, 643, ... +# +# Resampling results across tuning parameters: +# +# mtry ROC Sens Spec ROC SD Sens SD Spec SD +# 2 0.866 0.952 0.633 0.052 0.0288 0.0945 +# 3 0.861 0.934 0.642 0.0514 0.0345 0.0916 +# +# ROC was used to select the optimal model using the largest value. +# The final value used for the model was mtry = 2. +``` +And finally, we'll fit a **support vector machine (SVM)** model to the Titanic data. There are two functions which can be tuned for SVM using ``` train```. The default value for one of them -– ``` sigest``` –- produces good results on most occasions. The default grid of cost parameter C is 0.25, 0.5, and 1. If we set ``` train``` argument ``` tuneLength = 9```, the grid expands to c(0.25, 0.5, 1, 2, 4, 8, 16, 32, 64). As SVM is considered sensitive to the scale and magnitude of the presented features, I'll use the ``` preProcess``` argument to instruct ``` train``` to make arrangements for [normalizing](http://en.wikipedia.org/wiki/Feature_scaling) the data within resampling loops. +```{r cache=TRUE} +set.seed(35) +library(kernlab) + +svm.tune <- train(Fate ~ Sex + Class + Age + Family + Embarked, + data = train.batch, + method = "svmRadial", + tuneLength = 9, + preProcess = c("center", "scale"), + metric = "ROC", + trControl = cv.ctrl) +``` +You may have noticed that the same random number seed was set prior to fitting each model. This ensures that the same resampling sets are used for each model, enabling an "apple-to-apples" comparison of the resampling profiles between models during model evaluation. 
+```{r47} +svm.tune +# 714 samples +# 11 predictors +# 2 classes: 'Perished', 'Survived' +# +# Pre-processing: centered, scaled +# Resampling: Cross-Validation (10 fold, repeated 3 times) +# +# Summary of sample sizes: 643, 643, 643, 642, 643, 643, ... +# +# Resampling results across tuning parameters: +# +# C ROC Sens Spec ROC SD Sens SD Spec SD +# 0.25 0.832 0.951 0.628 0.0609 0.0274 0.0948 +# 0.5 0.833 0.947 0.629 0.0627 0.0282 0.0966 +# 1 0.833 0.944 0.639 0.0589 0.032 0.0904 +# 2 0.835 0.936 0.645 0.0623 0.0398 0.0892 +# 4 0.826 0.933 0.644 0.0615 0.0426 0.0935 +# 8 0.824 0.932 0.64 0.0568 0.0418 0.0845 +# 16 0.82 0.923 0.634 0.0553 0.0441 0.0867 +# 32 0.803 0.915 0.633 0.0617 0.0386 0.0876 +# 64 0.788 0.906 0.626 0.056 0.0367 0.0855 +# +# Tuning parameter 'sigma' was held constant at a value of 0.2204311 +# ROC was used to select the optimal model using the largest value. +# The final values used for the model were C = 2 and sigma = 0.22. +``` +Although the model output above does display ROC by cost parameter value, the following graph makes it abundantly clear that the ROC starts dropping at C=4. +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uejJwNmlXaEg4N2c) + + +### Model Evaluation + +With all four models in hand, I can begin to evaluate their performance by whipping together some cross-tabulations of the observed and predicted **Fate** for the passengers in the **test.batch** data. ``` caret``` makes this easy with the ``` confusionMatrix``` function. 
+```{r48} +## Logistic regression model +glm.pred <- predict(glm.tune.5, test.batch) +confusionMatrix(glm.pred, test.batch$Fate) +# Confusion Matrix and Statistics +# +# Reference +# Prediction Perished Survived +# Perished 97 19 +# Survived 12 49 +# +# Accuracy : 0.8249 +# 95% CI : (0.7607, 0.8778) +# No Information Rate : 0.6158 +# P-Value [Acc > NIR] : 1.304e-09 +# +# Kappa : 0.6225 +# Mcnemar's Test P-Value : 0.2812 +# +# Sensitivity : 0.8899 +# Specificity : 0.7206 +# Pos Pred Value : 0.8362 +# Neg Pred Value : 0.8033 +# Prevalence : 0.6158 +# Detection Rate : 0.5480 +# Detection Prevalence : 0.6554 +``` +```{r49} +## Boosted model +ada.pred <- predict(ada.tune, test.batch) +confusionMatrix(ada.pred, test.batch$Fate) +# Confusion Matrix and Statistics +# +# Reference +# Prediction Perished Survived +# Perished 100 23 +# Survived 9 45 +# +# Accuracy : 0.8192 +# 95% CI : (0.7545, 0.8729) +# No Information Rate : 0.6158 +# P-Value [Acc > NIR] : 3.784e-09 +# +# Kappa : 0.6025 +# Mcnemar's Test P-Value : 0.02156 +# +# Sensitivity : 0.9174 +# Specificity : 0.6618 +# Pos Pred Value : 0.8130 +# Neg Pred Value : 0.8333 +# Prevalence : 0.6158 +# Detection Rate : 0.5650 +# Detection Prevalence : 0.6949 +``` + +```{r50} +## Random Forest model +rf.pred <- predict(rf.tune, test.batch) +confusionMatrix(rf.pred, test.batch$Fate) +# Confusion Matrix and Statistics +# +# Reference +# Prediction Perished Survived +# Perished 103 27 +# Survived 6 41 +# +# Accuracy : 0.8136 +# 95% CI : (0.7483, 0.8681) +# No Information Rate : 0.6158 +# P-Value [Acc > NIR] : 1.058e-08 +# +# Kappa : 0.5817 +# Mcnemar's Test P-Value : 0.0004985 +# +# Sensitivity : 0.9450 +# Specificity : 0.6029 +# Pos Pred Value : 0.7923 +# Neg Pred Value : 0.8723 +# Prevalence : 0.6158 +# Detection Rate : 0.5819 +# Detection Prevalence : 0.7345 +``` + +```{r51} +## SVM model +svm.pred <- predict(svm.tune, test.batch) +confusionMatrix(svm.pred, test.batch$Fate) +# Confusion Matrix and Statistics +# +# Reference +# 
Prediction Perished Survived +# Perished 101 27 +# Survived 8 41 +# +# Accuracy : 0.8023 +# 95% CI : (0.7359, 0.8582) +# No Information Rate : 0.6158 +# P-Value [Acc > NIR] : 7.432e-08 +# +# Kappa : 0.5589 +# Mcnemar's Test P-Value : 0.002346 +# +# Sensitivity : 0.9266 +# Specificity : 0.6029 +# Pos Pred Value : 0.7891 +# Neg Pred Value : 0.8367 +# Prevalence : 0.6158 +# Detection Rate : 0.5706 +# Detection Prevalence : 0.7232 + +``` +(Perhaps now you've come to appreciate why I revalued the **Fate** feature earlier!) While there are no convincing conclusions to be drawn from the confusion matrices embedded within the outputs above, the logistic regression model we put together earlier appears to do the best job of selecting the survivors among the passengers in the test.batch. The Random Forest model, on the other hand, seems to have a slight edge on predicting those who perished. + +We can also calculate, using each of the four fitted models, the predicted probabilities for the **test.batch**, and use those probabilities to plot the ROC +```{r52} +## Logistic regression model (BLACK curve) + +glm.probs <- predict(glm.tune.5, test.batch, type = "prob") +glm.ROC <- roc(response = test.batch$Fate, + predictor = glm.probs$Survived, + levels = levels(test.batch$Fate)) + +plot(glm.ROC, type="S") + +## Area under the curve: 0.8609 +#``` + +#```{r53} +## Boosted model (GREEN curve) +ada.probs <- predict(ada.tune, test.batch, type = "prob") +ada.ROC <- roc(response = test.batch$Fate, + predictor = ada.probs$Survived, + levels = levels(test.batch$Fate)) +plot(ada.ROC, add=TRUE, col="green") +## Area under the curve: 0.8759 +#``` +#```{r} +## Random Forest model (RED curve) +rf.probs <- predict(rf.tune, test.batch, type = "prob") +rf.ROC <- roc(response = test.batch$Fate, + predictor = rf.probs$Survived, + levels = levels(test.batch$Fate)) +plot(rf.ROC, add=TRUE, col="red") +## Area under the curve: 0.8713 +#``` + +#```{r} +## SVM model (BLUE curve) +svm.probs <- 
predict(svm.tune, test.batch, type = "prob") +svm.ROC <- roc(response = test.batch$Fate, + predictor = svm.probs$Survived, + levels = levels(test.batch$Fate)) +plot(svm.ROC, add=TRUE, col="blue") +## Area under the curve: 0.8077 +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uTlhEZFpOOU5PVTA) + +The following R script uses ``` caret``` function ``` resamples``` to collect the resampling results, then calls the ``` dotplot``` function to create a visualization of the resampling distributions. I'm typically not one for leaning on a single metric for important decisions, but if you have been looking for that one graph which sums up the performance of the four models, this is it. +```{r} +cv.values <- resamples(list(Logit = glm.tune.5, Ada = ada.tune, + RF = rf.tune, SVM = svm.tune)) +dotplot(cv.values, metric = "ROC") +``` +![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uaEJzZFYzSmpEQWs) + +The next graph (my last, scout's honor) compares the four models on the basis of ROC, sensitivity, and specificity. Here, sensitivity (“Sens” on the graph) is the probability that a model will predict a Titanic passenger's death, given that the passenger actually did perish. Think of sensitivity in this case as the true perished rate. Specificity (“Spec”), on the other hand, is the probability that a model will predict survival, given that the passenger actually did survive. Simply put, all four models were better at predicting passenger fatalities than survivals, and none of them are significantly better or worse than the other three. Of the four, if I *had* to pick one, I'd probably put my money on the logistic regression model. 
+![alt text](http://drive.google.com/uc?export=view&id=0B-yx9UUIpB6uNVVsQXh2RVA2LUk) + +Let me reiterate the point I made in the disclaimer, *way* up at the top of this tl;dr page: This journey, paved with text and graphs, was never intended to reveal a path to discovery of the best model for predicting the fate of the passengers referenced in the Titanic data set. I sought only to demonstrate use of a subset of the tools – methods and software (R in this case) – a data scientist can employ in pursuit of a binary classification model. + +### Cast Your Votes + +Given everything we've been through here, it would be a shame if we didn't submit at least one of the four models to the [Titanic competition at Kaggle](http://www.kaggle.com/c/titanic-gettingStarted). Here is a script which munges the data Kaggle provided in their test.csv file, uses that data and the logistic regression model **glm.tune.5** to predict the survival (or not) of passengers listed in the test file, links the predictions to the PassengerId in a data frame, and writes those results to a submission-ready csv file. 
+```{r} +# get titles +df.infer$Title <- getTitle(df.infer) + +# impute missing Age values +df.infer$Title <- changeTitles(df.infer, c("Dona", "Ms"), "Mrs") +titles.na.test <- c("Master", "Mrs", "Miss", "Mr") +df.infer$Age <- imputeMedian(df.infer$Age, df.infer$Title, titles.na.test) + +# consolidate titles + +# same correction for Noble + +df.infer$Title <- changeTitles(df.infer, c("Col", "Dr", "Rev"), "Sir") +df.infer$Title <- changeTitles(df.infer, c("Mlle", "Mme"), "Miss") +df.infer$Title <- as.factor(df.infer$Title) + +# impute missing fares +df.infer$Fare[ which( df.infer$Fare == 0)] <- NA +df.infer$Fare <- imputeMedian(df.infer$Fare, df.infer$Pclass, + as.numeric(levels(df.infer$Pclass))) +# add the other features +df.infer <- featureEngrg(df.infer) + +# data prepped for casting predictions +test.keeps <- train.keeps[-1] +pred.these <- df.infer[test.keeps] + +# use the logistic regression model to generate predictions +Survived <- predict(glm.tune.5, newdata = pred.these) + +# try the svm model +# Survived <- predict(svm.tune, newdata = pred.these) +# reformat predictions to 0 or 1 and link to PassengerId in a data frame +Survived <- revalue(Survived, c("Survived" = 1, "Perished" = 0)) +predictions <- as.data.frame(Survived) +predictions$PassengerId <- df.infer$PassengerId + +# write predictions to csv file for submission to Kaggle +write.csv(predictions[,c("PassengerId", "Survived")], + file="Titanic_predictions.csv", row.names=FALSE, quote=FALSE) +``` + +If you must know, the logistic regression model scored 0.77990 on Kaggle – roughly middle of the pack on [the leaderboard](http://www.kaggle.com/c/titanic-gettingStarted/leaderboard) (as of late January 2014). I have submitted better scoring models; my best thus far, at 0.79426, trails just 17 percent of the 1150+ participants on the leaderboard. + +While I'm certain that I could squeeze more out of a model like Random Forest to improve my Kaggle ranking, I see better uses of my time. 
The correlation between public scores and final scores at Kaggle competitions is historically poor (see [this post](http://www.rouli.net/2013/02/five-lessons-from-kaggles-event.html)). Besides, I'd rather devote more time helping people with *today's* challenges. diff --git a/TitanicAnalysis.html b/TitanicAnalysis.html new file mode 100644 index 0000000..299d507 --- /dev/null +++ b/TitanicAnalysis.html @@ -0,0 +1,1818 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+

Titanic Survival Prediction

+
+
+

One Approach to Deriving a Model

+
+

Disclaimer

+

The following describes an approach I took to the Titanic survival prediction challenge presented by Kaggle. By no means does this approach represent a comprehensive, exhaustive pursuit of the best model for predicting Titanic passenger survival based on data provided by Kaggle. The intent here is merely to demonstrate utilization of some of the tools of the data science trade.

+
+
+

Use of R

+

This entire analysis, soup to nuts, will utilize the R software environment. Many would rightfully argue that there are better tools for portions (e.g. munging) of the journey to a fitted model. I sought to demonstrate here that, for those who do tend to lean on R or who wish to learn R, an all-R solution is possible.

+

V1 - results were .77

+
+
+

Background

+

One could hypothesize from stories of the Titanic’s sinking that a passenger’s survival was heavily dependent upon two factors: 1. Recognition of the possibility that the ship could sink 2. Access to a lifeboat

+

According to Wikipedia, the Titanic reportedly struck an iceberg at 11:40 pm ship’s time. The majority of its 2,224 passengers and crew had likely retired to their respective cabins for the evening by that time. Those on the upper decks had a shorter journey to the lifeboats, and possibly access to more timely and accurate information about the impending threat. Thus, any data relating to one’s location on the ship could prove helpful to survival predictions. Below is a cross-section of the Titanic: alt text

+

The Titanic was designed to carry 32 lifeboats, but this number was reduced to 20 (enough for about 1,180 people) for its maiden voyage – likely a cost-cutting measure influenced by perceptions that the additional boats would clutter the deck of a ship deemed “unsinkable.” Given that constraint, it is not surprising that a disproportionate number of men were apparently left aboard because of a women and children first protocol followed by some of the officers overseeing the loading of lifeboats with passengers.

+
+
+

Getting the Data Into R

+

Kaggle packaged the data for the Titanic challenge into two csv-format files: - train.csv (data containing attributes and known outcomes [survived or perished] for a subset of the passengers) - test.csv (data containing attributes without outcomes for a subset of passengers)

+

I’ve reviewed a lot of code containing approaches to sourcing data from a csv file with R. The majority seem to go no further than a simple read.csv function, mostly devoid of options, coded separately for each file being read. Later, the user often finds her/himself manually doing a number of tasks that could have been handled within the read function call. I chose to get as much out of read.csv as I could in the context of a re-usable custom function.

+
setwd("~/GitHub/Titanic")
+## Read a csv file from a (possibly remote) path in one pass, applying the
+## supplied column classes and missing-value markers; returns a data.frame.
+readData <- function(path.name, file.name, column.types, missing.types) {
+  read.csv( url( paste(path.name, file.name, sep="") ), 
+            colClasses=column.types,
+            na.strings=missing.types )
+}
+

I’ve pushed the Titanic csv files to my GitHub account so that I can access the data from anywhere and, more importantly, demonstrate here the reading of data from a web source. Here are the arguments I will pass into this custom file reading function for the train.csv file:

+
Titanic.path <- "https://raw.github.com/wehrley/Kaggle_Titanic/master/"
+train.data.file <- "train.csv"
+test.data.file <- "test.csv"
+missing.types <- c("NA", "")
+train.column.types <- c('integer',   # PassengerId
+                        'factor',    # Survived 
+                        'factor',    # Pclass
+                        'character', # Name
+                        'factor',    # Sex
+                        'numeric',   # Age
+                        'integer',   # SibSp
+                        'integer',   # Parch
+                        'character', # Ticket
+                        'numeric',   # Fare
+                        'character', # Cabin
+                        'factor'     # Embarked
+)
+test.column.types <- train.column.types[-2]     # # no Survived column in test.csv
+

Specifying missing types up front should make the data munging process a bit easier, and while I may have to change the class type for a data frame column or two along the way, I’ve specified class definitions in a manner which should be most conducive to modeling later. This leaves me with much cleaner code for reading the csv files.

+
train.raw <- readData(Titanic.path, train.data.file, 
+                      train.column.types, missing.types)
+df.train <- train.raw
+
+test.raw <- readData(Titanic.path, test.data.file, 
+                     test.column.types, missing.types)
+df.infer <- test.raw   
+
+
+

Data Munging

+

Josh Wills, senior director of data science at Cloudera, described himself as a data janitor in this interview from spring 2013. My experience in analytics projects over the years has certainly confirmed that data preparation accounts for the bulk of the effort. While some consider the process of getting data into an analysis-ready form as a sort of necessary evil, I’ve often derived value from “getting one’s hands dirty” and acquiring a granular view of the available data. Sometimes my greatest insights have come from this phase, often referred to as data pre-processing.

+

Let’s start with a look at missing data in the training set. I’ll use the missmap function from the Amelia package to display those.

+
library(Amelia)
+
## Warning: package 'Amelia' was built under R version 3.2.3
+
## Loading required package: Rcpp
+## ## 
+## ## Amelia II: Multiple Imputation
+## ## (Version 1.7.4, built: 2015-12-05)
+## ## Copyright (C) 2005-2016 James Honaker, Gary King and Matthew Blackwell
+## ## Refer to http://gking.harvard.edu/amelia/ for more information
+## ##
+
## map missing data by provided feature
+require(Amelia)
+missmap(df.train, main="Titanic Training Data - Missings Map", 
+        col=c("yellow", "black"), legend=FALSE)
+

alt text

+

Roughly 20 percent of the Age data is missing, and well above 70 percent of the passengers cannot be linked to a specific cabin number. While the proportion of Age “missings” is likely small enough for reasonable replacement with some form of imputation, the cabin missings seem too extensive to make reliable imputation possible. Nevertheless, some data could be better than zero data, so I’ll look at cabin numbers later to see how we can put them to use.

+

Before we start filling in missing data, let’s see what can be learned from the data we have. Putting some simple data visualization tools to work can take us a long way toward understanding what might influence the outcome we’re trying to predict – in this case, whether or not a passenger survived. Below is some code and the graphs they produced:

+
barplot(table(df.train$Survived),
+        names.arg = c("Perished", "Survived"),
+        main="Survived (passenger fate)", col="black")
+

+
barplot(table(df.train$Pclass), 
+        names.arg = c("first", "second", "third"),
+        main="Pclass (passenger traveling class)", col="firebrick")
+

+
barplot(table(df.train$Sex), main="Sex (gender)", col="darkviolet")
+

+
hist(df.train$Age, main="Age", xlab = NULL, col="brown")
+

+
barplot(table(df.train$SibSp), main="SibSp (siblings + spouse aboard)", 
+        col="darkblue")
+

+
barplot(table(df.train$Parch), main="Parch (parents + kids aboard)", 
+        col="gray50")
+

+
hist(df.train$Fare, main="Fare (fee paid for ticket[s])", xlab = NULL, 
+     col="darkgreen")
+

+
barplot(table(df.train$Embarked), 
+        names.arg = c("Cherbourg", "Queenstown", "Southampton"),
+        main="Embarked (port of embarkation)", col="sienna")
+

alt text alt text

+

Note the dominant categories in the first three graphs: * more passengers perished than survived * about twice as many passengers in 3rd class than in either 1st or 2nd * male passengers far outnumbered females

+

Perhaps these are the first clues that the two themes discussed earlier – women and children first policy, and location on the ship – could dictate the feature set. Although the fact that Southampton was the port of embarkation for most passengers doesn’t make for a very balanced Embarked factor, it might mean something in the final analysis.

+

Mosaic plots offer an interesting – and arguably under-utilized – way to summarize data. The vcd package includes the mosaicplot function for creating those. The following mosaic suggests that traveling class did influence the odds of a passenger’s survival.

+
mosaicplot(df.train$Pclass ~ df.train$Survived, 
+           main="Passenger Fate by Traveling Class", shade=FALSE, 
+           color=TRUE, xlab="Pclass", ylab="Survived")
+

alt text

+

Do you recall the earlier bar graph showing that some 2/3 of passengers were males? That is taken into account by the width of the two rectangles labeled “male” in the mosaic below. Now look at the height of the leftmost light gray rectangle [representing the proportion of females who survived] and compare it to the much shorter light gray rectangle [representing proportion of males who survived]. Gender should certainly prove to be a prominent feature in the final model.

+
mosaicplot(df.train$Sex ~ df.train$Survived, 
+           main="Passenger Fate by Gender", shade=FALSE, color=TRUE, 
+           xlab="Sex", ylab="Survived")
+

alt text

+

Is it possible that “survival of the fittest” dictated the fate of passengers in certain parts of the ship? Perhaps, though it isn’t apparent at first glance from the boxplot of Age by Survival.

+
boxplot(df.train$Age ~ df.train$Survived, 
+        main="Passenger Fate by Age",
+        xlab="Survived", ylab="Age")
+

alt text

+

While passenger survival didn’t vary as much across the three ports of embarkation as it did between genders and traveling classes, perhaps the Embarked feature will prove useful at some point.

+
mosaicplot(df.train$Embarked ~ df.train$Survived, 
+           main="Passenger Fate by Port of Embarkation",
+           shade=FALSE, color=TRUE, xlab="Embarked", ylab="Survived")
+

alt text

+

Just one more graph, then we’ll get to those missing ages. The corrgram package is the source of a function for creating what is sometimes referred to as a correlogram. The one shown below confirms a couple of observations already made – namely, that survival odds drop with class, and age may not prove to be a significant predictor. Given that the upper class ranks tend to be represented by an older demographic, an inverse correlation between age and traveling class is to be expected. Although fare and class are closely related, it might be worth throwing the Fare feature into the mix as another way to define a passenger’s location on the ship.

+
library(corrgram)
+
## Warning: package 'corrgram' was built under R version 3.2.3
+
require(corrgram)
+library(plyr) # for revalue
+
## Warning: package 'plyr' was built under R version 3.2.3
+
## 
+## Attaching package: 'plyr'
+## 
+## The following object is masked from 'package:corrgram':
+## 
+##     baseball
+
library(ada) # for ada (later)
+
## Warning: package 'ada' was built under R version 3.2.3
+
## Loading required package: rpart
+
corrgram.data <- df.train
+## change features of factor type to numeric type for inclusion on correlogram
+corrgram.data$Survived <- as.numeric(corrgram.data$Survived)
+corrgram.data$Pclass <- as.numeric(corrgram.data$Pclass)
+corrgram.data$Embarked <- revalue(corrgram.data$Embarked, 
+                                  c("C" = 1, "Q" = 2, "S" = 3))
+## generate correlogram
+corrgram.vars <- c("Survived", "Pclass", "Sex", "Age", 
+                   "SibSp", "Parch", "Fare", "Embarked")
+corrgram(corrgram.data[,corrgram.vars], order=FALSE, 
+         lower.panel=panel.ellipse, upper.panel=panel.pie, 
+         text.panel=panel.txt, main="Titanic Training Data")
+

alt text

+

Time to tackle those missing ages. A common approach to this type of situation is to replace the missings with the average of the available values. In this case, that would mean replacing 177 missing Age values with 29.7.

+
summary(df.train$Age)
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
+##    0.42   20.12   28.00   29.70   38.00   80.00     177
+
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
+#   0.42   20.12   28.00   29.70   38.00   80.00     177 
+

Taking that approach would be fine if only a small fraction of the ages were missing. However, with missings accounting for 20 percent of all Age data in a relatively small data set (<900 records), one could justify a search for a more refined method of imputation. Let’s peek again at the list of currently available features:

+
names(df.train)
+
##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
+##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
+## [11] "Cabin"       "Embarked"
+
# [1] "PassengerId" "Survived"   "Pclass"   "Name"   "Sex"    "Age"
+# [7] "SibSp"       "Parch"      "Ticket"   "Fare"   "Cabin"  
+# "Embarked"
+

PassengerId is merely a record number, and we already know that splitting the ages solely by Survived doesn’t reveal much. A boxplot of ages by passenger traveling class looks interesting… alt text

+

This makes intuitive sense: Passengers in the upper classes (first and second) would tend to be wealthier, and in that period of U.S. history, acquiring wealth usually required a good deal of time (no dot-com kings in their 20s were aboard the Titanic on her maiden voyage). There are no missing values in Pclass, so we could replace the missing age for, say, a third class passenger with the average or median of the available ages for those in Pclass="3". Doing so would be an improvement over assigning 29.7 to all Age missings. ## function for extracting honorific (i.e. title) from the Name feature

+
+## Extract the honorific (title) from the Name feature, using the
+## "Surname, Title. Firstname" pattern; returns a character vector.
+getTitle <- function(data) {
+  ## locate ", Title." (third positional arg TRUE = ignore.case)
+  title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
+  title.comma.end <- title.dot.start + 
+      attr(title.dot.start, "match.length")-1
+  ## strip the leading ", " and the trailing "." from the matched span
+  data$Title <- substr(data$Name,title.dot.start+2,
+                       title.comma.end-1)
+  return (data$Title)
+}   
+

Inspection of the next feature – Name – reveals what could be an even better approach…

+
df.train$Title <- getTitle(df.train)
+unique(df.train$Title)
+
##  [1] "Mr"           "Mrs"          "Miss"         "Master"      
+##  [5] "Don"          "Rev"          "Dr"           "Mme"         
+##  [9] "Ms"           "Major"        "Lady"         "Sir"         
+## [13] "Mlle"         "Col"          "Capt"         "the Countess"
+## [17] "Jonkheer"
+
# [1] "Braund, Mr. Owen Harris"                            
+# [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
+# [3] "Heikkinen, Miss. Laina"                             
+# [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"       
+# [5] "Allen, Mr. William Henry"                           
+# [6] "Moran, Mr. James"                                   
+# [7] "McCarthy, Mr. Timothy J"                            
+# [8] "Palsson, Master. Gosta Leonard"                     
+# [9] "Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"  
+#[10] "Nasser, Mrs. Nicholas (Adele Achem)" 
+

Notice the titles – Mr., Mrs., Miss., Master. – following each of the surnames. The Wikipedia entry for the English honorific “Master” explains that, > By the late 19th century, etiquette dictated that men be addressed as Mister, and boys as Master."

+

The title “Miss” should help with differentiation between younger and older females. Also, note the way the title appears in the name: The format “Surname, Title. Firstname…” is consistent in Name across all records. I used that pattern to create a custom function which employs a regular expression and the regexpr function to extract the title from each name:

+
## function for extracting honorific (i.e. title) from the Name feature
+getTitle <- function(data) {
+  ## match ", Title." in Name (TRUE = ignore.case), e.g. ", Mr."
+  title.dot.start <- regexpr("\\,[A-Z ]{1,20}\\.", data$Name, TRUE)
+  title.comma.end <- title.dot.start + 
+      attr(title.dot.start, "match.length")-1
+  ## drop the ", " prefix and "." suffix, keeping just the title text
+  data$Title <- substr(data$Name,title.dot.start+2,
+                       title.comma.end-1)
+  return (data$Title)
+}   
+

Let’s fetch the titles, give them their own column in the df.train data frame, and look at the uniques.

+
# ensure there are no null titles
+df.train$Title = "Mr"
+df.train$Title[df.train$Sex=="female"]="Mrs" 
+df.train$Title[as.numeric(df.train$Age) < 24 & df.train$Sex =="female"]="Miss"
+
+ 
+df.train$Title <- getTitle(df.train)
+unique(df.train$Title)
+
##  [1] "Mr"           "Mrs"          "Miss"         "Master"      
+##  [5] "Don"          "Rev"          "Dr"           "Mme"         
+##  [9] "Ms"           "Major"        "Lady"         "Sir"         
+## [13] "Mlle"         "Col"          "Capt"         "the Countess"
+## [17] "Jonkheer"
+
# [1] "Mr"     "Mrs"     "Miss"    "Master"        "Don"    "Rev"
+# [7] "Dr"     "Mme"      "Ms"     "Major"         "Lady"       "Sir"
+#[13] "Mlle"   "Col"     "Capt"    "the Countess"  "Jonkheer"
+str(df.train)
+
## 'data.frame':    891 obs. of  13 variables:
+##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
+##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
+##  $ Pclass     : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
+##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
+##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
+##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
+##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
+##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
+##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
+##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
+##  $ Cabin      : chr  NA "C85" NA "C123" ...
+##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
+##  $ Title      : chr  "Mr" "Mrs" "Miss" "Mrs" ...
+

To identify the titles which have at least one record with an age missing, I’ll use the bystats function from the Hmisc package.

+
options(digits=2)
+   
+require(Hmisc)
+
## Loading required package: Hmisc
+
## Warning: package 'Hmisc' was built under R version 3.2.3
+
## Loading required package: lattice
+## Loading required package: survival
+## Loading required package: Formula
+## Loading required package: ggplot2
+
## Warning: package 'ggplot2' was built under R version 3.2.3
+
## 
+## Attaching package: 'Hmisc'
+## 
+## The following objects are masked from 'package:plyr':
+## 
+##     is.discrete, summarize
+## 
+## The following objects are masked from 'package:base':
+## 
+##     format.pval, round.POSIXt, trunc.POSIXt, units
+
bystats(df.train$Age, df.train$Title, 
+        fun=function(x)c(Mean=mean(x),Median=median(x)))
+
## 
+##  c(5, 13, 5, 55, 13, 55, 5, 5) of df.train$Age by df.train$Title 
+## 
+##                N Missing Mean Median
+## Capt           1       0 70.0   70.0
+## Col            2       0 58.0   58.0
+## Don            1       0 40.0   40.0
+## Dr             6       1 42.0   46.5
+## Jonkheer       1       0 38.0   38.0
+## Lady           1       0 48.0   48.0
+## Major          2       0 48.5   48.5
+## Master        36       4  4.6    3.5
+## Miss         146      36 21.8   21.0
+## Mlle           2       0 24.0   24.0
+## Mme            1       0 24.0   24.0
+## Mr           398     119 32.4   30.0
+## Mrs          108      17 35.9   35.0
+## Ms             1       0 28.0   28.0
+## Rev            6       0 43.2   46.5
+## Sir            1       0 49.0   49.0
+## the Countess   1       0 33.0   33.0
+## ALL          714     177 29.7   28.0
+
#                N Missing Mean Median
+# Capt           1       0 70.0   70.0
+# Col            2       0 58.0   58.0
+# Don            1       0 40.0   40.0
+# Dr             6       1 42.0   46.5
+# Jonkheer       1       0 38.0   38.0
+# Lady           1       0 48.0   48.0
+# Major          2       0 48.5   48.5
+# Master        36       4  4.6    3.5
+# Miss         146      36 21.8   21.0
+# Mlle           2       0 24.0   24.0
+# Mme            1       0 24.0   24.0
+# Mr           398     119 32.4   30.0
+# Mrs          108      17 35.9   35.0
+# Ms             1       0 28.0   28.0
+# Rev            6       0 43.2   46.5
+# Sir            1       0 49.0   49.0
+# the Countess   1       0 33.0   33.0
+# ALL          714     177 29.7   28.0
+

Now I can assign the titles with at least one missing Age value to a list…

+
## list of titles with missing Age value(s) requiring imputation
+titles.na.train <- c("Dr", "Master", "Mrs", "Miss", "Mr")
+

…then pass that list to the following custom function I created for imputing the missing ages:

+
+## Replace missing values of impute.var within each group defined by
+## filter.var, for the groups listed in var.levels, using Hmisc's impute
+## (per the surrounding text, the group median is assigned).
+imputeMedian <- function(impute.var, filter.var, var.levels) {
+  for (v in var.levels) {
+    impute.var[ which( filter.var == v)] <- impute(impute.var[ 
+      which( filter.var == v)])
+  }
+  return (impute.var)
+}
+

I apply the impute function from the Hmisc package on a per-title basis to assign the median of the available ages to the missing age(s). For example, the single record with a missing Age value and Title="Dr" will be assigned the median of the ages from the 6 records with Title="Dr" which do have age data.

+
df.train$Age[which(df.train$Title=="Dr")]
+
## [1] 44 54 23 32 50 NA 49
+
#[1] 44 54 23 32 50 NA 49
+

After doing the age imputations, I check the Age data and find that the function seems to have done its job.

+
df.train$Age <- imputeMedian(
+    df.train$Age, df.train$Title, 
+    titles.na.train)
+df.train$Age[which(df.train$Title=="Dr")]
+
## [1] 44 54 23 32 50 46 49
+
#[1] 44.0 54.0 23.0 32.0 50.0 46.5 49.0
+summary(df.train$Age)
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0      21      30      29      35      80
+
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+#   0.42   21.00   30.00   29.39   35.00   80.00
+

You may recall that the Embarked feature also had at least one missing value. A summary of that data…

+
summary(df.train$Embarked)
+
##    C    Q    S NA's 
+##  168   77  644    2
+

…reveals just two missings. It should be fine to replace those missings with “S”, the most common value.

+
df.train$Embarked[which(is.na(df.train$Embarked))] <- 'S'
+

While there are no missing Fare values, a summary does show at least one Fare=0

+
summary(df.train$Fare)
+
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
+##       0       8      14      32      31     512
+

(That exceptionally high fare of $512.30 suggests that some tickets were purchased in groups. We’ll address that later.) A zero fare might have been assigned to a baby. However, a closer look at records where Fare = 0 suggests otherwise…

+
subset(df.train, Fare < 7)[order(subset(df.train, Fare < 7)$Fare, 
+                          subset(df.train, Fare < 7)$Pclass), 
+                          c("Age", "Title", "Pclass", "Fare")]
+
##     Age    Title Pclass Fare
+## 264  40       Mr      1  0.0
+## 634  30       Mr      1  0.0
+## 807  39       Mr      1  0.0
+## 816  30       Mr      1  0.0
+## 823  38 Jonkheer      1  0.0
+## 278  30       Mr      2  0.0
+## 414  30       Mr      2  0.0
+## 467  30       Mr      2  0.0
+## 482  30       Mr      2  0.0
+## 675  30       Mr      2  0.0
+## 733  30       Mr      2  0.0
+## 180  36       Mr      3  0.0
+## 272  25       Mr      3  0.0
+## 303  19       Mr      3  0.0
+## 598  49       Mr      3  0.0
+## 379  20       Mr      3  4.0
+## 873  33       Mr      1  5.0
+## 327  61       Mr      3  6.2
+## 844  34       Mr      3  6.4
+## 819  43       Mr      3  6.4
+## 203  34       Mr      3  6.5
+## 372  18       Mr      3  6.5
+## 144  19       Mr      3  6.8
+## 655  18     Miss      3  6.8
+## 412  30       Mr      3  6.9
+## 826  30       Mr      3  7.0
+## 130  45       Mr      3  7.0
+## 805  27       Mr      3  7.0
+
#     Age Title Pclass Fare
+# 264  40    Mr      1  0.0
+# 634  30    Mr      1  0.0
+# 807  39    Mr      1  0.0
+# 816  30    Mr      1  0.0
+# 823  38 Noble      1  0.0
+# 278  30    Mr      2  0.0
+# 414  30    Mr      2  0.0
+# 467  30    Mr      2  0.0
+# 482  30    Mr      2  0.0
+# 675  30    Mr      2  0.0
+# 733  30    Mr      2  0.0
+# 180  36    Mr      3  0.0
+# 272  25    Mr      3  0.0
+# 303  19    Mr      3  0.0
+# 598  49    Mr      3  0.0
+# 379  20    Mr      3  4.0
+# 873  33    Mr      1  5.0
+# 327  61    Mr      3  6.2
+# 844  34    Mr      3  6.4
+# 819  43    Mr      3  6.4
+# 203  34    Mr      3  6.5
+# 372  18    Mr      3  6.5
+# 144  19    Mr      3  6.8
+# 655  18  Miss      3  6.8
+# 412  30    Mr      3  6.9
+# 826  30    Mr      3  7.0
+# 130  45    Mr      3  7.0
+# 805  27    Mr      3  7.0
+

The jump in fares from 0 to the 4-7 range suggests errors. I replaced the zero Fare values with the median fare from the respective passenger class using the imputeMedian function introduced earlier.

+
## impute missings on Fare feature with median fare by Pclass
+df.train$Fare[ which( df.train$Fare == 0 )] <- NA
+df.train$Fare <- imputeMedian(df.train$Fare, df.train$Pclass, 
+                              as.numeric(levels(df.train$Pclass)))
+

I see the titles as more than merely a guide for imputation of missing ages. A passenger’s title can reflect gender, his/her position on the ship (officers & royalty), and access to a lifeboat (where “Master” superseded “Mr”). Making the effort to get the Title feature model-ready seems worthwhile.

+

Recall from the bystats results above that the training data contains 17 different titles. We already know that “Master” and “Mr” should separate the males into roughly two groups by age. The following script…

+
df.train$Title <- factor(df.train$Title,
+                         c("Capt","Col","Major","Sir","Lady","Rev",
+                         "Dr","Don","Jonkheer","the Countess","Mrs",
+                         "Ms","Mr","Mme","Mlle","Miss","Master"))
+boxplot(df.train$Age ~ df.train$Title, 
+        main="Passenger Age by Title", xlab="Title", ylab="Age", 
+        y=range(df.train$Age, na.rm=TRUE))
+

…produces this boxplot (too wide for display here) showing passenger age by title, including shading which illustrates the manner in which I consolidated the titles. I created and applied a custom function for revaluing the titles, then reclassified Title to a factor type, as follows:

+
## Assign a single new title value to every row whose Title matches one
## of the old titles.
##
## data       : data frame with a Title column (character or factor --
##              if a factor, new.title must already be an existing level)
## old.titles : vector of title values to replace
## new.title  : single replacement value
##
## Returns the updated Title column (the caller reassigns it onto data).
## Vectorized with %in% in place of the original per-title loop; NA
## titles are left untouched, since NA %in% old.titles is FALSE.
changeTitles <- function(data, old.titles, new.title) {
  data$Title[data$Title %in% old.titles] <- new.title
  data$Title
}
+## Title consolidation
+#df.train.tmp = df.train
+#df.train=df.train.tmp
+write.table (df.train,"dftraintmp.csv")
+
+# Replaced "Noble" with "Sir", because "Sir" is an existing factor level
+
+df.train$Title <- changeTitles(
+    df.train, 
+    c("Capt", "Col", "Don", "Dr", 
+    "Jonkheer", "Lady", "Major", 
+    "Rev", "Sir"),
+    "Sir")
+df.train$Title <- changeTitles(df.train, c("the Countess", "Ms"), 
+                               "Mrs")
+df.train$Title <- changeTitles(df.train, c("Mlle", "Mme"), "Miss")
+df.train$Title <- as.factor(df.train$Title)
+

I assigned the Countess of Rothes, a woman in first class and the sole passenger with a “Countess” title, to the “Mrs” group. In retrospect, I could have placed her under the “Noble” umbrella. Given that 91 of the 94 female first-class passengers in the training set survived, I was willing to live with that choice.

+

All of the work done designing the new Title column can be considered a part of feature engineering. The other features I chose to add are generated using custom function featureEngrg, which can be applied to both the training data in df.train and the Kaggle-provided test data in df.infer.

+
library(stringr)
+
## Warning: package 'stringr' was built under R version 3.2.3
+
library(plyr)
+require(plyr)     # for the revalue function 
+require(stringr)  # for the str_sub function
+
## test whether a character represents an EVEN single digit
isEven <- function(x) x %in% as.character(seq(0, 8, by = 2))
## test whether a character represents an ODD single digit
isOdd <- function(x) x %in% as.character(seq(1, 9, by = 2))
+
+## function to add features to training or test data frames
+## Expects the munged Titanic frame (factor Survived "0"/"1", factor
+## Pclass "1"/"2"/"3", character Cabin, numeric Age/Fare, integer
+## SibSp/Parch) and returns it with Fate, Boat.dibs, Family, Fare.pp,
+## Class, Deck and Side appended. Requires plyr (revalue) and stringr
+## (str_sub) to be attached, plus the isEven/isOdd helpers above.
+featureEngrg <- function(data) {
+  ## Using Fate ILO Survived because term is shorter and just sounds good
+  data$Fate <- data$Survived
+  ## Revaluing Fate factor to ease assessment of confusion matrices later
+  ## (revalue renames factor levels: "1"->"Survived", "0"->"Perished")
+  data$Fate <- revalue(data$Fate, c("1" = "Survived", "0" = "Perished"))
+  ## Boat.dibs attempts to capture the "women and children first"
+  ## policy in one feature.  Assuming all females plus males under 15
+  ## got "dibs' on access to a lifeboat
+  ## A missing Age leaves Boat.dibs at "No": which() drops the NA rows
+  ## produced by data$Age < 15 (Age is expected to be imputed upstream)
+  data$Boat.dibs <- "No"
+  data$Boat.dibs[which(data$Sex == "female" | data$Age < 15)] <- "Yes"
+  data$Boat.dibs <- as.factor(data$Boat.dibs)
+  ## Family consolidates siblings and spouses (SibSp) plus
+  ## parents and children (Parch) into one feature
+  data$Family <- data$SibSp + data$Parch
+  ## Fare.pp attempts to adjust group purchases by size of family
+  ## (Family + 1 counts the passenger himself; never zero)
+  data$Fare.pp <- data$Fare/(data$Family + 1)
+  ## Giving the traveling class feature a new look
+  data$Class <- data$Pclass
+  data$Class <- revalue(data$Class, 
+                        c("1"="First", "2"="Second", "3"="Third"))
+  ## First character in Cabin number represents the Deck 
+  ## (NA Cabin propagates to NA Deck, then is bucketed as "UNK")
+  data$Deck <- substring(data$Cabin, 1, 1)
+  data$Deck[ which( is.na(data$Deck ))] <- "UNK"
+  data$Deck <- as.factor(data$Deck)
+  ## Even-numbered cabins are assigned Side="port", odd-numbered
+  ## Side="starboard"; unknown/non-digit last characters stay "UNK".
+  ## NOTE(review): the original comment claimed odd=port, contradicting
+  ## the code below -- the code's even=port mapping is kept; confirm
+  ## against Titanic deck plans if Side matters to the model.
+  data$cabin.last.digit <- str_sub(data$Cabin, -1)
+  data$Side <- "UNK"
+  data$Side[which(isEven(data$cabin.last.digit))] <- "port"
+  data$Side[which(isOdd(data$cabin.last.digit))] <- "starboard"
+  data$Side <- as.factor(data$Side)
+  ## drop the scratch column before returning
+  data$cabin.last.digit <- NULL
+  return (data)
+}
+
+## add remaining features to training data frame
+df.train <- featureEngrg(df.train)
+

Some color on the features I’ve added: * Boat.dibs - assumes all females plus males under 15 get “dibs” on access to a lifeboat. Filtering by Title=“Master” was considered, but the highest age in the training data for males addressed as “Master” was just 12, and I wanted to account for male teens with Title=“Mr” who could pass for a child. * Deck - levels are as shown in the Titanic cross-section displayed previously. Cabin data provided for just 23 percent of training data records, so it’s tough to give this one much emphasis. * Side - subject to the same concern (dearth of data) expressed for Deck

+

I finish the data munging process by paring down the data frame to the columns I will use in model building.

+
train.keeps <- c("Fate", "Sex", "Boat.dibs", "Age", "Title", 
+                 "Class", "Deck", "Side", "Fare", "Fare.pp", 
+                 "Embarked", "Family")
+df.train.munged <- df.train[train.keeps]
+str(df.train.munged)
+
## 'data.frame':    891 obs. of  12 variables:
+##  $ Fate     : Factor w/ 2 levels "Perished","Survived": 1 2 2 2 1 1 1 1 2 2 ...
+##  $ Sex      : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
+##  $ Boat.dibs: Factor w/ 2 levels "No","Yes": 1 2 2 2 1 1 1 2 2 2 ...
+##  $ Age      : num  22 38 26 35 35 30 54 2 27 14 ...
+##  $ Title    : Factor w/ 17 levels "Capt","Col","Major",..: 13 11 16 11 13 13 13 17 11 11 ...
+##  $ Class    : Factor w/ 3 levels "First","Second",..: 3 1 3 1 3 3 1 3 3 2 ...
+##  $ Deck     : Factor w/ 9 levels "A","B","C","D",..: 9 3 9 3 9 9 5 9 9 9 ...
+##  $ Side     : Factor w/ 3 levels "port","starboard",..: 3 2 3 2 3 3 1 3 3 3 ...
+##  $ Fare     : num  7.25 71.28 7.92 53.1 8.05 ...
+##  $ Fare.pp  : num  3.62 35.64 7.92 26.55 8.05 ...
+##  $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
+##  $ Family   : int  1 1 0 1 0 0 0 4 2 1 ...
+
+
+
+

Fitting a Model

+

Later, I will be conducting the predictive modeling effort using the caret package. Created by Max Kuhn of Pfizer Global R&D, caret provides a unified interface for modeling & prediction, and streamlines the model tuning process using resampling. The package includes a createDataPartition function for splitting data into a training set and a test set (sometimes referred to as a validation set) via stratified random sampling. In this presentation, Kuhn delivered the best explanation I’ve seen of the decision on how to “spend” the available training data. His conclusion: > Statistically, the best course of action would be to use all the data for model building and use statistical methods to get good estimates of error. From a non-statistical perspective, many consumers of these models emphasize the need for an untouched set of samples to evaluate performance.

+

I selected an 80/20 split for training data and testing data. The code:

+
library(caret)
+
## Warning: package 'caret' was built under R version 3.2.3
+
## 
+## Attaching package: 'caret'
+## 
+## The following object is masked from 'package:survival':
+## 
+##     cluster
+
## split training data into train batch and test batch
+set.seed(23)
+training.rows <- createDataPartition(
+    df.train.munged$Fate, p = 0.8, list = FALSE)
+train.batch <- df.train.munged[training.rows, ]
+test.batch <- df.train.munged[-training.rows, ]
+
+# temporary fix for missing value
+#test.batch[36,5]="Mr"
+

Before I go pouring features into the popular Random Forest method, I’m going to give one of the simplest classification methods a crack at the Titanic prediction challenge. Logistic regression, which surfaced about 70 years ago, has been used extensively in multiple fields. I’ll start simple by passing essentially the features provided in the raw training data (remember that we combined SibSp and Parch to form Family) through the R function for fitting generalized linear models. When entering the model formula, I typically have a habit of listing the features in an order roughly corresponding to what I initially believe their importance will be. In this case, I’ve ordered them roughly by the two main themes I discussed earlier (women & children first policy and location on the ship). By setting the argument family to binomial with a logit link, I’m asking glm( ) to produce a logistic regression.

+
Titanic.logit.1 <- glm(Fate ~ Sex + Class + Age + Family + 
+    Embarked+ Fare, data = train.batch, family=binomial("logit"))
+

To assess this first model and the various binary logistic regressions that will appear in its wake, we will use the chi-square statistic, which is basically a measure of the goodness of fit of observed values to expected values. The bigger the difference (or deviance) of the observed values from the expected values, the poorer the fit of the model. The null deviance shows how well passenger survival is predicted by a “null” model using only a constant (grand mean). As we adjust the model’s formula by adding and/or removing variables, we’ll look for those changes which prompt a drop in the residual deviance, indicating an improvement in fit.

+
Titanic.logit.1
+
## 
+## Call:  glm(formula = Fate ~ Sex + Class + Age + Family + Embarked + 
+##     Fare, family = binomial("logit"), data = train.batch)
+## 
+## Coefficients:
+## (Intercept)      Sexmale  ClassSecond   ClassThird          Age  
+##     4.47735     -2.63750     -1.14829     -2.32037     -0.04498  
+##      Family    EmbarkedQ    EmbarkedS         Fare  
+##    -0.22278     -0.10612     -0.54006      0.00177  
+## 
+## Degrees of Freedom: 713 Total (i.e. Null);  705 Residual
+## Null Deviance:       951 
+## Residual Deviance: 631   AIC: 649
+
#Call:  glm(formula = Fate ~ Sex + Class + Age + Family + Embarked + 
+#    Fare, family = binomial("logit"), data = train.batch)
+
+#Coefficients:
+#(Intercept)      Sexmale  ClassSecond   ClassThird          Age       Family  
+#  4.1991007   -2.7367328   -0.9333119   -2.0678612   -0.0441754   -0.2871471  
+#  EmbarkedQ    EmbarkedS         Fare  
+#  0.0003177   -0.4913073    0.0052758  
+
+#Degrees of Freedom: 713 Total (i.e. Null);  705 Residual
+#Null Deviance:        950.9 
+#Residual Deviance:    618.7     AIC: 636.7
+

The deviance was reduced by 332.2 points on 713-705=8 degrees of freedom (DF), a significant reduction…

+
1 - pchisq(332.2, df=8)
+
## [1] 0
+
#[1] 0
+

In other words, the model put forth is significantly different from the null model. Overall, the model appears to have performed well – but I’m willing to bet that we could improve on that residual deviance with a different combination of features. Calling anova(), an extractor function, generates the results of the analysis.

+
anova(Titanic.logit.1, test="Chisq")
+
## Analysis of Deviance Table
+## 
+## Model: binomial, link: logit
+## 
+## Response: Fate
+## 
+## Terms added sequentially (first to last)
+## 
+## 
+##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
+## NULL                       713        951             
+## Sex       1    202.7       712        748  < 2e-16 ***
+## Class     2     76.1       710        672  < 2e-16 ***
+## Age       1     23.7       709        648  1.1e-06 ***
+## Family    1     11.4       708        637  0.00075 ***
+## Embarked  2      5.5       706        632  0.06452 .  
+## Fare      1      0.5       705        631  0.48187    
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
#Analysis of Deviance Table
+
+#Model: binomial, link: logit
+
+#Response: Fate
+
+#Terms added sequentially (first to last)
+
+#     Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
+#NULL                       713     950.86              
+#Sex       1  218.443       712     732.42 < 2.2e-16 ***
+#Class     2   72.191       710     660.23 < 2.2e-16 ***
+#Age       1   19.971       709     640.26 7.862e-06 ***
+#Family    1   13.135       708     627.12 0.0002899 ***
+#Embarked  2    5.608       706     621.52 0.0605668 .  
+#Fare      1    2.855       705     618.66 0.0911186 .  
+#---
+#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+

Notice how the Sex and Class features accounted for the lion’s share of the reduction in the deviance, providing some support to our hypotheses about life boat access and location on ship. Since Fare isn’t doing much for us, let’s see if the Fare.pp we created fares any better (pun intended).

+
Titanic.logit.2 <- glm(Fate ~ Sex + Class + Age + Family + Embarked + Fare.pp,                       data = train.batch, family=binomial("logit"))
+anova(Titanic.logit.2, test="Chisq")
+
## Analysis of Deviance Table
+## 
+## Model: binomial, link: logit
+## 
+## Response: Fate
+## 
+## Terms added sequentially (first to last)
+## 
+## 
+##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)    
+## NULL                       713        951             
+## Sex       1    202.7       712        748  < 2e-16 ***
+## Class     2     76.1       710        672  < 2e-16 ***
+## Age       1     23.7       709        648  1.1e-06 ***
+## Family    1     11.4       708        637  0.00075 ***
+## Embarked  2      5.5       706        632  0.06452 .  
+## Fare.pp   1      0.3       705        631  0.60089    
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+
#Analysis of Deviance Table
+
+#Model: binomial, link: logit
+
+#Response: Fate
+
+#Terms added sequentially (first to last)
+
+#         Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
+#NULL                       713     950.86              
+#Sex       1  218.443       712     732.42 < 2.2e-16 ***
+#Class     2   72.191       710     660.23 < 2.2e-16 ***
+#Age       1   19.971       709     640.26 7.862e-06 ***
+#Family    1   13.135       708     627.12 0.0002899 ***
+#Embarked  2    5.608       706     621.52 0.0605668 .  
+#Fare.pp   1    1.312       705     620.21 0.2521103    
+#---
+#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+

Hmm, that was no help. Dropping fares altogether and passing a slightly slimmer formula through the glm() function will give us a new baseline for model improvement.

+
glm(Fate ~ Sex + Class + Age + Family + Embarked, 
+    data = train.batch, family=binomial("logit"))
+
## 
+## Call:  glm(formula = Fate ~ Sex + Class + Age + Family + Embarked, family = binomial("logit"), 
+##     data = train.batch)
+## 
+## Coefficients:
+## (Intercept)      Sexmale  ClassSecond   ClassThird          Age  
+##      4.6425      -2.6415      -1.2451      -2.4388      -0.0454  
+##      Family    EmbarkedQ    EmbarkedS  
+##     -0.2102      -0.1280      -0.5704  
+## 
+## Degrees of Freedom: 713 Total (i.e. Null);  706 Residual
+## Null Deviance:       951 
+## Residual Deviance: 632   AIC: 648
+
#Call:  glm(formula = Fate ~ Sex + Class + Age + Family + Embarked, family = #binomial("logit"), data = train.batch)
+
+#Coefficients:
+#(Intercept)      Sexmale  ClassSecond   ClassThird          Age       Family  
+#    4.60950     -2.74715     -1.19354     -2.38903     -0.04466     -0.24416  
+#  EmbarkedQ    EmbarkedS  
+#   -0.03949     -0.55186  
+
+#Degrees of Freedom: 713 Total (i.e. Null);  706 Residual
+#Null Deviance:        950.9 
+#Residual Deviance:    621.5    AIC: 637.5
+

Time to shift the model fitting to a higher gear. Henceforth, I’m going to use the train function in Kuhn’s caret package to fit binary logistic regression models, as well as models built using other methods.

+

Modeling taken to an extreme on a training data set can leave you with a model which very accurately maps the training data, but does not generalize well to new samples. This phenomenon, commonly referred to as overfitting, can be addressed by resampling the training samples in a way which approximates the fitted model’s performance on future data. I’m going to use a form of resampling known as 10-fold cross-validation (CV), repeated 3 times.

+

Later, I plan to compare the fitted logit model to other model types using the receiver operating characteristic (ROC) curve. The twoClassSummary function in caret can calculate the figures I’ll need for that if I give it class probabilities predicted by the logistic regression model.

+

All of these things I want – 3x 10-fold CV, estimation of class probabilities, metrics from twoClassSummary – can be passed through the trainControl function.

+
## Define control function to handle optional arguments for train function
+## Models to be assessed based on largest absolute area under ROC curve
+cv.ctrl <- trainControl(method = "repeatedcv", repeats = 3,
+                        summaryFunction = twoClassSummary,
+                        classProbs = TRUE)
+

Below is the train function call using the same formula (sans Fare) that we recently passed through glm function. I use the metric argument to tell train to optimize the model by maximizing the area under the ROC curve (AUC). summary(), another extractor function, is called to generate regression coefficients with standard errors and a z-test, plus the residual deviance metric we were watching earlier.

+
#install.packages("pROC")
+
+library (pROC)
+
## Warning: package 'pROC' was built under R version 3.2.3
+
## Type 'citation("pROC")' for a citation.
+## 
+## Attaching package: 'pROC'
+## 
+## The following objects are masked from 'package:stats':
+## 
+##     cov, smooth, var
+
set.seed(35)
+glm.tune.1 <- train(Fate ~ Sex + Class + Age + Family + Embarked,
+                    data = train.batch,
+                    method = "glm",
+                    metric = "ROC",
+                    trControl = cv.ctrl)
+glm.tune.1
+
## Generalized Linear Model 
+## 
+## 714 samples
+##  11 predictor
+##   2 classes: 'Perished', 'Survived' 
+## 
+## No pre-processing
+## Resampling: Cross-Validated (10 fold, repeated 3 times) 
+## Summary of sample sizes: 643, 643, 643, 642, 643, 643, ... 
+## Resampling results
+## 
+##   ROC   Sens  Spec  ROC SD  Sens SD  Spec SD
+##   0.85  0.85  0.7   0.05    0.055    0.085  
+## 
+## 
+
#714 samples
+# 11 predictors
+#  2 classes: 'Perished', 'Survived' 
+
+#No pre-processing
+#Resampling: Cross-Validation (10 fold, repeated 3 times) 
+
+#Summary of sample sizes: 642, 643, 643, 642, 643, 642, ... 
+
+#Resampling results
+
+#  ROC    Sens   Spec   ROC SD  Sens SD  Spec SD
+#  0.856  0.855  0.698  0.0433  0.071    0.0852 
+
+summary(glm.tune.1)
+
## 
+## Call:
+## NULL
+## 
+## Deviance Residuals: 
+##    Min      1Q  Median      3Q     Max  
+## -2.219  -0.615  -0.423   0.637   2.472  
+## 
+## Coefficients:
+##             Estimate Std. Error z value Pr(>|z|)    
+## (Intercept)  4.64254    0.48679    9.54  < 2e-16 ***
+## Sexmale     -2.64146    0.22237  -11.88  < 2e-16 ***
+## ClassSecond -1.24511    0.30481   -4.08  4.4e-05 ***
+## ClassThird  -2.43880    0.28496   -8.56  < 2e-16 ***
+## Age         -0.04542    0.00882   -5.15  2.6e-07 ***
+## Family      -0.21021    0.07263   -2.89   0.0038 ** 
+## EmbarkedQ   -0.12801    0.41992   -0.30   0.7605    
+## EmbarkedS   -0.57038    0.26374   -2.16   0.0306 *  
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for binomial family taken to be 1)
+## 
+##     Null deviance: 950.86  on 713  degrees of freedom
+## Residual deviance: 631.51  on 706  degrees of freedom
+## AIC: 647.5
+## 
+## Number of Fisher Scoring iterations: 5
+
#Call:
+# NULL
+# 
+# Deviance Residuals: 
+#     Min       1Q   Median       3Q      Max  
+# -2.2206  -0.5945  -0.4131   0.6208   2.5031  
+# 
+# Coefficients:
+#             Estimate Std. Error z value Pr(>|z|)    
+# (Intercept)  4.60950    0.50032   9.213  < 2e-16 ***
+# Sexmale     -2.74715    0.22441 -12.242  < 2e-16 ***
+# ClassSecond -1.19354    0.30646  -3.895 9.84e-05 ***
+# ClassThird  -2.38903    0.28411  -8.409  < 2e-16 ***
+# Age         -0.04466    0.00908  -4.918 8.73e-07 ***
+# Family      -0.24416    0.07787  -3.136  0.00171 ** 
+# EmbarkedQ   -0.03949    0.42227  -0.094  0.92549    
+# EmbarkedS   -0.55186    0.26154  -2.110  0.03485 *  
+# ---
+# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+# 
+# (Dispersion parameter for binomial family taken to be 1)
+# 
+#     Null deviance: 950.86  on 713  degrees of freedom
+# Residual deviance: 621.52  on 706  degrees of freedom
+# AIC: 637.52
+# 
+# Number of Fisher Scoring iterations: 5
+

This is as good a time as any to introduce the concept of class compression. Think of it as collapsing particular levels on a categorical variable. One of the earlier bar graphs showed about 70 percent of the Titanic’s passengers boarded the ship at Southampton. I’m going to use Embarked and the I() function, which inhibits interpretation & conversion of R objects, to create a new 2-level factor within the model formula. This factor is valued TRUE if a passenger’s port of origin was Southampton (“S”), or FALSE otherwise.

+
 set.seed(35)
+ glm.tune.2 <- train(Fate ~ Sex + Class + Age + Family + I(Embarked=="S"),
+                      data = train.batch, method = "glm",
+                      metric = "ROC", trControl = cv.ctrl)
+ summary(glm.tune.2)
+
## 
+## Call:
+## NULL
+## 
+## Deviance Residuals: 
+##    Min      1Q  Median      3Q     Max  
+## -2.205  -0.616  -0.421   0.643   2.475  
+## 
+## Coefficients:
+##                          Estimate Std. Error z value Pr(>|z|)    
+## (Intercept)               4.61217    0.47557    9.70  < 2e-16 ***
+## Sexmale                  -2.63740    0.22180  -11.89  < 2e-16 ***
+## ClassSecond              -1.25529    0.30287   -4.14  3.4e-05 ***
+## ClassThird               -2.46079    0.27577   -8.92  < 2e-16 ***
+## Age                      -0.04547    0.00881   -5.16  2.5e-07 ***
+## Family                   -0.20848    0.07231   -2.88   0.0039 ** 
+## `I(Embarked == "S")TRUE` -0.52963    0.22742   -2.33   0.0199 *  
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for binomial family taken to be 1)
+## 
+##     Null deviance: 950.86  on 713  degrees of freedom
+## Residual deviance: 631.60  on 707  degrees of freedom
+## AIC: 645.6
+## 
+## Number of Fisher Scoring iterations: 5
+
# Call:
+# NULL
+# 
+# Deviance Residuals: 
+#     Min       1Q   Median       3Q      Max  
+# -2.2165  -0.5935  -0.4127   0.6230   2.5039  
+# 
+# Coefficients:
+#                           Estimate Std. Error z value Pr(>|z|)    
+# (Intercept)               4.599379   0.488154   9.422  < 2e-16 ***
+# Sexmale                  -2.745061   0.223226 -12.297  < 2e-16 ***
+# ClassSecond              -1.196456   0.304837  -3.925 8.68e-05 ***
+# ClassThird               -2.395542   0.275479  -8.696  < 2e-16 ***
+# Age                      -0.044652   0.009076  -4.920 8.66e-07 ***
+# Family                   -0.243642   0.077633  -3.138   0.0017 ** 
+# `I(Embarked == "S")TRUE` -0.539793   0.227551  -2.372   0.0177 *  
+# ---
+# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+# 
+# (Dispersion parameter for binomial family taken to be 1)
+# 
+#     Null deviance: 950.86  on 713  degrees of freedom
+# Residual deviance: 621.53  on 707  degrees of freedom
+# AIC: 635.53
+# 
+# Number of Fisher Scoring iterations: 5
+

As I discussed earlier, the Title feature addresses more than one theme. For that reason, I believe it has real potential to improve this model. Besides, I put a good chunk of effort into it, so why not give it a go?

+
 set.seed(35)
+ glm.tune.3 <- train(Fate ~ Sex + Class + Title + Age 
+                      + Family + I(Embarked=="S"), 
+                      data = train.batch, method = "glm",
+                      metric = "ROC", trControl = cv.ctrl)
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
+## ifelse(type == : prediction from a rank-deficient fit may be misleading
+
 summary(glm.tune.3)
+
## 
+## Call:
+## NULL
+## 
+## Deviance Residuals: 
+##    Min      1Q  Median      3Q     Max  
+## -2.348  -0.577  -0.388   0.573   2.495  
+## 
+## Coefficients: (12 not defined because of singularities)
+##                          Estimate Std. Error z value Pr(>|z|)    
+## (Intercept)               19.9897   623.5558    0.03  0.97443    
+## Sexmale                  -15.2853   623.5554   -0.02  0.98044    
+## ClassSecond               -1.4387     0.3313   -4.34  1.4e-05 ***
+## ClassThird                -2.5456     0.2964   -8.59  < 2e-16 ***
+## TitleCol                       NA         NA      NA       NA    
+## TitleMajor                     NA         NA      NA       NA    
+## TitleSir                  -3.1380     0.8273   -3.79  0.00015 ***
+## TitleLady                      NA         NA      NA       NA    
+## TitleRev                       NA         NA      NA       NA    
+## TitleDr                        NA         NA      NA       NA    
+## TitleDon                       NA         NA      NA       NA    
+## TitleJonkheer                  NA         NA      NA       NA    
+## `Titlethe Countess`            NA         NA      NA       NA    
+## TitleMrs                 -14.5311   623.5557   -0.02  0.98141    
+## TitleMs                        NA         NA      NA       NA    
+## TitleMr                   -3.0466     0.5716   -5.33  9.8e-08 ***
+## TitleMme                       NA         NA      NA       NA    
+## TitleMlle                      NA         NA      NA       NA    
+## TitleMiss                -15.5786   623.5557   -0.02  0.98007    
+## TitleMaster                    NA         NA      NA       NA    
+## Age                       -0.0370     0.0106   -3.47  0.00052 ***
+## Family                    -0.4303     0.0892   -4.83  1.4e-06 ***
+## `I(Embarked == "S")TRUE`  -0.5542     0.2351   -2.36  0.01840 *  
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for binomial family taken to be 1)
+## 
+##     Null deviance: 950.86  on 713  degrees of freedom
+## Residual deviance: 588.32  on 703  degrees of freedom
+## AIC: 610.3
+## 
+## Number of Fisher Scoring iterations: 13
+
# Coefficients:
+#                           Estimate Std. Error z value Pr(>|z|)    
+# (Intercept)               19.98972  623.55577   0.032 0.974426    
+# Sexmale                  -15.28525  623.55543  -0.025 0.980443    
+# TitleMiss                -15.57857  623.55565  -0.025 0.980068    
+# TitleMr                   -3.04656    0.57156  -5.330 9.81e-08 ***
+# TitleMrs                 -14.53106  623.55571  -0.023 0.981408    
+# TitleNoble                -3.13799    0.82733  -3.793 0.000149 ***
+# Age                       -0.03695    0.01065  -3.471 0.000518 ***
+# Family                    -0.43025    0.08915  -4.826 1.39e-06 ***
+# ClassSecond               -1.43867    0.33135  -4.342 1.41e-05 ***
+# ClassThird                -2.54556    0.29641  -8.588  < 2e-16 ***
+# `I(Embarked == "S")TRUE`  -0.55423    0.23509  -2.358 0.018395 *  
+# ---
+# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+# 
+# (Dispersion parameter for binomial family taken to be 1)
+# 
+#     Null deviance: 950.86  on 713  degrees of freedom
+# Residual deviance: 588.32  on 703  degrees of freedom
+# AIC: 610.32
+# 
+# Number of Fisher Scoring iterations: 13
+

Nice! That gave us our first material decline in the residual deviance. Since the Title feature seems to give us everything that Age did (and more), I’m going to drop Age from the formula. I will also collapse the titles “Miss” and “Mrs” and leave a duo of Title-related factors which should represent the “women and children first” theme well.

+
set.seed(35)
+# subst Sir for Noble 
+glm.tune.4 <- train(Fate ~ Class + I(Title=="Mr") + I(Title=="Sir") 
+                      + Age + Family + I(Embarked=="S"), 
+                      data = train.batch, method = "glm",
+                      metric = "ROC", trControl = cv.ctrl)
+summary(glm.tune.4)
+
## 
+## Call:
+## NULL
+## 
+## Deviance Residuals: 
+##    Min      1Q  Median      3Q     Max  
+## -2.499  -0.596  -0.385   0.596   2.435  
+## 
+## Coefficients:
+##                          Estimate Std. Error z value Pr(>|z|)    
+## (Intercept)               4.32392    0.46933    9.21  < 2e-16 ***
+## ClassSecond              -1.31283    0.31657   -4.15  3.4e-05 ***
+## ClassThird               -2.46318    0.28933   -8.51  < 2e-16 ***
+## `I(Title == "Mr")TRUE`   -3.16131    0.25454  -12.42  < 2e-16 ***
+## `I(Title == "Sir")TRUE`  -3.03015    0.56164   -5.40  6.8e-08 ***
+## Age                      -0.02491    0.00891   -2.80   0.0052 ** 
+## Family                   -0.38013    0.07912   -4.80  1.6e-06 ***
+## `I(Embarked == "S")TRUE` -0.51538    0.23302   -2.21   0.0270 *  
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for binomial family taken to be 1)
+## 
+##     Null deviance: 950.86  on 713  degrees of freedom
+## Residual deviance: 599.63  on 706  degrees of freedom
+## AIC: 615.6
+## 
+## Number of Fisher Scoring iterations: 5
+
# Call:
+# NULL
+# 
+# Deviance Residuals: 
+#     Min       1Q   Median       3Q      Max  
+# -2.4813  -0.5646  -0.3840   0.6026   2.4523  
+# 
+# Coefficients:
+#                            Estimate Std. Error z value Pr(>|z|)    
+# (Intercept)                4.348090   0.479097   9.076  < 2e-16 ***
+# ClassSecond               -1.320352   0.318842  -4.141 3.46e-05 ***
+# ClassThird                -2.372211   0.284693  -8.333  < 2e-16 ***
+# `I(Title == "Mr")TRUE`    -3.238061   0.253776 -12.760  < 2e-16 ***
+# `I(Title == "Noble")TRUE` -2.616810   0.619869  -4.222 2.43e-05 ***
+# Age                       -0.026335   0.009127  -2.885  0.00391 ** 
+# Family                    -0.434170   0.084179  -5.158 2.50e-07 ***
+# `I(Embarked == "S")TRUE`  -0.508882   0.232502  -2.189  0.02862 *  
+# ---
+# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+# 
+# (Dispersion parameter for binomial family taken to be 1)
+# 
+#     Null deviance: 950.86  on 713  degrees of freedom
+# Residual deviance: 593.28  on 706  degrees of freedom
+# AIC: 609.28
+# 
+# Number of Fisher Scoring iterations: 5
+

Remember that there were a lot of male passengers in third class. Given the “women and children first” policy already mentioned plus reports that the Titanic’s internal layout was confusing (I recall reading that one crew member claimed it took him two weeks to become comfortable with finding his way around the ship), to say that “grown men in the lower decks had it tough” is such a gross understatement that I hesitated to put it in type. A feature reflecting those third-class men might make a further dent in that residual deviance. Indeed, it does…

+
 set.seed(35)
+ glm.tune.5 <- train(Fate ~ Class + I(Title=="Mr") + I(Title=="Sir") 
+                      + Age + Family + I(Embarked=="S") 
+                      + I(Title=="Mr"&Class=="Third"), 
+                      data = train.batch, 
+                      method = "glm", metric = "ROC", 
+                      trControl = cv.ctrl)
+ 
+summary(glm.tune.5)
+
## 
+## Call:
+## NULL
+## 
+## Deviance Residuals: 
+##    Min      1Q  Median      3Q     Max  
+## -3.055  -0.571  -0.421   0.358   2.334  
+## 
+## Coefficients:
+##                                           Estimate Std. Error z value
+## (Intercept)                                6.12777    0.71231    8.60
+## ClassSecond                               -1.95084    0.43127   -4.52
+## ClassThird                                -4.54488    0.60939   -7.46
+## `I(Title == "Mr")TRUE`                    -4.98118    0.54835   -9.08
+## `I(Title == "Sir")TRUE`                   -4.48154    0.72792   -6.16
+## Age                                       -0.02944    0.00975   -3.02
+## Family                                    -0.35226    0.08273   -4.26
+## `I(Embarked == "S")TRUE`                  -0.55795    0.23065   -2.42
+## `I(Title == "Mr" & Class == "Third")TRUE`  2.72557    0.60693    4.49
+##                                           Pr(>|z|)    
+## (Intercept)                                < 2e-16 ***
+## ClassSecond                                6.1e-06 ***
+## ClassThird                                 8.8e-14 ***
+## `I(Title == "Mr")TRUE`                     < 2e-16 ***
+## `I(Title == "Sir")TRUE`                    7.4e-10 ***
+## Age                                         0.0025 ** 
+## Family                                     2.1e-05 ***
+## `I(Embarked == "S")TRUE`                    0.0156 *  
+## `I(Title == "Mr" & Class == "Third")TRUE`  7.1e-06 ***
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for binomial family taken to be 1)
+## 
+##     Null deviance: 950.86  on 713  degrees of freedom
+## Residual deviance: 573.80  on 705  degrees of freedom
+## AIC: 591.8
+## 
+## Number of Fisher Scoring iterations: 6
+
# 
+# Call:
+# NULL
+# 
+# Deviance Residuals: 
+#     Min       1Q   Median       3Q      Max  
+# -3.0703  -0.5859  -0.3947   0.3725   2.4811  
+# 
+# Coefficients:
+#                                  Estimate Std. Error z value Pr(>|z|)    
+# (Intercept)                       6.33818    0.72561   8.735  < 2e-16 ***
+# ClassSecond                      -2.19222    0.48004  -4.567 4.95e-06 ***
+# ClassThird                       -4.65442    0.60918  -7.641 2.16e-14 ***
+# `I(Title == "Mr")TRUE`           -5.20467    0.54771  -9.503  < 2e-16 ***
+# `I(Title == "Noble")TRUE`        -4.07411    0.77141  -5.281 1.28e-07 ***
+# Age                              -0.03268    0.01023  -3.194  0.00140 ** 
+# Family                           -0.40503    0.08971  -4.515 6.34e-06 ***
+# `I(Embarked == "S")TRUE`         -0.59956    0.23065  -2.599  0.00934 ** 
+# `I(Title == "Mr"                  3.00867    0.60761   4.952 7.36e-07 ***
+#    & Class == "Third")TRUE`
+# ---
+# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+# 
+# (Dispersion parameter for binomial family taken to be 1)
+# 
+#     Null deviance: 950.86  on 713  degrees of freedom
+# Residual deviance: 561.11  on 705  degrees of freedom
+# AIC: 579.11
+# 
+# Number of Fisher Scoring iterations: 6
+

Unfortunately, the other features did not contribute to further deviance compression. Taking a different approach to representing the “women and children first” policy didn’t bear fruit (removing the title references in the formula and adding Boat.dibs produced a residual deviance of 565 – no better than what we already have, using a new feature which some may find confusing). Given that Deck and Side combined (a) shaved just a few points off of the deviance, and (b) were derived from such a small subset of the training data, I decided to withdraw them from consideration.

+
+

Other Models

+

Logistic regression is certainly not the only binary classification model available. There are plenty more — perhaps too many for some data scientists to digest. For purposes of illustration, I’ll simply take the logistic regression model formula from glm.tune.1 and pass it through train() for each of three other model types, with one new twist: tuning variables specific to each model.

+

First up is boosting. I can instruct train to fit a stochastic boosting model for the binary response Fate using the ada package and a range of values for each of three tuning parameters. Concretely, when fitting a model using train with method=”ada”, one has three levers to tweak: iter (number of boosting iterations, default=50), maxdepth (depth of trees), and nu (shrinkage parameter, default=1). Create a data frame with these three variables as column names and one row per tuning variable combination, and you’re good to go. Here is just one example of a tuning grid for ada:

+
## note the dot preceding each variable
+ada.grid <- expand.grid(.iter = c(50, 100),
+                        .maxdepth = c(4, 8),
+                        .nu = c(0.1, 1))
+

Specify method=”ada” and tuneGrid=ada.grid in train, and away we go…

+
set.seed(35)
+
+ada.tune <- train(Fate ~ Sex + Class + Age + Family + Embarked, 
+                  data = train.batch,
+                  method = "ada",
+                  metric = "ROC",
+                  tuneGrid = ada.grid,
+                  trControl = cv.ctrl)
+

The model output shows that, given the train.batch data and 8 combinations of tuning variables tested, the optimal model had an ROC of 0.871. The tuning parameter values used to build that model were iter = 100, maxdepth = 4, and nu = 0.1.

+
ada.tune
+
## Boosted Classification Trees 
+## 
+## 714 samples
+##  11 predictor
+##   2 classes: 'Perished', 'Survived' 
+## 
+## No pre-processing
+## Resampling: Cross-Validated (10 fold, repeated 3 times) 
+## Summary of sample sizes: 643, 643, 643, 642, 643, 643, ... 
+## Resampling results across tuning parameters:
+## 
+##   nu   maxdepth  iter  ROC   Sens  Spec  ROC SD  Sens SD  Spec SD
+##   0.1  4          50   0.87  0.92  0.66  0.051   0.041    0.101  
+##   0.1  4         100   0.87  0.90  0.69  0.052   0.045    0.102  
+##   0.1  8          50   0.86  0.90  0.69  0.051   0.046    0.094  
+##   0.1  8         100   0.85  0.89  0.70  0.049   0.051    0.087  
+##   1.0  4          50   0.86  0.88  0.70  0.048   0.044    0.084  
+##   1.0  4         100   0.85  0.88  0.70  0.049   0.050    0.082  
+##   1.0  8          50   0.82  0.87  0.72  0.051   0.053    0.098  
+##   1.0  8         100   0.82  0.87  0.72  0.054   0.055    0.093  
+## 
+## ROC was used to select the optimal model using  the largest value.
+## The final values used for the model were iter = 50, maxdepth = 4 and nu
+##  = 0.1.
+
# 714 samples
+#  11 predictors
+#   2 classes: 'Perished', 'Survived' 
+# 
+# No pre-processing
+# Resampling: Cross-Validation (10 fold, repeated 3 times) 
+# 
+# Summary of sample sizes: 642, 643, 643, 642, 642, 643, ... 
+# 
+# Resampling results across tuning parameters:
+# 
+#   iter  maxdepth  nu   ROC    Sens   Spec   ROC SD  Sens SD  Spec SD
+#   50    4         0.1  0.869  0.931  0.666  0.061   0.046    0.0784 
+#   50    4         1    0.855  0.907  0.703  0.0572  0.046    0.09   
+#   50    8         0.1  0.864  0.919  0.685  0.0571  0.0457   0.085  
+#   50    8         1    0.846  0.88   0.716  0.0559  0.0482   0.0944 
+#   100   4         0.1  0.871  0.923  0.679  0.0609  0.0449   0.0829 
+#   100   4         1    0.855  0.896  0.707  0.0559  0.0552   0.0884 
+#   100   8         0.1  0.867  0.919  0.7    0.0597  0.0429   0.0767 
+#   100   8         1    0.837  0.879  0.709  0.0646  0.0561   0.0908 
+# 
+# ROC was used to select the optimal model using  the largest value.
+# The final values used for the model were iter = 100, maxdepth = 4 and nu = 0.1.
+
+# doesnt work plot(ada.tune)     ## ada accuracy profile
+

alt text

+

Time to give the popular Random Forest (RF) model a shot at the Titanic challenge. The number of randomly pre-selected predictor variables for each node, designated mtry, is the sole parameter available for tuning an RF with train. Since the number of features is so small, there really isn’t much scope for tuning mtry in this case. Nevertheless, I’ll demonstrate here how it can be done. Let’s have mtry=2 and mtry=3 duke it out over the Titanic data.

+
library(randomForest)
+
## Warning: package 'randomForest' was built under R version 3.2.3
+
## randomForest 4.6-12
+## Type rfNews() to see new features/changes/bug fixes.
+## 
+## Attaching package: 'randomForest'
+## 
+## The following object is masked from 'package:Hmisc':
+## 
+##     combine
+## 
+## The following object is masked from 'package:ggplot2':
+## 
+##     margin
+
rf.grid <- data.frame(.mtry = c(2, 3))
+set.seed(35)
+rf.tune <- train(Fate ~ Sex + Class + Age + Family + Embarked, 
+                 data = train.batch,
+                 method = "rf",
+                 metric = "ROC",
+                 tuneGrid = rf.grid,
+                 trControl = cv.ctrl)
+

Strobl et al suggested setting mtry at the square root of the number of variables. In this case, that would be mtry = 2, which did produce the better RF model.

+
rf.tune
+
## Random Forest 
+## 
+## 714 samples
+##  11 predictor
+##   2 classes: 'Perished', 'Survived' 
+## 
+## No pre-processing
+## Resampling: Cross-Validated (10 fold, repeated 3 times) 
+## Summary of sample sizes: 643, 643, 643, 642, 643, 643, ... 
+## Resampling results across tuning parameters:
+## 
+##   mtry  ROC   Sens  Spec  ROC SD  Sens SD  Spec SD
+##   2     0.86  0.94  0.63  0.048   0.040    0.092  
+##   3     0.86  0.93  0.66  0.048   0.044    0.089  
+## 
+## ROC was used to select the optimal model using  the largest value.
+## The final value used for the model was mtry = 2.
+
# 714 samples
+#  11 predictors
+#   2 classes: 'Perished', 'Survived' 
+# 
+# No pre-processing
+# Resampling: Cross-Validation (10 fold, repeated 3 times) 
+# 
+# Summary of sample sizes: 643, 643, 643, 642, 643, 643, ... 
+# 
+# Resampling results across tuning parameters:
+# 
+#   mtry  ROC    Sens   Spec   ROC SD  Sens SD  Spec SD
+#   2     0.866  0.952  0.633  0.052   0.0288   0.0945 
+#   3     0.861  0.934  0.642  0.0514  0.0345   0.0916 
+# 
+# ROC was used to select the optimal model using  the largest value.
+# The final value used for the model was mtry = 2. 
+

And finally, we’ll fit a support vector machine (SVM) model to the Titanic data. There are two functions which can be tuned for SVM using train. The default value for one of them — sigest — produces good results on most occasions. The default grid of cost parameter C is 0.25, 0.5, and 1. If we set train argument tuneLength = 9, the grid expands to c(0.25, 0.5, 1, 2, 4, 8, 16, 32, 64). As SVM is considered sensitive to the scale and magnitude of the presented features, I’ll use the preProcess argument to instruct train to make arrangements for normalizing the data within resampling loops.

+
set.seed(35)
+library(kernlab)
+
## Warning: package 'kernlab' was built under R version 3.2.3
+
## 
+## Attaching package: 'kernlab'
+## 
+## The following object is masked from 'package:ggplot2':
+## 
+##     alpha
+
svm.tune <- train(Fate ~ Sex + Class + Age + Family + Embarked, 
+                  data = train.batch,
+                  method = "svmRadial",
+                  tuneLength = 9,
+                  preProcess = c("center", "scale"),
+                  metric = "ROC",
+                  trControl = cv.ctrl)
+

You may have noticed that the same random number seed was set prior to fitting each model. This ensures that the same resampling sets are used for each model, enabling an “apple-to-apples” comparison of the resampling profiles between models during model evaluation.

+
svm.tune
+
## Support Vector Machines with Radial Basis Function Kernel
+
## Loading required package: kernlab
+
## Warning: package 'kernlab' was built under R version 3.2.3
+
## 
+## Attaching package: 'kernlab'
+## 
+## The following object is masked from 'package:ggplot2':
+## 
+##     alpha
+
## 714 samples
+##  11 predictor
+##   2 classes: 'Perished', 'Survived' 
+## 
+## Pre-processing: centered (7), scaled (7) 
+## Resampling: Cross-Validated (10 fold, repeated 3 times) 
+## Summary of sample sizes: 643, 643, 643, 642, 643, 643, ... 
+## Resampling results across tuning parameters:
+## 
+##   C      ROC   Sens  Spec  ROC SD  Sens SD  Spec SD
+##    0.25  0.84  0.93  0.61  0.050   0.030    0.095  
+##    0.50  0.83  0.93  0.61  0.052   0.033    0.094  
+##    1.00  0.83  0.92  0.62  0.053   0.035    0.093  
+##    2.00  0.82  0.92  0.64  0.051   0.037    0.094  
+##    4.00  0.81  0.91  0.64  0.058   0.042    0.095  
+##    8.00  0.80  0.90  0.63  0.062   0.044    0.102  
+##   16.00  0.80  0.90  0.63  0.064   0.039    0.106  
+##   32.00  0.78  0.90  0.63  0.060   0.043    0.106  
+##   64.00  0.78  0.91  0.62  0.059   0.039    0.109  
+## 
+## Tuning parameter 'sigma' was held constant at a value of 0.29
+## ROC was used to select the optimal model using  the largest value.
+## The final values used for the model were sigma = 0.29 and C = 0.25.
+
# 714 samples
+#  11 predictors
+#   2 classes: 'Perished', 'Survived' 
+# 
+# Pre-processing: centered, scaled 
+# Resampling: Cross-Validation (10 fold, repeated 3 times) 
+# 
+# Summary of sample sizes: 643, 643, 643, 642, 643, 643, ... 
+# 
+# Resampling results across tuning parameters:
+# 
+#   C     ROC    Sens   Spec   ROC SD  Sens SD  Spec SD
+#   0.25  0.832  0.951  0.628  0.0609  0.0274   0.0948 
+#   0.5   0.833  0.947  0.629  0.0627  0.0282   0.0966 
+#   1     0.833  0.944  0.639  0.0589  0.032    0.0904 
+#   2     0.835  0.936  0.645  0.0623  0.0398   0.0892 
+#   4     0.826  0.933  0.644  0.0615  0.0426   0.0935 
+#   8     0.824  0.932  0.64   0.0568  0.0418   0.0845 
+#   16    0.82   0.923  0.634  0.0553  0.0441   0.0867 
+#   32    0.803  0.915  0.633  0.0617  0.0386   0.0876 
+#   64    0.788  0.906  0.626  0.056   0.0367   0.0855 
+# 
+# Tuning parameter 'sigma' was held constant at a value of 0.2204311
+# ROC was used to select the optimal model using  the largest value.
+# The final values used for the model were C = 2 and sigma = 0.22. 
+

Although the model output above does display ROC by cost parameter value, the following graph makes it abundantly clear that the ROC starts dropping at C=4. alt text

+
+
+

Model Evaluation

+

With all four models in hand, I can begin to evaluate their performance by whipping together some cross-tabulations of the observed and predicted Fate for the passengers in the test.batch data. caret makes this easy with the confusionMatrix function.

+
## Logistic regression model
+glm.pred <- predict(glm.tune.5, test.batch)
+confusionMatrix(glm.pred, test.batch$Fate)
+
## Confusion Matrix and Statistics
+## 
+##           Reference
+## Prediction Perished Survived
+##   Perished       97       12
+##   Survived       12       56
+##                                         
+##                Accuracy : 0.864         
+##                  95% CI : (0.805, 0.911)
+##     No Information Rate : 0.616         
+##     P-Value [Acc > NIR] : 2.44e-13      
+##                                         
+##                   Kappa : 0.713         
+##  Mcnemar's Test P-Value : 1             
+##                                         
+##             Sensitivity : 0.890         
+##             Specificity : 0.824         
+##          Pos Pred Value : 0.890         
+##          Neg Pred Value : 0.824         
+##              Prevalence : 0.616         
+##          Detection Rate : 0.548         
+##    Detection Prevalence : 0.616         
+##       Balanced Accuracy : 0.857         
+##                                         
+##        'Positive' Class : Perished      
+## 
+
# Confusion Matrix and Statistics
+# 
+#           Reference
+# Prediction Perished Survived
+#   Perished       97       19
+#   Survived       12       49
+#                                           
+#                Accuracy : 0.8249          
+#                  95% CI : (0.7607, 0.8778)
+#     No Information Rate : 0.6158          
+#     P-Value [Acc > NIR] : 1.304e-09       
+#                                           
+#                   Kappa : 0.6225          
+#  Mcnemar's Test P-Value : 0.2812          
+#                                           
+#             Sensitivity : 0.8899          
+#             Specificity : 0.7206          
+#          Pos Pred Value : 0.8362          
+#          Neg Pred Value : 0.8033          
+#              Prevalence : 0.6158          
+#          Detection Rate : 0.5480          
+#    Detection Prevalence : 0.6554
+
## Boosted model
+ada.pred <- predict(ada.tune, test.batch)
+confusionMatrix(ada.pred, test.batch$Fate)
+
## Confusion Matrix and Statistics
+## 
+##           Reference
+## Prediction Perished Survived
+##   Perished      100       17
+##   Survived        9       51
+##                                         
+##                Accuracy : 0.853         
+##                  95% CI : (0.792, 0.902)
+##     No Information Rate : 0.616         
+##     P-Value [Acc > NIR] : 3.51e-12      
+##                                         
+##                   Kappa : 0.683         
+##  Mcnemar's Test P-Value : 0.17          
+##                                         
+##             Sensitivity : 0.917         
+##             Specificity : 0.750         
+##          Pos Pred Value : 0.855         
+##          Neg Pred Value : 0.850         
+##              Prevalence : 0.616         
+##          Detection Rate : 0.565         
+##    Detection Prevalence : 0.661         
+##       Balanced Accuracy : 0.834         
+##                                         
+##        'Positive' Class : Perished      
+## 
+
# Confusion Matrix and Statistics
+# 
+#           Reference
+# Prediction Perished Survived
+#   Perished      100       23
+#   Survived        9       45
+#                                           
+#                Accuracy : 0.8192          
+#                  95% CI : (0.7545, 0.8729)
+#     No Information Rate : 0.6158          
+#     P-Value [Acc > NIR] : 3.784e-09       
+#                                           
+#                   Kappa : 0.6025          
+#  Mcnemar's Test P-Value : 0.02156         
+#                                           
+#             Sensitivity : 0.9174          
+#             Specificity : 0.6618          
+#          Pos Pred Value : 0.8130          
+#          Neg Pred Value : 0.8333          
+#              Prevalence : 0.6158          
+#          Detection Rate : 0.5650          
+#    Detection Prevalence : 0.6949
+
## Random Forest model
+rf.pred <- predict(rf.tune, test.batch)
+
## Loading required package: randomForest
+
## Warning: package 'randomForest' was built under R version 3.2.3
+
## randomForest 4.6-12
+## Type rfNews() to see new features/changes/bug fixes.
+## 
+## Attaching package: 'randomForest'
+## 
+## The following object is masked from 'package:Hmisc':
+## 
+##     combine
+## 
+## The following object is masked from 'package:ggplot2':
+## 
+##     margin
+
confusionMatrix(rf.pred, test.batch$Fate)
+
## Confusion Matrix and Statistics
+## 
+##           Reference
+## Prediction Perished Survived
+##   Perished      102       18
+##   Survived        7       50
+##                                         
+##                Accuracy : 0.859         
+##                  95% CI : (0.799, 0.906)
+##     No Information Rate : 0.616         
+##     P-Value [Acc > NIR] : 9.46e-13      
+##                                         
+##                   Kappa : 0.692         
+##  Mcnemar's Test P-Value : 0.0455        
+##                                         
+##             Sensitivity : 0.936         
+##             Specificity : 0.735         
+##          Pos Pred Value : 0.850         
+##          Neg Pred Value : 0.877         
+##              Prevalence : 0.616         
+##          Detection Rate : 0.576         
+##    Detection Prevalence : 0.678         
+##       Balanced Accuracy : 0.836         
+##                                         
+##        'Positive' Class : Perished      
+## 
+
# Confusion Matrix and Statistics
+# 
+#           Reference
+# Prediction Perished Survived
+#   Perished      103       27
+#   Survived        6       41
+#                                           
+#                Accuracy : 0.8136          
+#                  95% CI : (0.7483, 0.8681)
+#     No Information Rate : 0.6158          
+#     P-Value [Acc > NIR] : 1.058e-08       
+#                                           
+#                   Kappa : 0.5817          
+#  Mcnemar's Test P-Value : 0.0004985       
+#                                           
+#             Sensitivity : 0.9450          
+#             Specificity : 0.6029          
+#          Pos Pred Value : 0.7923          
+#          Neg Pred Value : 0.8723          
+#              Prevalence : 0.6158          
+#          Detection Rate : 0.5819          
+#    Detection Prevalence : 0.7345 
+
## SVM model 
+# Score the tuned SVM on the holdout batch, then summarize accuracy,
+# sensitivity, specificity, etc. with caret::confusionMatrix.
+svm.pred <- predict(svm.tune, test.batch)
+confusionMatrix(svm.pred, test.batch$Fate)
+
## Confusion Matrix and Statistics
+## 
+##           Reference
+## Prediction Perished Survived
+##   Perished       98       16
+##   Survived       11       52
+##                                         
+##                Accuracy : 0.847         
+##                  95% CI : (0.786, 0.897)
+##     No Information Rate : 0.616         
+##     P-Value [Acc > NIR] : 1.24e-11      
+##                                         
+##                   Kappa : 0.673         
+##  Mcnemar's Test P-Value : 0.441         
+##                                         
+##             Sensitivity : 0.899         
+##             Specificity : 0.765         
+##          Pos Pred Value : 0.860         
+##          Neg Pred Value : 0.825         
+##              Prevalence : 0.616         
+##          Detection Rate : 0.554         
+##    Detection Prevalence : 0.644         
+##       Balanced Accuracy : 0.832         
+##                                         
+##        'Positive' Class : Perished      
+## 
+
# Confusion Matrix and Statistics
+# 
+#           Reference
+# Prediction Perished Survived
+#   Perished      101       27
+#   Survived        8       41
+#                                           
+#                Accuracy : 0.8023          
+#                  95% CI : (0.7359, 0.8582)
+#     No Information Rate : 0.6158          
+#     P-Value [Acc > NIR] : 7.432e-08       
+#                                           
+#                   Kappa : 0.5589          
+#  Mcnemar's Test P-Value : 0.002346        
+#                                           
+#             Sensitivity : 0.9266          
+#             Specificity : 0.6029          
+#          Pos Pred Value : 0.7891          
+#          Neg Pred Value : 0.8367          
+#              Prevalence : 0.6158          
+#          Detection Rate : 0.5706          
+#    Detection Prevalence : 0.7232
+

(Perhaps now you’ve come to appreciate why I revalued the Fate feature earlier!) While there are no convincing conclusions to be drawn from the confusion matrices embedded within the outputs above, the logistic regression model we put together earlier appears to do the best job of selecting the survivors among the passengers in the test.batch. The Random Forest model, on the other hand, seems to have a slight edge on predicting those who perished.

+

We can also calculate, using each of the four fitted models, the predicted probabilities for the test.batch, and use those probabilities to plot the ROC (Receiver Operating Characteristic) curves.

+
## Logistic regression model (BLACK curve)
+
+# Class probabilities (type = "prob") from the tuned logistic model; the
+# Survived column is fed to pROC::roc() to build the ROC object for the
+# holdout set.
+glm.probs <- predict(glm.tune.5, test.batch, type = "prob")
+glm.ROC <- roc(response = test.batch$Fate,
+                predictor = glm.probs$Survived,
+                levels = levels(test.batch$Fate))
+
+# type="S" draws the ROC as a step curve; the other models' curves are
+# overlaid on this same plot with add=TRUE below.
+plot(glm.ROC, type="S")   
+
## 
+## Call:
+## roc.default(response = test.batch$Fate, predictor = glm.probs$Survived,     levels = levels(test.batch$Fate))
+## 
+## Data: glm.probs$Survived in 109 controls (test.batch$Fate Perished) < 68 cases (test.batch$Fate Survived).
+## Area under the curve: 0.89
+
## Area under the curve: 0.8609 
+#```
+
+#```{r53}
+## Boosted model (GREEN curve)
+# Same recipe as the logistic curve: holdout class probabilities into
+# pROC::roc(), overlaid in green on the existing ROC plot.
+ada.probs <- predict(ada.tune, test.batch, type = "prob")
+ada.ROC <- roc(response = test.batch$Fate,
+            predictor = ada.probs$Survived,
+            levels = levels(test.batch$Fate))
+plot(ada.ROC, add=TRUE, col="green")    
+
## 
+## Call:
+## roc.default(response = test.batch$Fate, predictor = ada.probs$Survived,     levels = levels(test.batch$Fate))
+## 
+## Data: ada.probs$Survived in 109 controls (test.batch$Fate Perished) < 68 cases (test.batch$Fate Survived).
+## Area under the curve: 0.88
+
## Area under the curve: 0.8759
+#```
+#```{r}
+## Random Forest model (RED curve)
+# ROC built from the tuned Random Forest's class probabilities on
+# test.batch, added in red to the shared plot.
+rf.probs <- predict(rf.tune, test.batch, type = "prob")
+rf.ROC <- roc(response = test.batch$Fate,
+           predictor = rf.probs$Survived,
+           levels = levels(test.batch$Fate))
+plot(rf.ROC, add=TRUE, col="red") 
+
## 
+## Call:
+## roc.default(response = test.batch$Fate, predictor = rf.probs$Survived,     levels = levels(test.batch$Fate))
+## 
+## Data: rf.probs$Survived in 109 controls (test.batch$Fate Perished) < 68 cases (test.batch$Fate Survived).
+## Area under the curve: 0.89
+
## Area under the curve: 0.8713
+#```
+
+#```{r}
+## SVM model (BLUE curve)
+# SVM ROC, overlaid in blue. NOTE(review): predict(..., type = "prob")
+# presumably requires the SVM to have been trained with class probabilities
+# enabled (caret classProbs = TRUE) — not visible in this chunk; confirm
+# against the trainControl used earlier.
+svm.probs <- predict(svm.tune, test.batch, type = "prob")
+svm.ROC <- roc(response = test.batch$Fate,
+            predictor = svm.probs$Survived,
+            levels = levels(test.batch$Fate))
+plot(svm.ROC, add=TRUE, col="blue")
+

+
## 
+## Call:
+## roc.default(response = test.batch$Fate, predictor = svm.probs$Survived,     levels = levels(test.batch$Fate))
+## 
+## Data: svm.probs$Survived in 109 controls (test.batch$Fate Perished) < 68 cases (test.batch$Fate Survived).
+## Area under the curve: 0.88
+
## Area under the curve: 0.8077
+

*(figure: ROC curves for the four models — logistic in black, boosted in green, Random Forest in red, SVM in blue)*

+

The following R script uses caret function resamples to collect the resampling results, then calls the dotplot function to create a visualization of the resampling distributions. I’m typically not one for leaning on a single metric for important decisions, but if you have been looking for that one graph which sums up the performance of the four models, this is it.

+
# Collect the cross-validation resampling results of all four caret models
# into one object so their ROC distributions can be compared side by side
# on a lattice dotplot.
cv.values <- resamples(list(Logit = glm.tune.5, Ada = ada.tune, 
+                            RF = rf.tune, SVM = svm.tune))
+dotplot(cv.values, metric = "ROC")
+

*(figure: dotplot comparing the resampling ROC distributions of the four models)*

+

The next graph (my last, scout’s honor) compares the four models on the basis of ROC, sensitivity, and specificity. Here, sensitivity (“Sens” on the graph) is the probability that a model will predict a Titanic passenger’s death, given that the passenger actually did perish. Think of sensitivity in this case as the true perished rate. Specificity (“Spec”), on the other hand, is the probability that a model will predict survival, given that the passenger actually did survive. Simply put, all four models were better at predicting passenger fatalities than survivals, and none of them are significantly better or worse than the other three. Of the four, if I had to pick one, I’d probably put my money on the logistic regression model. *(figure: ROC, sensitivity, and specificity comparison of the four models)*

+

Let me reiterate the point I made in the disclaimer, way up at the top of this tl;dr page: This journey, paved with text and graphs, was never intended to reveal a path to discovery of the best model for predicting the fate of the passengers referenced in the Titanic data set. I sought only to demonstrate use of a subset of the tools – methods and software (R in this case) – a data scientist can employ in pursuit of a binary classification model.

+
+
+

Cast Your Votes

+

Given everything we’ve been through here, it would be a shame if we didn’t submit at least one of the four models to the Titanic competition at Kaggle. Here is a script which munges the data Kaggle provided in their test.csv file, uses that data and the logistic regression model glm.tune.5 to predict the survival (or not) of passengers listed in the test file, links the predictions to the PassengerId in a data frame, and writes those results to a submission-ready csv file.

+
# get titles
+# Apply the same munging used on the training data to Kaggle's test set
+# (df.infer) before scoring. getTitle / changeTitles / imputeMedian /
+# featureEngrg are helpers defined earlier in the analysis (not shown here).
+df.infer$Title <- getTitle(df.infer)
+
+# impute missing Age values
+# "Dona" appears only in the test set; fold it (and "Ms") into "Mrs" so the
+# title levels match those the model was trained on, then fill missing ages
+# with the per-title median.
+df.infer$Title <- changeTitles(df.infer, c("Dona", "Ms"), "Mrs")
+titles.na.test <- c("Master", "Mrs", "Miss", "Mr")
+df.infer$Age <- imputeMedian(df.infer$Age, df.infer$Title, titles.na.test)
+
+# consolidate titles
+
+# same correction for Noble
+
+df.infer$Title <- changeTitles(df.infer, c("Col", "Dr", "Rev"), "Noble")
+df.infer$Title <- changeTitles(df.infer, c("Mlle", "Mme"), "Miss")
+df.infer$Title <- as.factor(df.infer$Title)
+
+# impute missing fares
+# Zero fares are treated as missing, then filled with the median fare of
+# the passenger's class.
+df.infer$Fare[ which( df.infer$Fare == 0)] <- NA
+df.infer$Fare <- imputeMedian(df.infer$Fare, df.infer$Pclass, 
+                                as.numeric(levels(df.infer$Pclass)))
+# add the other features
+df.infer <- featureEngrg(df.infer)
+
## The following `from` values were not present in `x`: 1, 0
+
# data prepped for casting predictions
+# Drop the first element ("Fate", the response) from the training keep-list;
+# the remaining columns are exactly the model's predictors.
+test.keeps <- train.keeps[-1]
+pred.these <- df.infer[test.keeps]
+
+# use the logistic regression model to generate predictions
+Survived <- predict(glm.tune.5, newdata = pred.these)
+
+# try the svm model 
+# Survived <- predict(svm.tune, newdata = pred.these)
+# reformat predictions to 0 or 1 and link to PassengerId in a data frame
+# plyr::revalue maps the factor labels back to Kaggle's expected 0/1 coding.
+Survived <- revalue(Survived, c("Survived" = 1, "Perished" = 0))
+predictions <- as.data.frame(Survived)
+predictions$PassengerId <- df.infer$PassengerId
+
+# write predictions to csv file for submission to Kaggle
+write.csv(predictions[,c("PassengerId", "Survived")], 
+          file="Titanic_predictions.csv", row.names=FALSE, quote=FALSE)
+
+

If you must know, the logistic regression model scored 0.77990 on Kaggle – roughly middle of the pack on the leaderboard (as of late January 2014). I have submitted better scoring models; my best thus far, at 0.79426, trails just 17 percent of the 1150+ participants on the leaderboard.

+

While I’m certain that I could squeeze more out of a model like Random Forest to improve my Kaggle ranking, I see better uses of my time. The correlation between public scores and final scores at Kaggle competitions is historically poor (see this post). Besides, I’d rather devote more time helping people with today’s challenges.

+
+
+ + +
+ + + + + + + + diff --git a/TitanicAnalysis_cache/html/__packages b/TitanicAnalysis_cache/html/__packages new file mode 100644 index 0000000..dac8d47 --- /dev/null +++ b/TitanicAnalysis_cache/html/__packages @@ -0,0 +1,17 @@ +base +Rcpp +Amelia +corrgram +plyr +rpart +ada +lattice +survival +Formula +ggplot2 +Hmisc +stringr +caret +pROC +randomForest +kernlab diff --git a/TitanicAnalysis_cache/html/unnamed-chunk-43_b8774fab43761f790e2c4cdfb596bd15.RData b/TitanicAnalysis_cache/html/unnamed-chunk-43_b8774fab43761f790e2c4cdfb596bd15.RData new file mode 100644 index 0000000..d80c991 Binary files /dev/null and b/TitanicAnalysis_cache/html/unnamed-chunk-43_b8774fab43761f790e2c4cdfb596bd15.RData differ diff --git a/TitanicAnalysis_cache/html/unnamed-chunk-43_b8774fab43761f790e2c4cdfb596bd15.rdb b/TitanicAnalysis_cache/html/unnamed-chunk-43_b8774fab43761f790e2c4cdfb596bd15.rdb new file mode 100644 index 0000000..95176e2 Binary files /dev/null and b/TitanicAnalysis_cache/html/unnamed-chunk-43_b8774fab43761f790e2c4cdfb596bd15.rdb differ diff --git a/TitanicAnalysis_cache/html/unnamed-chunk-43_b8774fab43761f790e2c4cdfb596bd15.rdx b/TitanicAnalysis_cache/html/unnamed-chunk-43_b8774fab43761f790e2c4cdfb596bd15.rdx new file mode 100644 index 0000000..f1a9efe Binary files /dev/null and b/TitanicAnalysis_cache/html/unnamed-chunk-43_b8774fab43761f790e2c4cdfb596bd15.rdx differ diff --git a/TitanicAnalysis_cache/html/unnamed-chunk-44_a71260b362dbf0a43d9c9cd28dea27f2.RData b/TitanicAnalysis_cache/html/unnamed-chunk-44_a71260b362dbf0a43d9c9cd28dea27f2.RData new file mode 100644 index 0000000..b51dcf4 Binary files /dev/null and b/TitanicAnalysis_cache/html/unnamed-chunk-44_a71260b362dbf0a43d9c9cd28dea27f2.RData differ diff --git a/TitanicAnalysis_cache/html/unnamed-chunk-44_a71260b362dbf0a43d9c9cd28dea27f2.rdb b/TitanicAnalysis_cache/html/unnamed-chunk-44_a71260b362dbf0a43d9c9cd28dea27f2.rdb new file mode 100644 index 0000000..9badda0 Binary files /dev/null and 
b/TitanicAnalysis_cache/html/unnamed-chunk-44_a71260b362dbf0a43d9c9cd28dea27f2.rdb differ diff --git a/TitanicAnalysis_cache/html/unnamed-chunk-44_a71260b362dbf0a43d9c9cd28dea27f2.rdx b/TitanicAnalysis_cache/html/unnamed-chunk-44_a71260b362dbf0a43d9c9cd28dea27f2.rdx new file mode 100644 index 0000000..00a39fd Binary files /dev/null and b/TitanicAnalysis_cache/html/unnamed-chunk-44_a71260b362dbf0a43d9c9cd28dea27f2.rdx differ diff --git a/TitanicAnalysis_cache/html/unnamed-chunk-46_2f719016164bf79e1e7932dcbd1cf206.RData b/TitanicAnalysis_cache/html/unnamed-chunk-46_2f719016164bf79e1e7932dcbd1cf206.RData new file mode 100644 index 0000000..6bff5bd Binary files /dev/null and b/TitanicAnalysis_cache/html/unnamed-chunk-46_2f719016164bf79e1e7932dcbd1cf206.RData differ diff --git a/TitanicAnalysis_cache/html/unnamed-chunk-46_2f719016164bf79e1e7932dcbd1cf206.rdb b/TitanicAnalysis_cache/html/unnamed-chunk-46_2f719016164bf79e1e7932dcbd1cf206.rdb new file mode 100644 index 0000000..0490a60 Binary files /dev/null and b/TitanicAnalysis_cache/html/unnamed-chunk-46_2f719016164bf79e1e7932dcbd1cf206.rdb differ diff --git a/TitanicAnalysis_cache/html/unnamed-chunk-46_2f719016164bf79e1e7932dcbd1cf206.rdx b/TitanicAnalysis_cache/html/unnamed-chunk-46_2f719016164bf79e1e7932dcbd1cf206.rdx new file mode 100644 index 0000000..9dc2079 Binary files /dev/null and b/TitanicAnalysis_cache/html/unnamed-chunk-46_2f719016164bf79e1e7932dcbd1cf206.rdx differ diff --git a/TitanicAnalysis_files/figure-html/52-1.png b/TitanicAnalysis_files/figure-html/52-1.png new file mode 100644 index 0000000..31aa577 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/52-1.png differ diff --git a/TitanicAnalysis_files/figure-html/52-2.png b/TitanicAnalysis_files/figure-html/52-2.png new file mode 100644 index 0000000..351054b Binary files /dev/null and b/TitanicAnalysis_files/figure-html/52-2.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-10-1.png 
b/TitanicAnalysis_files/figure-html/unnamed-chunk-10-1.png new file mode 100644 index 0000000..67849af Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-10-1.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-27-1.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-27-1.png new file mode 100644 index 0000000..9e85780 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-27-1.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-4-1.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-4-1.png new file mode 100644 index 0000000..bbaa50b Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-4-1.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-47-1.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-47-1.png new file mode 100644 index 0000000..38733a6 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-47-1.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-5-1.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-1.png new file mode 100644 index 0000000..4ef9652 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-5-2.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-2.png new file mode 100644 index 0000000..5c5fa9d Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-2.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-5-3.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-3.png new file mode 100644 index 0000000..9b70a85 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-3.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-5-4.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-4.png new file mode 100644 index 0000000..3fe9055 Binary files /dev/null and 
b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-4.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-5-5.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-5.png new file mode 100644 index 0000000..e93f957 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-5.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-5-6.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-6.png new file mode 100644 index 0000000..2bcb64d Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-6.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-5-7.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-7.png new file mode 100644 index 0000000..b288c8b Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-7.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-5-8.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-8.png new file mode 100644 index 0000000..82c3d6c Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-5-8.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-52-1.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-52-1.png new file mode 100644 index 0000000..d1ad923 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-52-1.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-6-1.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-6-1.png new file mode 100644 index 0000000..f7b093e Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-7-1.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-7-1.png new file mode 100644 index 0000000..c1de047 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-7-1.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-8-1.png 
b/TitanicAnalysis_files/figure-html/unnamed-chunk-8-1.png new file mode 100644 index 0000000..964f3d1 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-8-1.png differ diff --git a/TitanicAnalysis_files/figure-html/unnamed-chunk-9-1.png b/TitanicAnalysis_files/figure-html/unnamed-chunk-9-1.png new file mode 100644 index 0000000..9a5a4a0 Binary files /dev/null and b/TitanicAnalysis_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/Titanic_predictions.csv b/Titanic_predictions.csv new file mode 100644 index 0000000..8b2e5d1 --- /dev/null +++ b/Titanic_predictions.csv @@ -0,0 +1,419 @@ +PassengerId,Survived +892,0 +893,0 +894,0 +895,0 +896,0 +897,0 +898,1 +899,0 +900,1 +901,0 +902,0 +903,0 +904,1 +905,0 +906,1 +907,1 +908,0 +909,0 +910,0 +911,1 +912,0 +913,1 +914,1 +915,1 +916,1 +917,0 +918,1 +919,0 +920,0 +921,0 +922,0 +923,0 +924,0 +925,0 +926,0 +927,0 +928,1 +929,1 +930,0 +931,0 +932,0 +933,0 +934,0 +935,1 +936,1 +937,0 +938,0 +939,0 +940,1 +941,0 +942,0 +943,0 +944,1 +945,1 +946,0 +947,0 +948,0 +949,0 +950,0 +951,1 +952,0 +953,0 +954,0 +955,1 +956,1 +957,1 +958,1 +959,0 +960,1 +961,1 +962,1 +963,0 +964,1 +965,1 +966,1 +967,1 +968,0 +969,1 +970,0 +971,1 +972,1 +973,0 +974,0 +975,0 +976,0 +977,0 +978,1 +979,1 +980,1 +981,1 +982,1 +983,0 +984,1 +985,0 +986,1 +987,0 +988,1 +989,0 +990,1 +991,0 +992,1 +993,0 +994,0 +995,0 +996,1 +997,0 +998,0 +999,0 +1000,0 +1001,0 +1002,0 +1003,1 +1004,1 +1005,1 +1006,1 +1007,0 +1008,0 +1009,1 +1010,1 +1011,1 +1012,1 +1013,0 +1014,1 +1015,0 +1016,0 +1017,1 +1018,0 +1019,1 +1020,0 +1021,0 +1022,0 +1023,1 +1024,0 +1025,0 +1026,0 +1027,0 +1028,0 +1029,0 +1030,1 +1031,0 +1032,0 +1033,1 +1034,0 +1035,0 +1036,0 +1037,0 +1038,0 +1039,0 +1040,0 +1041,1 +1042,1 +1043,0 +1044,0 +1045,0 +1046,0 +1047,0 +1048,1 +1049,1 +1050,0 +1051,0 +1052,1 +1053,1 +1054,1 +1055,0 +1056,1 +1057,0 +1058,0 +1059,0 +1060,1 +1061,1 +1062,0 +1063,0 +1064,0 +1065,0 +1066,0 +1067,1 +1068,1 +1069,0 +1070,1 +1071,1 +1072,0 +1073,0 
+1074,1 +1075,0 +1076,1 +1077,0 +1078,1 +1079,0 +1080,0 +1081,0 +1082,0 +1083,0 +1084,0 +1085,0 +1086,1 +1087,0 +1088,1 +1089,1 +1090,0 +1091,0 +1092,1 +1093,1 +1094,1 +1095,1 +1096,0 +1097,1 +1098,1 +1099,0 +1100,1 +1101,0 +1102,0 +1103,0 +1104,0 +1105,1 +1106,0 +1107,0 +1108,1 +1109,0 +1110,1 +1111,0 +1112,1 +1113,0 +1114,1 +1115,0 +1116,1 +1117,0 +1118,0 +1119,1 +1120,0 +1121,0 +1122,0 +1123,1 +1124,0 +1125,0 +1126,0 +1127,0 +1128,0 +1129,0 +1130,1 +1131,1 +1132,1 +1133,1 +1134,0 +1135,0 +1136,0 +1137,0 +1138,1 +1139,0 +1140,1 +1141,1 +1142,1 +1143,0 +1144,0 +1145,0 +1146,0 +1147,0 +1148,0 +1149,0 +1150,1 +1151,0 +1152,0 +1153,0 +1154,1 +1155,1 +1156,0 +1157,0 +1158,0 +1159,0 +1160,1 +1161,0 +1162,0 +1163,0 +1164,1 +1165,1 +1166,0 +1167,1 +1168,0 +1169,0 +1170,0 +1171,0 +1172,1 +1173,1 +1174,1 +1175,1 +1176,1 +1177,0 +1178,0 +1179,0 +1180,0 +1181,0 +1182,0 +1183,1 +1184,0 +1185,1 +1186,0 +1187,0 +1188,1 +1189,0 +1190,0 +1191,0 +1192,0 +1193,0 +1194,0 +1195,0 +1196,1 +1197,1 +1198,0 +1199,1 +1200,0 +1201,0 +1202,0 +1203,0 +1204,0 +1205,1 +1206,1 +1207,1 +1208,0 +1209,0 +1210,0 +1211,0 +1212,0 +1213,0 +1214,0 +1215,0 +1216,1 +1217,0 +1218,1 +1219,0 +1220,0 +1221,0 +1222,1 +1223,0 +1224,0 +1225,1 +1226,0 +1227,0 +1228,0 +1229,0 +1230,0 +1231,1 +1232,0 +1233,0 +1234,0 +1235,1 +1236,1 +1237,1 +1238,0 +1239,1 +1240,0 +1241,1 +1242,1 +1243,0 +1244,0 +1245,0 +1246,0 +1247,0 +1248,1 +1249,0 +1250,0 +1251,0 +1252,0 +1253,1 +1254,1 +1255,0 +1256,1 +1257,0 +1258,0 +1259,1 +1260,1 +1261,0 +1262,0 +1263,1 +1264,0 +1265,0 +1266,1 +1267,1 +1268,0 +1269,0 +1270,0 +1271,0 +1272,0 +1273,0 +1274,0 +1275,1 +1276,0 +1277,1 +1278,0 +1279,0 +1280,0 +1281,0 +1282,0 +1283,1 +1284,0 +1285,0 +1286,0 +1287,1 +1288,0 +1289,1 +1290,0 +1291,0 +1292,1 +1293,0 +1294,1 +1295,1 +1296,0 +1297,0 +1298,0 +1299,0 +1300,1 +1301,1 +1302,1 +1303,1 +1304,1 +1305,0 +1306,1 +1307,0 +1308,0 +1309,1 diff --git a/dftraintmp.csv b/dftraintmp.csv new file mode 100644 index 0000000..4579180 --- /dev/null +++ 
b/dftraintmp.csv @@ -0,0 +1,892 @@ +"PassengerId" "Survived" "Pclass" "Name" "Sex" "Age" "SibSp" "Parch" "Ticket" "Fare" "Cabin" "Embarked" "Title" +"1" 1 "0" "3" "Braund, Mr. Owen Harris" "male" 22 1 0 "A/5 21171" 7.25 NA "S" "Mr" +"2" 2 "1" "1" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "female" 38 1 0 "PC 17599" 71.2833 "C85" "C" "Mrs" +"3" 3 "1" "3" "Heikkinen, Miss. Laina" "female" 26 0 0 "STON/O2. 3101282" 7.925 NA "S" "Miss" +"4" 4 "1" "1" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" "female" 35 1 0 "113803" 53.1 "C123" "S" "Mrs" +"5" 5 "0" "3" "Allen, Mr. William Henry" "male" 35 0 0 "373450" 8.05 NA "S" "Mr" +"6" 6 "0" "3" "Moran, Mr. James" "male" 30 0 0 "330877" 8.4583 NA "Q" "Mr" +"7" 7 "0" "1" "McCarthy, Mr. Timothy J" "male" 54 0 0 "17463" 51.8625 "E46" "S" "Mr" +"8" 8 "0" "3" "Palsson, Master. Gosta Leonard" "male" 2 3 1 "349909" 21.075 NA "S" "Master" +"9" 9 "1" "3" "Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)" "female" 27 0 2 "347742" 11.1333 NA "S" "Mrs" +"10" 10 "1" "2" "Nasser, Mrs. Nicholas (Adele Achem)" "female" 14 1 0 "237736" 30.0708 NA "C" "Mrs" +"11" 11 "1" "3" "Sandstrom, Miss. Marguerite Rut" "female" 4 1 1 "PP 9549" 16.7 "G6" "S" "Miss" +"12" 12 "1" "1" "Bonnell, Miss. Elizabeth" "female" 58 0 0 "113783" 26.55 "C103" "S" "Miss" +"13" 13 "0" "3" "Saundercock, Mr. William Henry" "male" 20 0 0 "A/5. 2151" 8.05 NA "S" "Mr" +"14" 14 "0" "3" "Andersson, Mr. Anders Johan" "male" 39 1 5 "347082" 31.275 NA "S" "Mr" +"15" 15 "0" "3" "Vestrom, Miss. Hulda Amanda Adolfina" "female" 14 0 0 "350406" 7.8542 NA "S" "Miss" +"16" 16 "1" "2" "Hewlett, Mrs. (Mary D Kingcome) " "female" 55 0 0 "248706" 16 NA "S" "Mrs" +"17" 17 "0" "3" "Rice, Master. Eugene" "male" 2 4 1 "382652" 29.125 NA "Q" "Master" +"18" 18 "1" "2" "Williams, Mr. Charles Eugene" "male" 30 0 0 "244373" 13 NA "S" "Mr" +"19" 19 "0" "3" "Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)" "female" 31 1 0 "345763" 18 NA "S" "Mrs" +"20" 20 "1" "3" "Masselmani, Mrs. 
Fatima" "female" 35 0 0 "2649" 7.225 NA "C" "Mrs" +"21" 21 "0" "2" "Fynney, Mr. Joseph J" "male" 35 0 0 "239865" 26 NA "S" "Mr" +"22" 22 "1" "2" "Beesley, Mr. Lawrence" "male" 34 0 0 "248698" 13 "D56" "S" "Mr" +"23" 23 "1" "3" "McGowan, Miss. Anna \"Annie\"" "female" 15 0 0 "330923" 8.0292 NA "Q" "Miss" +"24" 24 "1" "1" "Sloper, Mr. William Thompson" "male" 28 0 0 "113788" 35.5 "A6" "S" "Mr" +"25" 25 "0" "3" "Palsson, Miss. Torborg Danira" "female" 8 3 1 "349909" 21.075 NA "S" "Miss" +"26" 26 "1" "3" "Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)" "female" 38 1 5 "347077" 31.3875 NA "S" "Mrs" +"27" 27 "0" "3" "Emir, Mr. Farred Chehab" "male" 30 0 0 "2631" 7.225 NA "C" "Mr" +"28" 28 "0" "1" "Fortune, Mr. Charles Alexander" "male" 19 3 2 "19950" 263 "C23 C25 C27" "S" "Mr" +"29" 29 "1" "3" "O'Dwyer, Miss. Ellen \"Nellie\"" "female" 21 0 0 "330959" 7.8792 NA "Q" "Miss" +"30" 30 "0" "3" "Todoroff, Mr. Lalio" "male" 30 0 0 "349216" 7.8958 NA "S" "Mr" +"31" 31 "0" "1" "Uruchurtu, Don. Manuel E" "male" 40 0 0 "PC 17601" 27.7208 NA "C" "Don" +"32" 32 "1" "1" "Spencer, Mrs. William Augustus (Marie Eugenie)" "female" 35 1 0 "PC 17569" 146.5208 "B78" "C" "Mrs" +"33" 33 "1" "3" "Glynn, Miss. Mary Agatha" "female" 21 0 0 "335677" 7.75 NA "Q" "Miss" +"34" 34 "0" "2" "Wheadon, Mr. Edward H" "male" 66 0 0 "C.A. 24579" 10.5 NA "S" "Mr" +"35" 35 "0" "1" "Meyer, Mr. Edgar Joseph" "male" 28 1 0 "PC 17604" 82.1708 NA "C" "Mr" +"36" 36 "0" "1" "Holverson, Mr. Alexander Oskar" "male" 42 1 0 "113789" 52 NA "S" "Mr" +"37" 37 "1" "3" "Mamee, Mr. Hanna" "male" 30 0 0 "2677" 7.2292 NA "C" "Mr" +"38" 38 "0" "3" "Cann, Mr. Ernest Charles" "male" 21 0 0 "A./5. 2152" 8.05 NA "S" "Mr" +"39" 39 "0" "3" "Vander Planke, Miss. Augusta Maria" "female" 18 2 0 "345764" 18 NA "S" "Miss" +"40" 40 "1" "3" "Nicola-Yarred, Miss. Jamila" "female" 14 1 0 "2651" 11.2417 NA "C" "Miss" +"41" 41 "0" "3" "Ahlin, Mrs. 
Johan (Johanna Persdotter Larsson)" "female" 40 1 0 "7546" 9.475 NA "S" "Mrs" +"42" 42 "0" "2" "Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)" "female" 27 1 0 "11668" 21 NA "S" "Mrs" +"43" 43 "0" "3" "Kraeff, Mr. Theodor" "male" 30 0 0 "349253" 7.8958 NA "C" "Mr" +"44" 44 "1" "2" "Laroche, Miss. Simonne Marie Anne Andree" "female" 3 1 2 "SC/Paris 2123" 41.5792 NA "C" "Miss" +"45" 45 "1" "3" "Devaney, Miss. Margaret Delia" "female" 19 0 0 "330958" 7.8792 NA "Q" "Miss" +"46" 46 "0" "3" "Rogers, Mr. William John" "male" 30 0 0 "S.C./A.4. 23567" 8.05 NA "S" "Mr" +"47" 47 "0" "3" "Lennon, Mr. Denis" "male" 30 1 0 "370371" 15.5 NA "Q" "Mr" +"48" 48 "1" "3" "O'Driscoll, Miss. Bridget" "female" 21 0 0 "14311" 7.75 NA "Q" "Miss" +"49" 49 "0" "3" "Samaan, Mr. Youssef" "male" 30 2 0 "2662" 21.6792 NA "C" "Mr" +"50" 50 "0" "3" "Arnold-Franchi, Mrs. Josef (Josefine Franchi)" "female" 18 1 0 "349237" 17.8 NA "S" "Mrs" +"51" 51 "0" "3" "Panula, Master. Juha Niilo" "male" 7 4 1 "3101295" 39.6875 NA "S" "Master" +"52" 52 "0" "3" "Nosworthy, Mr. Richard Cater" "male" 21 0 0 "A/4. 39886" 7.8 NA "S" "Mr" +"53" 53 "1" "1" "Harper, Mrs. Henry Sleeper (Myna Haxtun)" "female" 49 1 0 "PC 17572" 76.7292 "D33" "C" "Mrs" +"54" 54 "1" "2" "Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)" "female" 29 1 0 "2926" 26 NA "S" "Mrs" +"55" 55 "0" "1" "Ostby, Mr. Engelhart Cornelius" "male" 65 0 1 "113509" 61.9792 "B30" "C" "Mr" +"56" 56 "1" "1" "Woolner, Mr. Hugh" "male" 30 0 0 "19947" 35.5 "C52" "S" "Mr" +"57" 57 "1" "2" "Rugg, Miss. Emily" "female" 21 0 0 "C.A. 31026" 10.5 NA "S" "Miss" +"58" 58 "0" "3" "Novel, Mr. Mansouer" "male" 28.5 0 0 "2697" 7.2292 NA "C" "Mr" +"59" 59 "1" "2" "West, Miss. Constance Mirium" "female" 5 1 2 "C.A. 34651" 27.75 NA "S" "Miss" +"60" 60 "0" "3" "Goodwin, Master. William Frederick" "male" 11 5 2 "CA 2144" 46.9 NA "S" "Master" +"61" 61 "0" "3" "Sirayanian, Mr. Orsen" "male" 22 0 0 "2669" 7.2292 NA "C" "Mr" +"62" 62 "1" "1" "Icard, Miss. 
Amelie" "female" 38 0 0 "113572" 80 "B28" "S" "Miss" +"63" 63 "0" "1" "Harris, Mr. Henry Birkhardt" "male" 45 1 0 "36973" 83.475 "C83" "S" "Mr" +"64" 64 "0" "3" "Skoog, Master. Harald" "male" 4 3 2 "347088" 27.9 NA "S" "Master" +"65" 65 "0" "1" "Stewart, Mr. Albert A" "male" 30 0 0 "PC 17605" 27.7208 NA "C" "Mr" +"66" 66 "1" "3" "Moubarek, Master. Gerios" "male" 3.5 1 1 "2661" 15.2458 NA "C" "Master" +"67" 67 "1" "2" "Nye, Mrs. (Elizabeth Ramell)" "female" 29 0 0 "C.A. 29395" 10.5 "F33" "S" "Mrs" +"68" 68 "0" "3" "Crease, Mr. Ernest James" "male" 19 0 0 "S.P. 3464" 8.1583 NA "S" "Mr" +"69" 69 "1" "3" "Andersson, Miss. Erna Alexandra" "female" 17 4 2 "3101281" 7.925 NA "S" "Miss" +"70" 70 "0" "3" "Kink, Mr. Vincenz" "male" 26 2 0 "315151" 8.6625 NA "S" "Mr" +"71" 71 "0" "2" "Jenkin, Mr. Stephen Curnow" "male" 32 0 0 "C.A. 33111" 10.5 NA "S" "Mr" +"72" 72 "0" "3" "Goodwin, Miss. Lillian Amy" "female" 16 5 2 "CA 2144" 46.9 NA "S" "Miss" +"73" 73 "0" "2" "Hood, Mr. Ambrose Jr" "male" 21 0 0 "S.O.C. 14879" 73.5 NA "S" "Mr" +"74" 74 "0" "3" "Chronopoulos, Mr. Apostolos" "male" 26 1 0 "2680" 14.4542 NA "C" "Mr" +"75" 75 "1" "3" "Bing, Mr. Lee" "male" 32 0 0 "1601" 56.4958 NA "S" "Mr" +"76" 76 "0" "3" "Moen, Mr. Sigurd Hansen" "male" 25 0 0 "348123" 7.65 "F G73" "S" "Mr" +"77" 77 "0" "3" "Staneff, Mr. Ivan" "male" 30 0 0 "349208" 7.8958 NA "S" "Mr" +"78" 78 "0" "3" "Moutal, Mr. Rahamin Haim" "male" 30 0 0 "374746" 8.05 NA "S" "Mr" +"79" 79 "1" "2" "Caldwell, Master. Alden Gates" "male" 0.83 0 2 "248738" 29 NA "S" "Master" +"80" 80 "1" "3" "Dowdell, Miss. Elizabeth" "female" 30 0 0 "364516" 12.475 NA "S" "Miss" +"81" 81 "0" "3" "Waelens, Mr. Achille" "male" 22 0 0 "345767" 9 NA "S" "Mr" +"82" 82 "1" "3" "Sheerlinck, Mr. Jan Baptist" "male" 29 0 0 "345779" 9.5 NA "S" "Mr" +"83" 83 "1" "3" "McDermott, Miss. Brigdet Delia" "female" 21 0 0 "330932" 7.7875 NA "Q" "Miss" +"84" 84 "0" "1" "Carrau, Mr. 
Francisco M" "male" 28 0 0 "113059" 47.1 NA "S" "Mr" +"85" 85 "1" "2" "Ilett, Miss. Bertha" "female" 17 0 0 "SO/C 14885" 10.5 NA "S" "Miss" +"86" 86 "1" "3" "Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)" "female" 33 3 0 "3101278" 15.85 NA "S" "Mrs" +"87" 87 "0" "3" "Ford, Mr. William Neal" "male" 16 1 3 "W./C. 6608" 34.375 NA "S" "Mr" +"88" 88 "0" "3" "Slocovski, Mr. Selman Francis" "male" 30 0 0 "SOTON/OQ 392086" 8.05 NA "S" "Mr" +"89" 89 "1" "1" "Fortune, Miss. Mabel Helen" "female" 23 3 2 "19950" 263 "C23 C25 C27" "S" "Miss" +"90" 90 "0" "3" "Celotti, Mr. Francesco" "male" 24 0 0 "343275" 8.05 NA "S" "Mr" +"91" 91 "0" "3" "Christmann, Mr. Emil" "male" 29 0 0 "343276" 8.05 NA "S" "Mr" +"92" 92 "0" "3" "Andreasson, Mr. Paul Edvin" "male" 20 0 0 "347466" 7.8542 NA "S" "Mr" +"93" 93 "0" "1" "Chaffee, Mr. Herbert Fuller" "male" 46 1 0 "W.E.P. 5734" 61.175 "E31" "S" "Mr" +"94" 94 "0" "3" "Dean, Mr. Bertram Frank" "male" 26 1 2 "C.A. 2315" 20.575 NA "S" "Mr" +"95" 95 "0" "3" "Coxon, Mr. Daniel" "male" 59 0 0 "364500" 7.25 NA "S" "Mr" +"96" 96 "0" "3" "Shorney, Mr. Charles Joseph" "male" 30 0 0 "374910" 8.05 NA "S" "Mr" +"97" 97 "0" "1" "Goldschmidt, Mr. George B" "male" 71 0 0 "PC 17754" 34.6542 "A5" "C" "Mr" +"98" 98 "1" "1" "Greenfield, Mr. William Bertram" "male" 23 0 1 "PC 17759" 63.3583 "D10 D12" "C" "Mr" +"99" 99 "1" "2" "Doling, Mrs. John T (Ada Julia Bone)" "female" 34 0 1 "231919" 23 NA "S" "Mrs" +"100" 100 "0" "2" "Kantor, Mr. Sinai" "male" 34 1 0 "244367" 26 NA "S" "Mr" +"101" 101 "0" "3" "Petranec, Miss. Matilda" "female" 28 0 0 "349245" 7.8958 NA "S" "Miss" +"102" 102 "0" "3" "Petroff, Mr. Pastcho (\"Pentcho\")" "male" 30 0 0 "349215" 7.8958 NA "S" "Mr" +"103" 103 "0" "1" "White, Mr. Richard Frasar" "male" 21 0 1 "35281" 77.2875 "D26" "S" "Mr" +"104" 104 "0" "3" "Johansson, Mr. Gustaf Joel" "male" 33 0 0 "7540" 8.6542 NA "S" "Mr" +"105" 105 "0" "3" "Gustafsson, Mr. 
Anders Vilhelm" "male" 37 2 0 "3101276" 7.925 NA "S" "Mr" +"106" 106 "0" "3" "Mionoff, Mr. Stoytcho" "male" 28 0 0 "349207" 7.8958 NA "S" "Mr" +"107" 107 "1" "3" "Salkjelsvik, Miss. Anna Kristine" "female" 21 0 0 "343120" 7.65 NA "S" "Miss" +"108" 108 "1" "3" "Moss, Mr. Albert Johan" "male" 30 0 0 "312991" 7.775 NA "S" "Mr" +"109" 109 "0" "3" "Rekic, Mr. Tido" "male" 38 0 0 "349249" 7.8958 NA "S" "Mr" +"110" 110 "1" "3" "Moran, Miss. Bertha" "female" 21 1 0 "371110" 24.15 NA "Q" "Miss" +"111" 111 "0" "1" "Porter, Mr. Walter Chamberlain" "male" 47 0 0 "110465" 52 "C110" "S" "Mr" +"112" 112 "0" "3" "Zabour, Miss. Hileni" "female" 14.5 1 0 "2665" 14.4542 NA "C" "Miss" +"113" 113 "0" "3" "Barton, Mr. David John" "male" 22 0 0 "324669" 8.05 NA "S" "Mr" +"114" 114 "0" "3" "Jussila, Miss. Katriina" "female" 20 1 0 "4136" 9.825 NA "S" "Miss" +"115" 115 "0" "3" "Attalah, Miss. Malake" "female" 17 0 0 "2627" 14.4583 NA "C" "Miss" +"116" 116 "0" "3" "Pekoniemi, Mr. Edvard" "male" 21 0 0 "STON/O 2. 3101294" 7.925 NA "S" "Mr" +"117" 117 "0" "3" "Connors, Mr. Patrick" "male" 70.5 0 0 "370369" 7.75 NA "Q" "Mr" +"118" 118 "0" "2" "Turpin, Mr. William John Robert" "male" 29 1 0 "11668" 21 NA "S" "Mr" +"119" 119 "0" "1" "Baxter, Mr. Quigg Edmond" "male" 24 0 1 "PC 17558" 247.5208 "B58 B60" "C" "Mr" +"120" 120 "0" "3" "Andersson, Miss. Ellis Anna Maria" "female" 2 4 2 "347082" 31.275 NA "S" "Miss" +"121" 121 "0" "2" "Hickman, Mr. Stanley George" "male" 21 2 0 "S.O.C. 14879" 73.5 NA "S" "Mr" +"122" 122 "0" "3" "Moore, Mr. Leonard Charles" "male" 30 0 0 "A4. 54510" 8.05 NA "S" "Mr" +"123" 123 "0" "2" "Nasser, Mr. Nicholas" "male" 32.5 1 0 "237736" 30.0708 NA "C" "Mr" +"124" 124 "1" "2" "Webber, Miss. Susan" "female" 32.5 0 0 "27267" 13 "E101" "S" "Miss" +"125" 125 "0" "1" "White, Mr. Percival Wayland" "male" 54 0 1 "35281" 77.2875 "D26" "S" "Mr" +"126" 126 "1" "3" "Nicola-Yarred, Master. Elias" "male" 12 1 0 "2651" 11.2417 NA "C" "Master" +"127" 127 "0" "3" "McMahon, Mr. 
Martin" "male" 30 0 0 "370372" 7.75 NA "Q" "Mr" +"128" 128 "1" "3" "Madsen, Mr. Fridtjof Arne" "male" 24 0 0 "C 17369" 7.1417 NA "S" "Mr" +"129" 129 "1" "3" "Peter, Miss. Anna" "female" 21 1 1 "2668" 22.3583 "F E69" "C" "Miss" +"130" 130 "0" "3" "Ekstrom, Mr. Johan" "male" 45 0 0 "347061" 6.975 NA "S" "Mr" +"131" 131 "0" "3" "Drazenoic, Mr. Jozef" "male" 33 0 0 "349241" 7.8958 NA "C" "Mr" +"132" 132 "0" "3" "Coelho, Mr. Domingos Fernandeo" "male" 20 0 0 "SOTON/O.Q. 3101307" 7.05 NA "S" "Mr" +"133" 133 "0" "3" "Robins, Mrs. Alexander A (Grace Charity Laury)" "female" 47 1 0 "A/5. 3337" 14.5 NA "S" "Mrs" +"134" 134 "1" "2" "Weisz, Mrs. Leopold (Mathilde Francoise Pede)" "female" 29 1 0 "228414" 26 NA "S" "Mrs" +"135" 135 "0" "2" "Sobey, Mr. Samuel James Hayden" "male" 25 0 0 "C.A. 29178" 13 NA "S" "Mr" +"136" 136 "0" "2" "Richard, Mr. Emile" "male" 23 0 0 "SC/PARIS 2133" 15.0458 NA "C" "Mr" +"137" 137 "1" "1" "Newsom, Miss. Helen Monypeny" "female" 19 0 2 "11752" 26.2833 "D47" "S" "Miss" +"138" 138 "0" "1" "Futrelle, Mr. Jacques Heath" "male" 37 1 0 "113803" 53.1 "C123" "S" "Mr" +"139" 139 "0" "3" "Osen, Mr. Olaf Elon" "male" 16 0 0 "7534" 9.2167 NA "S" "Mr" +"140" 140 "0" "1" "Giglio, Mr. Victor" "male" 24 0 0 "PC 17593" 79.2 "B86" "C" "Mr" +"141" 141 "0" "3" "Boulos, Mrs. Joseph (Sultana)" "female" 35 0 2 "2678" 15.2458 NA "C" "Mrs" +"142" 142 "1" "3" "Nysten, Miss. Anna Sofia" "female" 22 0 0 "347081" 7.75 NA "S" "Miss" +"143" 143 "1" "3" "Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)" "female" 24 1 0 "STON/O2. 3101279" 15.85 NA "S" "Mrs" +"144" 144 "0" "3" "Burke, Mr. Jeremiah" "male" 19 0 0 "365222" 6.75 NA "Q" "Mr" +"145" 145 "0" "2" "Andrew, Mr. Edgardo Samuel" "male" 18 0 0 "231945" 11.5 NA "S" "Mr" +"146" 146 "0" "2" "Nicholls, Mr. Joseph Charles" "male" 19 1 1 "C.A. 33112" 36.75 NA "S" "Mr" +"147" 147 "1" "3" "Andersson, Mr. August Edvard (\"Wennerstrom\")" "male" 27 0 0 "350043" 7.7958 NA "S" "Mr" +"148" 148 "0" "3" "Ford, Miss. 
Robina Maggie \"Ruby\"" "female" 9 2 2 "W./C. 6608" 34.375 NA "S" "Miss" +"149" 149 "0" "2" "Navratil, Mr. Michel (\"Louis M Hoffman\")" "male" 36.5 0 2 "230080" 26 "F2" "S" "Mr" +"150" 150 "0" "2" "Byles, Rev. Thomas Roussel Davids" "male" 42 0 0 "244310" 13 NA "S" "Rev" +"151" 151 "0" "2" "Bateman, Rev. Robert James" "male" 51 0 0 "S.O.P. 1166" 12.525 NA "S" "Rev" +"152" 152 "1" "1" "Pears, Mrs. Thomas (Edith Wearne)" "female" 22 1 0 "113776" 66.6 "C2" "S" "Mrs" +"153" 153 "0" "3" "Meo, Mr. Alfonzo" "male" 55.5 0 0 "A.5. 11206" 8.05 NA "S" "Mr" +"154" 154 "0" "3" "van Billiard, Mr. Austin Blyler" "male" 40.5 0 2 "A/5. 851" 14.5 NA "S" "Mr" +"155" 155 "0" "3" "Olsen, Mr. Ole Martin" "male" 30 0 0 "Fa 265302" 7.3125 NA "S" "Mr" +"156" 156 "0" "1" "Williams, Mr. Charles Duane" "male" 51 0 1 "PC 17597" 61.3792 NA "C" "Mr" +"157" 157 "1" "3" "Gilnagh, Miss. Katherine \"Katie\"" "female" 16 0 0 "35851" 7.7333 NA "Q" "Miss" +"158" 158 "0" "3" "Corn, Mr. Harry" "male" 30 0 0 "SOTON/OQ 392090" 8.05 NA "S" "Mr" +"159" 159 "0" "3" "Smiljanic, Mr. Mile" "male" 30 0 0 "315037" 8.6625 NA "S" "Mr" +"160" 160 "0" "3" "Sage, Master. Thomas Henry" "male" 3.5 8 2 "CA. 2343" 69.55 NA "S" "Master" +"161" 161 "0" "3" "Cribb, Mr. John Hatfield" "male" 44 0 1 "371362" 16.1 NA "S" "Mr" +"162" 162 "1" "2" "Watt, Mrs. James (Elizabeth \"Bessie\" Inglis Milne)" "female" 40 0 0 "C.A. 33595" 15.75 NA "S" "Mrs" +"163" 163 "0" "3" "Bengtsson, Mr. John Viktor" "male" 26 0 0 "347068" 7.775 NA "S" "Mr" +"164" 164 "0" "3" "Calic, Mr. Jovo" "male" 17 0 0 "315093" 8.6625 NA "S" "Mr" +"165" 165 "0" "3" "Panula, Master. Eino Viljami" "male" 1 4 1 "3101295" 39.6875 NA "S" "Master" +"166" 166 "1" "3" "Goldsmith, Master. Frank John William \"Frankie\"" "male" 9 0 2 "363291" 20.525 NA "S" "Master" +"167" 167 "1" "1" "Chibnall, Mrs. (Edith Martha Bowerman)" "female" 35 0 1 "113505" 55 "E33" "S" "Mrs" +"168" 168 "0" "3" "Skoog, Mrs. 
William (Anna Bernhardina Karlsson)" "female" 45 1 4 "347088" 27.9 NA "S" "Mrs" +"169" 169 "0" "1" "Baumann, Mr. John D" "male" 30 0 0 "PC 17318" 25.925 NA "S" "Mr" +"170" 170 "0" "3" "Ling, Mr. Lee" "male" 28 0 0 "1601" 56.4958 NA "S" "Mr" +"171" 171 "0" "1" "Van der hoef, Mr. Wyckoff" "male" 61 0 0 "111240" 33.5 "B19" "S" "Mr" +"172" 172 "0" "3" "Rice, Master. Arthur" "male" 4 4 1 "382652" 29.125 NA "Q" "Master" +"173" 173 "1" "3" "Johnson, Miss. Eleanor Ileen" "female" 1 1 1 "347742" 11.1333 NA "S" "Miss" +"174" 174 "0" "3" "Sivola, Mr. Antti Wilhelm" "male" 21 0 0 "STON/O 2. 3101280" 7.925 NA "S" "Mr" +"175" 175 "0" "1" "Smith, Mr. James Clinch" "male" 56 0 0 "17764" 30.6958 "A7" "C" "Mr" +"176" 176 "0" "3" "Klasen, Mr. Klas Albin" "male" 18 1 1 "350404" 7.8542 NA "S" "Mr" +"177" 177 "0" "3" "Lefebre, Master. Henry Forbes" "male" 3.5 3 1 "4133" 25.4667 NA "S" "Master" +"178" 178 "0" "1" "Isham, Miss. Ann Elizabeth" "female" 50 0 0 "PC 17595" 28.7125 "C49" "C" "Miss" +"179" 179 "0" "2" "Hale, Mr. Reginald" "male" 30 0 0 "250653" 13 NA "S" "Mr" +"180" 180 "0" "3" "Leonard, Mr. Lionel" "male" 36 0 0 "LINE" 8.05 NA "S" "Mr" +"181" 181 "0" "3" "Sage, Miss. Constance Gladys" "female" 21 8 2 "CA. 2343" 69.55 NA "S" "Miss" +"182" 182 "0" "2" "Pernot, Mr. Rene" "male" 30 0 0 "SC/PARIS 2131" 15.05 NA "C" "Mr" +"183" 183 "0" "3" "Asplund, Master. Clarence Gustaf Hugo" "male" 9 4 2 "347077" 31.3875 NA "S" "Master" +"184" 184 "1" "2" "Becker, Master. Richard F" "male" 1 2 1 "230136" 39 "F4" "S" "Master" +"185" 185 "1" "3" "Kink-Heilmann, Miss. Luise Gretchen" "female" 4 0 2 "315153" 22.025 NA "S" "Miss" +"186" 186 "0" "1" "Rood, Mr. Hugh Roscoe" "male" 30 0 0 "113767" 50 "A32" "S" "Mr" +"187" 187 "1" "3" "O'Brien, Mrs. Thomas (Johanna \"Hannah\" Godfrey)" "female" 35 1 0 "370365" 15.5 NA "Q" "Mrs" +"188" 188 "1" "1" "Romaine, Mr. Charles Hallace (\"Mr C Rolmane\")" "male" 45 0 0 "111428" 26.55 NA "S" "Mr" +"189" 189 "0" "3" "Bourke, Mr. 
John" "male" 40 1 1 "364849" 15.5 NA "Q" "Mr" +"190" 190 "0" "3" "Turcin, Mr. Stjepan" "male" 36 0 0 "349247" 7.8958 NA "S" "Mr" +"191" 191 "1" "2" "Pinsky, Mrs. (Rosa)" "female" 32 0 0 "234604" 13 NA "S" "Mrs" +"192" 192 "0" "2" "Carbines, Mr. William" "male" 19 0 0 "28424" 13 NA "S" "Mr" +"193" 193 "1" "3" "Andersen-Jensen, Miss. Carla Christine Nielsine" "female" 19 1 0 "350046" 7.8542 NA "S" "Miss" +"194" 194 "1" "2" "Navratil, Master. Michel M" "male" 3 1 1 "230080" 26 "F2" "S" "Master" +"195" 195 "1" "1" "Brown, Mrs. James Joseph (Margaret Tobin)" "female" 44 0 0 "PC 17610" 27.7208 "B4" "C" "Mrs" +"196" 196 "1" "1" "Lurette, Miss. Elise" "female" 58 0 0 "PC 17569" 146.5208 "B80" "C" "Miss" +"197" 197 "0" "3" "Mernagh, Mr. Robert" "male" 30 0 0 "368703" 7.75 NA "Q" "Mr" +"198" 198 "0" "3" "Olsen, Mr. Karl Siegwart Andreas" "male" 42 0 1 "4579" 8.4042 NA "S" "Mr" +"199" 199 "1" "3" "Madigan, Miss. Margaret \"Maggie\"" "female" 21 0 0 "370370" 7.75 NA "Q" "Miss" +"200" 200 "0" "2" "Yrois, Miss. Henriette (\"Mrs Harbeck\")" "female" 24 0 0 "248747" 13 NA "S" "Miss" +"201" 201 "0" "3" "Vande Walle, Mr. Nestor Cyriel" "male" 28 0 0 "345770" 9.5 NA "S" "Mr" +"202" 202 "0" "3" "Sage, Mr. Frederick" "male" 30 8 2 "CA. 2343" 69.55 NA "S" "Mr" +"203" 203 "0" "3" "Johanson, Mr. Jakob Alfred" "male" 34 0 0 "3101264" 6.4958 NA "S" "Mr" +"204" 204 "0" "3" "Youseff, Mr. Gerious" "male" 45.5 0 0 "2628" 7.225 NA "C" "Mr" +"205" 205 "1" "3" "Cohen, Mr. Gurshon \"Gus\"" "male" 18 0 0 "A/5 3540" 8.05 NA "S" "Mr" +"206" 206 "0" "3" "Strom, Miss. Telma Matilda" "female" 2 0 1 "347054" 10.4625 "G6" "S" "Miss" +"207" 207 "0" "3" "Backstrom, Mr. Karl Alfred" "male" 32 1 0 "3101278" 15.85 NA "S" "Mr" +"208" 208 "1" "3" "Albimona, Mr. Nassef Cassem" "male" 26 0 0 "2699" 18.7875 NA "C" "Mr" +"209" 209 "1" "3" "Carr, Miss. Helen \"Ellen\"" "female" 16 0 0 "367231" 7.75 NA "Q" "Miss" +"210" 210 "1" "1" "Blank, Mr. Henry" "male" 40 0 0 "112277" 31 "A31" "C" "Mr" +"211" 211 "0" "3" "Ali, Mr. 
Ahmed" "male" 24 0 0 "SOTON/O.Q. 3101311" 7.05 NA "S" "Mr" +"212" 212 "1" "2" "Cameron, Miss. Clear Annie" "female" 35 0 0 "F.C.C. 13528" 21 NA "S" "Miss" +"213" 213 "0" "3" "Perkin, Mr. John Henry" "male" 22 0 0 "A/5 21174" 7.25 NA "S" "Mr" +"214" 214 "0" "2" "Givard, Mr. Hans Kristensen" "male" 30 0 0 "250646" 13 NA "S" "Mr" +"215" 215 "0" "3" "Kiernan, Mr. Philip" "male" 30 1 0 "367229" 7.75 NA "Q" "Mr" +"216" 216 "1" "1" "Newell, Miss. Madeleine" "female" 31 1 0 "35273" 113.275 "D36" "C" "Miss" +"217" 217 "1" "3" "Honkanen, Miss. Eliina" "female" 27 0 0 "STON/O2. 3101283" 7.925 NA "S" "Miss" +"218" 218 "0" "2" "Jacobsohn, Mr. Sidney Samuel" "male" 42 1 0 "243847" 27 NA "S" "Mr" +"219" 219 "1" "1" "Bazzani, Miss. Albina" "female" 32 0 0 "11813" 76.2917 "D15" "C" "Miss" +"220" 220 "0" "2" "Harris, Mr. Walter" "male" 30 0 0 "W/C 14208" 10.5 NA "S" "Mr" +"221" 221 "1" "3" "Sunderland, Mr. Victor Francis" "male" 16 0 0 "SOTON/OQ 392089" 8.05 NA "S" "Mr" +"222" 222 "0" "2" "Bracken, Mr. James H" "male" 27 0 0 "220367" 13 NA "S" "Mr" +"223" 223 "0" "3" "Green, Mr. George Henry" "male" 51 0 0 "21440" 8.05 NA "S" "Mr" +"224" 224 "0" "3" "Nenkoff, Mr. Christo" "male" 30 0 0 "349234" 7.8958 NA "S" "Mr" +"225" 225 "1" "1" "Hoyt, Mr. Frederick Maxfield" "male" 38 1 0 "19943" 90 "C93" "S" "Mr" +"226" 226 "0" "3" "Berglund, Mr. Karl Ivar Sven" "male" 22 0 0 "PP 4348" 9.35 NA "S" "Mr" +"227" 227 "1" "2" "Mellors, Mr. William John" "male" 19 0 0 "SW/PP 751" 10.5 NA "S" "Mr" +"228" 228 "0" "3" "Lovell, Mr. John Hall (\"Henry\")" "male" 20.5 0 0 "A/5 21173" 7.25 NA "S" "Mr" +"229" 229 "0" "2" "Fahlstrom, Mr. Arne Jonas" "male" 18 0 0 "236171" 13 NA "S" "Mr" +"230" 230 "0" "3" "Lefebre, Miss. Mathilde" "female" 21 3 1 "4133" 25.4667 NA "S" "Miss" +"231" 231 "1" "1" "Harris, Mrs. Henry Birkhardt (Irene Wallach)" "female" 35 1 0 "36973" 83.475 "C83" "S" "Mrs" +"232" 232 "0" "3" "Larsson, Mr. Bengt Edvin" "male" 29 0 0 "347067" 7.775 NA "S" "Mr" +"233" 233 "0" "2" "Sjostedt, Mr. 
Ernst Adolf" "male" 59 0 0 "237442" 13.5 NA "S" "Mr" +"234" 234 "1" "3" "Asplund, Miss. Lillian Gertrud" "female" 5 4 2 "347077" 31.3875 NA "S" "Miss" +"235" 235 "0" "2" "Leyson, Mr. Robert William Norman" "male" 24 0 0 "C.A. 29566" 10.5 NA "S" "Mr" +"236" 236 "0" "3" "Harknett, Miss. Alice Phoebe" "female" 21 0 0 "W./C. 6609" 7.55 NA "S" "Miss" +"237" 237 "0" "2" "Hold, Mr. Stephen" "male" 44 1 0 "26707" 26 NA "S" "Mr" +"238" 238 "1" "2" "Collyer, Miss. Marjorie \"Lottie\"" "female" 8 0 2 "C.A. 31921" 26.25 NA "S" "Miss" +"239" 239 "0" "2" "Pengelly, Mr. Frederick William" "male" 19 0 0 "28665" 10.5 NA "S" "Mr" +"240" 240 "0" "2" "Hunt, Mr. George Henry" "male" 33 0 0 "SCO/W 1585" 12.275 NA "S" "Mr" +"241" 241 "0" "3" "Zabour, Miss. Thamine" "female" 21 1 0 "2665" 14.4542 NA "C" "Miss" +"242" 242 "1" "3" "Murphy, Miss. Katherine \"Kate\"" "female" 21 1 0 "367230" 15.5 NA "Q" "Miss" +"243" 243 "0" "2" "Coleridge, Mr. Reginald Charles" "male" 29 0 0 "W./C. 14263" 10.5 NA "S" "Mr" +"244" 244 "0" "3" "Maenpaa, Mr. Matti Alexanteri" "male" 22 0 0 "STON/O 2. 3101275" 7.125 NA "S" "Mr" +"245" 245 "0" "3" "Attalah, Mr. Sleiman" "male" 30 0 0 "2694" 7.225 NA "C" "Mr" +"246" 246 "0" "1" "Minahan, Dr. William Edward" "male" 44 2 0 "19928" 90 "C78" "Q" "Dr" +"247" 247 "0" "3" "Lindahl, Miss. Agda Thorilda Viktoria" "female" 25 0 0 "347071" 7.775 NA "S" "Miss" +"248" 248 "1" "2" "Hamalainen, Mrs. William (Anna)" "female" 24 0 2 "250649" 14.5 NA "S" "Mrs" +"249" 249 "1" "1" "Beckwith, Mr. Richard Leonard" "male" 37 1 1 "11751" 52.5542 "D35" "S" "Mr" +"250" 250 "0" "2" "Carter, Rev. Ernest Courtenay" "male" 54 1 0 "244252" 26 NA "S" "Rev" +"251" 251 "0" "3" "Reed, Mr. James George" "male" 30 0 0 "362316" 7.25 NA "S" "Mr" +"252" 252 "0" "3" "Strom, Mrs. Wilhelm (Elna Matilda Persson)" "female" 29 1 1 "347054" 10.4625 "G6" "S" "Mrs" +"253" 253 "0" "1" "Stead, Mr. William Thomas" "male" 62 0 0 "113514" 26.55 "C87" "S" "Mr" +"254" 254 "0" "3" "Lobb, Mr. 
William Arthur" "male" 30 1 0 "A/5. 3336" 16.1 NA "S" "Mr" +"255" 255 "0" "3" "Rosblom, Mrs. Viktor (Helena Wilhelmina)" "female" 41 0 2 "370129" 20.2125 NA "S" "Mrs" +"256" 256 "1" "3" "Touma, Mrs. Darwis (Hanne Youssef Razi)" "female" 29 0 2 "2650" 15.2458 NA "C" "Mrs" +"257" 257 "1" "1" "Thorne, Mrs. Gertrude Maybelle" "female" 35 0 0 "PC 17585" 79.2 NA "C" "Mrs" +"258" 258 "1" "1" "Cherry, Miss. Gladys" "female" 30 0 0 "110152" 86.5 "B77" "S" "Miss" +"259" 259 "1" "1" "Ward, Miss. Anna" "female" 35 0 0 "PC 17755" 512.3292 NA "C" "Miss" +"260" 260 "1" "2" "Parrish, Mrs. (Lutie Davis)" "female" 50 0 1 "230433" 26 NA "S" "Mrs" +"261" 261 "0" "3" "Smith, Mr. Thomas" "male" 30 0 0 "384461" 7.75 NA "Q" "Mr" +"262" 262 "1" "3" "Asplund, Master. Edvin Rojj Felix" "male" 3 4 2 "347077" 31.3875 NA "S" "Master" +"263" 263 "0" "1" "Taussig, Mr. Emil" "male" 52 1 1 "110413" 79.65 "E67" "S" "Mr" +"264" 264 "0" "1" "Harrison, Mr. William" "male" 40 0 0 "112059" 61.9792 "B94" "S" "Mr" +"265" 265 "0" "3" "Henry, Miss. Delia" "female" 21 0 0 "382649" 7.75 NA "Q" "Miss" +"266" 266 "0" "2" "Reeves, Mr. David" "male" 36 0 0 "C.A. 17248" 10.5 NA "S" "Mr" +"267" 267 "0" "3" "Panula, Mr. Ernesti Arvid" "male" 16 4 1 "3101295" 39.6875 NA "S" "Mr" +"268" 268 "1" "3" "Persson, Mr. Ernst Ulrik" "male" 25 1 0 "347083" 7.775 NA "S" "Mr" +"269" 269 "1" "1" "Graham, Mrs. William Thompson (Edith Junkins)" "female" 58 0 1 "PC 17582" 153.4625 "C125" "S" "Mrs" +"270" 270 "1" "1" "Bissette, Miss. Amelia" "female" 35 0 0 "PC 17760" 135.6333 "C99" "S" "Miss" +"271" 271 "0" "1" "Cairns, Mr. Alexander" "male" 30 0 0 "113798" 31 NA "S" "Mr" +"272" 272 "1" "3" "Tornquist, Mr. William Henry" "male" 25 0 0 "LINE" 8.05 NA "S" "Mr" +"273" 273 "1" "2" "Mellinger, Mrs. (Elizabeth Anne Maidment)" "female" 41 0 1 "250644" 19.5 NA "S" "Mrs" +"274" 274 "0" "1" "Natsch, Mr. Charles H" "male" 37 0 1 "PC 17596" 29.7 "C118" "C" "Mr" +"275" 275 "1" "3" "Healy, Miss. 
Hanora \"Nora\"" "female" 21 0 0 "370375" 7.75 NA "Q" "Miss" +"276" 276 "1" "1" "Andrews, Miss. Kornelia Theodosia" "female" 63 1 0 "13502" 77.9583 "D7" "S" "Miss" +"277" 277 "0" "3" "Lindblom, Miss. Augusta Charlotta" "female" 45 0 0 "347073" 7.75 NA "S" "Miss" +"278" 278 "0" "2" "Parkes, Mr. Francis \"Frank\"" "male" 30 0 0 "239853" 15.0229 NA "S" "Mr" +"279" 279 "0" "3" "Rice, Master. Eric" "male" 7 4 1 "382652" 29.125 NA "Q" "Master" +"280" 280 "1" "3" "Abbott, Mrs. Stanton (Rosa Hunt)" "female" 35 1 1 "C.A. 2673" 20.25 NA "S" "Mrs" +"281" 281 "0" "3" "Duane, Mr. Frank" "male" 65 0 0 "336439" 7.75 NA "Q" "Mr" +"282" 282 "0" "3" "Olsson, Mr. Nils Johan Goransson" "male" 28 0 0 "347464" 7.8542 NA "S" "Mr" +"283" 283 "0" "3" "de Pelsmaeker, Mr. Alfons" "male" 16 0 0 "345778" 9.5 NA "S" "Mr" +"284" 284 "1" "3" "Dorking, Mr. Edward Arthur" "male" 19 0 0 "A/5. 10482" 8.05 NA "S" "Mr" +"285" 285 "0" "1" "Smith, Mr. Richard William" "male" 30 0 0 "113056" 26 "A19" "S" "Mr" +"286" 286 "0" "3" "Stankovic, Mr. Ivan" "male" 33 0 0 "349239" 8.6625 NA "C" "Mr" +"287" 287 "1" "3" "de Mulder, Mr. Theodore" "male" 30 0 0 "345774" 9.5 NA "S" "Mr" +"288" 288 "0" "3" "Naidenoff, Mr. Penko" "male" 22 0 0 "349206" 7.8958 NA "S" "Mr" +"289" 289 "1" "2" "Hosono, Mr. Masabumi" "male" 42 0 0 "237798" 13 NA "S" "Mr" +"290" 290 "1" "3" "Connolly, Miss. Kate" "female" 22 0 0 "370373" 7.75 NA "Q" "Miss" +"291" 291 "1" "1" "Barber, Miss. Ellen \"Nellie\"" "female" 26 0 0 "19877" 78.85 NA "S" "Miss" +"292" 292 "1" "1" "Bishop, Mrs. Dickinson H (Helen Walton)" "female" 19 1 0 "11967" 91.0792 "B49" "C" "Mrs" +"293" 293 "0" "2" "Levy, Mr. Rene Jacques" "male" 36 0 0 "SC/Paris 2163" 12.875 "D" "C" "Mr" +"294" 294 "0" "3" "Haas, Miss. Aloisia" "female" 24 0 0 "349236" 8.85 NA "S" "Miss" +"295" 295 "0" "3" "Mineff, Mr. Ivan" "male" 24 0 0 "349233" 7.8958 NA "S" "Mr" +"296" 296 "0" "1" "Lewy, Mr. Ervin G" "male" 30 0 0 "PC 17612" 27.7208 NA "C" "Mr" +"297" 297 "0" "3" "Hanna, Mr. 
Mansour" "male" 23.5 0 0 "2693" 7.2292 NA "C" "Mr" +"298" 298 "0" "1" "Allison, Miss. Helen Loraine" "female" 2 1 2 "113781" 151.55 "C22 C26" "S" "Miss" +"299" 299 "1" "1" "Saalfeld, Mr. Adolphe" "male" 30 0 0 "19988" 30.5 "C106" "S" "Mr" +"300" 300 "1" "1" "Baxter, Mrs. James (Helene DeLaudeniere Chaput)" "female" 50 0 1 "PC 17558" 247.5208 "B58 B60" "C" "Mrs" +"301" 301 "1" "3" "Kelly, Miss. Anna Katherine \"Annie Kate\"" "female" 21 0 0 "9234" 7.75 NA "Q" "Miss" +"302" 302 "1" "3" "McCoy, Mr. Bernard" "male" 30 2 0 "367226" 23.25 NA "Q" "Mr" +"303" 303 "0" "3" "Johnson, Mr. William Cahoone Jr" "male" 19 0 0 "LINE" 8.05 NA "S" "Mr" +"304" 304 "1" "2" "Keane, Miss. Nora A" "female" 21 0 0 "226593" 12.35 "E101" "Q" "Miss" +"305" 305 "0" "3" "Williams, Mr. Howard Hugh \"Harry\"" "male" 30 0 0 "A/5 2466" 8.05 NA "S" "Mr" +"306" 306 "1" "1" "Allison, Master. Hudson Trevor" "male" 0.92 1 2 "113781" 151.55 "C22 C26" "S" "Master" +"307" 307 "1" "1" "Fleming, Miss. Margaret" "female" 21 0 0 "17421" 110.8833 NA "C" "Miss" +"308" 308 "1" "1" "Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)" "female" 17 1 0 "PC 17758" 108.9 "C65" "C" "Mrs" +"309" 309 "0" "2" "Abelson, Mr. Samuel" "male" 30 1 0 "P/PP 3381" 24 NA "C" "Mr" +"310" 310 "1" "1" "Francatelli, Miss. Laura Mabel" "female" 30 0 0 "PC 17485" 56.9292 "E36" "C" "Miss" +"311" 311 "1" "1" "Hays, Miss. Margaret Bechstein" "female" 24 0 0 "11767" 83.1583 "C54" "C" "Miss" +"312" 312 "1" "1" "Ryerson, Miss. Emily Borie" "female" 18 2 2 "PC 17608" 262.375 "B57 B59 B63 B66" "C" "Miss" +"313" 313 "0" "2" "Lahtinen, Mrs. William (Anna Sylfven)" "female" 26 1 1 "250651" 26 NA "S" "Mrs" +"314" 314 "0" "3" "Hendekovic, Mr. Ignjac" "male" 28 0 0 "349243" 7.8958 NA "S" "Mr" +"315" 315 "0" "2" "Hart, Mr. Benjamin" "male" 43 1 1 "F.C.C. 13529" 26.25 NA "S" "Mr" +"316" 316 "1" "3" "Nilsson, Miss. Helmina Josefina" "female" 26 0 0 "347470" 7.8542 NA "S" "Miss" +"317" 317 "1" "2" "Kantor, Mrs. 
Sinai (Miriam Sternin)" "female" 24 1 0 "244367" 26 NA "S" "Mrs" +"318" 318 "0" "2" "Moraweck, Dr. Ernest" "male" 54 0 0 "29011" 14 NA "S" "Dr" +"319" 319 "1" "1" "Wick, Miss. Mary Natalie" "female" 31 0 2 "36928" 164.8667 "C7" "S" "Miss" +"320" 320 "1" "1" "Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)" "female" 40 1 1 "16966" 134.5 "E34" "C" "Mrs" +"321" 321 "0" "3" "Dennis, Mr. Samuel" "male" 22 0 0 "A/5 21172" 7.25 NA "S" "Mr" +"322" 322 "0" "3" "Danoff, Mr. Yoto" "male" 27 0 0 "349219" 7.8958 NA "S" "Mr" +"323" 323 "1" "2" "Slayter, Miss. Hilda Mary" "female" 30 0 0 "234818" 12.35 NA "Q" "Miss" +"324" 324 "1" "2" "Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)" "female" 22 1 1 "248738" 29 NA "S" "Mrs" +"325" 325 "0" "3" "Sage, Mr. George John Jr" "male" 30 8 2 "CA. 2343" 69.55 NA "S" "Mr" +"326" 326 "1" "1" "Young, Miss. Marie Grice" "female" 36 0 0 "PC 17760" 135.6333 "C32" "C" "Miss" +"327" 327 "0" "3" "Nysveen, Mr. Johan Hansen" "male" 61 0 0 "345364" 6.2375 NA "S" "Mr" +"328" 328 "1" "2" "Ball, Mrs. (Ada E Hall)" "female" 36 0 0 "28551" 13 "D" "S" "Mrs" +"329" 329 "1" "3" "Goldsmith, Mrs. Frank John (Emily Alice Brown)" "female" 31 1 1 "363291" 20.525 NA "S" "Mrs" +"330" 330 "1" "1" "Hippach, Miss. Jean Gertrude" "female" 16 0 1 "111361" 57.9792 "B18" "C" "Miss" +"331" 331 "1" "3" "McCoy, Miss. Agnes" "female" 21 2 0 "367226" 23.25 NA "Q" "Miss" +"332" 332 "0" "1" "Partner, Mr. Austen" "male" 45.5 0 0 "113043" 28.5 "C124" "S" "Mr" +"333" 333 "0" "1" "Graham, Mr. George Edward" "male" 38 0 1 "PC 17582" 153.4625 "C91" "S" "Mr" +"334" 334 "0" "3" "Vander Planke, Mr. Leo Edmondus" "male" 16 2 0 "345764" 18 NA "S" "Mr" +"335" 335 "1" "1" "Frauenthal, Mrs. Henry William (Clara Heinsheimer)" "female" 35 1 0 "PC 17611" 133.65 NA "S" "Mrs" +"336" 336 "0" "3" "Denkoff, Mr. Mitto" "male" 30 0 0 "349225" 7.8958 NA "S" "Mr" +"337" 337 "0" "1" "Pears, Mr. Thomas Clinton" "male" 29 1 0 "113776" 66.6 "C2" "S" "Mr" +"338" 338 "1" "1" "Burns, Miss. 
Elizabeth Margaret" "female" 41 0 0 "16966" 134.5 "E40" "C" "Miss" +"339" 339 "1" "3" "Dahl, Mr. Karl Edwart" "male" 45 0 0 "7598" 8.05 NA "S" "Mr" +"340" 340 "0" "1" "Blackwell, Mr. Stephen Weart" "male" 45 0 0 "113784" 35.5 "T" "S" "Mr" +"341" 341 "1" "2" "Navratil, Master. Edmond Roger" "male" 2 1 1 "230080" 26 "F2" "S" "Master" +"342" 342 "1" "1" "Fortune, Miss. Alice Elizabeth" "female" 24 3 2 "19950" 263 "C23 C25 C27" "S" "Miss" +"343" 343 "0" "2" "Collander, Mr. Erik Gustaf" "male" 28 0 0 "248740" 13 NA "S" "Mr" +"344" 344 "0" "2" "Sedgwick, Mr. Charles Frederick Waddington" "male" 25 0 0 "244361" 13 NA "S" "Mr" +"345" 345 "0" "2" "Fox, Mr. Stanley Hubert" "male" 36 0 0 "229236" 13 NA "S" "Mr" +"346" 346 "1" "2" "Brown, Miss. Amelia \"Mildred\"" "female" 24 0 0 "248733" 13 "F33" "S" "Miss" +"347" 347 "1" "2" "Smith, Miss. Marion Elsie" "female" 40 0 0 "31418" 13 NA "S" "Miss" +"348" 348 "1" "3" "Davison, Mrs. Thomas Henry (Mary E Finck)" "female" 35 1 0 "386525" 16.1 NA "S" "Mrs" +"349" 349 "1" "3" "Coutts, Master. William Loch \"William\"" "male" 3 1 1 "C.A. 37671" 15.9 NA "S" "Master" +"350" 350 "0" "3" "Dimic, Mr. Jovan" "male" 42 0 0 "315088" 8.6625 NA "S" "Mr" +"351" 351 "0" "3" "Odahl, Mr. Nils Martin" "male" 23 0 0 "7267" 9.225 NA "S" "Mr" +"352" 352 "0" "1" "Williams-Lambert, Mr. Fletcher Fellows" "male" 30 0 0 "113510" 35 "C128" "S" "Mr" +"353" 353 "0" "3" "Elias, Mr. Tannous" "male" 15 1 1 "2695" 7.2292 NA "C" "Mr" +"354" 354 "0" "3" "Arnold-Franchi, Mr. Josef" "male" 25 1 0 "349237" 17.8 NA "S" "Mr" +"355" 355 "0" "3" "Yousif, Mr. Wazli" "male" 30 0 0 "2647" 7.225 NA "C" "Mr" +"356" 356 "0" "3" "Vanden Steen, Mr. Leo Peter" "male" 28 0 0 "345783" 9.5 NA "S" "Mr" +"357" 357 "1" "1" "Bowerman, Miss. Elsie Edith" "female" 22 0 1 "113505" 55 "E33" "S" "Miss" +"358" 358 "0" "2" "Funk, Miss. Annie Clemmer" "female" 38 0 0 "237671" 13 NA "S" "Miss" +"359" 359 "1" "3" "McGovern, Miss. 
Mary" "female" 21 0 0 "330931" 7.8792 NA "Q" "Miss" +"360" 360 "1" "3" "Mockler, Miss. Helen Mary \"Ellie\"" "female" 21 0 0 "330980" 7.8792 NA "Q" "Miss" +"361" 361 "0" "3" "Skoog, Mr. Wilhelm" "male" 40 1 4 "347088" 27.9 NA "S" "Mr" +"362" 362 "0" "2" "del Carlo, Mr. Sebastiano" "male" 29 1 0 "SC/PARIS 2167" 27.7208 NA "C" "Mr" +"363" 363 "0" "3" "Barbara, Mrs. (Catherine David)" "female" 45 0 1 "2691" 14.4542 NA "C" "Mrs" +"364" 364 "0" "3" "Asim, Mr. Adola" "male" 35 0 0 "SOTON/O.Q. 3101310" 7.05 NA "S" "Mr" +"365" 365 "0" "3" "O'Brien, Mr. Thomas" "male" 30 1 0 "370365" 15.5 NA "Q" "Mr" +"366" 366 "0" "3" "Adahl, Mr. Mauritz Nils Martin" "male" 30 0 0 "C 7076" 7.25 NA "S" "Mr" +"367" 367 "1" "1" "Warren, Mrs. Frank Manley (Anna Sophia Atkinson)" "female" 60 1 0 "110813" 75.25 "D37" "C" "Mrs" +"368" 368 "1" "3" "Moussa, Mrs. (Mantoura Boulos)" "female" 35 0 0 "2626" 7.2292 NA "C" "Mrs" +"369" 369 "1" "3" "Jermyn, Miss. Annie" "female" 21 0 0 "14313" 7.75 NA "Q" "Miss" +"370" 370 "1" "1" "Aubart, Mme. Leontine Pauline" "female" 24 0 0 "PC 17477" 69.3 "B35" "C" "Mme" +"371" 371 "1" "1" "Harder, Mr. George Achilles" "male" 25 1 0 "11765" 55.4417 "E50" "C" "Mr" +"372" 372 "0" "3" "Wiklund, Mr. Jakob Alfred" "male" 18 1 0 "3101267" 6.4958 NA "S" "Mr" +"373" 373 "0" "3" "Beavan, Mr. William Thomas" "male" 19 0 0 "323951" 8.05 NA "S" "Mr" +"374" 374 "0" "1" "Ringhini, Mr. Sante" "male" 22 0 0 "PC 17760" 135.6333 NA "C" "Mr" +"375" 375 "0" "3" "Palsson, Miss. Stina Viola" "female" 3 3 1 "349909" 21.075 NA "S" "Miss" +"376" 376 "1" "1" "Meyer, Mrs. Edgar Joseph (Leila Saks)" "female" 35 1 0 "PC 17604" 82.1708 NA "C" "Mrs" +"377" 377 "1" "3" "Landergren, Miss. Aurora Adelia" "female" 22 0 0 "C 7077" 7.25 NA "S" "Miss" +"378" 378 "0" "1" "Widener, Mr. Harry Elkins" "male" 27 0 2 "113503" 211.5 "C82" "C" "Mr" +"379" 379 "0" "3" "Betros, Mr. Tannous" "male" 20 0 0 "2648" 4.0125 NA "C" "Mr" +"380" 380 "0" "3" "Gustafsson, Mr. 
Karl Gideon" "male" 19 0 0 "347069" 7.775 NA "S" "Mr" +"381" 381 "1" "1" "Bidois, Miss. Rosalie" "female" 42 0 0 "PC 17757" 227.525 NA "C" "Miss" +"382" 382 "1" "3" "Nakid, Miss. Maria (\"Mary\")" "female" 1 0 2 "2653" 15.7417 NA "C" "Miss" +"383" 383 "0" "3" "Tikkanen, Mr. Juho" "male" 32 0 0 "STON/O 2. 3101293" 7.925 NA "S" "Mr" +"384" 384 "1" "1" "Holverson, Mrs. Alexander Oskar (Mary Aline Towner)" "female" 35 1 0 "113789" 52 NA "S" "Mrs" +"385" 385 "0" "3" "Plotcharsky, Mr. Vasil" "male" 30 0 0 "349227" 7.8958 NA "S" "Mr" +"386" 386 "0" "2" "Davies, Mr. Charles Henry" "male" 18 0 0 "S.O.C. 14879" 73.5 NA "S" "Mr" +"387" 387 "0" "3" "Goodwin, Master. Sidney Leonard" "male" 1 5 2 "CA 2144" 46.9 NA "S" "Master" +"388" 388 "1" "2" "Buss, Miss. Kate" "female" 36 0 0 "27849" 13 NA "S" "Miss" +"389" 389 "0" "3" "Sadlier, Mr. Matthew" "male" 30 0 0 "367655" 7.7292 NA "Q" "Mr" +"390" 390 "1" "2" "Lehmann, Miss. Bertha" "female" 17 0 0 "SC 1748" 12 NA "C" "Miss" +"391" 391 "1" "1" "Carter, Mr. William Ernest" "male" 36 1 2 "113760" 120 "B96 B98" "S" "Mr" +"392" 392 "1" "3" "Jansson, Mr. Carl Olof" "male" 21 0 0 "350034" 7.7958 NA "S" "Mr" +"393" 393 "0" "3" "Gustafsson, Mr. Johan Birger" "male" 28 2 0 "3101277" 7.925 NA "S" "Mr" +"394" 394 "1" "1" "Newell, Miss. Marjorie" "female" 23 1 0 "35273" 113.275 "D36" "C" "Miss" +"395" 395 "1" "3" "Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)" "female" 24 0 2 "PP 9549" 16.7 "G6" "S" "Mrs" +"396" 396 "0" "3" "Johansson, Mr. Erik" "male" 22 0 0 "350052" 7.7958 NA "S" "Mr" +"397" 397 "0" "3" "Olsson, Miss. Elina" "female" 31 0 0 "350407" 7.8542 NA "S" "Miss" +"398" 398 "0" "2" "McKane, Mr. Peter David" "male" 46 0 0 "28403" 26 NA "S" "Mr" +"399" 399 "0" "2" "Pain, Dr. Alfred" "male" 23 0 0 "244278" 10.5 NA "S" "Dr" +"400" 400 "1" "2" "Trout, Mrs. William H (Jessie L)" "female" 28 0 0 "240929" 12.65 NA "S" "Mrs" +"401" 401 "1" "3" "Niskanen, Mr. Juha" "male" 39 0 0 "STON/O 2. 
3101289" 7.925 NA "S" "Mr" +"402" 402 "0" "3" "Adams, Mr. John" "male" 26 0 0 "341826" 8.05 NA "S" "Mr" +"403" 403 "0" "3" "Jussila, Miss. Mari Aina" "female" 21 1 0 "4137" 9.825 NA "S" "Miss" +"404" 404 "0" "3" "Hakkarainen, Mr. Pekka Pietari" "male" 28 1 0 "STON/O2. 3101279" 15.85 NA "S" "Mr" +"405" 405 "0" "3" "Oreskovic, Miss. Marija" "female" 20 0 0 "315096" 8.6625 NA "S" "Miss" +"406" 406 "0" "2" "Gale, Mr. Shadrach" "male" 34 1 0 "28664" 21 NA "S" "Mr" +"407" 407 "0" "3" "Widegren, Mr. Carl/Charles Peter" "male" 51 0 0 "347064" 7.75 NA "S" "Mr" +"408" 408 "1" "2" "Richards, Master. William Rowe" "male" 3 1 1 "29106" 18.75 NA "S" "Master" +"409" 409 "0" "3" "Birkeland, Mr. Hans Martin Monsen" "male" 21 0 0 "312992" 7.775 NA "S" "Mr" +"410" 410 "0" "3" "Lefebre, Miss. Ida" "female" 21 3 1 "4133" 25.4667 NA "S" "Miss" +"411" 411 "0" "3" "Sdycoff, Mr. Todor" "male" 30 0 0 "349222" 7.8958 NA "S" "Mr" +"412" 412 "0" "3" "Hart, Mr. Henry" "male" 30 0 0 "394140" 6.8583 NA "Q" "Mr" +"413" 413 "1" "1" "Minahan, Miss. Daisy E" "female" 33 1 0 "19928" 90 "C78" "Q" "Miss" +"414" 414 "0" "2" "Cunningham, Mr. Alfred Fleming" "male" 30 0 0 "239853" 15.0229 NA "S" "Mr" +"415" 415 "1" "3" "Sundman, Mr. Johan Julian" "male" 44 0 0 "STON/O 2. 3101269" 7.925 NA "S" "Mr" +"416" 416 "0" "3" "Meek, Mrs. Thomas (Annie Louise Rowley)" "female" 35 0 0 "343095" 8.05 NA "S" "Mrs" +"417" 417 "1" "2" "Drew, Mrs. James Vivian (Lulu Thorne Christian)" "female" 34 1 1 "28220" 32.5 NA "S" "Mrs" +"418" 418 "1" "2" "Silven, Miss. Lyyli Karoliina" "female" 18 0 2 "250652" 13 NA "S" "Miss" +"419" 419 "0" "2" "Matthews, Mr. William John" "male" 30 0 0 "28228" 13 NA "S" "Mr" +"420" 420 "0" "3" "Van Impe, Miss. Catharina" "female" 10 0 2 "345773" 24.15 NA "S" "Miss" +"421" 421 "0" "3" "Gheorgheff, Mr. Stanio" "male" 30 0 0 "349254" 7.8958 NA "C" "Mr" +"422" 422 "0" "3" "Charters, Mr. David" "male" 21 0 0 "A/5. 13032" 7.7333 NA "Q" "Mr" +"423" 423 "0" "3" "Zimmerman, Mr. 
Leo" "male" 29 0 0 "315082" 7.875 NA "S" "Mr" +"424" 424 "0" "3" "Danbom, Mrs. Ernst Gilbert (Anna Sigrid Maria Brogren)" "female" 28 1 1 "347080" 14.4 NA "S" "Mrs" +"425" 425 "0" "3" "Rosblom, Mr. Viktor Richard" "male" 18 1 1 "370129" 20.2125 NA "S" "Mr" +"426" 426 "0" "3" "Wiseman, Mr. Phillippe" "male" 30 0 0 "A/4. 34244" 7.25 NA "S" "Mr" +"427" 427 "1" "2" "Clarke, Mrs. Charles V (Ada Maria Winfield)" "female" 28 1 0 "2003" 26 NA "S" "Mrs" +"428" 428 "1" "2" "Phillips, Miss. Kate Florence (\"Mrs Kate Louise Phillips Marshall\")" "female" 19 0 0 "250655" 26 NA "S" "Miss" +"429" 429 "0" "3" "Flynn, Mr. James" "male" 30 0 0 "364851" 7.75 NA "Q" "Mr" +"430" 430 "1" "3" "Pickard, Mr. Berk (Berk Trembisky)" "male" 32 0 0 "SOTON/O.Q. 392078" 8.05 "E10" "S" "Mr" +"431" 431 "1" "1" "Bjornstrom-Steffansson, Mr. Mauritz Hakan" "male" 28 0 0 "110564" 26.55 "C52" "S" "Mr" +"432" 432 "1" "3" "Thorneycroft, Mrs. Percival (Florence Kate White)" "female" 35 1 0 "376564" 16.1 NA "S" "Mrs" +"433" 433 "1" "2" "Louch, Mrs. Charles Alexander (Alice Adelaide Slow)" "female" 42 1 0 "SC/AH 3085" 26 NA "S" "Mrs" +"434" 434 "0" "3" "Kallio, Mr. Nikolai Erland" "male" 17 0 0 "STON/O 2. 3101274" 7.125 NA "S" "Mr" +"435" 435 "0" "1" "Silvey, Mr. William Baird" "male" 50 1 0 "13507" 55.9 "E44" "S" "Mr" +"436" 436 "1" "1" "Carter, Miss. Lucile Polk" "female" 14 1 2 "113760" 120 "B96 B98" "S" "Miss" +"437" 437 "0" "3" "Ford, Miss. Doolina Margaret \"Daisy\"" "female" 21 2 2 "W./C. 6608" 34.375 NA "S" "Miss" +"438" 438 "1" "2" "Richards, Mrs. Sidney (Emily Hocking)" "female" 24 2 3 "29106" 18.75 NA "S" "Mrs" +"439" 439 "0" "1" "Fortune, Mr. Mark" "male" 64 1 4 "19950" 263 "C23 C25 C27" "S" "Mr" +"440" 440 "0" "2" "Kvillner, Mr. Johan Henrik Johannesson" "male" 31 0 0 "C.A. 18723" 10.5 NA "S" "Mr" +"441" 441 "1" "2" "Hart, Mrs. Benjamin (Esther Ada Bloomfield)" "female" 45 1 1 "F.C.C. 13529" 26.25 NA "S" "Mrs" +"442" 442 "0" "3" "Hampe, Mr. 
Leon" "male" 20 0 0 "345769" 9.5 NA "S" "Mr" +"443" 443 "0" "3" "Petterson, Mr. Johan Emil" "male" 25 1 0 "347076" 7.775 NA "S" "Mr" +"444" 444 "1" "2" "Reynaldo, Ms. Encarnacion" "female" 28 0 0 "230434" 13 NA "S" "Ms" +"445" 445 "1" "3" "Johannesen-Bratthammer, Mr. Bernt" "male" 30 0 0 "65306" 8.1125 NA "S" "Mr" +"446" 446 "1" "1" "Dodge, Master. Washington" "male" 4 0 2 "33638" 81.8583 "A34" "S" "Master" +"447" 447 "1" "2" "Mellinger, Miss. Madeleine Violet" "female" 13 0 1 "250644" 19.5 NA "S" "Miss" +"448" 448 "1" "1" "Seward, Mr. Frederic Kimber" "male" 34 0 0 "113794" 26.55 NA "S" "Mr" +"449" 449 "1" "3" "Baclini, Miss. Marie Catherine" "female" 5 2 1 "2666" 19.2583 NA "C" "Miss" +"450" 450 "1" "1" "Peuchen, Major. Arthur Godfrey" "male" 52 0 0 "113786" 30.5 "C104" "S" "Major" +"451" 451 "0" "2" "West, Mr. Edwy Arthur" "male" 36 1 2 "C.A. 34651" 27.75 NA "S" "Mr" +"452" 452 "0" "3" "Hagland, Mr. Ingvald Olai Olsen" "male" 30 1 0 "65303" 19.9667 NA "S" "Mr" +"453" 453 "0" "1" "Foreman, Mr. Benjamin Laventall" "male" 30 0 0 "113051" 27.75 "C111" "C" "Mr" +"454" 454 "1" "1" "Goldenberg, Mr. Samuel L" "male" 49 1 0 "17453" 89.1042 "C92" "C" "Mr" +"455" 455 "0" "3" "Peduzzi, Mr. Joseph" "male" 30 0 0 "A/5 2817" 8.05 NA "S" "Mr" +"456" 456 "1" "3" "Jalsevac, Mr. Ivan" "male" 29 0 0 "349240" 7.8958 NA "C" "Mr" +"457" 457 "0" "1" "Millet, Mr. Francis Davis" "male" 65 0 0 "13509" 26.55 "E38" "S" "Mr" +"458" 458 "1" "1" "Kenyon, Mrs. Frederick R (Marion)" "female" 35 1 0 "17464" 51.8625 "D21" "S" "Mrs" +"459" 459 "1" "2" "Toomey, Miss. Ellen" "female" 50 0 0 "F.C.C. 13531" 10.5 NA "S" "Miss" +"460" 460 "0" "3" "O'Connor, Mr. Maurice" "male" 30 0 0 "371060" 7.75 NA "Q" "Mr" +"461" 461 "1" "1" "Anderson, Mr. Harry" "male" 48 0 0 "19952" 26.55 "E12" "S" "Mr" +"462" 462 "0" "3" "Morley, Mr. William" "male" 34 0 0 "364506" 8.05 NA "S" "Mr" +"463" 463 "0" "1" "Gee, Mr. Arthur H" "male" 47 0 0 "111320" 38.5 "E63" "S" "Mr" +"464" 464 "0" "2" "Milling, Mr. 
Jacob Christian" "male" 48 0 0 "234360" 13 NA "S" "Mr" +"465" 465 "0" "3" "Maisner, Mr. Simon" "male" 30 0 0 "A/S 2816" 8.05 NA "S" "Mr" +"466" 466 "0" "3" "Goncalves, Mr. Manuel Estanslas" "male" 38 0 0 "SOTON/O.Q. 3101306" 7.05 NA "S" "Mr" +"467" 467 "0" "2" "Campbell, Mr. William" "male" 30 0 0 "239853" 15.0229 NA "S" "Mr" +"468" 468 "0" "1" "Smart, Mr. John Montgomery" "male" 56 0 0 "113792" 26.55 NA "S" "Mr" +"469" 469 "0" "3" "Scanlan, Mr. James" "male" 30 0 0 "36209" 7.725 NA "Q" "Mr" +"470" 470 "1" "3" "Baclini, Miss. Helene Barbara" "female" 0.75 2 1 "2666" 19.2583 NA "C" "Miss" +"471" 471 "0" "3" "Keefe, Mr. Arthur" "male" 30 0 0 "323592" 7.25 NA "S" "Mr" +"472" 472 "0" "3" "Cacic, Mr. Luka" "male" 38 0 0 "315089" 8.6625 NA "S" "Mr" +"473" 473 "1" "2" "West, Mrs. Edwy Arthur (Ada Mary Worth)" "female" 33 1 2 "C.A. 34651" 27.75 NA "S" "Mrs" +"474" 474 "1" "2" "Jerwan, Mrs. Amin S (Marie Marthe Thuillard)" "female" 23 0 0 "SC/AH Basle 541" 13.7917 "D" "C" "Mrs" +"475" 475 "0" "3" "Strandberg, Miss. Ida Sofia" "female" 22 0 0 "7553" 9.8375 NA "S" "Miss" +"476" 476 "0" "1" "Clifford, Mr. George Quincy" "male" 30 0 0 "110465" 52 "A14" "S" "Mr" +"477" 477 "0" "2" "Renouf, Mr. Peter Henry" "male" 34 1 0 "31027" 21 NA "S" "Mr" +"478" 478 "0" "3" "Braund, Mr. Lewis Richard" "male" 29 1 0 "3460" 7.0458 NA "S" "Mr" +"479" 479 "0" "3" "Karlsson, Mr. Nils August" "male" 22 0 0 "350060" 7.5208 NA "S" "Mr" +"480" 480 "1" "3" "Hirvonen, Miss. Hildur E" "female" 2 0 1 "3101298" 12.2875 NA "S" "Miss" +"481" 481 "0" "3" "Goodwin, Master. Harold Victor" "male" 9 5 2 "CA 2144" 46.9 NA "S" "Master" +"482" 482 "0" "2" "Frost, Mr. Anthony Wood \"Archie\"" "male" 30 0 0 "239854" 15.0229 NA "S" "Mr" +"483" 483 "0" "3" "Rouse, Mr. Richard Henry" "male" 50 0 0 "A/5 3594" 8.05 NA "S" "Mr" +"484" 484 "1" "3" "Turkula, Mrs. (Hedwig)" "female" 63 0 0 "4134" 9.5875 NA "S" "Mrs" +"485" 485 "1" "1" "Bishop, Mr. 
Dickinson H" "male" 25 1 0 "11967" 91.0792 "B49" "C" "Mr" +"486" 486 "0" "3" "Lefebre, Miss. Jeannie" "female" 21 3 1 "4133" 25.4667 NA "S" "Miss" +"487" 487 "1" "1" "Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)" "female" 35 1 0 "19943" 90 "C93" "S" "Mrs" +"488" 488 "0" "1" "Kent, Mr. Edward Austin" "male" 58 0 0 "11771" 29.7 "B37" "C" "Mr" +"489" 489 "0" "3" "Somerton, Mr. Francis William" "male" 30 0 0 "A.5. 18509" 8.05 NA "S" "Mr" +"490" 490 "1" "3" "Coutts, Master. Eden Leslie \"Neville\"" "male" 9 1 1 "C.A. 37671" 15.9 NA "S" "Master" +"491" 491 "0" "3" "Hagland, Mr. Konrad Mathias Reiersen" "male" 30 1 0 "65304" 19.9667 NA "S" "Mr" +"492" 492 "0" "3" "Windelov, Mr. Einar" "male" 21 0 0 "SOTON/OQ 3101317" 7.25 NA "S" "Mr" +"493" 493 "0" "1" "Molson, Mr. Harry Markland" "male" 55 0 0 "113787" 30.5 "C30" "S" "Mr" +"494" 494 "0" "1" "Artagaveytia, Mr. Ramon" "male" 71 0 0 "PC 17609" 49.5042 NA "C" "Mr" +"495" 495 "0" "3" "Stanley, Mr. Edward Roland" "male" 21 0 0 "A/4 45380" 8.05 NA "S" "Mr" +"496" 496 "0" "3" "Yousseff, Mr. Gerious" "male" 30 0 0 "2627" 14.4583 NA "C" "Mr" +"497" 497 "1" "1" "Eustis, Miss. Elizabeth Mussey" "female" 54 1 0 "36947" 78.2667 "D20" "C" "Miss" +"498" 498 "0" "3" "Shellard, Mr. Frederick William" "male" 30 0 0 "C.A. 6212" 15.1 NA "S" "Mr" +"499" 499 "0" "1" "Allison, Mrs. Hudson J C (Bessie Waldo Daniels)" "female" 25 1 2 "113781" 151.55 "C22 C26" "S" "Mrs" +"500" 500 "0" "3" "Svensson, Mr. Olof" "male" 24 0 0 "350035" 7.7958 NA "S" "Mr" +"501" 501 "0" "3" "Calic, Mr. Petar" "male" 17 0 0 "315086" 8.6625 NA "S" "Mr" +"502" 502 "0" "3" "Canavan, Miss. Mary" "female" 21 0 0 "364846" 7.75 NA "Q" "Miss" +"503" 503 "0" "3" "O'Sullivan, Miss. Bridget Mary" "female" 21 0 0 "330909" 7.6292 NA "Q" "Miss" +"504" 504 "0" "3" "Laitinen, Miss. Kristina Sofia" "female" 37 0 0 "4135" 9.5875 NA "S" "Miss" +"505" 505 "1" "1" "Maioni, Miss. Roberta" "female" 16 0 0 "110152" 86.5 "B79" "S" "Miss" +"506" 506 "0" "1" "Penasco y Castellana, Mr. 
Victor de Satode" "male" 18 1 0 "PC 17758" 108.9 "C65" "C" "Mr" +"507" 507 "1" "2" "Quick, Mrs. Frederick Charles (Jane Richards)" "female" 33 0 2 "26360" 26 NA "S" "Mrs" +"508" 508 "1" "1" "Bradley, Mr. George (\"George Arthur Brayton\")" "male" 30 0 0 "111427" 26.55 NA "S" "Mr" +"509" 509 "0" "3" "Olsen, Mr. Henry Margido" "male" 28 0 0 "C 4001" 22.525 NA "S" "Mr" +"510" 510 "1" "3" "Lang, Mr. Fang" "male" 26 0 0 "1601" 56.4958 NA "S" "Mr" +"511" 511 "1" "3" "Daly, Mr. Eugene Patrick" "male" 29 0 0 "382651" 7.75 NA "Q" "Mr" +"512" 512 "0" "3" "Webber, Mr. James" "male" 30 0 0 "SOTON/OQ 3101316" 8.05 NA "S" "Mr" +"513" 513 "1" "1" "McGough, Mr. James Robert" "male" 36 0 0 "PC 17473" 26.2875 "E25" "S" "Mr" +"514" 514 "1" "1" "Rothschild, Mrs. Martin (Elizabeth L. Barrett)" "female" 54 1 0 "PC 17603" 59.4 NA "C" "Mrs" +"515" 515 "0" "3" "Coleff, Mr. Satio" "male" 24 0 0 "349209" 7.4958 NA "S" "Mr" +"516" 516 "0" "1" "Walker, Mr. William Anderson" "male" 47 0 0 "36967" 34.0208 "D46" "S" "Mr" +"517" 517 "1" "2" "Lemore, Mrs. (Amelia Milley)" "female" 34 0 0 "C.A. 34260" 10.5 "F33" "S" "Mrs" +"518" 518 "0" "3" "Ryan, Mr. Patrick" "male" 30 0 0 "371110" 24.15 NA "Q" "Mr" +"519" 519 "1" "2" "Angle, Mrs. William A (Florence \"Mary\" Agnes Hughes)" "female" 36 1 0 "226875" 26 NA "S" "Mrs" +"520" 520 "0" "3" "Pavlovic, Mr. Stefo" "male" 32 0 0 "349242" 7.8958 NA "S" "Mr" +"521" 521 "1" "1" "Perreault, Miss. Anne" "female" 30 0 0 "12749" 93.5 "B73" "S" "Miss" +"522" 522 "0" "3" "Vovk, Mr. Janko" "male" 22 0 0 "349252" 7.8958 NA "S" "Mr" +"523" 523 "0" "3" "Lahoud, Mr. Sarkis" "male" 30 0 0 "2624" 7.225 NA "C" "Mr" +"524" 524 "1" "1" "Hippach, Mrs. Louis Albert (Ida Sophia Fischer)" "female" 44 0 1 "111361" 57.9792 "B18" "C" "Mrs" +"525" 525 "0" "3" "Kassem, Mr. Fared" "male" 30 0 0 "2700" 7.2292 NA "C" "Mr" +"526" 526 "0" "3" "Farrell, Mr. James" "male" 40.5 0 0 "367232" 7.75 NA "Q" "Mr" +"527" 527 "1" "2" "Ridsdale, Miss. Lucy" "female" 50 0 0 "W./C. 
14258" 10.5 NA "S" "Miss" +"528" 528 "0" "1" "Farthing, Mr. John" "male" 30 0 0 "PC 17483" 221.7792 "C95" "S" "Mr" +"529" 529 "0" "3" "Salonen, Mr. Johan Werner" "male" 39 0 0 "3101296" 7.925 NA "S" "Mr" +"530" 530 "0" "2" "Hocking, Mr. Richard George" "male" 23 2 1 "29104" 11.5 NA "S" "Mr" +"531" 531 "1" "2" "Quick, Miss. Phyllis May" "female" 2 1 1 "26360" 26 NA "S" "Miss" +"532" 532 "0" "3" "Toufik, Mr. Nakli" "male" 30 0 0 "2641" 7.2292 NA "C" "Mr" +"533" 533 "0" "3" "Elias, Mr. Joseph Jr" "male" 17 1 1 "2690" 7.2292 NA "C" "Mr" +"534" 534 "1" "3" "Peter, Mrs. Catherine (Catherine Rizk)" "female" 35 0 2 "2668" 22.3583 NA "C" "Mrs" +"535" 535 "0" "3" "Cacic, Miss. Marija" "female" 30 0 0 "315084" 8.6625 NA "S" "Miss" +"536" 536 "1" "2" "Hart, Miss. Eva Miriam" "female" 7 0 2 "F.C.C. 13529" 26.25 NA "S" "Miss" +"537" 537 "0" "1" "Butt, Major. Archibald Willingham" "male" 45 0 0 "113050" 26.55 "B38" "S" "Major" +"538" 538 "1" "1" "LeRoy, Miss. Bertha" "female" 30 0 0 "PC 17761" 106.425 NA "C" "Miss" +"539" 539 "0" "3" "Risien, Mr. Samuel Beard" "male" 30 0 0 "364498" 14.5 NA "S" "Mr" +"540" 540 "1" "1" "Frolicher, Miss. Hedwig Margaritha" "female" 22 0 2 "13568" 49.5 "B39" "C" "Miss" +"541" 541 "1" "1" "Crosby, Miss. Harriet R" "female" 36 0 2 "WE/P 5735" 71 "B22" "S" "Miss" +"542" 542 "0" "3" "Andersson, Miss. Ingeborg Constanzia" "female" 9 4 2 "347082" 31.275 NA "S" "Miss" +"543" 543 "0" "3" "Andersson, Miss. Sigrid Elisabeth" "female" 11 4 2 "347082" 31.275 NA "S" "Miss" +"544" 544 "1" "2" "Beane, Mr. Edward" "male" 32 1 0 "2908" 26 NA "S" "Mr" +"545" 545 "0" "1" "Douglas, Mr. Walter Donald" "male" 50 1 0 "PC 17761" 106.425 "C86" "C" "Mr" +"546" 546 "0" "1" "Nicholson, Mr. Arthur Ernest" "male" 64 0 0 "693" 26 NA "S" "Mr" +"547" 547 "1" "2" "Beane, Mrs. Edward (Ethel Clarke)" "female" 19 1 0 "2908" 26 NA "S" "Mrs" +"548" 548 "1" "2" "Padro y Manent, Mr. Julian" "male" 30 0 0 "SC/PARIS 2146" 13.8625 NA "C" "Mr" +"549" 549 "0" "3" "Goldsmith, Mr. 
Frank John" "male" 33 1 1 "363291" 20.525 NA "S" "Mr" +"550" 550 "1" "2" "Davies, Master. John Morgan Jr" "male" 8 1 1 "C.A. 33112" 36.75 NA "S" "Master" +"551" 551 "1" "1" "Thayer, Mr. John Borland Jr" "male" 17 0 2 "17421" 110.8833 "C70" "C" "Mr" +"552" 552 "0" "2" "Sharp, Mr. Percival James R" "male" 27 0 0 "244358" 26 NA "S" "Mr" +"553" 553 "0" "3" "O'Brien, Mr. Timothy" "male" 30 0 0 "330979" 7.8292 NA "Q" "Mr" +"554" 554 "1" "3" "Leeni, Mr. Fahim (\"Philip Zenni\")" "male" 22 0 0 "2620" 7.225 NA "C" "Mr" +"555" 555 "1" "3" "Ohman, Miss. Velin" "female" 22 0 0 "347085" 7.775 NA "S" "Miss" +"556" 556 "0" "1" "Wright, Mr. George" "male" 62 0 0 "113807" 26.55 NA "S" "Mr" +"557" 557 "1" "1" "Duff Gordon, Lady. (Lucille Christiana Sutherland) (\"Mrs Morgan\")" "female" 48 1 0 "11755" 39.6 "A16" "C" "Lady" +"558" 558 "0" "1" "Robbins, Mr. Victor" "male" 30 0 0 "PC 17757" 227.525 NA "C" "Mr" +"559" 559 "1" "1" "Taussig, Mrs. Emil (Tillie Mandelbaum)" "female" 39 1 1 "110413" 79.65 "E67" "S" "Mrs" +"560" 560 "1" "3" "de Messemaeker, Mrs. Guillaume Joseph (Emma)" "female" 36 1 0 "345572" 17.4 NA "S" "Mrs" +"561" 561 "0" "3" "Morrow, Mr. Thomas Rowan" "male" 30 0 0 "372622" 7.75 NA "Q" "Mr" +"562" 562 "0" "3" "Sivic, Mr. Husein" "male" 40 0 0 "349251" 7.8958 NA "S" "Mr" +"563" 563 "0" "2" "Norman, Mr. Robert Douglas" "male" 28 0 0 "218629" 13.5 NA "S" "Mr" +"564" 564 "0" "3" "Simmons, Mr. John" "male" 30 0 0 "SOTON/OQ 392082" 8.05 NA "S" "Mr" +"565" 565 "0" "3" "Meanwell, Miss. (Marion Ogden)" "female" 21 0 0 "SOTON/O.Q. 392087" 8.05 NA "S" "Miss" +"566" 566 "0" "3" "Davies, Mr. Alfred J" "male" 24 2 0 "A/4 48871" 24.15 NA "S" "Mr" +"567" 567 "0" "3" "Stoytcheff, Mr. Ilia" "male" 19 0 0 "349205" 7.8958 NA "S" "Mr" +"568" 568 "0" "3" "Palsson, Mrs. Nils (Alma Cornelia Berglund)" "female" 29 0 4 "349909" 21.075 NA "S" "Mrs" +"569" 569 "0" "3" "Doharr, Mr. Tannous" "male" 30 0 0 "2686" 7.2292 NA "C" "Mr" +"570" 570 "1" "3" "Jonsson, Mr. 
Carl" "male" 32 0 0 "350417" 7.8542 NA "S" "Mr" +"571" 571 "1" "2" "Harris, Mr. George" "male" 62 0 0 "S.W./PP 752" 10.5 NA "S" "Mr" +"572" 572 "1" "1" "Appleton, Mrs. Edward Dale (Charlotte Lamson)" "female" 53 2 0 "11769" 51.4792 "C101" "S" "Mrs" +"573" 573 "1" "1" "Flynn, Mr. John Irwin (\"Irving\")" "male" 36 0 0 "PC 17474" 26.3875 "E25" "S" "Mr" +"574" 574 "1" "3" "Kelly, Miss. Mary" "female" 21 0 0 "14312" 7.75 NA "Q" "Miss" +"575" 575 "0" "3" "Rush, Mr. Alfred George John" "male" 16 0 0 "A/4. 20589" 8.05 NA "S" "Mr" +"576" 576 "0" "3" "Patchett, Mr. George" "male" 19 0 0 "358585" 14.5 NA "S" "Mr" +"577" 577 "1" "2" "Garside, Miss. Ethel" "female" 34 0 0 "243880" 13 NA "S" "Miss" +"578" 578 "1" "1" "Silvey, Mrs. William Baird (Alice Munger)" "female" 39 1 0 "13507" 55.9 "E44" "S" "Mrs" +"579" 579 "0" "3" "Caram, Mrs. Joseph (Maria Elias)" "female" 35 1 0 "2689" 14.4583 NA "C" "Mrs" +"580" 580 "1" "3" "Jussila, Mr. Eiriik" "male" 32 0 0 "STON/O 2. 3101286" 7.925 NA "S" "Mr" +"581" 581 "1" "2" "Christy, Miss. Julie Rachel" "female" 25 1 1 "237789" 30 NA "S" "Miss" +"582" 582 "1" "1" "Thayer, Mrs. John Borland (Marian Longstreth Morris)" "female" 39 1 1 "17421" 110.8833 "C68" "C" "Mrs" +"583" 583 "0" "2" "Downton, Mr. William James" "male" 54 0 0 "28403" 26 NA "S" "Mr" +"584" 584 "0" "1" "Ross, Mr. John Hugo" "male" 36 0 0 "13049" 40.125 "A10" "C" "Mr" +"585" 585 "0" "3" "Paulner, Mr. Uscher" "male" 30 0 0 "3411" 8.7125 NA "C" "Mr" +"586" 586 "1" "1" "Taussig, Miss. Ruth" "female" 18 0 2 "110413" 79.65 "E68" "S" "Miss" +"587" 587 "0" "2" "Jarvis, Mr. John Denzil" "male" 47 0 0 "237565" 15 NA "S" "Mr" +"588" 588 "1" "1" "Frolicher-Stehli, Mr. Maxmillian" "male" 60 1 1 "13567" 79.2 "B41" "C" "Mr" +"589" 589 "0" "3" "Gilinski, Mr. Eliezer" "male" 22 0 0 "14973" 8.05 NA "S" "Mr" +"590" 590 "0" "3" "Murdlin, Mr. Joseph" "male" 30 0 0 "A./5. 3235" 8.05 NA "S" "Mr" +"591" 591 "0" "3" "Rintamaki, Mr. Matti" "male" 35 0 0 "STON/O 2. 
3101273" 7.125 NA "S" "Mr" +"592" 592 "1" "1" "Stephenson, Mrs. Walter Bertram (Martha Eustis)" "female" 52 1 0 "36947" 78.2667 "D20" "C" "Mrs" +"593" 593 "0" "3" "Elsbury, Mr. William James" "male" 47 0 0 "A/5 3902" 7.25 NA "S" "Mr" +"594" 594 "0" "3" "Bourke, Miss. Mary" "female" 21 0 2 "364848" 7.75 NA "Q" "Miss" +"595" 595 "0" "2" "Chapman, Mr. John Henry" "male" 37 1 0 "SC/AH 29037" 26 NA "S" "Mr" +"596" 596 "0" "3" "Van Impe, Mr. Jean Baptiste" "male" 36 1 1 "345773" 24.15 NA "S" "Mr" +"597" 597 "1" "2" "Leitch, Miss. Jessie Wills" "female" 21 0 0 "248727" 33 NA "S" "Miss" +"598" 598 "0" "3" "Johnson, Mr. Alfred" "male" 49 0 0 "LINE" 8.05 NA "S" "Mr" +"599" 599 "0" "3" "Boulos, Mr. Hanna" "male" 30 0 0 "2664" 7.225 NA "C" "Mr" +"600" 600 "1" "1" "Duff Gordon, Sir. Cosmo Edmund (\"Mr Morgan\")" "male" 49 1 0 "PC 17485" 56.9292 "A20" "C" "Sir" +"601" 601 "1" "2" "Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)" "female" 24 2 1 "243847" 27 NA "S" "Mrs" +"602" 602 "0" "3" "Slabenoff, Mr. Petco" "male" 30 0 0 "349214" 7.8958 NA "S" "Mr" +"603" 603 "0" "1" "Harrington, Mr. Charles H" "male" 30 0 0 "113796" 42.4 NA "S" "Mr" +"604" 604 "0" "3" "Torber, Mr. Ernst William" "male" 44 0 0 "364511" 8.05 NA "S" "Mr" +"605" 605 "1" "1" "Homer, Mr. Harry (\"Mr E Haven\")" "male" 35 0 0 "111426" 26.55 NA "C" "Mr" +"606" 606 "0" "3" "Lindell, Mr. Edvard Bengtsson" "male" 36 1 0 "349910" 15.55 NA "S" "Mr" +"607" 607 "0" "3" "Karaic, Mr. Milan" "male" 30 0 0 "349246" 7.8958 NA "S" "Mr" +"608" 608 "1" "1" "Daniel, Mr. Robert Williams" "male" 27 0 0 "113804" 30.5 NA "S" "Mr" +"609" 609 "1" "2" "Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)" "female" 22 1 2 "SC/Paris 2123" 41.5792 NA "C" "Mrs" +"610" 610 "1" "1" "Shutes, Miss. Elizabeth W" "female" 40 0 0 "PC 17582" 153.4625 "C125" "S" "Miss" +"611" 611 "0" "3" "Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)" "female" 39 1 5 "347082" 31.275 NA "S" "Mrs" +"612" 612 "0" "3" "Jardin, Mr. 
Jose Neto" "male" 30 0 0 "SOTON/O.Q. 3101305" 7.05 NA "S" "Mr" +"613" 613 "1" "3" "Murphy, Miss. Margaret Jane" "female" 21 1 0 "367230" 15.5 NA "Q" "Miss" +"614" 614 "0" "3" "Horgan, Mr. John" "male" 30 0 0 "370377" 7.75 NA "Q" "Mr" +"615" 615 "0" "3" "Brocklebank, Mr. William Alfred" "male" 35 0 0 "364512" 8.05 NA "S" "Mr" +"616" 616 "1" "2" "Herman, Miss. Alice" "female" 24 1 2 "220845" 65 NA "S" "Miss" +"617" 617 "0" "3" "Danbom, Mr. Ernst Gilbert" "male" 34 1 1 "347080" 14.4 NA "S" "Mr" +"618" 618 "0" "3" "Lobb, Mrs. William Arthur (Cordelia K Stanlick)" "female" 26 1 0 "A/5. 3336" 16.1 NA "S" "Mrs" +"619" 619 "1" "2" "Becker, Miss. Marion Louise" "female" 4 2 1 "230136" 39 "F4" "S" "Miss" +"620" 620 "0" "2" "Gavey, Mr. Lawrence" "male" 26 0 0 "31028" 10.5 NA "S" "Mr" +"621" 621 "0" "3" "Yasbeck, Mr. Antoni" "male" 27 1 0 "2659" 14.4542 NA "C" "Mr" +"622" 622 "1" "1" "Kimball, Mr. Edwin Nelson Jr" "male" 42 1 0 "11753" 52.5542 "D19" "S" "Mr" +"623" 623 "1" "3" "Nakid, Mr. Sahid" "male" 20 1 1 "2653" 15.7417 NA "C" "Mr" +"624" 624 "0" "3" "Hansen, Mr. Henry Damsgaard" "male" 21 0 0 "350029" 7.8542 NA "S" "Mr" +"625" 625 "0" "3" "Bowen, Mr. David John \"Dai\"" "male" 21 0 0 "54636" 16.1 NA "S" "Mr" +"626" 626 "0" "1" "Sutton, Mr. Frederick" "male" 61 0 0 "36963" 32.3208 "D50" "S" "Mr" +"627" 627 "0" "2" "Kirkland, Rev. Charles Leonard" "male" 57 0 0 "219533" 12.35 NA "Q" "Rev" +"628" 628 "1" "1" "Longley, Miss. Gretchen Fiske" "female" 21 0 0 "13502" 77.9583 "D9" "S" "Miss" +"629" 629 "0" "3" "Bostandyeff, Mr. Guentcho" "male" 26 0 0 "349224" 7.8958 NA "S" "Mr" +"630" 630 "0" "3" "O'Connell, Mr. Patrick D" "male" 30 0 0 "334912" 7.7333 NA "Q" "Mr" +"631" 631 "1" "1" "Barkworth, Mr. Algernon Henry Wilson" "male" 80 0 0 "27042" 30 "A23" "S" "Mr" +"632" 632 "0" "3" "Lundahl, Mr. Johan Svensson" "male" 51 0 0 "347743" 7.0542 NA "S" "Mr" +"633" 633 "1" "1" "Stahelin-Maeglin, Dr. Max" "male" 32 0 0 "13214" 30.5 "B50" "C" "Dr" +"634" 634 "0" "1" "Parr, Mr. 
William Henry Marsh" "male" 30 0 0 "112052" 61.9792 NA "S" "Mr" +"635" 635 "0" "3" "Skoog, Miss. Mabel" "female" 9 3 2 "347088" 27.9 NA "S" "Miss" +"636" 636 "1" "2" "Davis, Miss. Mary" "female" 28 0 0 "237668" 13 NA "S" "Miss" +"637" 637 "0" "3" "Leinonen, Mr. Antti Gustaf" "male" 32 0 0 "STON/O 2. 3101292" 7.925 NA "S" "Mr" +"638" 638 "0" "2" "Collyer, Mr. Harvey" "male" 31 1 1 "C.A. 31921" 26.25 NA "S" "Mr" +"639" 639 "0" "3" "Panula, Mrs. Juha (Maria Emilia Ojala)" "female" 41 0 5 "3101295" 39.6875 NA "S" "Mrs" +"640" 640 "0" "3" "Thorneycroft, Mr. Percival" "male" 30 1 0 "376564" 16.1 NA "S" "Mr" +"641" 641 "0" "3" "Jensen, Mr. Hans Peder" "male" 20 0 0 "350050" 7.8542 NA "S" "Mr" +"642" 642 "1" "1" "Sagesser, Mlle. Emma" "female" 24 0 0 "PC 17477" 69.3 "B35" "C" "Mlle" +"643" 643 "0" "3" "Skoog, Miss. Margit Elizabeth" "female" 2 3 2 "347088" 27.9 NA "S" "Miss" +"644" 644 "1" "3" "Foo, Mr. Choong" "male" 30 0 0 "1601" 56.4958 NA "S" "Mr" +"645" 645 "1" "3" "Baclini, Miss. Eugenie" "female" 0.75 2 1 "2666" 19.2583 NA "C" "Miss" +"646" 646 "1" "1" "Harper, Mr. Henry Sleeper" "male" 48 1 0 "PC 17572" 76.7292 "D33" "C" "Mr" +"647" 647 "0" "3" "Cor, Mr. Liudevit" "male" 19 0 0 "349231" 7.8958 NA "S" "Mr" +"648" 648 "1" "1" "Simonius-Blumer, Col. Oberst Alfons" "male" 56 0 0 "13213" 35.5 "A26" "C" "Col" +"649" 649 "0" "3" "Willey, Mr. Edward" "male" 30 0 0 "S.O./P.P. 751" 7.55 NA "S" "Mr" +"650" 650 "1" "3" "Stanley, Miss. Amy Zillah Elsie" "female" 23 0 0 "CA. 2314" 7.55 NA "S" "Miss" +"651" 651 "0" "3" "Mitkoff, Mr. Mito" "male" 30 0 0 "349221" 7.8958 NA "S" "Mr" +"652" 652 "1" "2" "Doling, Miss. Elsie" "female" 18 0 1 "231919" 23 NA "S" "Miss" +"653" 653 "0" "3" "Kalvik, Mr. Johannes Halvorsen" "male" 21 0 0 "8475" 8.4333 NA "S" "Mr" +"654" 654 "1" "3" "O'Leary, Miss. Hanora \"Norah\"" "female" 21 0 0 "330919" 7.8292 NA "Q" "Miss" +"655" 655 "0" "3" "Hegarty, Miss. Hanora \"Nora\"" "female" 18 0 0 "365226" 6.75 NA "Q" "Miss" +"656" 656 "0" "2" "Hickman, Mr. 
Leonard Mark" "male" 24 2 0 "S.O.C. 14879" 73.5 NA "S" "Mr" +"657" 657 "0" "3" "Radeff, Mr. Alexander" "male" 30 0 0 "349223" 7.8958 NA "S" "Mr" +"658" 658 "0" "3" "Bourke, Mrs. John (Catherine)" "female" 32 1 1 "364849" 15.5 NA "Q" "Mrs" +"659" 659 "0" "2" "Eitemiller, Mr. George Floyd" "male" 23 0 0 "29751" 13 NA "S" "Mr" +"660" 660 "0" "1" "Newell, Mr. Arthur Webster" "male" 58 0 2 "35273" 113.275 "D48" "C" "Mr" +"661" 661 "1" "1" "Frauenthal, Dr. Henry William" "male" 50 2 0 "PC 17611" 133.65 NA "S" "Dr" +"662" 662 "0" "3" "Badt, Mr. Mohamed" "male" 40 0 0 "2623" 7.225 NA "C" "Mr" +"663" 663 "0" "1" "Colley, Mr. Edward Pomeroy" "male" 47 0 0 "5727" 25.5875 "E58" "S" "Mr" +"664" 664 "0" "3" "Coleff, Mr. Peju" "male" 36 0 0 "349210" 7.4958 NA "S" "Mr" +"665" 665 "1" "3" "Lindqvist, Mr. Eino William" "male" 20 1 0 "STON/O 2. 3101285" 7.925 NA "S" "Mr" +"666" 666 "0" "2" "Hickman, Mr. Lewis" "male" 32 2 0 "S.O.C. 14879" 73.5 NA "S" "Mr" +"667" 667 "0" "2" "Butler, Mr. Reginald Fenton" "male" 25 0 0 "234686" 13 NA "S" "Mr" +"668" 668 "0" "3" "Rommetvedt, Mr. Knud Paust" "male" 30 0 0 "312993" 7.775 NA "S" "Mr" +"669" 669 "0" "3" "Cook, Mr. Jacob" "male" 43 0 0 "A/5 3536" 8.05 NA "S" "Mr" +"670" 670 "1" "1" "Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)" "female" 35 1 0 "19996" 52 "C126" "S" "Mrs" +"671" 671 "1" "2" "Brown, Mrs. Thomas William Solomon (Elizabeth Catherine Ford)" "female" 40 1 1 "29750" 39 NA "S" "Mrs" +"672" 672 "0" "1" "Davidson, Mr. Thornton" "male" 31 1 0 "F.C. 12750" 52 "B71" "S" "Mr" +"673" 673 "0" "2" "Mitchell, Mr. Henry Michael" "male" 70 0 0 "C.A. 24580" 10.5 NA "S" "Mr" +"674" 674 "1" "2" "Wilhelms, Mr. Charles" "male" 31 0 0 "244270" 13 NA "S" "Mr" +"675" 675 "0" "2" "Watson, Mr. Ennis Hastings" "male" 30 0 0 "239856" 15.0229 NA "S" "Mr" +"676" 676 "0" "3" "Edvardsson, Mr. Gustaf Hjalmar" "male" 18 0 0 "349912" 7.775 NA "S" "Mr" +"677" 677 "0" "3" "Sawyer, Mr. 
Frederick Charles" "male" 24.5 0 0 "342826" 8.05 NA "S" "Mr" +"678" 678 "1" "3" "Turja, Miss. Anna Sofia" "female" 18 0 0 "4138" 9.8417 NA "S" "Miss" +"679" 679 "0" "3" "Goodwin, Mrs. Frederick (Augusta Tyler)" "female" 43 1 6 "CA 2144" 46.9 NA "S" "Mrs" +"680" 680 "1" "1" "Cardeza, Mr. Thomas Drake Martinez" "male" 36 0 1 "PC 17755" 512.3292 "B51 B53 B55" "C" "Mr" +"681" 681 "0" "3" "Peters, Miss. Katie" "female" 21 0 0 "330935" 8.1375 NA "Q" "Miss" +"682" 682 "1" "1" "Hassab, Mr. Hammad" "male" 27 0 0 "PC 17572" 76.7292 "D49" "C" "Mr" +"683" 683 "0" "3" "Olsvigen, Mr. Thor Anderson" "male" 20 0 0 "6563" 9.225 NA "S" "Mr" +"684" 684 "0" "3" "Goodwin, Mr. Charles Edward" "male" 14 5 2 "CA 2144" 46.9 NA "S" "Mr" +"685" 685 "0" "2" "Brown, Mr. Thomas William Solomon" "male" 60 1 1 "29750" 39 NA "S" "Mr" +"686" 686 "0" "2" "Laroche, Mr. Joseph Philippe Lemercier" "male" 25 1 2 "SC/Paris 2123" 41.5792 NA "C" "Mr" +"687" 687 "0" "3" "Panula, Mr. Jaako Arnold" "male" 14 4 1 "3101295" 39.6875 NA "S" "Mr" +"688" 688 "0" "3" "Dakic, Mr. Branko" "male" 19 0 0 "349228" 10.1708 NA "S" "Mr" +"689" 689 "0" "3" "Fischer, Mr. Eberhard Thelander" "male" 18 0 0 "350036" 7.7958 NA "S" "Mr" +"690" 690 "1" "1" "Madill, Miss. Georgette Alexandra" "female" 15 0 1 "24160" 211.3375 "B5" "S" "Miss" +"691" 691 "1" "1" "Dick, Mr. Albert Adrian" "male" 31 1 0 "17474" 57 "B20" "S" "Mr" +"692" 692 "1" "3" "Karun, Miss. Manca" "female" 4 0 1 "349256" 13.4167 NA "C" "Miss" +"693" 693 "1" "3" "Lam, Mr. Ali" "male" 30 0 0 "1601" 56.4958 NA "S" "Mr" +"694" 694 "0" "3" "Saad, Mr. Khalil" "male" 25 0 0 "2672" 7.225 NA "C" "Mr" +"695" 695 "0" "1" "Weir, Col. John" "male" 60 0 0 "113800" 26.55 NA "S" "Col" +"696" 696 "0" "2" "Chapman, Mr. Charles Henry" "male" 52 0 0 "248731" 13.5 NA "S" "Mr" +"697" 697 "0" "3" "Kelly, Mr. James" "male" 44 0 0 "363592" 8.05 NA "S" "Mr" +"698" 698 "1" "3" "Mullens, Miss. Katherine \"Katie\"" "female" 21 0 0 "35852" 7.7333 NA "Q" "Miss" +"699" 699 "0" "1" "Thayer, Mr. 
John Borland" "male" 49 1 1 "17421" 110.8833 "C68" "C" "Mr" +"700" 700 "0" "3" "Humblen, Mr. Adolf Mathias Nicolai Olsen" "male" 42 0 0 "348121" 7.65 "F G63" "S" "Mr" +"701" 701 "1" "1" "Astor, Mrs. John Jacob (Madeleine Talmadge Force)" "female" 18 1 0 "PC 17757" 227.525 "C62 C64" "C" "Mrs" +"702" 702 "1" "1" "Silverthorne, Mr. Spencer Victor" "male" 35 0 0 "PC 17475" 26.2875 "E24" "S" "Mr" +"703" 703 "0" "3" "Barbara, Miss. Saiide" "female" 18 0 1 "2691" 14.4542 NA "C" "Miss" +"704" 704 "0" "3" "Gallagher, Mr. Martin" "male" 25 0 0 "36864" 7.7417 NA "Q" "Mr" +"705" 705 "0" "3" "Hansen, Mr. Henrik Juul" "male" 26 1 0 "350025" 7.8542 NA "S" "Mr" +"706" 706 "0" "2" "Morley, Mr. Henry Samuel (\"Mr Henry Marshall\")" "male" 39 0 0 "250655" 26 NA "S" "Mr" +"707" 707 "1" "2" "Kelly, Mrs. Florence \"Fannie\"" "female" 45 0 0 "223596" 13.5 NA "S" "Mrs" +"708" 708 "1" "1" "Calderhead, Mr. Edward Pennington" "male" 42 0 0 "PC 17476" 26.2875 "E24" "S" "Mr" +"709" 709 "1" "1" "Cleaver, Miss. Alice" "female" 22 0 0 "113781" 151.55 NA "S" "Miss" +"710" 710 "1" "3" "Moubarek, Master. Halim Gonios (\"William George\")" "male" 3.5 1 1 "2661" 15.2458 NA "C" "Master" +"711" 711 "1" "1" "Mayne, Mlle. Berthe Antonine (\"Mrs de Villiers\")" "female" 24 0 0 "PC 17482" 49.5042 "C90" "C" "Mlle" +"712" 712 "0" "1" "Klaber, Mr. Herman" "male" 30 0 0 "113028" 26.55 "C124" "S" "Mr" +"713" 713 "1" "1" "Taylor, Mr. Elmer Zebley" "male" 48 1 0 "19996" 52 "C126" "S" "Mr" +"714" 714 "0" "3" "Larsson, Mr. August Viktor" "male" 29 0 0 "7545" 9.4833 NA "S" "Mr" +"715" 715 "0" "2" "Greenberg, Mr. Samuel" "male" 52 0 0 "250647" 13 NA "S" "Mr" +"716" 716 "0" "3" "Soholt, Mr. Peter Andreas Lauritz Andersen" "male" 19 0 0 "348124" 7.65 "F G73" "S" "Mr" +"717" 717 "1" "1" "Endres, Miss. Caroline Louise" "female" 38 0 0 "PC 17757" 227.525 "C45" "C" "Miss" +"718" 718 "1" "2" "Troutt, Miss. Edwina Celia \"Winnie\"" "female" 27 0 0 "34218" 10.5 "E101" "S" "Miss" +"719" 719 "0" "3" "McEvoy, Mr. 
Michael" "male" 30 0 0 "36568" 15.5 NA "Q" "Mr" +"720" 720 "0" "3" "Johnson, Mr. Malkolm Joackim" "male" 33 0 0 "347062" 7.775 NA "S" "Mr" +"721" 721 "1" "2" "Harper, Miss. Annie Jessie \"Nina\"" "female" 6 0 1 "248727" 33 NA "S" "Miss" +"722" 722 "0" "3" "Jensen, Mr. Svend Lauritz" "male" 17 1 0 "350048" 7.0542 NA "S" "Mr" +"723" 723 "0" "2" "Gillespie, Mr. William Henry" "male" 34 0 0 "12233" 13 NA "S" "Mr" +"724" 724 "0" "2" "Hodges, Mr. Henry Price" "male" 50 0 0 "250643" 13 NA "S" "Mr" +"725" 725 "1" "1" "Chambers, Mr. Norman Campbell" "male" 27 1 0 "113806" 53.1 "E8" "S" "Mr" +"726" 726 "0" "3" "Oreskovic, Mr. Luka" "male" 20 0 0 "315094" 8.6625 NA "S" "Mr" +"727" 727 "1" "2" "Renouf, Mrs. Peter Henry (Lillian Jefferys)" "female" 30 3 0 "31027" 21 NA "S" "Mrs" +"728" 728 "1" "3" "Mannion, Miss. Margareth" "female" 21 0 0 "36866" 7.7375 NA "Q" "Miss" +"729" 729 "0" "2" "Bryhl, Mr. Kurt Arnold Gottfrid" "male" 25 1 0 "236853" 26 NA "S" "Mr" +"730" 730 "0" "3" "Ilmakangas, Miss. Pieta Sofia" "female" 25 1 0 "STON/O2. 3101271" 7.925 NA "S" "Miss" +"731" 731 "1" "1" "Allen, Miss. Elisabeth Walton" "female" 29 0 0 "24160" 211.3375 "B5" "S" "Miss" +"732" 732 "0" "3" "Hassan, Mr. Houssein G N" "male" 11 0 0 "2699" 18.7875 NA "C" "Mr" +"733" 733 "0" "2" "Knight, Mr. Robert J" "male" 30 0 0 "239855" 15.0229 NA "S" "Mr" +"734" 734 "0" "2" "Berriman, Mr. William John" "male" 23 0 0 "28425" 13 NA "S" "Mr" +"735" 735 "0" "2" "Troupiansky, Mr. Moses Aaron" "male" 23 0 0 "233639" 13 NA "S" "Mr" +"736" 736 "0" "3" "Williams, Mr. Leslie" "male" 28.5 0 0 "54636" 16.1 NA "S" "Mr" +"737" 737 "0" "3" "Ford, Mrs. Edward (Margaret Ann Watson)" "female" 48 1 3 "W./C. 6608" 34.375 NA "S" "Mrs" +"738" 738 "1" "1" "Lesurer, Mr. Gustave J" "male" 35 0 0 "PC 17755" 512.3292 "B101" "C" "Mr" +"739" 739 "0" "3" "Ivanoff, Mr. Kanio" "male" 30 0 0 "349201" 7.8958 NA "S" "Mr" +"740" 740 "0" "3" "Nankoff, Mr. Minko" "male" 30 0 0 "349218" 7.8958 NA "S" "Mr" +"741" 741 "1" "1" "Hawksford, Mr. 
Walter James" "male" 30 0 0 "16988" 30 "D45" "S" "Mr" +"742" 742 "0" "1" "Cavendish, Mr. Tyrell William" "male" 36 1 0 "19877" 78.85 "C46" "S" "Mr" +"743" 743 "1" "1" "Ryerson, Miss. Susan Parker \"Suzette\"" "female" 21 2 2 "PC 17608" 262.375 "B57 B59 B63 B66" "C" "Miss" +"744" 744 "0" "3" "McNamee, Mr. Neal" "male" 24 1 0 "376566" 16.1 NA "S" "Mr" +"745" 745 "1" "3" "Stranden, Mr. Juho" "male" 31 0 0 "STON/O 2. 3101288" 7.925 NA "S" "Mr" +"746" 746 "0" "1" "Crosby, Capt. Edward Gifford" "male" 70 1 1 "WE/P 5735" 71 "B22" "S" "Capt" +"747" 747 "0" "3" "Abbott, Mr. Rossmore Edward" "male" 16 1 1 "C.A. 2673" 20.25 NA "S" "Mr" +"748" 748 "1" "2" "Sinkkonen, Miss. Anna" "female" 30 0 0 "250648" 13 NA "S" "Miss" +"749" 749 "0" "1" "Marvin, Mr. Daniel Warner" "male" 19 1 0 "113773" 53.1 "D30" "S" "Mr" +"750" 750 "0" "3" "Connaghton, Mr. Michael" "male" 31 0 0 "335097" 7.75 NA "Q" "Mr" +"751" 751 "1" "2" "Wells, Miss. Joan" "female" 4 1 1 "29103" 23 NA "S" "Miss" +"752" 752 "1" "3" "Moor, Master. Meier" "male" 6 0 1 "392096" 12.475 "E121" "S" "Master" +"753" 753 "0" "3" "Vande Velde, Mr. Johannes Joseph" "male" 33 0 0 "345780" 9.5 NA "S" "Mr" +"754" 754 "0" "3" "Jonkoff, Mr. Lalio" "male" 23 0 0 "349204" 7.8958 NA "S" "Mr" +"755" 755 "1" "2" "Herman, Mrs. Samuel (Jane Laver)" "female" 48 1 2 "220845" 65 NA "S" "Mrs" +"756" 756 "1" "2" "Hamalainen, Master. Viljo" "male" 0.67 1 1 "250649" 14.5 NA "S" "Master" +"757" 757 "0" "3" "Carlsson, Mr. August Sigfrid" "male" 28 0 0 "350042" 7.7958 NA "S" "Mr" +"758" 758 "0" "2" "Bailey, Mr. Percy Andrew" "male" 18 0 0 "29108" 11.5 NA "S" "Mr" +"759" 759 "0" "3" "Theobald, Mr. Thomas Leonard" "male" 34 0 0 "363294" 8.05 NA "S" "Mr" +"760" 760 "1" "1" "Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)" "female" 33 0 0 "110152" 86.5 "B77" "S" "the Countess" +"761" 761 "0" "3" "Garfirth, Mr. John" "male" 30 0 0 "358585" 14.5 NA "S" "Mr" +"762" 762 "0" "3" "Nirva, Mr. 
Iisakki Antino Aijo" "male" 41 0 0 "SOTON/O2 3101272" 7.125 NA "S" "Mr" +"763" 763 "1" "3" "Barah, Mr. Hanna Assi" "male" 20 0 0 "2663" 7.2292 NA "C" "Mr" +"764" 764 "1" "1" "Carter, Mrs. William Ernest (Lucile Polk)" "female" 36 1 2 "113760" 120 "B96 B98" "S" "Mrs" +"765" 765 "0" "3" "Eklund, Mr. Hans Linus" "male" 16 0 0 "347074" 7.775 NA "S" "Mr" +"766" 766 "1" "1" "Hogeboom, Mrs. John C (Anna Andrews)" "female" 51 1 0 "13502" 77.9583 "D11" "S" "Mrs" +"767" 767 "0" "1" "Brewe, Dr. Arthur Jackson" "male" 46.5 0 0 "112379" 39.6 NA "C" "Dr" +"768" 768 "0" "3" "Mangan, Miss. Mary" "female" 30.5 0 0 "364850" 7.75 NA "Q" "Miss" +"769" 769 "0" "3" "Moran, Mr. Daniel J" "male" 30 1 0 "371110" 24.15 NA "Q" "Mr" +"770" 770 "0" "3" "Gronnestad, Mr. Daniel Danielsen" "male" 32 0 0 "8471" 8.3625 NA "S" "Mr" +"771" 771 "0" "3" "Lievens, Mr. Rene Aime" "male" 24 0 0 "345781" 9.5 NA "S" "Mr" +"772" 772 "0" "3" "Jensen, Mr. Niels Peder" "male" 48 0 0 "350047" 7.8542 NA "S" "Mr" +"773" 773 "0" "2" "Mack, Mrs. (Mary)" "female" 57 0 0 "S.O./P.P. 3" 10.5 "E77" "S" "Mrs" +"774" 774 "0" "3" "Elias, Mr. Dibo" "male" 30 0 0 "2674" 7.225 NA "C" "Mr" +"775" 775 "1" "2" "Hocking, Mrs. Elizabeth (Eliza Needs)" "female" 54 1 3 "29105" 23 NA "S" "Mrs" +"776" 776 "0" "3" "Myhrman, Mr. Pehr Fabian Oliver Malkolm" "male" 18 0 0 "347078" 7.75 NA "S" "Mr" +"777" 777 "0" "3" "Tobin, Mr. Roger" "male" 30 0 0 "383121" 7.75 "F38" "Q" "Mr" +"778" 778 "1" "3" "Emanuel, Miss. Virginia Ethel" "female" 5 0 0 "364516" 12.475 NA "S" "Miss" +"779" 779 "0" "3" "Kilgannon, Mr. Thomas J" "male" 30 0 0 "36865" 7.7375 NA "Q" "Mr" +"780" 780 "1" "1" "Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)" "female" 43 0 1 "24160" 211.3375 "B3" "S" "Mrs" +"781" 781 "1" "3" "Ayoub, Miss. Banoura" "female" 13 0 0 "2687" 7.2292 NA "C" "Miss" +"782" 782 "1" "1" "Dick, Mrs. Albert Adrian (Vera Gillespie)" "female" 17 1 0 "17474" 57 "B20" "S" "Mrs" +"783" 783 "0" "1" "Long, Mr. 
Milton Clyde" "male" 29 0 0 "113501" 30 "D6" "S" "Mr" +"784" 784 "0" "3" "Johnston, Mr. Andrew G" "male" 30 1 2 "W./C. 6607" 23.45 NA "S" "Mr" +"785" 785 "0" "3" "Ali, Mr. William" "male" 25 0 0 "SOTON/O.Q. 3101312" 7.05 NA "S" "Mr" +"786" 786 "0" "3" "Harmer, Mr. Abraham (David Lishin)" "male" 25 0 0 "374887" 7.25 NA "S" "Mr" +"787" 787 "1" "3" "Sjoblom, Miss. Anna Sofia" "female" 18 0 0 "3101265" 7.4958 NA "S" "Miss" +"788" 788 "0" "3" "Rice, Master. George Hugh" "male" 8 4 1 "382652" 29.125 NA "Q" "Master" +"789" 789 "1" "3" "Dean, Master. Bertram Vere" "male" 1 1 2 "C.A. 2315" 20.575 NA "S" "Master" +"790" 790 "0" "1" "Guggenheim, Mr. Benjamin" "male" 46 0 0 "PC 17593" 79.2 "B82 B84" "C" "Mr" +"791" 791 "0" "3" "Keane, Mr. Andrew \"Andy\"" "male" 30 0 0 "12460" 7.75 NA "Q" "Mr" +"792" 792 "0" "2" "Gaskell, Mr. Alfred" "male" 16 0 0 "239865" 26 NA "S" "Mr" +"793" 793 "0" "3" "Sage, Miss. Stella Anna" "female" 21 8 2 "CA. 2343" 69.55 NA "S" "Miss" +"794" 794 "0" "1" "Hoyt, Mr. William Fisher" "male" 30 0 0 "PC 17600" 30.6958 NA "C" "Mr" +"795" 795 "0" "3" "Dantcheff, Mr. Ristiu" "male" 25 0 0 "349203" 7.8958 NA "S" "Mr" +"796" 796 "0" "2" "Otter, Mr. Richard" "male" 39 0 0 "28213" 13 NA "S" "Mr" +"797" 797 "1" "1" "Leader, Dr. Alice (Farnham)" "female" 49 0 0 "17465" 25.9292 "D17" "S" "Dr" +"798" 798 "1" "3" "Osman, Mrs. Mara" "female" 31 0 0 "349244" 8.6833 NA "S" "Mrs" +"799" 799 "0" "3" "Ibrahim Shawah, Mr. Yousseff" "male" 30 0 0 "2685" 7.2292 NA "C" "Mr" +"800" 800 "0" "3" "Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)" "female" 30 1 1 "345773" 24.15 NA "S" "Mrs" +"801" 801 "0" "2" "Ponesell, Mr. Martin" "male" 34 0 0 "250647" 13 NA "S" "Mr" +"802" 802 "1" "2" "Collyer, Mrs. Harvey (Charlotte Annie Tate)" "female" 31 1 1 "C.A. 31921" 26.25 NA "S" "Mrs" +"803" 803 "1" "1" "Carter, Master. William Thornton II" "male" 11 1 2 "113760" 120 "B96 B98" "S" "Master" +"804" 804 "1" "3" "Thomas, Master. 
Assad Alexander" "male" 0.42 0 1 "2625" 8.5167 NA "C" "Master" +"805" 805 "1" "3" "Hedman, Mr. Oskar Arvid" "male" 27 0 0 "347089" 6.975 NA "S" "Mr" +"806" 806 "0" "3" "Johansson, Mr. Karl Johan" "male" 31 0 0 "347063" 7.775 NA "S" "Mr" +"807" 807 "0" "1" "Andrews, Mr. Thomas Jr" "male" 39 0 0 "112050" 61.9792 "A36" "S" "Mr" +"808" 808 "0" "3" "Pettersson, Miss. Ellen Natalia" "female" 18 0 0 "347087" 7.775 NA "S" "Miss" +"809" 809 "0" "2" "Meyer, Mr. August" "male" 39 0 0 "248723" 13 NA "S" "Mr" +"810" 810 "1" "1" "Chambers, Mrs. Norman Campbell (Bertha Griggs)" "female" 33 1 0 "113806" 53.1 "E8" "S" "Mrs" +"811" 811 "0" "3" "Alexander, Mr. William" "male" 26 0 0 "3474" 7.8875 NA "S" "Mr" +"812" 812 "0" "3" "Lester, Mr. James" "male" 39 0 0 "A/4 48871" 24.15 NA "S" "Mr" +"813" 813 "0" "2" "Slemen, Mr. Richard James" "male" 35 0 0 "28206" 10.5 NA "S" "Mr" +"814" 814 "0" "3" "Andersson, Miss. Ebba Iris Alfrida" "female" 6 4 2 "347082" 31.275 NA "S" "Miss" +"815" 815 "0" "3" "Tomlin, Mr. Ernest Portage" "male" 30.5 0 0 "364499" 8.05 NA "S" "Mr" +"816" 816 "0" "1" "Fry, Mr. Richard" "male" 30 0 0 "112058" 61.9792 "B102" "S" "Mr" +"817" 817 "0" "3" "Heininen, Miss. Wendla Maria" "female" 23 0 0 "STON/O2. 3101290" 7.925 NA "S" "Miss" +"818" 818 "0" "2" "Mallet, Mr. Albert" "male" 31 1 1 "S.C./PARIS 2079" 37.0042 NA "C" "Mr" +"819" 819 "0" "3" "Holm, Mr. John Fredrik Alexander" "male" 43 0 0 "C 7075" 6.45 NA "S" "Mr" +"820" 820 "0" "3" "Skoog, Master. Karl Thorsten" "male" 10 3 2 "347088" 27.9 NA "S" "Master" +"821" 821 "1" "1" "Hays, Mrs. Charles Melville (Clara Jennings Gregg)" "female" 52 1 1 "12749" 93.5 "B69" "S" "Mrs" +"822" 822 "1" "3" "Lulic, Mr. Nikola" "male" 27 0 0 "315098" 8.6625 NA "S" "Mr" +"823" 823 "0" "1" "Reuchlin, Jonkheer. John George" "male" 38 0 0 "19972" 61.9792 NA "S" "Jonkheer" +"824" 824 "1" "3" "Moor, Mrs. (Beila)" "female" 27 0 1 "392096" 12.475 "E121" "S" "Mrs" +"825" 825 "0" "3" "Panula, Master. 
Urho Abraham" "male" 2 4 1 "3101295" 39.6875 NA "S" "Master" +"826" 826 "0" "3" "Flynn, Mr. John" "male" 30 0 0 "368323" 6.95 NA "Q" "Mr" +"827" 827 "0" "3" "Lam, Mr. Len" "male" 30 0 0 "1601" 56.4958 NA "S" "Mr" +"828" 828 "1" "2" "Mallet, Master. Andre" "male" 1 0 2 "S.C./PARIS 2079" 37.0042 NA "C" "Master" +"829" 829 "1" "3" "McCormack, Mr. Thomas Joseph" "male" 30 0 0 "367228" 7.75 NA "Q" "Mr" +"830" 830 "1" "1" "Stone, Mrs. George Nelson (Martha Evelyn)" "female" 62 0 0 "113572" 80 "B28" "S" "Mrs" +"831" 831 "1" "3" "Yasbeck, Mrs. Antoni (Selini Alexander)" "female" 15 1 0 "2659" 14.4542 NA "C" "Mrs" +"832" 832 "1" "2" "Richards, Master. George Sibley" "male" 0.83 1 1 "29106" 18.75 NA "S" "Master" +"833" 833 "0" "3" "Saad, Mr. Amin" "male" 30 0 0 "2671" 7.2292 NA "C" "Mr" +"834" 834 "0" "3" "Augustsson, Mr. Albert" "male" 23 0 0 "347468" 7.8542 NA "S" "Mr" +"835" 835 "0" "3" "Allum, Mr. Owen George" "male" 18 0 0 "2223" 8.3 NA "S" "Mr" +"836" 836 "1" "1" "Compton, Miss. Sara Rebecca" "female" 39 1 1 "PC 17756" 83.1583 "E49" "C" "Miss" +"837" 837 "0" "3" "Pasic, Mr. Jakob" "male" 21 0 0 "315097" 8.6625 NA "S" "Mr" +"838" 838 "0" "3" "Sirota, Mr. Maurice" "male" 30 0 0 "392092" 8.05 NA "S" "Mr" +"839" 839 "1" "3" "Chip, Mr. Chang" "male" 32 0 0 "1601" 56.4958 NA "S" "Mr" +"840" 840 "1" "1" "Marechal, Mr. Pierre" "male" 30 0 0 "11774" 29.7 "C47" "C" "Mr" +"841" 841 "0" "3" "Alhomaki, Mr. Ilmari Rudolf" "male" 20 0 0 "SOTON/O2 3101287" 7.925 NA "S" "Mr" +"842" 842 "0" "2" "Mudd, Mr. Thomas Charles" "male" 16 0 0 "S.O./P.P. 3" 10.5 NA "S" "Mr" +"843" 843 "1" "1" "Serepeca, Miss. Augusta" "female" 30 0 0 "113798" 31 NA "C" "Miss" +"844" 844 "0" "3" "Lemberopolous, Mr. Peter L" "male" 34.5 0 0 "2683" 6.4375 NA "C" "Mr" +"845" 845 "0" "3" "Culumovic, Mr. Jeso" "male" 17 0 0 "315090" 8.6625 NA "S" "Mr" +"846" 846 "0" "3" "Abbing, Mr. Anthony" "male" 42 0 0 "C.A. 5547" 7.55 NA "S" "Mr" +"847" 847 "0" "3" "Sage, Mr. Douglas Bullen" "male" 30 8 2 "CA. 
2343" 69.55 NA "S" "Mr" +"848" 848 "0" "3" "Markoff, Mr. Marin" "male" 35 0 0 "349213" 7.8958 NA "C" "Mr" +"849" 849 "0" "2" "Harper, Rev. John" "male" 28 0 1 "248727" 33 NA "S" "Rev" +"850" 850 "1" "1" "Goldenberg, Mrs. Samuel L (Edwiga Grabowska)" "female" 35 1 0 "17453" 89.1042 "C92" "C" "Mrs" +"851" 851 "0" "3" "Andersson, Master. Sigvard Harald Elias" "male" 4 4 2 "347082" 31.275 NA "S" "Master" +"852" 852 "0" "3" "Svensson, Mr. Johan" "male" 74 0 0 "347060" 7.775 NA "S" "Mr" +"853" 853 "0" "3" "Boulos, Miss. Nourelain" "female" 9 1 1 "2678" 15.2458 NA "C" "Miss" +"854" 854 "1" "1" "Lines, Miss. Mary Conover" "female" 16 0 1 "PC 17592" 39.4 "D28" "S" "Miss" +"855" 855 "0" "2" "Carter, Mrs. Ernest Courtenay (Lilian Hughes)" "female" 44 1 0 "244252" 26 NA "S" "Mrs" +"856" 856 "1" "3" "Aks, Mrs. Sam (Leah Rosen)" "female" 18 0 1 "392091" 9.35 NA "S" "Mrs" +"857" 857 "1" "1" "Wick, Mrs. George Dennick (Mary Hitchcock)" "female" 45 1 1 "36928" 164.8667 NA "S" "Mrs" +"858" 858 "1" "1" "Daly, Mr. Peter Denis " "male" 51 0 0 "113055" 26.55 "E17" "S" "Mr" +"859" 859 "1" "3" "Baclini, Mrs. Solomon (Latifa Qurban)" "female" 24 0 3 "2666" 19.2583 NA "C" "Mrs" +"860" 860 "0" "3" "Razi, Mr. Raihed" "male" 30 0 0 "2629" 7.2292 NA "C" "Mr" +"861" 861 "0" "3" "Hansen, Mr. Claus Peter" "male" 41 2 0 "350026" 14.1083 NA "S" "Mr" +"862" 862 "0" "2" "Giles, Mr. Frederick Edward" "male" 21 1 0 "28134" 11.5 NA "S" "Mr" +"863" 863 "1" "1" "Swift, Mrs. Frederick Joel (Margaret Welles Barron)" "female" 48 0 0 "17466" 25.9292 "D17" "S" "Mrs" +"864" 864 "0" "3" "Sage, Miss. Dorothy Edith \"Dolly\"" "female" 21 8 2 "CA. 2343" 69.55 NA "S" "Miss" +"865" 865 "0" "2" "Gill, Mr. John William" "male" 24 0 0 "233866" 13 NA "S" "Mr" +"866" 866 "1" "2" "Bystrom, Mrs. (Karolina)" "female" 42 0 0 "236852" 13 NA "S" "Mrs" +"867" 867 "1" "2" "Duran y More, Miss. Asuncion" "female" 27 1 0 "SC/PARIS 2149" 13.8583 NA "C" "Miss" +"868" 868 "0" "1" "Roebling, Mr. 
Washington Augustus II" "male" 31 0 0 "PC 17590" 50.4958 "A24" "S" "Mr" +"869" 869 "0" "3" "van Melkebeke, Mr. Philemon" "male" 30 0 0 "345777" 9.5 NA "S" "Mr" +"870" 870 "1" "3" "Johnson, Master. Harold Theodor" "male" 4 1 1 "347742" 11.1333 NA "S" "Master" +"871" 871 "0" "3" "Balkic, Mr. Cerin" "male" 26 0 0 "349248" 7.8958 NA "S" "Mr" +"872" 872 "1" "1" "Beckwith, Mrs. Richard Leonard (Sallie Monypeny)" "female" 47 1 1 "11751" 52.5542 "D35" "S" "Mrs" +"873" 873 "0" "1" "Carlsson, Mr. Frans Olof" "male" 33 0 0 "695" 5 "B51 B53 B55" "S" "Mr" +"874" 874 "0" "3" "Vander Cruyssen, Mr. Victor" "male" 47 0 0 "345765" 9 NA "S" "Mr" +"875" 875 "1" "2" "Abelson, Mrs. Samuel (Hannah Wizosky)" "female" 28 1 0 "P/PP 3381" 24 NA "C" "Mrs" +"876" 876 "1" "3" "Najib, Miss. Adele Kiamie \"Jane\"" "female" 15 0 0 "2667" 7.225 NA "C" "Miss" +"877" 877 "0" "3" "Gustafsson, Mr. Alfred Ossian" "male" 20 0 0 "7534" 9.8458 NA "S" "Mr" +"878" 878 "0" "3" "Petroff, Mr. Nedelio" "male" 19 0 0 "349212" 7.8958 NA "S" "Mr" +"879" 879 "0" "3" "Laleff, Mr. Kristo" "male" 30 0 0 "349217" 7.8958 NA "S" "Mr" +"880" 880 "1" "1" "Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)" "female" 56 0 1 "11767" 83.1583 "C50" "C" "Mrs" +"881" 881 "1" "2" "Shelley, Mrs. William (Imanita Parrish Hall)" "female" 25 0 1 "230433" 26 NA "S" "Mrs" +"882" 882 "0" "3" "Markun, Mr. Johann" "male" 33 0 0 "349257" 7.8958 NA "S" "Mr" +"883" 883 "0" "3" "Dahlberg, Miss. Gerda Ulrika" "female" 22 0 0 "7552" 10.5167 NA "S" "Miss" +"884" 884 "0" "2" "Banfield, Mr. Frederick James" "male" 28 0 0 "C.A./SOTON 34068" 10.5 NA "S" "Mr" +"885" 885 "0" "3" "Sutehall, Mr. Henry Jr" "male" 25 0 0 "SOTON/OQ 392076" 7.05 NA "S" "Mr" +"886" 886 "0" "3" "Rice, Mrs. William (Margaret Norton)" "female" 39 0 5 "382652" 29.125 NA "Q" "Mrs" +"887" 887 "0" "2" "Montvila, Rev. Juozas" "male" 27 0 0 "211536" 13 NA "S" "Rev" +"888" 888 "1" "1" "Graham, Miss. 
Margaret Edith" "female" 19 0 0 "112053" 30 "B42" "S" "Miss" +"889" 889 "0" "3" "Johnston, Miss. Catherine Helen \"Carrie\"" "female" 21 1 2 "W./C. 6607" 23.45 NA "S" "Miss" +"890" 890 "1" "1" "Behr, Mr. Karl Howell" "male" 26 0 0 "111369" 30 "C148" "C" "Mr" +"891" 891 "0" "3" "Dooley, Mr. Patrick" "male" 32 0 0 "370376" 7.75 NA "Q" "Mr" diff --git a/test.Rmd b/test.Rmd new file mode 100644 index 0000000..33f8ffa --- /dev/null +++ b/test.Rmd @@ -0,0 +1,22 @@ +--- +title: "test" +author: "Bill Holst" +date: "January 27, 2016" +output: html_document +--- + +This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see . + +When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: + +```{r} +summary(cars) +``` + +You can also embed plots, for example: + +```{r} +plot(cars) +``` + +Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot. diff --git a/test.html b/test.html new file mode 100644 index 0000000..8783d02 --- /dev/null +++ b/test.html @@ -0,0 +1,109 @@ + + + + + + + + + + + + + + +test + + + + + + + + + + + + + + + + + + + + + +
+ + + + + +

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

+

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

+
summary(cars)
+
##      speed           dist       
+##  Min.   : 4.0   Min.   :  2.00  
+##  1st Qu.:12.0   1st Qu.: 26.00  
+##  Median :15.0   Median : 36.00  
+##  Mean   :15.4   Mean   : 42.98  
+##  3rd Qu.:19.0   3rd Qu.: 56.00  
+##  Max.   :25.0   Max.   :120.00
+

You can also embed plots, for example:

+
plot(cars)
+

+

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

+ + +
+ + + + + + + +