|
| 1 | +library(caret) |
| 2 | +library(nnet) |
| 3 | +library(ggplot2) |
| 4 | +library(Amelia) |
| 5 | +library(Hmisc) |
| 6 | +library(NeuralNetTools) |
| 7 | + |
# Work from the directory that holds the Kaggle Titanic CSV files. The path is
# specific to the original author's machine; getwd() and the relative output
# path used later in the script both resolve against it.
setwd(dir = "E:/data analytics/kaggle/titanic/data")
| 9 | + |
# Read one CSV file (with a header row) into a data frame.
#
# Args:
#   path.name:     directory that contains the file.
#   file.name:     name of the file inside `path.name`.
#   column.types:  handed to read.csv() as `colClasses`.
#   missing.types: strings that read.csv() should treat as NA (`na.strings`).
#
# Returns:
#   The data frame produced by read.csv().
readData <- function(path.name, file.name, column.types, missing.types) {
  csv_path <- paste0(path.name, "/", file.name)
  read.csv(
    file = csv_path,
    header = TRUE,
    colClasses = column.types,
    na.strings = missing.types
  )
}
| 15 | + |
# Collapse full passenger names to their honorific.
#
# Every element of `name` that contains one of the recognised titles is
# replaced by a short label; "Ms." is folded into "Mrs". Elements that contain
# none of the titles are returned unchanged.
#
# Args:
#   name: vector of passenger names.
#
# Returns:
#   `name` with matching elements replaced by "Master", "Miss", "Mrs", "Mr"
#   or "Dr".
changeNames <- function(name) {
  # Order matters: when a name matches several patterns the later entry wins.
  title_patterns <- c("Master\\.", "Miss\\.", "Mrs\\.", "Mr\\.", "Dr\\.", "Ms\\.")
  title_labels <- c("Master", "Miss", "Mrs", "Mr", "Dr", "Mrs")

  # All matches are located on the untouched input before anything is
  # overwritten, so an earlier replacement cannot hide a later match.
  matches <- lapply(title_patterns, grep, x = name)

  for (k in seq_along(matches)) {
    hit_idx <- matches[[k]]
    if (length(hit_idx) > 0L) {
      name[hit_idx] <- title_labels[k]
    }
  }
  name
}
| 44 | + |
| 45 | + |
# Fill missing values with the mean of their own group.
#
# For every level in `var.levels`, the elements of `impute.var` whose
# `filter.var` equals that level form one group; missing values inside the
# group are replaced by the mean of the group's observed values.
#
# A group with no observed values is left as NA: the mean of an empty set is
# NaN, and writing that into the data would only disguise the gap. Elements
# whose `filter.var` is NA or not listed in `var.levels` are never touched.
#
# Implemented with base R only, so it does not rely on Hmisc::impute().
#
# Args:
#   impute.var: numeric vector containing the values to complete.
#   filter.var: vector of the same length giving each element's group.
#   var.levels: group labels to process.
#
# Returns:
#   `impute.var` with group-wise mean imputation applied.
imputeMean <- function(impute.var, filter.var, var.levels) {
  for (level in var.levels) {
    # which() drops NA comparisons, so rows with an unknown group are skipped.
    group_idx <- which(filter.var == level)
    group_values <- impute.var[group_idx]
    is_missing <- is.na(group_values)

    # Nothing to fill, or nothing to average from.
    if (!any(is_missing) || all(is_missing)) {
      next
    }

    impute.var[group_idx[is_missing]] <- mean(group_values[!is_missing])
  }
  impute.var
}
| 53 | + |
# Replace every missing embarkation value with 'S'.
#
# Args:
#   impute.var: vector of embarkation codes, possibly containing NA.
#
# Returns:
#   `impute.var` with each NA replaced by 'S'.
imputeEmbarked <- function(impute.var) {
  is_missing <- is.na(impute.var)
  impute.var[is_missing] <- 'S'
  impute.var
}
| 58 | + |
# Replace missing values with the mean of the observed values.
#
# Args:
#   impute.var: numeric vector, possibly containing NA.
#
# Returns:
#   A vector in which each NA has been replaced by
#   mean(impute.var, na.rm = TRUE); other elements keep their value.
imputeFare <- function(impute.var) {
  is_missing <- is.na(impute.var)
  ifelse(is_missing, mean(impute.var, na.rm = TRUE), impute.var)
}
| 63 | + |
# ---- Load the training data ------------------------------------------------
# Files are read from the current working directory.
titanic.path <- getwd()
train.data.file <- "train.csv"

# Both the literal string "NA" and an empty field count as missing.
missing.types <- c("NA", "")

# Column classes, in the order the columns appear in train.csv.
train.column.types <- c(
  "integer",    # PassengerId
  "factor",     # Survived
  "factor",     # Pclass
  "character",  # Name
  "factor",     # Sex
  "numeric",    # Age
  "integer",    # SibSp
  "integer",    # Parch
  "character",  # Ticket
  "numeric",    # Fare
  "character",  # Cabin
  "factor"      # Embarked
)

trainSet <- readData(
  titanic.path,
  train.data.file,
  train.column.types,
  missing.types
)
| 82 | + |
# ---- Clean the training data -----------------------------------------------
# Missingness map before any imputation.
missmap(
  trainSet,
  main = "Titanic Training Data - Missings Map",
  col = c("yellow", "black"),
  legend = FALSE
)

# Reduce each name to its title, then use the title as the grouping variable
# for mean-imputing Age. Only the titles listed here are imputed.
trainSet[["Name"]] <- changeNames(trainSet[["Name"]])
names.na.train <- c("Dr", "Master", "Mrs", "Miss", "Mr")
trainSet[["Age"]] <- imputeMean(
  trainSet[["Age"]],
  trainSet[["Name"]],
  names.na.train
)
trainSet[["Embarked"]] <- imputeEmbarked(trainSet[["Embarked"]])
trainSet[["Fare"]] <- imputeFare(trainSet[["Fare"]])

# Same map again, to check what is still missing after imputation.
missmap(
  trainSet,
  main = "Titanic Training Data - Missings Map",
  col = c("yellow", "black"),
  legend = FALSE
)
| 94 | + |
# ---- Explore the training data ---------------------------------------------
# Overall shape, structure, first rows and per-column summary.
dim(trainSet)
str(trainSet)
head(trainSet)
summary(trainSet)


# Outcome distribution.
table(trainSet[["Survived"]])
ggplot(data = trainSet, mapping = aes(x = Survived)) + geom_bar()

# Survived vs. passenger class: counts, cross-tabulation and stacked bar chart.
summary(trainSet[["Pclass"]])
xtabs(~ Survived + Pclass, data = trainSet)
ggplot(data = trainSet, mapping = aes(x = Survived, fill = Pclass)) + geom_bar()

# Survived vs. sex: counts, cross-tabulation and stacked bar chart.
summary(trainSet[["Sex"]])
xtabs(~ Survived + Sex, data = trainSet)
ggplot(data = trainSet, mapping = aes(x = Survived, fill = Sex)) + geom_bar()


# Survived vs. port of embarkation: counts, cross-tabulation and stacked bar
# chart.
summary(trainSet[["Embarked"]])
xtabs(~ Survived + Embarked, data = trainSet)
ggplot(data = trainSet, mapping = aes(x = Survived, fill = Embarked)) + geom_bar()

# Survived vs. age. Author's observation: the age boxplots for survivors and
# non-survivors look very similar.
xtabs(~ Survived + Age, data = trainSet)
ggplot(data = trainSet, mapping = aes(x = Survived, y = Age)) + geom_boxplot()
summary(trainSet[["Age"]])

# Survived vs. fare. Author's observation: the fare boxplots differ clearly
# between survivors and non-survivors, and the summary shows no NA's.
ggplot(data = trainSet, mapping = aes(x = Survived, y = Fare)) + geom_boxplot()
summary(trainSet[["Fare"]])

# Survived vs. number of parents/children aboard (Parch).
ggplot(data = trainSet, mapping = aes(x = Survived, y = Parch)) + geom_boxplot()
summary(trainSet[["Parch"]])
| 135 | + |
# ---- Fit the model -----------------------------------------------------------
# Fix the RNG state so the fold assignment and the fit are repeatable.
set.seed(42)

# Resampling scheme used for tuning: 10-fold cross-validation.
ctrl <- trainControl(
  method = "cv",
  number = 10
)

# Fit a neural network ("nnet") through caret on five predictors.
model_nn <- train(
  Survived ~ Pclass + Sex + Age + Embarked + SibSp,
  data = trainSet,
  method = "nnet",
  trControl = ctrl
)
model_nn

# Draw the fitted network.
plotnet(model_nn)
| 151 | + |
| 152 | + |
# ---- Score the test data and write the submission ---------------------------
# test.csv has the same columns as train.csv minus Survived (position 2).
test.data.file <- "test.csv"
test.column.types <- train.column.types[-2]

testSet <- readData(
  titanic.path,
  test.data.file,
  test.column.types,
  missing.types
)
dim(testSet)
str(testSet)
head(testSet)
summary(testSet)

# Apply the same title extraction and imputation as for the training set.
# Age is filled from the test set's own per-title means. Fare is not imputed
# here; the model formula does not use it.
testSet[["Name"]] <- changeNames(testSet[["Name"]])
testSet[["Age"]] <- imputeMean(
  testSet[["Age"]],
  testSet[["Name"]],
  names.na.train
)
testSet[["Embarked"]] <- imputeEmbarked(testSet[["Embarked"]])

# Predicted class for every test passenger.
testSet[["Survived"]] <- predict(model_nn, newdata = testSet)

# Keep only the two columns that go into the submission file.
submission <- testSet[, c("PassengerId", "Survived")]

write.table(
  submission,
  file = "submission.csv",
  col.names = TRUE,
  row.names = FALSE,
  sep = ","
)
0 commit comments