Skip to content

Commit 2898f90

Browse files
author
algorithmica-repository
committed
Uploading class examples
1 parent 2f46f5d commit 2898f90

File tree

2 files changed

+271
-0
lines changed

2 files changed

+271
-0
lines changed

16-neuralnet/titanic4-ann.R

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
library(caret)
2+
library(nnet)
3+
library(ggplot2)
4+
library(Amelia)
5+
library(Hmisc)
6+
library(NeuralNetTools)
7+
8+
# Hard-coded, machine-specific working directory. The relative file names used
# later in this script (train.csv, test.csv, submission.csv) resolve against it.
setwd("E:/data analytics/kaggle/titanic/data")
9+
10+
# Read a CSV file that has a header row into a data frame.
#
# Args:
#   path.name:     directory that contains the file.
#   file.name:     name of the CSV file inside `path.name`.
#   column.types:  forwarded to read.csv() as `colClasses`.
#   missing.types: forwarded to read.csv() as `na.strings`.
#
# Returns:
#   The data frame produced by read.csv().
readData = function(path.name, file.name, column.types, missing.types) {
  # file.path() is the standard way to join path components.
  read.csv(
    file.path(path.name, file.name),
    header = TRUE,
    colClasses = column.types,
    na.strings = missing.types
  )
}
15+
16+
# Collapse full passenger names to a short title label.
#
# Each element of `name` that contains one of the recognised titles is
# replaced by the corresponding label; elements that match none of them
# (and NA elements) are returned unchanged. "Ms." is mapped to "Mrs".
#
# Args:
#   name: character vector of names such as "Braund, Mr. Owen Harris".
#
# Returns:
#   A vector of the same length as `name` with matched entries replaced by
#   "Master", "Miss", "Mrs", "Mr" or "Dr".
changeNames = function(name) {
  # Pattern -> label, applied in this order (a later entry wins if one name
  # happens to match several patterns).
  title.labels = c(
    "Master\\." = "Master",
    "Miss\\."   = "Miss",
    "Mrs\\."    = "Mrs",
    "Mr\\."     = "Mr",
    "Dr\\."     = "Dr",
    "Ms\\."     = "Mrs"
  )

  # Match every pattern against the untouched input first, so an earlier
  # replacement can never influence a later match.
  matches = lapply(names(title.labels), grepl, x = name)

  for (k in seq_along(matches)) {
    name[matches[[k]]] = title.labels[[k]]
  }
  name
}
44+
45+
46+
# Fill missing values with the mean of their own group.
#
# For every level in `var.levels`, the NA entries of `impute.var` whose
# `filter.var` equals that level are replaced by the mean of the non-missing
# entries of the same group. Entries whose `filter.var` is not listed in
# `var.levels` are not touched, and a group with no observed value at all is
# left as NA rather than being filled with NaN.
#
# Args:
#   impute.var: numeric vector that may contain NA.
#   filter.var: grouping vector, same length as `impute.var`.
#   var.levels: the group values to process.
#
# Returns:
#   `impute.var` with the group-wise fills applied.
imputeMean = function(impute.var, filter.var, var.levels) {
  for (v in var.levels) {
    # which() drops NA comparisons, so rows with a missing group are skipped.
    in.group = which(filter.var == v)
    group.values = impute.var[in.group]
    is.missing = is.na(group.values)

    # Nothing to fill, or nothing observed to average from.
    if (!any(is.missing) || all(is.missing)) {
      next
    }

    group.values[is.missing] = mean(group.values[!is.missing])
    impute.var[in.group] = group.values
  }
  impute.var
}
53+
54+
# Replace missing entries with a fixed value ("S" unless told otherwise).
#
# Args:
#   impute.var: character vector or factor that may contain NA.
#   fill.value: value written into the NA positions; defaults to "S", the
#               value this function has always used.
#
# Returns:
#   `impute.var` with every NA replaced by `fill.value`.
imputeEmbarked = function(impute.var, fill.value = "S") {
  is.missing = is.na(impute.var)
  if (!any(is.missing)) {
    return(impute.var)
  }

  # Assigning a value that is not an existing level of a factor would just
  # produce NA again, so register the level first when needed.
  if (is.factor(impute.var) && !(fill.value %in% levels(impute.var))) {
    levels(impute.var) = c(levels(impute.var), fill.value)
  }

  impute.var[is.missing] = fill.value
  impute.var
}
58+
59+
# Replace missing values with the mean of the observed values.
#
# Args:
#   impute.var: numeric vector that may contain NA.
#
# Returns:
#   `impute.var` with each NA replaced by mean(impute.var, na.rm = TRUE).
#   The vector is returned unchanged when it has no NA, when it is empty, or
#   when every entry is NA (there is then no mean to fill with).
imputeFare = function(impute.var) {
  is.missing = is.na(impute.var)
  # Index assignment keeps the vector's type and attributes, which ifelse()
  # does not (e.g. ifelse() turns an empty numeric into logical(0)).
  if (any(is.missing) && !all(is.missing)) {
    impute.var[is.missing] = mean(impute.var, na.rm = TRUE)
  }
  impute.var
}
63+
64+
# Where the data lives and how the training file is parsed.
titanic.path = getwd()
train.data.file = "train.csv"
missing.types = c("NA", "")

# Column classes handed to read.csv(), one entry per column in file order.
train.column.types = c(
  'integer',    # PassengerId
  'factor',     # Survived
  'factor',     # Pclass
  'character',  # Name
  'factor',     # Sex
  'numeric',    # Age
  'integer',    # SibSp
  'integer',    # Parch
  'character',  # Ticket
  'numeric',    # Fare
  'character',  # Cabin
  'factor'      # Embarked
)

trainSet = readData(
  titanic.path,
  train.data.file,
  train.column.types,
  missing.types
)
82+
83+
# Missingness map of the raw training data.
missmap(
  trainSet,
  main = "Titanic Training Data - Missings Map",
  col = c("yellow", "black"),
  legend = FALSE
)

# Reduce names to titles, then impute Age per title, Embarked and Fare.
trainSet$Name = changeNames(trainSet$Name)
names.na.train = c("Dr", "Master", "Mrs", "Miss", "Mr")
trainSet$Age = imputeMean(trainSet$Age, trainSet$Name, names.na.train)
trainSet$Embarked = imputeEmbarked(trainSet$Embarked)
trainSet$Fare = imputeFare(trainSet$Fare)

# Same map again, to see what the imputation filled in.
missmap(
  trainSet,
  main = "Titanic Training Data - Missings Map",
  col = c("yellow", "black"),
  legend = FALSE
)

# Quick structural overview of the cleaned data.
dim(trainSet)
str(trainSet)
head(trainSet)
summary(trainSet)
99+
100+
101+
# Distribution of the outcome.
table(trainSet$Survived)
ggplot(trainSet, aes(x = Survived)) +
  geom_bar()

# Survived vs. passenger class: cross-tabulation and stacked bar chart.
summary(trainSet$Pclass)
xtabs(~ Survived + Pclass, data = trainSet)
ggplot(trainSet, aes(x = Survived, fill = Pclass)) +
  geom_bar()

# Survived vs. Sex: cross-tabulation and stacked bar chart.
summary(trainSet$Sex)
xtabs(~ Survived + Sex, data = trainSet)
ggplot(trainSet, aes(x = Survived, fill = Sex)) +
  geom_bar()

# Survived vs. Embarked: cross-tabulation and stacked bar chart.
summary(trainSet$Embarked)
xtabs(~ Survived + Embarked, data = trainSet)
ggplot(trainSet, aes(x = Survived, fill = Embarked)) +
  geom_bar()

# Survived vs. Age. Author's observation: the Age boxplots look very similar
# for survivors and non-survivors.
xtabs(~ Survived + Age, data = trainSet)
ggplot(trainSet, aes(x = Survived, y = Age)) +
  geom_boxplot()
summary(trainSet$Age)

# Survived vs. Fare. Author's observation: the Fare boxplots differ clearly
# between survivors and non-survivors.
ggplot(trainSet, aes(x = Survived, y = Fare)) +
  geom_boxplot()
# Author's note: no NA's remain in Fare, so it is a candidate predictor.
summary(trainSet$Fare)

# Survived vs. Parch.
ggplot(trainSet, aes(x = Survived, y = Parch)) +
  geom_boxplot()
summary(trainSet$Parch)
135+
136+
# Fix the random seed so the run can be repeated.
set.seed(42)

# Tuning strategy: 10-fold cross-validation.
ctrl = trainControl(
  method = "cv",
  number = 10
)

# Fit a neural network (caret method "nnet") on five predictors.
model_nn = train(
  Survived ~ Pclass + Sex + Age + Embarked + SibSp,
  data = trainSet,
  method = "nnet",
  trControl = ctrl
)
model_nn

# Draw the fitted network.
plotnet(model_nn)
151+
152+
153+
# Score the Kaggle test file with the fitted model and write the submission.
test.data.file = "test.csv"
# Same column classes as the training file minus entry 2 (Survived).
test.column.types = train.column.types[-2]

testSet = readData(titanic.path, test.data.file,
                   test.column.types, missing.types)
dim(testSet)
str(testSet)
head(testSet)
summary(testSet)

# Apply the same preprocessing that was applied to the training data.
testSet$Name = changeNames(testSet$Name)
testSet$Age = imputeMean(testSet$Age, testSet$Name,
                         names.na.train)
testSet$Embarked = imputeEmbarked(testSet$Embarked)
# Fare is imputed on the training set too; keep both data sets consistent.
testSet$Fare = imputeFare(testSet$Fare)

# The assignment below needs exactly one prediction per row. If predict()
# drops rows with missing predictors (NOTE(review): caret's default na.action
# appears to be na.omit - confirm), it would fail with an unhelpful length
# mismatch, so check for leftovers explicitly.
model.predictors = c("Pclass", "Sex", "Age", "Embarked", "SibSp")
rows.with.na = !complete.cases(testSet[, model.predictors])
if (any(rows.with.na)) {
  stop("test set still has missing predictor values in ",
       sum(rows.with.na), " row(s); extend the imputation before predicting",
       call. = FALSE)
}

testSet$Survived = predict(model_nn, newdata = testSet)

submission = testSet[, c("PassengerId", "Survived")]

write.table(submission, file = "submission.csv", col.names = TRUE, row.names = FALSE, sep = ",")

18-clustering-iterative/kmeans.R

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
library(ggplot2)
2+
library(Amelia)
3+
library(stats)
4+
5+
# Step-1: Load the data
# (hard-coded working directory; snsdata.csv is read relative to it)
setwd("E:/data analytics/datasets/")
teens = read.csv(
  "snsdata.csv",
  header = TRUE,
  na.strings = c("NA", "")
)

# Step-2: Explore the data
str(teens)
dim(teens)
head(teens)
14+
15+
# Step-3: Preprocess data/Feature Engineering

# Where is data missing?
missmap(
  teens,
  main = "Teen data from social network - Missings Map",
  col = c("yellow", "black"),
  legend = FALSE
)

# Gender counts, with NA shown as its own category when present.
table(teens$gender, useNA = "ifany")

# Encode gender as two 0/1 indicator columns: `female` is 1 only for a
# recorded "F", `no_gender` is 1 when gender is missing.
teens$female = ifelse(
  teens$gender == "F" & !is.na(teens$gender),
  1,
  0
)
teens$no_gender = ifelse(is.na(teens$gender), 1, 0)
table(teens$gender, useNA = "ifany")
table(teens$female, useNA = "ifany")
table(teens$no_gender, useNA = "ifany")

# Missingness map again, now including the two new columns.
missmap(
  teens,
  main = "Teen data from social network - Missings Map",
  col = c("yellow", "black"),
  legend = FALSE
)

# Age: keep values in [13, 20) and treat everything else as missing.
summary(teens$age)
teens$age = ifelse(
  teens$age >= 13 & teens$age < 20,
  teens$age,
  NA
)
summary(teens$age)

# Fill missing ages with the mean age of the same graduation year.
# ave() returns one group mean per row, aligned with teens$age.
ave_age = ave(
  teens$age,
  teens$gradyear,
  FUN = function(x) mean(x, na.rm = TRUE)
)
teens$age = ifelse(is.na(teens$age), ave_age, teens$age)
summary(teens$age)
46+
47+
# Use columns 5 to 40 as the clustering features.
interests = teens[5:40]

# Standardise every feature so that no single variable dominates the
# distance calculation.
# Alternative that yields a data frame instead of a matrix:
#   interests_z = as.data.frame(lapply(interests, scale))
interests_z = scale(interests)

# Step-4: Build the model
# Five clusters, after the stereotypical high-school characters:
# a Brain, an Athlete, a Basket Case, a Princess, and a Criminal.
set.seed(120)
teen_clusters = kmeans(interests_z, 5)
59+
60+
# Step-5: Evaluating model performance
str(teen_clusters)

# Cluster sizes and centres (in standardised units).
teen_clusters$size
teen_clusters$centers

# Sum-of-squares decomposition: total, per cluster, within and between.
teen_clusters$totss
teen_clusters$withinss
teen_clusters$tot.withinss
teen_clusters$betweenss

# Step-6: Validity of the model
# Attach each row's cluster id to the original data.
teens$cluster = teen_clusters$cluster

teens[1:5, c("cluster", "gender", "age", "friends")]

# Per-cluster means of variables that were not used for clustering.
aggregate(age ~ cluster, data = teens, FUN = mean)
aggregate(female ~ cluster, data = teens, FUN = mean)
aggregate(friends ~ cluster, data = teens, FUN = mean)
85+
86+
87+
library(animation)
88+
89+
# Synthetic 2-D demo data: 100 points in four groups of 25, centred at
# (1.5, 1.5), (1.5, -1.5), (-1.5, 1.5) and (-1.5, -1.5), unit variance.
# The first four entries of `cent` are the X1 means of the four groups and
# the last four are their X2 means (the matrix below is filled by column).
cent <- 1.5 * c(1, 1, -1, -1, 1, -1, 1, -1)
# One vectorised draw instead of growing `x` with c() inside a loop; rnorm()
# walks the means in order, so the values are the same as with the loop.
x <- rnorm(25 * length(cent), mean = rep(cent, each = 25))
x <- matrix(x, ncol = 2)
colnames(x) <- c("X1", "X2")
dim(x)

head(x)
97+
98+
# Plot settings for the animation: margins, axis-label spacing and a white
# background.
par(
  mar = c(3, 3, 1, 1.5),
  mgp = c(1.5, 0.5, 0),
  bg = "white"
)
# Animate k-means with 3 centres on the demo points.
kmeans.ani(
  x,
  centers = 3,
  pch = 1:4,
  col = 1:4
)

0 commit comments

Comments
 (0)