Skip to content

Commit 2f46f5d

Browse files
author
algorithmica-repository
committed
Uploading class examples
1 parent b13305c commit 2f46f5d

File tree

12 files changed

+272
-142
lines changed

12 files changed

+272
-142
lines changed
File renamed without changes.
File renamed without changes.

12-decisiontree/decision-trees.R

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
library(caret)
2+
library(randomForest)
3+
library(ggplot2)
4+
library(Amelia)
5+
6+
setwd("E:/data analytics/kaggle/titanic/data")
7+
8+
# Read the Titanic training data; both "NA" and empty fields count as missing.
trainSet <- read.csv(file = "train.csv", header = TRUE, na.strings = c("NA", ""))

# First look: dimensions, column types and the leading rows.
dim(trainSet)
str(trainSet)
head(trainSet)

# Convert the outcome and the passenger class to factors in one step.
trainSet[c("Survived", "Pclass")] <- lapply(trainSet[c("Survived", "Pclass")], factor)
summary(trainSet)
15+
16+
# Map of missing cells in the training data.
missmap(
  trainSet,
  main = "Titanic Training Data - Missings Map",
  col = c("yellow", "black"),
  legend = FALSE
)

# Distribution of the outcome.
table(trainSet$Survived)
ggplot(data = trainSet, mapping = aes(x = Survived)) +
  geom_bar()

# Survived vs. passenger class: summary, cross-tabulation and stacked bars.
summary(trainSet$Pclass)
xtabs(~ Survived + Pclass, data = trainSet)
ggplot(data = trainSet, mapping = aes(x = Survived, fill = Pclass)) +
  geom_bar()

# Survived vs. Sex: summary, cross-tabulation and stacked bars.
summary(trainSet$Sex)
xtabs(~ Survived + Sex, data = trainSet)
ggplot(data = trainSet, mapping = aes(x = Survived, fill = Sex)) +
  geom_bar()

# Survived vs. Embarked: summary, cross-tabulation and stacked bars.
summary(trainSet$Embarked)
xtabs(~ Survived + Embarked, data = trainSet)
ggplot(data = trainSet, mapping = aes(x = Survived, fill = Embarked)) +
  geom_bar()

# Survived vs. Age. Author's observation: the Age boxplots for survivors and
# non-survivors are very similar, and Age has many NAs, so it is a candidate
# for exclusion.
xtabs(~ Survived + Age, data = trainSet)
ggplot(data = trainSet, mapping = aes(x = Survived, y = Age)) +
  geom_boxplot()
summary(trainSet$Age)

# Survived vs. Fare. Author's observation: the Fare boxplots differ clearly
# between the two groups and Fare has no NAs, so it is kept.
ggplot(data = trainSet, mapping = aes(x = Survived, y = Fare)) +
  geom_boxplot()
summary(trainSet$Fare)

# Survived vs. Parch.
ggplot(data = trainSet, mapping = aes(x = Survived, y = Parch)) +
  geom_boxplot()
summary(trainSet$Parch)
54+
55+
# Fix the RNG seed so the cross-validation folds are reproducible.
set.seed(42)

# Model tuning strategy: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)

# Fit a decision tree (caret method "rpart"). Despite the object name this is
# not a random forest; the name model_rf is kept unchanged from the original.
#
# The formula includes Age, which the exploration above notes has many NAs.
# train()'s formula interface defaults to na.action = na.fail, which stops on
# any missing value, so NAs are passed through and left to rpart to handle.
model_rf <- train(
  Survived ~ Pclass + Sex + Age + Embarked + SibSp + Fare,
  data = trainSet,
  method = "rpart",
  trControl = ctrl,
  na.action = na.pass
)
model_rf
68+
69+
# Load the test data with the same missing-value convention as the training
# data ("NA" and empty fields are both read as NA).
testSet <- read.csv("test.csv", header = TRUE, na.strings = c("NA", ""))
dim(testSet)
str(testSet)
head(testSet)

# Pclass must be a factor, as it was when the model was trained.
testSet$Pclass <- factor(testSet$Pclass)
summary(testSet)

# Replace missing fares with the mean fare of the test set.
testSet$Fare[is.na(testSet$Fare)] <- mean(testSet$Fare, na.rm = TRUE)

# Score with the model fitted above. The original referenced `model_logit`,
# which is never defined in this script; the fitted model is `model_rf`.
# predict() on a caret model drops rows containing NA by default, which would
# return fewer predictions than testSet has rows, so NAs are passed through
# for rpart to handle.
testSet$Survived <- predict(model_rf, newdata = testSet, na.action = na.pass)

# Keep only the two columns needed for the submission file and write it out.
submission <- testSet[, c("PassengerId", "Survived")]
write.table(submission, file = "submission.csv", col.names = TRUE, row.names = FALSE, sep = ",")
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
library(ggplot2)
2+
library(dplyr)
3+
library(klaR)
4+
library(e1071)
5+
library(caret)
6+
library(Lock5Data)
7+
# Load the RestaurantTips data set and take a first look at it.
data("RestaurantTips")
dim(RestaurantTips)
str(RestaurantTips)
head(RestaurantTips)

# Resampling scheme: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)

# Linear regression of Tip on all remaining columns.
reg_model <- train(
  Tip ~ .,
  data = RestaurantTips,
  method = "lm",
  trControl = ctrl
)
reg_model
reg_model$finalModel
reg_model$finalModel$residuals

# In-sample predictions.
predicted <- predict(reg_model, newdata = RestaurantTips)
str(predicted)
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
library(ggplot2)
2+
library(dplyr)
3+
library(klaR)
4+
library(e1071)
5+
library(caret)
6+
library(Lock5Data)
7+
# Load the RestaurantTips data set and take a first look at it.
data(RestaurantTips)
dim(RestaurantTips)
str(RestaurantTips)
head(RestaurantTips)

# attr1: exact sum of Bill and Guests (not used by the model below).
RestaurantTips$attr1 <- RestaurantTips$Bill + RestaurantTips$Guests

# attr2: the same sum plus small Gaussian noise. The noise vector is sized
# from the data instead of the hard-coded 157, so the assignment cannot fail
# if the number of rows ever differs.
RestaurantTips$attr2 <- RestaurantTips$Bill + RestaurantTips$Guests +
  rnorm(nrow(RestaurantTips), 0, 1) * 0.05

# Resampling scheme: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)

# Linear regression that includes the near-collinear attr2.
reg_model <- train(Tip ~ Bill + Guests + attr2, data = RestaurantTips, method = "lm", trControl = ctrl)
reg_model
reg_model$finalModel

# In-sample predictions.
predicted <- predict(reg_model, RestaurantTips)
str(predicted)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
library(ggplot2)
2+
library(dplyr)
3+
library(klaR)
4+
library(e1071)
5+
library(caret)
6+
library(Lock5Data)
7+
# Load the RestaurantTips data set and take a first look at it.
data(RestaurantTips)
dim(RestaurantTips)
str(RestaurantTips)
head(RestaurantTips)

# attr1: exact sum of Bill and Guests, used as a predictor below.
RestaurantTips$attr1 <- RestaurantTips$Bill + RestaurantTips$Guests

# attr2: the same sum plus small Gaussian noise (not used by the models
# below). The noise vector is sized from the data instead of the hard-coded
# 157, so the assignment cannot fail if the number of rows ever differs.
RestaurantTips$attr2 <- RestaurantTips$Bill + RestaurantTips$Guests +
  rnorm(nrow(RestaurantTips), 0, 1) * 0.05

# Resampling scheme: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)

# Same formula fitted three ways: plain least squares, ridge and lasso.
reg_model1 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips, method = "lm", trControl = ctrl)
reg_model2 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips, method = "ridge", trControl = ctrl)
reg_model3 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips, method = "lasso", trControl = ctrl)

# Resampling results and final fit of each model, in the same order.
reg_model1
reg_model1$finalModel

reg_model2
reg_model2$finalModel

reg_model3
reg_model3$finalModel

15-svm/svm2.R

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
library(caret)
2+
library(e1071)
3+
library(ggplot2)
4+
library(Amelia)
5+
library(kernlab)
6+
library(ISLR)
7+
8+
# Load the Auto data set and take a first look at it.
data("Auto")
str(Auto)
dim(Auto)
head(Auto)
summary(Auto)

# Binary target: 1 when mpg is above its median, 0 otherwise, stored as a factor.
Auto$y <- factor(as.numeric(Auto$mpg > median(Auto$mpg)))

# Map of missing cells.
missmap(
  Auto,
  main = "Missings Map",
  col = c("yellow", "black"),
  legend = FALSE
)
23+
24+
# Model tuning strategy: 10-fold cross-validation.
ctrl <- trainControl(method = "cv", number = 10)

# Pre-processing applied inside each resample.
preProc_opt <- c("knnImpute", "center", "scale")

# Notes on the three fits below:
# * y is derived directly from mpg, so mpg is removed from the predictors;
#   leaving it in leaks the answer. name is removed as well.
#   NOTE(review): name appears to be a vehicle label rather than a feature --
#   confirm.
# * The seed is reset before every train() call so all three models are
#   resampled on the same folds; resamples() compares them fold by fold.
# * caret's "svmLinear" uses a single cost value (C = 1) under grid search and
#   ignores tuneLength, which makes plot(Lmodel) stop with "no tuning
#   parameters with more than 1 value". An explicit five-value grid for C is
#   supplied instead.

set.seed(42)
Lmodel <- train(
  y ~ . - mpg - name,
  data = Auto,
  preProc = preProc_opt,
  method = "svmLinear",
  trControl = ctrl,
  tuneGrid = data.frame(C = 2^(-2:2))
)
Lmodel
Lmodel$finalModel

plot(Lmodel)

set.seed(42)
Pmodel <- train(
  y ~ . - mpg - name,
  data = Auto,
  preProc = preProc_opt,
  method = "svmPoly",
  trControl = ctrl,
  tuneLength = 5
)
Pmodel

set.seed(42)
Rmodel <- train(
  y ~ . - mpg - name,
  data = Auto,
  preProc = preProc_opt,
  method = "svmRadial",
  trControl = ctrl,
  tuneLength = 5
)
Rmodel
plot(Rmodel)

# Compare the cross-validated accuracy of the three kernels.
resamps <- resamples(list(Linear = Lmodel, Poly = Pmodel, Radial = Rmodel))
summary(resamps)
bwplot(resamps, metric = "Accuracy")
densityplot(resamps, metric = "Accuracy")
58+
59+
60+
61+
62+
63+
# NOTE(review): the statements below read "test.csv", use Pclass, Fare,
# PassengerId and Survived, and predict with `model_logit`. No object named
# model_logit is assigned in the visible part of this script (the models
# fitted above are Lmodel, Pmodel and Rmodel, all trained on Auto), so the
# predict() call cannot run as written. This looks like a copy of the scoring
# section of the decision-tree script -- confirm whether it should be removed.
testSet = read.table("test.csv", sep = ",", header = TRUE)
64+
dim(testSet)
65+
str(testSet)
66+
head(testSet)
67+
testSet$Pclass = factor(testSet$Pclass)
68+
summary(testSet)
69+
# Replace missing fares with the mean fare of the test set.
testSet$Fare = ifelse(is.na(testSet$Fare), mean(testSet$Fare, na.rm = TRUE), testSet$Fare)
70+
71+
72+
testSet$Survived = predict(model_logit, newdata = testSet)
73+
74+
# Keep the two columns PassengerId and Survived.
submission = testSet[,c("PassengerId", "Survived")]
75+
76+
# Write the result as a comma-separated file with a header and no row names.
write.table(submission, file = "submission.csv", col.names = TRUE, row.names = FALSE, sep = ",")

7.dimension-reduction/pca1.R

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
library(caret)
2+
# Small 4 x 2 example matrix holding the integers 1..8, filled column-wise.
m <- matrix(1:8, nrow = 4, ncol = 2)

# NOTE(review): a bare `svd` only prints the function definition; presumably
# a call such as svd(m) was intended -- confirm before changing.
svd

# PCA on the correlation matrix (TRUE spelled out; T can be reassigned).
pca <- princomp(m, cor = TRUE)
summary(pca)
pca$loadings
pca$scores

# preProcess() needs the data as its first argument; the original call passed
# only `method` and stopped with a missing-argument error. The matrix is
# converted to a data frame so that its columns carry names.
preProcess(as.data.frame(m), method = "pca")

7.dimension-reduction/pca2.R

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
setwd("E:/data analytics/datasets")
2+
3+
# Read the tab-separated protein data set; the first row holds column names.
data <- read.table(file = "protein.txt", header = TRUE, sep = "\t")

# First look: dimensions, column types and the leading rows.
dim(data)
str(data)
head(data)

# Drop the first column (country names) so only the variables used for the
# PCA remain.
data <- data[, -1]

# Univariate summaries, then correlation and covariance matrices.
summary(data)
cor(data)
cov(data)

# PCA on the correlation matrix.
pca <- princomp(data, cor = TRUE)
names(pca)
pca
summary(pca)

# Variance per component, drawn as a line plot.
plot(pca, type = "lines")

# Loadings and component scores.
loadings(pca)
pca$scores

# Scores on the first principal component, as points and as bars.
plot(pca$scores[, 1])
barplot(pca$scores[, 1])

0 commit comments

Comments
 (0)