algorithmica-repository
diff --git a/‎naive-bayes/naivebayes1.R‎ renamed to ‎11-naive-bayes/naivebayes1.R‎ b/‎naive-bayes/naivebayes1.R‎ renamed to ‎11-naive-bayes/naivebayes1.R‎
diff --git a/‎naive-bayes/naivebayes2.R‎ renamed to ‎11-naive-bayes/naivebayes2.R‎ b/‎naive-bayes/naivebayes2.R‎ renamed to ‎11-naive-bayes/naivebayes2.R‎
diff --git a/‎12-decisiontree/decision-trees.R‎
Lines changed: 82 additions & 0 deletions b/‎12-decisiontree/decision-trees.R‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎13-linear-regression/linear-regression1.R‎
Lines changed: 21 additions & 0 deletions b/‎13-linear-regression/linear-regression1.R‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎13-linear-regression/linear-regression2.R‎
Lines changed: 24 additions & 0 deletions b/‎13-linear-regression/linear-regression2.R‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎13-linear-regression/linear-regression3.R‎
Lines changed: 32 additions & 0 deletions b/‎13-linear-regression/linear-regression3.R‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎logistic-regression/logistic-regression.R‎ renamed to ‎14-logistic-regression/logistic-regression.R‎ b/‎logistic-regression/logistic-regression.R‎ renamed to ‎14-logistic-regression/logistic-regression.R‎
diff --git a/‎15-svm/svm2.R‎
Lines changed: 76 additions & 0 deletions b/‎15-svm/svm2.R‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎7.dimension-reduction/pca1.R‎
Lines changed: 10 additions & 0 deletions b/‎7.dimension-reduction/pca1.R‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎7.dimension-reduction/pca2.R‎
Lines changed: 27 additions & 0 deletions b/‎7.dimension-reduction/pca2.R‎
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,82 @@
+library(caret)
+library(randomForest)
+library(ggplot2)
+library(Amelia)
+
+# Titanic survival: explore the training data, fit a cross-validated decision
+# tree (caret method "rpart") and write a submission file.
+
+# Folder holding train.csv and test.csv; submission.csv is written there too.
+# Paths are built with file.path() instead of calling setwd(), so the working
+# directory of the R session is left untouched.
+data_dir <- "E:/data analytics/kaggle/titanic/data"
+
+# Empty fields are read as NA, just like the literal string "NA".
+trainSet <- read.csv(file.path(data_dir, "train.csv"), header = TRUE,
+                     na.strings = c("NA", ""))
+dim(trainSet)
+str(trainSet)
+head(trainSet)
+# Categorical columns are converted explicitly: since R 4.0 read.csv() keeps
+# strings as character, and summary() of a character column shows no counts.
+trainSet$Survived <- factor(trainSet$Survived)
+trainSet$Pclass <- factor(trainSet$Pclass)
+trainSet$Sex <- factor(trainSet$Sex)
+trainSet$Embarked <- factor(trainSet$Embarked)
+summary(trainSet)
+
+missmap(trainSet, main = "Titanic Training Data - Missings Map",
+        col = c("yellow", "black"), legend = FALSE)
+
+table(trainSet$Survived)
+ggplot(trainSet, aes(x = Survived)) + geom_bar()
+
+# Comparing Survived and passenger class using a table and a bar chart
+summary(trainSet$Pclass)
+xtabs(~Survived + Pclass, data = trainSet)
+ggplot(trainSet, aes(x = Survived, fill = Pclass)) + geom_bar()
+
+# Comparing Survived and Sex using a table and a bar chart
+summary(trainSet$Sex)
+xtabs(~Survived + Sex, data = trainSet)
+ggplot(trainSet, aes(x = Survived, fill = Sex)) + geom_bar()
+
+
+# Comparing Survived and Embarked using a table and a bar chart
+summary(trainSet$Embarked)
+xtabs(~Survived + Embarked, data = trainSet)
+ggplot(trainSet, aes(x = Survived, fill = Embarked)) + geom_bar()
+
+# Comparing Age and Survived: the boxplots are very similar between Age
+# for survivors and those who died.
+xtabs(~Survived + Age, data = trainSet)
+ggplot(trainSet, aes(x = Survived, y = Age)) + geom_boxplot()
+# Also, there are lots of NA's. The variable is still used in the model
+# below, so the missing values are passed through to rpart (see na.action).
+summary(trainSet$Age)
+
+# Comparing Survived and Fare: the boxplots are much different between
+# fare for survivors and those who died.
+ggplot(trainSet, aes(x = Survived, y = Fare)) + geom_boxplot()
+# Also, there are no NA's. Include this variable.
+summary(trainSet$Fare)
+
+# Comparing Survived and Parch
+ggplot(trainSet, aes(x = Survived, y = Parch)) + geom_boxplot()
+summary(trainSet$Parch)
+
+# Set a random seed so the cross-validation folds are reproducible
+set.seed(42)
+
+# Model tuning strategy
+ctrl <- trainControl(method = "cv", # Use cross-validation
+                     number = 10)   # Use 10 folds for cross-validation
+
+# Train a decision tree (method = "rpart"). The object keeps its name
+# model_rf although no random forest is fitted.
+# train()'s formula interface defaults to na.action = na.fail, which stops on
+# the missing Age/Embarked values; na.pass hands the NA's on to rpart.
+model_rf <- train(Survived ~ Pclass + Sex + Age + Embarked + SibSp + Fare,
+                  data = trainSet,
+                  method = "rpart",
+                  trControl = ctrl,
+                  na.action = na.pass)
+model_rf
+
+# Read the test set exactly like the training set so both share NA handling.
+testSet <- read.csv(file.path(data_dir, "test.csv"), header = TRUE,
+                    na.strings = c("NA", ""))
+dim(testSet)
+str(testSet)
+head(testSet)
+# Reuse the training levels so the factors line up with the fitted model.
+testSet$Pclass <- factor(testSet$Pclass, levels = levels(trainSet$Pclass))
+testSet$Sex <- factor(testSet$Sex, levels = levels(trainSet$Sex))
+testSet$Embarked <- factor(testSet$Embarked,
+                           levels = levels(trainSet$Embarked))
+summary(testSet)
+# Replace missing fares by the mean fare of the test set.
+testSet$Fare <- ifelse(is.na(testSet$Fare),
+                       mean(testSet$Fare, na.rm = TRUE),
+                       testSet$Fare)
+
+
+# Predict with the model fitted above. predict.train() defaults to
+# na.action = na.omit, which would drop rows with a missing Age and return
+# fewer predictions than rows; na.pass keeps one prediction per passenger.
+testSet$Survived <- predict(model_rf, newdata = testSet, na.action = na.pass)
+
+submission <- testSet[, c("PassengerId", "Survived")]
+
+write.table(submission, file = file.path(data_dir, "submission.csv"),
+            col.names = TRUE, row.names = FALSE, sep = ",")
@@ -0,0 +1,21 @@
+library(ggplot2)
+library(dplyr)
+library(klaR)
+library(e1071)
+library(caret)
+library(Lock5Data)
+# Linear regression on the RestaurantTips data: regress Tip on every other
+# column and evaluate the fit with 10-fold cross-validation.
+data("RestaurantTips")
+
+# Quick look at the data: dimensions, column types, first rows.
+dim(RestaurantTips)
+str(RestaurantTips)
+head(RestaurantTips)
+
+# Resampling scheme: 10-fold cross-validation.
+ctrl <- trainControl(method = "cv", number = 10)
+
+reg_model <- train(
+  Tip ~ .,
+  data = RestaurantTips,
+  method = "lm",
+  trControl = ctrl
+)
+reg_model                                  # resampling results
+reg_model[["finalModel"]]                  # fitted model stored by train()
+reg_model[["finalModel"]][["residuals"]]   # its residuals
+
+# Predictions for the same rows the model was trained on.
+predicted <- predict(reg_model, newdata = RestaurantTips)
+str(predicted)
@@ -0,0 +1,24 @@
+library(ggplot2)
+library(dplyr)
+library(klaR)
+library(e1071)
+library(caret)
+library(Lock5Data)
+# Linear regression with a nearly redundant predictor: attr2 is Bill + Guests
+# plus a little Gaussian noise, so it is almost (but not exactly) a linear
+# combination of the other two predictors in the model.
+data(RestaurantTips)
+
+dim(RestaurantTips)
+str(RestaurantTips)
+head(RestaurantTips)
+
+# Fixed seed so the noise in attr2 and the cross-validation folds are
+# reproducible from run to run.
+set.seed(42)
+
+# Exact linear combination of Bill and Guests (not used in the model below).
+RestaurantTips$attr1 <- RestaurantTips$Bill + RestaurantTips$Guests
+
+# Same combination plus N(0, 1) noise scaled by 0.05. One draw per row:
+# nrow() replaces the hard-coded 157 so the line follows the data's size.
+RestaurantTips$attr2 <- RestaurantTips$Bill + RestaurantTips$Guests +
+  rnorm(nrow(RestaurantTips), mean = 0, sd = 1) * 0.05
+
+# 10-fold cross-validation.
+ctrl <- trainControl(method = "cv", number = 10)
+
+reg_model <- train(Tip ~ Bill + Guests + attr2, data = RestaurantTips,
+                   method = "lm", trControl = ctrl)
+reg_model
+reg_model$finalModel
+
+# Predictions for the same rows the model was trained on.
+predicted <- predict(reg_model, newdata = RestaurantTips)
+str(predicted)
@@ -0,0 +1,32 @@
+library(ggplot2)
+library(dplyr)
+library(klaR)
+library(e1071)
+library(caret)
+library(Lock5Data)
+# Compare ordinary least squares, ridge and lasso on predictors that are
+# exactly collinear: attr1 is Bill + Guests, and all three enter the model.
+data(RestaurantTips)
+
+dim(RestaurantTips)
+str(RestaurantTips)
+head(RestaurantTips)
+
+# Fixed seed so the noise in attr2 is reproducible.
+set.seed(42)
+
+# Exact linear combination of Bill and Guests.
+RestaurantTips$attr1 <- RestaurantTips$Bill + RestaurantTips$Guests
+
+# Noisy version of the same combination (not used in the models below).
+# One draw per row: nrow() replaces the hard-coded 157.
+RestaurantTips$attr2 <- RestaurantTips$Bill + RestaurantTips$Guests +
+  rnorm(nrow(RestaurantTips), mean = 0, sd = 1) * 0.05
+
+# 10-fold cross-validation.
+ctrl <- trainControl(method = "cv", number = 10)
+
+# The seed is reset before every train() call so the three models are
+# resampled on the same folds and their results are comparable.
+set.seed(42)
+reg_model1 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips,
+                    method = "lm", trControl = ctrl)
+
+set.seed(42)
+reg_model2 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips,
+                    method = "ridge", trControl = ctrl)
+
+set.seed(42)
+reg_model3 <- train(Tip ~ Bill + Guests + attr1, data = RestaurantTips,
+                    method = "lasso", trControl = ctrl)
+
+# Resampling results and fitted model for each method.
+reg_model1
+reg_model1$finalModel
+
+reg_model2
+reg_model2$finalModel
+
+reg_model3
+reg_model3$finalModel
@@ -0,0 +1,76 @@
+library(caret)
+library(e1071)
+library(ggplot2)
+library(Amelia)
+library(kernlab)
+library(ISLR)
+
+# Classify cars in the Auto data as above/below median mpg with linear,
+# polynomial and radial SVMs, then compare the three with 10-fold CV.
+data(Auto)
+
+str(Auto)
+dim(Auto)
+head(Auto)
+summary(Auto)
+
+# Binary target: "1" when mpg is above the median, "0" otherwise.
+Auto$y <- factor(as.integer(Auto$mpg > median(Auto$mpg)))
+
+
+missmap(Auto, main = "Missings Map",
+        col = c("yellow", "black"), legend = FALSE)
+
+# Model tuning strategy
+ctrl <- trainControl(method = "cv", # Use cross-validation
+                     number = 10)   # Use 10 folds for cross-validation
+
+preProc_opt <- c("knnImpute", "center", "scale")
+
+# y is computed from mpg, so mpg must not be a predictor (it would give the
+# answer away); name is the vehicle's label, not a measurement.
+svm_formula <- y ~ . - mpg - name
+
+# caret's default grid for svmLinear holds a single cost value, which leaves
+# plot(Lmodel) with nothing to draw; tune C over an explicit grid instead.
+linear_grid <- expand.grid(C = 2^(-2:2))
+
+# The seed is reset before every train() call so all three models are
+# resampled on the same folds and the resamples() comparison is paired.
+set.seed(42)
+Lmodel <- train(svm_formula,
+                data = Auto,
+                preProcess = preProc_opt,
+                method = "svmLinear",
+                trControl = ctrl,
+                tuneGrid = linear_grid)
+Lmodel
+Lmodel$finalModel
+
+plot(Lmodel)
+
+set.seed(42)
+Pmodel <- train(svm_formula,
+                data = Auto,
+                preProcess = preProc_opt,
+                method = "svmPoly",
+                trControl = ctrl,
+                tuneLength = 5)
+Pmodel
+
+set.seed(42)
+Rmodel <- train(svm_formula,
+                data = Auto,
+                preProcess = preProc_opt,
+                method = "svmRadial",
+                trControl = ctrl,
+                tuneLength = 5)
+Rmodel
+plot(Rmodel)
+
+# Side-by-side comparison of the cross-validated accuracy of the three models.
+resamps <- resamples(list(Linear = Lmodel, Poly = Pmodel, Radial = Rmodel))
+summary(resamps)
+bwplot(resamps, metric = "Accuracy")
+densityplot(resamps, metric = "Accuracy")
+
+
+# NOTE(review): the statements below look like a leftover from a Titanic
+# script. They read a test.csv with Pclass/Fare/PassengerId columns and
+# predict with model_logit, which this script never defines, so they cannot
+# run here. They are kept commented out until someone confirms they can go.
+#
+# testSet = read.table("test.csv", sep = ",", header = TRUE)
+# dim(testSet)
+# str(testSet)
+# head(testSet)
+# testSet$Pclass = factor(testSet$Pclass)
+# summary(testSet)
+# testSet$Fare = ifelse(is.na(testSet$Fare), mean(testSet$Fare, na.rm = TRUE), testSet$Fare)
+#
+# testSet$Survived = predict(model_logit, newdata = testSet)
+#
+# submission = testSet[,c("PassengerId", "Survived")]
+#
+# write.table(submission, file = "submission.csv", col.names = TRUE, row.names = FALSE, sep = ",")
@@ -0,0 +1,10 @@
+library(caret)
+# Toy PCA example on a 4 x 2 matrix holding the integers 1..8.
+m <- matrix(1:8, nrow = 4, ncol = 2)
+
+# Singular value decomposition of m. A bare `svd` would only print the
+# function itself. NOTE(review): presumably svd(m) was intended - confirm.
+svd(m)
+
+# PCA on the correlation matrix (cor = TRUE), then its summary, loadings
+# and scores.
+pca <- princomp(m, cor = TRUE)
+summary(pca)
+pca$loadings
+pca$scores
+
+# The same transformation through caret. preProcess() needs the data as its
+# first argument; a data frame is passed so the columns carry names.
+preProcess(as.data.frame(m), method = "pca")
@@ -0,0 +1,27 @@
+# Principal component analysis of the tab-separated protein data set.
+
+# Folder holding protein.txt. The path is built with file.path() instead of
+# calling setwd(), so the working directory of the R session is untouched.
+data_dir <- "E:/data analytics/datasets"
+
+data <- read.table(file.path(data_dir, "protein.txt"), header = TRUE,
+                   sep = "\t")
+
+dim(data)
+str(data)
+head(data)
+
+# For PCA analysis, keep all the variables
+# except the first column with country names:
+data <- data[, -1]
+
+summary(data)
+cor(data)
+cov(data)
+
+# PCA on the correlation matrix; TRUE is spelled out because T is an
+# ordinary variable that can be reassigned.
+pca <- princomp(data, cor = TRUE)
+names(pca)
+pca
+summary(pca)
+# Component variances drawn as a line plot.
+plot(pca, type = "lines")
+
+loadings(pca)
+pca$scores
+
+# Scores on the first principal component, as points and as bars.
+plot(pca$scores[, 1])
+barplot(pca$scores[, 1])