Skip to content

Commit b13305c

Browse files
author
algorithmica-repository
committed
Uploading naive bayes examples discussed in class
1 parent dbcfc6b commit b13305c

File tree

3 files changed

+178
-119
lines changed

3 files changed

+178
-119
lines changed

naive-bayes/naivebayes.R

Lines changed: 0 additions & 119 deletions
This file was deleted.

naive-bayes/naivebayes1.R

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# to simplify selections
2+
library(dplyr)
3+
# for stemming the words
4+
library(SnowballC)
5+
# libraries required by caret
6+
library(klaR)
7+
library(e1071)
8+
# for the Naive Bayes modelling
9+
library(caret)
10+
# to process the text into a corpus
11+
library(tm)
12+
13+
# Set seed for reproducibility
set.seed(1234)

# Read the data
# NOTE(review): hard-coded, machine-specific working directory. It is kept so
# the script's side effects stay the same; consider a configurable data
# directory combined with file.path() instead of setwd().
setwd("E:/data analytics/datasets")
# The archive member is tab-separated without a header row. quote = ""
# disables quote handling so quote characters inside messages are read
# literally.
sms_raw <- read.table(
  unz("smsspamcollection.zip", "SMSSpamCollection"),
  header = FALSE, sep = "\t", quote = "", stringsAsFactors = FALSE
)

# Explore the dataset
dim(sms_raw)
str(sms_raw)
head(sms_raw)

colnames(sms_raw) <- c("type", "text")
sms_raw$type <- factor(sms_raw$type)

# Preparing the dataset
sms_corpus <- Corpus(VectorSource(sms_raw$text))

as.character(sms_corpus[[1]])
inspect(sms_corpus[1:10])

# To avoid the issue with the DocumentTermMatrix method, use one of the
# following solutions:
# 1) Wrap non-tm functions in content_transformer() to avoid the type
#    conversion issue with non-standard transformations
# 2) Add tm_map(PlainTextDocument) after all the cleaning is done

sms_corpus_clean <- sms_corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords(kind = "en")) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)

inspect(sms_corpus_clean[1:10])

# From here on sms_corpus_clean holds the document-term matrix, not the corpus.
# The control list uses wordLengths: tm has no minWordLength option, so that
# name would be ignored and the default 3-character minimum applied instead of
# the intended 2-character minimum.
sms_corpus_clean <- DocumentTermMatrix(
  sms_corpus_clean,
  control = list(wordLengths = c(2, Inf))
)
dim(sms_corpus_clean)
# Keep only terms whose sparsity is below 0.98
sms_corpus_clean <- removeSparseTerms(sms_corpus_clean, 0.98)
dim(sms_corpus_clean)
inspect(sms_corpus_clean[1:10, 1:10])
53+
54+
# Convert the dtm into boolean values instead of term frequencies
55+
# Recode term counts as presence indicators.
#
# x: numeric vector of term frequencies (e.g. one document-term matrix column).
# Returns a factor of the same length with levels "No" (count not above 0)
# and "Yes" (count above 0); NA counts stay NA.
convert_counts <- function(x) {
  present <- ifelse(x > 0, 1, 0)
  # Fixed levels guarantee both labels exist even when a column is all 0 or 1
  factor(present, levels = c(0, 1), labels = c("No", "Yes"))
}
59+
# Recode every term column of the document-term matrix with convert_counts().
# apply() flattens the factors it gets back, so the result is a character
# matrix of "No"/"Yes" values.
sms_corpus_clean_binary <- sms_corpus_clean %>%
  apply(MARGIN = 2, FUN = convert_counts)
dim(sms_corpus_clean_binary)
sms_corpus_clean_binary[1:10, 1:10]

# Train the model: Naive Bayes evaluated with 10-fold cross-validation.
# The model is fitted once and then inspected.
ctrl <- trainControl(method = "cv", number = 10)
sms_model <- train(
  sms_corpus_clean_binary, sms_raw$type,
  method = "nb", trControl = ctrl
)
sms_model

str(sms_model)
sms_model$trainingData
sms_model$resample
# Full element name; `$time` only worked through partial matching
sms_model$times

# Test the model
# NOTE: predictions are made on the same rows the model was trained on.
# Class probabilities, kept for inspection
sms_predict <- predict(sms_model, sms_corpus_clean_binary, type = "prob")
str(sms_predict)
head(sms_predict)

# confusionMatrix() compares predicted class labels with the reference labels,
# so it needs the "raw" class predictions rather than the probability table.
sms_predict_class <- predict(sms_model, sms_corpus_clean_binary, type = "raw")
cm <- confusionMatrix(sms_predict_class, sms_raw$type, positive = "spam")
cm
85+
86+
87+

naive-bayes/naivebayes2.R

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# to simplify selections
2+
library(dplyr)
3+
# for stemming the words
4+
library(SnowballC)
5+
# libraries required by caret
6+
library(klaR)
7+
library(e1071)
8+
# for the Naive Bayes modelling
9+
library(caret)
10+
# to process the text into a corpus
11+
library(tm)
12+
# to get nice looking tables
13+
library(pander)
14+
15+
# Set seed for reproducibility
16+
set.seed(1234)
17+
18+
# Percentage frequency table of the values in x.
#
# x:       vector (or factor) whose values are tabulated.
# caption: currently unused; kept so existing calls that pass it still work.
# Returns a table of percentages, rounded to one decimal place.
frqtab <- function(x, caption = NULL) {
  round(100 * prop.table(table(x)), 1)
}
21+
22+
# Read the data
# NOTE(review): hard-coded, machine-specific working directory. It is kept so
# the script's side effects stay the same; consider a configurable data
# directory combined with file.path() instead of setwd().
setwd("E:/data analytics/datasets")
# The archive member is tab-separated without a header row. quote = ""
# disables quote handling so quote characters inside messages are read
# literally.
sms_raw <- read.table(
  unz("smsspamcollection.zip", "SMSSpamCollection"),
  header = FALSE, sep = "\t", quote = "", stringsAsFactors = FALSE
)
# Shuffle the rows into a random order
sms_raw <- sms_raw[sample(nrow(sms_raw)), ]

# Explore the dataset
dim(sms_raw)
str(sms_raw)
head(sms_raw)

colnames(sms_raw) <- c("type", "text")
sms_raw$type <- factor(sms_raw$type)

# Preparing the dataset
sms_corpus <- Corpus(VectorSource(sms_raw$text))

inspect(sms_corpus[1:10])

# To avoid the issue with the DocumentTermMatrix method, use one of the
# following solutions:
# 1) Wrap non-tm functions in content_transformer() to avoid the type
#    conversion issue with non-standard transformations
# 2) Add tm_map(PlainTextDocument) after all the cleaning is done

sms_corpus_clean <- sms_corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords(kind = "en")) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)

inspect(sms_corpus_clean[1:10])

# From here on sms_corpus_clean holds the document-term matrix, not the corpus.
# The control list uses wordLengths: tm has no minWordLength option, so that
# name would be ignored and the default 3-character minimum applied instead of
# the intended 2-character minimum.
sms_corpus_clean <- DocumentTermMatrix(
  sms_corpus_clean,
  control = list(wordLengths = c(2, Inf))
)
dim(sms_corpus_clean)
inspect(sms_corpus_clean[1:10, 1:10])
58+
59+
# Convert the dtm into boolean values instead of term frequencies
60+
# Recode term counts as presence indicators.
#
# x: numeric vector of term frequencies (e.g. one document-term matrix column).
# Returns a factor of the same length with levels "No" (count not above 0)
# and "Yes" (count above 0); NA counts stay NA.
convert_counts <- function(x) {
  present <- ifelse(x > 0, 1, 0)
  # Fixed levels guarantee both labels exist even when a column is all 0 or 1
  factor(present, levels = c(0, 1), labels = c("No", "Yes"))
}
64+
# Recode every term column of the document-term matrix with convert_counts().
# apply() flattens the factors it gets back, so the result is a character
# matrix of "No"/"Yes" values.
sms_corpus_clean_binary <- sms_corpus_clean %>%
  apply(MARGIN = 2, FUN = convert_counts)
dim(sms_corpus_clean_binary)
sms_corpus_clean_binary[1:10, 1:10]

# Train the model: Naive Bayes with 10-fold cross-validation and caret's
# default tuning grid. The same control object is reused for both fits below.
ctrl <- trainControl(method = "cv", number = 10)
sms_model <- train(
  sms_corpus_clean_binary, sms_raw$type,
  method = "nb", trControl = ctrl
)
sms_model

# Refit with an explicit grid: Laplace correction fL of 0 and 1, no kernel
# density estimate.
# NOTE(review): the accepted tuneGrid columns depend on the caret version
# (leading dots, and possibly an additional `adjust` column for method "nb");
# confirm against the installed release.
grid <- data.frame(.fL = c(0, 1), .usekernel = FALSE)

sms_model <- train(
  sms_corpus_clean_binary, sms_raw$type,
  method = "nb", tuneGrid = grid, trControl = ctrl
)
sms_model

# Test the model
# NOTE: predictions are made on the same rows the model was trained on.
# Class probabilities, kept for inspection
sms_predict <- predict(sms_model, sms_corpus_clean_binary, type = "prob")
str(sms_predict)
head(sms_predict)

# confusionMatrix() compares predicted class labels with the reference labels,
# so it needs the "raw" class predictions rather than the probability table.
sms_predict_class <- predict(sms_model, sms_corpus_clean_binary, type = "raw")
cm <- confusionMatrix(sms_predict_class, sms_raw$type, positive = "spam")
cm
89+
90+
91+

0 commit comments

Comments
 (0)