# Data import ----

# Set the working directory and build the paths of the two mail folders.
# NOTE(review): the absolute path below is machine-specific; adjust it before
# running the script on another machine.
setwd("C:\\Users\\kelanj\\Documents")
spam.path <- file.path("data", "spam")
ham.path <- file.path("data", "ham")

# Character vectors holding the file names found in each folder.
spam.docs <- dir(spam.path)
ham.docs <- dir(ham.path)

# Show the first four spam file names.
spam.docs[1:4]

## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"## [2] "00001.7848dde101aa985090474a91ec93fcf0"## [3] "00002.9438920e9a55591b18e60d1ed37d992b"## [4] "00002.d94f1b97e48ed3b553b3508d116e6a09"


## [1] "00001.1a31cc283af0060967a233d26548a6ce"## [2] "00001.7c53336b37003a9286aba55d2945844c"## [3] "00002.5a587ae61666c5aa097c8e866aedcc59"## [4] "00002.9c4069e25e1ef370c078db7ee85ff9ac"

# Read a single e-mail and return its body as one string.
#
# An e-mail consists of a header and a body, normally separated by the first
# empty line; everything after that line is treated as the body.
#
# Args:
#   path: path of the e-mail file.
#
# Returns:
#   A length-one character vector with the body lines joined by "\n", or ""
#   when the file has no empty separator line or nothing follows it.
getContent <- function(path) {
  # Do not specify an encoding here, otherwise reading fails.
  conn <- file(path, open = "rt")
  # Close the connection even if readLines() fails.
  on.exit(close(conn), add = TRUE)
  line <- readLines(conn, warn = FALSE)

  first_blank <- which(line == "")[1]
  # No separator, or separator on the last line: there is no body. Return an
  # empty string instead of letting an error object leak into the text.
  if (is.na(first_blank) || first_blank >= length(line)) {
    return("")
  }

  content <- line[seq(first_blank + 1, length(line), 1)]
  paste(content, collapse = "\n")
}

# Read the bodies of all spam (1897 files) and ham (3900 files) messages.
# vapply() guarantees one string per file; the file names become the names.
spamContent <- vapply(
  spam.docs,
  function(path) getContent(file.path(spam.path, path)),
  character(1)
)
hamContent <- vapply(
  ham.docs,
  function(path) getContent(file.path(ham.path, path)),
  character(1)
)

# Combine the message bodies: spam first, then ham.
s.h.content <- c(spamContent, hamContent)





library(NLP)
library(tm)
library(SnowballC) # word stemming
library(slam)

# Replace every URL (http, https, ftp, file) in `x` with the token "http".
#
# Args:
#   x: character vector.
#
# Returns:
#   `x` with each URL replaced by "http".
myremoveURL <- function(x) {
  gsub(pattern = "(https?|ftp|file):\\/\\/[-A-Za-z0-9+&@#\\/%?=~_|!:,\\.;]+[-A-Za-z0-9+&@#\\/%=~_|]", "http", x)
}

# Strip HTML tags (anything between "<" and ">") from `x`.
#
# Args:
#   x: character vector.
#
# Returns:
#   `x` with the tags removed.
myremoveHTML <- function(x) {
  gsub(pattern = "<[^>]+>", "", x)
}

# Custom English stop-word list: tm's default list plus "will" and "also".
myenstopwords <- function() {
  c(stopwords(), "will", "also")
}

# Custom text-cleaning pipeline.
#
# Args:
#   content: character vector, one element per document.
#
# Returns:
#   A tm corpus in which URLs are replaced by "http", HTML tags are removed,
#   text is lower-cased, and numbers, stop words, punctuation and redundant
#   whitespace are removed (in that order).
cleanContent1 <- function(content) {
  # VCorpus stores each document as a PlainTextDocument, which replaces the
  # former explicit tm_map(..., PlainTextDocument) step.
  contentCorpus <- VCorpus(VectorSource(content))
  # Functions that are not tm transformations must be wrapped in
  # content_transformer() so that the documents stay TextDocument objects.
  contentCorpus <- tm_map(contentCorpus, content_transformer(myremoveURL))
  contentCorpus <- tm_map(contentCorpus, content_transformer(myremoveHTML))
  contentCorpus <- tm_map(contentCorpus, content_transformer(tolower))
  contentCorpus <- tm_map(contentCorpus, removeNumbers)
  contentCorpus <- tm_map(contentCorpus, removeWords, myenstopwords())
  contentCorpus <- tm_map(contentCorpus, removePunctuation)
  contentCorpus <- tm_map(contentCorpus, stripWhitespace)
  contentCorpus
}


# Set every locale category (LC_ALL) to the "us" locale.
# NOTE(review): "us" looks like a Windows-style locale name; confirm it
# resolves on the platform this script is run on.
Sys.setlocale(category = "LC_ALL", locale = "us")

## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"


# Build the document-term matrix of the whole corpus.
# NOTE(review): `s.h.corpus` is not created in the visible part of this file;
# presumably it is the result of cleanContent1(s.h.content). Confirm.
s.h.dtm <- DocumentTermMatrix(s.h.corpus)

# 5797 rows (documents): the first 1897 are spam, the remaining 3900 are ham.
# NOTE(review): the original comment claimed 98191 columns, whereas the
# printed summary reports 71600 terms.
s.h.dtm

## <<DocumentTermMatrix (documents: 5797, terms: 71600)>>## Non-/sparse entries: 553817/414511383## Sparsity : 100%## Maximal term length: 868## Weighting: term frequency (tf)







# Term-frequency statistics: convert the first 1423 rows of the training
# matrix to an ordinary (dense) matrix first, then sum every term column.
s.dtm.train <- as.matrix(s.h.dtm.train[1:1423, ])
s.sum <- col_sums(s.dtm.train)
s.term <- names(s.sum)
s.freq <- as.numeric(s.sum)

# Build the data frame column by column. Going through cbind() would coerce
# the counts to character, and on R < 4.0 as.data.frame() would then turn them
# into factors, so that as.numeric() returned level codes instead of counts.
s.frame <- data.frame(
  s.term = s.term,
  s.freq = s.freq,
  stringsAsFactors = FALSE
)
head(s.frame)

## s.term s.freq## 1 abandoned43## 2accept8## 3address 330## 4 agree88## 5agreed 164## 6 alternative 269






# Keep only the terms whose total frequency lies in [lowfreq, highfreq].
#
# Args:
#   x: a DocumentTermMatrix, TermDocumentMatrix or simple_triplet_matrix.
#   lowfreq: numeric lower bound on the row sum (inclusive).
#   highfreq: numeric upper bound on the row sum (inclusive).
#
# Returns:
#   The rows of `x` whose row sum is within the bounds. A DocumentTermMatrix
#   is transposed first, so in that case the rows of the result are terms.
myfindFreqTerms <- function(x, lowfreq = 0, highfreq = Inf) {
  stopifnot(
    inherits(
      x,
      c("DocumentTermMatrix", "TermDocumentMatrix", "simple_triplet_matrix")
    ),
    is.numeric(lowfreq),
    is.numeric(highfreq)
  )
  if (inherits(x, "DocumentTermMatrix")) {
    x <- t(x)
  }
  rs <- slam::row_sums(x)
  y <- which(rs >= lowfreq & rs <= highfreq)
  x[y, ]
}

# Dictionary: terms that occur at least 100 times in the training matrix.
s.h.dict <- Terms(myfindFreqTerms(s.h.dtm.train, 100))
length(s.h.dict) # 1151 terms in total

## [1] 1151



## <<DocumentTermMatrix (documents: 4349, terms: 1151)>>## Non-/sparse entries: 224076/4781623## Sparsity : 96%## Maximal term length: 35## Weighting: term frequency (tf)


## <<DocumentTermMatrix (documents: 1448, terms: 1151)>>## Non-/sparse entries: 52503/1614145## Sparsity : 97%## Maximal term length: 35## Weighting: term frequency (tf)



# Convert term counts into a two-level presence indicator.
#
# Args:
#   x: numeric vector of term counts.
#
# Returns:
#   A factor with levels "No" (count is 0) and "Yes" (count is positive).
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  factor(x, levels = c(0, 1), labels = c("No", "Yes"))
}

# Apply the conversion to every term column of the training and test sets.
# apply() coerces the factor results, so both objects are character matrices
# holding "No"/"Yes".
s_h_train <- apply(s.h.train, MARGIN = 2, convert_counts)
s_h_test <- apply(s.h.test, MARGIN = 2, convert_counts)


library(e1071)

# Class labels, in row order: spam documents first, then ham documents
# (1423 + 2926 for training, 473 + 975 for testing).
s_h_train_type <- c(rep("spam", 1423), rep("ham", 2926))
s_h_test_type <- c(rep("spam", 473), rep("ham", 975))

# Keep the one-column data frame used below, but store the label as an
# explicit factor so the class levels do not depend on the
# stringsAsFactors default of the running R version.
s_h_train_type <- data.frame(s_h_train_type = factor(s_h_train_type))

# Train the naive Bayes classifier with Laplace smoothing 1 and predict the
# class of every test document.
model_s_h <- naiveBayes(
  s_h_train,
  s_h_train_type$s_h_train_type,
  laplace = 1
)
s_h_prediction <- predict(model_s_h, s_h_test, type = "class")




## ## ## Cell Contents## |-------------------------|## | N |## | Chi-square contribution |## | N / Row Total |## | N / Col Total |## |-------------------------|## ## ## Total Observations in Table: 1448 ## ## ## | actual ## predicted | ham |spam | Row Total | ## -------------|-----------|-----------|-----------|##ham | 967 | 69 |1036 | ## | 104.053 | 214.486 | | ## |0.933 |0.067 |0.715 | ## |0.992 |0.146 | | ## -------------|-----------|-----------|-----------|## spam | 8 | 404 | 412 | ## | 261.648 | 539.337 | | ## |0.019 |0.981 |0.285 | ## |0.008 |0.854 | | ## -------------|-----------|-----------|-----------|## Column Total | 975 | 473 |1448 | ## |0.673 |0.327 | | ## -------------|-----------|-----------|-----------|## ##

分析:从表中可以看出,975封非垃圾邮件中有8封被错误地归为垃圾邮件,比例为0.8%;而473封垃圾邮件中有69封被错误地归为非垃圾邮件,比例为14.6%。



# Retrain with a much smaller Laplace smoothing value (0.001) and evaluate
# the new predictions against the true test labels.
model_s_h <- naiveBayes(
  s_h_train,
  s_h_train_type$s_h_train_type,
  laplace = 0.001
)
s_h_prediction <- predict(model_s_h, s_h_test, type = "class")

# NOTE(review): CrossTable() is presumably gmodels::CrossTable; the matching
# library(gmodels) call is not visible in this part of the file. Confirm.
CrossTable(
  s_h_prediction,
  s_h_test_type,
  prop.chisq = TRUE,
  prop.t = FALSE,
  dnn = c("predicted", "actual")
)

## ## ## Cell Contents## |-------------------------|## | N |## | Chi-square contribution |## | N / Row Total |## | N / Col Total |## |-------------------------|## ## ## Total Observations in Table: 1448 ## ## ## | actual ## predicted | ham |spam | Row Total | ## -------------|-----------|-----------|-----------|##ham | 969 | 66 |1035 | ## | 106.231 | 218.975 | | ## |0.936 |0.064 |0.715 | ## |0.994 |0.140 | | ## -------------|-----------|-----------|-----------|## spam | 6 | 407 | 413 | ## | 266.220 | 548.762 | | ## |0.015 |0.985 |0.285 | ## |0.006 |0.860 | | ## -------------|-----------|-----------|-----------|## Column Total | 975 | 473 |1448 | ## |0.673 |0.327 | | ## -------------|-----------|-----------|-----------|## ##



