
Decision Tree (CART Algorithm) for Chinese Text Classification



Adapted from this blog post:

/github_36326955/article/details/54891204

According to the reference link below, the decision tree in sklearn uses the CART algorithm:

http://sofasofa.io/forum_main_post.php?postid=1000402&
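For reference, scikit-learn's documentation describes DecisionTreeClassifier as an optimized version of the CART algorithm, and its default split criterion is CART's Gini impurity. A minimal check (my addition, not from the original post):

from sklearn.tree import DecisionTreeClassifier

# CART-style binary splits; 'gini' is the default impurity criterion.
clf = DecisionTreeClassifier(criterion='gini')
print(clf.criterion)  # gini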

Just making a note of it here.

Just run the scripts in the order 1, 2, 3, 4:

1.py (corpus_segment.py)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@
@file: corpus_segment.py
@time: /2/5 15:28
@software: PyCharm
"""
import sys
import os
import jieba

# Configure a UTF-8 output environment
reload(sys)
sys.setdefaultencoding('utf-8')

# Save content to a file. The with-statement closes the file automatically,
# so no explicit close/try is needed (Python 2.6+; 2.5 needs
# from __future__ import with_statement).
def savefile(savepath, content):
    with open(savepath, "wb") as fp:
        fp.write(content)

# Read a file's content
def readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path: path of the unsegmented corpus
    seg_path: path where the segmented corpus will be stored
    '''
    catelist = os.listdir(corpus_path)  # all subdirectories of corpus_path
    # Each subdirectory name is a category name; e.g. for train_corpus/art/21.txt,
    # 'train_corpus/' is corpus_path and 'art' is one member of catelist.
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # category directory, e.g. train_corpus/art/
        seg_dir = seg_path + mydir + "/"        # output directory, e.g. train_corpus_seg/art/
        if not os.path.exists(seg_dir):         # create the output directory if missing
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)      # all texts of this category,
                                                # e.g. ['21.txt', '22.txt', ...]
        for file_path in file_list:             # iterate over the category's files
            fullname = class_path + file_path   # full path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)
            # content holds the raw text, including extra spaces, blank lines,
            # line breaks, etc. Strip these irrelevant characters, leaving
            # compact text separated only by punctuation.
            content = content.replace("\r\n", "")  # remove line breaks
            content = content.replace(" ", "")     # remove blank lines and extra spaces
            content_seg = jieba.cut(content)       # segment the text with jieba
            savefile(seg_dir + file_path, " ".join(content_seg))  # save the segmented text
    print "Finished segmenting the Chinese corpus!"

# The block below runs only when this file is executed directly (from the
# command line or an IDE such as PyCharm); it is skipped when the file is
# imported as a module, so it acts as a functional test.
if __name__ == "__main__":
    # Segment the training set
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train/"  # unsegmented corpus path (input)
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"  # segmented corpus path (output of this program)
    corpus_segment(corpus_path, seg_path)
    # Segment the test set
    corpus_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/answer/"  # unsegmented corpus path (input)
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"  # segmented corpus path (output of this program)
    corpus_segment(corpus_path, seg_path)
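As a quick sanity check of the segmentation step (my addition, not part of the original scripts), jieba can be tried on a single sentence; the exact split depends on jieba's dictionary version:

# -*- coding: UTF-8 -*-
import jieba

sentence = u"决策树针对中文文本分类"
print(" ".join(jieba.cut(sentence)))
# e.g. 决策树 针对 中文 文本 分类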

2.py (corpus2Bunch.py)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@
@file: corpus2Bunch.py
@time: /2/7 7:41
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import os  # built-in module for file and directory operations; we use os.listdir
import cPickle as pickle  # cPickle is the faster C implementation of pickle;
                          # see the author's post: /github_36326955/article/details/54882506
from sklearn.datasets.base import Bunch  # a simple container data structure from sklearn

# Read a file. The leading underscore marks the function as private by
# convention only; it can still be called from outside and merely improves
# readability.
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # subdirectories of seg_path, i.e. the category names
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    # extend(addlist) appends every element of another list to the original
    # list; note the difference from append(element), which adds one element.
    for mydir in catelist:  # walk every file under every category directory
        class_path = seg_path + mydir + "/"  # category directory
        file_list = os.listdir(class_path)   # all files under class_path
        for file_path in file_list:
            fullname = class_path + file_path  # full file path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(_readfile(fullname))  # read the file content
    # Serialize the bunch to wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print "Finished building the text objects!"

if __name__ == "__main__":
    # Turn the training set into a Bunch:
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"  # Bunch storage path (output)
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_corpus_seg/"  # segmented corpus path (input)
    corpus2Bunch(wordbag_path, seg_path)
    # Turn the test set into a Bunch:
    wordbag_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"  # Bunch storage path (output)
    seg_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_corpus_seg/"  # segmented corpus path (input)
    corpus2Bunch(wordbag_path, seg_path)
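To verify what corpus2Bunch wrote, the .dat file can be reloaded and inspected. A minimal sketch, assuming the training-set path used above:

import cPickle as pickle

path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"
with open(path, "rb") as f:
    bunch = pickle.load(f)

print(bunch.target_name)    # list of category names
print(len(bunch.contents))  # number of documents
print(bunch.filenames[0])   # path of the first document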

3.py (TFIDF_space.py)

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@version: python2.7.8
@author: XiangguoSun
@contact: sunxiangguodut@
@file: TFIDF_space.py
@time: /2/8 11:39
@software: PyCharm
"""
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from sklearn.datasets.base import Bunch
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Read a plain file
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content

# Read a pickled bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Write a bunch object to disk
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)

def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = _readfile(stopword_path).splitlines()  # stop-word list
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filenames=bunch.filenames, tdm=[], vocabulary={})
    if train_tfidf_path is not None:
        # Test set: reuse the training vocabulary so both tf-idf matrices
        # share the same feature columns.
        trainbunch = _readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                     max_df=0.5, vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    else:
        # Training set: learn the vocabulary from the corpus itself.
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_
    _writebunchobj(space_path, tfidfspace)
    print "Created the tf-idf vector space instance!"

if __name__ == '__main__':
    stopword_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/hlt_stop_words.txt"  # input
    train_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/train_set.dat"  # input
    space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # output
    vector_space(stopword_path, train_bunch_path, space_path)

    train_tfidf_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/train_word_bag/tfidfspace.dat"  # input, generated above
    test_bunch_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/test_set.dat"  # input
    test_space_path = "/home/appleyuchi/PycharmProjects/MultiNB/csdn_blog/54891204_tenwhy/chinese_text_classification-master/test_word_bag/testspace.dat"  # output
    vector_space(stopword_path, test_bunch_path, test_space_path, train_tfidf_path)
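The branch on train_tfidf_path above is the crux of this step: the test set must be vectorized with the vocabulary learned on the training set, otherwise its tf-idf matrix would have different columns and a classifier trained on train_set.tdm could not score it. A toy illustration with invented documents (my addition):

from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["cat sat on the mat", "dog sat on the log"]
test_docs = ["cat and dog"]

v_train = TfidfVectorizer()
X_train = v_train.fit_transform(train_docs)

# Reuse the training vocabulary so the test matrix has the same columns;
# words unseen in training (here "and") are simply dropped.
v_test = TfidfVectorizer(vocabulary=v_train.vocabulary_)
X_test = v_test.fit_transform(test_docs)

print(X_train.shape[1] == X_test.shape[1])  # True: identical feature space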

4.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import cPickle as pickle
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes (kept for comparison)

# Read a pickled bunch object
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch

# Load the training set
trainpath = "../train_word_bag/tfidfspace.dat"
train_set = _readbunchobj(trainpath)

# Load the test set
testpath = "../test_word_bag/testspace.dat"
test_set = _readbunchobj(testpath)

# Train the classifier on the bag-of-words vectors and category labels.
# The naive Bayes alternative is kept below; alpha is its additive smoothing
# parameter (smaller alpha follows the training data more closely).
# clf = MultinomialNB(alpha=0.1).fit(train_set.tdm, train_set.label)

######################################################
from sklearn import tree

print '*************************Decision tree************************'
clf = tree.DecisionTreeClassifier()
clf.fit(train_set.tdm, train_set.label)

# Predict the categories of the test set
print '*************************Predicting************************'
predicted = clf.predict(test_set.tdm)
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    if flabel != expct_cate:
        print file_name, ": actual:", flabel, " --> predicted:", expct_cate
print "Prediction finished!"

# Compute classification metrics
from sklearn import metrics

def metrics_result(actual, predict):
    print 'precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted'))
    print 'recall: {0:0.3f}'.format(metrics.recall_score(actual, predict, average='weighted'))
    print 'f1-score: {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted'))

metrics_result(test_set.label, predicted)
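Beyond the three weighted scores, a per-class breakdown often shows which categories the tree confuses (e.g. Military vs. Politics in the output below). A self-contained sketch with made-up labels, separate from the script above:

from sklearn import metrics

actual  = ["C37-Military", "C37-Military", "C37-Military", "C11-Space"]
predict = ["C37-Military", "C38-Politics", "C37-Military", "C11-Space"]

print(metrics.confusion_matrix(actual, predict))       # rows: actual, columns: predicted
print(metrics.classification_report(actual, predict))  # per-class precision/recall/f1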

As before, this uses the Fudan University news dataset.

Run results (a partial copy):

../test_corpus_seg/C37-Military/C37-Military008.txt : actual: C37-Military --> predicted: C11-Space

../test_corpus_seg/C37-Military/C37-Military031.txt : actual: C37-Military --> predicted: C38-Politics

../test_corpus_seg/C37-Military/C37-Military101.txt : actual: C37-Military --> predicted: C38-Politics

../test_corpus_seg/C37-Military/C37-Military006.txt : actual: C37-Military --> predicted: C32-Agriculture

../test_corpus_seg/C37-Military/C37-Military125.txt : actual: C37-Military --> predicted: C29-Transport

Prediction finished!

precision: 0.878

recall: 0.879

f1-score: 0.878
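The scores above use average='weighted': the metric is computed per class and then averaged with each class weighted by its support (number of true samples). A hand-checkable toy example with invented labels:

from sklearn import metrics

actual  = ["a", "a", "a", "b"]
predict = ["a", "a", "b", "b"]

# precision: class a = 2/2 = 1.0, class b = 1/2 = 0.5;
# weighted by supports (3, 1): (3*1.0 + 1*0.5)/4 = 0.875
print("%.3f" % metrics.precision_score(actual, predict, average="weighted"))  # 0.875
# recall: class a = 2/3, class b = 1/1 = 1.0; weighted: (3*(2./3) + 1*1.0)/4 = 0.750
print("%.3f" % metrics.recall_score(actual, predict, average="weighted"))     # 0.750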
