
[NLP] Sentiment Analysis


A PyTorch implementation of sentiment analysis with a bidirectional LSTM. Data: /udacity/deep-learning-v2-pytorch/tree/master/sentiment-rnn/data

Contents

Network Architecture
Model Training and Prediction
    1. Data Preprocessing
    2. Encoding the words
    3. Encoding the labels
    4. Padding sequences
    5. Training and Test Split
    6. DataLoaders and Batching
    7. Bidirectional LSTM Model
    8. Train
    9. Test
Model Inference
Complete Code
The scikit-learn Approach

Network Architecture
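As implemented in the model defined below, the network consists of an embedding layer that maps word indices to dense vectors, a multi-layer (optionally bidirectional) LSTM, a dropout layer, a fully-connected layer, and a sigmoid output; the sigmoid value at the last time step is used as the predicted probability that the review is positive.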

Model Training and Prediction

1. Data Preprocessing

First, remove the punctuation. The reviews are separated from one another by newline characters (\n), so we treat \n as the delimiter, split the text into individual reviews, and then join all of the reviews back into one large block of text.

import numpy as np

# read data from text files
with open('./data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('./data/labels.txt', 'r') as f:
    labels = f.read()

print(reviews[:1000])
print()
print(labels[:20])

from string import punctuation

# punctuation: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# get rid of punctuation
reviews = reviews.lower()  # lowercase, standardize
all_text = ''.join([c for c in reviews if c not in punctuation])

# split by new lines and spaces
reviews_split = all_text.split('\n')
all_text = ' '.join(reviews_split)

# create a list of words
words = all_text.split()

2. Encoding the words

The embedding lookup requires that the data fed into the network consists of integers. The simplest approach is to build a dictionary mapping {word: integer}, then convert every review, word by word, into integers before passing it to the network.

# feel free to use this import
from collections import Counter

## Build a dictionary that maps words to integers
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

## use the dict to tokenize each review in reviews_split
## store the tokenized reviews in reviews_ints
reviews_ints = []
for review in reviews_split:
    reviews_ints.append([vocab_to_int[word] for word in review.split()])

# stats about vocabulary
print('Unique words: ', len(vocab_to_int))  # should be ~74000+
print()

# print tokens in first review
print('Tokenized review: \n', reviews_ints[:1])

3. Encoding the labels

Convert the labels "positive" / "negative" into numeric values.

# 1=positive, 0=negative label conversion
labels_split = labels.split('\n')
encoded_labels = np.array([1 if label == 'positive' else 0 for label in labels_split])

# outlier review stats
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Remove the zero-length reviews.

print('Number of reviews before removing outliers: ', len(reviews_ints))

## remove any reviews/labels with zero length from the reviews_ints list

# get indices of any reviews with length 0
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]

# remove 0-length reviews and their labels
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])

print('Number of reviews after removing outliers: ', len(reviews_ints))

4. Padding sequences

Pad or truncate every review to a fixed length of 200 words (a small worked example follows the two rules below):

1. Reviews shorter than 200 words are left-padded with 0.

2. Reviews longer than 200 words keep only their first 200 words.
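For example, with a target length of 10, a tokenized review such as [117, 18, 128] would become [0, 0, 0, 0, 0, 0, 0, 117, 18, 128], while a review of 15 tokens would keep only its first 10 (the token values here are purely illustrative).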

# fix every review at 200 words
seq_len = 200

from keras import preprocessing

features = np.zeros((len(reviews_ints), seq_len), dtype=int)
# fill features row by row from reviews_ints
# note: pad_sequences pads on the left ('pre') by default; pass truncating='post'
# so that reviews longer than seq_len keep their first 200 words, matching the rule above
features = preprocessing.sequence.pad_sequences(reviews_ints, maxlen=seq_len, truncating='post')
features.shape

Or, implementing the padding by hand:

def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's
        or truncated to the input seq_length.
    '''
    # getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # for each review, grab that review and write it into the right-hand end of its row
    for i, row in enumerate(reviews_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]

    return features

# Test your implementation!
seq_length = 200
features = pad_features(reviews_ints, seq_length=seq_length)

## test statements - do not change - ##
assert len(features) == len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0]) == seq_length, "Each feature row should contain seq_length values."

# print first 10 values of the first 30 batches
print(features[:30, :10])

5. Training and Test Split

split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)
split_idx = int(len(features) * split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x) * 0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

Or, using scikit-learn (note that this variant only produces a train/test split, with no separate validation set):

from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
# split on the padded feature matrix (same length as reviews_ints, but rectangular)
for train_index, test_index in ss.split(features):
    train_x = features[train_index]
    train_y = encoded_labels[train_index]
    test_x = features[test_index]
    test_y = encoded_labels[test_index]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nTrain_Y set: \t{}".format(train_y.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels, test_size=0.2, random_state=42)

6. DataLoaders and Batching

import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

# make sure to SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size())  # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size())  # batch_size
print('Sample label: \n', sample_y)

7. Bidirectional LSTM Model

Check whether a GPU is available:

# First checking if GPU is available
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers,
                 bidirectional=True, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True,
                            bidirectional=bidirectional)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        if bidirectional:
            self.fc = nn.Linear(hidden_dim * 2, output_size)
        else:
            self.fc = nn.Linear(hidden_dim, output_size)

        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # if bidirectional:
        #     lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim*2)
        # else:
        #     lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)

        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]  # keep only the output at the last time step

        # return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes (n_layers*num_directions) x batch_size x hidden_dim,
        # initialized to zero, for the hidden state and cell state of the LSTM
        weight = next(self.parameters()).data

        number = 1
        if self.bidirectional:
            number = 2

        if train_on_gpu:
            hidden = (weight.new(self.n_layers * number, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers * number, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers * number, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers * number, batch_size, self.hidden_dim).zero_())

        return hidden

Choose whether to use a bidirectional LSTM (it performs slightly better on the test set):

# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int) + 1  # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2
bidirectional = False  # set to True for a bidirectional LSTM

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, bidirectional)

print(net)
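As a quick illustration of why the fully-connected layer takes hidden_dim * 2 inputs in the bidirectional case, the following sketch (not part of the original notebook; tensor sizes follow the hyperparameters above) compares the LSTM output shapes with and without bidirectional=True. The forward and backward hidden states are concatenated along the last dimension.

import torch
import torch.nn as nn

x = torch.randn(50, 200, 400)  # (batch_size, seq_length, embedding_dim)

uni = nn.LSTM(400, 256, num_layers=2, batch_first=True, bidirectional=False)
bi = nn.LSTM(400, 256, num_layers=2, batch_first=True, bidirectional=True)

out_uni, _ = uni(x)
out_bi, _ = bi(x)
print(out_uni.shape)  # torch.Size([50, 200, 256])  -> hidden_dim
print(out_bi.shape)   # torch.Size([50, 200, 512])  -> hidden_dim * 2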

8. Train

# loss and optimization functions
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# training params
epochs = 4  # 3-4 is approx where I noticed the validation loss stop decreasing
print_every = 100
clip = 5  # gradient clipping

# move model to GPU, if available
if train_on_gpu:
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)
    counter = 0

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if train_on_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:
                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if train_on_gpu:
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())
                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e + 1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

9. Test

# Get test data loss and accuracy
test_losses = []  # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if train_on_gpu:
        inputs, labels = inputs.cuda(), labels.cuda()

    # get predicted outputs
    output, h = net(inputs, h)

    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer

    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

# -- stats! -- #
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct / len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Model Inference

# negative test review
test_review_neg = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'

from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower()  # lowercase
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()

    # tokens
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])

    return test_ints

# test code and generate tokenized review
test_ints = tokenize_review(test_review_neg)
print(test_ints)

# test sequence padding
seq_length = 200
features = pad_features(test_ints, seq_length)
print(features)

# test conversion to tensor and pass into your model
feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())
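Note that tokenize_review above looks words up with vocab_to_int[word], so a review containing a word that never appeared in the training data would raise a KeyError. A small variant (not in the original post) that falls back to index 0 for unknown words could look like this:

def tokenize_review_safe(test_review):
    test_review = test_review.lower()
    test_text = ''.join([c for c in test_review if c not in punctuation])
    test_words = test_text.split()
    # unknown words fall back to 0, the padding index (a deliberate simplification)
    return [[vocab_to_int.get(word, 0) for word in test_words]]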

def predict(net, test_review, sequence_length=200):
    net.eval()

    # tokenize review
    test_ints = tokenize_review(test_review)

    # pad tokenized sequence
    seq_length = sequence_length
    features = pad_features(test_ints, seq_length)

    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)

    batch_size = feature_tensor.size(0)

    # initialize hidden state
    h = net.init_hidden(batch_size)

    if train_on_gpu:
        feature_tensor = feature_tensor.cuda()

    # get the output from the model
    output, h = net(feature_tensor, h)

    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))

    # print custom response
    if pred.item() == 1:
        print("Positive review detected!")
    else:
        print("Negative review detected.")

# positive test review
test_review_pos = 'This movie had the best acting and the dialogue was so good. I loved it.'

# call function (try both the negative and the positive review)
seq_length = 200  # good to use the length that was trained on
predict(net, test_review_pos, seq_length)

Complete Code

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
# from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset, TensorDataset
from collections import Counter
from sklearn.model_selection import train_test_split
from string import punctuation
import os
from tqdm import tqdm
import re
import numpy as np
import time
import pickle

if os.path.exists('data.pkl'):
    data = pickle.load(open('data.pkl', 'rb'))
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']
    vocabulary = data['vocabulary']
    seq_len = data['seq_len']
    del data
else:
    # 1. data preprocessing
    with open('./data/reviews.txt', 'r') as fp:
        reviews = fp.readlines()
    with open('./data/labels.txt', 'r') as fp:
        labels = fp.readlines()

    labels_ = []
    for label in labels:
        labels_.append(1 if label.strip() == 'positive' else 0)
    labels = np.array(labels_)
    del labels_

    counter = Counter()
    datas = []
    for review in tqdm(reviews):
        # review = re.split(r'\W+', review.strip())
        review = "".join([c for c in review.strip().lower() if c not in punctuation])  # remove punctuation
        tmp = [item for item in review.split(" ") if len(item) > 0]
        # stop-word removal could be added here (not necessary when using Word2vec)
        datas.append(tmp)
        counter.update(tmp)

    # 2. build the vocabulary
    pad = '<pad>'
    vocabulary = sorted(list(counter.keys()))
    vocabulary.insert(0, pad)  # index 0 is reserved for padding
    word_to_ix = dict(zip(vocabulary, np.arange(len(vocabulary))))
    del counter
    del reviews

    # 3. turn each review into a fixed-length sequence of indices
    #    pad/truncate every review to 200 words:
    #    - reviews shorter than 200 words are left-padded with 0
    #    - reviews longer than 200 words keep only their first 200 words
    seq_len = 200
    new_datas = []
    for data in tqdm(datas):
        if len(data) >= seq_len:
            data = data[:seq_len]
        else:
            data = [pad] * (seq_len - len(data)) + data
        new_datas.append([word_to_ix[word] for word in data])
    datas = np.array(new_datas)
    del new_datas

    # training / test split
    X_train, X_test, y_train, y_test = train_test_split(datas, labels, test_size=0.2, random_state=30)
    del datas
    del labels
    pickle.dump({'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test,
                 'vocabulary': vocabulary, 'seq_len': seq_len}, open('data.pkl', 'wb'))


# 4. build the model
class SentimentRNN(nn.Module):
    """The RNN model that will be used to perform Sentiment analysis."""

    def __init__(self, vocab_size, output_size, sent_dim, embedding_dim, hidden_dim,
                 n_layers, bidirectional=True, drop_prob=0.5, device='cpu'):
        """Initialize the model by setting up the layers."""
        super().__init__()
        self.hidden_dim = hidden_dim
        self.sent_dim = sent_dim
        self.device = device
        self.n_layers = n_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True,
                            bidirectional=bidirectional)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        if bidirectional:
            self.number = 2
            self.fc = nn.Linear(hidden_dim * 2, output_size)
        else:
            self.number = 1
            self.fc = nn.Linear(hidden_dim, output_size)

        self.sig = nn.Sigmoid()
        self.hidden = None

    def forward(self, x, hidden):
        x = x.long()
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        # if bidirectional:
        #     lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim*2)
        # else:
        #     lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)

        # sig_out = sig_out.view(batch_size, -1)
        # sig_out = sig_out[:, -1]
        sig_out = sig_out[:, -1, :].squeeze()  # take the output at the last time step as the prediction

        return sig_out, hidden

    def init_hidden(self, batch_size):
        # the axes semantics are (n_layers * num_directions, batch_size, hidden_dim)
        return (torch.zeros(self.n_layers * self.number, batch_size, self.hidden_dim, device=self.device),
                torch.zeros(self.n_layers * self.number, batch_size, self.hidden_dim, device=self.device))


class SentimentRNNV2(nn.Module):
    """The RNN model that will be used to perform Sentiment analysis."""

    def __init__(self, vocab_size, output_size, sent_dim, embedding_dim, hidden_dim,
                 n_layers, bidirectional=True, drop_prob=0.5, device='cpu'):
        """Initialize the model by setting up the layers."""
        super().__init__()
        self.hidden_dim = hidden_dim
        self.sent_dim = sent_dim
        self.device = device
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True,
                            bidirectional=bidirectional)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers (the LSTM output over all time steps is flattened)
        if bidirectional:
            self.number = 2
            self.fc = nn.Linear(sent_dim * hidden_dim * 2, output_size)
        else:
            self.number = 1
            self.fc = nn.Linear(sent_dim * hidden_dim, output_size)

        self.sig = nn.Sigmoid()
        self.hidden = None

    def forward(self, x, hidden):
        x = x.long()
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        if self.bidirectional:
            lstm_out = lstm_out.contiguous().view(-1, self.sent_dim * self.hidden_dim * 2)
        else:
            lstm_out = lstm_out.contiguous().view(-1, self.sent_dim * self.hidden_dim)

        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out).squeeze(1)

        return sig_out, hidden

    def init_hidden(self, batch_size):
        # the axes semantics are (n_layers * num_directions, batch_size, hidden_dim)
        return (torch.zeros(self.n_layers * self.number, batch_size, self.hidden_dim, device=self.device),
                torch.zeros(self.n_layers * self.number, batch_size, self.hidden_dim, device=self.device))


def train(model, optimizer, dataloader, criterion, device, epoch):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()
    total_loss = 0
    for idx, (data, label) in enumerate(dataloader):
        label = label.to(device)
        data = data.to(device)
        optimizer.zero_grad()
        # model.zero_grad()
        if idx == 0:
            hidden = model.init_hidden(data.size(0))
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        hidden = tuple([each.data for each in hidden])  # required; detaches the hidden state so no gradients flow through it
        predited_label, hidden = model(data, hidden)
        loss = criterion(predited_label, label)
        total_loss += loss.item()
        loss = loss / len(predited_label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        # total_acc += (predited_label.argmax(1) == label).sum().item()
        total_acc += (predited_label.round() == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader), total_acc / total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()
    return total_acc / total_count, total_loss / total_count


def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_acc, total_count = 0, 0
    total_loss = 0
    with torch.no_grad():
        for idx, (data, label) in enumerate(dataloader):
            label = label.to(device)
            data = data.to(device)
            if idx == 0:
                hidden = model.init_hidden(data.size(0))
            predited_label, hidden = model(data, hidden)
            loss = criterion(predited_label, label)
            total_loss += loss.item()
            # total_acc += (predited_label.argmax(1) == label).sum().item()
            total_acc += (predited_label.round() == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count, total_loss / total_count


def main():
    batch_size = 32
    train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train).float())
    test_data = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test).float())
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
    valid_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True, drop_last=True)
    # drop_last=True: discard the final incomplete batch

    EPOCHS = 10
    LR = 5  # learning rate
    vocab_size = len(vocabulary)
    sent_dim = seq_len
    embedding_dim = 64  # 400
    hidden_dim = 32     # 256
    output_size = 1
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    n_layers = 2
    bidirectional = False  # set to True for a bidirectional LSTM

    model = SentimentRNN(vocab_size, output_size, sent_dim, embedding_dim, hidden_dim,
                         n_layers, bidirectional, device=device).to(device)
    # model = SentimentRNNV2(vocab_size, output_size, sent_dim, embedding_dim, hidden_dim,
    #                        n_layers, bidirectional, device=device).to(device)
    criterion = torch.nn.BCELoss(reduction='sum')
    optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5)
    total_accu = None
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        accu_train, loss_train = train(model, optimizer, train_dataloader, criterion, device, epoch)
        accu_val, loss_val = evaluate(model, valid_dataloader, criterion, device)
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
              'valid accuracy {:8.3f} '.format(epoch, time.time() - epoch_start_time, accu_val))
        print('-' * 59)


if __name__ == "__main__":
    main()

The scikit-learn Approach

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from string import punctuation
import os
from tqdm import tqdm
import re
import numpy as np
import time
import pickle
from spacy.lang.en.stop_words import STOP_WORDS

# note: this script reuses the cache file name 'data.pkl' from the LSTM script above;
# delete that file first, since its contents (token-index sequences) differ from the
# TF-IDF features built here
if os.path.exists('data.pkl'):
    data = pickle.load(open('data.pkl', 'rb'))
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']
    del data
else:
    # 1. data preprocessing
    with open('./data/reviews.txt', 'r') as fp:
        reviews = fp.readlines()
    with open('./data/labels.txt', 'r') as fp:
        labels = fp.readlines()

    labels_ = []
    for label in labels:
        labels_.append(1 if label.strip() == 'positive' else 0)
    labels = np.array(labels_)
    del labels_

    counter = Counter()
    datas = []
    for review in tqdm(reviews):
        # review = re.split(r'\W+', review.strip())
        review = "".join([c for c in review.strip().lower() if c not in punctuation])  # remove punctuation
        tmp = [item for item in review.split(" ") if len(item) > 0 and item not in STOP_WORDS]  # drop stop words
        datas.append(tmp)
        counter.update(tmp)

    # 2. build the vocabulary: drop very frequent and very rare words
    mean_value = np.mean(list(counter.values()))
    min_value = 10      # mean_value / 2
    max_value = 20000   # mean_value * 8
    counter = {k: v for k, v in counter.items() if v < max_value and v > min_value}
    vocabulary = sorted(list(counter.keys()))

    # 3. vectorize each review
    new_datas = []
    tv = TfidfVectorizer(vocabulary=vocabulary)
    for data in tqdm(datas):
        new_datas.append(tv.fit_transform([" ".join(data)]).toarray())
    datas = np.concatenate(new_datas, 0)
    del new_datas

    # training / test split
    X_train, X_test, y_train, y_test = train_test_split(datas, labels, test_size=0.2, random_state=30)
    del datas
    del labels
    pickle.dump({'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test},
                open('data.pkl', 'wb'))

# 4. build and evaluate the model
lr = LogisticRegression()
lr.fit(X_train, y_train)
print('score: %.5f' % lr.score(X_test, y_test))
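One detail worth noting: the loop above calls tv.fit_transform on each review separately, so the TF-IDF weights are effectively computed per document rather than over the whole corpus. A more conventional scikit-learn usage fits the vectorizer once on all reviews; the sketch below (an assumption, not from the original post) shows how the vectorization step inside the else branch could be written instead, with datas still being the list of token lists built above.

tv = TfidfVectorizer(vocabulary=vocabulary)
# fit once on the whole corpus so IDF is computed across all reviews
datas = tv.fit_transform(" ".join(tokens) for tokens in datas).toarray()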
