以下是 MATLAB 自然语言处理(NLP)的入门教程,涵盖基础概念与核心功能,并附带可运行的示例代码。
MATLAB下载安装教程:https://blog.csdn.net/tyatyatya/article/details/147879353
MATLAB下载地址链接:https://pan.quark.cn/s/364584a880f7
MATLAB 通过 Text Analytics Toolbox(并配合 Statistics and Machine Learning Toolbox 与 Deep Learning Toolbox)提供了完整的 NLP 工具链,支持文本处理、特征提取、分类和生成等任务:
% Example: text preprocessing and feature extraction.
% Requires Text Analytics Toolbox.

% Load the raw text and wrap it in a table with a single 'Text' variable.
% NOTE(review): assumes text_data.txt holds one document per line so that
% importdata returns one entry per document - confirm against the file.
documents = importdata('text_data.txt');
tbl = table(documents, 'VariableNames', {'Text'});

% Tokenize first: bagOfWords and the cleaning functions operate on
% tokenizedDocument arrays, not on raw text.
tokens = tokenizedDocument(tbl.Text);

% Bag-of-words model of the unprocessed tokens
bag = bagOfWords(tokens);

% Remove stop words (done on the documents, then re-counted into a bag)
cleanTokens = removeStopWords(tokens);
cleanBag = bagOfWords(cleanTokens);

% Stemming (normalizeWords is the toolbox's stemming/lemmatization function)
stemmedTokens = normalizeWords(cleanTokens, 'Style', 'stem');
stemmedBag = bagOfWords(stemmedTokens);

% TF-IDF features. The result is deliberately NOT named "tfidf": a variable
% with that name would shadow the tfidf function for every later call in the
% same workspace.
tfidfMatrix = tfidf(bag);

% Load pretrained 100-dimensional GloVe word vectors.
% NOTE(review): the GloVe text file must be downloaded separately and be on
% the MATLAB path - confirm the exact file name used in your environment.
embedding = readWordEmbedding('glove.6B.100d.txt');

% Vectorize each document as the mean of its word vectors.
% Tokens are lower-cased for the lookup; word2vec returns NaN rows for
% out-of-vocabulary words, which 'omitnan' skips. A document with no known
% words yields a NaN row.
lookupTokens = lower(tokens);
numDocs = numel(lookupTokens);
docVectors = zeros(numDocs, embedding.Dimension);
for k = 1:numDocs
    words = string(lookupTokens(k));
    wordVectors = word2vec(embedding, words);
    docVectors(k, :) = mean(wordVectors, 1, 'omitnan');
end
% Example: sentiment classification of IMDB reviews with TF-IDF features
% and a linear SVM.
% Requires Text Analytics Toolbox and Statistics and Machine Learning Toolbox.

% Load the IMDB review data set.
% NOTE(review): assumes the CSV has 'Review' and 'Sentiment' columns, as the
% original snippet did - confirm against the file.
tbl = readtable('imdb_reviews.csv', 'TextType', 'string');
labels = categorical(tbl.Sentiment);  % categorical so that == and predict agree on type

% Tokenize the reviews (bagOfWords requires tokenizedDocument input)
documents = tokenizedDocument(tbl.Review);

% Split into training (80%) and test (20%) sets
cv = cvpartition(height(tbl), 'HoldOut', 0.2);
idxTrain = training(cv);
idxTest = test(cv);

% Build the bag-of-words model from the training documents only; a
% bagOfWords object cannot be indexed per document, so the split is applied
% to the documents instead.
bag = bagOfWords(documents(idxTrain));

% TF-IDF features: the test documents are encoded with the vocabulary and
% IDF weights learned from the training bag.
XTrain = tfidf(bag);
XTest = tfidf(bag, documents(idxTest));

% Train the classifier: ECOC with linear SVM learners accepts the sparse
% TF-IDF matrix and works for two or more classes.
classifier = fitcecoc(XTrain, labels(idxTrain), 'Learners', 'linear');

% Evaluate performance on the held-out set
YPred = predict(classifier, XTest);
accuracy = mean(YPred == labels(idxTest));
fprintf('分类准确率: %.2f%%\n', accuracy*100);
% Example: tweet sentiment classification with word vectors and an LSTM.
% Requires Text Analytics Toolbox and Deep Learning Toolbox.

% Load the data.
% NOTE(review): assumes the CSV has 'Text' and 'Sentiment' columns, as the
% original snippet did - confirm against the file.
tbl = readtable('twitter_sentiment.csv', 'TextType', 'string');
labels = categorical(tbl.Sentiment);  % trainNetwork needs categorical responses
numClasses = numel(categories(labels));

% Hold out 20% for validation. The split is computed here, for THIS table,
% instead of reusing index vectors left over from another data set.
cv = cvpartition(height(tbl), 'HoldOut', 0.2);
idxTrain = training(cv);
idxTest = test(cv);

% Tokenize and train a 100-dimensional word embedding on the corpus
embeddingDimension = 100;
tokenizedData = tokenizedDocument(tbl.Text);
embedding = trainWordEmbedding(tokenizedData, ...
    'Dimension', embeddingDimension, ...
    'Verbose', false);

% Prepare the sequence data: each document becomes an
% embeddingDimension-by-maxSequenceLength matrix of word vectors,
% padded or truncated to a fixed length.
maxSequenceLength = 100;
sequences = doc2sequence(embedding, tokenizedData, 'Length', maxSequenceLength);

% Build the LSTM network. The input size is the feature dimension of each
% time step (the word-vector length), not the sequence length. No embedding
% layer is needed because doc2sequence already maps words to vectors.
layers = [
    sequenceInputLayer(embeddingDimension)
    lstmLayer(64, 'OutputMode', 'last')
    fullyConnectedLayer(numClasses)   % one output per sentiment class
    softmaxLayer
    classificationLayer
];

% Train the network
options = trainingOptions('adam', ...
    'MaxEpochs', 10, ...
    'MiniBatchSize', 64, ...
    'ValidationData', {sequences(idxTest), labels(idxTest)}, ...
    'Verbose', false, ...
    'Plots', 'training-progress');
net = trainNetwork(sequences(idxTrain), labels(idxTrain), layers, options);
% Example: 5-fold cross-validated tweet sentiment classification with
% TF-IDF features and linear SVMs, plus a confusion-matrix plot.
% Requires Text Analytics Toolbox and Statistics and Machine Learning Toolbox.

% Load the Twitter sentiment data set.
% NOTE(review): assumes the CSV has 'Text' and 'Sentiment' columns, as the
% original snippet did - confirm against the file.
tbl = readtable('twitter_sentiment.csv', 'TextType', 'string');
labels = categorical(tbl.Sentiment);  % categorical so that == works on the labels

% Preprocessing: tokenize, drop stop words, stem
documents = tokenizedDocument(tbl.Text);
documents = removeStopWords(documents);
documents = normalizeWords(documents, 'Style', 'stem');

% Bag-of-words model
bag = bagOfWords(documents);

% Feature extraction. The result is not named "tfidf" so that the tfidf
% function is not shadowed by a variable.
tfidfMatrix = tfidf(bag);

% Partition the data for 5-fold cross-validation
cv = cvpartition(height(tbl), 'KFold', 5);

% Train and evaluate one model per fold
accuracies = zeros(cv.NumTestSets, 1);
YPredAll = labels;  % placeholder; every entry is overwritten by its test-fold prediction
for i = 1:cv.NumTestSets
    idxTrain = training(cv, i);
    idxTest = test(cv, i);
    % Train the SVM classifier. fitcsvm handles only one or two classes, so
    % an ECOC model with linear SVM learners is used; it also accepts the
    % sparse TF-IDF matrix.
    classifier = fitcecoc(tfidfMatrix(idxTrain,:), labels(idxTrain), ...
        'Learners', 'linear');
    % Predict on the held-out fold
    YPred = predict(classifier, tfidfMatrix(idxTest,:));
    % Fold accuracy
    accuracies(i) = mean(YPred == labels(idxTest));
    % Keep the out-of-fold predictions for the overall confusion matrix
    YPredAll(idxTest) = YPred;
end

% Report the mean accuracy
fprintf('交叉验证平均准确率: %.2f%%\n', mean(accuracies)*100);

% Visualize the confusion matrix over all out-of-fold predictions
% (not just the last fold)
cm = confusionmat(labels, YPredAll);
figure
imagesc(cm)
colorbar
% Pin one tick per class so the labels line up with the matrix cells.
% NOTE(review): the label text assumes the category order of `labels` is
% negative, neutral, positive - confirm against categories(labels).
xticks(1:size(cm, 2))
yticks(1:size(cm, 1))
xticklabels({'负面', '中性', '正面'})
yticklabels({'负面', '中性', '正面'})
title('情感分析混淆矩阵')
% Example: named entity recognition on a single sentence.
% Requires Text Analytics Toolbox; addEntityDetails uses the toolbox's
% built-in entity model.
% NOTE(review): some releases need an additional support package for
% addEntityDetails - confirm for the installed version.

% Sample text
text = "Apple is looking at buying U.K. startup for $1 billion";

% Tokenize and tag every token with an entity label
documents = tokenizedDocument(text);
documents = addEntityDetails(documents);

% Recognize entities: keep only the tokens tagged as an entity
details = tokenDetails(documents);
entities = details(details.Entity ~= "non-entity", :);

% Display the result
disp(entities);
% Example: LDA topic modelling of news articles.
% Requires Text Analytics Toolbox.

% Load the news data set.
% NOTE(review): assumes the CSV has a 'Text' column, as the original snippet
% did - confirm against the file.
tbl = readtable('news_articles.csv', 'TextType', 'string');

% Bag-of-words model (bagOfWords requires tokenizedDocument input)
documents = tokenizedDocument(tbl.Text);
bag = bagOfWords(documents);

% Fit the LDA model
numTopics = 5;
ldaModel = fitlda(bag, numTopics);

% Show the top keywords of each topic. topkwords returns a table with a
% 'Word' column for the requested topic; the result is kept in a local
% variable rather than reusing a function-like name.
numKeywords = 10;  % keywords per topic
for i = 1:numTopics
    topWords = topkwords(ldaModel, numKeywords, i);
    fprintf('主题 %d: %s\n', i, join(topWords.Word, ', '));
end