Goal: text classification and entity recognition with a bag-of-words model
// OpenNLP document classifier (bag-of-words model)
import opennlp.tools.doccat.*;
import opennlp.tools.util.*;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;

public class DocumentClassifier {

    // Train the model (training data must be prepared in advance)
    public static void trainModel(String trainingDataPath, String modelPath) throws Exception {
        // 1. Load the training data (one sample per line: category\ttext)
        ObjectStream<String> lineStream = new PlainTextByLineStream(
                new MarkableFileInputStreamFactory(new File(trainingDataPath)), "UTF-8");
        ObjectStream<DocumentSample> sampleStream = new DocumentSampleStream(lineStream);

        // 2. Define the training parameters
        TrainingParameters params = TrainingParameters.defaultParams();
        params.put(TrainingParameters.CUTOFF_PARAM, "1"); // minimum feature frequency

        // 3. Train the model
        DoccatModel model = DocumentCategorizerME.train(
                "en", sampleStream, params, new DoccatFactory());

        // 4. Save the model
        try (FileOutputStream fos = new FileOutputStream(modelPath)) {
            model.serialize(fos);
        }
    }

    // Classify a piece of text
    public static String classifyText(String text, String modelPath) throws Exception {
        DoccatModel model = new DoccatModel(new FileInputStream(modelPath));
        DocumentCategorizerME categorizer = new DocumentCategorizerME(model);
        // Return the category with the highest probability
        double[] probabilities = categorizer.categorize(text.split(" "));
        return categorizer.getBestCategory(probabilities);
    }

    public static void main(String[] args) throws Exception {
        // Train the model (assumes training data in training.txt)
        trainModel("training.txt", "model.bin");
        // Classify new text
        String result = classifyText("Java NLP is powerful!", "model.bin");
        System.out.println("Category: " + result); // e.g. "technology"
    }
}
Notes: the training data uses the category\ttext format, e.g. technology\tJava is a powerful language. CUTOFF_PARAM filters low-frequency terms, which improves generalization. serialize writes the model to a .bin file so it can be reused.
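As a quick check after training, the following minimal sketch (assuming the model.bin produced above) prints the probability of every category rather than only the best one:

import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import java.io.FileInputStream;

public class CategoryScores {
    public static void main(String[] args) throws Exception {
        // Assumes model.bin was produced by DocumentClassifier.trainModel above.
        DoccatModel model = new DoccatModel(new FileInputStream("model.bin"));
        DocumentCategorizerME categorizer = new DocumentCategorizerME(model);
        double[] scores = categorizer.categorize("Java NLP is powerful".split(" "));
        for (int i = 0; i < categorizer.getNumberOfCategories(); i++) {
            System.out.println(categorizer.getCategory(i) + " -> " + scores[i]);
        }
    }
}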
Goal: sentiment analysis with an RNN
// LSTM sentiment analysis with Deeplearning4j
import org.deeplearning4j.nn.conf.*;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.multilayer.*;
import org.nd4j.linalg.activations.*;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.api.iterator.*;
import org.nd4j.linalg.lossfunctions.*;

public class SentimentLSTM {

    // Define the network structure
    public static MultiLayerNetwork configureNetwork() {
        int inputSize = 100;   // word-vector dimension
        int hiddenSize = 128;
        int outputSize = 2;    // two classes: positive / negative

        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .list()
                .layer(0, new LSTM.Builder()
                        .nIn(inputSize)
                        .nOut(hiddenSize)
                        .activation(Activation.TANH)
                        .build())
                .layer(1, new RnnOutputLayer.Builder()
                        .nIn(hiddenSize)
                        .nOut(outputSize)
                        .activation(Activation.SOFTMAX)
                        .lossFunction(LossFunctions.LossFunction.MCXENT) // multi-class cross entropy, pairs with softmax
                        .build())
                .build();
        return new MultiLayerNetwork(conf);
    }

    // Training and prediction (data preprocessing is required)
    public static void trainAndPredict(DataSetIterator trainData, int epochs) {
        MultiLayerNetwork model = configureNetwork();
        model.init();
        for (int i = 0; i < epochs; i++) {
            model.fit(trainData);
        }
        // Prediction example
        INDArray input = ...; // preprocessed input tensor
        INDArray output = model.output(input);
        System.out.println("Predicted sentiment: " + output.argMax(1).getInt(0)); // 0 or 1
    }
}
Notes: nIn and nOut define the input and output dimensions; the TANH activation handles non-linearity. DistributedTraining (covered later) adds multi-GPU / distributed acceleration.
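For clarity on what the LSTM above actually consumes: DL4J recurrent layers expect input shaped [miniBatch, nIn, timeSeriesLength]. A minimal sketch with a random tensor standing in for real word vectors:

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import java.util.Arrays;

public class LstmInputShapeDemo {
    public static void main(String[] args) {
        int miniBatch = 1, inputSize = 100, timeSteps = 10; // inputSize matches nIn above
        // Random values stand in for a preprocessed, vectorized sentence.
        INDArray fakeSentence = Nd4j.rand(new int[]{miniBatch, inputSize, timeSteps});
        System.out.println(Arrays.toString(fakeSentence.shape()));
    }
}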
Goal: named entity recognition and dependency parsing
// Deep semantic analysis with Stanford CoreNLP
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import java.util.Properties;

public class CoreNLPSemanticAnalysis {

    public static void analyzeText(String text) {
        // 1. Configure the pipeline (NER and sentiment; the sentiment annotator requires parse)
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,sentiment");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // 2. Run the analysis
        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        // 3. Extract entities
        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String ner = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
                if (!"O".equals(ner)) { // skip tokens that are not part of an entity
                    System.out.println("Entity: " + token.word() + " type: " + ner);
                }
            }
            // 4. Sentence-level sentiment
            String sentiment = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
            System.out.println("Sentence sentiment: " + sentiment); // e.g. "Very positive"
        }
    }

    public static void main(String[] args) {
        analyzeText("Java NLP is powerful, but TensorFlow is better.");
    }
}
Notes: the annotators property enables ner (entity recognition) and sentiment (sentiment analysis). SentimentCoreAnnotations provides sentence-level sentiment labels. A language property in props (e.g. "en") enables support for other languages.
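For example, one common pattern for Chinese (assuming the Stanford CoreNLP Chinese models jar is on the classpath) is to load the bundled properties file:

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

public class ChinesePipelineConfig {
    public static StanfordCoreNLP getChinesePipeline() throws Exception {
        Properties props = new Properties();
        // Loads the annotator and model settings shipped in the Chinese models jar.
        props.load(IOUtils.readerFromString("StanfordCoreNLP-chinese.properties"));
        return new StanfordCoreNLP(props);
    }
}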
// Hybrid model: OpenNLP preprocessing + DL4J deep learning
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.nd4j.linalg.api.ndarray.INDArray;
import java.io.FileInputStream;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class HybridNLP {
    public static void main(String[] args) throws Exception {
        // 1. OpenNLP tokenization
        TokenizerModel tokenizerModel = new TokenizerModel(
                new FileInputStream("en-token.bin"));
        TokenizerME tokenizer = new TokenizerME(tokenizerModel);
        String text = "Java NLP is powerful!";
        String[] tokens = tokenizer.tokenize(text);

        // 2. Remove stop words (e.g. "is", "the") with a simple filter
        Set<String> stopWords = new HashSet<>(Arrays.asList("is", "the"));
        String[] filtered = Arrays.stream(tokens)
                .filter(t -> !stopWords.contains(t.toLowerCase()))
                .toArray(String[]::new);

        // 3. Build word vectors (e.g. with Word2Vec)
        INDArray wordVectors = ...; // pretrained word vectors required
        // 4. Feed the vectors into the DL4J model for classification
        MultiLayerNetwork model = SentimentLSTM.configureNetwork(); // reuse the LSTM configuration above
        model.init();
        INDArray output = model.output(wordVectors);
        System.out.println("Prediction: " + output.argMax().getInt(0));
    }
}
Notes: Word2Vec or GloVe produce fixed-dimension word vectors.
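A minimal sketch of loading pretrained Word2Vec vectors with DL4J's WordVectorSerializer and looking up one token (the file name vectors.bin is a placeholder):

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.nd4j.linalg.api.ndarray.INDArray;
import java.io.File;

public class WordVectorLookup {
    public static void main(String[] args) {
        // "vectors.bin" stands in for whatever pretrained Word2Vec binary you use.
        Word2Vec vectors = WordVectorSerializer.readWord2VecModel(new File("vectors.bin"));
        INDArray javaVector = vectors.getWordVectorMatrix("java");
        System.out.println("Vector length: " + javaVector.length());
    }
}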
// JRAW + OpenNLP: real-time crawling and analysis
import com.reddit4j.Reddit;
import com.reddit4j.http.RedditHttpClient;

public class RedditSentimentAnalysis {
    public static void main(String[] args) {
        // 1. Initialize the Reddit client
        Reddit reddit = Reddit.builder()
                .userAgent("NLP-Analysis-Bot")
                .build();
        // 2. Fetch the hot posts of a subreddit (e.g. r/Java)
        reddit.subreddit("Java").hot().forEach(post -> {
            String text = post.getTitle() + " " + post.getSelftext();
            try {
                // 3. Sentiment analysis (reuses the OpenNLP classifier above)
                String sentiment = DocumentClassifier.classifyText(text, "sentiment_model.bin");
                // 4. Store the result in MongoDB
                MongoDBHelper.insertAnalysisResult(post.getId(), sentiment);
            } catch (Exception e) {
                e.printStackTrace();
            }
        });
    }
}
Notes: RedditHttpClient supports paginated data retrieval. The MongoDBHelper class stores the analysis results.
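MongoDBHelper is not defined in the article; a minimal sketch using the official MongoDB Java driver (database, collection, and field names are assumptions):

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoCollection;
import org.bson.Document;

public class MongoDBHelper {
    // Connection string, database, and collection are placeholders.
    private static final MongoClient client = MongoClients.create("mongodb://localhost:27017");
    private static final MongoCollection<Document> results =
            client.getDatabase("nlp").getCollection("sentiment_results");

    public static void insertAnalysisResult(String postId, String sentiment) {
        results.insertOne(new Document("postId", postId).append("sentiment", sentiment));
    }
}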
// Deeplearning4j distributed training configuration
public class DistributedTraining {
    public static void main(String[] args) throws Exception {
        // 1. Configure the network and attach a score listener
        MultiLayerNetwork model = SentimentLSTM.configureNetwork();
        model.setListeners(new ScoreIterationListener(1));

        // 2. Distributed training (requires a Spark environment;
        //    sparkContext and dataRDD are assumed to be set up elsewhere, see the sketch below)
        ParameterAveragingTrainingMaster trainingMaster =
                new ParameterAveragingTrainingMaster.Builder(32)
                        .batchSizePerWorker(32)
                        .averagingFrequency(5)
                        .build();
        SparkDl4jMultiLayer sparkModel = new SparkDl4jMultiLayer(sparkContext, model, trainingMaster);
        sparkModel.fit(dataRDD); // dataRDD is the distributed training data (JavaRDD<DataSet>)

        // 3. Persist the model (ModelSerializer writes to a local path;
        //    for HDFS, write through an HDFS OutputStream instead)
        ModelSerializer.writeModel(sparkModel.getNetwork(), "path/to/model.zip", true);
    }
}
Notes: SparkDl4jMultiLayer, together with a TrainingMaster such as ParameterAveragingTrainingMaster, runs the training on Spark.
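The sparkContext and dataRDD used above are assumed to exist already; one local-mode sketch of how they might be created (names and settings are placeholders):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.nd4j.linalg.dataset.DataSet;
import java.util.ArrayList;
import java.util.List;

public class SparkSetup {
    public static void main(String[] args) {
        // Local Spark context; on a cluster the master URL normally comes from spark-submit.
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("nlp-distributed-training");
        JavaSparkContext sparkContext = new JavaSparkContext(conf);
        // Preprocessed mini-batches would be added to this list before parallelizing.
        List<DataSet> trainingData = new ArrayList<>();
        JavaRDD<DataSet> dataRDD = sparkContext.parallelize(trainingData);
        System.out.println("Partitions: " + dataRDD.getNumPartitions());
    }
}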
// Spring Boot REST API example
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;

@RestController
public class NLPController {

    @Autowired
    private MultiLayerNetwork model; // trained DL4J model injected as a bean

    @PostMapping("/analyze")
    public ResponseEntity<String> analyzeText(@RequestBody String text) {
        // 1. Preprocessing (tokenization and vectorization, see the TextVectorizer sketch below)
        INDArray input = preprocess(text);
        // 2. Model inference
        INDArray output = model.output(input);
        int result = output.argMax().getInt(0);
        // 3. Return the result as JSON
        return ResponseEntity.ok("{\"result\": " + result + "}");
    }
}
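The controller calls preprocess, which is not shown; a minimal, hypothetical sketch that averages pretrained word vectors into a single feature vector (the 100-dimension assumption matches the earlier LSTM example):

import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;

public class TextVectorizer {
    private final WordVectors vectors; // e.g. loaded once at startup with WordVectorSerializer
    private final int dim;             // word-vector dimension, 100 in the examples above

    public TextVectorizer(WordVectors vectors, int dim) {
        this.vectors = vectors;
        this.dim = dim;
    }

    // Average the vectors of known tokens; unknown tokens are skipped.
    public INDArray preprocess(String text) {
        double[] sum = new double[dim];
        int hits = 0;
        for (String token : text.toLowerCase().split("\\s+")) {
            if (vectors.hasWord(token)) {
                double[] v = vectors.getWordVector(token);
                for (int i = 0; i < dim; i++) sum[i] += v[i];
                hits++;
            }
        }
        if (hits > 0) for (int i = 0; i < dim; i++) sum[i] /= hits;
        return Nd4j.create(sum, new int[]{1, dim});
    }
}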
// Docker deployment configuration (Dockerfile)
FROM openjdk:8-jdk-alpine
COPY target/nlp-service.jar /app/nlp-service.jar
EXPOSE 8080
CMD ["java", "-jar", "/app/nlp-service.jar"]
Notes: the containerized service can be deployed to a cloud environment with kubectl for elastic scaling.
// Data encryption for NLP models (AES-256)
import javax.crypto.Cipher;
import javax.crypto.spec.IvParameterSpec;
import javax.crypto.spec.SecretKeySpec;
import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class SecureNLP {
    // 32-byte key -> AES-256 (hard-coded here only for the demo; never do this in production)
    private static final String KEY = "0123456789abcdef0123456789abcdef";
    // 16-byte initialization vector (should be random per message in practice)
    private static final String IV = "fedcba9876543210";

    public static String encrypt(String text) throws Exception {
        Cipher cipher = Cipher.getInstance("AES/CBC/PKCS5Padding");
        SecretKeySpec keySpec = new SecretKeySpec(KEY.getBytes(StandardCharsets.UTF_8), "AES");
        IvParameterSpec ivSpec = new IvParameterSpec(IV.getBytes(StandardCharsets.UTF_8));
        cipher.init(Cipher.ENCRYPT_MODE, keySpec, ivSpec);
        return Base64.getEncoder().encodeToString(
                cipher.doFinal(text.getBytes(StandardCharsets.UTF_8)));
    }

    public static String decrypt(String encrypted) throws Exception {
        Cipher cipher = Cipher.getInstance("AES/CBC/PKCS5Padding");
        SecretKeySpec keySpec = new SecretKeySpec(KEY.getBytes(StandardCharsets.UTF_8), "AES");
        IvParameterSpec ivSpec = new IvParameterSpec(IV.getBytes(StandardCharsets.UTF_8));
        cipher.init(Cipher.DECRYPT_MODE, keySpec, ivSpec);
        return new String(
                cipher.doFinal(Base64.getDecoder().decode(encrypted)),
                StandardCharsets.UTF_8);
    }
}
Notes: Cipher with AES/CBC/PKCS5Padding encrypts the text, and Base64 encodes the ciphertext so it can be stored or transmitted as plain text. In production, keys should be generated and managed securely, with a fresh random IV per message.
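As a complement to the hard-coded demo key and IV, a minimal sketch of generating a proper AES-256 key and a random IV with the standard JCA APIs:

import javax.crypto.KeyGenerator;
import javax.crypto.SecretKey;
import java.security.SecureRandom;
import java.util.Base64;

public class KeyMaterialDemo {
    public static void main(String[] args) throws Exception {
        // Generate a 256-bit AES key instead of hard-coding one.
        KeyGenerator keyGen = KeyGenerator.getInstance("AES");
        keyGen.init(256);
        SecretKey key = keyGen.generateKey();

        // A fresh random IV should be used for every message and stored alongside the ciphertext.
        byte[] iv = new byte[16];
        new SecureRandom().nextBytes(iv);

        System.out.println("Key (Base64): " + Base64.getEncoder().encodeToString(key.getEncoded()));
        System.out.println("IV  (Base64): " + Base64.getEncoder().encodeToString(iv));
    }
}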
// Multimodal analysis combining OpenCV and NLP
import org.opencv.core.Mat;
import org.opencv.imgcodecs.Imgcodecs;

public class MultimodalAnalysis {
    public static void main(String[] args) {
        // 1. Image analysis (OpenCV)
        Mat image = Imgcodecs.imread("product.jpg");
        String description = OpenCVHelper.describeImage(image);
        // 2. Text analysis (Stanford CoreNLP); analyzeText is assumed to return a sentiment label
        String text = "User review: this camera's image quality is excellent!";
        String sentiment = analyzeText(text);
        // 3. Fuse both signals for the final decision
        if (sentiment.equals("positive") && description.contains("high resolution")) {
            System.out.println("Recommend this product!");
        }
    }
}
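OpenCVHelper.describeImage is referenced but never defined; a deliberately simple, hypothetical sketch that only inspects image dimensions (a real system would use an image-captioning or classification model):

import org.opencv.core.Mat;

public class OpenCVHelper {
    // Hypothetical helper used by MultimodalAnalysis above.
    public static String describeImage(Mat image) {
        // Treat anything at or above 1920x1080 as "high resolution" for this demo.
        return (image.width() >= 1920 && image.height() >= 1080)
                ? "high resolution image"
                : "standard resolution image";
    }
}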
Notes: OpenCVHelper.describeImage and analyzeText are placeholders for the image- and text-analysis steps; the recommendation is made only when both signals agree.
"Language is not a barrier; code is the bridge!" (墨夶)
Through this article, you have learned:
// Global configuration: loading Stanford CoreNLP's deep-learning sentiment model
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

public class CoreNLPConfig {
    public static StanfordCoreNLP getDeepLearningPipeline() {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, parse, sentiment");
        // Path of the neural sentiment model shipped with the CoreNLP models jar
        props.setProperty("sentiment.model", "edu/stanford/nlp/models/sentiment/sentiment.ser.gz");
        return new StanfordCoreNLP(props);
    }
}
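A short usage sketch for the configuration above; the pipeline is expensive to construct, so build it once and reuse it (the sample sentence is only an illustration):

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

public class CoreNLPConfigDemo {
    public static void main(String[] args) {
        StanfordCoreNLP pipeline = CoreNLPConfig.getDeepLearningPipeline();
        Annotation doc = new Annotation("Java NLP is powerful!");
        pipeline.annotate(doc);
        for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
            System.out.println(sentence.get(SentimentCoreAnnotations.SentimentClass.class));
        }
    }
}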