First, we need to deploy JavaScript or server-side code on the e-commerce site to track every user interaction and send those events to the Kafka topic user_events. Below is a simple Python example that creates a Kafka producer to simulate sending user behavior logs.
from kafka import KafkaProducer
import json
import time
import random
# Initialize the Kafka producer
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)
topic_name = 'user_events'

def simulate_user_behavior():
    user_ids = ["user_001", "user_002", "user_003"]
    product_ids = ["prod_001", "prod_002", "prod_003"]
    actions = ["view", "add_to_cart", "purchase"]
    while True:
        # Generate a simulated user behavior event
        user_id = random.choice(user_ids)
        product_id = random.choice(product_ids)
        action = random.choice(actions)
        event = {
            "event_type": action,
            "user_id": user_id,
            "product_id": product_id,
            "timestamp": int(time.time() * 1000)
        }
        # Send the event to the Kafka topic
        producer.send(topic_name, value=event)
        print(f"Sent event: {event}")
        # Send one event per second
        time.sleep(1)

if __name__ == "__main__":
    try:
        simulate_user_behavior()
    except KeyboardInterrupt:
        producer.close()
Next, we write a Flink application that consumes the data from Kafka and performs the necessary preprocessing: parsing the raw events, filtering out incomplete records, and preparing the stream for the subsequent feature extraction.
import java.util.Properties;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class UserBehaviorPreprocessing {

    public static void main(String[] args) throws Exception {
        // Set up the execution environment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Configure the Kafka consumer
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "localhost:9092");
        properties.setProperty("group.id", "flink-user-behavior-group");

        // Create the Kafka consumer
        FlinkKafkaConsumer<String> kafkaSource = new FlinkKafkaConsumer<>(
                "user_events",
                new SimpleStringSchema(),
                properties
        );

        // Parse each Kafka message into a UserEvent and drop malformed or incomplete records
        DataStream<UserEvent> events = env.addSource(kafkaSource)
                .map(message -> parseJsonToUserEvent(message))
                .filter(event -> event != null
                        && !event.userId.isEmpty()
                        && !event.productId.isEmpty());

        // Print the preprocessed events to the console (a real job would continue processing)
        events.print();

        // Execute the program
        env.execute("User Behavior Preprocessing");
    }

    // Helper method: parse a JSON string into a UserEvent, returning null on malformed input
    private static UserEvent parseJsonToUserEvent(String jsonStr) {
        ObjectMapper mapper = new ObjectMapper();
        try {
            JsonNode rootNode = mapper.readTree(jsonStr);
            UserEvent event = new UserEvent();
            event.eventType = rootNode.path("event_type").asText();
            event.userId = rootNode.path("user_id").asText();
            event.productId = rootNode.path("product_id").asText();
            event.timestamp = rootNode.path("timestamp").asLong();
            return event;
        } catch (Exception e) {
            return null;
        }
    }

    // Helper class: a single user behavior event
    public static class UserEvent {
        public String eventType;
        public String userId;
        public String productId;
        public long timestamp;
    }
}
At this stage we derive a set of feature vectors from each user's behavior patterns, to be used later when training the recommendation model. For example, we can count which product categories a user viewed most often over the past week, or which price range they tend to purchase in. This logic can be implemented in Flink with window operations.
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

public class FeatureEngineering {

    public static void main(String[] args) throws Exception {
        // Set up the execution environment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Assume we already have a preprocessed user behavior stream (e.g. from the job above)
        DataStream<UserEvent> events = ...; // source omitted

        // Compute each user's viewed-category distribution over the past 7 days
        DataStream<Tuple2<String, Tuple2<String, Integer>>> categoryDistribution =
                events
                        .assignTimestampsAndWatermarks(WatermarkStrategy.<UserEvent>forMonotonousTimestamps()
                                .withTimestampAssigner((event, timestamp) -> event.timestamp))
                        .keyBy(event -> event.userId) // group by user ID
                        .window(SlidingEventTimeWindows.of(Time.days(7), Time.hours(1))) // 7-day window sliding every hour
                        .aggregate(new CategoryAggregator(), new CategoryFormatter());

        // Print the feature vectors to the console (a real job would write them to ES or another store)
        categoryDistribution.print();

        // Execute the program
        env.execute("Feature Engineering for Recommendations");
    }

    // Custom aggregate function: counts the product categories each user has viewed
    public static class CategoryAggregator implements AggregateFunction<UserEvent, CategoryAccumulator, Tuple2<String, Integer>> {

        @Override
        public CategoryAccumulator createAccumulator() {
            return new CategoryAccumulator();
        }

        @Override
        public CategoryAccumulator add(UserEvent value, CategoryAccumulator accumulator) {
            if ("view".equals(value.eventType)) {
                // Assumes the product ID encodes the category, e.g. prod_001_gadgets is in the "gadgets" category
                String categoryId = value.productId.split("_")[2];
                accumulator.categoryCounts.put(categoryId, accumulator.categoryCounts.getOrDefault(categoryId, 0) + 1);
            }
            return accumulator;
        }

        @Override
        public Tuple2<String, Integer> getResult(CategoryAccumulator accumulator) {
            // Return the most frequently viewed category and its count
            Entry<String, Integer> maxEntry = null;
            for (Entry<String, Integer> entry : accumulator.categoryCounts.entrySet()) {
                if (maxEntry == null || entry.getValue().compareTo(maxEntry.getValue()) > 0) {
                    maxEntry = entry;
                }
            }
            return maxEntry != null ? Tuple2.of(maxEntry.getKey(), maxEntry.getValue()) : null;
        }

        @Override
        public CategoryAccumulator merge(CategoryAccumulator a, CategoryAccumulator b) {
            for (Entry<String, Integer> entry : b.categoryCounts.entrySet()) {
                a.categoryCounts.put(entry.getKey(), a.categoryCounts.getOrDefault(entry.getKey(), 0) + entry.getValue());
            }
            return a;
        }
    }

    // Formats the per-window result as (userId, (category, count))
    public static class CategoryFormatter implements WindowFunction<Tuple2<String, Integer>, Tuple2<String, Tuple2<String, Integer>>, String, TimeWindow> {
        @Override
        public void apply(
                String key,
                TimeWindow window,
                Iterable<Tuple2<String, Integer>> input,
                Collector<Tuple2<String, Tuple2<String, Integer>>> out
        ) {
            for (Tuple2<String, Integer> result : input) {
                if (result != null) {
                    out.collect(Tuple2.of(key, result));
                }
            }
        }
    }

    // Helper class that holds the intermediate state
    public static class CategoryAccumulator {
        public Map<String, Integer> categoryCounts = new HashMap<>();
    }
}
Training the recommendation model itself typically involves more complex machine learning algorithms such as matrix factorization or deep neural networks. Since this part is largely independent of the Flink pipeline, it can be done offline with tools such as Spark MLlib or TensorFlow, and the trained model can then be exported as a PMML file (or another format) for use in online prediction.
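To make the offline side concrete, here is a minimal Spark MLlib sketch (separate from the Flink pipeline) that trains an ALS matrix-factorization model on implicit feedback. The input path, output path, and column names are hypothetical; ALS also expects numeric user and item columns, so string ids such as user_001 would first need to be mapped to integer indices (for example with StringIndexer).

import org.apache.spark.ml.recommendation.ALS;
import org.apache.spark.ml.recommendation.ALSModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class OfflineModelTraining {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("RecommendationModelTraining")
                .getOrCreate();

        // Hypothetical input: (userIdx, productIdx, rating) aggregated from user_events,
        // with string ids already mapped to integer indices
        Dataset<Row> ratings = spark.read().parquet("hdfs:///data/user_event_ratings");

        ALS als = new ALS()
                .setMaxIter(10)
                .setRank(32)
                .setRegParam(0.1)
                .setImplicitPrefs(true)       // counts of view/add_to_cart/purchase, not explicit ratings
                .setUserCol("userIdx")
                .setItemCol("productIdx")
                .setRatingCol("rating");

        ALSModel model = als.fit(ratings);

        // Pre-compute top-10 recommendations per user; a downstream job can map
        // the indices back to ids and write the lists to Elasticsearch
        Dataset<Row> userRecs = model.recommendForAllUsers(10);
        userRecs.write().mode("overwrite").parquet("hdfs:///models/user_recommendations");

        spark.stop();
    }
}

In this sketch the top-N recommendation lists are pre-computed in batch, which pairs naturally with the Elasticsearch storage step described next.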
The final step is to store the generated recommendation lists in Elasticsearch so they can be retrieved quickly and shown to users. Here we use the Java client API to index the documents.
import java.io.IOException;
import java.util.List;

import org.apache.http.HttpHost;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;

public class SaveRecommendationsToES {

    private static final RestHighLevelClient client = new RestHighLevelClient(
            RestClient.builder(new HttpHost("localhost", 9200, "http")));

    // Index (or overwrite) the recommendation list for one user, keyed by user id
    public static void saveRecommendation(String userId, List<String> recommendedProducts) throws IOException {
        IndexRequest request = new IndexRequest("recommendations")
                .id(userId)
                .source(XContentType.JSON, "products", recommendedProducts);
        client.index(request, RequestOptions.DEFAULT);
    }

    public static void closeClient() throws IOException {
        client.close();
    }
}
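For the read path, the following is a small companion sketch (an assumption, not part of the original pipeline) showing how a backend service could fetch a stored list back from the same recommendations index; a route such as the /api/recommendations/{userId} endpoint used by the frontend snippet below could delegate to this lookup.

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.http.HttpHost;
import org.elasticsearch.action.get.GetRequest;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;

public class FetchRecommendationsFromES {

    private static final RestHighLevelClient client = new RestHighLevelClient(
            RestClient.builder(new HttpHost("localhost", 9200, "http")));

    // Look up the pre-computed recommendation list for one user by document id
    @SuppressWarnings("unchecked")
    public static List<String> fetchRecommendations(String userId) throws IOException {
        GetRequest request = new GetRequest("recommendations", userId);
        GetResponse response = client.get(request, RequestOptions.DEFAULT);
        if (!response.isExists()) {
            return Collections.emptyList();   // no recommendations stored yet for this user
        }
        Map<String, Object> source = response.getSourceAsMap();
        return (List<String>) source.get("products");
    }

    public static void closeClient() throws IOException {
        client.close();
    }
}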
When a user visits a product detail page or another relevant page, the frontend can issue an AJAX request to a backend API that queries Elasticsearch, then dynamically render the personalized recommendations on the page. Below is a JavaScript snippet:
function loadPersonalizedRecommendations(userId) {
    fetch(`/api/recommendations/${userId}`)
        .then(response => response.json())
        .then(data => {
            const recommendationsContainer = document.getElementById('recommendations');
            data.products.forEach(productId => {
                // Dynamically create an HTML element and append it to the container
                const productElement = document.createElement('div');
                productElement.className = 'recommended-product';
                // Link target is a placeholder; point it at the site's real product detail URL
                productElement.innerHTML = `<a href="/product/${productId}">${productId}</a>`;
                recommendationsContainer.appendChild(productElement);
            });
        })
        .catch(error => console.error('Error loading personalized recommendations:', error));
}