本次使用的数据集来源于UCI机器学习库。
Auto MPG数据集是一个经典的数据集,本次将使用它来构建预测70年代末到80年代初汽车燃油效率的模型。
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 64) 640
_________________________________________________________________
dense_1 (Dense) (None, 64) 4160
_________________________________________________________________
dense_2 (Dense) (None, 1) 65
=================================================================
Total params: 4,865
Trainable params: 4,865
Non-trainable params: 0
_________________________________________________________________
EPOCHS = 1000
history = md.fit(normed_train_data, train_labels, epochs=EPOCHS, validation_split=0.2, verbose=0)
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2020/1/9 8:27
# @Author: Martin
# @File: Regression.py
# @Software:PyCharm
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow import keras
# Download the Auto MPG dataset from the UCI Machine Learning Repository.
# keras.utils.get_file caches the file locally (~/.keras/datasets by default)
# and returns the local path, so repeated runs do not re-download.
data_path = keras.utils.get_file(
    "auto-mpg.data",
    "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
# Load the dataset. The raw file is space-separated with no header row;
# '?' marks missing values (Horsepower column), and the trailing car-name
# field is tab-prefixed, so comment='\t' drops it during parsing.
column_names = [
    'MPG',
    'Cylinders',
    'Displacement',
    'Horsepower',
    'Weight',
    'Acceleration',
    'Model Year',
    'Origin']
raw_dataset = pd.read_csv(
    data_path,
    names=column_names,      # file has no header; supply column names explicitly
    na_values='?',           # '?' denotes a missing value
    comment='\t',            # skip the tab-prefixed car-name field
    sep=' ',
    skipinitialspace=True)
# Work on a copy so the raw parsed data stays untouched.
dataset = raw_dataset.copy()
# Data cleaning: drop the rows that contain missing values.
dataset = dataset.dropna()
# One-hot encode the categorical 'Origin' column (1=USA, 2=Europe, 3=Japan)
# into three float indicator columns.
origin = dataset.pop('Origin')
for code, country in ((1, 'USA'), (2, 'Europe'), (3, 'Japan')):
    dataset[country] = (origin == code) * 1.0
# Split into training (80%) and test (20%) sets; random_state=0 makes the
# split reproducible across runs.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
# Data inspection: pairwise joint distributions (with KDE diagonals) for a
# few features, as a quick visual sanity check of the training data.
sns.pairplot(train_dataset[["MPG", "Cylinders",
                            "Displacement", "Weight"]], diag_kind="kde")
plt.show()
# Per-feature summary statistics (count/mean/std/...) computed on the
# training set only; MPG is removed because it is the regression target,
# not an input feature. Transposing puts one feature per row so that
# train_stats['mean'] / train_stats['std'] index by feature name.
train_stats = train_dataset.describe()
train_stats.pop('MPG')
train_stats = train_stats.transpose()
# Separate the target label (MPG) from the feature columns.
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')
# 数据规范化
def norm(x, stats=None):
    """Standardize features to zero mean and unit variance, column-wise.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table to normalize; column names must match the index
        of *stats*.
    stats : pandas.DataFrame, optional
        Per-feature statistics with 'mean' and 'std' columns, as produced
        by ``DataFrame.describe().transpose()``. Defaults to the
        module-level ``train_stats`` so existing call sites are unchanged;
        passing it explicitly makes the function reusable and testable.

    Returns
    -------
    pandas.DataFrame
        ``(x - mean) / std`` evaluated per column via index alignment.
    """
    if stats is None:
        stats = train_stats
    return (x - stats['mean']) / stats['std']
# Normalize both splits using statistics computed from the TRAINING data
# only, so no information from the test set leaks into preprocessing.
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)
# Build the model
def build_model():
    """Create and compile a small fully-connected regression network.

    Architecture: two hidden Dense(64, relu) layers followed by a single
    linear output unit predicting MPG. Compiled with RMSprop (lr=0.001),
    MSE loss, and MAE/MSE metrics.

    Returns the compiled keras model.
    """
    n_features = len(train_dataset.keys())
    model = keras.Sequential()
    model.add(keras.layers.Dense(64, activation='relu',
                                 input_shape=[n_features]))
    model.add(keras.layers.Dense(64, activation='relu'))
    model.add(keras.layers.Dense(1))
    model.compile(loss='mse',
                  optimizer=tf.keras.optimizers.RMSprop(0.001),
                  metrics=['mae', 'mse'])
    return model
md = build_model()
# Train the model for a fixed number of epochs; 20% of the training data
# is held out as a validation split, and verbose=0 suppresses the
# per-epoch progress output. The returned History object records the
# loss/metric curves.
EPOCHS = 1000
history = md.fit(normed_train_data, train_labels, epochs=EPOCHS, validation_split=0.2, verbose=0)
# Predict MPG for the (normalized) test set; flatten collapses the
# (n, 1) prediction array to shape (n,) for plotting.
test_predictions = md.predict(normed_test_data).flatten()
# Visualize predictions against true values: points on the diagonal
# correspond to perfect predictions.
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.axis('equal')
plt.axis('square')
plt.xlim([0, plt.xlim()[1]])
plt.ylim([0, plt.ylim()[1]])
_ = plt.plot([-100, 100], [-100, 100])  # y = x reference line
plt.show()
# Visualize the distribution of prediction errors; an approximately
# Gaussian histogram centered on zero suggests an unbiased model.
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel('Prediction Error [MPG]')
_ = plt.ylabel('Count')
plt.show()