DBSCAN(Density-Based Spatial Clustering of Applications with Noise)是一种基于密度的聚类算法,适用于发现任意形状的簇并识别噪声点。核心参数包括:
- eps:邻域半径,决定样本的邻域范围。
- min_samples:核心点所需的最小邻域样本数。

安装依赖库:
pip install numpy matplotlib scikit-learn
示例代码
以下是一个完整的DBSCAN聚类示例,包含数据生成、模型训练和可视化:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
# --- Demo: DBSCAN on a synthetic two-moons dataset ---
# make_moons produces a non-convex shape; the true labels are discarded
# since clustering is unsupervised.
X, _ = make_moons(n_samples=300, noise=0.05, random_state=42)

# eps is the neighborhood radius; min_samples is the density threshold for
# a core point. Points in no dense region receive the noise label -1.
dbscan = DBSCAN(eps=0.3, min_samples=5)
clusters = dbscan.fit_predict(X)

# Scatter plot, one color per cluster label.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', s=50, alpha=0.7)
plt.title("DBSCAN Clustering")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.colorbar(label="Cluster Label")
plt.show()
可通过调整 eps 与 min_samples 来平衡簇的划分与噪声的识别。DBSCAN 将噪声点标记为 -1,可通过以下代码统计噪声点比例:
# Fraction of samples DBSCAN flagged as noise (label == -1).
noise_ratio = (clusters == -1).sum() / len(clusters)
print(f"Noise ratio: {noise_ratio:.2%}")
对于量纲差异较大的数据,建议先进行标准化(如 StandardScaler)。通过调整参数和应用场景适配,DBSCAN能有效解决复杂分布数据的聚类问题。
密度聚类(如DBSCAN、OPTICS)基于样本分布的紧密程度划分簇,适用于非凸数据集。以下是Python实现的优化实例及关键技巧。
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

# Two-moons data: a non-convex shape where density-based clustering shines.
X, _ = make_moons(n_samples=300, noise=0.05)

# Fit and predict in one step; noise points get the label -1.
dbscan = DBSCAN(eps=0.3, min_samples=5)
labels = dbscan.fit_predict(X)

# Color each point by its assigned cluster.
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
plt.show()
优化点:调整 eps 和 min_samples 可平衡噪声敏感度与簇密度。
from sklearn.neighbors import NearestNeighbors
import numpy as np

# K-distance curve: each sample's distance to its 5th nearest neighbor,
# sorted ascending. The "elbow" suggests a good eps for DBSCAN.
nbrs = NearestNeighbors(n_neighbors=5).fit(X)
knn_dist, _ = nbrs.kneighbors(X)
distances = np.sort(knn_dist[:, -1], axis=0)

plt.plot(distances)
plt.xlabel('Points sorted by distance')
plt.ylabel('5th nearest neighbor distance')
作用:通过K距离图确定最佳 eps(曲线拐点处)。
from sklearn.decomposition import PCA

# Reduce to 2 principal components first, then cluster in the reduced space.
X_pca = PCA(n_components=2).fit_transform(X)
dbscan = DBSCAN(eps=0.2, min_samples=5).fit(X_pca)
优势:PCA减少维度诅咒影响,提升密度计算效率。
import hdbscan

# HDBSCAN adapts to clusters of varying density; only a size floor is tuned.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
labels = clusterer.fit(X).labels_
特点:自动处理不同密度的簇,无需手动调参。
# Treat DBSCAN noise points (label -1) as anomalies via a 0/1 mask.
# FIX: negating the raw labels only works with at most two clusters —
# with labels {-1, 0, 1, 2, ...}, negation would rank cluster 2 above noise.
anomaly_scores = (dbscan.fit_predict(X) == -1).astype(int)
plt.scatter(X[:, 0], X[:, 1], c=anomaly_scores, cmap='Reds')
应用:将噪声点作为异常值输出。
实用技巧:先用 StandardScaler 标准化数据,避免量纲影响距离计算;为 DBSCAN 设置 algorithm='ball_tree' 可加速大规模数据;metric 参数支持余弦相似度等自定义距离。完整代码及案例可参考Scikit-learn和HDBSCAN官方文档。
K-Distance Graph是一种用于异常检测或密度估计的可视化工具,常用于评估数据点的局部密度。通过计算每个点的第K近邻距离并排序绘制,可识别数据中的异常点(距离较高的点)。以下是基于Python的实现示例及扩展应用。
使用 scikit-learn 的 NearestNeighbors 计算K近邻距离,结合 matplotlib 绘图:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
def plot_k_distance(data, k=5):
    """Plot the sorted k-th nearest-neighbor distance for every sample.

    The elbow of this curve is a common heuristic for picking DBSCAN's eps.
    """
    model = NearestNeighbors(n_neighbors=k)
    model.fit(data)
    dists, _ = model.kneighbors(data)
    # k-th neighbor distance per point, sorted ascending for the curve.
    curve = np.sort(dists[:, -1])
    plt.plot(curve)
    plt.xlabel('Points sorted by distance')
    plt.ylabel(f'{k}-Distance')
    plt.title('K-Distance Graph')
    plt.show()
# Example: 100 uniformly random 2-D points.
data = np.random.rand(100, 2)
plot_k_distance(data, k=5)
调整K值观察图形变化:
# Compare the curve shape across several neighborhood sizes.
for neighbor_k in (3, 5, 10):
    plot_k_distance(data, k=neighbor_k)
使用K-Distance识别异常点(如高于阈值的点):
def detect_anomalies(data, k=5, threshold=1.5):
    """Return rows whose k-distance exceeds threshold x the mean k-distance."""
    model = NearestNeighbors(n_neighbors=k)
    model.fit(data)
    dists, _ = model.kneighbors(data)
    kth = dists[:, -1]
    # Keep samples whose local neighborhood is unusually sparse.
    return data[kth > threshold * np.mean(kth)]

anomalies = detect_anomalies(data, k=5, threshold=1.5)
适用于多维数据(如PCA降维前):
from sklearn.datasets import make_blobs

# The k-distance plot also applies to higher-dimensional data.
high_dim_data, _ = make_blobs(n_samples=100, n_features=10)
plot_k_distance(high_dim_data, k=5)
通过曲线拐点(Elbow Method)自动选择阈值:
def auto_threshold(data, k=5):
    """Pick a threshold at the elbow of the sorted k-distance curve."""
    model = NearestNeighbors(n_neighbors=k)
    model.fit(data)
    dists, _ = model.kneighbors(data)
    curve = np.sort(dists[:, -1])
    # Second-order differences approximate curvature; its peak marks the elbow.
    curvature = np.diff(curve, 2)
    return curve[np.argmax(curvature) + 1]

threshold = auto_threshold(data)
结合上述功能的完整类实现:
class KDistanceAnalyzer:
    """Compute, plot, and threshold k-th nearest-neighbor distances.

    ``k_distances`` holds the SORTED curve (for plotting). The unsorted
    per-row distances are kept separately so anomaly detection can index
    back into ``data`` correctly.
    """

    def __init__(self, k=5):
        # k: which nearest neighbor's distance to use.
        self.k = k

    def fit(self, data):
        """Fit a k-NN model on ``data`` and cache per-sample k-distances."""
        self.data = data
        self.neigh = NearestNeighbors(n_neighbors=self.k)
        self.neigh.fit(data)
        distances, _ = self.neigh.kneighbors(data)
        # Unsorted: row i still corresponds to data[i].
        self._raw_k_distances = distances[:, -1]
        # Sorted copy, used only for the k-distance plot.
        self.k_distances = np.sort(self._raw_k_distances)

    def plot(self):
        """Draw the sorted k-distance curve."""
        plt.plot(self.k_distances)
        plt.xlabel('Points sorted by distance')
        plt.ylabel(f'{self.k}-Distance')
        plt.title('K-Distance Graph')
        plt.show()

    def detect_anomalies(self, threshold=1.5):
        """Return rows whose k-distance exceeds threshold x mean k-distance.

        BUG FIX: the original masked ``self.data`` with the *sorted*
        distances, so the selected rows did not correspond to the points
        that actually have large k-distances.
        """
        mean_dist = np.mean(self._raw_k_distances)
        return self.data[self._raw_k_distances > threshold * mean_dist]
# Usage example
analyzer = KDistanceAnalyzer(k=5)
analyzer.fit(data)
analyzer.plot()
anomalies = analyzer.detect_anomalies(threshold=1.5)
结合 seaborn 增强可视化:
import seaborn as sns

def plot_with_seaborn(distances):
    """Render the k-distance curve with seaborn styling."""
    index = range(len(distances))
    sns.lineplot(x=index, y=distances)
    plt.title('K-Distance with Seaborn')
    plt.show()

plot_with_seaborn(analyzer.k_distances)
通过以上方法,可灵活应用K-Distance Graph于不同场景,并根据实际需求调整参数和阈值。
标准化是数据预处理中常用的技术,将数据转换为均值为0、标准差为1的分布。C++中可通过计算均值和标准差手动实现,或借助库(如Eigen、OpenCV)。以下是实例,涵盖基础实现和实际应用场景。
示例1:一维数组标准化
#include <iostream>
#include <vector>
#include <cmath>

// In-place z-score standardization: x -> (x - mean) / stddev.
// FIX: restored the header names and <double> template argument that were
// stripped from the original listing; guarded empty input and zero spread.
void standardScaler(std::vector<double>& data) {
    if (data.empty()) return;  // nothing to scale; avoids division by zero
    double mean = 0.0, stddev = 0.0;
    for (double x : data) mean += x;
    mean /= data.size();
    for (double x : data) stddev += (x - mean) * (x - mean);
    stddev = std::sqrt(stddev / data.size());  // population std (divide by n)
    if (stddev == 0.0) stddev = 1.0;  // constant input: center only (sklearn convention)
    for (double& x : data) x = (x - mean) / stddev;
}
int main() {
    // FIX: restored the stripped <double> template argument.
    std::vector<double> data = {1.0, 2.0, 3.0, 4.0, 5.0};
    standardScaler(data);
    for (double x : data) std::cout << x << " ";
    return 0;
}
示例2:二维数组标准化(按列)
#include <vector>
#include <cmath>

// A row-major matrix of doubles.
// FIX: restored the stripped header names and template arguments.
using Matrix = std::vector<std::vector<double>>;

// Standardize each column to zero mean and unit variance, in place.
void standardizeColumns(Matrix& mat) {
    if (mat.empty() || mat[0].empty()) return;  // mat[0] on empty input is UB
    for (size_t col = 0; col < mat[0].size(); ++col) {
        double mean = 0.0, stddev = 0.0;
        for (size_t row = 0; row < mat.size(); ++row) mean += mat[row][col];
        mean /= mat.size();
        for (size_t row = 0; row < mat.size(); ++row)
            stddev += std::pow(mat[row][col] - mean, 2);
        stddev = std::sqrt(stddev / mat.size());
        if (stddev == 0.0) stddev = 1.0;  // constant column: center only
        for (size_t row = 0; row < mat.size(); ++row)
            mat[row][col] = (mat[row][col] - mean) / stddev;
    }
}
示例3:Eigen矩阵标准化
#include
#include
Eigen::MatrixXd standardScalerEigen(Eigen::MatrixXd data) {
Eigen::VectorXd mean = data.colwise().mean();
Eigen::MatrixXd centered = data.rowwise() - mean.transpose();
Eigen::VectorXd stddev = (centered.array().square().colwise().sum() / data.rows()).sqrt();
return centered.array().rowwise() / stddev.transpose().array();
}
int main() {
Eigen::MatrixXd data(3, 2);
data << 1, 2, 3, 4, 5, 6;
Eigen::MatrixXd scaled = standardScalerEigen(data);
std::cout << scaled << std::endl;
return 0;
}
示例4:处理NaN值
// Column-wise standardization that excludes NaN entries from the statistics
// (NaN cells are still transformed and remain NaN).
// FIX 1: restored the <double> template arguments stripped from the listing.
// FIX 2: the original multiplied by a 0/1 mask, but NaN * 0.0 is still NaN,
// so every sum containing a NaN was NaN; use select() to replace NaNs with 0.
// Requires <Eigen/Dense> and <cmath> (included by the previous example).
Eigen::MatrixXd standardScalerWithNaN(Eigen::MatrixXd data) {
    for (int col = 0; col < data.cols(); ++col) {
        Eigen::VectorXd colData = data.col(col);
        // Boolean mask: true for finite entries, false for NaN.
        auto valid = colData.array().unaryExpr([](double x) { return !std::isnan(x); });
        double mean = valid.select(colData.array(), 0.0).sum() / valid.count();
        double stddev = std::sqrt(valid.select((colData.array() - mean).square(), 0.0).sum() / valid.count());
        data.col(col) = (colData.array() - mean) / stddev;
    }
    return data;
}
示例5:CSV数据标准化
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// Read a comma-separated file of numbers into a row-major matrix.
// FIX 1: restored the stripped header names and template arguments.
// FIX 2: the original used `ss >> value`, which stops at the first comma,
// so at most one value per line was ever read; split on ',' instead.
std::vector<std::vector<double>> readCSV(const std::string& filename) {
    std::vector<std::vector<double>> data;
    std::ifstream file(filename);
    std::string line;
    while (std::getline(file, line)) {
        std::vector<double> row;
        std::stringstream ss(line);
        std::string cell;
        while (std::getline(ss, cell, ',')) {
            if (!cell.empty()) row.push_back(std::stod(cell));
        }
        data.push_back(row);
    }
    return data;
}
// Write a matrix as comma-separated lines.
// FIX: restored the <double> template arguments stripped from the listing.
void writeCSV(const std::string& filename, const std::vector<std::vector<double>>& data) {
    std::ofstream file(filename);
    for (const auto& row : data) {
        for (size_t i = 0; i < row.size(); ++i) {
            file << row[i];
            if (i != row.size() - 1) file << ",";  // no trailing comma
        }
        file << "\n";
    }
}
int main() {
    // Load CSV, standardize every column, and write the result back out.
    auto data = readCSV("input.csv");
    standardizeColumns(data); // the column-wise standardizer from example 2
    writeCSV("output.csv", data);
    return 0;
}
示例6:实时数据流标准化
#include <queue>
#include <cmath>
#include <algorithm>

// Sliding-window z-score scaler for streaming data.
// FIX 1: restored the stripped header names and <double> template argument.
// FIX 2: guarded the degenerate cases the original mishandled — the very
// first sample gave 0/0 = NaN, and floating-point cancellation in
// sumSq/n - mean^2 can produce a tiny negative variance (sqrt -> NaN).
class StreamingScaler {
private:
    std::queue<double> window;      // samples currently inside the window
    double sum = 0.0, sumSq = 0.0;  // running first and second moments
    size_t windowSize;
public:
    StreamingScaler(size_t size) : windowSize(size) {}

    // Push x into the window and return its z-score w.r.t. the window stats.
    double scale(double x) {
        window.push(x);
        sum += x;
        sumSq += x * x;
        if (window.size() > windowSize) {
            // Evict the oldest sample and its contribution to the moments.
            double old = window.front();
            sum -= old;
            sumSq -= old * old;
            window.pop();
        }
        double mean = sum / window.size();
        // max() clamps tiny negative variance caused by cancellation.
        double variance = std::max(0.0, sumSq / window.size() - mean * mean);
        double stddev = std::sqrt(variance);
        if (stddev == 0.0) return 0.0;  // no spread yet: report "at the mean"
        return (x - mean) / stddev;
    }
};
示例7:多线程标准化
#include
#include
#include
void standardizeRange(std::vector