function [beta] = sc_approx_pooling(feaSet, B, pyramid, gamma, knn)
%================================================
%
% Usage:
% Compute the linear spatial pyramid feature using sparse coding.
%
% Inputs:
% feaSet -structure defining the feature set of an image
% .feaArr local feature array extracted from the
% image, column-wise
% .x x locations of each local feature, 2nd
% dimension of the matrix
% .y y locations of each local feature, 1st
% dimension of the matrix
% .width width of the image
% .height height of the image
% B -sparse dictionary, column-wise
% pyramid -defines structure of pyramid
% gamma -sparsity regularization parameter
% knn -k nearest neighbors selected for sparse coding
%
% Output:
% beta -multiscale max pooling feature
%
% Written by Jianchao Yang @ NEC Research Lab America (Cupertino)
% Mentor: Kai Yu
% July 2008
%
% Revised May. 2010
%===============================================
dSize = size(B, 2);
nSmp = size(feaSet.feaArr, 2);
img_width = feaSet.width;
img_height = feaSet.height;
idxBin = zeros(nSmp, 1);
sc_codes = zeros(dSize, nSmp);
% approximate sparse coding: restrict each descriptor to the knn
% dictionary atoms with the largest inner products
D = feaSet.feaArr'*B;
IDX = zeros(nSmp, knn);
for ii = 1:nSmp,
d = D(ii, :);
[dummy, idx] = sort(d, 'descend');
IDX(ii, :) = idx(1:knn);
end
for ii = 1:nSmp,
y = feaSet.feaArr(:, ii);
idx = IDX(ii, :);
BB = B(:, idx);
sc_codes(idx, ii) = feature_sign(BB, y, 2*gamma); % sparse coding coefficients: the columns of S in X = B*S
end
sc_codes = abs(sc_codes); % pooling is over the absolute values of the codes
% spatial levels
pLevels = length(pyramid);
% spatial bins on each level
pBins = pyramid.^2;
% total spatial bins
tBins = sum(pBins);
beta = zeros(dSize, tBins);
bId = 0;
for iter1 = 1:pLevels,
nBins = pBins(iter1); % 1, 4, 16 bins on the three levels
wUnit = img_width / pyramid(iter1);
hUnit = img_height / pyramid(iter1);
% find to which spatial bin each local descriptor belongs
xBin = ceil(feaSet.x / wUnit); % feaSet.x, feaSet.y are nSmp x 1 location vectors
yBin = ceil(feaSet.y / hUnit);
idxBin = (yBin - 1)*pyramid(iter1) + xBin;
for iter2 = 1:nBins,
bId = bId + 1;
sidxBin = find(idxBin == iter2);
if isempty(sidxBin),
continue;
end
beta(:, bId) = max(sc_codes(:, sidxBin), [], 2); % row-wise max: z_j = max(|s_j1|, ..., |s_jm|), with X = B*S (X: n x m, B: n x k, S: k x m); beta has 1+4+16 = 21 columns
end
end
if bId ~= tBins,
error('Index number error!');
end
beta = beta(:); % concatenate the pooled features over all bins
beta = beta./sqrt(sum(beta.^2)); % L2 normalization
Two points worth noting:
1) When computing beta, the author partitions the image with a 3-level spatial pyramid, giving 1, 4, and 16 cells on the respective levels. In the paper the factorization is X = UV (U the sparse codes, V the dictionary), so pooling runs over each column of U: z_j = max(|u_1j|, ..., |u_Mj|). In the code, however, the problem is solved as X = BS (B the dictionary, S the sparse codes), so pooling runs over each row of S: z_j = max(|s_j1|, ..., |s_jM|).
2) The paper states that z has the same number of columns as U, but in the code the max is taken per cell on each of the 3 levels, so z has 1 + 4 + 16 = 21 columns.
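To make the convention concrete, here is a minimal MATLAB sketch of the row-wise max pooling on toy data (the sizes k and m below are made up for illustration, not the real Caltech101 dimensions):

k = 5; % toy dictionary size
m = 8; % toy number of descriptors falling in one spatial bin
S = randn(k, m) .* (rand(k, m) > 0.7); % sparse toy codes, one column per descriptor
z = max(abs(S), [], 2); % z_j = max(|s_j1|, ..., |s_jm|), a k x 1 vector
% over the whole pyramid [1 2 4] there are 1+4+16 = 21 such bins, so beta
% is k x 21 before being vectorized and L2-normalized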
function [beta] = sc_pooling(feaSet, B, pyramid, gamma)
%================================================
%
% Usage:
% Compute the linear spatial pyramid feature using sparse coding.
%
% Inputs:
% feaSet -structure defining the feature set of an image
% .feaArr local feature array extracted from the
% image, column-wise
% .x x locations of each local feature, 2nd
% dimension of the matrix
% .y y locations of each local feature, 1st
% dimension of the matrix
% .width width of the image
% .height height of the image
% B -sparse dictionary, column-wise
% pyramid -defines structure of pyramid
% gamma -sparsity regularization parameter
%
% Output:
% beta -multiscale max pooling feature
%
% Written by Jianchao Yang @ NEC Research Lab America (Cupertino)
% Mentor: Kai Yu
% July 2008
%
% Revised May. 2010
%===============================================
dSize = size(B, 2);
nSmp = size(feaSet.feaArr, 2);
img_width = feaSet.width;
img_height = feaSet.height;
idxBin = zeros(nSmp, 1);
sc_codes = zeros(dSize, nSmp);
% compute the sparse codes for each local feature
beta = 1e-4; % small ridge term for stability (beta is reused later for the pooled feature)
A = B'*B + 2*beta*eye(dSize);
Q = -B'*feaSet.feaArr;
for iter1 = 1:nSmp,
sc_codes(:, iter1) = L1QP_FeatureSign_yang(gamma, A, Q(:, iter1)); % i.e., the code Z_j = (u_1j, u_2j, ..., u_Mj), M = size of the dictionary
end
sc_codes = abs(sc_codes);
% spatial levels
pLevels = length(pyramid);
% spatial bins on each level
pBins = pyramid.^2; % [1, 4, 16] for pyramid = [1, 2, 4]
% total spatial bins
tBins = sum(pBins);
beta = zeros(dSize, tBins);
bId = 0;
for iter1 = 1:pLevels,
nBins = pBins(iter1);
wUnit = img_width / pyramid(iter1);
hUnit = img_height / pyramid(iter1);
% find to which spatial bin each local descriptor belongs
xBin = ceil(feaSet.x / wUnit);
yBin = ceil(feaSet.y / hUnit);
idxBin = (yBin - 1)*pyramid(iter1) + xBin;
for iter2 = 1:nBins,
bId = bId + 1;
sidxBin = find(idxBin == iter2);
if isempty(sidxBin),
continue;
end
beta(:, bId) = max(sc_codes(:, sidxBin), [], 2); % max(x, [], 1) is the column-wise max, max(x, [], 2) the row-wise max
end
end
if bId ~= tBins,
error('Index number error!');
end
beta = beta(:); % concatenate the pooled features over all bins
beta = beta./sqrt(sum(beta.^2)); % L2 normalization
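As a hedged usage sketch of the two pooling routines (the feaSet layout follows the headers above, but all sizes here are toy values, and the sparse_coding helpers feature_sign and L1QP_FeatureSign_yang must be on the MATLAB path):

feaSet.feaArr = rand(128, 960); % 960 SIFT descriptors, column-wise
feaSet.x = rand(960, 1) * 300; % x locations of the descriptors
feaSet.y = rand(960, 1) * 300; % y locations of the descriptors
feaSet.width = 300;
feaSet.height = 300;
B = rand(128, 1024); % toy dictionary (the real one has unit-norm columns)
pyramid = [1, 2, 4];
gamma = 0.15;
knn = 200;
if knn,
fea = sc_approx_pooling(feaSet, B, pyramid, gamma, knn); % approximate coding
else
fea = sc_pooling(feaSet, B, pyramid, gamma); % exact coding
end
% fea is a 1024*(1+4+16) = 21504-dimensional, L2-normalized column vector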
% This is an example code for running the ScSPM algorithm described in "Linear
% Spatial Pyramid Matching using Sparse Coding for Image Classification" (CVPR'09)
%
% Written by Jianchao Yang @ IFP UIUC
% For any questions, please email to [email protected].
%
% Revised May, 2010 by Jianchao Yang
clear all;
clc;
%% set path
addpath('large_scale_svm');
addpath('sift');
addpath(genpath('sparse_coding'));
%% parameter setting
% directory setup
img_dir = 'image'; % directory for dataset images
data_dir = 'data'; % directory to save the sift features of the chosen dataset
dataSet = 'Caltech101';
% sift descriptor extraction
skip_cal_sift = true; % if 'skip_cal_sift' is false, set the following parameters
gridSpacing = 6;
patchSize = 16;
maxImSize = 300;
nrml_threshold = 1; % low contrast region normalization threshold (descriptor length)
% dictionary training for sparse coding
skip_dic_training = true; % dict_Caltech101_1024 is provided, so dictionary training is skipped
nBases = 1024;
nsmp = 200000;
beta = 1e-5; % a small regularization for stabilizing sparse coding
num_iters = 50;
% feature pooling parameters
pyramid = [1, 2, 4]; % spatial block number on each level of the pyramid
gamma = 0.15;
knn = 200; % find the k-nearest neighbors for approximate sparse coding
% if set 0, use the standard sparse coding
% classification test on the dataset
nRounds = 5; % number of random tests
lambda = 0.1; % regularization parameter for w
tr_num = 30; % training number per category
rt_img_dir = fullfile(img_dir, dataSet);
rt_data_dir = fullfile(data_dir, dataSet);
%% calculate sift features or retrieve the database directory
if skip_cal_sift,
database = retr_database_dir(rt_data_dir);
else
[database, lenStat] = CalculateSiftDescriptor(rt_img_dir, rt_data_dir, gridSpacing, patchSize, maxImSize, nrml_threshold);
end;
%% load sparse coding dictionary (one dictionary trained on Caltech101 is provided: dict_Caltech101_1024.mat)
Bpath = ['dictionary/dict_' dataSet '_' num2str(nBases) '.mat'];
Xpath = ['dictionary/rand_patches_' dataSet '_' num2str(nsmp) '.mat'];
if ~skip_dic_training,
try
load(Xpath);
catch
X = rand_sampling(database, nsmp); % if no patches are provided, randomly sample nsmp patches from the database
save(Xpath, 'X');
end
[B, S, stat] = reg_sparse_coding(X, nBases, eye(nBases), beta, gamma, num_iters);
save(Bpath, 'B', 'S', 'stat');
else
load(Bpath);
end
nBases = size(B, 2); % size of the dictionary
%% calculate the sparse coding feature
dimFea = sum(nBases*pyramid.^2); % feature dimension per image: 1024*(1+4+16) = 21504
numFea = length(database.path); % number of images; database.path holds the path of each image in each class
sc_fea = zeros(dimFea, numFea); % one pooled feature per column, numFea columns in total
sc_label = zeros(numFea, 1);
disp('==================================================');
fprintf('Calculating the sparse coding feature...\n');
fprintf('Regularization parameter: %f\n', gamma);
disp('==================================================');
for iter1 = 1:numFea,
if ~mod(iter1, 50),
fprintf('.\n');
else
fprintf('.');
end;
fpath = database.path{iter1};
load(fpath);
if knn,
sc_fea(:, iter1) = sc_approx_pooling(feaSet, B, pyramid, gamma, knn);
else
sc_fea(:, iter1) = sc_pooling(feaSet, B, pyramid, gamma);
end
sc_label(iter1) = database.label(iter1);
end;
%% evaluate the performance of the computed feature using linear SVM
[dimFea, numFea] = size(sc_fea);
clabel = unique(sc_label);
nclass = length(clabel);
accuracy = zeros(nRounds, 1);
for ii = 1:nRounds,
fprintf('Round: %d...\n', ii);
tr_idx = [];
ts_idx = [];
for jj = 1:nclass,
idx_label = find(sc_label == clabel(jj));
num = length(idx_label);
idx_rand = randperm(num);
tr_idx = [tr_idx; idx_label(idx_rand(1:tr_num))];
ts_idx = [ts_idx; idx_label(idx_rand(tr_num+1:end))];
end;
tr_fea = sc_fea(:, tr_idx);
tr_label = sc_label(tr_idx);
ts_fea = sc_fea(:, ts_idx);
ts_label = sc_label(ts_idx);
[w, b, class_name] = li2nsvm_multiclass_lbfgs(tr_fea', tr_label, lambda); % train multi-class via lbfgs, one-against-all decomposition
[C, Y] = li2nsvm_multiclass_fwd(ts_fea', w, b, class_name); % make multi-class prediction
acc = zeros(length(class_name), 1);
for jj = 1 : length(class_name)
c = class_name(jj);
idx = find(ts_label == c);
curr_pred_label = C(idx);
curr_gnd_label = ts_label(idx);
acc(jj) = length(find(curr_pred_label == curr_gnd_label))/length(idx);
end;
accuracy(ii) = mean(acc);
end;
fprintf('Mean accuracy: %f\n', mean(accuracy));
fprintf('Standard deviation: %f\n', std(accuracy));