This example shows how classification-specific refinement strategies are used.
To do classification, a probability density function is approximated with LearnerSGDE for each class, and a new data point is assigned to the class whose density is highest at that point. The Ripley data set is used, although the small number of training data points in combination with only a basic setup does not yield good results for any refinement strategy. This example is merely a technical demonstration.
Helper to create learner
Helper to evaluate the classifiers
std::vector<std::string> doClassification(std::vector<sgpp::base::Grid*> grids,
                                          std::vector<sgpp::base::DataVector*> alphas,
                                          sgpp::base::DataMatrix& testData,
                                          sgpp::base::DataVector& testLabel);
Get the training/test data
std::string basePath = "../../datasets/ripley/ripleyGarcke";
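The data is read from ARFF files. A minimal sketch of this step, assuming the ARFFTools reader from sgpp::datadriven and the ".train.arff"/".test.arff" file suffixes:
sgpp::datadriven::Dataset datasetTr = sgpp::datadriven::ARFFTools::readARFF(basePath + ".train.arff");
sgpp::datadriven::Dataset datasetTs = sgpp::datadriven::ARFFTools::readARFF(basePath + ".test.arff");
sgpp::base::DataMatrix dataTrain = datasetTr.getData();       // training samples
sgpp::base::DataVector targetTrain = datasetTr.getTargets();  // training labels
sgpp::base::DataMatrix dataTest = datasetTs.getData();        // test samples
sgpp::base::DataVector targetTest = datasetTs.getTargets();   // test labels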
std::cout << "Read training data: " << dataTrain.getNrows() << std::endl;
std::cout << "Read test data    : " << dataTest.getNrows() << std::endl;
Only unsigned integer class labels starting at 0 and incrementing by 1 per class are possible right now (to match the grid indices in the vectors below). For use in DataVector, the class labels are cast to double. Preprocess the labels to be 0, 1, ...: -1 -> 0 and 1 -> 1.
for (size_t i = 0; i < targetTrain.getSize(); i++) {
  if (targetTrain.get(i) < 0.0) {
    targetTrain.set(i, 0.0);
  } else {
    targetTrain.set(i, 1.0);
  }
}
for (size_t i = 0; i < targetTest.getSize(); i++) {
  if (targetTest.get(i) < 0.0) {
    targetTest.set(i, 0.0);
  } else {
    targetTest.set(i, 1.0);
  }
}
std::cout << "Preprocessing the data" << std::endl;
Split the training data according to class.
sgpp::base::DataMatrix dataCl1(0, dataTrain.getNcols());
sgpp::base::DataMatrix dataCl2(0, dataTrain.getNcols());
sgpp::base::DataVector row(dataTrain.getNcols());
for (size_t i = 0; i < dataTrain.getNrows(); i++) {
  dataTrain.getRow(i, row);
  if (targetTrain.get(i) < 1.0) {
    dataCl1.appendRow(row);
  } else {
    dataCl2.appendRow(row);
  }
}
std::cout << "Data points of class -1.0 (= 0): " << dataCl1.getNrows() << std::endl;
std::cout << "Data points of class +1.0 (= 1): " << dataCl2.getNrows() << std::endl;
Approximate a probability density function for the class data using LearnerSGDE, one for each class, and initialize the learners with the data.
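A minimal sketch of this step, assuming the helper defined below is named createSGDELearner and takes (dim, level, lambda), and that LearnerSGDE::initialize() receives the class data; the concrete dim, level, and lambda values are placeholders:
double lambda = 1e-5;  // regularization parameter (assumed placeholder value)
sgpp::datadriven::LearnerSGDE learner1 = createSGDELearner(2, 3, lambda);
sgpp::datadriven::LearnerSGDE learner2 = createSGDELearner(2, 3, lambda);
learner1.initialize(dataCl1);
learner2.initialize(dataCl2);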
Bundle the grid and surplus vector pointers needed for refinement and evaluation.
std::vector<sgpp::base::Grid*> grids;
std::vector<sgpp::base::DataVector*> alphas;
grids.push_back(learner1.getGrid());
grids.push_back(learner2.getGrid());
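The surplus vectors are bundled in the same way; a sketch, assuming LearnerSGDE exposes its surplus vector via a getSurpluses() accessor:
alphas.push_back(learner1.getSurpluses());
alphas.push_back(learner2.getSurpluses());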
Create refinement functors
size_t numRefinements = 3;
bool levelPenalize = false;
bool preCompute = true;
// surplus-based refinement
sgpp::datadriven::MultiSurplusRefinementFunctor funSrpl(grids, alphas, numRefinements,
                                                        levelPenalize);
// grid-point-based refinement
sgpp::datadriven::GridPointBasedRefinementFunctor funGrid(grids, alphas, numRefinements,
                                                          levelPenalize, preCompute);
// zero-crossing-based refinement
sgpp::datadriven::ZeroCrossingRefinementFunctor funZrcr(grids, alphas, numRefinements,
                                                        levelPenalize, preCompute);
Data-based refinement. Needs a problem-dependent coeffA. The values were determined by testing (the aim is that roughly 10% of the training data is marked relevant). Cross-validation or similar can/should be employed to determine this value.
std::vector<double> coeffA;
coeffA.push_back(1.2);
coeffA.push_back(1.2);
sgpp::datadriven::DataBasedRefinementFunctor funData(grids, alphas, &dataTrain, &targetTrain,
                                                     numRefinements, levelPenalize, coeffA);
Choose the refinement functor to be used
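A sketch of this choice, assuming the functors share the base class sgpp::datadriven::MultiGridRefinementFunctor (the pointer fun is dereferenced in the refinement calls below):
sgpp::datadriven::MultiGridRefinementFunctor* fun = &funSrpl;
// alternatives: fun = &funGrid;  fun = &funZrcr;  fun = &funData;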
Repeat alternating refinement and training for n steps and do an evaluation after each step. This uses the refinement strategy defined in fun. An initial evaluation with the regular grid is done at "step 0".
size_t numSteps = 5;
std::vector<std::string> eval = doClassification(grids, alphas, dataTest, targetTest);
std::cout << "Evaluation:" << std::endl << std::endl;
std::cout << " Step | c=1 c=2 | total" << std::endl;
std::cout << "------------------------------" << std::endl;
std::cout << " 0 | " << eval.at(0) << " | " << eval.at(1) << std::endl;
for (size_t i = 1; i < numSteps + 1; i++) {
  if (preCompute) {
    // precompute the grid evaluations (assumption: preComputeEvaluations() on the
    // functor; used by the grid-point- and zero-crossing-based strategies)
    fun->preComputeEvaluations();
  }
  grids.at(0)->getGenerator().refine(*fun);
  grids.at(1)->getGenerator().refine(*fun);
  learner1.train(*grids.at(0), *alphas.at(0), dataCl1, lambda);
  learner2.train(*grids.at(1), *alphas.at(1), dataCl2, lambda);
  eval = doClassification(grids, alphas, dataTest, targetTest);
  std::cout << "   " << i << "  | " << eval.at(0) << " | " << eval.at(1) << std::endl;
}
std::cout << std::endl << "Done" << std::endl;
return 0;
}
Helper function: it configures and creates an SGDE learner with meaningful parameters.
solverConfig.eps_ = 1e-10;
crossvalidationConfig.enable_ = false;
crossvalidationConfig.kfold_ = 3;
crossvalidationConfig.lambda_ = 3.16228e-06;
crossvalidationConfig.seed_ = 1234567;
crossvalidationConfig.silent_ = true;
// construct the learner from the configuration structs (assumption: gridConfig,
// adaptConfig and regularizationConfig are set up analogously to solverConfig above)
sgpp::datadriven::LearnerSGDE learner(gridConfig, adaptConfig, solverConfig, regularizationConfig,
                                      crossvalidationConfig);
return learner;
}
Helper function: it does the classification, gets the predictions, and generates some error output.
std::vector<std::string> doClassification(std::vector<sgpp::base::Grid*> grids,
                                          std::vector<sgpp::base::DataVector*> alphas,
                                          sgpp::base::DataMatrix& testData,
                                          sgpp::base::DataVector& testLabel) {
double best_eval = 0.0;
double eval = 0.0;
std::vector<std::unique_ptr<sgpp::base::OperationEval>> evalOps;
for (size_t i = 0; i < grids.size(); i++) {
  // assumption: the evaluation operation is created via the standard op_factory
  std::unique_ptr<sgpp::base::OperationEval> e(
      sgpp::op_factory::createOperationEval(*grids.at(i)));
  evalOps.push_back(std::move(e));
}
// For each test point, evaluate the density of every class and remember the
// index of the class with the highest value
sgpp::base::DataVector p(testData.getNcols());
sgpp::base::DataVector indices(testData.getNrows());
for (size_t i = 0; i < testData.getNrows(); i++) {
  testData.getRow(i, p);
  indices.set(i, 0.0);
  best_eval = evalOps.at(0)->eval(*alphas.at(0), p);
  for (size_t j = 1; j < grids.size(); j++) {
    eval = evalOps.at(j)->eval(*alphas.at(j), p);
    if (eval > best_eval) {
      best_eval = eval;
      indices.set(i, static_cast<double>(j));
    }
  }
}
std::vector<int> classCounts(grids.size(), 0);
std::vector<int> classErrorCounts(grids.size(), 0);
sgpp::base::DataVector totalError(indices);  // copy the predicted class indices
totalError.sub(testLabel);
size_t totalCount = 0;
for (size_t i = 0; i < testLabel.getSize(); i++) {
  classCounts.at(static_cast<size_t>(floor(testLabel.get(i)))) += 1;
  if (fabs(totalError.get(i)) > 0.01) {
    totalCount++;
    classErrorCounts.at(static_cast<size_t>(floor(testLabel.get(i)))) += 1;
  }
}
std::stringstream ss;
for (size_t i = 0; i < grids.size(); i++) {
  double ce = 100.0 * (1.0 - static_cast<double>(classErrorCounts.at(i)) / classCounts.at(i));
  ss << std::fixed << std::setprecision(2) << ce;
  if (i < grids.size() - 1) {
    ss << " ";
  }
}
std::stringstream ss2;
ss2 << std::fixed << std::setprecision(3);
ss2 << 100.0 * (1.0 - static_cast<double>(totalCount) / static_cast<double>(testData.getNrows()));
std::vector<std::string> result;
result.push_back(ss.str());
result.push_back(ss2.str());
return result;
}