package com.spbsu.ml.models.gpf;
import com.spbsu.commons.math.vectors.VecTools;
import com.spbsu.commons.math.vectors.impl.vectors.ArrayVec;
import com.spbsu.commons.random.FastRandom;
import com.spbsu.commons.util.ArrayTools;
import com.spbsu.ml.models.gpf.weblogmodel.BlockV1;
import com.spbsu.ml.models.gpf.weblogmodel.WebLogV1GPFSession;
import org.junit.Ignore;
import org.junit.Test;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.zip.GZIPInputStream;
import static org.junit.Assert.assertEquals;
/**
* Tests for {@link GPFLinearModel} and {@link GPFLinearOptimization} on gzipped web-log session samples.
* Ignored by default: the tests load the ses_100k_* resources and run many SGD passes over them.
*
* User: irlab
* Date: 22.05.14
*/
@Ignore
public class GPFLinearTest {
private final int random_seed = 0;
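/**
* Synthetic-data check: a "true" model with randomly drawn weights generates artificial clicks by
* simulating user walks over each session graph; SGD then starts from a different random model and
* the asserted exp(-loglikelihood) values pin the expected outcome for the fixed random seeds.
*/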
@Test
public void testArtificialClicks() throws IOException {
final List<Session<BlockV1>> dataset_nonfinal;
try (InputStream is = new GZIPInputStream(WebLogV1GPFSession.class.getResourceAsStream("ses_100k_simple_rand1_h10k.dat.gz"))) {
dataset_nonfinal = WebLogV1GPFSession.loadDatasetFromJSON(is, new GPFLinearModel(), 100);
}
final List<Session<BlockV1>> dataset = dataset_nonfinal;
System.out.println("dataset size: " + dataset.size());
FastRandom rand = new FastRandom(random_seed);
// generate random model
final GPFLinearModel model_true = new GPFLinearModel();
model_true.PRUNE_A_THRESHOLD = 1E-5;
model_true.trainClickProbability(dataset);
for (int i = 0; i < model_true.NFEATS; i++)
model_true.theta.set(i, rand.nextGaussian());
// generate artificial clicks
int n_sum_clicks = 0;
for (int nSes = 0; nSes < dataset.size(); nSes++) {
//System.out.println("session " + nSes);
final Session<BlockV1> ses = dataset.get(nSes);
final List<Integer> click_indexes = new ArrayList<>();
int state = Session.Q_INDEX;
int click_s = 0;
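// Simulate a user walk over the session graph: starting at Q_INDEX, normalize the transition
// weights eval_f(...) over the outgoing edges into a distribution, sample the next state, and
// (for RESULT blocks) draw a click with the model's click-given-view probability; the walk stops
// when E_INDEX is reached.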
while (state != Session.E_INDEX) {
final double[] probs = new double[ses.getEdgesFrom(state).length];
for (int j = 0; j < probs.length; j++)
probs[j] = model_true.eval_f(ses, state, ses.getEdgesFrom(state)[j], click_s);
double sum = 0;
for (int j = 0; j < probs.length; j++)
sum += probs[j];
for (int j = 0; j < probs.length; j++)
probs[j] /= sum;
// StringBuffer probs_str = new StringBuffer();
// for (int j = 0; j < probs.length; j++)
// probs_str.append("" + (j == 0 ? "" : ", ") + "(" + state + "->" + ses.getEdgesFrom(state)[j] + ": " + probs[j] + ")");
// System.out.println(" probs: " + probs_str);
state = ses.getEdgesFrom(state)[rand.nextSimple(new ArrayVec(probs))];
//System.out.println(" state " + state + " " + ses.getBlock(state));
click_s = ses.getBlock(state).blockType == Session.BlockType.RESULT && rand.nextDouble() <= model_true.getClickGivenViewProbability(ses.getBlock(state)) ? 1 : 0;
if (click_s == 1)
click_indexes.add(state);
}
ses.setClick_indexes(ArrayTools.convert(click_indexes.toArray(new Integer[click_indexes.size()])));
//System.out.println("" + nSes + ": " + click_indexes);
n_sum_clicks += click_indexes.size();
}
System.out.println("clicks generated, avg: " + n_sum_clicks/(float)dataset.size() + " clicks/session, " + dataset.size() + " sessions");
int nObservations = 0;
for (final Session<BlockV1> ses : dataset)
nObservations += ses.getClick_indexes().length + 1;
final int fullds_nobservations_all = nObservations;
for (int random_seed_local = 1; random_seed_local < 2; random_seed_local++) {
System.out.println("\n############################################################################");
System.out.println("random_seed_local =\t" + random_seed_local);
rand = new FastRandom(random_seed_local);
// generate random model
final GPFLinearModel model0 = new GPFLinearModel(model_true);
for (int i = 0; i < model0.NFEATS; i++)
model0.theta.set(i, rand.nextGaussian());
final GPFLinearOptimization optimizer = new GPFLinearOptimization();
final double model_true_expll = Math.exp(-optimizer.evalDatasetGradientValue(model_true, dataset, false).loglikelihood);
System.out.println("model_true loglikelihood: " + model_true_expll);
assertEquals(8.4, model_true_expll, 0.1);
final long t1 = System.currentTimeMillis();
final double model0_expll = Math.exp(-optimizer.evalDatasetGradientValue(model0, dataset, false).loglikelihood);
System.out.println("model0 loglikelihood: " + model0_expll);
assertEquals(97.9, model0_expll, 0.1);
final long t2 = System.currentTimeMillis();
System.out.println("time loglikelihood eval: " + (t2-t1) + " ms");
final int iteration_dataset_pass_count = 20;
optimizer.SGD_BLOCK_SIZE = 1;
final int iteration_count = iteration_dataset_pass_count * dataset.size() / optimizer.SGD_BLOCK_SIZE;
optimizer.step_eta0 = 0.1; //0.01;
optimizer.step_gamma = 0.75;
optimizer.step_a = dataset.size() / optimizer.SGD_BLOCK_SIZE;
model0.PRUNE_A_THRESHOLD = model_true.PRUNE_A_THRESHOLD;
System.out.println("optimizer.SGD_BLOCK_SIZE = " + optimizer.SGD_BLOCK_SIZE);
System.out.println("optimizer.step_eta0 = " + optimizer.step_eta0);
System.out.println("optimizer.step_a = " + optimizer.step_a);
System.out.println("optimizer.step_gamma = " + optimizer.step_gamma);
System.out.println("model0.PRUNE_A_THRESHOLD = " + model0.PRUNE_A_THRESHOLD);
optimizer.listener = new GPFLinearOptimization.IterationEventListener() {
@Override
public void iterationPerformed(final GPFLinearOptimization.IterationEvent e) {
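// With mini-batches smaller than the dataset, log only once per full pass over the data.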
if (optimizer.SGD_BLOCK_SIZE < dataset.size() && e.iter % (dataset.size() / optimizer.SGD_BLOCK_SIZE) != 0) return;
final double model0_dist = Math.sqrt(model0.theta.l2(e.model.theta));
final double model_true_dist = Math.sqrt(model_true.theta.l2(e.model.theta));
double fullds_loglikelihood = e.fullds_loglikelihood;
int fullds_nobservations_correct = e.fullds_nobservations_correct;
if (fullds_loglikelihood == 0.) {
final GPFLinearOptimization.DatasetGradientValue gradV = optimizer.evalDatasetGradientValue(e.model, dataset, false);
fullds_loglikelihood = gradV.loglikelihood;
fullds_nobservations_correct = gradV.nObservations;
}
System.out.println("" + (new Date()) +
"\t" + e.iter + "(" + (e.iter * optimizer.SGD_BLOCK_SIZE / dataset.size()) + "/" + (iteration_count * optimizer.SGD_BLOCK_SIZE / dataset.size()) + ")" +
"\tL=" + Math.exp(-fullds_loglikelihood) +
"\teta=" + e.step_size +
"\tmodel0_dist=" + model0_dist +
"\tmodel_true_dist=" + model_true_dist +
(optimizer.do_ignore_improbable_sessions ? "" : "\timprobable_obs=" + (fullds_nobservations_all - fullds_nobservations_correct) + "(" + (fullds_nobservations_all - fullds_nobservations_correct)/(float)fullds_nobservations_all + ")") +
"\tL_partial=" + Math.exp(-e.loglikelihood) +
"\tgrad_norm=" + VecTools.norm(e.gradient) +
"\tgrad=[" + e.gradient + "]");
}
@Override
public void backstepPerformed(final GPFLinearOptimization.IterationEvent e) {
System.out.println(" L > last_L: " + Math.exp(-e.fullds_loglikelihood) + " > " + Math.exp(-e.loglikelihood) + ", go back and set a_m = " + optimizer.step_a_m);
}
};
final GPFLinearModel model_optimized = optimizer.StochasticGradientDescent(model0, dataset, iteration_count);
final long t3 = System.currentTimeMillis();
System.out.println("time optimization: " + (t3-t2)/1000 + " sec");
final double model_final_expll = Math.exp(-optimizer.evalDatasetGradientValue(model_optimized, dataset, false).loglikelihood);
System.out.println("final loglikelihood: " + model_final_expll);
assertEquals(10.3, model_final_expll, 0.1);
}
}
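/**
* Fits the model on real clicks from a 100-session sample of the log and tracks
* exp(-loglikelihood) on a second, held-out sample; the asserted values pin the expected train
* and test figures for the fixed random seed.
*/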
@Test
public void testOptimizeSGD() throws IOException {
final List<Session<BlockV1>> dataset_nonfinal;
try (InputStream is = new GZIPInputStream(WebLogV1GPFSession.class.getResourceAsStream("ses_100k_simple_rand1_h10k.dat.gz"))) {
dataset_nonfinal = WebLogV1GPFSession.loadDatasetFromJSON(is, new GPFLinearModel(), 100);
}
final List<Session<BlockV1>> dataset = dataset_nonfinal;
final List<Session<BlockV1>> test_dataset_nonfinal;
try (InputStream is = new GZIPInputStream(WebLogV1GPFSession.class.getResourceAsStream("ses_100k_simple_rand2_h10k.dat.gz"))) {
test_dataset_nonfinal = WebLogV1GPFSession.loadDatasetFromJSON(is, new GPFLinearModel(), 100);
}
final List<Session<BlockV1>> test_dataset = test_dataset_nonfinal;
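// Optional preprocessing: when enabled, sortUniqueClicks() presumably deduplicates and sorts the
// clicks of every session before fitting.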
final boolean test_sorted_clicks_model = false;
if (test_sorted_clicks_model) {
System.out.println("test_sorted_clicks_model");
for (final Session<BlockV1> ses : dataset)
ses.sortUniqueClicks();
for (final Session<BlockV1> ses : test_dataset)
ses.sortUniqueClicks();
}
int nObservations = 0;
for (final Session<BlockV1> ses : dataset)
nObservations += ses.getClick_indexes().length + 1;
final int fullds_nobservations_all = nObservations;
int n_sum_clicks = 0;
for (final Session<BlockV1> ses : dataset)
n_sum_clicks += ses.getClick_indexes().length;
System.out.println("dataset size: " + dataset.size() + " sessions, avg " + (n_sum_clicks / (float)dataset.size()) + " clicks/session");
final FastRandom rand = new FastRandom(random_seed);
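// best_ll / test_ll start from a large sentinel value so the first (and here only) try always
// records its result.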
double best_ll = 1111;
double test_ll = 1111;
for (int ntry = 0; ntry < 1; ntry++) {
System.out.println("########################################################\n");
System.out.println("" + new Date() + ": ntry: " + ntry + "\n");
// generate random model
final GPFLinearModel model0 = new GPFLinearModel();
model0.trainClickProbability(dataset);
for (int i = 0; i < model0.NFEATS; i++)
model0.theta.set(i, rand.nextGaussian());
final GPFLinearOptimization optimizer = new GPFLinearOptimization();
final long t1 = System.currentTimeMillis();
final double model0_expll = Math.exp(-optimizer.evalDatasetGradientValue(model0, dataset, false).loglikelihood);
System.out.println("model0 loglikelihood: " + model0_expll);
assertEquals(13.3, model0_expll, 0.1);
final long t2 = System.currentTimeMillis();
System.out.println("time loglikelihood eval: " + (t2-t1) + " ms");
final int iteration_dataset_pass_count = 10;
optimizer.SGD_BLOCK_SIZE = 1;
final int iteration_count = iteration_dataset_pass_count * dataset.size() / optimizer.SGD_BLOCK_SIZE;
optimizer.step_eta0 = 0.1; //0.01;
optimizer.step_gamma = 0.75;
optimizer.step_a = dataset.size() / optimizer.SGD_BLOCK_SIZE;
model0.PRUNE_A_THRESHOLD = 1E-5;
System.out.println("optimizer.SGD_BLOCK_SIZE = " + optimizer.SGD_BLOCK_SIZE);
System.out.println("optimizer.step_eta0 = " + optimizer.step_eta0);
System.out.println("optimizer.step_a = " + optimizer.step_a);
System.out.println("optimizer.step_gamma = " + optimizer.step_gamma);
System.out.println("model0.PRUNE_A_THRESHOLD = " + model0.PRUNE_A_THRESHOLD);
optimizer.listener = new GPFLinearOptimization.IterationEventListener() {
@Override
public void iterationPerformed(final GPFLinearOptimization.IterationEvent e) {
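// Logging throttle: stay silent during the first dataset pass, then log once per full pass.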
final int iterations_per_dataset = dataset.size() / optimizer.SGD_BLOCK_SIZE;
if (optimizer.SGD_BLOCK_SIZE < dataset.size()) {
if (e.iter < iterations_per_dataset) {
return;
//if (e.iter % (iterations_per_dataset / 20) != 0)
// return;
} else { // e.iter >= iterations_per_dataset
if (e.iter % (dataset.size() / optimizer.SGD_BLOCK_SIZE) != 0)
return;
}
}
final double model0_dist = Math.sqrt(model0.theta.l2(e.model.theta));
double fullds_loglikelihood = e.fullds_loglikelihood;
int fullds_nobservations_correct = e.fullds_nobservations_correct;
if (fullds_loglikelihood == 0.) {
final GPFLinearOptimization.DatasetGradientValue gradV = optimizer.evalDatasetGradientValue(e.model, dataset, false);
fullds_loglikelihood = gradV.loglikelihood;
fullds_nobservations_correct = gradV.nObservations;
}
double test_dataset_ll = 0;
if (test_dataset != null)
test_dataset_ll = optimizer.evalDatasetGradientValue(e.model, test_dataset, false).loglikelihood;
System.out.println("" + (new Date()) +
"\t" + e.iter + "(" + (e.iter * optimizer.SGD_BLOCK_SIZE / dataset.size()) + "/" + (iteration_count * optimizer.SGD_BLOCK_SIZE / dataset.size()) + ")" +
"\tL=" + Math.exp(-fullds_loglikelihood) +
"\teta=" + e.step_size +
(test_dataset != null ? "\ttest_L=" + Math.exp(-test_dataset_ll) : "") +
"\tmodel0_dist=" + model0_dist +
(optimizer.do_ignore_improbable_sessions ? "" : "\timprobable_obs=" + (fullds_nobservations_all - fullds_nobservations_correct) + "(" + (fullds_nobservations_all - fullds_nobservations_correct)/(float)fullds_nobservations_all + ")") +
"\tL_partial=" + Math.exp(-e.loglikelihood) +
"\tgrad_norm=" + VecTools.norm(e.gradient) +
""); //"\tgrad=[" + e.gradient + "]");
if (e.iter % (iterations_per_dataset * 20) == 0)
System.out.println(">>current model: " + e.model.explainTheta());
}
@Override
public void backstepPerformed(final GPFLinearOptimization.IterationEvent e) {
System.out.println(" L > last_L: " + Math.exp(-e.fullds_loglikelihood) + " > " + Math.exp(-e.loglikelihood) + ", go back and set a_m = " + optimizer.step_a_m);
}
};
final GPFLinearModel model_optimized = optimizer.StochasticGradientDescent(model0, dataset, iteration_count);
final long t3 = System.currentTimeMillis();
System.out.println("time optimization: " + (t3-t2)/1000 + " sec");
final double ll = Math.exp(-optimizer.evalDatasetGradientValue(model_optimized, dataset, false).loglikelihood);
System.out.println("final loglikelihood: " + ll);
System.out.println("final theta: " + model_optimized.theta );
System.out.println("final theta explain: " + model_optimized.explainTheta() );
if (ll < best_ll) {
best_ll = ll;
test_ll = Math.exp(-optimizer.evalDatasetGradientValue(model_optimized, test_dataset, false).loglikelihood);
}
}
System.out.println("" + new Date() + ": best ll: " + best_ll);
assertEquals(5.2, best_ll, 0.1);
assertEquals(4.9, test_ll, 0.1);
}
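/**
* Loads a previously optimized theta vector, then prints human-readable explanations of the
* model's probabilities for a hand-built SERP and for the first few sessions of the dataset.
* There are no assertions: the output is meant for manual inspection.
*/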
@Test
public void testSERPProbs() throws IOException {
final GPFLinearModel model = new GPFLinearModel();
final List<Session<BlockV1>> dataset;
try (InputStream is = new GZIPInputStream(WebLogV1GPFSession.class.getResourceAsStream("ses_100k_simple_rand1_h10k.dat.gz"))) {
dataset = WebLogV1GPFSession.loadDatasetFromJSON(is, new GPFLinearModel(), 100);
}
// init model
model.trainClickProbability(dataset);
// optimized sort_clicks
//String theta_str = "-1.702713106887966 0.6404247678125509 0.8839508435362965 0.21594343210697917 -0.7358391375584755 -0.8042641035860408 0.3583499027340962 -0.11674658767248532 0.14040130919303337 0.03769440360443547 0.03935260864525687 0.02853823412929953 2.9422529205133463 -2.874955306294313 3.9219531435872557 0.15268044240500608 -1.3913468238331568 0.7810782232327959 0.041232789657154746 -0.45027594953466205 -0.9911457338442456 -1.0436641653093275 -1.283091206075993 -1.2334707757320833 -2.0290071795725835 -1.2284048134884975 -0.8402670201797776 0.09780373481660343 -0.6956589612984125 -0.7010852279098979 0.39816008299399064 -0.03645514963018488 -0.7665757899838521 -0.44931334579482907 1.2404606430397838 2.1373765546696415 -2.3185767965067376 -0.3782845023765775";
// optimized r602_2.out
final String theta_str = "-0.9205664691357801 0.9041998193447492 1.0046610326248397 0.29671349018552656 -0.18053090095708907 0.1772697097979266 -0.20372762113889378 -0.7347344786004694 -0.590408428912083 -0.7299015246974587 -0.792449157275554 -0.7089522500922206 2.2627922543859196 -3.100817014916263 4.067342185744358 -2.5792603725334557 1.1895147789581328 1.3296377365812424 0.7446332963557005 0.1835711196264189 0.016013162804432185 -0.18441528045214423 -0.6162991227657141 -0.6939594938332577 1.0894364501659024 0.3579520755136945 0.496534915034393 0.4146067640917571 0.3836857168202354 -0.015704278848354097 0.0913408379926171 -0.04884275707431338 -0.12681530930644924 -0.30889371408471994 0.7818935938652342 2.008411165741512 -4.581704099106069 0.7613830127598948";
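// Parse the space-separated weights into the model's NFEATS-dimensional theta vector.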
final String[] theta_str_arr = theta_str.split(" ");
final ArrayVec theta = new ArrayVec(model.NFEATS);
for (int i = 0; i < theta.dim(); i++)
theta.set(i, Double.parseDouble(theta_str_arr[i]));
model.theta.assign(theta);
// init session
final Session<BlockV1> session = new Session<>();
final BlockV1[] blocks = new BlockV1[11];
for (int i = 0; i < blocks.length; i++) {
blocks[i] = new BlockV1(
Session.BlockType.RESULT,
i == 3 ? BlockV1.ResultType.IMAGES : BlockV1.ResultType.WEB,
i,
i <= 3 ? BlockV1.ResultGrade.RELEVANT_PLUS : BlockV1.ResultGrade.NOT_ASED);
}
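// Hand-built SERP: 11 RESULT blocks, position 3 is an IMAGES block, positions 0-3 are graded
// RELEVANT_PLUS, and clicks are placed on positions 3, 2, 6 and 10.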
final int[] clicks = new int[] {3, 2, 6, 10};
WebLogV1GPFSession.setSessionData(session, blocks, clicks);
System.out.println(model.explainTheta() + "\n");
System.out.println("selected session");
System.out.println(model.explainSessionProb(session));
for (int i = 0; i < 5; i++) {
System.out.println("\n\nsession #" + (i+1));
System.out.println(model.explainSessionProb(dataset.get(i)));
}
}
}