/*
TagRecommender:
A framework to implement and evaluate algorithms for the recommendation
of tags.
Copyright (C) 2013 Dominik Kowald
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import cc.mallet.topics.ParallelTopicModel;
import cc.mallet.types.InstanceList;
import common.Bookmark;
import common.CalculationType;
import common.Features;
import common.Utilities;
import engine.Algorithm;
import engine.EngineInterface;
import engine.EntityRecommenderEngine;
import engine.EntityType;
import engine.TagRecommenderEvalEngine;
import file.BookmarkReader;
import file.BookmarkSplitter;
import file.postprocessing.CatDescFiltering;
import file.preprocessing.BibsonomyProcessor;
import file.preprocessing.CiteULikeProcessor;
import file.preprocessing.LastFMProcessor;
import file.preprocessing.MovielensProcessor;
import file.preprocessing.PintsProcessor;
import file.preprocessing.TensorProcessor;
import itemrecommendations.CFResourceCalculator;
import itemrecommendations.CIRTTCalculator;
import itemrecommendations.HuangCalculator;
import itemrecommendations.MPResourceCalculator;
import itemrecommendations.SustainCalculator;
import itemrecommendations.ZhengCalculator;
import processing.BLLCalculator;
import processing.CFTagRecommender;
import processing.ContentBasedCalculator;
import processing.FolkRankCalculator;
import processing.GIRPTMCalculator;
import processing.MPCalculator;
import processing.MPurCalculator;
import processing.MalletCalculator;
import processing.MetricsCalculator;
import processing.RecencyCalculator;
import processing.ThreeLTCalculator;
import processing.analyzing.UserTagDistribution;
import processing.hashtag.HashtagRecommendationEngine;
import processing.hashtag.analysis.ProcessFrequencyRecency;
import processing.hashtag.analysis.ProcessFrequencyRecencySocial;
import processing.hashtag.social.SocialStrengthCalculator;
import processing.hashtag.solr.CFSolrHashtagCalculator;
import processing.hashtag.solr.SolrHashtagCalculator;
import processing.hashtag.solr.Tweet;
public class Pipeline {
// are set automatically in code
private static int TRAIN_SIZE;
private static int TEST_SIZE;
// set for postprocessing (number of bookmarks - null is nothing)
private final static Integer MIN_USER_BOOKMARKS = null;
private final static Integer MAX_USER_BOOKMARKS = null;
private final static Integer MIN_RESOURCE_BOOKMARKS = null;
private final static Integer MAX_RESOURCE_BOOKMARKS = null;
// set for categorizer/describer split (true is describer, false is
// categorizer - null for nothing)
private final static Boolean DESCRIBER = null;
// placeholder for the topic posfix
private static String TOPIC_NAME = null;
// placeholder for the used dataset
private final static String DATASET = "twitter";
private final static String SUBDIR = "/researchers";
/**
 * Command-line entry point of the evaluation pipeline.
 * <p>
 * Prints the AGPL license banner and then dispatches on the arguments:
 * args[0] = operation to run (e.g. "cf", "bll_c", "lda", "core", "stats", ...),
 * args[1] = dataset key (e.g. "cul", "bib", "twitter_res", ...),
 * args[2] = sample file name inside the dataset's core directory.
 * Exits early with a message if fewer than 3 arguments are given or the
 * dataset key / operation is unknown.
 */
public static void main(String[] args) {
    System.out.println(
            "TagRecommender:\n" + "" + "A framework to implement and evaluate algorithms for the recommendation\n"
                    + "of tags." + "Copyright (C) 2013 - 2015 Dominik Kowald\n\n"
                    + "This program is free software: you can redistribute it and/or modify\n"
                    + " it under the terms of the GNU Affero General Public License as published by\n"
                    + "the Free Software Foundation, either version 3 of the License, or\n"
                    + "(at your option) any later version.\n\n"
                    + "This program is distributed in the hope that it will be useful,\n"
                    + "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
                    + "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
                    + "GNU Affero General Public License for more details.\n\n"
                    + "You should have received a copy of the GNU Affero General Public License\n"
                    + "along with this program. If not, see <http://www.gnu.org/licenses/>.\n"
                    + "-----------------------------------------------------------------------------\n\n");
    // default paths derived from the class-level DATASET/SUBDIR constants
    // (note: these are unused by the argument-driven dispatch below; they are
    // kept for the manual "Method Testing" calls that can be uncommented)
    String dir = DATASET + "_core" + SUBDIR + "/";
    String path = dir + DATASET + "_sample";
    String networkFileName = "./data/csv/" + dir + "network.txt";
    String solrServerNameWithPort = ""; // only necessary when solr core is used
    // Method Testing -> just uncomment the methods you want to test
    // Test the BLL and BLL+MP_r algorithms (= baseline to beat :))
    // startActCalculator(dir, path, 1, 0.5, null, -5, false,
    // CalculationType.NONE, false);
    // Test the BLL_AC and BLL_AC+MP_r algorithms (could take a while)
    // startActCalculator(dir, path, 1, -5, -5, false,
    // CalculationType.USER_TO_RESOURCE_ONLY, false);
    // startActCalculator(dir, path, 1, 0.5, null, -5, true,
    // CalculationType.USER_TO_RESOURCE, false);
    // Test the MR approach
    // startRecCalculator(dir, path);
    // Test the GIRP and GIRPTM algorithms
    // startGirpCalculator(dir, path, true);
    // Test the MP_u, MP_r and MP_u_r algorithms
    // startModelCalculator(dir, path, 1, -5, true);
    // Test the MP algorithm
    // startBaselineCalculator(dir, path, 1, true);
    // Test the CF_u, CF_r and CF_u_r algorithms with 20 neighbors (change
    // it if you want)
    // startCfTagCalculator(dir, path, 1, 20, -5, false);
    // Test the PR and FR algorithms
    // startFolkRankCalculator(dir, path, 1);
    // Test the LDA algorithm with 1000 topics (change it if you want)
    // startLdaCalculator(dir, path, 1000, 1, false);
    // Test the 3L algorithm
    // start3LayersJavaCalculator(dir, path, "", 1, -5, -5, true, false,
    // false);
    // Test the 3L_tag algorithm
    // start3LayersJavaCalculator(dir, path, "", 1, -5, -5, true, true,
    // false);
    // Test the 3LT_topic algorithm
    // start3LayersJavaCalculator(dir, path, "", 1, -5, -5, true, false,
    // true);
    // Commandline Arguments
    if (args.length < 3) {
        System.out.println("Too few arguments!");
        return;
    }
    String subdir = "/";
    String op = args[0];
    String samplePath = "", sampleDir = "", sampleNetwork = "";
    int sampleCount = 1;
    // map the dataset key (args[1]) to its core directory (and sub-directory
    // for the two twitter variants)
    if (args[1].equals("cul")) {
        sampleDir = "cul_core";
    } else if (args[1].equals("flickr")) {
        sampleDir = "flickr_core";
    } else if (args[1].equals("bib")) {
        sampleDir = "bib_core";
    } else if (args[1].equals("wiki")) {
        sampleDir = "wiki_core";
    } else if (args[1].equals("ml")) {
        sampleDir = "ml_core";
    } else if (args[1].equals("lastfm")) {
        sampleDir = "lastfm_core";
    } else if (args[1].equals("del")) {
        sampleDir = "del_core";
    } else if (args[1].equals("twitter_res")) {
        sampleDir = "twitter_core";
        subdir = "/researchers";
    } else if (args[1].equals("twitter_gen")) {
        sampleDir = "twitter_core";
        subdir = "/general";
    } else {
        System.out.println("Dataset not available");
        return;
    }
    sampleDir += subdir;
    samplePath += (sampleDir + "/" + args[2]);
    sampleNetwork = "./data/csv/" + sampleDir + "/network.txt";
    // flickr and both twitter variants are treated as narrow folksonomies;
    // resource-based variants are disabled for them below
    boolean narrowFolksonomy = args[1].equals("flickr") || args[1].contains("twitter");
    // dispatch on the requested operation (args[0])
    if (op.equals("cf")) {
        startCfTagCalculator(sampleDir, samplePath, sampleCount, 20, -5, false);
    } else if (op.equals("cfr")) {
        startCfTagCalculator(sampleDir, samplePath, sampleCount, 20, -5, !narrowFolksonomy);
    } else if (op.equals("fr")) {
        startFolkRankCalculator(sampleDir, samplePath, sampleCount);
    } else if (op.equals("bll_c")) {
        startActCalculator(sampleDir, samplePath, sampleCount, 0.5, null, -5, !narrowFolksonomy,
                CalculationType.NONE, true);
    } else if (op.equals("bll_c_ac")) {
        if (!narrowFolksonomy) {
            startActCalculator(sampleDir, samplePath, sampleCount, 0.5, null, -5, !narrowFolksonomy,
                    CalculationType.USER_TO_RESOURCE, true);
        }
    } else if (op.equals("girptm")) {
        startGirpCalculator(sampleDir, samplePath, !narrowFolksonomy);
    } else if (op.equals("mp_ur")) {
        startModelCalculator(sampleDir, samplePath, sampleCount, -5, !narrowFolksonomy);
    } else if (op.equals("mp")) {
        startBaselineCalculator(sampleDir, samplePath, sampleCount, true);
    } else if (op.equals("3layers")) {
        start3LayersJavaCalculator(sampleDir, samplePath, "", sampleCount, -5, -5, !narrowFolksonomy, false, false);
    } else if (op.equals("3LT")) {
        start3LayersJavaCalculator(sampleDir, samplePath, "", sampleCount, -5, -5, !narrowFolksonomy, true, false);
        start3LayersJavaCalculator(sampleDir, samplePath, "", sampleCount, -5, -5, !narrowFolksonomy, false, true);
    } else if (op.equals("lda")) {
        startLdaCalculator(sampleDir, samplePath, 1000, sampleCount, !narrowFolksonomy);
    } else if (op.equals("lda_samples")) {
        createLdaSamples(samplePath, sampleCount, 1000, true, false);
    } else if (op.equals("tensor_samples")) {
        writeTensorFiles(samplePath, true);
    } else if (op.equals("mymedialite_samples")) {
        writeTensorFiles(samplePath, false);
    } else if (op.equals("core")) {
        // p-core filtering of the dataset (3/3/3)
        BookmarkSplitter.calculateCore(samplePath, samplePath, 3, 3, 3);
    } else if (op.equals("split_l1o")) {
        // leave-one-out train/test split
        BookmarkSplitter.splitSample(samplePath, samplePath, sampleCount, 0, true, false, true, null, sampleNetwork);
    } else if (op.equals("split_8020")) {
        // 80/20 train/test split
        BookmarkSplitter.splitSample(samplePath, samplePath, sampleCount, 20, false, false, true, null, sampleNetwork);
    } else if (op.equals("percentage_sample")) {
        BookmarkSplitter.drawUserPercentageSample(samplePath, 3, 1);
    } else if (op.equals("process_bibsonomy")) {
        // raw dataset pre-processing operations
        BibsonomyProcessor.processUnsortedFile(sampleDir, "tas", args[2]);
    } else if (op.equals("process_citeulike")) {
        CiteULikeProcessor.processFile("current", args[2]);
    } else if (op.equals("process_lastfm")) {
        LastFMProcessor.processFile("user_taggedartists-timestamps.dat", args[2]);
    } else if (op.equals("process_ml")) {
        MovielensProcessor.processFile("tags.dat", args[2], "ratings.dat");
    } else if (op.equals("process_del")) {
        PintsProcessor.processFile(sampleDir, "delicious", args[2]);
    } else if (op.equals("process_flickr")) {
        PintsProcessor.processFile(sampleDir, "flickr", args[2]);
    } else if (op.equals("item_mp")) {
        // item (resource) recommendation operations
        startBaselineCalculatorForResources(sampleDir, samplePath, sampleCount, false, false);
    } else if (op.equals("item_cft")) {
        startCfResourceCalculator(sampleDir, samplePath, sampleCount, 20, true, false, false, false, Features.TAGS,
                false);
    } else if (op.equals("item_cfb")) {
        startCfResourceCalculator(sampleDir, samplePath, sampleCount, 20, true, false, false, false,
                Features.ENTITIES, false);
    } else if (op.equals("item_cbt")) {
        TOPIC_NAME = "lda_500";
        startCfResourceCalculator(sampleDir, samplePath, 1, 20, false, true, false, false, Features.TOPICS, false);
    } else if (op.equals("item_zheng")) {
        startZhengResourceCalculator(sampleDir, samplePath, sampleCount);
    } else if (op.equals("item_huang")) {
        startHuangResourceCalculator(sampleDir, samplePath, sampleCount);
    } else if (op.equals("item_cirtt")) {
        startResourceCIRTTCalculator(sampleDir, samplePath, "", sampleCount, 20, Features.ENTITIES, false, true,
                false, true);
    } else if (op.equals("item_sustain")) {
        startSustainApproach(sampleDir, samplePath, 2.845, 0.5, 6.396, 0.0936, 0, 0, 20, 0.5);
    } else if (op.equals("stats")) {
        try {
            getStatistics(samplePath, false);
        } catch (Exception e) {
            e.printStackTrace();
        }
    } else if (op.equals("hashtag_analysis")) {
        // hashtag recommendation operations (twitter datasets)
        analysisSocial(sampleDir, samplePath, sampleNetwork, "all", null);
    } else if (op.equals("hashtag_hybrid")) {
        startSocialRecommendation(sampleDir, samplePath, sampleNetwork, "hybrid", 0.5, null, 0.5, null, null, null);
    } else if (op.equals("hashtag_socialmp")) {
        startSocialRecommendation(sampleDir, samplePath, sampleNetwork, "social_freq", 0.5, null, 0.5, null, null, null);
    } else if (op.equals("hashtag_socialbll")) {
        startSocialRecommendation(sampleDir, samplePath, sampleNetwork, "social", 0.5, null, 0.5, null, null, null);
    } else if (op.equals("hashtag_social_recency")) {
        startSocialRecommendation(sampleDir, samplePath, sampleNetwork, "social_recency", 0.5, null, 0.5, null,
                null, null);
    } else if (op.equals("hashtag_cb_res")) {
        startSocialRecommendation(sampleDir, samplePath, sampleNetwork, "hybrid", 1.699, null, 1.242, null,
                solrServerNameWithPort, "researcher");
    } else if (op.equals("hashtag_cb_gen")) {
        startSocialRecommendation(sampleDir, samplePath, sampleNetwork, "hybrid", 1.723, null, 1.269, null,
                solrServerNameWithPort, "general");
    } else {
        System.out.println("Unknown operation");
    }
}
// Tag Recommenders methods
// ---------------------------------------------------------------------------------------------------------------------------------------------
/**
 * Runs the Solr-based hashtag prediction and writes its metrics.
 * The predicted-sample suffix returned by the calculator is passed on to
 * {@code writeMetrics} under the "{sampleDir}/{solrCore}" sample name.
 */
private static void startSolrHashtagCalculator(String sampleDir, String samplePath, String solrUrl, String solrCore, boolean train, boolean hours, Integer mostRecentTweets) {
    String suffix;
    if (train) {
        suffix = SolrHashtagCalculator.predictTrainSample(sampleDir, solrCore, solrUrl, hours, mostRecentTweets);
    } else {
        // using test set content!
        suffix = SolrHashtagCalculator.predictSample(sampleDir, solrCore, solrUrl);
    }
    writeMetrics(sampleDir, sampleDir + "/" + solrCore, suffix, 1, 10, null, null, null);
}
/**
 * Convenience driver that evaluates every tag-recommender approach in the
 * framework on one sample, in a fixed order (see the per-line labels).
 *
 * @param sampleDir directory of the sample (under ./data)
 * @param samplePath path/name of the sample files
 * @param all whether the resource-based variants should be evaluated too
 */
private static void startAllTagRecommenderApproaches(String sampleDir, String samplePath, boolean all) {
    startBaselineCalculator(sampleDir, samplePath, 1, true); // MP
    startModelCalculator(sampleDir, samplePath, 1, -5, all); // MPur
    startGirpCalculator(sampleDir, samplePath, all); // GIRPTM
    startActCalculator(sampleDir, samplePath, 1, 0.5, null, -5, all, CalculationType.NONE, true); // BLL
    startActCalculator(sampleDir, samplePath, 1, 0.5, null, -5, all, CalculationType.USER_TO_RESOURCE, true); // BLLac
    start3LayersJavaCalculator(sampleDir, samplePath, "", 1, -5, -5, all, false, false); // 3L
    start3LayersJavaCalculator(sampleDir, samplePath, "", 1, -5, -5, all, true, false); // 3LTtop
    start3LayersJavaCalculator(sampleDir, samplePath, "", 1, -5, -5, all, false, true); // 3LTtag
    startCfTagCalculator(sampleDir, samplePath, 1, 20, -5, false); // CFur
    startFolkRankCalculator(sampleDir, samplePath, 1); // APR+FR
    startLdaCalculator(sampleDir, samplePath, 1000, 1, all); // LDA
}
/**
 * Evaluates a reduced selection of the tag-recommender approaches
 * (MPur, GIRPTM, BLL_AC and FolkRank) on one sample.
 *
 * @param all whether the resource-based variants should be evaluated too
 */
private static void startSampleTagRecommenderApproaches(String sampleDir, String samplePath, boolean all) {
    startModelCalculator(sampleDir, samplePath, 1, -5, all); // MPur
    startGirpCalculator(sampleDir, samplePath, all); // GIRPTM
    startActCalculator(sampleDir, samplePath, 1, 0.5, null, -5, all, CalculationType.USER_TO_RESOURCE, false); // BLLac
    startFolkRankCalculator(sampleDir, samplePath, 1); // APR+FR
}
/**
 * Evaluates the BLL family of algorithms (BLL, BLL_AC, and their "+C"
 * combinations for each beta value when {@code all} is set).
 *
 * @param dVal the BLL time-decay exponent
 * @param betaUpperBound negative value selects a single beta (see getBetaValues)
 * @param type CalculationType selecting the associative-component variant
 * @param allMetrics whether the reader is handed to writeMetrics for full metrics
 */
private static void startActCalculator(String sampleDir, String sampleName, int sampleCount, double dVal,
        Double lambda, int betaUpperBound, boolean all, CalculationType type, boolean allMetrics) {
    getTrainTestSize(sampleName);
    List<Integer> betaValues = getBetaValues(betaUpperBound);
    String acSuffix = (type == CalculationType.USER_TO_RESOURCE) ? "_ac" : "";
    for (int run = 1; run <= sampleCount; run++) {
        BookmarkReader reader = BLLCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE, true, false, dVal, 5, type, lambda);
        String prefix = (type == CalculationType.USER_TO_RESOURCE_ONLY)
                ? "ac_5_5"
                : "bll" + acSuffix + "_" + 5 + "_" + dVal;
        writeMetrics(sampleDir, sampleName, prefix, sampleCount, 10, null, allMetrics ? reader : null, null);
        if (!all) {
            continue;
        }
        // combined BLL+C variant, once per beta value
        for (int betaVal : betaValues) {
            reader = BLLCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE, true, true, dVal, betaVal,
                    type, lambda);
            writeMetrics(sampleDir, sampleName, "bll_c" + acSuffix + "_" + betaVal + "_" + dVal, sampleCount, 10,
                    null, allMetrics ? reader : null, null);
        }
    }
}
/**
 * Runs the social hashtag-recommendation algorithms on one sample.
 * <p>
 * If {@code algo} is null, all social variants are evaluated; otherwise only
 * the given one. When {@code solrUrl} and {@code solrCore} are set,
 * content-based predictions are loaded (from a serialized cache file if
 * present, otherwise computed via Solr) and blended into the recommendation.
 *
 * @param dIndividual BLL decay for the individual component
 * @param dSocial BLL decay for the social component
 */
private static void startSocialRecommendation(String sampleDir, String sampleName, String networkFilename,
        String algo, double dIndividual, Double lambdaIndividual, double dSocial, Double lambdaSocial,
        String solrUrl, String solrCore) {
    double betaBLL = 0.5;
    double betaCB = 0.3;
    String[] algos = null;
    if (algo == null) {
        algos = new String[] { "social_freq", "social", "hybrid", "social_recency", "social_link_weight" };
    } else {
        algos = new String[] { algo };
    }
    // FIX: print the array contents, not the array reference ("[Ljava.lang.String;@...")
    System.out.println("algos >> " + Arrays.toString(algos));
    getTrainTestSize(sampleName);
    Map<Integer, Map<Integer, Double>> contentBasedValues = null;
    if (solrUrl != null && solrCore != null) {
        BookmarkReader reader = new BookmarkReader(0, false);
        reader.readFile(sampleName);
        if (new File("./data/results/" + sampleDir + "/" + solrCore + "_cbpredictions.ser").exists()) {
            // reuse the cached, serialized content-based predictions
            System.out.println("Found cb file ...");
            contentBasedValues = SolrHashtagCalculator.deSerializeHashtagPrediction(
                    "./data/results/" + sampleDir + "/" + solrCore + "_cbpredictions.ser");
        } else {
            contentBasedValues = SolrHashtagCalculator.getNormalizedHashtagPredictions(sampleDir, solrCore, solrUrl,
                    reader, null);
            writeMetrics(sampleDir, sampleDir + "/" + solrCore, "solrht_normalized", 1, 10, null, null, null);
        }
        System.out.println("Number of content-based recommendations: " + contentBasedValues.size());
    }
    for (String a : algos) {
        System.out.println("Algorithm >> " + a);
        HashtagRecommendationEngine calculator = null;
        // choose the engine constructor matching the algorithm variant
        if ("social_link_weight".equals(a)) {
            String mentionFilename = "./data/csv/" + sampleDir + "/mentionNetwork.txt";
            String retweetFilename = "./data/csv/" + sampleDir + "/retweetNetwork.txt";
            String replyFilename = "./data/csv/" + sampleDir + "/replyNetwork.txt";
            System.out.println("Social init ... ");
            calculator = new HashtagRecommendationEngine(sampleDir, sampleName, networkFilename, mentionFilename,
                    retweetFilename, replyFilename, TRAIN_SIZE, TEST_SIZE, dIndividual, lambdaIndividual);
            System.out.println("Social init done ... ");
        } else if ("social_top_per_temp".equals(a)) {
            System.out.println("Solr Core >> " + solrCore);
            System.out.println("Solr Url >> " + solrUrl);
            calculator = new HashtagRecommendationEngine(sampleDir, sampleName, networkFilename, solrUrl, solrCore,
                    TRAIN_SIZE, TEST_SIZE, dIndividual, lambdaIndividual);
        } else if ("hybrid_link".equals(a)) {
            String mentionFilename = "./data/csv/" + sampleDir + "/mentionNetwork.txt";
            String retweetFilename = "./data/csv/" + sampleDir + "/retweetNetwork.txt";
            String replyFilename = "./data/csv/" + sampleDir + "/replyNetwork.txt";
            SocialStrengthCalculator socialStrengthCalculator = new SocialStrengthCalculator(mentionFilename,
                    retweetFilename, replyFilename);
            calculator = new HashtagRecommendationEngine(sampleDir, sampleName, networkFilename, TRAIN_SIZE,
                    TEST_SIZE, dIndividual, lambdaIndividual);
            calculator.setSocialStrengthCalculator(socialStrengthCalculator);
        } else {
            System.out.println("Social init ... ");
            calculator = new HashtagRecommendationEngine(sampleDir, sampleName, networkFilename, TRAIN_SIZE,
                    TEST_SIZE, dIndividual, lambdaIndividual);
            System.out.println("Social init done ... ");
        }
        if ("social_top_per_temp".equals(a)) {
            BufferedWriter bw = null;
            try {
                bw = new BufferedWriter(new FileWriter(
                        "./data/results/" + sampleDir + "/social_top_per_temp_etah_etal_ndcg.txt", true));
            } catch (IOException e1) {
                e1.printStackTrace();
            }
            double eta_h = 0.1;
            double eta_l = 0.2;
            String suffix = "social_top_per_temp_" + eta_h + "_" + eta_l + "_" + a;
            System.out.println(" Pipeline >> eta_h " + new DecimalFormat("##.##").format(eta_h) + " >> eta_l >> "
                    + new DecimalFormat("##.##").format(eta_l));
            calculator.setEta_h(eta_h);
            calculator.setEta_l(eta_l);
            calculator.predictSample(betaBLL, betaCB, dSocial, lambdaSocial, a, null, suffix);
            writeMetrics(sampleDir, sampleName, suffix, 1, 10, null, null, null);
            double ndcg10 = MetricsCalculator.getNDCG10();
            String line = eta_h + ";" + eta_l + ";" + ndcg10 + "\n";
            System.out.println(" line >> " + line);
            // FIX: the result line was only printed before; the writer was opened
            // (in append mode) but never written to and never closed — write the
            // line and release the file handle.
            if (bw != null) {
                try {
                    bw.write(line);
                    bw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        } else {
            String suffix = "social" + betaCB + "_" + dSocial + "_" + a;
            calculator.predictSample(betaBLL, betaCB, dSocial, lambdaSocial, a, contentBasedValues, suffix);
            writeMetrics(sampleDir, sampleName, suffix, 1, 10, null, null, null);
            System.out.println("Algorithm done >> " + a);
        }
    }
}
/**
 * Runs the combined collaborative-filtering / content-based (Solr) hashtag
 * predictor and writes its metrics under the "cf_cb_{beta}" prefix.
 *
 * @param beta blending weight passed to the calculator
 */
private static void startCfCbHashtagCalculator(String sampleDir, String sampleName, double beta, String solrUrl,
        String solrCore) {
    getTrainTestSize(sampleName);
    CFSolrHashtagCalculator.predictSample(sampleDir, sampleName, TRAIN_SIZE, beta, solrUrl, solrCore);
    writeMetrics(sampleDir, sampleName, "cf_cb_" + beta, 1, 10, null, null, null);
}
/**
 * Runs the frequency/recency hashtag analysis on a sample.
 * {@code type} selects "social" (network-based), "personal" (per-user),
 * or "all" (both); any other value is a no-op.
 */
private static void analysisSocial(String sampleDir, String sampleName, String networkFilename, String type,
        Integer granularity) {
    getTrainTestSize(sampleName);
    HashtagRecommendationEngine calculator = new HashtagRecommendationEngine(sampleDir, sampleName, networkFilename,
            TRAIN_SIZE, TEST_SIZE, 0.5, null);
    switch (type) {
    case "social":
        new ProcessFrequencyRecencySocial(sampleDir, calculator.getUserTagTimes(), calculator.getNetwork(),
                granularity);
        break;
    case "personal":
        new ProcessFrequencyRecency().ProcessTagAnalytics(sampleDir, calculator.getUserTagTimes(), granularity);
        break;
    case "all":
        new ProcessFrequencyRecency().ProcessTagAnalytics(sampleDir, calculator.getUserTagTimes(), granularity);
        new ProcessFrequencyRecencySocial(sampleDir, calculator.getUserTagTimes(), calculator.getNetwork(),
                granularity);
        break;
    default:
        break;
    }
}
/**
 * Evaluates the GIRP algorithm and, when {@code all} is set, also the
 * GIRPTM variant.
 */
private static void startGirpCalculator(String sampleDir, String sampleName, boolean all) {
    getTrainTestSize(sampleName);
    BookmarkReader girpReader = GIRPTMCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE, true, false);
    writeMetrics(sampleDir, sampleName, "girp", 1, 10, null, girpReader, null);
    if (!all) {
        return;
    }
    BookmarkReader girptmReader = GIRPTMCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE, true, true);
    writeMetrics(sampleDir, sampleName, "girptm", 1, 10, null, girptmReader, null);
}
/**
 * Evaluates the most-popular-tags models: MP_u (user-based), MP_r
 * (resource-based, only when {@code all} is set), and the blended MP_u_r
 * for each beta value.
 *
 * @param betaUpperBound negative value selects a single beta (see getBetaValues)
 */
private static void startModelCalculator(String sampleDir, String sampleName, int sampleCount, int betaUpperBound,
        boolean all) {
    getTrainTestSize(sampleName);
    List<Integer> betaValues = getBetaValues(betaUpperBound);
    BookmarkReader reader = null;
    for (int i = 1; i <= sampleCount; i++) {
        reader = MPurCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE, true, false, 5);
        if (all)
            reader = MPurCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE, false, true, 5);
    }
    // NOTE(review): when 'all' is true, 'reader' at this point holds the
    // resource-based (MP_r) run, so the "mp_u_5" metrics below are computed
    // with the MP_r reader — confirm this is intended.
    writeMetrics(sampleDir, sampleName, "mp_u_" + 5, sampleCount, 10, null, reader, null);
    if (all)
        writeMetrics(sampleDir, sampleName, "mp_r_" + 5, sampleCount, 10, null, reader, null);
    if (all) {
        // blended user+resource model for every beta value
        for (int beta : betaValues) {
            for (int i = 1; i <= sampleCount; i++) {
                reader = MPurCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE, true, true, beta);
            }
            writeMetrics(sampleDir, sampleName, "mp_ur_" + beta, sampleCount, 10, null, reader, null);
        }
    }
}
/**
 * Evaluates the collaborative-filtering tag recommenders: CF_u (user-based),
 * CF_r (resource-based, only when {@code all} is set), and the blended CF
 * for each beta value.
 *
 * @param neighbors neighborhood size for the CF computation
 * @param betaUpperBound negative value selects a single beta (see getBetaValues)
 */
private static void startCfTagCalculator(String sampleDir, String sampleName, int sampleCount, int neighbors,
        int betaUpperBound, boolean all) {
    getTrainTestSize(sampleName);
    List<Integer> betaValues = getBetaValues(betaUpperBound);
    BookmarkReader reader = null;
    for (int i = 1; i <= sampleCount; i++) {
        reader = CFTagRecommender.predictTags(sampleName, TRAIN_SIZE, TEST_SIZE, neighbors, true, false, 5);
        if (all)
            reader = CFTagRecommender.predictTags(sampleName, TRAIN_SIZE, TEST_SIZE, neighbors, false, true, 5);
    }
    // NOTE(review): when 'all' is true, 'reader' at this point holds the
    // resource-based run, so the "usercf_5" metrics below are computed with
    // the resource-CF reader — confirm this is intended.
    writeMetrics(sampleDir, sampleName, "usercf_" + 5, sampleCount, 10, null, reader, null);
    if (all)
        writeMetrics(sampleDir, sampleName, "rescf_" + 5, sampleCount, 10, null, reader, null);
    if (all) {
        // blended user+resource CF for every beta value
        for (int beta : betaValues) {
            for (int i = 1; i <= sampleCount; i++) {
                reader = CFTagRecommender.predictTags(sampleName, TRAIN_SIZE, TEST_SIZE, neighbors, true, true,
                        beta);
            }
            writeMetrics(sampleDir, sampleName, "cf_" + beta, sampleCount, 10, null, reader, null);
        }
    }
}
/**
 * Evaluates the FolkRank and adapted-PageRank algorithms; one prediction
 * run per sample, then metrics are written for both the "fr" and "apr"
 * result prefixes.
 */
private static void startFolkRankCalculator(String sampleDir, String sampleName, int size) {
    getTrainTestSize(sampleName);
    BookmarkReader lastReader = null;
    for (int run = 0; run < size; run++) {
        lastReader = FolkRankCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE);
    }
    writeMetrics(sampleDir, sampleName, "fr", size, 10, null, lastReader, null);
    writeMetrics(sampleDir, sampleName, "apr", size, 10, null, lastReader, null);
}
/**
 * Evaluates the most-popular-tags (MP) baseline; one prediction run per
 * sample, then metrics are written under the "mp" prefix.
 */
private static void startBaselineCalculator(String sampleDir, String sampleName, int size, boolean mp) {
    getTrainTestSize(sampleName);
    BookmarkReader lastReader = null;
    for (int run = 0; run < size; run++) {
        lastReader = MPCalculator.predictPopularTags(sampleName, TRAIN_SIZE, TEST_SIZE, mp);
    }
    writeMetrics(sampleDir, sampleName, "mp", size, 10, null, lastReader, null);
}
/**
 * Evaluates the LDA-based tag recommender with the given number of topics;
 * metrics are written under the "lda_{topics}" prefix.
 */
private static void startLdaCalculator(String sampleDir, String sampleName, int topics, int sampleCount,
        boolean all) {
    getTrainTestSize(sampleName);
    BookmarkReader lastReader = null;
    for (int run = 0; run < sampleCount; run++) {
        lastReader = MalletCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE, topics, true, all);
    }
    writeMetrics(sampleDir, sampleName, "lda_" + topics, sampleCount, 10, null, lastReader, null);
}
/**
 * Evaluates the 3Layers (3L) algorithm family. For every d value it runs the
 * user-based variant; when {@code resBased} is set it additionally runs the
 * resource-based variant for every beta value. {@code tagBLL}/{@code topicBLL}
 * select the BLL-weighted variants and change the result-file suffix.
 *
 * @param topicString optional topic-model postfix appended to the sample name
 * @param dUpperBound / betaUpperBound negative values select a single value
 *        (see getBetaValues, which is reused here for the d values too)
 */
private static void start3LayersJavaCalculator(String sampleDir, String sampleName, String topicString, int size,
        int dUpperBound, int betaUpperBound, boolean resBased, boolean tagBLL, boolean topicBLL) {
    getTrainTestSize(sampleName);
    List<Integer> dValues = getBetaValues(dUpperBound);
    List<Integer> betaValues = getBetaValues(betaUpperBound);
    // suffix encodes which BLL components are active
    String suffix = "layers";
    if (tagBLL && topicBLL) {
        suffix += "bll";
    } else if (tagBLL) {
        suffix += "tagbll";
    } else if (topicBLL) {
        suffix += "topicbll";
    }
    BookmarkReader reader = null;
    for (int i = 1; i <= size; i++) {
        for (int d : dValues) {
            if (resBased) {
                // resource-based variant: one run per (d, beta) combination
                for (int b : betaValues) {
                    reader = ThreeLTCalculator.predictSample(
                            sampleName + (!topicString.isEmpty() ? "_" + topicString : ""), TRAIN_SIZE, TEST_SIZE,
                            d, b, true, true, tagBLL, topicBLL, CalculationType.NONE);
                    writeMetrics(sampleDir, sampleName, suffix + "_" + b + "_" + d, size, 10,
                            !topicString.isEmpty() ? topicString : null, reader, null);
                }
            }
            // user-based variant: one run per d value (fixed beta of 5)
            reader = ThreeLTCalculator.predictSample(sampleName + (!topicString.isEmpty() ? "_" + topicString : ""),
                    TRAIN_SIZE, TEST_SIZE, d, 5, true, false, tagBLL, topicBLL, CalculationType.NONE);
            writeMetrics(sampleDir, sampleName, "user" + suffix + "_" + 5 + "_" + d, size, 10,
                    !topicString.isEmpty() ? topicString : null, reader, null);
        }
    }
}
/**
 * Evaluates the content-based tag recommender; metrics are written under
 * the "cb" prefix.
 */
private static void startContentBasedCalculator(String sampleDir, String sampleName) {
    getTrainTestSize(sampleName);
    BookmarkReader cbReader = ContentBasedCalculator.predictSample(sampleName, TRAIN_SIZE, TEST_SIZE);
    writeMetrics(sampleDir, sampleName, "cb", 1, 10, null, cbReader, null);
}
// Helpers
// -----------------------------------------------------------------------------------------------------------------------------------------------------------
/**
 * Creates {@code size} LDA topic samples for the given sample name with the
 * requested number of topics.
 */
private static void createLdaSamples(String sampleName, int size, int topics, boolean tagrec,
        boolean personalizedTopicCreation) {
    getTrainTestSize(sampleName);
    for (int sampleNo = 0; sampleNo < size; sampleNo++) {
        MalletCalculator.createSample(sampleName, (short) topics, tagrec, TRAIN_SIZE, personalizedTopicCreation);
    }
}
/**
 * Writes tensor-factorization (or MyMediaLite, when {@code tagRec} is false)
 * input files for the sample, applying the class-level categorizer/describer
 * filter when DESCRIBER is configured.
 */
private static void writeTensorFiles(String sampleName, boolean tagRec) {
    getTrainTestSize(sampleName);
    CatDescFiltering filter = (DESCRIBER == null) ? null : CatDescFiltering.instantiate(sampleName, TRAIN_SIZE);
    if (filter != null) {
        filter.setDescriber(DESCRIBER.booleanValue());
    }
    TensorProcessor.writeFiles(sampleName, TRAIN_SIZE, TEST_SIZE, tagRec, MIN_USER_BOOKMARKS, MAX_USER_BOOKMARKS,
            filter);
}
/**
 * Computes and writes the evaluation metrics for a prediction run.
 * For every k in 1..{@code k} it aggregates the metrics over all samples and
 * writes the averages; finally resets the metric accumulators.
 *
 * @param prefix result-file prefix identifying the algorithm
 * @param posfix optional topic-model postfix ("0" or null disables it)
 * @param reader reader of the prediction run, or null to skip per-user metrics
 * @param trainSize non-null when MyMediaLite/tensor files are evaluated
 */
private static void writeMetrics(String sampleDir, String sampleName, String prefix, int sampleCount, int k,
        String posfix, BookmarkReader reader, Integer trainSize) {
    CatDescFiltering filter = null;
    if (DESCRIBER != null) {
        filter = CatDescFiltering.instantiate(sampleName, TRAIN_SIZE);
        filter.setDescriber(DESCRIBER.booleanValue());
    }
    // FIX: compare String content with equals(), not == (reference identity);
    // "0".equals(posfix) is also null-safe.
    String topicString = ((posfix == null || "0".equals(posfix)) ? "" : "_" + posfix);
    for (int i = 1; i <= k; i++) {
        for (int j = 1; j <= sampleCount; j++) {
            MetricsCalculator.calculateMetrics(sampleName + topicString + "_" + prefix, i,
                    sampleDir + "/" + prefix + topicString + "_metrics", false, reader, MIN_USER_BOOKMARKS,
                    MAX_USER_BOOKMARKS, MIN_RESOURCE_BOOKMARKS, MAX_RESOURCE_BOOKMARKS, filter, true, trainSize);
        }
        MetricsCalculator.writeAverageMetrics(sampleDir + "/" + prefix + topicString + "_metrics", i,
                (double) sampleCount, true, i == k, DESCRIBER);
    }
    MetricsCalculator.resetMetrics();
}
// e.g., -5 will be transformed to 0.5 and 2 will be transformed to 0.1 and
// 0.2
/**
 * Expands a beta bound into the list of beta values to evaluate.
 * A negative bound selects exactly one value, its absolute value (e.g. -5
 * yields [5], i.e. beta 0.5); a non-negative bound yields 1..bound
 * (i.e. 0.1, 0.2, ... after the caller's scaling).
 */
private static List<Integer> getBetaValues(int betaUpperBound) {
    List<Integer> betaValues = new ArrayList<Integer>();
    if (betaUpperBound >= 0) {
        for (int value = 1; value <= betaUpperBound; value++) {
            betaValues.add(value);
        }
    } else {
        betaValues.add(-betaUpperBound);
    }
    return betaValues;
}
/**
 * Prints dataset statistics for the full set and for its train/test splits.
 */
private static void getTrainTestStatistics(String dataset) {
    // label and dataset-name suffix per set, evaluated in order
    String[][] sets = { { "FULL SET -----", "" }, { "TRAIN SET -----", "_train" }, { "TEST SET -----", "_test" } };
    for (String[] set : sets) {
        System.out.println(set[0]);
        getStatistics(dataset + set[1], false);
    }
}
/**
 * Prints summary statistics of a dataset (posts, users, resources, tags,
 * tag assignments, topics, averages, time range) and writes the per-user
 * tag distribution. When {@code writeAll} is set, additionally writes a
 * per-user statistics file (resources, topics, topic similarity) for the
 * users of the test set.
 */
private static void getStatistics(String dataset, boolean writeAll) {
    if (TOPIC_NAME != null) {
        // honor the configured topic-model postfix
        dataset += ("_" + TOPIC_NAME);
    }
    BookmarkReader reader = new BookmarkReader(0, false);
    reader.readFile(dataset);
    int bookmarks = reader.getBookmarks().size();
    System.out.println("Posts: " + bookmarks);
    int users = reader.getUsers().size();
    System.out.println("Users: " + users);
    int resources = reader.getResources().size();
    System.out.println("Resources: " + resources);
    int tags = reader.getTags().size();
    System.out.println("Tags: " + tags);
    int tagAssignments = reader.getTagAssignmentsCount();
    System.out.println("Tag-Assignments: " + tagAssignments);
    int categories = reader.getCategories().size();
    System.out.println("Topics: " + categories);
    double avgTASPerPost = (double) tagAssignments / bookmarks;
    System.out.println("Avg. TAS per post: " + avgTASPerPost);
    double avgBookmarksPerUser = (double) bookmarks / users;
    System.out.println("Avg. resources/posts per user: " + avgBookmarksPerUser);
    double avgBookmarksPerResource = (double) bookmarks / resources;
    System.out.println("Avg. users/posts per resource: " + avgBookmarksPerResource);
    System.out.println("First timestamp: " + reader.getFirstTimestamp().toString());
    System.out.println("Last timestamp: " + reader.getLastTimestamp().toString());
    // write user distribution
    UserTagDistribution.calculate(reader, dataset);
    if (writeAll) {
        try {
            getTrainTestSize(dataset);
            FileWriter userWriter = new FileWriter(new File("./data/metrics/" + dataset + "_userStats.txt"));
            BufferedWriter userBW = new BufferedWriter(userWriter);
            userBW.write("UserID| NoOfResources| NoOfTopics| Topic-Similarity\n");
            // statistics are computed on the training part only, for users
            // that appear in the test set
            List<Bookmark> trainList = reader.getBookmarks().subList(0, TRAIN_SIZE);
            List<Integer> testUsers = reader.getUniqueUserListFromTestSet(TRAIN_SIZE);
            System.out.println();
            double avgTopicsPerUser = 0.0;
            double avgTopicDiversityPerUser = 0.0;
            List<Map<Integer, Double>> userTopics = Utilities.getRelativeTopicMaps(trainList, false);
            List<List<Bookmark>> userBookmarks = Utilities.getBookmarks(trainList, false);
            for (int userID : testUsers) {
                Map<Integer, Double> topicsOfUser = userTopics.get(userID);
                double topicDiversityOfUser = Bookmark.getBookmarkDiversity(userBookmarks.get(userID));
                userBW.write(userID + "| " + reader.getUserCounts().get(userID) + "| "
                        + topicsOfUser.keySet().size() + "| " + topicDiversityOfUser + "\n");
                avgTopicsPerUser += topicsOfUser.keySet().size();
                avgTopicDiversityPerUser += topicDiversityOfUser;
            }
            System.out.println("Avg. topics per user: " + avgTopicsPerUser / testUsers.size());
            System.out.println("Avg. topic-similarity per user: " + avgTopicDiversityPerUser / testUsers.size());
            double avgTopicsPerResource = Bookmark.getAvgNumberOfTopics(trainList);
            System.out.println("Avg. topics per resource: " + avgTopicsPerResource);
            userBW.flush();
            userBW.close();
        } catch (IOException e) {
            System.out.println(e.getMessage());
        }
    }
    System.out.println();
}
/**
 * Reads the "_train" and "_test" split files for the given sample and caches
 * their bookmark counts in the static TRAIN_SIZE / TEST_SIZE fields.
 * When TOPIC_NAME is configured, the topic postfix is appended to the sample
 * name before resolving the split files.
 *
 * @param sample base name of the dataset sample (without the split suffix)
 */
private static void getTrainTestSize(String sample) {
    String name = sample;
    if (TOPIC_NAME != null) {
        // Topic-model variants live in files carrying the topic postfix.
        name += "_" + TOPIC_NAME;
    }
    BookmarkReader train = new BookmarkReader(-1, false);
    train.readFile(name + "_train");
    TRAIN_SIZE = train.getBookmarks().size();
    System.out.println("Train-size: " + TRAIN_SIZE);
    BookmarkReader test = new BookmarkReader(-1, false);
    test.readFile(name + "_test");
    TEST_SIZE = test.getBookmarks().size();
    System.out.println("Test-size: " + TEST_SIZE);
}
/**
 * Runs the metric calculation for an already-produced recommendation result.
 * If no reader is supplied, the sample file (optionally with postfix) is
 * loaded first; passing {@code tensor} forwards TRAIN_SIZE so that
 * MyMediaLite-style result files can be evaluated.
 *
 * @param sampleDir  directory where metric files are written
 * @param sampleName base name of the dataset sample
 * @param prefix     algorithm identifier used in result/metric file names
 * @param postfix    optional file-name postfix, may be null
 * @param calcTags   true to evaluate tag recommendations (k up to 10),
 *                   false for resource recommendations (k up to 20)
 * @param tensor     true to pass the training size through to the metrics
 * @param reader     preloaded bookmark data, or null to load it here
 */
private static void evaluate(String sampleDir, String sampleName, String prefix, String postfix, boolean calcTags,
        boolean tensor, BookmarkReader reader) {
    String fileName = sampleName + (postfix != null ? "_" + postfix : "");
    if (reader == null) {
        getTrainTestSize(fileName);
        reader = new BookmarkReader(TRAIN_SIZE, false);
        reader.readFile(fileName);
    }
    Integer trainSize = tensor ? TRAIN_SIZE : null;
    if (calcTags) {
        writeMetrics(sampleDir, sampleName, prefix, 1, 10, postfix, reader, trainSize);
    } else {
        writeMetricsForResources(sampleDir, sampleName, prefix, 1, 20, postfix, reader, trainSize);
    }
}
// Item Recommendation
// ------------------------------------------------------------------------------------------------------------------------------------
/**
 * Runs the baseline resource recommenders (random or most-popular) over the
 * given number of samples and writes the resulting metrics.
 *
 * @param sampleDir  directory where metric files are written
 * @param sampleName base name of the dataset sample
 * @param size       number of samples to process
 * @param random     true for the random baseline, false for most-popular
 * @param writeTime  whether runtime information should be recorded
 */
private static void startBaselineCalculatorForResources(String sampleDir, String sampleName, int size,
        boolean random, boolean writeTime) {
    String posfix = (TOPIC_NAME != null) ? "_" + TOPIC_NAME : "";
    String fileName = sampleName + posfix;
    BookmarkReader reader = null;
    for (int run = 1; run <= size; run++) {
        getTrainTestSize(fileName);
        reader = random
                ? MPResourceCalculator.predictRandomResources(fileName, TRAIN_SIZE, writeTime)
                : MPResourceCalculator.predictPopularResources(fileName, TRAIN_SIZE, writeTime);
    }
    // Metric files are labeled by the baseline that produced them.
    String prefix = random ? "rand" : "mp";
    writeMetricsForResources(sampleDir, sampleName, prefix, size, 20, TOPIC_NAME, reader, null);
}
/**
 * Runs the CIRTT resource recommender over the given samples and writes the
 * resulting metrics. The metric-file suffix encodes the feature set and
 * whether the BLL component was enabled.
 *
 * @param sampleDir      directory where metric files are written
 * @param sampleName     base name of the dataset sample
 * @param topicString    topic identifier appended to the sample file name;
 *                       empty string means no topic variant
 * @param size           number of samples to process
 * @param neighborSize   neighborhood size for the CIRTT prediction
 * @param features       feature set used by the calculator
 * @param userSim        whether user similarity is used
 * @param bll            whether the BLL (base-level learning) component is used
 * @param novelty        whether novelty weighting is applied
 * @param calculateOnTag whether the calculation is tag-based
 */
private static void startResourceCIRTTCalculator(String sampleDir, String sampleName, String topicString, int size,
        int neighborSize, Features features, boolean userSim, boolean bll, boolean novelty,
        boolean calculateOnTag) {
    String posfix = (TOPIC_NAME != null) ? "_" + TOPIC_NAME : "";
    String suffix = "r3l_" + features + (bll ? "_bll" : "");
    String sampleWithTopic = topicString.isEmpty() ? sampleName : sampleName + "_" + topicString;
    BookmarkReader reader = null;
    for (int run = 1; run <= size; run++) {
        getTrainTestSize(sampleName + posfix);
        reader = CIRTTCalculator.predictSample(sampleWithTopic, TRAIN_SIZE, TEST_SIZE, neighborSize,
                features, userSim, bll, novelty, calculateOnTag);
    }
    writeMetricsForResources(sampleDir, sampleName, suffix, size, 20,
            topicString.isEmpty() ? null : topicString, reader, null);
}
/**
 * Runs the Zheng tag/time-based resource recommender over the given samples
 * and writes the resulting metrics under the "zheng_tagtime" label.
 *
 * @param sampleDir  directory where metric files are written
 * @param sampleName base name of the dataset sample
 * @param size       number of samples to process
 */
private static void startZhengResourceCalculator(String sampleDir, String sampleName, int size) {
    String posfix = (TOPIC_NAME == null) ? "" : "_" + TOPIC_NAME;
    String fileName = sampleName + posfix;
    BookmarkReader reader = null;
    for (int run = 1; run <= size; run++) {
        getTrainTestSize(fileName);
        reader = ZhengCalculator.predictSample(fileName, TRAIN_SIZE);
    }
    writeMetricsForResources(sampleDir, sampleName, "zheng_tagtime", size, 20, TOPIC_NAME, reader, null);
}
/**
 * Runs the Huang tag/user-based resource recommender over the given samples
 * and writes the resulting metrics under the "huang_tag_user" label.
 *
 * @param sampleDir  directory where metric files are written
 * @param sampleName base name of the dataset sample
 * @param size       number of samples to process
 */
private static void startHuangResourceCalculator(String sampleDir, String sampleName, int size) {
    String posfix = (TOPIC_NAME == null) ? "" : "_" + TOPIC_NAME;
    String fileName = sampleName + posfix;
    BookmarkReader reader = null;
    for (int run = 1; run <= size; run++) {
        getTrainTestSize(fileName);
        reader = HuangCalculator.predictSample(fileName, TRAIN_SIZE);
    }
    writeMetricsForResources(sampleDir, sampleName, "huang_tag_user", size, 20, TOPIC_NAME, reader, null);
}
/**
 * Runs the collaborative-filtering resource recommender over the given
 * samples and writes the resulting metrics. The metric-file suffix encodes
 * the CF variant (user-based, resource-based, mixed, BLL) and feature set.
 *
 * @param sampleDir    directory where metric files are written
 * @param sampleName   base name of the dataset sample
 * @param size         number of samples to process
 * @param neighborSize neighborhood size for the CF prediction
 * @param userBased    whether user-based CF is used
 * @param resBased     whether resource-based CF is used
 * @param allResources whether all resources are candidates (false = mixed)
 * @param bll          whether the BLL component is used
 * @param features     feature set used by the calculator
 * @param writeTime    whether runtime information should be recorded
 */
private static void startCfResourceCalculator(String sampleDir, String sampleName, int size, int neighborSize,
        boolean userBased, boolean resBased, boolean allResources, boolean bll, Features features,
        boolean writeTime) {
    String posfix = (TOPIC_NAME != null) ? "_" + TOPIC_NAME : "";
    // Build the metric-file label from the chosen CF variant.
    String suffix;
    if (!userBased) {
        suffix = "rescf_";
    } else if (!resBased) {
        suffix = "usercf_";
    } else {
        suffix = "cf_";
    }
    if (!userBased && !allResources) {
        suffix += "mixed_";
    }
    if (bll) {
        suffix += "bll_";
    }
    suffix += features + "_";
    BookmarkReader reader = null;
    for (int run = 1; run <= size; run++) {
        // NOTE(review): unlike the sibling start*Calculator methods, the postfix is NOT
        // appended here — but getTrainTestSize() itself appends TOPIC_NAME internally,
        // so it is unclear which convention is intended; confirm against the split files.
        getTrainTestSize(sampleName);
        reader = CFResourceCalculator.predictResources(sampleName + posfix, TRAIN_SIZE, TEST_SIZE, neighborSize,
                userBased, resBased, allResources, bll, features, writeTime);
    }
    // NOTE(review): the trailing "5" looks like a hard-coded neighborhood size — verify.
    writeMetricsForResources(sampleDir, sampleName, suffix + "5", size, 20, TOPIC_NAME, reader, null);
}
/**
 * Runs the SUSTAIN-based resource recommender once and writes the resulting
 * metrics under the "sustain" label.
 *
 * @param sampleDir       directory where metric files are written
 * @param sampleName      base name of the dataset sample
 * @param r               SUSTAIN model parameter r
 * @param tau             SUSTAIN model parameter tau
 * @param beta            SUSTAIN model parameter beta
 * @param learning_rate   SUSTAIN learning rate
 * @param trainingRecency recency window used for training
 * @param candidateNumber number of candidate resources considered
 * @param sampleSize      sample size used by the calculator
 * @param cfWeight        weight of the collaborative-filtering component
 */
private static void startSustainApproach(String sampleDir, String sampleName, double r, double tau, double beta,
        double learning_rate, int trainingRecency, int candidateNumber, int sampleSize, double cfWeight) {
    getTrainTestSize(sampleName);
    SustainCalculator calculator = new SustainCalculator(sampleName, TRAIN_SIZE);
    BookmarkReader reader = calculator.predictResources(r, tau, beta, learning_rate, trainingRecency,
            candidateNumber, sampleSize, cfWeight);
    writeMetricsForResources(sampleDir, sampleName, "sustain", 1, 20, null, reader, TRAIN_SIZE);
}
/**
 * Calculates per-k evaluation metrics for resource recommendations over all
 * samples, then writes the averaged metrics for each list length 1..k.
 *
 * @param sampleDir   directory where metric files are written
 * @param sampleName  base name of the evaluated dataset sample
 * @param prefix      algorithm identifier used in result/metric file names
 * @param sampleCount number of cross-validation samples to aggregate
 * @param k           maximum recommendation-list length
 * @param posfix      optional topic postfix for file names; null or "0" means none
 * @param reader      reader holding the evaluated bookmark data
 * @param trainSize   training-set size to pass through, or null if not needed
 */
private static void writeMetricsForResources(String sampleDir, String sampleName, String prefix, int sampleCount,
        int k, String posfix, BookmarkReader reader, Integer trainSize) {
    // FIX: compare String content with equals() instead of reference equality (==);
    // "posfix == \"0\"" only matched the interned literal, not a computed "0".
    String topicString = ((posfix == null || "0".equals(posfix)) ? "_" : "_" + posfix + "_");
    for (int i = 1; i <= k; i++) {
        for (int j = 1; j <= sampleCount; j++) {
            MetricsCalculator.calculateMetrics(sampleName + topicString + prefix, i,
                    sampleDir + "/" + prefix + topicString + "_metrics", false, reader, MIN_USER_BOOKMARKS,
                    MAX_USER_BOOKMARKS, MIN_RESOURCE_BOOKMARKS, MAX_RESOURCE_BOOKMARKS, null, false, trainSize);
        }
        // NOTE(review): calculateMetrics above writes to "..._metrics" while this reads
        // "...metrics" (no extra underscore) — looks inconsistent; confirm against
        // MetricsCalculator's file-naming before changing either path.
        MetricsCalculator.writeAverageMetrics(sampleDir + "/" + prefix + topicString + "metrics", i,
                (double) sampleCount, false, i == k, null);
    }
}
}