package happy.research.cf; import happy.coding.math.Stats; import happy.coding.system.Debug; import java.io.BufferedReader; import java.io.FileReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class Dataset { private final static Logger logger = LoggerFactory.getLogger(Dataset.class); public enum DATASET { EPINIONS("Epinions"), EXTENDED_EPINIONS("ExtendedEpinions"), MOVIELENS("MovieLens"), FILMTRUST("FilmTrust"), FLIXSTER( "Flixster"), JESTER("Jester"), NETFLIX("Netflix"), BOOKCROSSING("BookCrossing"), VIRTUALRATINGS( "VirtualRatings"), TEMP("TempDataSet"); public String datasetLabel = null; DATASET(String label) { datasetLabel = label; } }; public static String LABEL = null; public static int scaleSize = 0; public static double minScale = 1.0; public static double maxScale = 5.0; public static double range = maxScale - minScale; public static double[] scales = null; public static double median = 0.0; public static double relevance_threshold = 0.0; public static double mean = 0.0; public static double sd = 0.0; public static double user_mean = 0.0; public static double user_sd = 0.0; public static double item_mean = 0.0; public static double item_sd = 0.0; public static int users = 0; public static int items = 0; public static int size = 0; public static int maxUserRating = 0; public static int minUserRating = 0; public static int maxItemRating = 0; public static int minItemRating = 0; public static double sparsity = 0.0; public static Map<Double, Integer> scaleNum = null; public static Map<Double, Double> scaleRatio = null; public static DATASET dataset = null; public static String DIRECTORY = null; public static String TEMP_DIRECTORY = null; public static String RATING_SET = "ratings.txt"; public static String TRUST_SET = "trust.txt"; public static String DISTRUST_SET = "distrust.txt"; public static String REGMX = " "; public static void printSpecs() { logger.info("Dataset.label = {}", LABEL); logger.info("Dataset.users = {}, items = {}, size = {}", new Object[] { users, items, size }); logger.info("Dataset.sparsity = {}%, density = {}%", (float) (sparsity * 100), (float) ((1 - sparsity) * 100)); logger.info("Dataset.scales = {}", scales); String dist = "["; String rato = "["; for (int i = 0; i < scales.length; i++) { dist += scaleNum.get(scales[i]); rato += scaleRatio.get(scales[i]).floatValue() * 100 + "%"; if (i < scales.length - 1) { dist += ", "; rato += ", "; } } dist += "]"; rato += "]"; logger.info("Dataset.scaleDist = {}", dist); logger.info("Dataset.scaleDist = {}", rato); logger.info("Dataset.mean = {}, sd = {}", (float) mean, (float) sd); logger.info("Dataset.user_mean = {}, user_sd = {}", (float) user_mean, (float) user_sd); logger.info("Dataset.item_mean = {}, item_sd = {}", (float) item_mean, (float) item_sd); logger.info("Dataset.max_user_rating = {}, min_user_rating = {}", maxUserRating, minUserRating); logger.info("Dataset.max_item_rating = {}, min_item_rating = {}", maxItemRating, minItemRating); } public static void init(String label) throws Exception { LABEL = label; boolean found = false; for (DATASET d : DATASET.values()) { if (d.datasetLabel.equalsIgnoreCase(LABEL)) { dataset = d; found = true; break; } } if (!found) dataset = DATASET.TEMP; switch (dataset) { case BOOKCROSSING: scales = new double[] { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 }; relevance_threshold = 9.0; break; case FILMTRUST: scales = new double[] { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0 }; relevance_threshold = 3.0; break; case FLIXSTER: scales = new double[] { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0 }; relevance_threshold = 4.5; break; default: if (Debug.OFF) { scales = new double[] { 1.0, 2.0, 3.0, 4.0, 5.0 }; relevance_threshold = 4.5; } else { scales = new double[] { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0 }; relevance_threshold = 3.5; } break; } scaleSize = scales.length; minScale = scales[0]; maxScale = scales[scaleSize - 1]; range = maxScale - minScale; median = Stats.median(scales); } /** * load training rating data, the statistics of rating sets are summarized as also * * @param ratingSet * filePath of training ratings * @return Map[]{ userRatingsMap, itemRatingsMap } where * <ul> * <li>userRatingsMap: { user - {item - rating} }</li> * <li>itemRatingsMap: { item - {user - rating} }</li> * </ul> * @throws Exception */ @SuppressWarnings("rawtypes") public static Map[] loadTrainSet(String ratingSet) throws Exception { HashMap<String, Map<String, Rating>> userRatingsMap = new HashMap<>(); HashMap<String, Map<String, Rating>> itemRatingsMap = new HashMap<>(); List<Double> ratings = new ArrayList<>(); scaleNum = new HashMap<>(); scaleRatio = new HashMap<>(); for (Double scale : scales) { scaleNum.put(scale, 0); scaleRatio.put(scale, 0.0); } maxUserRating = 0; minUserRating = 0; maxItemRating = 0; minItemRating = 0; BufferedReader fr = new BufferedReader(new FileReader(ratingSet)); String line = null; while ((line = fr.readLine()) != null) { if (line.trim().isEmpty()) continue; String[] data = line.split(Dataset.REGMX); String userId = data[0]; String itemId = data[1]; double rating = Double.parseDouble(data[2]); Long timestamp = 0l; if (data.length > 3) timestamp = Long.parseLong(data[3]); if (VirRatingsCF.auto) { boolean flag = (VirRatingsCF.userIds != null && !VirRatingsCF.userIds.contains(userId)) || (VirRatingsCF.userIds == null); if (Integer.parseInt(userId) > VirRatingsCF.PhyRatingUpbound && flag) continue; } Rating r = new Rating(); r.setUserId(userId); r.setItemId(itemId); r.setRating(rating); r.setTimestamp(timestamp); ratings.add(rating); Map<String, Rating> itemRatings = null; if (userRatingsMap.containsKey(userId)) itemRatings = userRatingsMap.get(userId); else itemRatings = new HashMap<>(); itemRatings.put(itemId, r); userRatingsMap.put(userId, itemRatings); Map<String, Rating> userRatings = null; if (itemRatingsMap.containsKey(itemId)) userRatings = itemRatingsMap.get(itemId); else userRatings = new HashMap<>(); userRatings.put(userId, r); itemRatingsMap.put(itemId, userRatings); int num = 0; if (scaleNum.containsKey(rating)) num = scaleNum.get(rating); scaleNum.put(rating, num + 1); } fr.close(); /* Retrieve the statistics of the data set */ users = userRatingsMap.size(); items = itemRatingsMap.size(); size = ratings.size(); sparsity = 1 - (size + 0.0) / (users * items); mean = Stats.mean(ratings); sd = Stats.sd(ratings, mean); //logger.debug("Rating median = {}", Stats.median(ratings)); for (Entry<Double, Integer> en : scaleNum.entrySet()) scaleRatio.put(en.getKey(), en.getValue() / (size + 0.0)); List<Integer> userNum = new ArrayList<>(); for (Map<String, Rating> data : userRatingsMap.values()) { userNum.add(data.size()); if (minUserRating == 0) minUserRating = data.size(); if (maxUserRating < data.size()) maxUserRating = data.size(); if (minUserRating > data.size()) minUserRating = data.size(); } List<Integer> itemNum = new ArrayList<>(); // double sumI = 0; for (Map<String, Rating> data : itemRatingsMap.values()) { itemNum.add(data.size()); if (minItemRating == 0) minItemRating = data.size(); if (maxItemRating < data.size()) maxItemRating = data.size(); if (minItemRating > data.size()) minItemRating = data.size(); // sumI += RatingUtils.mean(data.values()); } // System.out.println(sumI / itemRatingsMap.size()); user_mean = Stats.mean(userNum); user_sd = Stats.sd(userNum, user_mean); item_mean = Stats.mean(itemNum); item_sd = Stats.sd(itemNum, item_mean); return new Map[] { userRatingsMap, itemRatingsMap }; } public static HashMap<String, Map<String, Rating>> loadRatingSet(String ratingSet) throws Exception { HashMap<String, Map<String, Rating>> userRatingsMap = new HashMap<>(); BufferedReader fr = new BufferedReader(new FileReader(ratingSet)); String line = null; while ((line = fr.readLine()) != null) { if (line.trim().isEmpty()) continue; String[] data = line.split(Dataset.REGMX); String userId = data[0]; String itemId = data[1]; double rating = Double.parseDouble(data[2]); Long timestamp = 0l; if (data.length > 3) timestamp = Long.parseLong(data[3]); Rating r = new Rating(); r.setUserId(userId); r.setItemId(itemId); r.setRating(rating); r.setTimestamp(timestamp); Map<String, Rating> itemRatings = null; if (userRatingsMap.containsKey(userId)) itemRatings = userRatingsMap.get(userId); else itemRatings = new HashMap<>(); itemRatings.put(itemId, r); userRatingsMap.put(userId, itemRatings); } fr.close(); return userRatingsMap; } /** * load training test data * * @param ratingSet * filePath of test ratings * @return Map[]{ userRatingsMap, itemRatingsMap } where * <ul> * <li>userRatingsMap: { user - {item - rating} }</li> * <li>itemRatingsMap: { item - {user - rating} }</li> * </ul> * @throws Exception */ @SuppressWarnings("rawtypes") public static Map[] loadTestSet(String ratingSet) throws Exception { HashMap<String, Map<String, Rating>> userRatingsMap = new HashMap<>(); HashMap<String, Map<String, Rating>> itemRatingsMap = new HashMap<>(); BufferedReader fr = new BufferedReader(new FileReader(ratingSet)); String line = null; while ((line = fr.readLine()) != null) { if (line.trim().isEmpty()) continue; String[] data = line.split(Dataset.REGMX); String userId = data[0]; String itemId = data[1]; double rating = Double.parseDouble(data[2]); Long timestamp = 0l; if (data.length > 3) timestamp = Long.parseLong(data[3]); // if (userId > VirRatingsCF.PhyRatingUpbound + num) continue; Rating r = new Rating(); r.setUserId(userId); r.setItemId(itemId); r.setRating(rating); r.setTimestamp(timestamp); Map<String, Rating> itemRatings = null; if (userRatingsMap.containsKey(userId)) itemRatings = userRatingsMap.get(userId); else itemRatings = new HashMap<>(); itemRatings.put(itemId, r); userRatingsMap.put(userId, itemRatings); Map<String, Rating> userRatings = null; if (itemRatingsMap.containsKey(itemId)) userRatings = itemRatingsMap.get(itemId); else userRatings = new HashMap<>(); userRatings.put(userId, r); itemRatingsMap.put(itemId, userRatings); } fr.close(); return new Map[] { userRatingsMap, itemRatingsMap }; } public static void main(String[] args) throws Exception { ConfigParams.defaultInstance(); String ratingSet = DIRECTORY + "u1.base"; loadTrainSet(ratingSet); printSpecs(); } }