package happy.research.data; import happy.coding.io.FileIO; import happy.coding.io.Logs; import happy.coding.system.Systems; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.junit.Test; import com.google.common.collect.HashBasedTable; import com.google.common.collect.HashMultiset; import com.google.common.collect.Multiset; import com.google.common.collect.Table; public class CiaoDataset { public final static String dirPath = "D:\\Java\\Workspace2\\CF-RS\\Datasets\\Ciao\\"; public final static String ratingSet = dirPath + "movie-ratings.txt"; public final static String reviewSet = dirPath + "review-ratings.txt"; public final static String trustSet = dirPath + "trusts.txt"; /** * Sample a sample dataset for our work * * @throws Exception */ @Test public void sample() throws Exception { Table<String, String, String> reviews = loadMovieReviewSet(ratingSet); Table<String, String, Integer> ratings = loadReviewSet(reviewSet); Table<String, String, Integer> trusts = loadTrustSet(trustSet); String sep = ","; List<String> sampleRatings = new ArrayList<>(); List<String> sampleReviews = new ArrayList<>(); // first: determine users and trustors Set<String> users = new HashSet<>(); users.addAll(trusts.rowKeySet()); users.addAll(trusts.columnKeySet()); Set<String> trustors = new HashSet<>(trusts.rowKeySet()); // determine reviews Set<String> rvs = new HashSet<>(); for (String trustor : trustors) { // {movie : review} Map<String, String> rws = reviews.row(trustor); for (Entry<String, String> en : rws.entrySet()) { // String mv = en.getKey(); String rv = en.getValue(); rvs.add(rv); sampleReviews.add(trustor + sep + rv); } } // determine ratings for (String user : users) { Map<String, Integer> rts = ratings.row(user); for (Entry<String, Integer> en : rts.entrySet()) { String rv = en.getKey(); if (rvs.contains(rv)) sampleRatings.add(user + sep + rv + sep + en.getValue()); } } // output datasets String outPath = Systems.getDesktop(); FileIO.writeList(outPath + "review-ratings.txt", sampleRatings); FileIO.writeList(outPath + "user-reviews.txt", sampleReviews); } /** * Compute the statistics of the CiaoDVDs dataset * * @param dirPath * the directory path of the dataset * @throws Exception */ @Test public void statistics() throws Exception { // movie ratings: {user, movie, rating} Table<String, String, Integer> ratings = loadRatingSet(ratingSet); Multiset<Integer> scales = HashMultiset.create(); scales.addAll(ratings.values()); Logs.info("Movie rating scales:"); Logs.info(scales.toString()); Logs.info("Users: {}, movies: {}, ratings: {}\n", new Object[] { ratings.rowKeySet().size(), ratings.columnKeySet().size(), ratings.size() }); // review ratings: {user, review, rating} Table<String, String, Integer> reviews = loadReviewSet(reviewSet); scales.clear(); scales.addAll(reviews.values()); Logs.info("Movie review scales:"); Logs.info(scales.toString()); Logs.info("Users: {}, reviews: {}, ratings: {}\n", new Object[] { reviews.rowKeySet().size(), reviews.columnKeySet().size(), reviews.size() }); // trust ratings: {trustor, trustee, rating} Table<String, String, Integer> trusts = loadTrustSet(trustSet); Logs.info("Trustors: {}, trustees: {}, trusts: {}\n", new Object[] { trusts.rowKeySet().size(), trusts.columnKeySet().size(), trusts.size() }); Set<String> users = new HashSet<>(); users.addAll(ratings.rowKeySet()); users.addAll(reviews.rowKeySet()); users.addAll(trusts.rowKeySet()); users.addAll(trusts.columnKeySet()); Logs.info("Overall users: {}", users.size()); } /** * load the dataset of trust info * * @param trustsPath * the path to the user-trust dataset * @return the trusts dataset table: {trustor, trustee, rating} * @throws Exception */ protected Table<String, String, Integer> loadTrustSet(String trustsPath) throws Exception { BufferedReader br; String line; Table<String, String, Integer> trusts = HashBasedTable.create(); br = new BufferedReader(new FileReader(new File(trustsPath))); while ((line = br.readLine()) != null) { String[] data = line.split(","); trusts.put(data[0], data[1], Integer.parseInt(data[2])); } br.close(); return trusts; } /** * load the dataset of review ratings; * * @param reviewsPath * the path to the review-ratings dataset * @return the review dataset table: {user, review, rating} * @throws Exception */ protected Table<String, String, Integer> loadReviewSet(String reviewsPath) throws FileNotFoundException, IOException { BufferedReader br; String line; Table<String, String, Integer> reviews = HashBasedTable.create(); br = new BufferedReader(new FileReader(new File(reviewsPath))); while ((line = br.readLine()) != null) { String[] data = line.split(","); reviews.put(data[0], data[1], Integer.parseInt(data[2])); } br.close(); return reviews; } /** * load the dataset of movie ratings; * * @param ratingsPath * the path to the moviep-rating dataset * @return the rating dataset table: {user, movie, rating} * @throws Exception */ protected Table<String, String, Integer> loadRatingSet(String ratingsPath) throws Exception { Table<String, String, Integer> ratings = HashBasedTable.create(); BufferedReader br = new BufferedReader(new FileReader(new File( ratingsPath))); String line = null; while ((line = br.readLine()) != null) { String[] data = line.split(","); ratings.put(data[0], data[1], Integer.parseInt(data[4])); } br.close(); return ratings; } /** * load the dataset of movie reviews; * * @param ratingsPath * the path to the moviep-rating dataset * @return the rating dataset table: {user, movie, review} * @throws Exception */ protected Table<String, String, String> loadMovieReviewSet( String ratingsPath) throws Exception { Table<String, String, String> dataset = HashBasedTable.create(); BufferedReader br = new BufferedReader(new FileReader(new File( ratingsPath))); String line = null; while ((line = br.readLine()) != null) { String[] data = line.split(","); dataset.put(data[0], data[1], data[3]); } br.close(); return dataset; } }