package happy.research.tp;

import happy.coding.io.FileIO;
import happy.coding.io.Logs;
import happy.coding.math.Randoms;
import happy.coding.system.Systems;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import librec.data.DataDAO;
import librec.data.MatrixEntry;
import librec.data.SparseMatrix;
import librec.data.SparseVector;
import librec.data.VectorEntry;

import org.junit.Test;

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multiset;
import com.google.common.collect.Table;

/**
 * Helpers for loading the trust / review / rating files used by the trust
 * experiments, plus a few one-off routines (run as JUnit tests) that convert,
 * sample and inspect those datasets.
 */
public class DatasetUtils {

    /** Field separator used both when writing and when reading the dataset files. */
    public final static String sep = ",";

    /** Number of buffered output lines after which they are written to disk. */
    private static final int BATCH_SIZE = 1500;

    /**
     * Load the user-review dataset. Each line is expected to hold at least
     * two {@link #sep}-separated fields: the user id and the review id.
     *
     * @param path
     *            path of the user-review file
     * @return user-review dataset: {user, reviews written by that user}
     * @throws Exception
     *             if the file cannot be read or a line has fewer than two
     *             fields
     */
    public static Multimap<String, String> loadReviews(String path) throws Exception {
        Multimap<String, String> dataset = HashMultimap.create();

        // try-with-resources: the reader is closed even if a line fails to parse
        try (BufferedReader br = new BufferedReader(new FileReader(new File(path)))) {
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] data = line.split(sep);
                dataset.put(data[0], data[1]);
            }
        }

        return dataset;
    }

    /**
     * Load the review-ratings dataset with ratings normalized to (0, 1].
     * Equivalent to {@code loadRatings(path, true)}.
     *
     * @param path
     *            path of the review-ratings file
     * @return review-ratings dataset: {user, review, rating}
     * @throws Exception
     *             if the file cannot be read or parsed
     */
    public static Table<String, String, Float> loadRatings(String path) throws Exception {
        return loadRatings(path, true);
    }

    /**
     * Load the review-ratings dataset. Each line is expected to hold
     * {@code user, review, rating} separated by {@link #sep}, with an integer
     * rating.
     *
     * @param path
     *            path of the review-ratings file
     * @param normalized
     *            if {@code true}, the rating is scaled: {@code (rate + 1) / 6}
     *            when {@code TrustModel.dataset} equals
     *            {@code TrustModel.CiaoDVDs}, otherwise {@code rate / 5}; if
     *            {@code false}, the raw integer rating is stored as a float
     * @return review-ratings dataset: {user, review, rating}
     * @throws Exception
     *             if the file cannot be read, a line has fewer than three
     *             fields, or the rating is not an integer
     */
    public static Table<String, String, Float> loadRatings(String path, boolean normalized) throws Exception {
        Table<String, String, Float> dataset = HashBasedTable.create();

        try (BufferedReader br = new BufferedReader(new FileReader(new File(path)))) {
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] data = line.split(sep);

                int rate = Integer.parseInt(data[2]);
                float rating = rate;
                if (normalized) {
                    // TODO: we add 1 to existing ratings to handle the case
                    // of 0 for ciao dataset
                    rating = TrustModel.dataset.equals(TrustModel.CiaoDVDs) ? (rate + 1) / 6.0f : rate / 5.0f;
                }

                dataset.put(data[0], data[1], rating);
            }
        }

        return dataset;
    }

    /**
     * Load the user-trust dataset. Each line is expected to hold
     * {@code trustor, trustee, trust} separated by {@link #sep}, with an
     * integer trust value.
     *
     * @param path
     *            path of the trust file
     * @return user-trusts dataset: {trustor, trustee, trust}
     * @throws Exception
     *             if the file cannot be read, a line has fewer than three
     *             fields, or the trust value is not an integer
     */
    public static Table<String, String, Integer> loadTrusts(String path) throws Exception {
        Table<String, String, Integer> dataset = HashBasedTable.create();

        try (BufferedReader br = new BufferedReader(new FileReader(new File(path)))) {
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] data = line.split(sep);
                dataset.put(data[0], data[1], Integer.parseInt(data[2]));
            }
        }

        return dataset;
    }

    /**
     * Convert the format of the extended Epinions dataset so that our
     * programs can directly work on the transformed dataset.
     *
     * <p>
     * Three files are produced under the destination directory:
     * <ul>
     * <li>{@code trusts.txt}: {trustorId, trusteeId, trust}, keeping only the
     * rows whose trust value is 1; users are re-numbered from 1 in order of
     * first appearance;</li>
     * <li>{@code user-reviews.txt}: {userId, reviewId}, keeping only reviews
     * written by trustors; reviews are re-numbered from 1 in order of first
     * appearance;</li>
     * <li>{@code review-ratings.txt}: {userId, reviewId, rating}, keeping only
     * ratings given by known users on kept reviews.</li>
     * </ul>
     *
     * @throws Exception
     *             if a source file cannot be read or parsed, or an output
     *             file cannot be written
     */
    @Test
    public void formatEpinions() throws Exception {
        String dirPath = "D:\\Java\\Workspace\\CF-RS\\Datasets\\ExtendedEpinions\\";
        String sourcePath = dirPath + "Original Dataset\\";
        String destPath = dirPath + "UMAP2014\\";

        Set<String> trustors = new HashSet<>();
        Set<String> users = new HashSet<>();
        Map<String, Integer> userIdMap = new HashMap<>();
        Map<String, Integer> reviewIdMap = new HashMap<>();

        List<String> contents = new ArrayList<>(BATCH_SIZE);

        // trusts.txt: {trustorId, trusteeId, trust}
        String destFile = destPath + "trusts.txt";
        FileIO.deleteFile(destFile);

        try (BufferedReader br = new BufferedReader(new FileReader(new File(sourcePath + "user_rating.txt")))) {
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] data = line.split("\t");
                String trustor = data[0];
                String trustee = data[1];
                int trust = Integer.parseInt(data[2]);

                // only positive trust statements are kept
                if (trust != 1) {
                    continue;
                }

                trustors.add(trustor);
                users.add(trustor);
                users.add(trustee);

                // assign new consecutive ids (starting from 1) on first sight
                if (!userIdMap.containsKey(trustor)) {
                    userIdMap.put(trustor, userIdMap.size() + 1);
                }
                if (!userIdMap.containsKey(trustee)) {
                    userIdMap.put(trustee, userIdMap.size() + 1);
                }

                bufferLine(destFile, contents, userIdMap.get(trustor) + sep + userIdMap.get(trustee) + sep + trust);
            }
        }
        flush(destFile, contents);

        // user-reviews.txt: {userId, reviewId}
        destFile = destPath + "user-reviews.txt";
        FileIO.deleteFile(destFile);

        try (BufferedReader br = new BufferedReader(new FileReader(new File(sourcePath + "mc.txt")))) {
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] data = line.split("\\|");
                String userId = data[1];
                String reviewId = data[0];

                // only keep reviews written by trustors
                if (!trustors.contains(userId)) {
                    continue;
                }

                if (!reviewIdMap.containsKey(reviewId)) {
                    reviewIdMap.put(reviewId, reviewIdMap.size() + 1);
                }

                bufferLine(destFile, contents, userIdMap.get(userId) + sep + reviewIdMap.get(reviewId));
            }
        }
        flush(destFile, contents);

        // review-ratings.txt: {userId, reviewId, rating}
        destFile = destPath + "review-ratings.txt";
        FileIO.deleteFile(destFile);

        try (BufferedReader br = new BufferedReader(new FileReader(new File(sourcePath + "ratings.txt")))) {
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] data = line.split("\t");
                String userId = data[1];
                String reviewId = data[0];
                int rate = Integer.parseInt(data[2]);

                // only keep ratings given by known users on kept reviews
                if (users.contains(userId) && reviewIdMap.containsKey(reviewId)) {
                    bufferLine(destFile, contents,
                            userIdMap.get(userId) + sep + reviewIdMap.get(reviewId) + sep + rate);
                }
            }
        }
        flush(destFile, contents);
    }

    /**
     * Sample a small dataset from the whole and large Epinions dataset.
     *
     * <p>
     * 1500 trustor indices are drawn via {@code Randoms.nextIntArray}; the
     * sampled trustors and all of their trustees form the user set. The
     * trust links of the sampled trustors, the reviews of all users in the
     * set, and the ratings those users gave to these reviews are written to
     * the {@code Epinions_Sample} directory.
     *
     * @throws Exception
     *             if the operating system is not recognized, or a file cannot
     *             be read, parsed or written
     */
    @Test
    public void sampleEpinions() throws Exception {
        String dirPath = null;
        switch (Systems.getOs()) {
        case Windows:
            dirPath = "D:\\Java\\Workspace\\CF-RS\\Datasets\\UMAP2014\\";
            break;
        case Linux:
        case Mac:
            dirPath = "/home/gguo1/Java/Workspace/CF-RS/Datasets/UMAP2014/";
            break;
        }
        // fail fast rather than building paths out of a null directory
        if (dirPath == null) {
            throw new IllegalStateException("Unsupported operating system: " + Systems.getOs());
        }

        String sourcePath = FileIO.makeDirPath(dirPath, "Epinions");
        String destPath = FileIO.makeDirectory(dirPath, "Epinions_Sample");
        if (FileIO.exist(destPath)) {
            FileIO.deleteDirectory(destPath);
        }
        FileIO.makeDirectory(destPath);

        Table<String, String, Integer> trust = loadTrusts(sourcePath + "trusts.txt");
        List<String> ts = new ArrayList<>(trust.rowKeySet());

        // randomly sample 1500 users
        // NOTE(review): presumably requires ts.size() >= 1500 -- confirm
        // against Randoms.nextIntArray
        int[] idxes = Randoms.nextIntArray(1500, ts.size());
        List<String> trustors = new ArrayList<>();
        for (int idx : idxes) {
            trustors.add(ts.get(idx));
        }

        List<String> newlines = new ArrayList<>(BATCH_SIZE);

        // trusts: links issued by the sampled trustors
        String destFile = destPath + "trusts.txt";
        FileIO.deleteFile(destFile);

        Set<String> users = new HashSet<>();
        users.addAll(trustors);
        for (String t : trustors) {
            Map<String, Integer> tees = trust.row(t);
            for (Entry<String, Integer> en : tees.entrySet()) {
                String trustee = en.getKey();
                users.add(trustee);

                bufferLine(destFile, newlines, t + sep + trustee + sep + en.getValue());
            }
        }
        flush(destFile, newlines);

        // user reviews
        Multimap<String, String> urvs = loadReviews(sourcePath + "user-reviews.txt");
        destFile = destPath + "user-reviews.txt";
        FileIO.deleteFile(destFile);

        Set<String> reviews = new HashSet<>();
        for (String u : users) {
            Collection<String> rvs = urvs.get(u);
            reviews.addAll(rvs);

            for (String rv : rvs) {
                bufferLine(destFile, newlines, u + sep + rv);
            }
        }
        flush(destFile, newlines);

        // review ratings
        Table<String, String, Float> urts = loadRatings(sourcePath + "review-ratings.txt", false);
        destFile = destPath + "review-ratings.txt";
        FileIO.deleteFile(destFile);

        for (String u : users) {
            // Walk the user's own ratings and keep those on sampled reviews,
            // instead of probing every sampled review for every user. The
            // same lines are produced; only their order within a user may
            // differ.
            for (Entry<String, Float> en : urts.row(u).entrySet()) {
                String rv = en.getKey();
                if (reviews.contains(rv)) {
                    bufferLine(destFile, newlines, u + sep + rv + sep + en.getValue().intValue());
                }
            }
        }
        flush(destFile, newlines);
    }

    /**
     * For every entry (u, v) of the trust matrix, count the reviews written
     * by one of the two users and rated by the other (in both directions),
     * then log how often each count occurs.
     *
     * @throws Exception
     *             if a data file cannot be read
     */
    @Test
    public void distribution() throws Exception {
        String dirPath = "D:\\Java\\Datasets\\UMAP2014\\CiaoDVDs\\";

        String ratingPath = dirPath + "review-ratings.txt";
        DataDAO rateDao = new DataDAO(ratingPath);
        SparseMatrix rateMatrix = rateDao.readData();
        int rows = rateMatrix.numRows();

        // the review matrix is built with the id mappings of the rating data
        String reviewPath = dirPath + "user-reviews.txt";
        DataDAO reviewDao = new DataDAO(reviewPath, rateDao.getUserIds(), rateDao.getItemIds());
        SparseMatrix reviewMatrix = reviewDao.readData(new int[] { 0, 1 }, true, -1);

        String trustPath = dirPath + "trusts.txt";
        DataDAO dao = new DataDAO(trustPath);
        SparseMatrix trustMatrix = dao.readData();

        Multiset<Integer> nums = HashMultiset.create();
        for (MatrixEntry me : trustMatrix) {
            int u = me.row();
            int v = me.column();

            int num = 0;

            // u writes, v rates
            SparseVector urs = reviewMatrix.row(u);
            if (v < rows) {
                SparseVector vrs = rateMatrix.row(v);
                for (VectorEntry ve : urs) {
                    int rw = ve.index();
                    if (vrs.contains(rw)) {
                        num++;
                    }
                }
            }

            // u rates, v writes
            if (v < rows) {
                SparseVector vrs_w = reviewMatrix.row(v);
                if (u < rows) {
                    SparseVector urs_r = rateMatrix.row(u);
                    for (VectorEntry ve : vrs_w) {
                        int rw = ve.index();
                        if (urs_r.contains(rw)) {
                            num++;
                        }
                    }
                }
            }

            nums.add(num);
        }

        Logs.debug(nums);
    }

    /**
     * Add a line to the output buffer and write the buffer to disk once it
     * holds {@link #BATCH_SIZE} lines.
     */
    private static void bufferLine(String destFile, List<String> buffer, String line) throws Exception {
        buffer.add(line);
        if (buffer.size() >= BATCH_SIZE) {
            flush(destFile, buffer);
        }
    }

    /**
     * Write any buffered lines to {@code destFile} and empty the buffer; does
     * nothing when the buffer is empty.
     */
    private static void flush(String destFile, List<String> buffer) throws Exception {
        if (!buffer.isEmpty()) {
            // NOTE(review): the trailing 'true' is presumably the append flag
            // -- confirm against FileIO.writeList
            FileIO.writeList(destFile, buffer, null, true);
            buffer.clear();
        }
    }
}