package happy.research.data;

import happy.coding.io.FileIO;
import happy.coding.io.FileIO.Converter;
import happy.coding.io.Logs;
import happy.coding.system.Systems;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Offline parser for previously downloaded pages of {@code dvd.ciao.co.uk}.
 *
 * <p>
 * All input and output lives below {@code <desktop>/dvd.ciao.co.uk/}. The list
 * of categories is read from the {@code dvd.ciao.txt} resource, whose lines
 * have the form {@code <category>: <rest>}; only the part before the first
 * {@code ": "} is used by this class.
 *
 * <p>
 * Each public method is one independent processing step; {@link #main(String[])}
 * selects the step to run.
 */
public class CiaoParser {

    /** Name of the root data folder on the desktop. */
    private static final String DOMAIN = "dvd.ciao.co.uk";

    /** Desktop directory as reported by {@link Systems#getDesktop()}. */
    private static final String DESKTOP = Systems.getDesktop();

    /** Resource that lists one category per line. */
    private static final String CATEGORY_RESOURCE = "dvd.ciao.txt";

    /** Field separator of the rating/trust files that are read and written. */
    private static final String SEP = ",";

    /**
     * Runs one processing step.
     *
     * @param args
     *            optional; {@code args[0]} names the task ({@code products},
     *            {@code reviews}, {@code users}, {@code ratings-reviews},
     *            {@code trust} or {@code anony}). Without arguments the task
     *            defaults to {@code anony}.
     * @throws Exception
     *             if the selected step fails
     */
    public static void main(String[] args) throws Exception {
        String task = (args != null && args.length > 0) ? args[0] : "anony";

        switch (task) {
        case "products":
            CiaoParser.parseCategoryPages();
            break;
        case "reviews":
            CiaoParser.parseReviewPages();
            break;
        case "users":
            CiaoParser.getAllUsers();
            break;
        case "ratings-reviews":
            CiaoParser.getAllRatings();
            break;
        case "trust":
            CiaoParser.getAllTrust();
            break;
        case "anony":
            CiaoParser.anonymous();
            break;
        default:
            Logs.debug("Unknown task: " + task);
            break;
        }
    }

    /**
     * Extracts the movies of every category from the saved category pages.
     *
     * <p>
     * For each category, every file in {@code <category>/webPages/} is parsed
     * and the products having at least one user review are appended to
     * {@code <category>/movies.txt} as {@code id::name::link}. An existing
     * {@code movies.txt} is deleted first.
     *
     * @throws Exception
     *             if reading or writing a file fails
     */
    public static void parseCategoryPages() throws Exception {
        String dir = rootDir();

        for (String category : readCategories()) {
            String dirCate = FileIO.makeDirPath(dir, category);
            String dirPath = FileIO.makeDirPath(dir, category, "webPages");
            String movieFile = dirCate + "movies.txt";

            // start from a clean file because pages are appended one by one
            FileIO.deleteFile(movieFile);

            for (File page : listOrEmpty(new File(dirPath))) {
                if (!page.isFile()) {
                    continue;
                }

                String html = FileIO.readAsString(dirPath + page.getName());
                Document doc = Jsoup.parse(html);
                Logs.debug(category + ": " + page.getName());

                List<String> movies = new ArrayList<>();
                for (Element product : doc.select("td.prodInfo")) {
                    String movie = parseProduct(product);
                    if (movie != null) {
                        movies.add(movie);
                    }
                }

                FileIO.writeList(movieFile, movies, null, true);
            }
        }
    }

    /**
     * Merges the per-category {@code users.txt} files into one
     * {@code users.txt} in the root data folder.
     *
     * <p>
     * Each input line is split on {@code ","}; the first field becomes the map
     * key and the second the value, so later categories overwrite earlier
     * entries with the same key. Lines with fewer than two fields are skipped.
     *
     * @throws Exception
     *             if reading or writing a file fails
     */
    public static void getAllUsers() throws Exception {
        String dir = rootDir();
        String userFile = dir + "users.txt";

        Map<String, String> userMap = new HashMap<>();
        for (String category : readCategories()) {
            String userPath = FileIO.makeDirPath(dir, category) + "users.txt";

            for (String line : FileIO.readAsList(userPath)) {
                String[] fields = line.split(",");
                if (fields.length < 2) {
                    logMalformed(userPath, line);
                    continue;
                }
                userMap.put(fields[0], fields[1]);
            }
        }

        FileIO.writeMap(userFile, userMap);
    }

    /**
     * Builds {@code trust.txt} from the saved "friends" pages of every user.
     *
     * <p>
     * The {@code users.txt} resource is read as {@code first,second} pairs and
     * indexed by the second field, which is matched against the {@code href}
     * of each trusted user's link. Every sub-folder of {@code users.ciao.co.uk}
     * is treated as one user (folder name = user id); for every file in it
     * whose name starts with {@code friends}, the rows of
     * {@code form table.trust} are scanned and each known link yields one
     * output line {@code userId,friendId,1}.
     *
     * @throws Exception
     *             if reading or writing a file fails
     */
    public static void getAllTrust() throws Exception {
        // link -> user id
        Map<String, String> users = new HashMap<>();
        for (String line : FileIO.readAsList(FileIO.getResource("users.txt"))) {
            String[] fields = line.split(",");
            if (fields.length < 2) {
                logMalformed("users.txt", line);
                continue;
            }
            users.put(fields[1], fields[0]);
        }

        String usersPath = FileIO.makeDirPath(DESKTOP, DOMAIN, "users.ciao.co.uk");
        String trustFile = usersPath + "trust.txt";
        FileIO.deleteFile(trustFile);

        for (File userFolder : listOrEmpty(new File(usersPath))) {
            // plain files next to the user folders (e.g. a leftover output
            // file) have no pages to scan
            if (!userFolder.isDirectory()) {
                continue;
            }

            final String userID = userFolder.getName();
            List<String> friends = new ArrayList<>();

            for (File page : listOrEmpty(userFolder)) {
                if (!page.getName().startsWith("friends")) {
                    continue;
                }
                collectFriends(page, users, friends);
            }

            FileIO.writeList(trustFile, friends, new Converter<String, String>() {

                @Override
                public String transform(String friend) {
                    return userID + "," + friend + ",1";
                }
            }, true);
        }
    }

    /**
     * Replaces the raw identifiers in {@code ratings.txt},
     * {@code review-ratings.txt} and {@code trust.txt} with consecutive
     * integers starting at 1 and writes the results to
     * {@code ratings-converted.txt}, {@code review-ratings-converted.txt} and
     * {@code trusts-converted.txt}.
     *
     * <p>
     * Users, movies, genres and reviews are numbered independently in order of
     * first appearance; the user and review numbering is shared across the
     * files. Identifiers that first occur in a later file receive the next
     * free number. Blank or incomplete lines are skipped.
     *
     * @throws Exception
     *             if reading or writing a file fails
     */
    public static void anonymous() throws Exception {
        String dirPath = rootDir();
        String ratingFile = dirPath + "ratings.txt";
        String trustFile = dirPath + "trust.txt";
        String reviewFile = dirPath + "review-ratings.txt";

        Map<String, Integer> userIdMap = new HashMap<>();
        Map<String, Integer> movieIdMap = new HashMap<>();
        Map<String, Integer> genreIdMap = new HashMap<>();
        Map<String, Integer> reviewIdMap = new HashMap<>();

        // ratings: user,movie,genre,review,rating,date
        List<String> newlines = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(ratingFile)))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] data = line.split(",");
                if (data.length < 6) {
                    logMalformed(ratingFile, line);
                    continue;
                }

                String rating = data[4];
                String date = data[5];

                newlines.add(idFor(userIdMap, data[0]) + SEP + idFor(movieIdMap, data[1]) + SEP
                        + idFor(genreIdMap, data[2]) + SEP + idFor(reviewIdMap, data[3]) + SEP
                        + rating + SEP + date);
            }
        }
        FileIO.writeList(dirPath + "ratings-converted.txt", newlines);

        // review-ratings: user,review,rating
        newlines.clear();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(reviewFile)))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] data = line.split(",");
                if (data.length < 3) {
                    logMalformed(reviewFile, line);
                    continue;
                }

                // a review absent from ratings.txt gets a fresh id instead of
                // being written out as the text "null"
                newlines.add(idFor(userIdMap, data[0]) + SEP + idFor(reviewIdMap, data[1]) + SEP
                        + data[2]);
            }
        }
        FileIO.writeList(dirPath + "review-ratings-converted.txt", newlines);

        // trust: trustor,trustee,rating
        newlines.clear();
        try (BufferedReader br = new BufferedReader(new FileReader(new File(trustFile)))) {
            String line;
            while ((line = br.readLine()) != null) {
                String[] data = line.split(",");
                if (data.length < 3) {
                    logMalformed(trustFile, line);
                    continue;
                }

                newlines.add(idFor(userIdMap, data[0]) + SEP + idFor(userIdMap, data[1]) + SEP
                        + data[2]);
            }
        }
        FileIO.writeList(dirPath + "trusts-converted.txt", newlines);
    }

    /**
     * Concatenates the per-category {@code ratings.txt} and
     * {@code review-ratings.txt} files into files of the same names in the
     * root data folder. Existing merged files are deleted first.
     *
     * @throws Exception
     *             if reading or writing a file fails
     */
    public static void getAllRatings() throws Exception {
        String dir = rootDir();
        String ratingFile = dir + "ratings.txt";
        String reviewFile = dir + "review-ratings.txt";

        FileIO.deleteFile(ratingFile);
        FileIO.deleteFile(reviewFile);

        for (String category : readCategories()) {
            String dirPath = FileIO.makeDirPath(dir, category);

            List<String> ratings = FileIO.readAsList(dirPath + "ratings.txt");
            FileIO.writeList(ratingFile, ratings, null, true);

            List<String> reviews = FileIO.readAsList(dirPath + "review-ratings.txt");
            FileIO.writeList(reviewFile, reviews, null, true);
        }
    }

    /**
     * Collects the review links of every product from its saved review pages.
     *
     * <p>
     * Every sub-folder of a category folder other than {@code webPages} is
     * treated as one product. All files in its {@code Reviews} folder are
     * parsed and each review snippet that carries a title link contributes one
     * line {@code id::link} to {@code <product>/reviews.txt}, where {@code id}
     * is the part of the link after the last {@code "_"}. Products without a
     * readable {@code Reviews} folder are skipped.
     *
     * @throws Exception
     *             if reading or writing a file fails
     */
    public static void parseReviewPages() throws Exception {
        String dir = rootDir();

        for (String category : readCategories()) {
            String dirCate = FileIO.makeDirPath(dir, category);

            for (File productDir : listOrEmpty(new File(dirCate))) {
                if (productDir.getName().equals("webPages") || !productDir.isDirectory()) {
                    continue;
                }

                String prodPath = FileIO.makeDirPath(dirCate, productDir.getName());
                String revwPath = FileIO.makeDirPath(prodPath, "Reviews");

                File[] reviewPages = new File(revwPath).listFiles();
                if (reviewPages == null) {
                    Logs.debug("No review pages found in: " + revwPath);
                    continue;
                }

                List<String> reviews = new ArrayList<>();
                for (File reviewPage : reviewPages) {
                    Document doc = Jsoup.parse(FileIO.readAsString(reviewPage.getPath()));

                    for (Element snippet : doc.select("div.m-shortReviewSnippet")) {
                        // NOTE(review): selector kept exactly as in the original
                        // code; verify "m-shet-review-title" against the saved pages
                        Element a = snippet.select("p.m-shet-review-title a.ReviewTitle").first();
                        if (a == null) {
                            // some reviews have no link to the detailed contents
                            continue;
                        }

                        String link = a.attr("href");
                        String id = link.substring(link.lastIndexOf("_") + 1);
                        reviews.add(id + "::" + link);
                    }
                }

                FileIO.writeList(prodPath + "reviews.txt", reviews, null, false);
            }
        }
    }

    /**
     * Returns the root data folder {@code <desktop>/<domain>/}.
     *
     * <p>
     * Built with {@code FileIO.makeDirPath} so that no platform-specific
     * separator is hard-coded; callers append file names directly, so the
     * result is presumably terminated by a separator.
     */
    private static String rootDir() {
        return FileIO.makeDirPath(DESKTOP, DOMAIN);
    }

    /**
     * Reads the category names, i.e. the text before the first {@code ": "}
     * of every non-blank line of the category resource.
     */
    private static List<String> readCategories() throws Exception {
        List<String> lines = FileIO.readAsList(FileIO.getResource(CATEGORY_RESOURCE));

        List<String> categories = new ArrayList<>();
        for (String line : lines) {
            if (line == null || line.trim().isEmpty()) {
                continue;
            }
            categories.add(line.split(": ")[0]);
        }
        return categories;
    }

    /**
     * Lists the children of {@code dir}, returning an empty array (and logging
     * the path) when {@code dir} is missing, unreadable or not a directory.
     */
    private static File[] listOrEmpty(File dir) {
        File[] children = dir.listFiles();
        if (children == null) {
            Logs.debug("Skipping missing or unreadable directory: " + dir.getPath());
            return new File[0];
        }
        return children;
    }

    /**
     * Converts one {@code td.prodInfo} element into an {@code id::name::link}
     * line.
     *
     * @return the line, or {@code null} when the element lacks the expected
     *         name/link markup or the product has no user reviews
     */
    private static String parseProduct(Element product) {
        Element nameNode = product.select("p.prodName").first();
        if (nameNode == null) {
            return null;
        }
        Element anchor = nameNode.select("a").first();
        if (anchor == null) {
            return null;
        }

        String name = nameNode.text();
        String link = anchor.attr("href");
        String id = link.substring(link.lastIndexOf("_") + 1);

        // number of user reviews, rendered on the page as "(n)"
        Element ratingNode = product.select("p.prodRating").first();
        if (ratingNode == null) {
            return null;
        }
        String cnt = ratingNode.select(".userReviewsCount").text().replace("(", "")
                .replace(")", "");
        int count = cnt.isEmpty() ? 0 : Integer.parseInt(cnt);

        // movies without any reviews are not considered
        return count > 0 ? id + "::" + name + "::" + link : null;
    }

    /**
     * Scans one saved "friends" page and appends the id of every trusted user
     * whose link is known to {@code friends}.
     *
     * @param page
     *            saved HTML page
     * @param users
     *            map from user link to user id
     * @param friends
     *            output list receiving the ids found
     */
    private static void collectFriends(File page, Map<String, String> users, List<String> friends)
            throws Exception {
        Document doc = Jsoup.parse(FileIO.readAsString(page.getPath()));

        Element trustTable = doc.select("form table.trust").first();
        if (trustTable == null) {
            return;
        }

        for (Element tr : trustTable.select("tbody tr")) {
            Elements tds = tr.select("td");
            if (tds.size() < 2) {
                continue;
            }

            // the link to the trusted user is taken from the second cell
            Element a = tds.get(1).select("a").first();
            if (a == null) {
                continue;
            }

            String friend = users.get(a.attr("href"));
            if (friend != null) {
                friends.add(friend);
            }
        }
    }

    /**
     * Returns the integer id of {@code key}, assigning the next consecutive
     * id (current map size + 1) on first sight.
     */
    private static Integer idFor(Map<String, Integer> ids, String key) {
        Integer id = ids.get(key);
        if (id == null) {
            id = ids.size() + 1;
            ids.put(key, id);
        }
        return id;
    }

    /** Logs a skipped input line; blank lines are skipped silently. */
    private static void logMalformed(String source, String line) {
        if (line != null && !line.trim().isEmpty()) {
            Logs.debug("Skipping malformed line in " + source + ": " + line);
        }
    }
}