package happy.research.data; import happy.coding.io.FileIO; import happy.coding.io.Logs; import happy.coding.io.net.WebCrawler; import happy.coding.system.Debug; import happy.coding.system.Systems; import java.io.File; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class CiaoCrawler extends WebCrawler { protected static String folder; protected static String sep = ",\t"; protected String dir; private String task; private String domain = "dvd.ciao.co.uk"; private String desktop = Systems.getDesktop(); CiaoCrawler(String url, String task, int id) throws Exception { super(url, id); dir = folder + domain; dir = FileIO.makeDirectory(dir); this.task = task; } public CiaoCrawler() throws Exception { this(null, null, 0); } public void run_reviews(String url) throws Exception { String[] data = url.split(": "); String category = data[0]; String catPath = FileIO.makeDirPath(desktop, domain, category); File Dir = new File(catPath); File[] prodDirs = Dir.listFiles(); int tk = prodDirs.length; for (int k = 0; k < tk; k++) { File prodDir = prodDirs[k]; // for each sub directory if (prodDir.getName().equals("webPages")) continue; if (!prodDir.isDirectory()) continue; String prodPath = FileIO.makeDirPath(catPath, prodDir.getName()); List<String> reviews = FileIO.readAsList(prodPath + "reviews.txt"); String reviewPath = FileIO.makeDirectory(prodPath, "Detailed_Reviews"); int tv = reviews.size(); for (int v = 0; v < tv; v++) { String rv = reviews.get(v); String[] rd = rv.split("::"); String id = rd[0]; String link = rd[1]; String rvPath = FileIO.makeDirectory(reviewPath, id); String rvFilePath = rvPath + id + ".html"; String rvRatingPath = rvPath + id + "-ratings.html"; String html = null; if (FileIO.exist(rvFilePath)) { if (FileIO.exist(rvRatingPath)) { Logs.debug(category + ": " + prodDir.getName() + " (" + (k + 1) + "/" + tk + "): review " + (v + 1) + "/" + tv); continue; } html = FileIO.readAsString(rvFilePath); } else { html = read_url(link); if (html == null) continue; FileIO.writeString(rvFilePath, html); } Document doc = Jsoup.parse(html); Element e = doc.select( "div#CompTooltip2 div.CWMSReviewRatings p a.CWLINKSub") .first(); if (e == null) continue; // this review has not been rated by others; String js = e.attr("onmouseover"); String predix = "javascript:this.href=jlinkBuild("; int start = predix.length(); int end = js.lastIndexOf(")"); String parse = js.substring(start, end); parse = parse.replace("'", ""); parse = parse.replace(",", ""); String href = "http://dvd.ciao.co.uk" + parse; html = read_url(href); FileIO.writeString(rvRatingPath, html); Logs.debug(category + ": " + prodDir.getName() + " (" + (k + 1) + "/" + tk + "): review " + (v + 1) + "/" + tv); } } } public void run_ratings(String url) throws Exception { String[] data = url.split(": "); String category = data[0]; String catPath = FileIO.makeDirPath(desktop, domain, category); File Dir = new File(catPath); File[] prodDirs = Dir.listFiles(); int tk = prodDirs.length; for (int k = 0; k < tk; k++) { File prodDir = prodDirs[k]; // for each product if (prodDir.getName().equals("webPages")) continue; if (!prodDir.isDirectory()) continue; String reviewPath = FileIO.makeDirPath(catPath, prodDir.getName(), "Detailed_Reviews"); File reviewDir = new File(reviewPath); File[] reviewDirs = reviewDir.listFiles(); for (int i = 0; i < reviewDirs.length; i++) { File reviewFile = reviewDirs[i]; String name = reviewFile.getName(); Logs.debug("{}: {} ({}/{}): {} ({}/{})", category, prodDir.getName(), (k + 1), tk, name, (i + 1), reviewDirs.length); String dirPath = FileIO.makeDirPath(reviewPath, name); String ratingPath = dirPath + name + "-ratings.html"; if (!FileIO.exist(ratingPath)) continue; // no ratings String html = FileIO.readAsString(ratingPath); Document doc = Jsoup.parse(html); Elements as = doc.select("a.CWLINKSub.CWMSFontSizeSmaller"); if (as == null || as.size() == 0) continue; // no further review rater pages for (Element a : as) { if (a.hasAttr("id")) // back to ratings/review continue; String link = "http://dvd.ciao.co.uk" + a.attr("href"); html = read_url(link); int rating = Integer.parseInt(link.substring(link .lastIndexOf('=') + 1)); int page = 1; String valuePath = dirPath + "ratings-" + rating + "-"; FileIO.writeString(valuePath + page + ".html", html); // multiple pages Document doc2 = Jsoup.parse(html); Elements ns = doc2.select("div#Pagination"); if (ns != null && ns.size() > 0) { Element ps = ns.select("li").last(); int numPages = Integer.parseInt(ps.text()); if (numPages <= 1) continue; // for debug purpose Logs.info(valuePath + numPages); // generate url ps = ns.select("li a").first(); String pLink = ps.attr("href"); pLink = pLink.substring(0, pLink.lastIndexOf('/') + 1); for (int p = 2; p <= numPages; p++) { String ppLink = pLink + (p - 1) * 15; html = read_url(ppLink); FileIO.writeString(valuePath + p + ".html", html); } } } } } } public void run_dvd_ratings(String url) throws Exception { String[] data = url.split(": "); String category = data[0]; String category_url = data[1]; category_url = category_url.substring(0, category_url.lastIndexOf('_')); String categoryID = category_url.substring(category_url .lastIndexOf('_') + 1); String catPath = FileIO.makeDirPath(desktop, domain, category); File Dir = new File(catPath); File[] prodDirs = Dir.listFiles(); int tk = prodDirs.length; for (int k = 0; k < tk; k++) { File prodDir = prodDirs[k]; // for each product String productID = prodDir.getName(); if (productID.equals("webPages")) continue; if (!prodDir.isDirectory()) continue; String prodPath = FileIO.makeDirPath(catPath, productID); String reviewPath = FileIO .makeDirPath(prodPath, "Detailed_Reviews"); String dvdPath = prodPath + "dvd-ratings.txt"; FileIO.deleteFile(dvdPath); File reviewDir = new File(reviewPath); File[] reviewDirs = reviewDir.listFiles(); List<String> reviews = new ArrayList<>(); for (int i = 0; i < reviewDirs.length; i++) { // for each review File reviewFile = reviewDirs[i]; String reviewID = reviewFile.getName(); Logs.debug("{}: {} ({}/{}): {} ({}/{})", category, productID, (k + 1), tk, reviewID, (i + 1), reviewDirs.length); String dirPath = FileIO.makeDirPath(reviewPath, reviewID); String ratingPath = dirPath + reviewID + ".html"; if (!FileIO.exist(ratingPath)) continue; // review page is not existing String html = FileIO.readAsString(ratingPath); Document doc = Jsoup.parse(html); Element div = doc.select("div#OH_BingUserInfo").first(); if (div == null) continue; // no user review exists // user-info Element a = div.select("p.m-reer-usertab.clearfix a.black") .first(); String raw = a.attr("onmousedown"); raw = raw.substring(raw.indexOf("(") + 1, raw.lastIndexOf(")")); String userUrl = raw.replace(",", "").replace("'", ""); String userID = userUrl.substring(userUrl.lastIndexOf('_') + 1); // user-rating value Element r = div.select( "p.m-reer-usertab.clearfix img.ratingStars").first(); String rating = r.attr("alt"); // user-rating date div = doc.select("div#OH_BingUserOpinion").first(); Element date = div .select("div.m-reer-opheader.reviewTitle span.m-reer-ddwrap span[property]") .first(); String datetime = date.attr("content"); // content String review = userID + "," + productID + "," + categoryID + "," + reviewID + "," + rating + "," + datetime + "," + userUrl; reviews.add(review); } FileIO.writeList(dvdPath, reviews); } } public void run_review_ratings(String url) throws Exception { String[] data = url.split(": "); String category = data[0]; String catPath = FileIO.makeDirPath(desktop, domain, category); File Dir = new File(catPath); File[] prodDirs = Dir.listFiles(); int tk = prodDirs.length; for (int k = 0; k < tk; k++) { File prodDir = prodDirs[k]; // for each product String productID = prodDir.getName(); if (productID.equals("webPages")) continue; if (!prodDir.isDirectory()) continue; String prodPath = FileIO.makeDirPath(catPath, productID); String reviewPath = FileIO .makeDirPath(prodPath, "Detailed_Reviews"); String dvdPath = prodPath + "review-ratings.txt"; FileIO.deleteFile(dvdPath); File reviewDir = new File(reviewPath); File[] reviewDirs = reviewDir.listFiles(); for (int i = 0; i < reviewDirs.length; i++) { // for each review File reviewFile = reviewDirs[i]; String reviewID = reviewFile.getName(); Logs.debug("{}: {} ({}/{}): {} ({}/{})", category, productID, (k + 1), tk, reviewID, (i + 1), reviewDirs.length); String reviewDirPath = FileIO.makeDirPath(reviewPath, reviewID); File ratingDir = new File(reviewDirPath); File[] ratingDirs = ratingDir.listFiles(); int fileNum = ratingDirs.length; if (fileNum == 0) continue; // no any rating files List<String> reviews = new ArrayList<>(); for (int t = 0; t < fileNum; t++) { File ratingFile = ratingDirs[t]; String name = ratingFile.getName(); if (name.equals(reviewID + ".html")) continue; // review page // ratings to the review String html = FileIO.readAsString(ratingFile.getPath()); Document doc = Jsoup.parse(html); Elements divs = doc.select("div.CWMSRatingBlock"); for (Element div : divs) { String text = div.text(); int rating = 0; if (text.startsWith("not helpful")) rating = 1; else if (text.startsWith("somewhat helpful")) rating = 2; else if (text.startsWith("helpful")) rating = 3; else if (text.startsWith("very helpful")) rating = 4; else if (text.startsWith("exceptional")) rating = 5; else if (text.startsWith("off topic")) rating = 0; // for each user - rating Elements lis = div.select("li.clearfix"); for (Element li : lis) { Element a = li.select("a.CWLINKSub").first(); String link = a.attr("href"); String userID = link.substring(link .lastIndexOf('_') + 1); String review = userID + "," + reviewID + "," + rating + "," + link; reviews.add(review); } } } FileIO.writeList(dvdPath, reviews, null, true); } } } public void run_category_reviews(String url) throws Exception { String[] data = url.split(": "); String category = data[0]; String catPath = FileIO.makeDirPath(desktop, domain, category); File Dir = new File(catPath); File[] prodDirs = Dir.listFiles(); int tk = prodDirs.length; String movie_reviews = catPath + "movie-review-ratings.txt"; FileIO.deleteFile(movie_reviews); for (int k = 0; k < tk; k++) { File prodDir = prodDirs[k]; // for each product String productID = prodDir.getName(); if (productID.equals("webPages")) continue; if (!prodDir.isDirectory()) continue; String prodPath = FileIO.makeDirPath(catPath, productID); String dvdPath = prodPath + "review-ratings.txt"; if (!FileIO.exist(dvdPath)) continue; // no review ratings // read review ratings from each product, and remove the duplicated // review ratings Set<String> review_ratings = FileIO.readAsSet(dvdPath); FileIO.writeList(movie_reviews, review_ratings, null, true); } } /** * Concate all the dvd ratings about the products in a specific category * * @param url * @throws Exception */ public void run_category_ratings(String url) throws Exception { String[] data = url.split(": "); String category = data[0]; String catPath = FileIO.makeDirPath(desktop, domain, category); File Dir = new File(catPath); File[] prodDirs = Dir.listFiles(); int tk = prodDirs.length; String ratingFile = catPath + "movie-ratings.txt"; FileIO.deleteFile(ratingFile); for (int k = 0; k < tk; k++) { File prodDir = prodDirs[k]; // for each product String productID = prodDir.getName(); if (productID.equals("webPages")) continue; if (!prodDir.isDirectory()) continue; Logs.debug("{}: {} ({}/{})", new Object[] { category, productID, (k + 1), tk }); String prodPath = FileIO.makeDirPath(catPath, productID); String dvdPath = prodPath + "dvd-ratings.txt"; if (!FileIO.exist(dvdPath)) continue; List<String> dvd_ratings = FileIO.readAsList(dvdPath); FileIO.writeList(ratingFile, dvd_ratings, null, true); } } public void run_category_clean(String url) throws Exception { String[] data = url.split(": "); String category = data[0]; String catPath = FileIO.makeDirPath(desktop, domain, category); String ratingFile = catPath + "movie-ratings.txt"; String reviewFile = catPath + "movie-review-ratings.txt"; String newRatingFile = catPath + "ratings.txt"; String newReviewFile = catPath + "review-ratings.txt"; String userFile = catPath + "users.txt"; Map<String, String> users = new HashMap<>(); String user = null, link = null, newLine = null; // ratings List<String> ratings = FileIO.readAsList(ratingFile); List<String> newRatings = new ArrayList<>(ratings.size()); for (String line : ratings) { String[] d = line.split(","); user = d[0]; link = d[6]; users.put(user, link); newLine = line.substring(0, line.lastIndexOf(',')); newRatings.add(newLine); } FileIO.writeList(newRatingFile, newRatings); ratings = null; newRatings = null; // reviews List<String> reviews = FileIO.readAsList(reviewFile); List<String> newReviews = new ArrayList<>(reviews.size()); for (String line : reviews) { String[] d = line.split(","); user = d[0]; link = d[3]; users.put(user, link); newLine = line.substring(0, line.lastIndexOf(',')); newReviews.add(newLine); } FileIO.writeList(newReviewFile, newReviews); reviews = null; newReviews = null; // users FileIO.writeMap(userFile, users); } public void run_user(String url) throws Exception { String[] data = url.split(","); String userID = data[0]; String userUrl = data[1]; String userPath = FileIO.makeDirPath(desktop, domain, "users.ciao.co.uk"); String html = read_url(userUrl); // check if user exists now Document doc = Jsoup.parse(html); Elements tabs = doc.select("table.tabs"); if (tabs == null || tabs.size() == 0) return; // no such user profile userPath = FileIO.makeDirectory(userPath, userID); FileIO.writeString(userPath + userID + ".html", html); // trusted neighbors String link = "http://www.ciao.co.uk/member_view.php/MemberId/" + userID + "/TabId/5/subTabId/1"; html = read_url(link); // find the max pages doc = Jsoup.parse(html); Element page = doc .select("table#comparePricesShowAllTop td.rangepages").first(); if (page == null) return; // no friends at all FileIO.writeString(userPath + "friends-1.html", html); if (page.text().length() <= 1) return; // no more pages Element a = page.select("a").last(); int maxPage = Integer.parseInt(a.text()); for (int i = 2; i <= maxPage; i++) { String nextPage = link + "/Start/" + (i - 1) * 15; html = read_url(nextPage); FileIO.writeString(userPath + "friends-" + i + ".html", html); } } public static void crawl_data() throws Exception { if (Debug.OFF) { // home page CiaoCrawler cc = new CiaoCrawler(); cc.run_home_page(); return; } String sourceFile = "users.txt"; // "dvd.ciao.txt" String filePath = FileIO.getResource(sourceFile); List<String> urls = FileIO.readAsList(filePath); String[] tasks = { "users" }; int nd = 8; for (String task : tasks) { Logs.info("Current task: " + task); for (int i = 0; i < urls.size(); i += nd) { Thread[] tds = new Thread[nd]; boolean flag = false; for (int j = 0; j < nd; j++) { if (i + j >= urls.size()) { flag = true; break; } String url = urls.get(i + j).trim(); tds[j] = new Thread(new CiaoCrawler(url, task, i + j + 1)); tds[j].start(); } for (Thread td : tds) { if (td != null) td.join(); } if (flag) break; } } } public static void main(String[] args) throws Exception { CiaoCrawler.setFolder(Systems.getDesktop()); CiaoCrawler.sleep = 5000; crawl_data(); } @Override public void run_thread() throws Exception { switch (task) { case "web_pages": run_web_pages(url); break; case "products": run_products(url); break; case "reviews": run_reviews(url); break; case "ratings": run_ratings(url); break; case "dvd-ratings": run_dvd_ratings(url); break; case "category-ratings": run_category_ratings(url); break; case "review-ratings": run_review_ratings(url); break; case "category-reviews": run_category_reviews(url); break; case "category-clean": run_category_clean(url); break; case "users": run_user(url); break; default: break; } } private void run_home_page() throws Exception { String url = "http://dvd.ciao.co.uk/"; String html = read_url(url); FileIO.writeString(dir + "dvd.ciao.html", html); Document doc = Jsoup.parse(html); Element categories = doc.getElementById("category_tree_table"); Elements cs = categories.select("dl"); List<String> cls = new ArrayList<>(); for (Element c : cs) { Element cat = c.select("dt").first().select("a").first(); String category = cat.text(); String link = cat.attr("href"); cls.add(category + ": " + link); } FileIO.writeList(dir + "dvd.ciao.txt", cls); } public void run_web_pages(String url) throws Exception { String[] data = url.split(": "); String category = data[0]; String link = data[1]; String dirPath = FileIO.makeDirectory(dir, category, "webPages"); int pageSize = 15; String html = read_url(link); FileIO.writeString(dirPath + "page_" + 1 + ".html", html); Document doc = Jsoup.parse(html); int maxPage = Integer.parseInt(doc.select( "div.CWCiaoKievPagination.clearfix li.last").text()); Logs.debug(category + ": progress [" + 1 + "/" + maxPage + "]"); for (int i = 2; i <= maxPage; i++) { String pageLink = link + "~s" + (i - 1) * pageSize; String content = read_url(pageLink); FileIO.writeString(dirPath + "page_" + i + ".html", content); Logs.debug(category + ": progress [" + i + "/" + maxPage + "]"); } } public void run_products(String url) throws Exception { String[] data = url.split(": "); String category = data[0]; // String link = data[1]; String dirPath = FileIO.makeDirPath(desktop, domain, category); List<String> links = FileIO.readAsList(dirPath + "movies.txt"); int tk = links.size(); for (int k = 0; k < tk; k++) { String link = links.get(k); String[] d = link.split("::"); String id = d[0]; String name = d[1]; String productLink = d[2]; int idx = productLink.lastIndexOf("/"); String p1 = productLink.substring(0, idx) + "/Reviews"; String reviewLink = p1 + productLink.substring(idx); // create folder String path = FileIO.makeDirectory(dirPath, id); // product page String html = null; String pagePath = path + id + ".html"; if (!FileIO.exist(pagePath)) { html = read_url(productLink); FileIO.deleteFile(path + name + ".html"); FileIO.writeString(pagePath, html); } // product reviews // get first page anyway to identify the maximum pages path = FileIO.makeDirectory(path, "Reviews"); String reviewPath = path + "page_1.html"; if (FileIO.exist(reviewPath)) { html = FileIO.readAsString(reviewPath); } else { html = read_url(reviewLink); FileIO.writeString(reviewPath, html); } Logs.debug(category + ": " + id + " (" + (k + 1) + "/" + tk + ")" + ": page " + 1); Document doc = Jsoup.parse(html); Elements nav = doc.select("div#Pagination"); if (!nav.isEmpty()) { int maxPage = 1; Elements last = nav.select("li.last"); if (!last.isEmpty()) maxPage = Integer.parseInt(last.first().text()); // more // than // 11 // pages else maxPage = Integer.parseInt(nav.select("li").last().text()); // less // or // equal // 11 // pages for (int i = 2; i <= maxPage; i++) { String filePath = path + "page_" + i + ".html"; if (FileIO.exist(filePath)) continue; reviewLink = reviewLink + "/Start/" + ((i - 1) * 15); html = read_url(reviewLink); FileIO.writeString(filePath, html); Logs.debug(category + ": " + id + " (" + (k + 1) + "/" + tk + ")" + ": page " + i + "/" + maxPage); } } } } public static void setFolder(String folder) { CiaoCrawler.folder = folder; } }