package happy.research.data;
import happy.coding.io.FileIO;
import happy.coding.io.Logs;
import happy.coding.io.Strings;
import happy.coding.io.net.WebCrawler;
import happy.coding.system.Debug;
import happy.coding.system.Systems;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class DoubanCrawler extends WebCrawler
{
protected static String folder;
protected static String sep = ",\t";
protected String dir;
private String task;
DoubanCrawler(String url, String task, int id) throws Exception
{
super(url, id);
dir = folder + "douban.com/";
FileIO.makeDirectory(dir);
this.task = task;
}
public DoubanCrawler() throws Exception
{
this(null, null, 0);
}
public void run_reviews(String url) throws Exception
{
url = url.trim();
String html = read_url(url);
Document doc = Jsoup.parse(html);
String name = doc.select("span[property=v:itemreviewed]").text();
name = Strings.filterWebString(name, '_');
String dirPath = dir + name + "/reviews/";
FileIO.makeDirectory(dirPath);
// save rating pages
int k = 0;
url = url + "reviews";
String link = url;
while (true)
{
k++;
String page = null;
String path = dirPath + "page_" + k + ".html";
if (!FileIO.exist(path))
{
page = read_url(link);
FileIO.writeString(path, page);
Logs.debug(name + " reviews with page: " + k);
} else
{
page = FileIO.readAsString(path);
}
// find the next page link;
Document doc2 = Jsoup.parse(page);
Elements es = doc2.select("div#paginator a.next");
if (es == null || es.size() == 0)
{
break;
} else
{
link = url + es.first().attr("href");
}
}
}
public void parse_ratings() throws Exception
{
File directory = new File(dir);
File[] movies = directory.listFiles();
// different movies
for (File movie : movies)
{
if (!movie.isDirectory()) continue;
String movie_path = movie.getPath();
File ratings = new File(movie_path + "/ratings/");
// different pages
String rating_file = movie_path + "/ratings.csv";
FileIO.deleteFile(rating_file);
for (File page : ratings.listFiles())
{
Document doc = Jsoup.parse(FileIO.readAsString(page.getPath()));
Elements es = doc.select("div#collections_tab .sub_ins table");
StringBuilder sb = new StringBuilder();
for (int i = 0; i < es.size(); i++)
{
Element e = es.get(i);
String line = "";
Element user = e.select("div.pl2 a").first();
Element rate = e.select("p.pl").first();
line += user.text() + sep; // user
if (rate.select("span").size() == 0) continue; // if no rating, ignore it
line += rate.select("span").first().attr("class").substring(7, 8) + sep; // rating
line += rate.text().substring(0, 10) + sep; // time
line += user.attr("href"); // user url
if (i < es.size() - 1) line += "\n";
sb.append(line);
}
FileIO.writeString(rating_file, sb.toString(), true);
}
}
}
public void parse_reviews() throws Exception
{
File directory = new File(dir);
File[] movies = directory.listFiles();
// different movies
for (File movie : movies)
{
if (!movie.isDirectory()) continue;
String movie_path = movie.getPath();
File ratings = new File(movie_path + "/reviews/");
// different pages
String rating_file = movie_path + "/reviews.csv";
FileIO.deleteFile(rating_file);
for (File page : ratings.listFiles())
{
Document doc = Jsoup.parse(FileIO.readAsString(page.getPath()));
Elements es = doc.select("div.ctsh");
StringBuilder sb = new StringBuilder();
for (int i = 0; i < es.size(); i++)
{
Element e = es.get(i);
Element li = e.select("li.nlst").first();
String detail_url = li.select("a.j.a_unfolder").first().attr("href");
String title = li.select("a[title]").first().attr("title");
li = e.select("li.ilst").first();
String user = li.select("a").first().attr("title");
String user_url = li.select("a").first().attr("href");
li = e.select("li.clst.report-link").first();
String rate = li.select("span.pl.ll.obss").first().child(1).attr("class").substring(7, 8);
String review = li.select("div.review-short").first().childNode(0).toString().substring(1);
String str = li.select("span.fleft").first().text();
String[] val = str.replace(" ", " ").split(" ");
String datetime = val[0] + " " + val[1];
String helpful = val[4].substring(0, val[4].indexOf("有用"));
String line = title + sep + rate + sep + user + sep + user_url + sep + datetime + sep + helpful
+ sep + detail_url + sep + review;
if (i < es.size() - 1) line += "\n";
sb.append(line);
}
FileIO.writeString(rating_file, sb.toString(), true);
}
}
}
public void parse_comments() throws Exception
{
File directory = new File(dir);
File[] movies = directory.listFiles();
// different movies
for (File movie : movies)
{
if (!movie.isDirectory()) continue;
String movie_path = movie.getPath();
File ratings = new File(movie_path + "/comments/");
// different pages
String rating_file = movie_path + "/comments.csv";
FileIO.deleteFile(rating_file);
for (File page : ratings.listFiles())
{
Logs.debug("current page: " + page.getName() + "/" + ratings.listFiles().length);
Document doc = Jsoup.parse(FileIO.readAsString(page.getPath()));
Elements es = doc.select("div.comment-item");
StringBuilder sb = new StringBuilder();
for (int i = 0; i < es.size(); i++)
{
Element e = es.get(i).select("div.comment").first();
Element li = e.select("span.comment-vote").first();
String helpful = li.select("span.votes.pr5").first().text();
li = e.select("span.comment-info").first();
String user = li.select("a").first().text();
String user_url = li.select("a").first().attr("href");
Element eli = li.select("span[title]").first();
String rate = "";
String date = null;
if (eli != null)
{
rate = eli.attr("class").substring(7, 8);
date = li.childNode(4).toString();
} else
{
date = li.childNode(2).toString();
}
String comment = e.select("p").first().text();
String line = user + sep + user_url + sep + rate + sep + date + sep + helpful + sep + comment;
if (i < es.size() - 1) line += "\n";
sb.append(line);
}
FileIO.writeString(rating_file, sb.toString(), true);
}
}
}
public void parse_web_pages() throws Exception
{
File directory = new File(dir);
File[] dirs = directory.listFiles();
for (File d : dirs)
{
if (!d.isDirectory()) continue;
String fname = d.getName();
String file = d.getPath() + "/" + fname + ".html";
Document doc = Jsoup.parse(FileIO.readAsString(file));
DoubanMovie dm = new DoubanMovie();
String name = doc.select("span[property=v:itemreviewed]").text();
dm.setName(name);
String year = doc.select("span.year").text();
year = year.substring(1, year.lastIndexOf(')'));
dm.setYear(Integer.parseInt(year));
Element info = doc.select("div#info").get(0);
Element e = info.select("span:contains(制片国家/地区)").first();
String val = e.nextSibling().toString();
List<String> vals = new ArrayList<>();
for (String str : val.split("/"))
vals.add(str.trim());
dm.setCountries(vals);
val = info.select("span:contains(语言)").first().nextSibling().toString();
vals = new ArrayList<>();
for (String str : val.split("/"))
vals.add(str.trim());
dm.setLanguage(vals);
Elements es = info.select("span:contains(官方网站)");
if (es != null && es.size() > 0)
{
e = es.first().nextElementSibling();
dm.setOfficial_url(e.attr("href"));
}
e = info.select("span:contains(IMDb链接)").first().nextElementSibling();
dm.setImdb_url(e.attr("href"));
es = info.select("span[property=v:genre]");
vals = new ArrayList<>();
for (Element ex : es)
vals.add(ex.text());
dm.setTypes(vals);
es = info.select("a[rel=v:starring]");
vals = new ArrayList<>();
for (Element ex : es)
vals.add(ex.text());
dm.setActors(vals);
es = info.select("span:contains(又名)");
if (es != null && es.size() > 0)
{
val = es.first().nextSibling().toString();
vals = new ArrayList<>();
for (String str : val.split("/"))
vals.add(str.trim());
dm.setAlias(vals);
}
dm.setReleseDates(info.select("span[property=v:initialReleaseDate").first().attr("content"));
dm.setLength(info.select("span[property=v:runtime").first().text());
dm.setDirector(info.select("a[rel=v:directedBy]").first().text());
e = info.select("span:contains(编剧)").first();
vals = new ArrayList<>();
for (Element ex : e.select("a"))
vals.add(ex.text());
dm.setScenarist(vals);
dm.setAvg_rating(Double.parseDouble(doc.select("strong[property=v:average]").text()));
dm.setNum_ratings(Integer.parseInt(doc.select("span[property=v:votes]").first().text()));
e = doc.select("div[rel=v:rating]").first();
vals = new ArrayList<>();
for (String str : e.ownText().split(" "))
vals.add(str);
dm.setRatio_rates(vals);
val = doc.select("meta[http-equiv=mobile-agent]").first().attr("content");
val = val.substring(val.indexOf("url=") + 4);
dm.setDouban_url(val);
val = val.replace("http://m.douban.com/movie/subject/", "").replace("/", "");
dm.setId(val);
dm.setDescription(doc.select("span[property=v:summary]").first().text());
FileIO.writeString(d.getPath() + "/summary.txt", dm.toString());
}
}
public static void parse_data() throws Exception
{
DoubanCrawler dc = new DoubanCrawler();
String[] tasks = { "comments" };
for (String task : tasks)
{
switch (task)
{
case "web_pages":
dc.parse_web_pages();
break;
case "ratings":
dc.parse_ratings();
break;
case "reviews":
dc.parse_reviews();
break;
case "comments":
dc.parse_comments();
break;
default:
break;
}
}
}
public static void crawl_data() throws Exception
{
String filePath = FileIO.getResource("douban.txt");
List<String> urls = FileIO.readAsList(filePath);
String[] tasks = { "reviews" };
int nd = 4;
for (String task : tasks)
{
Logs.info("Current task: " + task);
for (int i = 0; i < urls.size(); i += nd)
{
Thread[] tds = new Thread[nd];
boolean flag = false;
for (int j = 0; j < nd; j++)
{
if (i + j >= urls.size())
{
flag = true;
break;
}
String url = urls.get(i + j).trim();
tds[j] = new Thread(new DoubanCrawler(url, task, i + j + 1));
tds[j].start();
}
for (Thread td : tds)
{
if (td != null) td.join();
}
if (flag) break;
}
}
}
public static void main(String[] args) throws Exception
{
if (Debug.OFF)
{
DoubanCrawler.setFolder(Systems.getDesktop());
crawl_data();
} else
{
DoubanCrawler.setFolder("D:/Dropbox/PhD/My Work/Ongoing/Data Crawl/");
parse_data();
}
}
@Override
public void run_thread() throws Exception
{
switch (task)
{
case "web_pages":
run_web_pages(url);
break;
case "ratings":
run_ratings(url);
break;
case "comments":
run_comments(url);
break;
case "reviews":
run_reviews(url);
break;
default:
break;
}
}
public void run_comments(String url) throws Exception
{
url = url.trim();
String html = read_url(url);
Document doc = Jsoup.parse(html);
String name = doc.select("span[property=v:itemreviewed]").text();
name = Strings.filterWebString(name, '_');
String dirPath = dir + name + "/comments/";
FileIO.makeDirectory(dirPath);
// save rating pages
int k = 0;
url = url + "comments";
String link = url;
while (true)
{
k++;
String page_file = dirPath + "page_" + k + ".html";
String contents = null;
if (!FileIO.exist(page_file))
{
contents = read_url(link);
FileIO.writeString(page_file, contents);
Logs.debug(name + " comments with page: " + k);
} else
{
contents = FileIO.readAsString(page_file);
}
// find the next page link;
Document doc2 = Jsoup.parse(contents);
Elements es = doc2.select("div#paginator a.next");
if (es == null || es.size() == 0)
{
break;
} else
{
link = url + es.first().attr("href");
}
}
}
public void run_web_pages(String url) throws Exception
{
String html = read_url(url);
Document doc = Jsoup.parse(html);
String name = doc.select("span[property=v:itemreviewed]").text();
name = Strings.filterWebString(name, '_');
String dirPath = dir + name + "/";
FileIO.makeDirectory(dirPath);
FileIO.writeString(dirPath + name + ".html", html);
}
public void run_ratings(String url) throws Exception
{
String html = read_url(url);
Document doc = Jsoup.parse(html);
String name = doc.select("span[property=v:itemreviewed]").text();
name = Strings.filterWebString(name, '_');
String dirPath = dir + name + "/ratings/";
FileIO.makeDirectory(dirPath);
// save rating pages
int k = 0;
while (true)
{
String link = url + "collections?start=" + (k * 20);
String page = read_url(link);
k++;
FileIO.writeString(dirPath + "page_" + k + ".html", page);
Logs.debug("Current processing page: " + k);
// if finished;
Document doc2 = Jsoup.parse(page);
Elements es = doc2.select("div#collections_tab span.next");
if (es == null || es.size() == 0)
{
break;
}
}
}
public static void setFolder(String folder)
{
DoubanCrawler.folder = folder;
}
}