package happy.research.data;
import happy.coding.io.FileIO;
import happy.coding.io.Logs;
import happy.coding.io.Strings;
import happy.coding.io.net.WebCrawler;
import happy.coding.system.Systems;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls movie pages and comment pages from gewara.com and extracts data from the saved copies.
 *
 * <p>
 * An instance is bound to one URL and one task name; {@link #run_thread()} dispatches on the task.
 * Downloaded pages are stored below {@link #dir}, one sub-directory per movie title.
 * {@link #parse_overall_ratings()} and {@link #main(String[])} only read previously saved pages.
 */
public class GewaraCrawler extends WebCrawler
{
    /** Prefix used to turn the site-relative navigation link into an absolute URL. */
    private static final String SITE = "http://www.gewara.com";

    /** Selector of the movie title heading on a movie page. */
    private static final String TITLE_SELECTOR = "div.detail_head_name h1";

    /** Selector of the navigation links; the first one ends with the movie id. */
    private static final String NAV_LINK_SELECTOR = "#detail_nav li a";

    /** Directory processed by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_DATA_DIR = "D:/Dropbox/PhD/My Work/Ongoing/Data Crawl/gewara.com/";

    /** Root output directory: the desktop path reported by {@code Systems} followed by {@code gewara.com/}. */
    String dir = Systems.getDesktop() + "gewara.com/";

    /** Name of the task executed by {@link #run_thread()}. */
    String task = null;

    /**
     * Creates a crawler for one URL and makes sure the root output directory exists.
     *
     * @param url  page to crawl, passed to the superclass
     * @param task task name, see {@link #run_thread()}
     * @param id   crawler id, passed to the superclass
     * @throws Exception if the superclass constructor or the directory creation fails
     */
    public GewaraCrawler(String url, String task, int id) throws Exception
    {
        super(url, id);
        FileIO.makeDirectory(dir);
        this.task = task;
    }

    /**
     * Downloads a movie page and stores it as {@code <dir>/<title>/<title>.html}, where the title is
     * taken from the page heading and filtered through {@code Strings.filterWebString}.
     *
     * @param url address of the movie page
     * @throws Exception if the page cannot be read or written
     * @throws IllegalStateException if the page has no title heading
     */
    public void crawl_web_pages(String url) throws Exception
    {
        String html = read_url(url);
        String name = movieFolderName(Jsoup.parse(html));

        String dirPath = dir + name + "/";
        FileIO.makeDirectory(dirPath);
        FileIO.writeString(dirPath + name + ".html", html);
    }

    /**
     * Parses every saved movie page below {@link #dir} and writes the extracted {@code GewaraMovie}
     * (via its {@code toString()}) to {@code summary.txt} in the same movie directory.
     *
     * <p>
     * Entries that are not directories, and directories that do not contain a page named after the
     * directory itself, are skipped.
     *
     * @throws Exception if a page cannot be read, parsed or the summary cannot be written
     */
    public void parse_overall_ratings() throws Exception
    {
        File[] movieDirs = new File(dir).listFiles();
        if (movieDirs == null)
        {
            // listFiles() yields null when the path is not a readable directory
            Logs.info("Cannot list directory: " + dir);
            return;
        }

        for (File movieDir : movieDirs)
        {
            String file = movieDir.getPath() + "/" + movieDir.getName() + ".html";
            // folders created by crawl_comments alone hold no movie page
            if (!movieDir.isDirectory() || !FileIO.exist(file)) continue;

            Document doc = Jsoup.parse(FileIO.readAsString(file));
            GewaraMovie gm = parseMovie(doc);
            FileIO.writeString(movieDir.getPath() + "/summary.txt", gm.toString());
        }
    }

    /**
     * Downloads all comment pages of the movie shown at {@code url} into
     * {@code <dir>/<title>/comments/page_<n>.html}. Pages already on disk are not fetched again.
     *
     * <p>
     * The request parameter {@code pageNumber} starts at 0 while the file names start at 1. The index
     * of the last page is read from the pagination links of the first page.
     *
     * @param url address of the movie page
     * @throws Exception if a page cannot be read or written
     * @throws IllegalStateException if the movie page lacks the title heading or the navigation link
     */
    public void crawl_comments(String url) throws Exception
    {
        Document doc = Jsoup.parse(read_url(url));
        String name = movieFolderName(doc);
        String id = movieId(navHref(doc));

        String dirPath = dir + name + "/comments/";
        FileIO.makeDirectory(dirPath);

        // save rating pages
        int max = 0;
        boolean maxSet = false;
        for (int k = 0; k <= max; k++)
        {
            String page_file = dirPath + "page_" + (k + 1) + ".html";

            String contents;
            if (FileIO.exist(page_file))
            {
                contents = FileIO.readAsString(page_file);
            } else
            {
                String link = "http://www.gewara.com/ajax/common/qryComment.xhtml?pageNumber="
                        + k
                        + "&relatedid="
                        + id
                        + "&title=&issue=false&hasMarks=true&tag=movie&isPic=true&isVideo=false&pages=true&maxCount=20&userLogo=";
                contents = read_url(link);
                FileIO.writeString(page_file, contents);
            }

            // the first page tells how many pages there are
            if (!maxSet)
            {
                max = lastPageIndex(contents);
                maxSet = true;
            }

            // logged after the page count is known so that the total is accurate
            Logs.debug(name + " comments with page: " + (k + 1) + "/" + (max + 1));
        }
    }

    /**
     * Converts the saved comment pages of every movie directory into one {@code comments.csv} per
     * movie. Each row holds user, user URL, rating, time, forward text, reply text and comment,
     * separated by {@code ",\t"}. An existing {@code comments.csv} is deleted first.
     *
     * @param args optional; {@code args[0]} is the data directory, otherwise a built-in default path
     *             is used
     * @throws Exception if a page cannot be read or the CSV cannot be written
     */
    public static void main(String[] args) throws Exception
    {
        String dir_path = (args != null && args.length > 0) ? args[0] : DEFAULT_DATA_DIR;
        String sep = ",\t";

        File[] movies = new File(dir_path).listFiles();
        if (movies == null)
        {
            Logs.info("Cannot list directory: " + dir_path);
            return;
        }

        for (File movie : movies)
        {
            if (!movie.isDirectory()) continue;

            String movie_path = movie.getPath();
            File[] pages = new File(movie_path + "/comments/").listFiles();
            // no comments crawled for this movie: leave any existing CSV untouched
            if (pages == null) continue;

            String file = movie_path + "/comments.csv";
            FileIO.deleteFile(file);

            int total = pages.length;
            for (File page : pages)
            {
                Logs.debug("Current page: " + page.getName() + "/" + total);

                Document doc = Jsoup.parse(FileIO.readAsString(page.getPath()));
                Elements entries = doc.select("div.ui_wala_comment dl");

                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < entries.size(); i++)
                {
                    // rows are separated by, not terminated with, a line break
                    if (i > 0) sb.append("\n");
                    sb.append(commentRow(entries.get(i), sep));
                }
                FileIO.writeString(file, sb.toString(), true);
            }
        }
    }

    /**
     * Runs the configured tasks over all URLs listed in the resource {@code gewara.txt}, crawling the
     * URLs in batches of four threads and waiting for each batch to finish before starting the next.
     * Blank lines in the URL list are skipped.
     *
     * @throws Exception if the URL list cannot be read, a crawler cannot be created or the waiting
     *                   thread is interrupted
     */
    public static void crawl_data() throws Exception
    {
        String filePath = FileIO.getResource("gewara.txt");
        List<String> urls = FileIO.readAsList(filePath);

        String[] tasks = { "comments" };
        int nd = 4;

        for (String task : tasks)
        {
            Logs.info("Current task: " + task);

            for (int start = 0; start < urls.size(); start += nd)
            {
                int end = Math.min(start + nd, urls.size());
                List<Thread> batch = new ArrayList<Thread>(nd);

                for (int idx = start; idx < end; idx++)
                {
                    String url = urls.get(idx).trim();
                    if (url.isEmpty()) continue;

                    // crawler ids are the 1-based position of the URL in the list
                    Thread worker = new Thread(new GewaraCrawler(url, task, idx + 1));
                    worker.start();
                    batch.add(worker);
                }

                for (Thread worker : batch)
                    worker.join();
            }
        }
    }

    /**
     * Executes the task this crawler was created with: {@code "web_pages"} saves the movie page,
     * {@code "comments"} saves the comment pages. {@code "ratings"}, {@code "reviews"}, any other
     * name and a missing task do nothing.
     *
     * @throws Exception if the selected crawl fails
     */
    @Override
    public void run_thread() throws Exception
    {
        // switching on a null string would throw; treat it like an unknown task
        if (task == null) return;

        switch (task)
        {
            case "web_pages":
                crawl_web_pages(url);
                break;
            case "comments":
                crawl_comments(url);
                break;
            case "ratings":
            case "reviews":
            default:
                // nothing to do for these tasks
                break;
        }
    }

    /** Returns the first element below {@code root} matching {@code css}, or fails naming the selector. */
    private static Element requireFirst(Element root, String css)
    {
        Element found = root.select(css).first();
        if (found == null) throw new IllegalStateException("No element matches selector: " + css);
        return found;
    }

    /** Returns the page title filtered through {@code Strings.filterWebString} with {@code '_'}. */
    private static String movieFolderName(Document doc) throws Exception
    {
        return Strings.filterWebString(requireFirst(doc, TITLE_SELECTOR).text(), '_');
    }

    /** Returns the {@code href} of the first navigation link of a movie page. */
    private static String navHref(Document doc)
    {
        return requireFirst(doc, NAV_LINK_SELECTOR).attr("href");
    }

    /** Returns the part of {@code href} after its last {@code '/'}. */
    private static String movieId(String href)
    {
        return href.substring(href.lastIndexOf("/") + 1);
    }

    /** Returns the string form of the first sibling node of {@code label}, or null if it has none. */
    private static String firstSiblingText(Element label)
    {
        List<?> siblings = label.siblingNodes();
        return siblings.isEmpty() ? null : siblings.get(0).toString();
    }

    /**
     * Returns the string form of the node following {@code anchor} without its first character, or
     * an empty string when there is no such node or it is empty.
     */
    private static String textAfter(Element anchor)
    {
        Object next = anchor.nextSibling();
        if (next == null) return "";

        String raw = next.toString();
        // NOTE(review): the leading character is presumably a separator before the comment - confirm
        return raw.isEmpty() ? "" : raw.substring(1);
    }

    /** Splits {@code value} around {@code regex} into a new mutable list. */
    private static List<String> splitToList(String value, String regex)
    {
        List<String> parts = new ArrayList<String>();
        for (String part : value.split(regex))
            parts.add(part);
        return parts;
    }

    /** Builds a {@code GewaraMovie} from a saved movie page. */
    private static GewaraMovie parseMovie(Document doc) throws Exception
    {
        GewaraMovie gm = new GewaraMovie();
        gm.setName(requireFirst(doc, TITLE_SELECTOR).text());
        gm.setSub_name(requireFirst(doc, "div.detail_head_name strong").attr("title"));

        for (Element row : requireFirst(doc, "div.detail_head_desc").select("p"))
            applyInfoRow(gm, row);

        String description = firstSiblingText(requireFirst(doc, "div#showDown_content p b"));
        if (description != null) gm.setDescription(description);

        String href = navHref(doc);
        gm.setGm_url(SITE + href);
        gm.setId(movieId(href));

        gm.setAvg_rating(Double.parseDouble(requireFirst(doc, "div.detail_movieTypeBotm div span").text()));

        // the rating count is shown in parentheses inside the link
        Element ratingsLink = requireFirst(doc, "a:contains(哇啦)");
        String count = requireFirst(ratingsLink, "span").text().replace("(", "").replace(")", "");
        gm.setNum_ratings(Integer.parseInt(count));

        return gm;
    }

    /**
     * Copies one labelled paragraph of the movie description box into {@code gm}. Paragraphs without
     * an {@code <em>} label, with an unknown label or without a value are ignored.
     */
    private static void applyInfoRow(GewaraMovie gm, Element row) throws Exception
    {
        Element label = row.select("em").first();
        if (label == null) return;

        String key = label.text();
        if (key.startsWith("看点")) // highlights
        {
            Element span = row.select("span").first();
            if (span != null) gm.setMain_point(span.text());
            return;
        }

        // every other field keeps its value in the first sibling node of the label
        String value = firstSiblingText(label);
        if (value == null) return;

        if (key.startsWith("首映")) gm.setReleseDates(value); // premiere
        else if (key.startsWith("片长")) gm.setLength(value); // running time
        else if (key.startsWith("版本")) gm.setVersion(value); // version
        else if (key.startsWith("导演")) gm.setDirector(value); // director
        else if (key.startsWith("语言")) gm.setLanguage(splitToList(value, "/")); // languages
        else if (key.startsWith("地区")) gm.setCountries(splitToList(value, "/")); // regions
        else if (key.startsWith("类型")) gm.setTypes(splitToList(value, "/")); // genres
        else if (key.startsWith("主演")) gm.setActors(splitToList(value, " ")); // leading actors
    }

    /**
     * Reads the index of the last comment page from the {@code lang} attribute of the second-to-last
     * pagination link. Returns 0 when the page has fewer than two pagination links.
     */
    private static int lastPageIndex(String firstPageHtml)
    {
        Elements links = Jsoup.parse(firstPageHtml).select("div#page a");
        if (links.size() < 2) return 0;
        return Integer.parseInt(links.get(links.size() - 2).attr("lang"));
    }

    /** Builds one CSV row from a single comment entry of a saved comment page. */
    private static String commentRow(Element entry, String sep)
    {
        Element head = requireFirst(entry, "div.page_wala p");
        Element userLink = requireFirst(head, "a");
        String user = userLink.text();
        String user_url = userLink.attr("href");

        // the rating is optional; the comment text follows it, or the user link when it is absent
        Element grade = head.select("span.ui_grades").first();
        String rate = "";
        String comment;
        if (grade != null)
        {
            rate = grade.text();
            comment = textAfter(grade);
        } else
        {
            comment = textAfter(userLink);
        }

        Element footer = requireFirst(entry, "div.page_replay.page_replay_my.clear");
        String time = requireFirst(entry, "span.left").text();
        String forward = requireFirst(footer, "a.page_ico.forwards").text();
        String reply = requireFirst(footer, "a.page_ico.comment").text();

        return user + sep + user_url + sep + rate + sep + time + sep + forward + sep + reply + sep + comment;
    }
}