/*******************************************************************************
* Copyright 2015 htd0324@gmail.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.laudandjolynn.mytv.crawler.epg;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.laudandjolynn.mytv.crawler.AbstractCrawler;
import com.laudandjolynn.mytv.event.AllTvStationCrawlEndEvent;
import com.laudandjolynn.mytv.event.CrawlEventListener;
import com.laudandjolynn.mytv.event.ProgramTableCrawlEndEvent;
import com.laudandjolynn.mytv.event.ProgramTableFoundEvent;
import com.laudandjolynn.mytv.event.TvStationFoundEvent;
import com.laudandjolynn.mytv.model.ProgramTable;
import com.laudandjolynn.mytv.model.TvStation;
import com.laudandjolynn.mytv.utils.Constant;
import com.laudandjolynn.mytv.utils.DateUtils;
import com.laudandjolynn.mytv.utils.MyTvUtils;
import com.laudandjolynn.mytv.utils.WebCrawler;
/**
 * Crawler for the CNTV electronic program guide (EPG) page at
 * {@link #EPG_URL}. It extracts the list of TV stations and, for a given
 * station and date, that station's program table. Crawled HTML is handed to
 * {@code MyTvUtils.outputCrawlData}, and the station list page is re-read
 * from the crawl file when that file already exists.
 *
 * @author: Laud
 * @email: htd0324@gmail.com
 * @date: 2015-03-28 00:00:44
 * @copyright: www.laudandjolynn.com
 */
public class EpgCrawler extends AbstractCrawler {
    private static final Logger logger = LoggerFactory
            .getLogger(EpgCrawler.class);
    // URL of the CNTV program guide page
    private static final String EPG_URL = "http://tv.cntv.cn/epg";
    private static final String EPG_NAME = "epg";
    // source of the sequence numbers assigned to discovered stations
    private static final AtomicInteger SEQUENCE = new AtomicInteger(200000);
    // label of the "city" classification tab; also used as the classify
    // value of city stations
    private static final String CITY = "城市";
    // XPath of the station links that are not listed under a city
    private static final String STATION_XPATH = "//div[@class='md_left_right']/dl//h3//a[@class='channel']";
    // XPath of the station links listed under a city
    private static final String CITY_STATION_XPATH = "//dl[@id='cityList']//div[@class='lv3']//a[@class='channel']";

    @Override
    public String getCrawlerName() {
        return EPG_NAME;
    }

    @Override
    public String getUrl() {
        return EPG_URL;
    }

    /**
     * Returns all TV stations found on the EPG page.
     * <p>
     * If the crawl file already exists it is parsed instead of hitting the
     * network; in that case no {@link AllTvStationCrawlEndEvent} is fired.
     * Otherwise the page is crawled, stored, parsed, and every listener is
     * notified with the resulting list.
     *
     * @return the parsed stations, or {@code null} if the existing crawl
     *         file could not be read
     */
    @Override
    public List<TvStation> crawlAllTvStation() {
        String epgFile = getCrawlFilePath();
        if (new File(epgFile).exists()) {
            try {
                return parseTvStation(MyTvUtils.readAsHtml(epgFile));
            } catch (IOException e) {
                // keep the historical null result, but do not hide the cause
                logger.error("error occur while reading crawl file: "
                        + epgFile, e);
                return null;
            }
        }

        HtmlPage htmlPage = (HtmlPage) WebCrawler.crawl(getUrl());
        String html = htmlPage.asXml();
        MyTvUtils.outputCrawlData(getCrawlerName(), html, getCrawlFileName());
        List<TvStation> stationList = parseTvStation(html);
        for (CrawlEventListener listener : listeners) {
            listener.crawlEnd(new AllTvStationCrawlEndEvent(this, stationList));
        }
        return stationList;
    }

    /**
     * Crawls the program table of a TV station for a given date and notifies
     * every listener with a {@link ProgramTableCrawlEndEvent} (also when the
     * crawl produced {@code null}).
     *
     * @param date
     *            the date, formatted as yyyy-MM-dd
     * @param station
     *            the TV station
     * @return the program table, or {@code null} if an argument is
     *         {@code null} or the table could not be crawled
     */
    @Override
    public List<ProgramTable> crawlProgramTable(String date, TvStation station) {
        if (station == null || date == null) {
            logger.debug("station name or date is null.");
            return null;
        }
        List<ProgramTable> ptList = crawlProgramTable(station, date);
        for (CrawlEventListener listener : listeners) {
            listener.crawlEnd(new ProgramTableCrawlEndEvent(this, ptList,
                    station.getName(), date));
        }
        return ptList;
    }

    /**
     * Tells whether the given station is listed on the EPG page.
     * <p>
     * The existing crawl file is used when present; otherwise the page is
     * crawled and stored first. Stations without a city are looked up among
     * the general station links, stations with a city among the city station
     * links.
     *
     * @param station
     *            the station to look for
     * @return {@code true} if a link with the station's name is found;
     *         {@code false} otherwise, including when {@code station} or its
     *         name is {@code null} or the crawl file cannot be read
     */
    @Override
    public boolean exists(TvStation station) {
        if (station == null || station.getName() == null) {
            return false;
        }
        String city = station.getCity();
        String stationName = station.getName();
        String epgFile = getCrawlFilePath();

        if (new File(epgFile).exists()) {
            String html;
            try {
                html = MyTvUtils.readAsHtml(epgFile);
            } catch (IOException e) {
                logger.warn("error occur while reading crawl file: "
                        + epgFile, e);
                return false;
            }
            Document doc = Jsoup.parse(html);
            String selector = city == null ? "div.md_left_right dl h3 a.channel"
                    : "dl#cityList div.lv3 a.channel";
            for (Element element : doc.select(selector)) {
                if (stationName.equals(element.text())) {
                    return true;
                }
            }
            return false;
        }

        HtmlPage htmlPage = (HtmlPage) WebCrawler.crawl(EPG_URL);
        MyTvUtils.outputCrawlData(getCrawlerName(), htmlPage.asXml(),
                getCrawlFileName());
        return findStationAnchor(htmlPage, city, stationName) != null;
    }

    /**
     * Finds the link of a station on a crawled EPG page.
     *
     * @param htmlPage
     *            the crawled EPG page
     * @param city
     *            the station's city, or {@code null} to search the general
     *            station links
     * @param stationName
     *            the station name to match against the trimmed link text
     * @return the first matching link, or {@code null} if none matches or
     *         {@code stationName} is {@code null}
     */
    private HtmlAnchor findStationAnchor(HtmlPage htmlPage, String city,
            String stationName) {
        if (stationName == null) {
            return null;
        }
        String xpath = city == null ? STATION_XPATH : CITY_STATION_XPATH;
        List<?> candidates = htmlPage.getByXPath(xpath);
        for (Object candidate : candidates) {
            HtmlAnchor anchor = (HtmlAnchor) candidate;
            if (stationName.equals(anchor.getTextContent().trim())) {
                return anchor;
            }
        }
        return null;
    }

    /**
     * Crawls the program table of the given station for the given date.
     * <p>
     * The EPG page is crawled, the station's link is clicked and, when the
     * requested date is not today, the date input is filled and the search
     * link is clicked. The resulting page is stored and parsed.
     *
     * @param station
     *            the TV station
     * @param date
     *            the date, formatted as yyyy-MM-dd
     * @return the parsed program table, or {@code null} if the arguments are
     *         invalid, the station is not listed, the date search controls
     *         are missing, or clicking a link fails
     */
    private List<ProgramTable> crawlProgramTable(TvStation station, String date) {
        if (station == null) {
            logger.debug("station and html page must not null.");
            return null;
        }
        Date dateObj = DateUtils.string2Date(date, "yyyy-MM-dd");
        if (dateObj == null) {
            logger.debug("date must not null.");
            return null;
        }
        String stationName = station.getName();
        String queryDate = DateUtils.date2String(dateObj, "yyyy-MM-dd");
        logger.info("crawl program table of " + stationName + " at "
                + queryDate);

        HtmlPage htmlPage = (HtmlPage) WebCrawler.crawl(EPG_URL);
        HtmlAnchor stationAnchor = findStationAnchor(htmlPage,
                station.getCity(), stationName);
        if (stationAnchor == null) {
            logger.info(stationName + " isn't exists at " + getCrawlerName());
            return null;
        }
        try {
            htmlPage = stationAnchor.click();
        } catch (IOException e) {
            logger.error("error occur while search program table of "
                    + stationName + " at spec date: " + queryDate, e);
            return null;
        }

        if (!queryDate.equals(DateUtils.today())) {
            DomElement dateInput = htmlPage.getElementById("date");
            List<?> searchAnchors = htmlPage
                    .getByXPath("//div[@id='search_1']/a");
            if (dateInput == null || searchAnchors.isEmpty()) {
                // the page layout is not what this crawler expects
                logger.error("date search controls not found while search program table of "
                        + stationName + " at spec date: " + queryDate);
                return null;
            }
            // put the requested date into the date element, then trigger
            // the search link
            dateInput.setAttribute("readonly", "false");
            dateInput.setAttribute("value", queryDate);
            dateInput.setNodeValue(queryDate);
            dateInput.setTextContent(queryDate);
            HtmlAnchor searchAnchor = (HtmlAnchor) searchAnchors.get(0);
            try {
                htmlPage = searchAnchor.click();
            } catch (IOException e) {
                logger.error("error occur while search program table of "
                        + stationName + " at spec date: " + queryDate, e);
                return null;
            }
        }

        String html = htmlPage.asXml();
        MyTvUtils.outputCrawlData(queryDate, html, queryDate
                + Constant.UNDERLINE + getCrawlerName() + Constant.UNDERLINE
                + stationName);
        return parseProgramTable(html);
    }

    /**
     * Parses the TV station list out of the EPG page.
     * <p>
     * The classification links ({@code ul.weishi a[href]}) and the station
     * containers ({@code div.md_left_right}) are paired by position; the
     * classification named {@link #CITY} is skipped there and handled
     * separately through the {@code dl#cityList} entries, where each station
     * also gets its city name. A {@link TvStationFoundEvent} is fired for
     * every station.
     *
     * @param html
     *            the EPG page markup
     * @return the stations found, possibly empty
     */
    private List<TvStation> parseTvStation(String html) {
        Document doc = Jsoup.parse(html);
        Elements classifyElements = doc.select("ul.weishi a[href]");
        Elements stationElements = doc.select("div.md_left_right");
        List<TvStation> resultList = new ArrayList<TvStation>();

        // never index past the shorter of the two position-paired lists
        int pairCount = Math.min(classifyElements.size(),
                stationElements.size());
        for (int i = 0; i < pairCount; i++) {
            String classify = classifyElements.get(i).text().trim();
            if (CITY.equals(classify)) {
                continue;
            }
            Elements channelElements = stationElements.get(i).select(
                    "dl h3 a.channel");
            for (Element channelElement : channelElements) {
                resultList.add(createStation(channelElement.text().trim(),
                        null, classify));
            }
        }

        for (Element cityElement : stationElements.select("dl#cityList dd")) {
            Elements cityNameElements = cityElement.select("h3 a[href]");
            if (cityNameElements.isEmpty()) {
                logger.debug("city entry without a city name link is skipped.");
                continue;
            }
            String cityName = cityNameElements.get(0).text().trim();
            for (Element channelElement : cityElement
                    .select("div.lv3 p a.channel")) {
                resultList.add(createStation(channelElement.text().trim(),
                        cityName, CITY));
            }
        }
        return resultList;
    }

    /**
     * Builds a station with the next sequence number and announces it to
     * every listener through a {@link TvStationFoundEvent}.
     *
     * @param name
     *            the station name
     * @param city
     *            the station's city, or {@code null}
     * @param classify
     *            the station's classification
     * @return the new station
     */
    private TvStation createStation(String name, String city, String classify) {
        TvStation tv = new TvStation();
        tv.setName(name);
        tv.setCity(city);
        tv.setClassify(classify);
        tv.setSequence(SEQUENCE.incrementAndGet());
        for (CrawlEventListener listener : listeners) {
            listener.itemFound(new TvStationFoundEvent(this, tv));
        }
        return tv;
    }

    /**
     * Parses a program table page.
     * <p>
     * The station name comes from {@code #channelTitle}; the air date and
     * the week value come from the {@code #week li[rel]} item carrying the
     * {@code cur} class (its 1-based position and its {@code rel}
     * attribute). Each program link's text is split at the first run of
     * whitespace into a time part and the program name, so names that
     * themselves contain whitespace are kept whole. A
     * {@link ProgramTableFoundEvent} is fired for every entry.
     *
     * @param html
     *            the program table page markup
     * @return the entries found; empty if the page has no channel title
     */
    private List<ProgramTable> parseProgramTable(String html) {
        Document doc = Jsoup.parse(html);
        List<ProgramTable> resultList = new ArrayList<ProgramTable>();

        Elements channelElements = doc.select("#channelTitle");
        if (channelElements.isEmpty()) {
            logger.warn("channel title not found, no program table is parsed.");
            return resultList;
        }
        String stationName = channelElements.get(0).text().trim();

        Elements weekElements = doc.select("#week li[rel]");
        int week = 0;
        String date = null;
        for (int i = 0, size = weekElements.size(); i < size; i++) {
            Element weekElement = weekElements.get(i);
            if (weekElement.hasClass("cur")) {
                week = i + 1;
                date = weekElement.attr("rel").trim();
                break;
            }
        }

        Elements programElements = doc.select(
                "#epg_list div.content_c dl dd").select(
                "a.p_name_a, a.p_name");
        for (Element programElement : programElements) {
            String programContent = programElement.text().trim();
            // limit 2: everything after the time belongs to the program name
            String[] pc = programContent.split("\\s+", 2);
            if (pc.length < 2) {
                logger.debug("skip program entry without time and name: "
                        + programContent);
                continue;
            }
            ProgramTable pt = new ProgramTable();
            pt.setAirDate(date);
            pt.setAirTime(date + " " + pc[0] + ":00");
            pt.setProgram(pc[1]);
            pt.setStationName(stationName);
            pt.setWeek(week);
            for (CrawlEventListener listener : listeners) {
                listener.itemFound(new ProgramTableFoundEvent(this, pt));
            }
            resultList.add(pt);
        }
        return resultList;
    }

    /**
     * Returns the path of the file in which the crawled EPG page is stored.
     *
     * @return {@code Constant.CRAWL_FILE_PATH}, the crawler name, a file
     *         separator and the crawl file name, concatenated
     */
    private String getCrawlFilePath() {
        return Constant.CRAWL_FILE_PATH + getCrawlerName() + File.separator
                + getCrawlFileName();
    }

    /**
     * Returns the name of the file in which the crawled EPG page is stored.
     *
     * @return the crawler name
     */
    private String getCrawlFileName() {
        return getCrawlerName();
    }
}