package edu.fudan.weixin.crawler.actions;
import java.util.ArrayList;
import java.util.List;
import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Element;
import org.apache.struts2.convention.annotation.Action;
import org.apache.struts2.convention.annotation.Namespace;
import org.apache.struts2.convention.annotation.ParentPackage;
import org.apache.struts2.convention.annotation.Result;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import edu.fudan.eservice.common.utils.CommonUtil;
import edu.fudan.eservice.common.utils.EncodeHelper;
import edu.fudan.eservice.common.utils.MongoUtil;
import edu.fudan.weixin.entity.News;
import edu.fudan.weixin.utils.AttributeRegexFilter;
@ParentPackage("servicebase")
@Namespace("/crawler")
public class SudyPageAction extends CrawlerBase {
/**
*
*/
private static final long serialVersionUID = 1266884952585718508L;
protected String domain;
protected String listid;
@SuppressWarnings({ "rawtypes", "unchecked" })
@Action(value = "sdlist", results = { @Result(type = "json", params = {
"root", "list" }) })
public String list() {
Cache c = CacheManager.getInstance().getCache("News");
String ckey =domain+listid + page;
Element ele = c.get(ckey);
if (!CommonUtil.isEmpty(ele)) {
list = (List) ele.getObjectValue();
} else {
StringBuffer retstr = fetch(domain+"/"+listid+"/list"
+ page+".htm");
Parser p = Parser.createParser(retstr.toString(), "utf-8");
list = new ArrayList<News>();
try {
NodeList ls = p
.extractAllNodesThatMatch(new AttributeRegexFilter(
"href", ".*/page\\.htm"));
SimpleNodeIterator i = ls.elements();
while (i.hasMoreNodes()) {
Node n = i.nextNode();
if (n instanceof TagNode) {
TagNode tn = (TagNode) n;
News news = new News();
String href = tn.getAttribute("href");
news.setId(href);
news.setTitle(tn.getAttribute("alt"));
Node tmp=tn.getParent().getNextSibling();
while(tmp!=null &&!(tmp instanceof TableColumn))
tmp=tmp.getNextSibling();
if(tmp!=null)
news.setPubdate(tmp.toPlainTextString());
list.add(news);
}
}
c.put(new Element(ckey, list));
} catch (ParserException e) {
e.printStackTrace();
}
}
jsonp(list);
return NONE;
}
@Action(value = "sdcontent", results = { @Result(type = "json", params = {
"root", "en" }) })
public String content() {
Cache c = CacheManager.getInstance().getCache("News");
String ckey = domain+ newsid;
Element ele = c.get(ckey);
if (!CommonUtil.isEmpty(ele)) {
en = (News) ele.getObjectValue();
} else {
StringBuffer retstr = fetch(domain
+ newsid);
Parser p = Parser.createParser(retstr.toString(), "utf-8");
try {
NodeList nl = p.extractAllNodesThatMatch(new OrFilter(new NodeFilter[]{
new HasAttributeFilter("class","Article_Title"), new HasAttributeFilter("class",
"Article_PublishDate"),new HasAttributeFilter("class","Article_Content")}));
SimpleNodeIterator i = nl.elements();
en = new News();
en.setId(newsid);
while (i.hasMoreNodes()) {
Node n = i.nextNode();
if (n instanceof TagNode) {
TagNode tn = (TagNode) n;
if (tn.getAttribute("class").equalsIgnoreCase("Article_Title"))
en.setTitle(tn.toPlainTextString());
if (tn.getAttribute("class").equalsIgnoreCase("Article_PublishDate"))
en.setPubdate(tn.toPlainTextString());
if (tn.getAttribute("class").equalsIgnoreCase("Article_Content")) {
NodeList ls = new NodeList();
tn.collectInto(ls, new NodeClassFilter(
ImageTag.class));
SimpleNodeIterator j = ls.elements();
DBCollection col = MongoUtil.getInstance().getDB()
.getCollection("CrawlerImages");
while (j.hasMoreNodes()) {
ImageTag it = (ImageTag) j.nextNode();
it.removeAttribute("width");
it.removeAttribute("height");
it.removeAttribute("style");
it.setAttribute("class", "img-responsive");
String srcstr=it.extractImageLocn();
if(!srcstr.startsWith("http"))
{
if(srcstr.startsWith("/"))
srcstr=domain+srcstr;
else
{
int tk=newsid.lastIndexOf("/");
if(tk>0)
{
srcstr=domain+newsid.substring(0,tk+1)+srcstr;
}else
{
srcstr=domain+srcstr;
}
}
}
String imgid = EncodeHelper.digest(
srcstr, "MD5");
BasicDBObject obj = new BasicDBObject("id",
imgid);
DBObject dbo = col.findOne(obj);
if (dbo == null)
col.save(obj.append("url",srcstr));
it.setImageURL("crawler/image.act?id=" + imgid);
}
ls=new NodeList();
tn.collectInto(ls, new NodeClassFilter(LinkTag.class));
j = ls.elements();
while (j.hasMoreNodes()) {
LinkTag link=(LinkTag)j.nextNode();
String linkstr=link.extractLink();
if(link.isHTTPLikeLink()&&!linkstr.startsWith("http")){
if(linkstr.startsWith("/"))
link.setLink(domain+link.extractLink());
else
{
int tk=newsid.lastIndexOf("/");
if(tk>0)
{
link.setLink(domain+newsid.substring(0,tk+1)+linkstr);
}else
{
link.setLink(domain+linkstr);
}
}
}
}
ls=new NodeList();
tn.collectInto(ls, new HasAttributeFilter("class","wp_pdf_player"));
j = ls.elements();
while (j.hasMoreNodes()) {
TagNode pdf=(TagNode)j.nextNode();
String pdfurl=pdf.getAttribute("pdfsrc");
pdf.removeAttribute("flexpaper");
pdf.removeAttribute("swsrc");
pdf.removeAttribute("pdfsrc");
pdf.setTagName("a");
pdf.setAttribute("href",domain+pdfurl);
NodeList tnl=new NodeList();
tnl.add(new TextNode("PDF正文"));
pdf.setChildren(tnl);
}
en.setContent(tn.toHtml());
}
}
}
} catch (ParserException e) {
e.printStackTrace();
}
if (!CommonUtil.isEmpty(en) && !CommonUtil.isEmpty(en.getContent()))
c.put(new Element(ckey, en));
}
jsonp(en);
return NONE;
}
public String getDomain() {
return domain;
}
public void setDomain(String domain) {
this.domain = domain;
}
public String getListid() {
return listid;
}
public void setListid(String listid) {
this.listid = listid;
}
}