package railo.runtime.search.lucene2.html;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import org.ccil.cowan.tagsoup.Parser;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;
import railo.commons.io.IOUtil;
import railo.commons.io.res.Resource;
import railo.commons.io.res.util.ResourceUtil;
import railo.commons.lang.StringUtil;
import railo.runtime.exp.PageException;
import railo.runtime.type.util.ListUtil;
/**
 * SAX handler that extracts the searchable parts of an HTML document: the
 * title, the text content, the charset announced by a meta tag and a fixed
 * set of meta values (description, keywords, author, custom1..custom4).
 *
 * The document is read through a TagSoup {@link Parser} instance created via
 * {@link XMLReaderFactory}; this object registers itself as content and error
 * handler of that reader.
 *
 * The {@code parse} methods are synchronized and reset all extracted values
 * before reading, so an instance can be reused for several documents. The
 * getters are not synchronized; read them after {@code parse} has returned.
 */
public final class HTMLParser extends Parser {

    private final XMLReader xmlReader;

    private String title;
    private String charset;
    /** text collected since the last element end that flushed it */
    private StringBuilder current;
    /** text of all elements flushed so far, one entry per line */
    private StringBuilder content;
    /** true once a charset was taken from a meta tag during the current parse */
    private boolean hasChanged;
    private String strContent;
    /** top of the stack of "ignore character data" flags, never null */
    private Silent silent = new Silent(null, false);

    private String description;
    private String keywords;
    private String author;
    private String custom1;
    private String custom2;
    private String custom3;
    private String custom4;

    /**
     * Creates the underlying TagSoup reader and registers this object as its
     * content and error handler.
     *
     * @throws IllegalStateException if the TagSoup reader cannot be created
     */
    public HTMLParser() {
        XMLReader reader;
        try {
            reader = XMLReaderFactory.createXMLReader(Parser.class.getName());
        }
        catch (SAXException e) {
            // fail with the real cause instead of a NullPointerException on the next line
            throw new IllegalStateException("cannot create XMLReader [" + Parser.class.getName() + "]", e);
        }
        xmlReader = reader;
        xmlReader.setContentHandler(this);
        xmlReader.setErrorHandler(this);
    }

    /**
     * Parses an HTML file.
     *
     * @param file file to parse
     * @param charset charset used to read the file
     * @throws IOException
     * @throws SAXException
     */
    public synchronized void parse(File file, String charset) throws IOException, SAXException {
        parse(ResourceUtil.toResource(file), charset);
    }

    /**
     * Parses an HTML resource. The reader opened on the resource is always
     * closed, whether parsing succeeds or not.
     *
     * @param res resource to parse
     * @param charset charset used to read the resource
     * @throws IOException
     * @throws SAXException
     */
    public synchronized void parse(Resource res, String charset) throws IOException, SAXException {
        resetState(charset);
        Reader reader = IOUtil.getReader(res, charset);
        try {
            InputSource source = new InputSource(reader);
            source.setSystemId(res.toString());
            xmlReader.parse(source);
        }
        finally {
            IOUtil.closeEL(reader);
        }
        strContent = content.toString();
    }

    /**
     * Parses HTML from a reader. No charset is known in this case, so the
     * charset is taken from the document's meta tags if one is declared there.
     * The reader is always closed, whether parsing succeeds or not.
     *
     * @param reader source of the HTML text
     * @throws IOException
     * @throws SAXException
     */
    public synchronized void parse(Reader reader) throws IOException, SAXException {
        resetState(null);
        try {
            xmlReader.parse(new InputSource(reader));
        }
        finally {
            IOUtil.closeEL(reader);
        }
        strContent = content.toString();
    }

    /**
     * Clears everything extracted by a previous parse so that values of an
     * earlier document cannot show up in the result of the next one.
     *
     * @param charset charset the caller reads the document with, may be null
     */
    private void resetState(String charset) {
        this.charset = charset;
        title = "";
        description = null;
        keywords = null;
        author = null;
        custom1 = null;
        custom2 = null;
        custom3 = null;
        custom4 = null;
        current = new StringBuilder();
        content = new StringBuilder();
        strContent = null;
        hasChanged = false;
        // a parse aborted by an exception may have left entries on the stack
        silent = new Silent(null, false);
    }

    /**
     * Tracks script/body nesting and evaluates meta tags.
     *
     * NOTE(review): text inside style elements is not suppressed and therefore
     * ends up in the content - confirm whether that is intended.
     */
    @Override
    public void startElement(String uri, String name, String qName, Attributes atts) throws SAXException {
        if ("script".equalsIgnoreCase(name)) {
            // character data of scripts is not collected
            silent = new Silent(silent, true);
        }
        else if ("body".equalsIgnoreCase(name)) {
            silent = new Silent(silent, false);
        }
        else if ("meta".equalsIgnoreCase(name)) {
            doMeta(atts);
            // only look for a charset when the caller did not provide one
            if (!hasChanged && charset == null) detectCharset(atts);
        }
    }

    /**
     * Reads the charset from a meta tag, supported forms are
     * {@code <meta charset="utf-8">} and
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=utf-8">}.
     *
     * @param atts attributes of the meta tag
     */
    private void detectCharset(Attributes atts) {
        String direct = atts.getValue("charset");
        if (!StringUtil.isEmpty(direct, true)) {
            charset = direct.trim();
            hasChanged = true;
            return;
        }

        if (atts.getValue("http-equiv") == null) return;
        String value = atts.getValue("content");
        if (value == null) return;

        try {
            String[] arr = ListUtil.toStringArray(ListUtil.listToArrayRemoveEmpty(value, ';'));
            for (int i = 0; i < arr.length; i++) {
                String el = arr[i];
                // a token without '=' has no value that could be a charset
                if (el.indexOf('=') == -1) continue;
                String n = ListUtil.first(el, "=", true).trim();
                String v = ListUtil.last(el, "=", true).trim();
                if (n.equalsIgnoreCase("charset")) {
                    // no break: when declared more than once the last declaration wins
                    charset = v;
                    hasChanged = true;
                }
            }
        }
        catch (PageException e) {
            // best effort: a content attribute that cannot be split declares no charset
        }
    }

    /**
     * Stores the content of the meta tags this parser knows about.
     *
     * @param atts attributes of the meta tag
     */
    private void doMeta(Attributes atts) {
        String name = atts.getValue("name");
        if (name == null) return;
        // equalsIgnoreCase does not depend on the default locale, toLowerCase() does
        name = name.trim();

        if ("description".equalsIgnoreCase(name)) description = atts.getValue("content");
        else if ("keywords".equalsIgnoreCase(name)) keywords = atts.getValue("content");
        else if ("author".equalsIgnoreCase(name)) author = atts.getValue("content");
        else if ("custom1".equalsIgnoreCase(name)) custom1 = atts.getValue("content");
        else if ("custom2".equalsIgnoreCase(name)) custom2 = atts.getValue("content");
        else if ("custom3".equalsIgnoreCase(name)) custom3 = atts.getValue("content");
        else if ("custom4".equalsIgnoreCase(name)) custom4 = atts.getValue("content");
    }

    /**
     * Leaves a script/body scope and flushes the text collected so far: text
     * flushed at the end of a title element becomes the title, everything else
     * is appended to the content followed by a line feed.
     */
    @Override
    public void endElement(String uri, String name, String qName) {
        if ("script".equalsIgnoreCase(name) || "body".equalsIgnoreCase(name)) {
            // never pop the root entry, characters() relies on silent being set
            if (silent.parent != null) silent = silent.parent;
        }

        String c = current.toString().trim();
        if (c.length() > 0) {
            if ("title".equalsIgnoreCase(name)) title = c;
            else content.append(c).append('\n');
            current = new StringBuilder();
        }
    }

    /**
     * Collects character data unless the current scope is silent.
     */
    @Override
    public void characters(char ch[], int start, int length) {
        if (!silent.value) current.append(ch, start, length);
    }

    /**
     * @return Returns the content, null when no parse has completed successfully.
     */
    public String getContent() {
        return strContent;
    }

    /**
     * @return Returns the title, an empty string when the document has none.
     */
    public String getTitle() {
        return title;
    }

    /**
     * @return Returns the charset passed to parse or, when none was passed, the one declared in a meta tag.
     */
    public String getCharset() {
        return charset;
    }

    /**
     * @return Returns the summary (content of the meta tag "description")
     */
    public String getSummary() {
        return description;
    }

    /**
     * @return the keywords
     */
    public String getKeywords() {
        return keywords;
    }

    /**
     * @return if keywords exists
     */
    public boolean hasKeywords() {
        return !StringUtil.isEmpty(keywords, true);
    }

    /**
     * @return the author
     */
    public String getAuthor() {
        return author;
    }

    /**
     * @return if author exists
     */
    public boolean hasAuthor() {
        return !StringUtil.isEmpty(author, true);
    }

    /**
     * @return if custom1 exists
     */
    public boolean hasCustom1() {
        return !StringUtil.isEmpty(custom1, true);
    }

    /**
     * @return if custom2 exists
     */
    public boolean hasCustom2() {
        return !StringUtil.isEmpty(custom2, true);
    }

    /**
     * @return if custom3 exists
     */
    public boolean hasCustom3() {
        return !StringUtil.isEmpty(custom3, true);
    }

    /**
     * @return if custom4 exists
     */
    public boolean hasCustom4() {
        return !StringUtil.isEmpty(custom4, true);
    }

    /**
     * @return the custom1
     */
    public String getCustom1() {
        return custom1;
    }

    /**
     * @return the custom2
     */
    public String getCustom2() {
        return custom2;
    }

    /**
     * @return the custom3
     */
    public String getCustom3() {
        return custom3;
    }

    /**
     * @return the custom4
     */
    public String getCustom4() {
        return custom4;
    }

    /**
     * Entry of a linked stack of flags, the top entry tells whether character
     * data is currently ignored.
     */
    private static final class Silent {
        final Silent parent;
        final boolean value;

        /**
         * constructor of the class
         * @param parent entry to return to when this scope ends, null for the root
         * @param value true if character data is ignored within this scope
         */
        Silent(Silent parent, boolean value) {
            this.parent = parent;
            this.value = value;
        }
    }
}