package railo.runtime.search.lucene2.docs;

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;

import railo.commons.io.IOUtil;
import railo.commons.io.res.Resource;
import railo.commons.lang.StringUtil;
import railo.runtime.op.Caster;
import railo.runtime.search.lucene2.html.HTMLParser;

/**
 * A utility for making Lucene Documents for HTML documents.
 *
 * <p>The class is not instantiable; all functionality is exposed through static methods.
 */
public final class HTMLDocument {

    /** First character of the platform's {@code file.separator} system property. */
    private static final char FILE_SEPARATOR = System.getProperty("file.separator").charAt(0);

    /** NUL character used in a uid in place of path separators and before the date part. */
    private static final char UID_SEPARATOR = '\u0000';

    /** Maximum number of characters taken from the content when it stands in for a summary. */
    private static final int SUMMARY_MAX_LENGTH = 200;

    /**
     * Builds a uid for a resource from its path and its last-modified time.
     *
     * <p>Every platform file separator in the path is replaced by a NUL character, then a
     * further NUL character and the last-modified time (encoded with
     * {@link DateField#timeToString(long)}) are appended.
     *
     * @param f the resource to build the uid for
     * @return the uid string
     */
    public static String uid(Resource f) {
        return f.getPath().replace(FILE_SEPARATOR, UID_SEPARATOR)
                + "\u0000"
                + DateField.timeToString(f.lastModified());
    }

    /**
     * Converts a uid back into a slash-separated path by replacing every NUL character with
     * {@code '/'} and dropping the trailing date segment.
     *
     * <p>A uid that contains no separator at all has no date segment to drop; it is returned
     * as converted instead of failing with a {@link StringIndexOutOfBoundsException}.
     *
     * @param uid a uid, normally one produced by {@link #uid(Resource)}
     * @return the path portion of the uid, using {@code '/'} as separator
     */
    public static String uid2url(String uid) {
        String url = uid.replace(UID_SEPARATOR, '/'); // replace nulls with slashes
        int dateStart = url.lastIndexOf('/');
        if (dateStart < 0) {
            // lastIndexOf found no separator: substring(0, -1) would throw, and there is no
            // date suffix to strip anyway.
            return url;
        }
        return url.substring(0, dateStart); // remove date from end
    }

    /**
     * Creates a Lucene document for an HTML resource.
     *
     * <p>The document always carries a {@code uid} field. If parsing the resource fails for
     * any reason, the failure is swallowed and the document is returned with the
     * {@code uid} field only.
     *
     * @param res the HTML resource to parse
     * @param charset the charset handed to the parser for reading the resource
     * @return the document; never {@code null}
     */
    public static Document getDocument(Resource res, String charset) {
        Document doc = new Document();
        doc.add(FieldUtil.Text("uid", uid(res), false));

        HTMLParser parser = new HTMLParser();
        try {
            parser.parse(res, charset);
        } catch (Throwable t) {
            // Best effort: a resource that cannot be parsed still yields a document with its
            // uid. NOTE(review): Throwable is caught on purpose here, presumably so that
            // parser Errors are tolerated too - confirm before narrowing.
            return doc;
        }

        addContent(doc, parser);
        return doc;
    }

    /**
     * Creates a Lucene document from HTML read from a reader.
     *
     * <p>The reader is read completely into a string. That string is appended to
     * {@code content} when a buffer is supplied, its length is stored in the {@code size}
     * field, and it is then parsed. If reading or parsing fails for any reason, the failure
     * is swallowed and the document is returned with whatever fields were added up to that
     * point.
     *
     * @param content optional buffer that receives the text that was read; may be
     *        {@code null}
     * @param reader source of the HTML text
     * @return the document; never {@code null}
     */
    public static Document getDocument(StringBuffer content, Reader reader) {
        Document doc = new Document();
        HTMLParser parser = new HTMLParser();
        try {
            String str = IOUtil.toString(reader);
            if (content != null) {
                content.append(str);
            }
            doc.add(FieldUtil.UnIndexed("size", Caster.toString(str.length())));
            parser.parse(new StringReader(str));
        } catch (Throwable t) {
            // Best effort: return what has been collected so far instead of failing.
            return doc;
        }

        addContent(doc, parser);
        return doc;
    }

    /**
     * Copies the parser's results into the document: mime type ({@code text/html}), title,
     * summary, raw content, content and - when the parser reports them - keywords, author
     * and the custom values 1 to 4.
     *
     * <p>When the parser supplies no summary, the first {@value #SUMMARY_MAX_LENGTH}
     * characters of the content (or the whole content if it is not longer) are used instead.
     *
     * @param doc the document to fill
     * @param parser a parser that has already parsed its input
     */
    private static void addContent(Document doc, HTMLParser parser) {
        FieldUtil.setMimeType(doc, "text/html");

        // Getter order is kept as-is: content first, then title, then summary.
        String content = parser.getContent();
        FieldUtil.setTitle(doc, parser.getTitle());

        String summary = parser.getSummary();
        if (StringUtil.isEmpty(summary)) {
            // No summary from the parser: fall back to the leading part of the content.
            summary = content;
            if (content.length() > SUMMARY_MAX_LENGTH) {
                summary = content.substring(0, SUMMARY_MAX_LENGTH);
            }
            FieldUtil.setSummary(doc, summary, false);
        } else {
            FieldUtil.setSummary(doc, summary, true);
        }

        FieldUtil.setRaw(doc, content);
        FieldUtil.setContent(doc, content);

        if (parser.hasKeywords()) {
            FieldUtil.setKeywords(doc, parser.getKeywords());
        }
        if (parser.hasAuthor()) {
            FieldUtil.setAuthor(doc, parser.getAuthor());
        }
        if (parser.hasCustom1()) {
            FieldUtil.setCustom(doc, parser.getCustom1(), 1);
        }
        if (parser.hasCustom2()) {
            FieldUtil.setCustom(doc, parser.getCustom2(), 2);
        }
        if (parser.hasCustom3()) {
            FieldUtil.setCustom(doc, parser.getCustom3(), 3);
        }
        if (parser.hasCustom4()) {
            FieldUtil.setCustom(doc, parser.getCustom4(), 4);
        }
    }

    /** Not instantiable. */
    private HTMLDocument() {}
}