/* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.riotfamily.search.index.html;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.htmlparser.NodeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.util.NodeList;
import org.riotfamily.common.util.Generics;
import org.riotfamily.crawler.PageData;
import org.riotfamily.search.index.DocumentBuilder;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.util.StringUtils;
/**
 * {@link DocumentBuilder} that creates a Lucene {@link Document} from parsed
 * HTML. The page body (or whatever {@link #setContentFilter(NodeFilter)
 * contentFilter} selects) becomes the CONTENT field; headings, meta keywords
 * and meta description are folded into the KEYWORDS field. Pages whose robots
 * meta tag contains <code>noindex</code> are skipped entirely.
 */
public class HtmlDocumentBuilder implements DocumentBuilder, InitializingBean {

	/** Selects the ancestor node(s) whose text is indexed. Defaults to &lt;body&gt;. */
	private NodeFilter contentFilter = new NodeClassFilter(BodyTag.class);

	/** Selects heading tags whose text is added to the KEYWORDS field. */
	private NodeFilter headingsFilter = new NodeClassFilter(HeadingTag.class);

	/** Extracts the page title. Defaulted in {@link #afterPropertiesSet()}. */
	private FieldExtractor titleExtractor;

	/** Extracts the page language. Defaulted in {@link #afterPropertiesSet()}. */
	private FieldExtractor languageExtractor;

	/** Optional field-name to extractor mapping for arbitrary extra fields. */
	private Map<String, FieldExtractor> customFieldExtractors;

	/**
	 * Sets a NodeFilter that is used to extract the ancestor(s) of the nodes
	 * that should be indexed.
	 */
	public void setContentFilter(NodeFilter contentFilter) {
		this.contentFilter = contentFilter;
	}

	/**
	 * Sets a NodeFilter that is used to extract headlines from the content.
	 */
	public void setHeadingsFilter(NodeFilter headingsFilter) {
		this.headingsFilter = headingsFilter;
	}

	/**
	 * Sets a FieldExtractor that is used to extract the page title.
	 */
	public void setTitleExtractor(FieldExtractor titleExtractor) {
		this.titleExtractor = titleExtractor;
	}

	/**
	 * Sets a FieldExtractor that is used to extract the document language.
	 * The returned value should be a lower case two-letter ISO code, or
	 * <code>null</code> if the language can not be determined.
	 */
	public void setLanguageExtractor(FieldExtractor languageExtractor) {
		this.languageExtractor = languageExtractor;
	}

	/**
	 * Sets a {@link Map} of {@link FieldExtractor}s. To define an arbitrary
	 * field within your index, put a {@link FieldExtractor} in this
	 * {@link Map}. The key of this entry is used as the name of the field.
	 */
	public void setCustomFieldExtractors(Map<String, FieldExtractor> customFieldExtractors) {
		this.customFieldExtractors = customFieldExtractors;
	}

	/**
	 * Adds a custom {@link FieldExtractor} for the specified field name.
	 * Lazily creates the backing map on first use.
	 */
	public void addCustomFieldExtractor(String field, FieldExtractor extractor) {
		if (customFieldExtractors == null) {
			customFieldExtractors = Generics.newTreeMap();
		}
		customFieldExtractors.put(field, extractor);
	}

	/**
	 * Installs default title and language extractors if none were configured.
	 */
	public void afterPropertiesSet() throws Exception {
		if (titleExtractor == null) {
			titleExtractor = new TitleTagExtractor();
		}
		if (languageExtractor == null) {
			languageExtractor = new DefaultLanguageExtractor();
		}
	}

	/**
	 * Builds a Lucene Document for the given page, or returns
	 * <code>null</code> if the page has no parsed nodes or its robots meta
	 * tag forbids indexing.
	 */
	public Document buildDocument(PageData pageData) {
		NodeList nodes = pageData.getNodes();
		if (nodes == null) {
			return null;
		}
		String robots = HtmlParserUtils.getMeta(nodes, "robots");
		// Locale.ROOT avoids locale-dependent case mapping (e.g. the Turkish
		// dotless i, under which "NOINDEX".toLowerCase() would not match).
		if (robots != null && robots.toLowerCase(Locale.ROOT).contains("noindex")) {
			return null;
		}
		Document doc = new Document();
		doc.add(new Field(URL, pageData.getUrl(),
				Field.Store.YES, Field.Index.UN_TOKENIZED));

		doc.add(new Field(CONTENT_TYPE, "text/html", Field.Store.YES,
				Field.Index.UN_TOKENIZED));

		// Null-guard mirrors the languageExtractor handling below, so the
		// builder does not NPE if used before afterPropertiesSet() ran.
		if (titleExtractor != null) {
			String title = titleExtractor.getFieldValue(pageData);
			if (StringUtils.hasText(title)) {
				// NOTE(review): title is indexed UN_TOKENIZED, so only exact
				// whole-title matches hit this field — confirm this is intended.
				doc.add(new Field(TITLE, title,
						Field.Store.YES, Field.Index.UN_TOKENIZED));
			}
		}
		if (languageExtractor != null) {
			String language = languageExtractor.getFieldValue(pageData);
			if (StringUtils.hasText(language)) {
				doc.add(new Field(LANGUAGE, language,
						Field.Store.NO, Field.Index.UN_TOKENIZED));
			}
		}
		// Both meta keywords and meta description feed the KEYWORDS field.
		addKeywords(doc, HtmlParserUtils.getMeta(nodes, "keywords"));
		addKeywords(doc, HtmlParserUtils.getMeta(nodes, "description"));

		NodeList nodesToIndex = nodes.extractAllNodesThatMatch(contentFilter, true);

		// Heading text is boosted by indexing it as keywords as well.
		addKeywords(doc, HtmlParserUtils.extractText(nodesToIndex, headingsFilter));

		String content = HtmlParserUtils.toText(nodesToIndex);
		if (StringUtils.hasText(content)) {
			doc.add(new Field(CONTENT, content,
					Field.Store.YES, Field.Index.TOKENIZED));
		}
		if (customFieldExtractors != null) {
			for (Map.Entry<String, FieldExtractor> entry : customFieldExtractors.entrySet()) {
				String value = entry.getValue().getFieldValue(pageData);
				if (value != null) {
					doc.add(new Field(entry.getKey(), value,
							Field.Store.YES, Field.Index.UN_TOKENIZED));
				}
			}
		}
		return doc;
	}

	/**
	 * Adds the given text to the (unstored, tokenized) KEYWORDS field,
	 * ignoring null/blank input.
	 */
	private void addKeywords(Document doc, String keywords) {
		if (StringUtils.hasText(keywords)) {
			doc.add(new Field(KEYWORDS, keywords,
					Field.Store.NO, Field.Index.TOKENIZED));
		}
	}
}