/* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.riotfamily.search.index;
import java.io.File;
import java.io.IOException;
import javax.servlet.ServletContext;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.riotfamily.crawler.PageData;
import org.riotfamily.crawler.PageHandler;
import org.riotfamily.search.analysis.AnalyzerFactory;
import org.riotfamily.search.analysis.DefaultAnalyzerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.core.io.Resource;
import org.springframework.util.Assert;
import org.springframework.web.context.ServletContextAware;
import org.springframework.web.util.WebUtils;
/**
 * PageHandler that creates Lucene documents and adds them to the search index.
 * <p>
 * During a full crawl ({@link #crawlerStarted()} ... {@link #handlePage(PageData)}
 * ... {@link #crawlerFinished()}) documents are written to a temporary index
 * which replaces the content of the real index once the crawl has finished.
 * Single pages can be re-indexed in place via
 * {@link #handlePageIncremental(PageData)}.
 * </p>
 */
public class Indexer implements PageHandler,
        ServletContextAware, InitializingBean {

    private static final Logger log = LoggerFactory.getLogger(Indexer.class);

    /** The final index that is replaced after a crawl and updated incrementally. */
    private Directory indexDir;

    /** Base directory for the default index and the temporary index. */
    private File tempDir;

    private boolean compound;

    private DocumentBuilder documentBuilder;

    private AnalyzerFactory analyzerFactory;

    /** Directory of the temporary index that is filled during a crawl. */
    private Directory tempIndexDir;

    /** Writer for the temporary index; {@code null} while no crawl is active. */
    private IndexWriter tempWriter;

    /**
     * Sets the location where the index should be stored. The given resource
     * must point into the file system, i.e. resource.getFile() must return a
     * File object.
     *
     * @throws IOException if the resource can not be resolved to a file or
     *         the directory can not be opened
     */
    public void setIndexLocation(Resource resource) throws IOException {
        setIndexDir(resource.getFile());
    }

    /**
     * Sets the location where the index should be stored. The given file must
     * point to a writable directory. If the directory does not exist it will
     * be created.
     *
     * @throws IOException if the directory can not be opened
     */
    public void setIndexDir(File dir) throws IOException {
        dir.mkdirs();
        indexDir = FSDirectory.getDirectory(dir);
    }

    /**
     * Sets the DocumentBuilder that turns crawled pages into Lucene documents.
     * This property is required.
     */
    public void setDocumentBuilder(DocumentBuilder documentBuilder) {
        this.documentBuilder = documentBuilder;
    }

    /**
     * Sets the AnalyzerFactory to be used. If none is set, a
     * {@link DefaultAnalyzerFactory} is created in
     * {@link #afterPropertiesSet()}.
     */
    public void setAnalyzerFactory(AnalyzerFactory analyzerFactory) {
        this.analyzerFactory = analyzerFactory;
    }

    /**
     * Sets whether a compound index file should be used.
     * <p>
     * NOTE(review): the flag is currently only applied to the writer of the
     * temporary index; the writers opened on the final index keep their
     * default setting. Confirm whether that is intended.
     * </p>
     * @see IndexWriter#setUseCompoundFile(boolean)
     */
    public void setCompound(boolean compound) {
        this.compound = compound;
    }

    /**
     * Uses the temp directory of the given ServletContext as base directory
     * for the temporary index (and for the default index location).
     */
    public void setServletContext(ServletContext servletContext) {
        tempDir = WebUtils.getTempDir(servletContext);
    }

    /**
     * Validates the configuration and applies defaults: the index defaults
     * to <code>search-index</code> below the temp directory, the
     * AnalyzerFactory to a {@link DefaultAnalyzerFactory}. The temporary
     * index is always placed in <code>temp-search-index</code> below the
     * temp directory.
     */
    public void afterPropertiesSet() throws Exception {
        Assert.notNull(documentBuilder, "A DocumentBuilder must be set.");
        if (indexDir == null) {
            setIndexDir(new File(tempDir, "search-index"));
        }
        if (analyzerFactory == null) {
            analyzerFactory = new DefaultAnalyzerFactory();
        }
        // NOTE(review): tempDir is only assigned in setServletContext();
        // presumably the container always calls it before this method - confirm.
        File d = new File(tempDir, "temp-search-index");
        d.mkdir();
        tempIndexDir = FSDirectory.getDirectory(d);
    }

    /**
     * Returns the Analyzer for the language stored in the document's
     * {@link DocumentBuilder#LANGUAGE} field.
     */
    private Analyzer getAnalyzer(Document document) {
        String language = document.get(DocumentBuilder.LANGUAGE);
        return analyzerFactory.getAnalyzer(language);
    }

    /**
     * Creates a new IndexWriter that writes to a temporary location. When
     * {@link #crawlerFinished()} is invoked, this temporary index is moved to
     * its final destination. If the writer can not be created the error is
     * logged and subsequent calls to {@link #handlePage(PageData)} and
     * {@link #crawlerFinished()} become no-ops.
     */
    public void crawlerStarted() {
        try {
            // No default analyzer: handlePage() passes one per document.
            tempWriter = new IndexWriter(tempIndexDir, null, true);
            tempWriter.setUseCompoundFile(compound);
        }
        catch (IOException e) {
            log.error("Error", e);
        }
    }

    /**
     * Builds a document for the given page and adds it to the temporary
     * index. Does nothing if no crawl is active or if the DocumentBuilder
     * returns <code>null</code> for the page.
     */
    public void handlePage(PageData pageData) {
        if (tempWriter == null) {
            return;
        }
        try {
            Document document = documentBuilder.buildDocument(pageData);
            if (document != null) {
                tempWriter.addDocument(document, getAnalyzer(document));
            }
        }
        catch (IOException e) {
            log.error("Error indexing page", e);
        }
    }

    /**
     * Updates the final index in place: all documents with the page's URL
     * are deleted and, if the DocumentBuilder returns a document, the new
     * document is added. The index is created if it does not exist yet.
     * Reader and writer are always closed, even if an operation fails, so
     * that no open handles are left behind.
     */
    public void handlePageIncremental(PageData pageData) {
        try {
            log.info("Updating index for {}", pageData.getUrl());
            boolean indexExists = IndexReader.indexExists(indexDir);
            if (indexExists) {
                IndexReader reader = IndexReader.open(indexDir);
                try {
                    reader.deleteDocuments(new Term(DocumentBuilder.URL, pageData.getUrl()));
                }
                finally {
                    reader.close();
                }
            }
            Document doc = documentBuilder.buildDocument(pageData);
            if (doc != null) {
                IndexWriter indexWriter = new IndexWriter(indexDir,
                        getAnalyzer(doc), !indexExists);
                try {
                    indexWriter.addDocument(doc);
                }
                finally {
                    indexWriter.close();
                }
            }
        }
        catch (IOException e) {
            log.error("Error indexing page", e);
        }
    }

    /**
     * Closes the temporary index and replaces the content of the final index
     * with it. If no temporary writer exists (because
     * {@link #crawlerStarted()} failed or was never invoked) the final index
     * is left untouched.
     */
    public void crawlerFinished() {
        if (tempWriter == null) {
            log.warn("No temporary index writer available - leaving index untouched");
            return;
        }
        try {
            tempWriter.close();
            // create=true: the final index is rebuilt from the temporary one.
            IndexWriter indexWriter = new IndexWriter(indexDir, null, true);
            try {
                indexWriter.addIndexes(new Directory[] { tempIndexDir });
            }
            finally {
                indexWriter.close();
            }
        }
        catch (IOException e) {
            log.error("Error", e);
        }
        finally {
            tempWriter = null;
        }
    }
}