Indexer.java example

Explorer
riot-master
- riot-9.1.x
/* Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.riotfamily.search.index;

import java.io.File;
import java.io.IOException;

import javax.servlet.ServletContext;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.riotfamily.crawler.PageData;
import org.riotfamily.crawler.PageHandler;
import org.riotfamily.search.analysis.AnalyzerFactory;
import org.riotfamily.search.analysis.DefaultAnalyzerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.core.io.Resource;
import org.springframework.util.Assert;
import org.springframework.web.context.ServletContextAware;
import org.springframework.web.util.WebUtils;

/**
 * PageHandler that creates Lucene documents and adds them to the search index.
 * <p>
 * Two modes are supported:
 * <ul>
 * <li>Full crawl: {@link #crawlerStarted()} opens a writer on a temporary
 * index, {@link #handlePage(PageData)} adds one document per page, and
 * {@link #crawlerFinished()} re-creates the final index from the temporary
 * one.</li>
 * <li>Incremental: {@link #handlePageIncremental(PageData)} replaces the
 * document of a single page directly in the final index.</li>
 * </ul>
 */
public class Indexer implements PageHandler,
		ServletContextAware, InitializingBean {

	private static final Logger log = LoggerFactory.getLogger(Indexer.class);

	/** Final index location that is re-created after each full crawl. */
	private Directory indexDir;

	/** Temp directory obtained from the ServletContext. */
	private File tempDir;

	private boolean compound;

	private DocumentBuilder documentBuilder;
	
	private AnalyzerFactory analyzerFactory;
	
	/** Location of the temporary index that is filled during a full crawl. */
	private Directory tempIndexDir;

	/** Writer for the temporary index; only non-null while a crawl is running. */
	private IndexWriter tempWriter;
	
	/**
	 * Sets the location where the index should be stored. The given resource
	 * must point into the file system, i.e. resource.getFile() must return a
	 * File object.
	 *
	 * @throws IOException if the resource can't be resolved to a file or the
	 *         directory can't be opened
	 */
	public void setIndexLocation(Resource resource) throws IOException {
		setIndexDir(resource.getFile());
	}

	/**
	 * Sets the location where the index should be stored. The given file must
	 * point to a writable directory. If the directory does not exist it will
	 * be created.
	 *
	 * @throws IOException if the directory can't be opened
	 */
	public void setIndexDir(File dir) throws IOException {
		dir.mkdirs();
		indexDir = FSDirectory.getDirectory(dir);
	}

	/**
	 * Sets the DocumentBuilder that converts crawled pages into Lucene
	 * documents. This property is required.
	 */
	public void setDocumentBuilder(DocumentBuilder documentBuilder) {
		this.documentBuilder = documentBuilder;
	}
	
	/**
	 * Sets the AnalyzerFactory to be used. If none is set, a
	 * {@link DefaultAnalyzerFactory} is used.
	 */
	public void setAnalyzerFactory(AnalyzerFactory analyzerFactory) {
		this.analyzerFactory = analyzerFactory;
	}

	/**
	 * Sets whether a compound index file should be used. The setting is
	 * applied to the writer of the temporary index.
	 * @see IndexWriter#setUseCompoundFile(boolean)
	 */
	public void setCompound(boolean compound) {
		this.compound = compound;
	}

	/**
	 * Remembers the temp directory of the given ServletContext. It is used
	 * for the temporary index and as default location of the final index.
	 */
	public void setServletContext(ServletContext servletContext) {
		tempDir = WebUtils.getTempDir(servletContext);
	}

	/**
	 * Validates the configuration and applies defaults: the index is placed
	 * in a "search-index" directory below the temp directory unless a
	 * location has been set, and a {@link DefaultAnalyzerFactory} is used
	 * unless a factory has been set. The temporary index always lives in a
	 * "temp-search-index" directory below the temp directory.
	 */
	public void afterPropertiesSet() throws Exception {
		Assert.notNull(documentBuilder, "A DocumentBuilder must be set.");
		if (indexDir == null) {
			setIndexDir(new File(tempDir, "search-index"));
		}
		if (analyzerFactory == null) {
			analyzerFactory = new DefaultAnalyzerFactory();
		}
		File d = new File(tempDir, "temp-search-index");
		// mkdirs() (like in setIndexDir) so that missing parents don't
		// prevent the directory from being created.
		d.mkdirs();
		tempIndexDir = FSDirectory.getDirectory(d);
	}

	/**
	 * Returns the Analyzer for the language stored in the given document.
	 */
	private Analyzer getAnalyzer(Document document) {
		String language = document.get(DocumentBuilder.LANGUAGE);
		return analyzerFactory.getAnalyzer(language);
	}
	
	/**
	 * Creates a new IndexWriter that writes to a temporary location. When
	 * {@link #crawlerFinished()} is invoked, this temporary index is moved to
	 * its final destination.
	 * <p>
	 * If the writer can't be created the error is logged and subsequent
	 * calls to {@link #handlePage(PageData)} are ignored.
	 */
	public void crawlerStarted() {
		try {
			// No default analyzer: handlePage() passes one per document.
			tempWriter = new IndexWriter(tempIndexDir, null, true);
			tempWriter.setUseCompoundFile(compound);
		}
		catch (IOException e) {
			log.error("Error", e);
		}
	}

	/**
	 * Builds a document for the given page and adds it to the temporary
	 * index. Does nothing if no temporary writer is open or if the
	 * DocumentBuilder returns <code>null</code>. I/O errors are logged.
	 */
	public void handlePage(PageData pageData) {
		if (tempWriter == null) {
			return;
		}
		try {
			Document document = documentBuilder.buildDocument(pageData); 
			if (document != null) {
				tempWriter.addDocument(document, getAnalyzer(document));
			}
		}
		catch (IOException e) {
			log.error("Error indexing page", e);
		}
	}

	/**
	 * Updates the final index for a single page: documents with the page's
	 * URL are deleted (if an index exists) and, unless the DocumentBuilder
	 * returns <code>null</code>, a freshly built document is added. The index
	 * is created if it does not exist yet. I/O errors are logged.
	 */
	public void handlePageIncremental(PageData pageData) {
		try {
			log.info("Updating index for {}", pageData.getUrl());
			boolean indexExists = IndexReader.indexExists(indexDir);
			if (indexExists) {
				IndexReader reader = IndexReader.open(indexDir);
				try {
					reader.deleteDocuments(new Term(DocumentBuilder.URL, pageData.getUrl()));
				}
				finally {
					// Always release the reader, otherwise the index stays
					// locked when the deletion fails.
					reader.close();
				}
			}
			Document doc = documentBuilder.buildDocument(pageData);
			if (doc != null) {
				IndexWriter indexWriter = new IndexWriter(indexDir, 
						getAnalyzer(doc), !indexExists);
				try {
					indexWriter.addDocument(doc);
				}
				finally {
					// Always release the writer (and its lock), even if
					// adding the document fails.
					indexWriter.close();
				}
			}
		}
		catch (IOException e) {
			log.error("Error indexing page", e);
		}
	}

	/**
	 * Closes the temporary index and re-creates the final index from it.
	 * If no temporary writer is open (e.g. because
	 * {@link #crawlerStarted()} failed) the final index is left untouched.
	 * I/O errors are logged.
	 */
	public void crawlerFinished() {
		if (tempWriter == null) {
			log.warn("No temporary index writer is open, leaving the index untouched");
			return;
		}
		try {
			tempWriter.close();
			IndexWriter indexWriter = new IndexWriter(indexDir, null, true);
			try {
				indexWriter.addIndexes(new Directory[] { tempIndexDir });
			}
			finally {
				// Release the writer even if merging the temporary index fails.
				indexWriter.close();
			}
		}
		catch (IOException e) {
			log.error("Error", e);
		}
		finally {
			// Reset in any case so that handlePage() ignores pages until
			// the next crawl has been started.
			tempWriter = null;
		}
	}

}