// XmlIndex.java example

/*
 * Copyright 2011 Stefan Partusch
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.spartusch.nasfvi.server;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import de.spartusch.FileMethods;

/**
 * An index that ingests XML data.
 * When ingesting, XML node names become field names. The XML format expected
 * by this class is described briefly at
 * {@link de.spartusch.nasfvi.EventExtractor EventExtractor}.
 * <p>
 * Ingestion, access to the searcher and access to the set of indexed
 * semesters are synchronized on the instance.
 * @author Stefan Partusch
 *
 */
public class XmlIndex {
	private static final Logger LOGGER =
		Logger.getLogger(XmlIndex.class.getName());

	/** The actual index. */
	private final Directory index;
	/** Searcher opened on the index. Not set before the first ingestion. */
	private NSearcher searcher;
	/** Configuration of the index. */
	private final IndexWriterConfig config;
	/** A set of all semesters ingested and committed by this instance. */
	private final SortedSet<String> semesters;

	/**
	 * Creates a new index and writes it to the file system or opens an index
	 * from the file system.
	 * @param dir Directory of the index
	 * @param newIndex true to create a new index, i.e. to delete
	 * <code>dir</code> before opening an index in <code>dir</code>
	 * @param analyzer Analyzer to use for ingestions
	 * @throws IOException If an IO error occurs
	 */
	public XmlIndex(final File dir, final boolean newIndex,
			final Analyzer analyzer) throws IOException {
		if (newIndex && dir.exists()) {
			LOGGER.info("Deleting " + dir);
			FileMethods.delete(dir);
		}
		index = FSDirectory.open(dir);
		config = new IndexWriterConfig(Version.LUCENE_35, analyzer);
		config.setOpenMode(OpenMode.CREATE_OR_APPEND);
		semesters = new TreeSet<String>();
	}

	/**
	 * Creates a new index in memory only. This index is not written
	 * to the file system and is thus not persistent.
	 * @param analyzer Analyzer to use for ingestions
	 * @throws IOException If an IO error occurs
	 */
	public XmlIndex(final Analyzer analyzer) throws IOException {
		index = new RAMDirectory();
		config = new IndexWriterConfig(Version.LUCENE_35, analyzer);
		// NOTE(review): with OpenMode.CREATE every ingestion opens its writer
		// in create mode, so a second ingest() presumably replaces the
		// in-memory index instead of appending to it - confirm intent.
		config.setOpenMode(OpenMode.CREATE);
		semesters = new TreeSet<String>();
	}

	/**
	 * Returns the searcher opened by the most recent successful ingestion.
	 * @return Searcher on the index, or <code>null</code> if nothing has
	 * been ingested through this instance yet
	 */
	public final synchronized NSearcher getSearcher() {
		return searcher;
	}

	/**
	 * Returns the analyzer used for ingestions.
	 * @return Analyzer used for ingestions
	 */
	public final Analyzer getAnalyzer() {
		return config.getAnalyzer();
	}

	/**
	 * Returns a sorted set of all semesters ingested by this instance.
	 * The returned set is a copy; modifying it does not affect the index.
	 * @return Sorted set of indexed semesters
	 */
	public final synchronized SortedSet<String> getIndexedSemesters() {
		// Synchronized because ingest() mutates the underlying set.
		return new TreeSet<String>(semesters);
	}

	/**
	 * Ingests an XML source into the index.
	 * The ingested documents are committed only if the whole source was
	 * parsed successfully. If parsing or committing fails, the uncommitted
	 * documents are rolled back and the previously opened searcher and the
	 * set of indexed semesters are left unchanged. In both cases the index
	 * writer is closed so that its write lock is released.
	 * @param xmlSource Source to ingest
	 * @throws SAXException If parsing fails
	 * @throws IOException If an IO error occurs
	 */
	public final synchronized void ingest(final InputStream xmlSource)
			throws SAXException, IOException {
		LOGGER.info("Starting ingestion");

		XMLReader xr = XMLReaderFactory.createXMLReader();
		XmlIndexHandler handler = new XmlIndexHandler();
		boolean committed = false;

		try {
			xr.setContentHandler(handler);
			xr.setErrorHandler(handler);
			xr.parse(new InputSource(xmlSource));
			handler.commitAndClose();
			committed = true;
		} finally {
			if (!committed) {
				// Without this the writer would stay open and keep holding
				// the write lock of the index after a failed ingestion.
				handler.abort();
			}
		}

		semesters.addAll(handler.addedSemesters());
		LOGGER.info(handler.documentsAdded() + " documents added");
		// NOTE(review): a searcher opened by an earlier ingestion is replaced
		// here without being closed - confirm whether NSearcher needs an
		// explicit close once it is no longer in use.
		searcher = new NSearcher(
			new IndexSearcher(IndexReader.open(index, true))
		);
	}

	/**
	 * A SAX2 event handler for parsing and ingesting XML documents.
	 * The handler opens an index writer on construction. Its owner has to
	 * call either {@link #commitAndClose()} or {@link #abort()} when parsing
	 * is over to release the writer again.
	 * @author Stefan Partusch
	 *
	 */
	private final class XmlIndexHandler extends DefaultHandler {
		/** Tag on which to start a new document. */
		private static final String NEW_DOC_TAG = "veranstaltung";
		/** Name of the root tag. */
		private static final String ROOT_TAG = "veranstaltungen";

		/** The document currently under construction. */
		private Document doc;
		/** Text content of the current field/tag. */
		private StringBuilder currentField;
		/** Writes data to the index. */
		private final IndexWriter writer;
		/** Number of documents added to the index. */
		private int docsAdded;
		/** Semesters of the documents added using this handler. */
		private final SortedSet<String> addedSemesters =
			new TreeSet<String>();

		/**
		 * Creates a handler and opens an index writer on the index.
		 * @throws IOException If the writer cannot be opened
		 */
		public XmlIndexHandler() throws IOException {
			writer = new IndexWriter(index, config);
		}

		@Override
		public void startElement(final String uri, final String localName,
				final String qName, final Attributes atts)
				throws SAXException {
			if (NEW_DOC_TAG.equals(localName)) {
				doc = new Document();
				currentField = null;
			} else if (!ROOT_TAG.equals(localName)) {
				// Every other tag is a field and needs a document to go into
				if (doc == null) {
					throw new RuntimeException("newDocumentTag missing");
				}
				currentField = new StringBuilder();
			}
		}

		@Override
		public void characters(final char[] ch,
				final int start, final int length) throws SAXException {
			// Text outside of a field tag is ignored
			if (currentField != null) {
				currentField.append(ch, start, length);
			}
		}

		@Override
		public void endElement(final String uri, final String localName,
				final String qName) throws SAXException {
			if (NEW_DOC_TAG.equals(localName)) {
				try {
					addDocumentId();
					addSemesterBeginEnd();
					writer.addDocument(doc);
					docsAdded++;
					addedSemesters.add(doc.get("semester"));
				} catch (IOException e) {
					// Best effort: a document that cannot be written is
					// logged and skipped, the ingestion continues.
					LOGGER.severe(e.toString());
				}
			} else if (!ROOT_TAG.equals(localName)) {
				String value = currentField.toString();
				Field.TermVector storeVector = Field.TermVector.NO;
				Field.Store storeField = Field.Store.YES;
				float boost = 1.0f;

				if ("titel".equals(localName)) {
					storeVector = Field.TermVector.YES;
					boost = 2.5f;
				} else if ("beschreibung".equals(localName)) {
					storeVector = Field.TermVector.YES;
					storeField = Field.Store.NO;
					boost = 1.5f;
				}

				Field field =
					new Field(localName, value, storeField,
							Field.Index.ANALYZED, storeVector);
				field.setBoost(boost);

				doc.add(field);
			}
		}

		@Override
		public void error(final SAXParseException e) throws SAXException {
			LOGGER.severe(e.toString());
			throw e;
		}

		@Override
		public void fatalError(final SAXParseException e) throws SAXException {
			LOGGER.severe(e.toString());
			throw e;
		}

		@Override
		public void warning(final SAXParseException e) throws SAXException {
			LOGGER.warning(e.toString());
		}

		/**
		 * Commits all documents added using this handler and closes the
		 * index writer. This is not done in <code>endDocument</code> because
		 * SAX does not guarantee whether that callback is invoked after a
		 * failed parse.
		 * @throws IOException If committing or closing fails
		 */
		void commitAndClose() throws IOException {
			writer.commit();
			writer.close(true);
		}

		/**
		 * Discards all uncommitted documents and closes the index writer.
		 * Failures are only logged because this method is called while
		 * another exception is already being propagated.
		 */
		void abort() {
			try {
				writer.rollback();
			} catch (IOException e) {
				LOGGER.severe(e.toString());
			}
		}

		/**
		 * Returns the number of documents added using this handler.
		 * @return Number of documents added
		 */
		public int documentsAdded() {
			return docsAdded;
		}

		/**
		 * Returns the semesters of the documents added using this handler.
		 * @return Sorted set of semesters
		 */
		SortedSet<String> addedSemesters() {
			return addedSemesters;
		}

		/**
		 * Creates and sets an ID field in the current document. The ID is
		 * the number of documents in the index at the time of the call.
		 * @throws IOException If an IO error occurs
		 */
		private void addDocumentId() throws IOException {
			String id = String.valueOf(writer.numDocs());

			Field field =
				new Field("id", id, Field.Store.YES,
						Field.Index.NOT_ANALYZED);
			doc.add(field);
		}

		/**
		 * Adds the fields <code>semester_beg</code> and
		 * <code>semester_end</code> to the current document. These fields
		 * indicate the beginning and the end of the document's semester to
		 * make these dates searchable.
		 */
		private void addSemesterBeginEnd() {
			Semester sem = new Semester(doc.get("semester"));

			Field field = new Field("semester_beg", sem.getBegin(),
					Field.Store.NO, Field.Index.NOT_ANALYZED);
			doc.add(field);

			field = new Field("semester_end", sem.getEnd(),
					Field.Store.NO, Field.Index.NOT_ANALYZED);
			doc.add(field);
		}
	}
}