/*
* Copyright 2011 Stefan Partusch
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.spartusch.nasfvi.server;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import de.spartusch.FileMethods;
/**
* An index that ingests XML data.
* When ingesting XML node names become field names. The XML format expected
* by this class is described briefly at
* {@link de.spartusch.nasfvi.EventExtractor EventExtractor}.
* @author Stefan Partusch
*
*/
public class XmlIndex {
private static final Logger LOGGER =
Logger.getLogger(XmlIndex.class.getName());
/** The actual index. */
private final Directory index;
/** Searcher opened on the index. */
private NSearcher searcher;
/** Configuration of the index. */
private IndexWriterConfig config;
/** A set of all semesters in the index. */
private SortedSet<String> semesters;
/**
* Creates a new index and writes it to the file system or opens an index
* from the file system.
* @param dir Directory of the index
* @param newIndex true to create a new index, i.e. to delete
* <code>dir</code> before opening an index in <code>dir</code>
* @param analyzer Analyzer to use for ingestions
* @throws IOException If an IO error occurs
*/
public XmlIndex(final File dir, final boolean newIndex,
final Analyzer analyzer) throws IOException {
if (newIndex && dir.exists()) {
LOGGER.info("Deleting " + dir);
FileMethods.delete(dir);
}
index = FSDirectory.open(dir);
config = new IndexWriterConfig(Version.LUCENE_35, analyzer);
config.setOpenMode(OpenMode.CREATE_OR_APPEND);
semesters = new TreeSet<String>();
}
/**
* Creates a new index in memory only. This index is not written
* to the file system and is thus not persistent.
* @param analyzer Analyzer to use for ingestions
* @throws IOException If an IO error occurs
*/
public XmlIndex(final Analyzer analyzer) throws IOException {
index = new RAMDirectory();
config = new IndexWriterConfig(Version.LUCENE_35, analyzer);
config.setOpenMode(OpenMode.CREATE);
semesters = new TreeSet<String>();
}
/**
* Returns an opened searcher on the index.
* @return Searcher on the index
*/
public final synchronized NSearcher getSearcher() {
return searcher;
}
/**
* Returns the analyzer used for ingestions.
* @return Analyzer used for ingestions
*/
public final Analyzer getAnalyzer() {
return config.getAnalyzer();
}
/**
* Returns a sorted set of all semesters in the index.
* @return Sorted set of indexed semesters
*/
public final SortedSet<String> getIndexedSemesters() {
return new TreeSet<String>(semesters);
}
/**
* Ingests an XML source into the index.
* @param xmlSource Source to ingest
* @throws SAXException If parsing fails
* @throws IOException If an IO error occurs
*/
public final synchronized void ingest(final InputStream xmlSource)
throws SAXException, IOException {
LOGGER.info("Starting ingestion");
XMLReader xr = XMLReaderFactory.createXMLReader();
XmlIndexHandler handler = new XmlIndexHandler();
xr.setContentHandler(handler);
xr.setErrorHandler(handler);
xr.parse(new InputSource(xmlSource));
LOGGER.info(handler.documentsAdded() + " documents added");
searcher = new NSearcher(
new IndexSearcher(IndexReader.open(index, true))
);
}
/**
* A SAX2 event handler for parsing and ingesting XML documents.
* @author Stefan Partusch
*
*/
private final class XmlIndexHandler extends DefaultHandler {
/** Tag on which to start a new document. */
private static final String NEW_DOC_TAG = "veranstaltung";
/** Name of the root tag. */
private static final String ROOT_TAG = "veranstaltungen";
/** The document currently under construction. */
private Document doc;
/** Name of the current field/tag. */
private StringBuilder currentField;
/** Writes data to the index. */
private IndexWriter writer;
/** Number of documents added to the index. */
private int docsAdded;
public XmlIndexHandler() throws IOException {
writer = new IndexWriter(index, config);
}
@Override
public void startElement(final String uri, final String localName,
final String qName, final Attributes atts)
throws SAXException {
if (NEW_DOC_TAG.equals(localName)) {
doc = new Document();
currentField = null;
} else if (!ROOT_TAG.equals(localName)) {
if (doc == null) {
throw new RuntimeException("newDocumentTag missing");
}
currentField = new StringBuilder();
}
}
@Override
public void characters(final char[] ch,
final int start, final int length) throws SAXException {
if (currentField == null) {
return;
}
for (int i = start; i < start + length; i++) {
currentField.append(ch[i]);
}
}
@Override
public void endElement(final String uri, final String localName,
final String qName) throws SAXException {
if (NEW_DOC_TAG.equals(localName)) {
try {
addDocumentId();
addSemesterBeginEnd();
writer.addDocument(doc);
docsAdded++;
semesters.add(doc.get("semester"));
} catch (IOException e) {
LOGGER.severe(e.toString());
}
} else if (!ROOT_TAG.equals(localName)) {
String value = currentField.toString();
Field.TermVector storeVector = Field.TermVector.NO;
Field.Store storeField = Field.Store.YES;
float boost = 1.0f;
if ("titel".equals(localName)) {
storeVector = Field.TermVector.YES;
boost = 2.5f;
} else if ("beschreibung".equals(localName)) {
storeVector = Field.TermVector.YES;
storeField = Field.Store.NO;
boost = 1.5f;
}
Field field =
new Field(localName, value, storeField,
Field.Index.ANALYZED, storeVector);
field.setBoost(boost);
doc.add(field);
}
}
@Override
public void endDocument() throws SAXException {
try {
writer.commit();
} catch (CorruptIndexException e) {
LOGGER.severe(e.toString());
throw new RuntimeException(e);
} catch (IOException e) {
LOGGER.severe(e.toString());
} finally {
try {
writer.close(true);
} catch (IOException e) {
LOGGER.severe(e.toString());
}
}
}
@Override
public void error(final SAXParseException e) throws SAXException {
LOGGER.severe(e.toString());
throw e;
}
@Override
public void fatalError(final SAXParseException e) throws SAXException {
LOGGER.severe(e.toString());
throw e;
}
@Override
public void warning(final SAXParseException e) throws SAXException {
LOGGER.warning(e.toString());
}
/**
* Returns the number of documents added using this handler.
* @return Number of documents added
*/
public int documentsAdded() {
return docsAdded;
}
/**
* Creates and sets an ID field in the current document.
* @throws IOException If an IO error occurs
*/
private void addDocumentId() throws IOException {
String id = String.valueOf(writer.numDocs());
Field field =
new Field("id", id, Field.Store.YES,
Field.Index.NOT_ANALYZED);
doc.add(field);
}
/**
* Adds the fields <code>semester_beg</code> and
* <code>semester_end</code> to the current document. These fields
* indicate the beginning and the end of the document's semester to
* make these dates searchable.
*/
private void addSemesterBeginEnd() {
Semester sem = new Semester(doc.get("semester"));
Field field = new Field("semester_beg", sem.getBegin(),
Field.Store.NO, Field.Index.NOT_ANALYZED);
doc.add(field);
field = new Field("semester_end", sem.getEnd(),
Field.Store.NO, Field.Index.NOT_ANALYZED);
doc.add(field);
}
}
}