package lux;
import static lux.index.IndexConfiguration.*;
import static org.junit.Assert.*;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.xml.stream.XMLStreamException;
import lux.index.FieldRole;
import lux.index.IndexConfiguration;
import lux.index.XmlIndexer;
import lux.index.field.FieldDefinition;
import lux.index.field.FieldDefinition.Type;
import lux.index.field.XPathField;
import lux.query.parser.XmlQueryParser;
import lux.search.LuxSearcher;
import net.sf.saxon.s9api.SaxonApiException;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.surround.parser.ParseException;
import org.apache.lucene.queryparser.surround.parser.QueryParser;
import org.apache.lucene.queryparser.surround.query.BasicQueryFactory;
import org.apache.lucene.queryparser.surround.query.SrndQuery;
import org.apache.lucene.queryparser.xml.ParserException;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.RAMDirectory;
import org.junit.After;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
/**
* Measures space and time for different indexing options, and validates
* indexing results.
*
* The timings are off; we'd need to run this repeatedly to avoid transient
* startup effects which overwhelm the measurements for a single run.
*
* But the space numbers (in bytes) should be valid (from Directory.sizeInBytes()):
* XML storage: 3664896 (3.5M)
* qnames = 3692544 - 3664896 = 27648 = 0.75%
* paths = 3717120 - 3664896 = 52224 = 1.4%
*
* After refactoring XmlField, etc:
* XML storage: 2274304 why did this shrink so much? We're now using serializer instead
* of JDOM outputter - could this all be whitespace from indentation or something?
* qnames: 2302976 - 2274304 = 28672 = 1.3%
* paths: 2328576 - 2274304 = 54272 = 2.4%
* path-occurrences = 122880 = 5.1%
* path-values alone: 755712
* path-values (w/docs): 2714624 - 2274304 = 19%
* qname-values (as phrases): 2631680 - 2274304 = 357376 = 16%
* qname-values (hashed into single tokens): 2542592 - 2274304 = 11.8%
* qname-words w/o terminal tokens: 2683904 - 2274304 = 18%
* qname-words + terminal tokens: 2786304 - 2274304 = 22%
* full-text (with all nodes transparent) 3899392 - 2274304 = 1625088 = 71% (1940480 full text alone)
* full-text (text only) 2673664 - 2274304 = 399360 = 18%
* full-text (text plus all nodes opaque) 3068928 - 2274304 = 35%
*
*/
public class IndexTest {

    /**
     * When true, every index build is repeated a few extra times (against a fresh
     * directory each time) so the printed timings are less dominated by start-up cost.
     */
    private static final boolean GATHER_TIMING = false;

    /** Classpath resource indexed by most of the tests. */
    private static final String HAMLET = "lux/hamlet.xml";

    /**
     * Number of documents every full build of {@link #HAMLET} is expected to leave in
     * the directory (presumably one per element when the play is split up by
     * IndexTestSupport -- confirm against that class).
     */
    private static final int EXPECTED_TOTAL_DOCS = 6641;

    /** In-memory index directory; recreated for every test by {@link #setup()}. */
    private RAMDirectory dir;

    /** Creates a fresh, empty in-memory directory before each test. */
    @Before
    public void setup() {
        dir = new RAMDirectory();
    }

    /** Releases the in-memory directory after each test. */
    @After
    public void cleanup() {
        dir.close();
    }

    /** Discards the current directory and replaces it with an empty one. */
    private void reset() {
        dir.close();
        dir = new RAMDirectory();
    }

    /** Indexes paths and stores documents; checks the total document count. */
    @Test
    public void testIndexPaths() throws Exception {
        buildIndex("paths and xml", INDEX_PATHS | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Indexes paths without storing documents; checks the count and a path query. */
    @Test
    public void testIndexPathsOnly() throws Exception {
        IndexTestSupport indexTestSupport = buildIndex("paths", INDEX_PATHS | BUILD_DOCUMENT);
        assertTotalDocs();
        assertPathQuery(indexTestSupport);
    }

    /** Indexes QNames and stores documents; checks the total document count. */
    @Test
    public void testIndexQNames() throws Exception {
        buildIndex("qnames and xml", INDEX_QNAMES | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Indexes QNames without storing documents; checks the total document count. */
    @Test
    public void testIndexQNamesOnly() throws Exception {
        buildIndex("qnames", INDEX_QNAMES | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Indexes every path occurrence; checks the total document count. */
    @Test
    public void testIndexPathOccurOnly() throws Exception {
        buildIndex("path-occurrences", INDEX_PATHS | INDEX_EACH_PATH | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Indexes full text and stores documents; checks the total document count. */
    @Test
    public void testIndexFullText() throws Exception {
        buildIndex("full-text", INDEX_FULLTEXT | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Indexes full text only (no stored or built document); checks the total document count. */
    @Test
    public void testIndexFullTextOnly() throws Exception {
        buildIndex("full-text-only", INDEX_FULLTEXT);
        assertTotalDocs();
    }

    /**
     * Indexes a single small document with the full-text option, then prints the
     * index size and all indexed terms.
     */
    @Test
    public void testIndexFullTextOneDoc() throws Exception {
        XmlIndexer indexer = new XmlIndexer(INDEX_FULLTEXT);
        IndexWriter indexWriter = indexer.newIndexWriter(dir);
        try {
            indexer.indexDocument(indexWriter, "/lux/reader-test.xml",
                    getClass().getClassLoader().getResourceAsStream("lux/reader-test.xml"));
        } finally {
            // close even when indexing fails so the writer is not leaked
            indexWriter.close();
        }
        System.out.println(
                String.format("indexed full-text for lux/reader-test.xml in %d bytes", dir.sizeInBytes()));
        IndexTestSupport.printAllTerms(dir, indexer);
        // NOTE: a full-text query assertion (qName "title", term "TEST", 1 hit) was
        // sketched here in the past but has been disabled.
    }

    /** Stores a non-XML resource through {@code storeDocument} and prints the index size. */
    @Test
    public void testStoreBinary() throws Exception {
        XmlIndexer indexer = new XmlIndexer(STORE_DOCUMENT);
        IndexWriter indexWriter = indexer.newIndexWriter(dir);
        try {
            indexer.storeDocument(indexWriter, "/lux/compiler/test-module.xqy",
                    getClass().getClassLoader().getResourceAsStream("lux/compiler/test-module.xqy"));
        } finally {
            indexWriter.close();
        }
        System.out.println(
                String.format("stored test-module.xqy in %d bytes", dir.sizeInBytes()));
    }

    /** Indexes hamlet.xml as one document with path-values and prints the index size. */
    @Test @Ignore
    public void testIndexPathValuesOneDoc() throws Exception {
        XmlIndexer indexer = new XmlIndexer(INDEX_PATHS | INDEX_VALUES);
        IndexWriter indexWriter = indexer.newIndexWriter(dir);
        try {
            indexer.indexDocument(indexWriter, "/lux/hamlet.xml",
                    getClass().getClassLoader().getResourceAsStream(HAMLET));
        } finally {
            indexWriter.close();
        }
        System.out.println(
                String.format("indexed path-values for hamlet.xml in %d bytes", dir.sizeInBytes()));
        // hamlet.xml = 288815 bytes; indexed in 215040 bytes seems ok??
    }

    /** Indexes path-values without storing documents; checks the count and a path query. */
    @Test
    public void testIndexPathValuesOnly() throws Exception {
        IndexTestSupport indexTestSupport =
                buildIndex("path-values", INDEX_PATHS | INDEX_VALUES | BUILD_DOCUMENT);
        assertTotalDocs();
        assertPathQuery(indexTestSupport);
    }

    /** Indexes paths plus full text; checks the count and a path query. */
    @Test
    public void testIndexPathText() throws Exception {
        IndexTestSupport indexTestSupport =
                buildIndex("path-text", INDEX_PATHS | INDEX_FULLTEXT | BUILD_DOCUMENT);
        assertTotalDocs();
        assertPathQuery(indexTestSupport);
    }

    /** Indexes QName-values and stores documents; checks the total document count. */
    @Test
    public void testIndexQNameValues() throws Exception {
        buildIndex("qname-values and docs",
                INDEX_QNAMES | INDEX_VALUES | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Indexes QNames plus full text with stored documents; checks an element-text query. */
    @Test
    public void testIndexQNameText() throws Exception {
        IndexTestSupport indexTestSupport = buildIndex("qname-text and docs",
                INDEX_QNAMES | INDEX_FULLTEXT | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertFullTextQuery(indexTestSupport, "PERSONA", "ROSENCRANTZ", 4);
        assertTotalDocs();
    }

    /** Indexes QNames plus full text without stored documents; checks the total document count. */
    @Test
    public void testIndexQNameTextOnly() throws Exception {
        buildIndex("qname-text", INDEX_QNAMES | INDEX_FULLTEXT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Indexes path-values and stores documents; checks the total document count. */
    @Test
    public void testIndexPathValues() throws Exception {
        buildIndex("path-values and docs",
                INDEX_PATHS | INDEX_VALUES | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /**
     * Indexes QNames and paths with stored documents and checks the count, then
     * builds a second index without stored documents.
     */
    @Test
    public void testIndexQNamesAndPaths() throws Exception {
        IndexTestSupport its = buildIndex("qnames and paths and docs",
                INDEX_QNAMES | INDEX_PATHS | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
        its.close();
        // NOTE(review): this second build goes into the same directory and is not
        // followed by any assertion; it only contributes the printed size/timing line.
        buildIndex("qnames and paths", INDEX_QNAMES | INDEX_PATHS | BUILD_DOCUMENT);
    }

    /** Indexes QNames and paths without stored documents; checks the total document count. */
    @Test
    public void testIndexQNamesAndPathsOnly() throws Exception {
        buildIndex("qnames and paths", INDEX_QNAMES | INDEX_PATHS | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Stores documents without any additional indexes; checks the total document count. */
    @Test
    public void testStoreDocuments() throws Exception {
        buildIndex("xml storage", STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Stores documents using the tiny-binary option; checks the total document count. */
    @Test
    public void testStoreBinaryDocs() throws Exception {
        buildIndex("xml binary storage", STORE_TINY_BINARY | STORE_DOCUMENT | BUILD_DOCUMENT);
        assertTotalDocs();
    }

    /** Defines an int-typed and a string-typed XPath field and queries both. */
    @Test
    public void testXPathIndexes() throws Exception {
        XmlIndexer indexer = new XmlIndexer(BUILD_DOCUMENT);
        indexer.getConfiguration().addField(
                new XPathField("nodecount", "count(//node())", null, Store.NO, Type.INT));
        indexer.getConfiguration().addField(
                new XPathField("doctype", "name(/*)", null, Store.NO, Type.STRING));
        IndexTestSupport indexTestSupport = buildIndex("xpath", indexer);
        assertXPathIntField(indexTestSupport);
        assertXPathStringField(5, "doctype", "ACT", indexTestSupport);
        if (GATHER_TIMING) {
            for (int i = 0; i < 5; i++) {
                reset();
                buildIndex("xpath", indexer);
            }
        }
    }

    /** Encodes two expressions in a single XPath field and queries one of the values. */
    @Test
    public void testMultipleXPathIndexes() throws Exception {
        XmlIndexer indexer = new XmlIndexer(BUILD_DOCUMENT);
        // SCENE comes in as ACT/*[2] - immediately following TITLE
        // These can be encoded within a single XPath - we don't allow multiple indexes with the same name
        indexer.getConfiguration().addField(
                new XPathField("x", "name(/*/*[2]),name(/*)", null, Store.NO, Type.STRING));
        IndexTestSupport indexTestSupport = buildIndex("xpath", indexer);
        assertXPathStringField(25, "x", "SCENE", indexTestSupport);
    }

    /** Registering two XPath fields under the same name must be rejected. */
    @Test
    public void testMultipleXPathIndexesFail() throws Exception {
        XmlIndexer indexer = new XmlIndexer(BUILD_DOCUMENT);
        // SCENE comes in as ACT/*[2] - immediately following TITLE
        indexer.getConfiguration().addField(
                new XPathField("x", "name(/*/*[2])", null, Store.NO, Type.STRING));
        try {
            indexer.getConfiguration().addField(
                    new XPathField("x", "name(/*)", null, Store.NO, Type.STRING));
            fail("expected exception not thrown");
        } catch (IllegalStateException e) {
            assertEquals("Duplicate field name: x", e.getMessage());
        }
    }

    /** Uses a namespace prefix mapping inside an XPath field definition. */
    @Test
    public void testXPathIndexNamespace() throws Exception {
        IndexConfiguration indexConfig = new IndexConfiguration();
        indexConfig.defineNamespaceMapping("", "");
        indexConfig.defineNamespaceMapping("x", "http://lux.net{test}");
        indexConfig.addField(
                new XPathField("title", "//x:title", new KeywordAnalyzer(), Store.NO, Type.STRING));
        XmlIndexer indexer = new XmlIndexer(indexConfig);
        IndexTestSupport indexTestSupport =
                new IndexTestSupport("lux/reader-test-ns.xml", indexer, dir);
        assertXPathStringField(2, "title", "TEST", indexTestSupport);
    }

    /**
     * Runs the surround query {@code w(w({},"ACT"),"SCENE")} against the PATH field
     * and expects exactly 5 matching documents.
     */
    private void assertPathQuery(IndexTestSupport indexTestSupport) throws ParseException, IOException {
        SrndQuery parsed = new QueryParser().parse2("w(w({},\"ACT\"),\"SCENE\")");
        String pathField = indexTestSupport.indexer.getConfiguration().getFieldName(FieldRole.PATH);
        Query query = parsed.makeLuceneQueryFieldNoBoost(pathField, new BasicQueryFactory());
        assertEquals(5, countDocs(indexTestSupport.searcher.search(query)));
    }

    /**
     * Builds a {@code QNameTextQuery} XML query for the element-text field and asserts
     * the number of matching documents.
     *
     * @param indexTestSupport supplies the searcher and the indexer configuration
     * @param qName element name to restrict the text match to
     * @param term text to look for
     * @param expectedCount number of documents that must match
     */
    private void assertFullTextQuery(IndexTestSupport indexTestSupport, String qName, String term,
            int expectedCount) throws IOException, ParserException {
        LuxSearcher searcher = indexTestSupport.searcher;
        IndexConfiguration config = indexTestSupport.indexer.getConfiguration();
        FieldDefinition field = config.getField(FieldRole.ELEMENT_TEXT);
        String xml = "<QNameTextQuery fieldName=\""
                + config.getFieldName(FieldRole.ELEMENT_TEXT) + "\" qName=\""
                + qName + "\">" + term
                + "</QNameTextQuery>";
        // The XML carries no encoding declaration, so it must be serialised as UTF-8
        // rather than in the platform default charset.
        Query query = new XmlQueryParser(field.getName(), field.getAnalyzer())
                .parse(new ByteArrayInputStream(xml.getBytes("UTF-8")));
        assertEquals(expectedCount, countDocs(searcher.search(query)));
    }

    /** Expects exactly one document whose "nodecount" value lies in [6000, 20000]. */
    private void assertXPathIntField(IndexTestSupport indexTestSupport) throws IOException {
        Query query = NumericRangeQuery.newIntRange("nodecount", 6000, 20000, true, true);
        assertEquals(1, countDocs(indexTestSupport.searcher.search(query)));
    }

    /**
     * Asserts the number of documents containing {@code term} in {@code field}.
     *
     * @param expectedCount number of documents that must match
     * @param field name of the indexed field
     * @param term exact term to look up
     * @param indexTestSupport supplies the searcher
     */
    private void assertXPathStringField(int expectedCount, String field, String term,
            IndexTestSupport indexTestSupport) throws IOException {
        Query query = new TermQuery(new Term(field, term));
        int count = countDocs(indexTestSupport.searcher.search(query));
        assertEquals("Wrong number of matches for " + query.toString(), expectedCount, count);
    }

    /** Asserts that the directory holds {@link #EXPECTED_TOTAL_DOCS} documents. */
    private void assertTotalDocs() throws IOException {
        LuxSearcher searcher = new LuxSearcher(dir);
        try {
            assertEquals(EXPECTED_TOTAL_DOCS, countDocs(searcher.search(new MatchAllDocsQuery())));
        } finally {
            // close the searcher even when the assertion fails
            searcher.close();
        }
    }

    /**
     * Drains the iterator and returns how many documents it produced.
     *
     * @param iter iterator positioned before its first document
     * @return the number of documents iterated
     */
    private static int countDocs(DocIdSetIterator iter) throws IOException {
        int count = 0;
        while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            ++count;
        }
        return count;
    }

    /**
     * Builds an index of {@link #HAMLET} using a new indexer with the given options.
     * When {@link #GATHER_TIMING} is set the build is repeated three more times, each
     * against a fresh directory and a fresh indexer.
     *
     * @param desc label used in the printed size/timing line
     * @param options bitwise OR of IndexConfiguration option flags
     * @return the support object of the last build
     */
    private IndexTestSupport buildIndex(String desc, int options)
            throws XMLStreamException, IOException, SaxonApiException {
        IndexTestSupport index = buildIndex(desc, new XmlIndexer(options));
        if (GATHER_TIMING) {
            for (int i = 0; i < 3; i++) {
                reset();
                index = buildIndex(desc, new XmlIndexer(options));
            }
        }
        return index;
    }

    /**
     * Builds an index of {@link #HAMLET} in the current directory with the given
     * indexer and prints the elapsed time and resulting directory size.
     *
     * @param desc label used in the printed size/timing line
     * @param indexer pre-configured indexer to use
     * @return the support object wrapping the new index
     */
    private IndexTestSupport buildIndex(String desc, XmlIndexer indexer)
            throws XMLStreamException, IOException, SaxonApiException {
        // nanoTime is monotonic, unlike currentTimeMillis, so it is the right clock for elapsed time
        long t0 = System.nanoTime();
        IndexTestSupport indexTestSupport = new IndexTestSupport(HAMLET, indexer, dir);
        long elapsedMillis = (System.nanoTime() - t0) / 1000000L;
        System.out.println(
                String.format("indexed %s in %d ms %d bytes", desc, elapsedMillis, dir.sizeInBytes()));
        return indexTestSupport;
    }

    /** Debugging aid: dumps every indexed term. Not called by default. */
    @SuppressWarnings("unused")
    private void printAllTerms(IndexTestSupport indexTestSupport) throws Exception {
        indexTestSupport.printAllTerms();
    }
}
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */