package lux.index.field;
import static org.junit.Assert.*;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lux.index.analysis.AttributeTokenStream;
import lux.index.analysis.DefaultAnalyzer;
import lux.index.analysis.ElementTokenStream;
import lux.index.analysis.ElementVisibility;
import lux.index.analysis.XmlTextTokenStream;
import lux.index.analysis.XmlTokenStreamBase;
import lux.xml.OffsetDocBuilder;
import lux.xml.Offsets;
import lux.xml.XmlReader;
import net.sf.saxon.s9api.Processor;
import net.sf.saxon.s9api.XdmNode;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.junit.Test;
/**
 * Tests the XML token streams ({@link ElementTokenStream}, {@link XmlTextTokenStream} and
 * {@link AttributeTokenStream}) against XML fixtures loaded from the classpath.
 *
 * <p>Each test builds a stream with {@link #setup(String, Class)} and then walks it token by
 * token, checking the term text and position increment. For element and text streams the
 * reported character offsets are also checked by slicing the original document text.
 */
public class QNameTokenStreamTest {

    /**
     * Matches an XML numeric character reference: hexadecimal digits in group 1
     * ({@code &#x...;}) or decimal digits in group 2 ({@code &#...;}). The digit counts are
     * bounded so the parsed value always fits in an {@code int}.
     */
    private static final Pattern NUMERIC_CHAR_REF =
            Pattern.compile("&#x([0-9a-fA-F]{1,6});|&#([0-9]{1,7});");

    // State shared between setup() and the assertion helpers; re-created on every setup() call.
    Processor processor;
    TokenStream tokenStream;
    CharTermAttribute termAtt;
    PositionIncrementAttribute posAtt;
    OffsetAttribute offsetAtt;
    /** The raw fixture document, decoded as UTF-8; token offsets are checked against this. */
    String inputString;

    /** With TRANSPARENT as the default visibility, the full nested token sequence is expected. */
    @Test
    public void testElementTokenStream() throws Exception {
        setup("lux/reader-test.xml", ElementTokenStream.class);
        XmlTokenStreamBase xmlTokens = (XmlTokenStreamBase) tokenStream;
        xmlTokens.setDefaultVisibility(ElementVisibility.TRANSPARENT);
        verifyAllElementText();
    }

    /**
     * With OPAQUE as the default visibility and only the {@code test} element marked CONTAINER,
     * the same token sequence as in {@link #testElementTokenStream()} is expected.
     */
    @Test
    public void testElementTokenStreamContainer() throws Exception {
        setup("lux/reader-test.xml", ElementTokenStream.class);
        XmlTokenStreamBase xmlTokens = (XmlTokenStreamBase) tokenStream;
        xmlTokens.setDefaultVisibility(ElementVisibility.OPAQUE);
        int testNameCode = processor.getUnderlyingConfiguration().getNamePool().allocateClarkName("test");
        xmlTokens.setElementVisibility(testNameCode, ElementVisibility.CONTAINER);
        verifyAllElementText();
    }

    /**
     * Asserts the complete token sequence in which every term found under a nested element
     * ({@code title}, {@code entities}, {@code token}) is followed by the same term qualified
     * with {@code test:} at position increment 0, and then asserts the stream is exhausted.
     */
    private void verifyAllElementText() throws IOException {
        assertToken("title:test", 1);
        assertToken("test:test", 0);
        assertToken("entities:0", 1);
        assertToken("test:0", 0);
        // check position increments for tokens in a phrase
        // also test correct offset calculation for CDATA
        for (String token : "this is some markup that is escaped".split(" ")) {
            assertToken("test:" + token, 1);
        }
        assertToken("entities:ģé", 1);
        assertToken("test:ģé", 0);
        assertToken("token:12345678", 1);
        assertToken("test:12345678", 0);
        assertToken("test:the", 1);
        assertToken("test:end", 1);
        assertFalse(tokenStream.incrementToken());
    }

    /**
     * Checks that the default visibility is OPAQUE and that, with that default, each term is
     * emitted exactly once (no additional {@code test:} token at position increment 0 for text
     * inside {@code title}, {@code entities} or {@code token}).
     */
    @Test
    public void testOpaqueElementTokenStream() throws Exception {
        setup("lux/reader-test.xml", ElementTokenStream.class);
        assertEquals("unexpected default", ElementVisibility.OPAQUE,
                ((XmlTokenStreamBase) tokenStream).getDefaultVisibility());
        assertToken("title:test", 1);
        assertToken("entities:0", 1);
        // check position increments for tokens in a phrase
        // also test correct offset calculation for CDATA
        for (String token : "this is some markup that is escaped".split(" ")) {
            assertToken("test:" + token, 1);
        }
        assertToken("entities:ģé", 1);
        assertToken("token:12345678", 1);
        assertToken("test:the", 1);
        assertToken("test:end", 1);
        assertFalse(tokenStream.incrementToken());
    }

    /** The text stream is expected to emit bare terms (no element-name prefix). */
    @Test
    public void testTextTokenStream() throws Exception {
        setup("lux/reader-test.xml", XmlTextTokenStream.class);
        assertToken("test", 1);
        assertToken("0", 1);
        // check position increments for tokens in a phrase
        // also test correct offset calculation for CDATA
        for (String token : "this is some markup that is escaped".split(" ")) {
            assertToken(token, 1);
        }
        assertToken("ģé", 1);
        assertToken("12345678", 1);
        assertToken("the", 1);
        assertToken("end", 1);
        assertFalse(tokenStream.incrementToken());
    }

    /** The attribute stream is expected to emit {@code name:value} terms; offsets are not checked. */
    @Test
    public void testAttributeTokenStream() throws Exception {
        setup("lux/reader-test.xml", AttributeTokenStream.class);
        assertTokenNoOffsets("id:test", 1);
        assertTokenNoOffsets("id:2", 1);
        assertFalse(tokenStream.incrementToken());
    }

    /**
     * For the {@code no-text.xml} fixture, the attribute stream should still emit its two
     * {@code id} terms, while the element and text streams should emit nothing at all.
     */
    @Test
    public void testNoTextDocument() throws Exception {
        setup("lux/no-text.xml", AttributeTokenStream.class);
        assertTokenNoOffsets("id:1", 1);
        assertTokenNoOffsets("id:2", 1);
        assertFalse(tokenStream.incrementToken());
        setup("lux/no-text.xml", ElementTokenStream.class);
        assertFalse(tokenStream.incrementToken());
        setup("lux/no-text.xml", XmlTextTokenStream.class);
        assertFalse(tokenStream.incrementToken());
    }

    /**
     * Loads the named classpath resource, parses it into a document with recorded offsets, and
     * reflectively constructs a token stream of the requested class over that document. The
     * stream's term, offset and position-increment attributes are stored in fields and the
     * stream is reset, ready for {@code incrementToken()}.
     *
     * @param filename classpath-relative path of the XML fixture
     * @param tokenStreamClass token stream class to instantiate; it must have a public
     *        constructor taking (String, Analyzer, TokenStream, XdmNode, Offsets, Processor)
     * @throws Exception if the resource cannot be read or parsed, or the stream cannot be built
     */
    private void setup(String filename, Class<?> tokenStreamClass) throws Exception {
        InputStream resource = getClass().getClassLoader().getResourceAsStream(filename);
        // Fail with the resource name rather than an anonymous NullPointerException.
        assertNotNull("test resource not found on classpath: " + filename, resource);
        byte[] input;
        try {
            input = IOUtils.toByteArray(resource);
        } finally {
            resource.close();
        }
        inputString = new String(input, "utf-8");
        processor = new Processor(false);
        OffsetDocBuilder builder = new OffsetDocBuilder(processor);
        // Enable CRLF fix-up whenever the fixture contains a carriage return.
        boolean hasCRLF = false;
        for (byte b : input) {
            if (b == '\r') {
                hasCRLF = true;
                break;
            }
        }
        builder.setFixupCRLF(hasCRLF); // TODO: should be autodetected
        XmlReader reader = new XmlReader();
        reader.addHandler(builder);
        reader.read(new ByteArrayInputStream(input));
        XdmNode doc = builder.getDocument();
        DefaultAnalyzer defaultAnalyzer = new DefaultAnalyzer();
        TokenStream textTokens = defaultAnalyzer.tokenStream("dummy", new CharSequenceReader(""));
        tokenStream = (TokenStream) tokenStreamClass
                .getConstructor(String.class, Analyzer.class, TokenStream.class, XdmNode.class,
                        Offsets.class, Processor.class)
                .newInstance("dummy", defaultAnalyzer, textTokens, doc, builder.getOffsets(), processor);
        termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
        posAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
    }

    /**
     * Advances the stream by one token and asserts its term text and position increment,
     * without looking at its character offsets.
     *
     * @param token expected term text
     * @param posIncr expected position increment
     * @throws IOException if the stream fails while advancing
     */
    private void assertTokenNoOffsets(String token, int posIncr) throws IOException {
        assertTrue("Token stream ended unexpectedly", tokenStream.incrementToken());
        assertEquals(token, termAtt.toString());
        assertEquals(posIncr, posAtt.getPositionIncrement());
    }

    /**
     * Like {@link #assertTokenNoOffsets(String, int)}, and additionally asserts that the
     * token's start/end offsets select, from the raw document text, a span that normalizes to
     * the term (the part of {@code token} after the first {@code ':'}, or all of it if there
     * is none).
     *
     * @param token expected term text, optionally prefixed with {@code name:}
     * @param posIncr expected position increment
     * @throws IOException if the stream fails while advancing
     */
    private void assertToken(String token, int posIncr) throws IOException {
        assertTokenNoOffsets(token, posIncr);
        String rawSpan = inputString.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        String term = token.substring(token.indexOf(':') + 1);
        assertEquals("incorrect character offset", term, normalize(rawSpan));
    }

    /**
     * Converts a span of raw document text into the form expected of an indexed term: XML
     * numeric character references (decimal {@code &#NN;} or hexadecimal {@code &#xHH;}) are
     * replaced by the character they denote, and the result is lower-cased.
     *
     * <p>References that do not denote a valid Unicode code point are left untouched. Text
     * containing no references is only lower-cased.
     *
     * @param t raw text taken from the source document
     * @return the decoded, lower-cased text
     */
    private String normalize(String t) {
        Matcher ref = NUMERIC_CHAR_REF.matcher(t);
        StringBuffer decoded = new StringBuffer(t.length());
        while (ref.find()) {
            String hexDigits = ref.group(1);
            int codePoint = hexDigits != null
                    ? Integer.parseInt(hexDigits, 16)
                    : Integer.parseInt(ref.group(2), 10);
            String replacement = Character.isValidCodePoint(codePoint)
                    ? new String(Character.toChars(codePoint))
                    : ref.group();
            // quoteReplacement: the decoded character may itself be '$' or '\'.
            ref.appendReplacement(decoded, Matcher.quoteReplacement(replacement));
        }
        ref.appendTail(decoded);
        // Locale.ROOT keeps the comparison independent of the JVM's default locale.
        return decoded.toString().toLowerCase(Locale.ROOT);
    }
}