package org.cdlib.xtf.util;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.cdlib.xtf.textIndexer.tokenizer.XTFTokenizer;
/**
* Like Lucene's StandardTokenizer, but handles the easy cases very quickly.
* Punts the hard cases to a real StandardTokenizer, but this is rare enough
* that the speed increase is very substantial.
*
* Does not currently support Chinese/Japanese/Korean, but adding this support
* would be pretty easy.
*
* @author Martin Haye
*/
public class FastTokenizer extends Tokenizer
{
  /** Array of characters to read from (a copy of the reader's string). */
  private final char[] source;

  /** Position within the {@link #source} array of the next unread char. */
  private int pos = 0;

  /**
   * Special character used to build the sentinel word that a
   * {@link DribbleReader} emits after the real text has been dribbled out.
   * Any occurrence of this character in text handed to the dribble reader
   * is replaced beforehand (see {@link DribbleReader#setChars}), so the
   * sentinel can be recognized unambiguously.
   */
  static final char fakeChar = '\u1049';

  /** This is the special (sentinel) word used by DribbleReader */
  static final String fakeWord = "" + fakeChar;

  /**
   * Used to dribble out tokens to a standard tokenizer; used when
   * we encounter a case that's hard to figure out. Created lazily by
   * {@link #next()} the first time a hard case is seen.
   */
  private DribbleReader dribbleReader;

  /** Standard tokenizer, used for hard cases only */
  private Tokenizer stdTokenizer;

  /**
   * Classification table indexed by UTF-16 code unit. Entries are:
   * <ul>
   *   <li><code>'a'</code> - alpha-numeric; runs of these form one token</li>
   *   <li><code>'s'</code> - symbol; becomes a single-character token</li>
   *   <li><code>'w'</code> - whitespace</li>
   *   <li><code>'p'</code> - punctuation that may require the standard
   *       tokenizer when it is followed by an alpha-numeric</li>
   *   <li><code>0</code> - anything else; skipped between tokens</li>
   * </ul>
   */
  private static final char[] charType = new char[0x10000];

  // Note: order matters below. The broad alpha ranges are filled in first,
  // and later single-character assignments (e.g. florin, greek pi)
  // deliberately override them.
  static
  {
    // Alpha-numeric... alpha
    setCharType('a', '\u0041', '\u005a');
    setCharType('a', '\u0061', '\u007a');
    setCharType('a', '\u00c0', '\u00d6');
    setCharType('a', '\u00d8', '\u00f6');
    setCharType('a', '\u00f8', '\u00ff');
    setCharType('a', '\u0100', '\u1fff');

    // ...num
    setCharType('a', '\u0030', '\u0039');
    setCharType('a', '\u0660', '\u0669');
    setCharType('a', '\u06f0', '\u06f9');
    setCharType('a', '\u0966', '\u096f');
    setCharType('a', '\u09e6', '\u09ef');
    setCharType('a', '\u0a66', '\u0a6f');
    setCharType('a', '\u0ae6', '\u0aef');
    setCharType('a', '\u0b66', '\u0b6f');
    setCharType('a', '\u0be7', '\u0bef');
    setCharType('a', '\u0c66', '\u0c6f');
    setCharType('a', '\u0ce6', '\u0cef');
    setCharType('a', '\u0d66', '\u0d6f');
    setCharType('a', '\u0e50', '\u0e59');
    setCharType('a', '\u0ed0', '\u0ed9');
    setCharType('a', '\u1040', '\u1049');

    // CJK characters
    setCharType('s', '\u3040', '\u318f');
    setCharType('s', '\u3300', '\u337f');
    setCharType('s', '\u3400', '\u3d2d');
    setCharType('s', '\u4e00', '\u9fff');
    setCharType('s', '\uf900', '\ufaff');

    // XTF internal markers
    charType['\uEBEB'] = 'a'; // start-of-field marker
    charType['\uEE1D'] = 'a'; // end-of-field marker

    // Whitespace
    charType[' '] = 'w';
    charType['\t'] = 'w';
    charType['\n'] = 'w';
    charType['\r'] = 'w';
    charType['\f'] = 'w';

    // Punctuation
    charType['\''] = 'p';
    charType['.'] = 'p';
    charType['&'] = 'p';
    charType['@'] = 'p';
    charType['-'] = 'p';
    charType['/'] = 'p';
    charType[','] = 'p';
    charType['_'] = 'p';

    // Currency Symbols
    charType['\u0024'] = 's'; // Dollar
    charType['\u00a2'] = 's'; // Cent
    charType['\u00a3'] = 's'; // Pound Sterling
    charType['\u00a4'] = 's'; // currency symbol
    charType['\u00a5'] = 's'; // Yen
    charType['\u0192'] = 's'; // Florin currency symbol (Dutch)
    charType['\u20a3'] = 's'; // Franc
    charType['\u20a4'] = 's'; // Lira
    charType['\u20a7'] = 's'; // Peseta
    charType['\u20ac'] = 's'; // Euro

    // Fractions
    charType['\u00bc'] = 's'; // one quarter
    charType['\u00bd'] = 's'; // one half
    charType['\u00be'] = 's'; // three quarters
    charType['\u2153'] = 's'; // one third
    charType['\u2154'] = 's'; // two thirds
    charType['\u2155'] = 's'; // one fifth
    charType['\u2156'] = 's'; // two fifths
    charType['\u2157'] = 's'; // three fifths
    charType['\u2158'] = 's'; // four fifths
    charType['\u2159'] = 's'; // one sixth
    charType['\u215a'] = 's'; // five sixths
    charType['\u215b'] = 's'; // one eighth
    charType['\u215c'] = 's'; // three eighths
    charType['\u215d'] = 's'; // five eighths
    charType['\u215e'] = 's'; // seven eighths

    // Math symbols
    charType['\u002b'] = 's'; // plus
    charType['\u2212'] = 's'; // minus
    charType['\u003d'] = 's'; // equals
    charType['\u2260'] = 's'; // not equal
    charType['\u003c'] = 's'; // less than
    charType['\u003e'] = 's'; // greater than
    charType['\u2264'] = 's'; // less than or equal
    charType['\u2265'] = 's'; // greater than or equal
    charType['\u00b1'] = 's'; // plus/minus
    charType['\u00d7'] = 's'; // multiply
    charType['\u00f7'] = 's'; // divide
    charType['\u2219'] = 's'; // period-centered bullet operator
    charType['\u00b7'] = 's'; // mid-dot (same as period-centered bullet operator)
    charType['\u007e'] = 's'; // tilde
    charType['\u005e'] = 's'; // circumflex
    charType['\u00b0'] = 's'; // degree
    charType['\u00ac'] = 's'; // logical not
    charType['\u2248'] = 's'; // approximately equal
    charType['\u00b5'] = 's'; // micro
    charType['\u221e'] = 's'; // infinity
    charType['\u2202'] = 's'; // partial differential
    charType['\u220f'] = 's'; // product
    charType['\u03c0'] = 's'; // lower-case greek pi
    charType['\u222b'] = 's'; // integral
    charType['\u2126'] = 's'; // ohm
    charType['\u221a'] = 's'; // radical
    charType['\u2206'] = 's'; // increment
    charType['\u2211'] = 's'; // summation
    charType['\u25ca'] = 's'; // lozenge
    charType['\u212e'] = 's'; // estimate
    charType['\u2032'] = 's'; // single prime
    charType['\u2033'] = 's'; // double prime
    charType['\u2116'] = 's'; // numero

    // Other symbols
    charType['\u00a7'] = 's'; // section sign
    charType['\u00ae'] = 's'; // registered trademark
    charType['\u00a9'] = 's'; // copyright
    charType['\u2122'] = 's'; // trademark
  }

  /**
   * Utility method used when setting up the character type table: marks
   * every code unit in the inclusive range [from, to] with the given type.
   *
   * @param type  Type code to store in {@link #charType}
   * @param from  First character of the range (inclusive)
   * @param to    Last character of the range (inclusive)
   */
  private static void setCharType(char type, char from, char to) {
    // Count with an int rather than a char: a char counter would wrap
    // around to zero after U+FFFF and the loop would never terminate if
    // 'to' were the last code unit.
    for (int c = from; c <= to; c++)
      charType[c] = type;
  }

  /**
   * Create a tokenizer that will tokenize the stream of characters from
   * the given reader. Note that the reader must be an instance of
   * FastStringReader, or else fast tokenization isn't possible.
   *
   * @param reader  Reader to get data from.
   */
  public FastTokenizer(FastStringReader reader) {
    super(reader);
    String str = reader.getString();
    source = str.toCharArray();
  }

  /**
   * Retrieve the next token in the stream, or null if there are no more.
   *
   * Simple runs of alpha-numeric characters and single symbols are turned
   * into tokens directly. When a run is immediately followed by punctuation
   * and then another alpha-numeric (e.g. an apostrophe or dot inside a
   * word), the text up to the next whitespace is handed to the standard
   * tokenizer and its first token is returned instead.
   *
   * @return  The next token, or null at end of input.
   * @throws IOException  If the standard tokenizer fails to read.
   */
  public Token next()
    throws IOException
  {
    final int tlen = source.length;

    // Normally runs exactly once; it only repeats in the rare case where
    // the standard tokenizer finds no token at all in a stretch of text.
    for (;;)
    {
      // Skip whitespace, punctuation, and unclassified characters.
      int tpos = pos;
      char type = 0;
      while (tpos < tlen) {
        type = charType[source[tpos]];
        if (type == 'a' || type == 's')
          break;
        tpos++;
      }
      final int start = tpos;

      // If we hit a symbol, gobble just that character (we'll make a
      // single-char token out of it.)
      //
      if (type == 's')
        tpos++;
      else
      {
        // Gobble up a string of alpha-numeric characters.
        while (tpos < tlen) {
          type = charType[source[tpos]];
          if (type != 'a')
            break;
          tpos++;
        }
      }
      pos = tpos;

      // If at end of string, return null to mark the fact.
      if (pos == start)
        return null;

      // The only situation where our stupid but fast implementation might
      // make a mistake is when there is punctuation followed by
      // alpha-numeric. In every other case the run we gobbled is the token.
      //
      if (tpos >= tlen - 1 || type != 'p' || charType[source[tpos + 1]] != 'a')
        return new Token(new String(source, start, pos - start), start, pos);

      // Okay, to be safe we'd better use the standard tokenizer. Feed it
      // everything up til the next whitespace (or end-of-string).
      //
      if (dribbleReader == null) {
        dribbleReader = new DribbleReader();
        stdTokenizer = new XTFTokenizer(dribbleReader);
      }

      while (pos < tlen && charType[source[pos]] != 'w')
        pos++;

      // Special case: the word "x" (a lone 'x' or 'X' not followed by an
      // alpha-numeric). It is always emitted as lower-case "x".
      //
      if (source[start] == 'x' || source[start] == 'X')
      {
        if ((start + 1 >= tlen) || (charType[source[start + 1]] != 'a')) {
          pos = start + 1;
          return new Token("x", start, start + 1, "word");
        }
      }

      // Now let's see what it thinks. First, get a reference token, making
      // sure that all old tokens have been dribbled away. (The dribble
      // reader emits the fake word endlessly once its text is used up, so
      // this loop always finds one.)
      //
      Token t1 = stdTokenizer.next();
      while (!t1.termText().equals(fakeWord))
        t1 = stdTokenizer.next();

      dribbleReader.setChars(source, start, pos);
      Token t2 = stdTokenizer.next();

      if (t2.termText().equals(fakeWord))
      {
        // Standard tokenizer ate everything we gave it, and we're back to
        // the fake token. If that was the end of the input, there isn't
        // any valid token left in the stream.
        //
        if (pos == tlen)
          return null;

        // Otherwise this stretch simply held no token; 'pos' already
        // points past it, so carry on scanning the rest of the input
        // rather than handing the sentinel back to the caller.
        //
        continue;
      }

      // The fake string is " <fakeWord> ", so the dribbled text begins two
      // characters after the start of the reference fake word. Anything
      // beyond that is text the standard tokenizer skipped before its
      // token. Happens with: <START-OF-FIELD> '.' '2' x x x...
      //
      final int skipped = t2.startOffset() - t1.startOffset() - 2;
      assert skipped >= 0;
      assert skipped > 0 || t2.termText().charAt(0) == source[start];

      final int tokStart = start + skipped;
      final int tokLen   = t2.endOffset() - t2.startOffset();
      pos = tokStart + tokLen;
      return new Token(t2.termText(), tokStart, pos, t2.type());
    }
  } // next()

  /**
   * This class is used, when the fast tokenizer encounters a questionable
   * situation, to dribble out characters to a standard tokenizer that can
   * do a more complete job. Whenever the supplied characters run out it
   * switches to dribbling the fake word string, and keeps doing so until
   * new characters are supplied; it never reports end-of-stream.
   *
   * @author Martin Haye
   */
  private static class DribbleReader extends Reader
  {
    /** String used to mark the end of the dribbled text */
    static final String fakeStr = " " + fakeWord + " ";

    /** Character array version of {@link #fakeStr} */
    private final char[] fakeChars = fakeStr.toCharArray();

    /** Buffer of characters currently being dribbled */
    private char[] buf = fakeChars;

    /** Current position within {@link #buf} */
    private int pos = 0;

    /** Max # of chars to dribble from {@link #buf} (exclusive end index) */
    private int max = fakeChars.length;

    /** Does nothing... required by interface */
    public void close()
      throws IOException
    {
    }

    /**
     * Establish a set of characters to dribble out. Note that the given
     * array is modified in place: any occurrence of the fake character in
     * the range is replaced by the preceding character, so that the real
     * text can never be mistaken for the fake word.
     *
     * @param buf  Array holding the characters (not copied)
     * @param pos  Index of the first character to dribble
     * @param max  Index just past the last character to dribble
     */
    public void setChars(char[] buf, int pos, int max)
    {
      assert this.pos == 0 : "should have eaten previous string";
      assert buf != fakeChars;

      for (int i = pos; i < max; i++) {
        if (buf[i] == fakeChar)
          buf[i] = (char) (fakeChar - 1);
      }

      this.buf = buf;
      this.pos = pos;
      this.max = max;
    } // setChars()

    /**
     * Dribble some characters. If we run out, we begin to dribble
     * the fake word string.
     *
     * @param cbuf  Destination buffer
     * @param off   Offset in the destination at which to store characters
     * @param len   Maximum number of characters to read
     * @return      Number of characters actually copied
     */
    public int read(char[] cbuf, int off, int len)
      throws IOException
    {
      final int avail  = max - pos;
      final int toRead = Math.min(len, avail);
      System.arraycopy(buf, pos, cbuf, off, toRead);
      pos += toRead;

      // Current buffer exhausted: switch (back) to the fake word string.
      if (pos == max) {
        buf = fakeChars;
        max = fakeChars.length;
        pos = 0;
      }
      return toRead;
    } // read()
  } // class DribbleReader
} // class FastTokenizer