package org.basex.util.ft;
import static org.basex.util.Token.*;
import static org.basex.util.ft.FTFlag.*;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import org.basex.util.TokenBuilder;
import org.basex.util.Util;
import org.basex.util.list.IntList;
/**
* Full-text tokenizer.
*
* @author BaseX Team 2005-12, BSD License
* @author Christian Gruen
*/
public final class WesternTokenizer extends Tokenizer {
/** Supported languages. */
private static final HashSet<Language> SUPPORTED = new HashSet<Language>();
static {
final String[] nonw = { "ar", "ja", "ko", "th", "zh" };
for(final Language l : Language.ALL.values()) {
if(!eq(l.code(), nonw)) SUPPORTED.add(l);
}
}
/** Cached sentence positions. */
private final IntList sen = new IntList();
/** Cached paragraph positions. */
private final IntList par = new IntList();
/** Diacritics flag. */
private final boolean dc;
/** Sensitivity flag. */
private final boolean cs;
/** Uppercase flag. */
private final boolean uc;
/** Lowercase flag. */
private final boolean lc;
/** Wildcard flag. */
private final boolean wc;
/** Flag for a paragraph. */
private boolean pa;
/** Text. */
private byte[] text = EMPTY;
/** Current sentence. */
private int sent;
/** Current paragraph. */
private int para;
/** Last punctuation mark. */
private int pm;
/** Last character position. */
private int lp;
/** Character start position. */
private int spos;
/** Current token position. */
private int pos = -1;
/** Current character position. */
private int cpos;
/** Flag indicating a special character. */
private boolean sc;
/** Next pointer. */
private int next;
/**
* Constructor.
* @param f full-text options
*/
public WesternTokenizer(final FTOpt f) {
lc = f != null && f.is(LC);
uc = f != null && f.is(UC);
cs = f != null && f.is(CS);
wc = f != null && f.is(WC);
dc = f != null && f.is(DC);
}
@Override
Collection<Language> languages() {
return SUPPORTED;
}
@Override
Tokenizer get(final FTOpt f) {
return new WesternTokenizer(f);
}
@Override
public WesternTokenizer init(final byte[] txt) {
if(text != txt) {
text = txt;
sen.reset();
par.reset();
}
init();
return this;
}
/**
* Initializes the iterator.
*/
private void init() {
sent = 0;
para = 0;
pos = -1;
cpos = 0;
next = 0;
}
@Override
public boolean hasNext() {
if(next <= 0 && (special ? moreSC() : more())) next++;
return next > 0;
}
@Override
public FTSpan next() {
return new FTSpan(nextToken(), pos, sc);
}
@Override
public byte[] nextToken() {
if(--next < 0) hasNext();
return special ? getSC() : get();
}
/**
* Scans the next token and returns {@code true} if more tokens can be
* returned.
* @return result of check
*/
private boolean more() {
final int l = text.length;
++pos;
lp = cpos;
// parse whitespaces
boolean sn = false;
pa = false;
boolean bs = false;
for(; cpos < l; cpos += cl(text, cpos)) {
final int c = cp(text, cpos);
if(wc && !bs) {
bs = c == '\\';
if(bs) continue;
if(c == '.') break;
}
if(!sn && (c == '.' || c == '!' || c == '?')) {
sn = true;
++sent;
pm = c;
} else if(!pa && c == '\n') {
pa = true;
++para;
} else if(ftChar(c)) {
if(bs) {
// backslash (bs) followed by any character is the character itself:
--cpos;
bs = false;
}
break;
}
bs = false;
}
// end of text...
spos = cpos;
if(cpos == l) return false;
// parse token
for(; cpos < l; cpos += cl(text, cpos)) {
int c = cp(text, cpos);
// parse wildcards
if(wc && !bs) {
bs = c == '\\';
if(bs) continue;
if(c == '.') {
c = cpos + 1 < l ? text[cpos + 1] : 0;
if(c == '?' || c == '*' || c == '+') {
++cpos;
} else if(c == '{') {
while(++cpos < l && text[cpos] != '}');
if(cpos == l) break;
}
continue;
}
}
if(!ftChar(c)) {
if(bs) --cpos;
break;
}
bs = false;
}
return true;
}
/**
* Returns a normalized version of the current token.
* @return result
*/
private byte[] get() {
byte[] n = orig();
final boolean a = ascii(n);
if(!a && !dc) n = dia(n);
if(uc) n = upper(n, a);
if(lc || !cs) n = lower(n, a);
return n;
}
/**
* Returns the original token.
* @return original token
*/
private byte[] orig() {
final int l = cpos - spos;
final byte[] copy = new byte[l];
System.arraycopy(text, spos, copy, 0, l);
return copy;
}
/**
* Checks if more tokens are to be returned; special characters are included.
* @return result of check
*/
private boolean moreSC() {
final int l = text.length;
// parse whitespaces
pa = false;
sc = false;
lp = cpos;
for(; cpos < l; cpos += cl(text, cpos)) {
final int c = cp(text, cpos);
if(c == '\n') {
pa = true;
++cpos;
sc = true;
break;
} else if(ftChar(c)) {
break;
}
sc = true;
}
// special chars found
if(lp < cpos) return true;
++pos;
// end of text...
spos = cpos;
if(cpos == l) return false;
// parse token
for(; cpos < l; cpos += cl(text, cpos)) {
final int c = cp(text, cpos);
if(!ftChar(c)) {
spos = cpos - cl(text, cpos);
break;
}
}
return true;
}
/**
* Get next token, including special characters.
* @return next token
*/
private byte[] getSC() {
return lp < cpos ? Arrays.copyOfRange(text, lp, cpos) :
Arrays.copyOfRange(text, cpos, spos);
}
@Override
int pos(final int w, final FTUnit u) {
if(u == FTUnit.WORD) return w;
// if necessary, calculate sentences and paragraphs
final IntList il = u == FTUnit.SENTENCE ? sen : par;
if(sen.size() == 0) {
init();
while(more()) {
sen.add(sent);
par.add(para);
}
}
return il.get(w);
}
/**
* Removes diacritics from the specified token. This method supports all
* latin1 characters, including supplements.
* @param t token to be converted
* @return converted token
*/
static byte[] dia(final byte[] t) {
// find first character to be normalized
final int tl = t.length;
for(int i = 0; i < tl; i += cl(t, i)) {
final int c = cp(t, i);
// normalized character found; run conversion
if(c != norm(c)) {
final TokenBuilder tb = new TokenBuilder();
tb.add(t, 0, i);
for(int j = i; j < tl; j += cl(t, j))
tb.add(norm(cp(t, j)));
return tb.finish();
}
}
// return original character
return t;
}
/**
* Converts the specified token to upper case.
* @param t token to be converted
* @param a ascii flag
* @return the converted token
*/
static byte[] upper(final byte[] t, final boolean a) {
final int tl = t.length;
if(a) {
for(int i = 0; i < tl; ++i)
t[i] = (byte) uc(t[i]);
return t;
}
final TokenBuilder tb = new TokenBuilder();
for(int i = 0; i < tl; i += cl(t, i))
tb.add(uc(cp(t, i)));
return tb.finish();
}
/**
* Converts the specified token to lower case.
* @param t token to be converted
* @param a ascii flag
* @return the converted token
*/
static byte[] lower(final byte[] t, final boolean a) {
final int tl = t.length;
if(a) {
for(int i = 0; i < tl; ++i)
t[i] = (byte) lc(t[i]);
return t;
}
final TokenBuilder tb = new TokenBuilder();
for(int i = 0; i < tl; i += cl(t, i))
tb.add(lc(cp(t, i)));
return tb.finish();
}
@Override
int[][] info() {
init();
final IntList[] il = new IntList[] { new IntList(), new IntList(),
new IntList(), new IntList(), new IntList()};
int lass = 0;
int lasp = 0;
int sl = 0;
int pl = 0;
while(more()) {
final byte[] n = orig();
final int l = n.length;
il[0].add(l);
for(final byte b : n) il[3].add(b);
if(sent != lass) {
if(sl > 0) {
il[1].add(sl);
il[4].add(pm);
}
lass = sent;
sl = 0;
}
if(para != lasp) {
if(pl > 0) il[2].add(pl);
lasp = para;
pl = 0;
}
sl += l;
pl += l;
}
if(sent != lass && sl > 0) {
il[1].add(sl);
il[4].add(pm);
}
if(pl > 0) il[2].add(pl);
// last sentence not finished with a punctuation mark
il[1].add(sl + 1);
return new int[][] { il[0].toArray(), il[1].toArray(), il[2].toArray(),
il[3].toArray(), il[4].toArray()};
}
@Override
protected byte prec() {
return 10;
}
@Override
boolean paragraph() {
return pa;
}
@Override
public String toString() {
return Util.name(this) + '[' + string(text) + ']';
}
}