/*
 * This file is part of Caliph & Emir.
 *
 * Caliph & Emir is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Caliph & Emir is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Caliph & Emir; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
 * http://www.juggle.at, http://caliph-emir.sourceforge.net
 */
package at.lux.fotoretrieval.lucene;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

import java.io.IOException;
import java.io.Reader;

/**
 * TokenStream that splits its input into tokens delimited by '[' and ']'.
 * Each bracketed group is lower-cased, inner blanks are replaced by '_' and a
 * leading '_' is prepended, because Lucene does not allow leading wildcards.
 * <p/>
 * Date: 25.03.2005
 * Time: 22:13:35
 *
 * @author Mathias Lux, mathias@juggle.at
 */
public class GraphTokenizer extends TokenStream {
    private char last = ' ';
    private Reader reader;
    private boolean tokenstart, tokenend;

    public GraphTokenizer(Reader in) {
        // super(in);
        reader = in;
    }

    /**
     * Returns true only for a blank that directly follows a closing ']',
     * i.e. the separator between two bracketed tokens.
     */
    protected boolean isTokenChar(char c) {
        boolean returnValue = false;
        if (c == ' ' && last == ']') {
            returnValue = true;
        }
        last = c;
        return returnValue;
    }

    public Token next() throws IOException {
        StringBuilder currenttoken = new StringBuilder(64);
        // currenttoken.append('[');
        char[] character = new char[1];
        int i = reader.read(character);
        // reset our states :)
        tokenstart = false;
        tokenend = false;
        do {
            // end of stream reached ... (Reader.read returns -1, not 0, at EOF)
            if (i < 0) return null;
            if (character[0] == '[') {
                // token starts here ...
                tokenstart = true;
            } else if (character[0] == ']') {
                // token ends here ...
                tokenend = true;
            } else if (tokenstart && !tokenend) {
                // between start and end ...
                currenttoken.append(character[0]);
            }
            // we found our token and return it ...
            if (tokenstart && tokenend) {
                // currenttoken.append(']');
                // prepend an underscore because Lucene does not allow leading wildcards.
                currenttoken.insert(0, '_');
                String tokenString = currenttoken.toString().trim().toLowerCase().replace(' ', '_');
                // end offset is exclusive, so it equals the token length
                return new Token(tokenString, 0, tokenString.length());
            }
            i = reader.read(character);
        } while (i > 0 && !tokenend);
        return null;
    }
}
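/*
 * Usage sketch (not part of the original class, shown only as a hedged
 * illustration): with the old Lucene Token API of this era, pulling tokens
 * from the stream might look like the snippet below. The sample input string
 * and variable names are purely illustrative assumptions.
 *
 *   Reader in = new StringReader("[person Anna] [place Graz]");
 *   GraphTokenizer tokenizer = new GraphTokenizer(in);
 *   for (Token t = tokenizer.next(); t != null; t = tokenizer.next()) {
 *       System.out.println(t.termText());  // expected: "_person_anna", then "_place_graz"
 *   }
 */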