/*
* This file is part of Caliph & Emir.
*
* Caliph & Emir is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Caliph & Emir is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Caliph & Emir; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Copyright statement:
* --------------------
* (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
* http://www.juggle.at, http://caliph-emir.sourceforge.net
*/
package at.lux.retrieval.clustering;
import java.util.*;
/**
* @author Mathias Lux, mathias@juggle.at
* Date: 03.06.2004
* Time: 13:42:41
*/
public class TextSuffixTree extends AbstractSuffixTree{
public TextSuffixTree(HashSet<String> stopwordlist) {
super();
stopwords = stopwordlist;
}
/**
* Override this one if you want to change the way of handling tokens
* (or words in this implementation)
* @param sentence
* @return
*/
protected String[] getTokens(String sentence) {
// TODO: eventually think about integrating some symbols like @ and / for supporting URIs as tokens
String[] tokens = sentence.toLowerCase().split("[^a-zA-Z0-9�������]");
return tokens;
}
/**
* Override this method if you want to use another method to create the sentences.
* @param phrase
* @return an array of sentences.
*/
protected String[] getSentences(String phrase) {
String[] sentences = phrase.toLowerCase().split("[.;!?:\\-\\\"'()\\[\\]{}]\\s");
return sentences;
}
protected String[] filterTokens(String[] tokens) {
LinkedList<String> tokenList = new LinkedList<String>();
for (int i = 0; i < tokens.length; i++) {
String token = tokens[i];
boolean add = true;
if (token.length() < 3) add = false;
if (token.equals("...")) add = false;
if (add) tokenList.add(token);
}
String[] result = new String[tokenList.size()];
tokenList.toArray(result);
return result;
}
}