package org.basex.util.ft;

import java.util.Collections;
import java.util.LinkedList;

/**
 * Abstract base class for all full-text tokenizers. It also keeps a static
 * registry of the tokenizer implementations that are available at runtime.
 *
 * @author BaseX Team 2005-12, BSD License
 * @author Jens Erat
 */
public abstract class Tokenizer extends LanguageImpl {
  /** Registry of all available tokenizer implementations. */
  public static final LinkedList<Tokenizer> IMPL = new LinkedList<Tokenizer>();

  /** Flag indicating whether special characters are included. */
  boolean special;

  /*
   * Registers the tokenizer implementations and sorts the registry according
   * to the natural ordering of its entries (intended to reflect precedence).
   * The Japanese tokenizer is only registered if it reports to be available.
   */
  static {
    final Tokenizer western = new WesternTokenizer(null);
    IMPL.add(western);

    if(JapaneseTokenizer.available()) {
      final Tokenizer japanese = new JapaneseTokenizer(null);
      IMPL.add(japanese);
    }

    Collections.sort(IMPL);
  }

  /**
   * Tests whether at least one registered tokenizer supports the given
   * language. The registry is scanned in order and the scan stops at the
   * first match.
   * @param l language to look for
   * @return {@code true} if a supporting tokenizer was found
   */
  public static boolean supportFor(final Language l) {
    boolean supported = false;
    for(final Tokenizer candidate : IMPL) {
      if(candidate.supports(l)) {
        supported = true;
        break;
      }
    }
    return supported;
  }

  /**
   * Factory method to be provided by each concrete tokenizer.
   * @param f full-text options
   * @return tokenizer
   */
  abstract Tokenizer get(final FTOpt f);

  /**
   * Returns full-text info for the specified token; needed for
   * visualizations. Implementing this method is optional; this default
   * yields an empty array.
   * <ul>
   * <li>int[0]: length of each token</li>
   * <li>int[1]: sentence info, length of each sentence</li>
   * <li>int[2]: paragraph info, length of each paragraph</li>
   * <li>int[3]: each token as int[]</li>
   * <li>int[4]: punctuation marks of each sentence</li>
   * </ul>
   * @return int arrays, or an empty array if not implemented
   */
  int[][] info() {
    final int[][] notImplemented = new int[0][];
    return notImplemented;
  }

  /**
   * Tells whether the current token is a paragraph. Implementing this method
   * is optional; this default always answers {@code false}.
   * @return whether the current token is a paragraph
   */
  boolean paragraph() {
    return false;
  }

  /**
   * Computes a position value that depends on the specified unit.
   * Implementing this method is optional; this default always yields
   * {@code 0}.
   * @param w word position
   * @param u unit
   * @return new position
   */
  @SuppressWarnings("unused")
  int pos(final int w, final FTUnit u) {
    return 0;
  }
}