// WhitespaceTokenizer.java example (branches: jcr-master, jcr-develop)
/*
 * Copyright (C) 2003-2013 eXo Platform SAS.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.exoplatform.services.jcr.analyzer;

import java.io.Reader;

import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
 * Created by The eXo Platform SAS
 * Author : eXoPlatform
 *          exo@exoplatform.com
 * Apr 9, 2013
 *
 * A WhitespaceTokenizer is a tokenizer that divides text at whitespace and at
 * every character listed in the {@code search.excluded-characters} system
 * property. Adjacent sequences of the remaining characters form tokens.
 * <br>
 * The system property is read once, when this class is initialized; when it is
 * not set, only whitespace separates tokens.
 * <br>
 * You must specify the required {@link Version} compatibility when creating
 * {@link WhitespaceTokenizer}:
 * <ul>
 * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
 * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
 * {@link CharTokenizer#normalize(int)} for details.</li>
 * </ul>
 */
public class WhitespaceTokenizer extends CharTokenizer {

  /** Name of the system property listing the additional separator characters. */
  private static final String EXCLUDED_CHARACTERS_PROPERTY = "search.excluded-characters";

  /**
   * Characters that, in addition to whitespace, never belong to a token.
   * Resolved once at class initialization; empty when the property is unset.
   */
  private static final String EXCLUDED_CHARACTERS =
      System.getProperty(EXCLUDED_CHARACTERS_PROPERTY, "");

  /**
   * Construct a new WhitespaceTokenizer.
   *
   * @param matchVersion
   *          Lucene version to match
   * @param in
   *          the input to split up into tokens
   */
  public WhitespaceTokenizer(Version matchVersion, Reader in) {
    super(matchVersion, in);
  }

  /**
   * Construct a new WhitespaceTokenizer using a given {@link AttributeSource}.
   *
   * @param matchVersion
   *          Lucene version to match
   * @param source
   *          the attribute source to use for this {@link org.apache.lucene.analysis.Tokenizer}
   * @param in
   *          the input to split up into tokens
   */
  public WhitespaceTokenizer(Version matchVersion, AttributeSource source, Reader in) {
    super(matchVersion, source, in);
  }

  /**
   * Construct a new WhitespaceTokenizer using a given
   * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
   *
   * @param matchVersion
   *          Lucene version to match
   * @param factory
   *          the attribute factory to use for this {@link org.apache.lucene.analysis.Tokenizer}
   * @param in
   *          the input to split up into tokens
   */
  public WhitespaceTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
    super(matchVersion, factory, in);
  }

  /**
   * Accepts only characters which do not satisfy
   * {@link Character#isWhitespace(int)} and which are not listed in the
   * {@code search.excluded-characters} system property.
   *
   * @param c
   *          the code point to test
   * @return {@code true} if {@code c} belongs to a token, {@code false} if it
   *         separates tokens
   */
  @Override
  protected boolean isTokenChar(int c) {
    // String.indexOf(int) takes a code point, so no narrowing to char is needed.
    return !Character.isWhitespace(c) && EXCLUDED_CHARACTERS.indexOf(c) == -1;
  }
}