// SpecialCharNormalizer.java example
//
// Explorer
// ontopia-master
/*
 * #!
 * Ontopia Classify
 * #-
 * Copyright (C) 2001 - 2013 The Ontopia Project
 * #-
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * !#
 */

package net.ontopia.topicmaps.classify;

import gnu.trove.set.hash.TIntHashSet;

/**
 * INTERNAL: Term normalizer and delimiter trimmer driven by two
 * configurable character sets: one holding characters that may be
 * stripped from the start of a string ("prechars") and one holding
 * characters that may be stripped from its end ("postchars").
 *
 * <p>The sets are filled once by the constructors and are not modified
 * by any method of this class afterwards.</p>
 */
public class SpecialCharNormalizer implements TermNormalizerIF, DelimiterTrimmerIF {

  /** Characters stripped from the beginning of a term/token. */
  private final TIntHashSet prechars = new TIntHashSet();
  /** Characters stripped from the end of a term/token. */
  private final TIntHashSet postchars = new TIntHashSet();

  /**
   * Creates a normalizer using the built-in default sets of leading
   * and trailing special characters.
   */
  public SpecialCharNormalizer() {
    this("<')(\"[ {\u00B7-%\u201c\u2018/$.,",
         ">')(.,\"':;!]? |}*\u00B7-%\u201d\u2019");
  }

  /**
   * Creates a normalizer from two strings, each character of which
   * becomes a member of the corresponding set.
   *
   * @param _prechars characters to strip from the start; null means none
   * @param _postchars characters to strip from the end; null means none
   */
  public SpecialCharNormalizer(String _prechars, String _postchars) {
    this((_prechars == null ? null : _prechars.toCharArray()),
         (_postchars == null ? null : _postchars.toCharArray()));
  }

  /**
   * Creates a normalizer from two character arrays.
   *
   * @param _prechars characters to strip from the start; null means none
   * @param _postchars characters to strip from the end; null means none
   */
  public SpecialCharNormalizer(char[] _prechars, char[] _postchars) {
    if (_prechars != null) {
      for (char c : _prechars) {
        prechars.add(c);
      }
    }
    if (_postchars != null) {
      for (char c : _postchars) {
        postchars.add(c);
      }
    }
  }

  /**
   * Strips leading prechars and trailing postchars from the term.
   *
   * <p>If fewer than two characters remain after stripping, null is
   * returned. This covers the empty string, terms consisting solely of
   * special characters, and terms that reduce to a single character
   * (single-character results were already mapped to null before this
   * method was corrected; that behavior is kept).</p>
   *
   * @param term the term to normalize; must not be null
   * @return the term itself if nothing was stripped, the stripped
   *         substring otherwise, or null if fewer than two characters
   *         remain
   */
  public String normalize(String term) {
    int length = term.length();

    // Advance past every leading special character. The scan is allowed
    // to run to the end of the string so that a term made up only of
    // special characters is recognized as empty.
    int start = 0;
    while (start < length && prechars.contains(term.charAt(start))) {
      start++;
    }

    // Walk back over trailing special characters, never crossing start.
    // Stopping at end == start is enough: at most one character is left
    // at that point and the result is null either way.
    int end = length - 1;
    while (end > start && postchars.contains(term.charAt(end))) {
      end--;
    }

    if (end <= start) {
      return null;
    }
    if (start == 0 && end == length - 1) {
      // Nothing was stripped; hand back the same instance.
      return term;
    }
    return term.substring(start, end + 1);
  }

  /**
   * Finds the index of the first character of the token that is not a
   * prechar.
   *
   * @param token the token to inspect; must not be null
   * @return the index of the first non-prechar character, or 0 if the
   *         token is empty or every character is a prechar
   */
  public int trimStart(String token) {
    for (int i = 0; i < token.length(); i++) {
      if (!prechars.contains(token.charAt(i))) {
        return i;
      }
    }
    // NOTE(review): an all-prechar token yields 0, i.e. "nothing to
    // trim"; kept as-is because callers are not visible from here.
    return 0;
  }

  /**
   * Finds the index of the last character of the token that is not a
   * postchar.
   *
   * @param token the token to inspect; must not be null
   * @return the index of the last non-postchar character, or
   *         {@code token.length() - 1} if the token is empty or every
   *         character is a postchar
   */
  public int trimEnd(String token) {
    int last = token.length() - 1;
    for (int i = last; i >= 0; i--) {
      if (!postchars.contains(token.charAt(i))) {
        return i;
      }
    }
    // NOTE(review): an all-postchar token yields the last index, i.e.
    // "nothing to trim"; kept as-is because callers are not visible
    // from here.
    return last;
  }

}