/*
* #!
* Ontopia Classify
* #-
* Copyright (C) 2001 - 2013 The Ontopia Project
* #-
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* !#
*/
package net.ontopia.topicmaps.classify;
import gnu.trove.set.hash.TIntHashSet;
/**
 * INTERNAL: Term normalizer and delimiter trimmer that strips
 * configurable sets of special characters from the beginning and the
 * end of a string. One character set applies to the start of the
 * string (leading delimiters) and another one to the end (trailing
 * delimiters).
 */
public class SpecialCharNormalizer implements TermNormalizerIF, DelimiterTrimmerIF {

  /** Characters stripped from the start of a term or token. */
  private final TIntHashSet prechars = new TIntHashSet();

  /** Characters stripped from the end of a term or token. */
  private final TIntHashSet postchars = new TIntHashSet();

  /**
   * Creates a normalizer using the built-in default sets of leading
   * and trailing delimiter characters.
   */
  public SpecialCharNormalizer() {
    this("<')(\"[ {\u00B7-%\u201c\u2018/$.,",
         ">')(.,\"':;!]? |}*\u00B7-%\u201d\u2019");
  }

  /**
   * Creates a normalizer from two strings whose characters make up
   * the delimiter sets.
   *
   * @param _prechars characters to strip from the start; null means
   *        no leading delimiters
   * @param _postchars characters to strip from the end; null means
   *        no trailing delimiters
   */
  public SpecialCharNormalizer(String _prechars, String _postchars) {
    this((_prechars == null ? null : _prechars.toCharArray()),
         (_postchars == null ? null : _postchars.toCharArray()));
  }

  /**
   * Creates a normalizer from two character arrays.
   *
   * @param _prechars characters to strip from the start; null means
   *        no leading delimiters
   * @param _postchars characters to strip from the end; null means
   *        no trailing delimiters
   */
  public SpecialCharNormalizer(char[] _prechars, char[] _postchars) {
    addAll(prechars, _prechars);
    addAll(postchars, _postchars);
  }

  /** Adds every character of chars to target; a null array adds nothing. */
  private static void addAll(TIntHashSet target, char[] chars) {
    if (chars == null) {
      return;
    }
    for (char c : chars) {
      target.add(c);
    }
  }

  /**
   * Strips leading and trailing delimiter characters from the term.
   *
   * @param term the term to normalize; must not be null
   * @return the term itself if nothing had to be stripped, the
   *         stripped substring otherwise, or null if at most one
   *         character remains after stripping (this includes empty
   *         and delimiter-only terms)
   */
  public String normalize(String term) {
    int length = term.length();

    // Index of the first character that is not a leading delimiter.
    // The scan covers the whole term, so start == length when every
    // character is a leading delimiter.
    int start = 0;
    while (start < length && prechars.contains(term.charAt(start))) {
      start++;
    }

    // Index of the last character, at or after start, that is not a
    // trailing delimiter. Ends up at start - 1 when nothing is left.
    int end = length - 1;
    while (end >= start && postchars.contains(term.charAt(end))) {
      end--;
    }

    // Single-character results have always been rejected with null;
    // the same answer is given when nothing at all survives.
    if (end <= start) {
      return null;
    }
    // Nothing was stripped, so hand back the original instance.
    if (start == 0 && end == length - 1) {
      return term;
    }
    return term.substring(start, end + 1);
  }

  /**
   * Finds where the token starts once leading delimiters are skipped.
   *
   * @param token the token to examine; must not be null
   * @return the index of the first character that is not a leading
   *         delimiter, or 0 if the token is empty or consists only
   *         of leading delimiters
   */
  public int trimStart(String token) {
    for (int i = 0; i < token.length(); i++) {
      if (!prechars.contains(token.charAt(i))) {
        return i;
      }
    }
    // No non-delimiter character found: report the start of the token.
    return 0;
  }

  /**
   * Finds where the token ends once trailing delimiters are skipped.
   *
   * @param token the token to examine; must not be null
   * @return the index of the last character that is not a trailing
   *         delimiter, or token.length() - 1 if the token is empty
   *         or consists only of trailing delimiters
   */
  public int trimEnd(String token) {
    int last = token.length() - 1;
    for (int i = last; i >= 0; i--) {
      if (!postchars.contains(token.charAt(i))) {
        return i;
      }
    }
    // No non-delimiter character found: report the end of the token.
    return last;
  }
}