/* * #! * Ontopia Classify * #- * Copyright (C) 2001 - 2013 The Ontopia Project * #- * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * !# */ package net.ontopia.topicmaps.classify; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * INTERNAL: */ public class CharacterAnalyzer implements TermAnalyzerIF { // Define a logging category. static Logger log = LoggerFactory.getLogger(CharacterAnalyzer.class.getName()); private static final CharacterAnalyzer INSTANCE = new CharacterAnalyzer(); private static double FACTOR_NO_LETTERS = 0.05d; private static double FACTOR_DIGITS_AND_OTHER = 0.1d; private static double FACTOR_DIGITS = 0.3d; private static double FACTOR_OTHER = 0.8d; public static CharacterAnalyzer getInstance() { return INSTANCE; } public void startAnalysis(TermDatabase tdb) { } public void analyzeTerm(Term term) { // score down if term contains non-letter characters String stem = term.getStem(); int length = stem.length(); int cnt_letters = 0; int cnt_digits = 0; int cnt_other = 0; // count character types for (int i=0; i < length; i++) { char c = stem.charAt(i); if (Character.isLetter(c) || Character.isWhitespace(c)) cnt_letters++; else if (Character.isDigit(c)) cnt_digits++; else cnt_other++; } if (log.isDebugEnabled()) log.debug("t: " + term + " l: " + cnt_letters + " d: " + cnt_digits + " o: " + cnt_other); // if term contains non-letter characters then score down double score = term.getScore(); if (score > 0d) { if (cnt_letters == 0) term.multiplyScore(FACTOR_NO_LETTERS, "no letters"); else if (cnt_digits > 0 && cnt_other > 0) term.multiplyScore(FACTOR_DIGITS_AND_OTHER, "digits and other chars"); else if (cnt_digits > 0) term.multiplyScore(FACTOR_DIGITS, "contains digits"); else if (cnt_other > 0) term.multiplyScore(FACTOR_OTHER, "contains other chars"); } } public void endAnalysis() { } }