/** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package net.paoding.analysis.knife; import net.paoding.analysis.dictionary.Dictionary; import net.paoding.analysis.dictionary.Hit; /** * * @author Zhiliang Wang [qieqie.wang@gmail.com] * */ public class NumberKnife extends CombinatoricsKnife implements DictionariesWare { private Dictionary units; public NumberKnife() { } public NumberKnife(Dictionaries dictionaries) { setDictionaries(dictionaries); } public void setDictionaries(Dictionaries dictionaries) { super.setDictionaries(dictionaries); units = dictionaries.getUnitsDictionary(); } public int assignable(Beef beef, int offset, int index) { char ch = beef.charAt(index); if (CharSet.isArabianNumber(ch)) return ASSIGNED; if (index > offset) { if (CharSet.isLantingLetter(ch) || ch == '.' || ch == '-' || ch == '_') { if (CharSet.isLantingLetter(ch) || !CharSet.isArabianNumber(beef.charAt(index + 1))) { //分词效果 //123.456 ->123.456/ //123.abc.34 ->123/123.abc.34/abc/34/ ["abc"、"abc/34"系由LetterKnife分出,非NumberKnife] //没有或判断!CharSet.isArabianNumber(beef.charAt(index + 1)),则分出"123.",而非"123" //123.abc.34 ->123./123.abc.34/abc/34/ return POINT; } return ASSIGNED; } } return LIMIT; } protected int collectLimit(Collector collector, Beef beef, int offset, int point, int limit, int dicWordVote) { // "123abc"的直接调用super的 if (point != -1) { return super.collectLimit(collector, beef, offset, point, limit, dicWordVote); } // // 2.2两 // ^=_point // final int _point = limit; // 当前尝试判断的字符的位置 int curTail = offset; int number1 = -1; int number2 = -1; int bitValue = 0; int maxUnit = 0; //TODO:这里又重复从curTail(其值为offset)判断,重新遍历判断是否为数字,算是一个重复计算 //但考虑这个计算对中文分词性能影响微乎其微暂时先不优化 for (; (bitValue = CharSet.toNumber(beef.charAt(curTail))) >= 0; curTail++) { // if (bitValue == 2 && (beef.charAt(curTail) == '两' || beef.charAt(curTail) == '俩' || beef .charAt(curTail) == '倆')) { if (curTail != offset) { break; } } // 处理连续汉字个位值的数字:"三四五六" ->"3456" if (bitValue >= 0 && bitValue < 10) { if (number2 < 0) number2 = bitValue; else { number2 *= 10; number2 += bitValue; } } else { if (number2 < 0) { if (number1 < 0) { number1 = 1; } number1 *= bitValue; } else { if (number1 < 0) { number1 = 0; } if (bitValue >= maxUnit) { number1 += number2; number1 *= bitValue; maxUnit = bitValue; } else { number1 += number2 * bitValue; } } number2 = -1; } } if (number2 > 0) { if (number1 < 0) { number1 = number2; } else { number1 += number2; } } if (number1 >= 0 && curTail > _point) { doCollect(collector, String.valueOf(number1), beef, offset, curTail); } else { super.collectLimit(collector, beef, offset, point, limit, dicWordVote); } curTail = curTail > limit ? curTail : limit; // // 后面可能跟了计量单位 if (units != null && CharSet.isCjkUnifiedIdeographs(beef.charAt(curTail))) { Hit wd = null; Hit wd2 = null; int i = curTail + 1; while ((wd = units.search(beef, curTail, i - curTail)).isHit()) { wd2 = wd; i++; if (!wd.isUnclosed()) { break; } } i --; if (wd2 != null) { collector.collect(wd2.getWord().getText(), curTail, i); return i; } } // return curTail > limit ? curTail : -1; } }