/** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.sohospace.paoding.cjk; import com.sohospace.dictionary.Dictionary; import com.sohospace.dictionary.Hit; import com.sohospace.paoding.CharSet; import com.sohospace.paoding.Collector; import com.sohospace.paoding.Knife; /** * * @author Zhiliang Wang [qieqie.wang@gmail.com] * * @since 1.0 * */ public class CJKKnife implements Knife { // ------------------------------------------------- private CJKDictionaryFactory factory; // ------------------------------------------------- public CJKKnife() { } public CJKKnife(CJKDictionaryFactory factory) { this.factory = factory; } // ------------------------------------------------- public CJKDictionaryFactory getFactory() { return factory; } public void setFactory(CJKDictionaryFactory factory) { this.factory = factory; } // ------------------------------------------------- public boolean assignable(CharSequence beaf, int index) { return CharSet.isCjkUnifiedIdeographs(beaf.charAt(index)); } public int dissect(Collector collector, CharSequence beaf, int offset) { if (CharSet.isCjkUnifiedIdeographs(beaf.charAt(beaf.length() - 1)) && offset > 0 && beaf.length() - offset < 50){ return -offset; } Dictionary vocabulary = factory.getVocabulary(); /* ����:�����ס�ڱ�����ˮ̶�Ÿ��� */ // setup��end���ڹ涨��֮��������Ƿ�Ϊ�ʵ���� int setup, end; // ΪunidentifiedIndex����Ϊ���ҳ��Ĵ������λ�õ�����ߣ�e.g '��','��','��','��' int identifiedEnd = offset; // ���ڶ�λδ�ִܷʵĿ�Ŀ�ʼλ�ã�e.g '��' int unidentifiedIndex = -1; //���ڸ����ж��Ƿ����shouldAWord()���� int maxWordLength = 0; Hit word = null; for (setup = offset, end = offset; setup < beaf.length() && CharSet.isCjkUnifiedIdeographs(beaf.charAt(setup)); end = ++setup) { for (int count = 1; end < beaf.length() && CharSet.isCjkUnifiedIdeographs(beaf.charAt(end++)); count++) { //��һ��forѭ��ʱ��end=setup+1 word = vocabulary.search(beaf, setup, count); if (word.isUndefined()) { if (unidentifiedIndex < 0 && setup >= identifiedEnd) { unidentifiedIndex = setup; } break; } else if (word.isHit()) { if (identifiedEnd < end) { identifiedEnd = end; } if (unidentifiedIndex >= 0) { dissectUnidentified(collector, beaf, unidentifiedIndex, setup - unidentifiedIndex); unidentifiedIndex = -1; } collector.collect(word.getWord(), setup, end); if (setup == offset && maxWordLength < count) { maxWordLength = count; } if (!(word.isUnclosed() && end < beaf.length()// ����ж���Ϊ�����жϷ��� && beaf.charAt(end) >= word.getNext().charAt(count))) { break; } } } } if (identifiedEnd != end) { dissectUnidentified(collector, beaf, identifiedEnd, end - identifiedEnd); } int len = end - offset; if (len > 2 && len != maxWordLength && shouldAWord(beaf, offset, end)) { collect(collector, beaf, offset, end); } return setup;//��ʱend=start } // ------------------------------------------------- /** * �ԷǴʻ���е��ִʷִ� * * @param cellector * @param beaf * @param offset * @param count */ protected void dissectUnidentified(Collector collector, CharSequence beaf, int offset, int count) { int end = offset + count; Hit word = null; int nearEnd = end - 1; for (int i = offset, j=i; i < end;) { j = skipXword(beaf, i, end); if (j >= 0 && i != j) { i = j; continue; } j = collectNumber(collector, beaf, i, end); if (j >= 0 && i != j) { i = j; continue; } word = factory.getXchars().search(beaf, i, 1); if (word.isHit()) { i++; continue; } // ͷ�� if (i == offset) { // �ٶ����¼�=�ٶ�+��+...!=�ٶ�+����+... collect(collector, beaf, offset, offset + 1); } // β�� if (i == nearEnd) { if (nearEnd != offset) { collect(collector, beaf, nearEnd, end); } } // ���Ԫ�ִ� else { collect(collector, beaf, i, i + 2); } i++; } } protected boolean shouldAWord(CharSequence beaf, int offset, int end) { if (offset > 0 && end < beaf.length()) {//ȷ��ǰ���ַ�����Ҳ���ַ� int prev = offset - 1; if (beaf.charAt(prev) == '��' && beaf.charAt(end) == '��') { return true; } else if (beaf.charAt(prev) == '��' && beaf.charAt(end) == '��') { return true; } else if (beaf.charAt(prev) == '\'' && beaf.charAt(end) == '\'') { return true; } else if (beaf.charAt(prev) == '\"' && beaf.charAt(end) == '\"') { return true; } } return false; } private final void collect(Collector collector, CharSequence beaf, int offset, int end) { collector.collect(beaf.subSequence(offset, end).toString(), offset, end); } private final int skipXword(CharSequence beaf, int offset, int end) { Hit word; for (int k = offset + 2; k <= end; k++) { word = factory.getXwords().search(beaf, offset, k - offset); if (word.isHit()) { offset = k; } if (word.isUndefined() || !word.isUnclosed()) { break; } } return offset; } private final int collectNumber(Collector collector, CharSequence beaf, int offset, int end) { int number1 = -1; int number2 = -1; int cur = offset; int bitValue = 0; int maxUnit = 0; boolean hasDigit = false;// ���ã�ȥ��û������ֻ�е�λ�ĺ��֣��硰�򡱣���ǧ�� for (; cur <= end && (bitValue = toNumber(beaf.charAt(cur))) >= 0; cur++) { if (bitValue == 2 && (beaf.charAt(cur) == '��' || beaf.charAt(cur) == '��' || beaf .charAt(cur) == '�z')) { if (cur != offset) break; } if (bitValue >= 0 && bitValue < 10) { hasDigit = true; if (number2 < 0) number2 = bitValue; else { number2 *= 10; number2 += bitValue; } } else { if (number2 < 0) { if (number1 < 0) { number1 = 1; } number1 *= bitValue; } else { if (number1 < 0) { number1 = 0; } if (bitValue >= maxUnit) { number1 += number2; number1 *= bitValue; maxUnit = bitValue; } else { number1 += number2 * bitValue; } } number2 = -1; } } if (!hasDigit && cur < beaf.length() && !factory.getUnits().search(beaf, cur, 1).isHit()) { return offset; } if (number2 > 0) { if (number1 < 0) { number1 = number2; } else { number1 += number2; } } if (number1 >= 0) { collector.collect(String.valueOf(number1), offset, cur); // ������ܸ��˼�����λ Hit wd; int i = cur + 1; while (i <= beaf.length() && (wd = factory.getUnits().search(beaf, cur, i - cur)) .isHit()) { collector.collect(String.valueOf(number1) + beaf.subSequence(cur, i), offset, i); cur++; if (!wd.isUnclosed()) { break; } i++; } } return cur; } private final int toNumber(char c) { switch (c) { case '��': case '��': return 0; case 'һ': case 'Ҽ': return 1; case '��': case '��': case '��': case '�E': return 2; case '��': case '��': return 3; case '��': case '��': return 4; case '��': case '��': return 5; case '��': case '�': return 6; case '��': case '��': return 7; case '��': case '��': return 8; case '��': case '��': return 9; case 'ʮ': case 'ʲ': return 10; case '��': case '��': return 100; case 'ǧ': case 'Ǫ': return 1000; case '��': case '�f': return 10000; case '��': case '�|': return 100000000; default: return -1; } } }