/** * Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved. * EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. * http://www.ewcms.com */ package com.ewcms.content.document.util.analyzer; import java.util.HashSet; import java.util.Set; import com.ewcms.content.document.util.analyzer.dic.Dictionary; import com.ewcms.content.document.util.analyzer.seg.ISegmenter; /** * <ul> * 分词器上下文状态 * </ul> * * @author 吴智俊 */ public class Context{ //是否使用最大词长切分(粗粒度) private boolean isMaxWordLength = false; //记录Reader内已分析的字串总长度 //在分多段分析词元时,该变量累计当前的segmentBuff相对于reader的位移 private int buffOffset; //最近一次读入的,可处理的字串长度 private int available; //最近一次分析的字串长度 private int lastAnalyzed; //当前缓冲区位置指针 private int cursor; //字符窜读取缓冲 private char[] segmentBuff; /* * 记录正在使用buffer的分词器对象 * 如果set中存在有分词器对象,则buffer不能进行位移操作(处于locked状态) */ private Set<ISegmenter> buffLocker; /* * 词元结果集,为每次游标的移动,存储切分出来的词元 */ private IKSortedLinkSet lexemeSet; Context(char[] segmentBuff , boolean isMaxWordLength){ this.isMaxWordLength = isMaxWordLength; this.segmentBuff = segmentBuff; this.buffLocker = new HashSet<ISegmenter>(4); this.lexemeSet = new IKSortedLinkSet(); } /** * 重置上下文 */ public void resetContext(){ buffLocker.clear(); lexemeSet = new IKSortedLinkSet(); buffOffset = 0; available = 0; lastAnalyzed = 0; cursor = 0; } public boolean isMaxWordLength() { return isMaxWordLength; } public void setMaxWordLength(boolean isMaxWordLength) { this.isMaxWordLength = isMaxWordLength; } public int getBuffOffset() { return buffOffset; } public void setBuffOffset(int buffOffset) { this.buffOffset = buffOffset; } public int getLastAnalyzed() { return lastAnalyzed; } public void setLastAnalyzed(int lastAnalyzed) { this.lastAnalyzed = lastAnalyzed; } public int getCursor() { return cursor; } public void setCursor(int cursor) { this.cursor = cursor; } public void lockBuffer(ISegmenter segmenter){ this.buffLocker.add(segmenter); } public void unlockBuffer(ISegmenter segmenter){ this.buffLocker.remove(segmenter); } /** * 只要buffLocker中存在ISegmenter对象 * 则buffer被锁定 * @return boolean 缓冲去是否被锁定 */ public boolean isBufferLocked(){ return this.buffLocker.size() > 0; } public int getAvailable() { return available; } public void setAvailable(int available) { this.available = available; } /** * 取出分词结果集中的首个词元 * @return Lexeme 集合的第一个词元 */ public Lexeme firstLexeme() { return this.lexemeSet.pollFirst(); } /** * 取出分词结果集中的最后一个词元 * @return Lexeme 集合的最后一个词元 */ public Lexeme lastLexeme() { return this.lexemeSet.pollLast(); } /** * 向分词结果集添加词元 * @param lexeme */ public void addLexeme(Lexeme lexeme){ if(!Dictionary.isStopWord(segmentBuff , lexeme.getBegin() , lexeme.getLength())){ this.lexemeSet.addLexeme(lexeme); } } /** * 获取分词结果集大小 * @return int 分词结果集大小 */ public int getResultSize(){ return this.lexemeSet.size(); } /** * 排除结果集中完全交叠(彼此包含)的词元 * 进行最大切分的时候,过滤长度较小的交叠词元 */ public void excludeOverlap(){ this.lexemeSet.excludeOverlap(); } /** * * @author linly * */ private class IKSortedLinkSet{ //链表头 private Lexeme head; //链表尾 private Lexeme tail; //链表的实际大小 private int size; private IKSortedLinkSet(){ this.size = 0; } /** * 向链表集合添加词元 * @param lexeme */ private void addLexeme(Lexeme lexeme){ if(this.size == 0){ this.head = lexeme; this.tail = lexeme; this.size++; return; }else{ if(this.tail.compareTo(lexeme) == 0){//词元与尾部词元相同,不放入集合 return; }else if(this.tail.compareTo(lexeme) < 0){//词元接入链表尾部 this.tail.setNext(lexeme); lexeme.setPrev(this.tail); this.tail = lexeme; this.size++; return; }else if(this.head.compareTo(lexeme) > 0){//词元接入链表头部 this.head.setPrev(lexeme); lexeme.setNext(this.head); this.head = lexeme; this.size++; return; }else{ //从尾部上逆 Lexeme l = this.tail; while(l != null && l.compareTo(lexeme) > 0){ l = l.getPrev(); } if(l.compareTo(lexeme) == 0){//词元与集合中的词元重复,不放入集合 return; }else if(l.compareTo(lexeme) < 0){//词元插入链表中的某个位置 lexeme.setPrev(l); lexeme.setNext(l.getNext()); l.getNext().setPrev(lexeme); l.setNext(lexeme); this.size++; return; } } } } /** * 取出链表集合的第一个元素 * @return Lexeme */ private Lexeme pollFirst(){ if(this.size == 1){ Lexeme first = this.head; this.head = null; this.tail = null; this.size--; return first; }else if(this.size > 1){ Lexeme first = this.head; this.head = first.getNext(); first.setNext(null); this.size --; return first; }else{ return null; } } /** * 取出链表集合的最后一个元素 * @return Lexeme */ private Lexeme pollLast(){ if(this.size == 1){ Lexeme last = this.head; this.head = null; this.tail = null; this.size--; return last; }else if(this.size > 1){ Lexeme last = this.tail; this.tail = last.getPrev(); last.setPrev(null); this.size--; return last; }else{ return null; } } /** * 剔除集合汇总相邻的切完全包含的lexeme * 进行最大切分的时候,过滤长度较小的交叠词元 */ private void excludeOverlap(){ if(this.size > 1){ Lexeme one = this.head; Lexeme another = one.getNext(); do{ if(one.isOverlap(another) //&& Lexeme.TYPE_CJK_NORMAL == one.getLexemeType() //&& Lexeme.TYPE_CJK_NORMAL == another.getLexemeType() ){ //邻近的两个词元完全交叠,且均为词典内的词语 another = another.getNext(); //从链表中断开交叠的词元 one.setNext(another); if(another != null){ another.setPrev(one); } this.size--; }else{//词元不完全交叠 one = another; another = another.getNext(); } }while(another != null); } } private int size(){ return this.size; } } }