/**
* Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved.
* EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
* http://www.ewcms.com
*/
package com.ewcms.content.document.util.analyzer;
/**
* <ul>
* 语义单元(词元)
* </ul>
*
* @author 吴智俊
*/
public final class Lexeme implements Comparable<Lexeme>{
//lexemeType常量
//普通词元
public static final int TYPE_CJK_NORMAL = 0;
//姓氏
public static final int TYPE_CJK_SN = 1;
//尾缀
public static final int TYPE_CJK_SF = 2;
//未知的
public static final int TYPE_CJK_UNKNOWN = 3;
//数词
public static final int TYPE_NUM = 10;
//量词
public static final int TYPE_NUMCOUNT = 11;
//英文
public static final int TYPE_LETTER = 20;
//词元的起始位移
private int offset;
//词元的相对起始位置
private int begin;
//词元的长度
private int length;
//词元文本
private String lexemeText;
//词元类型
private int lexemeType;
//当前词元的前一个词元
private Lexeme prev;
//当前词元的后一个词元
private Lexeme next;
public Lexeme(int offset , int begin , int length , int lexemeType){
this.offset = offset;
this.begin = begin;
if(length < 0){
throw new IllegalArgumentException("length < 0");
}
this.length = length;
this.lexemeType = lexemeType;
}
/*
* 判断词元相等算法
* 起始位置偏移、起始位置、终止位置相同
* @see java.lang.Object#equals(Object o)
*/
public boolean equals(Object o){
if(o == null){
return false;
}
if(this == o){
return true;
}
if(o instanceof Lexeme){
Lexeme other = (Lexeme)o;
if(this.offset == other.getOffset()
&& this.begin == other.getBegin()
&& this.length == other.getLength()){
return true;
}else{
return false;
}
}else{
return false;
}
}
/*
* 词元哈希编码算法
* @see java.lang.Object#hashCode()
*/
public int hashCode(){
int absBegin = getBeginPosition();
int absEnd = getEndPosition();
return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
}
/*
* 词元在排序集合中的比较算法
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
public int compareTo(Lexeme other) {
//起始位置优先
if(this.begin < other.getBegin()){
return -1;
}else if(this.begin == other.getBegin()){
//词元长度优先
if(this.length > other.getLength()){
return -1;
}else if(this.length == other.getLength()){
return 0;
}else {//this.length < other.getLength()
return 1;
}
}else{//this.begin > other.getBegin()
return 1;
}
}
/**
* 判断词元是否彼此包含
* @param other
* @return boolean true 完全包含 , false 可能不相交 或者 相交但不包含
*/
public boolean isOverlap(Lexeme other){
if(other != null){
if(this.getBeginPosition() <= other.getBeginPosition()
&& this.getEndPosition() >= other.getEndPosition()){
return true;
}else if(this.getBeginPosition() >= other.getBeginPosition()
&& this.getEndPosition() <= other.getEndPosition()){
return true;
}else {
return false;
}
}
return false;
}
public int getOffset() {
return offset;
}
public void setOffset(int offset) {
this.offset = offset;
}
public int getBegin() {
return begin;
}
/**
* 获取词元在文本中的起始位置
* @return int
*/
public int getBeginPosition(){
return offset + begin;
}
public void setBegin(int begin) {
this.begin = begin;
}
/**
* 获取词元在文本中的结束位置
* @return int
*/
public int getEndPosition(){
return offset + begin + length;
}
/**
* 获取词元的字符长度
* @return int
*/
public int getLength(){
return this.length;
}
public void setLength(int length) {
if(this.length < 0){
throw new IllegalArgumentException("length < 0");
}
this.length = length;
}
/**
* 获取词元的文本内容
* @return String
*/
public String getLexemeText() {
if(lexemeText == null){
return "";
}
return lexemeText;
}
public void setLexemeText(String lexemeText) {
if(lexemeText == null){
this.lexemeText = "";
this.length = 0;
}else{
this.lexemeText = lexemeText;
this.length = lexemeText.length();
}
}
/**
* 获取词元类型
* @return int
*/
public int getLexemeType() {
return lexemeType;
}
public void setLexemeType(int lexemeType) {
this.lexemeType = lexemeType;
}
public String toString(){
StringBuffer strbuf = new StringBuffer();
strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
strbuf.append(" : ").append(this.lexemeText).append(" : \t");
switch(lexemeType) {
case TYPE_CJK_NORMAL :
strbuf.append("CJK_NORMAL");
break;
case TYPE_CJK_SF :
strbuf.append("CJK_SUFFIX");
break;
case TYPE_CJK_SN :
strbuf.append("CJK_NAME");
break;
case TYPE_CJK_UNKNOWN :
strbuf.append("UNKNOWN");
break;
case TYPE_NUM :
strbuf.append("NUMEBER");
break;
case TYPE_NUMCOUNT :
strbuf.append("COUNT");
break;
case TYPE_LETTER :
strbuf.append("LETTER");
break;
}
return strbuf.toString();
}
Lexeme getPrev() {
return prev;
}
void setPrev(Lexeme prev) {
this.prev = prev;
}
Lexeme getNext() {
return next;
}
void setNext(Lexeme next) {
this.next = next;
}
}