/**
* Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved.
* EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
* http://www.ewcms.com
*/
package com.ewcms.content.document.util.analyzer.lucene;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import com.ewcms.content.document.util.analyzer.IKSegmentation;
import com.ewcms.content.document.util.analyzer.Lexeme;
/**
* <ul>
* IK查询分析器,
* 实现了对分词歧义结果的非冲突排列组合,
* 有效的优化对歧义关键词的搜索命中,
* 针对IK Analyzer V3的优化实现
* </ul>
*
* @author 吴智俊
*/
public final class IKQueryParser {
// Per-thread cache of parse results: maps a cache key derived from the query keyword
// to the TokenBranch tree built for it. The holder itself is never reassigned, hence final.
private static final ThreadLocal<Map<String, TokenBranch>> keywordCacheThreadLocal =
        new ThreadLocal<Map<String, TokenBranch>>();
// Segmentation strategy flag passed to IKSegmentation; true selects maximum-word-length segmentation.
private static boolean isMaxWordLength = false;
/**
 * Sets the segmentation strategy flag that is handed to the segmenter when a
 * keyword is analysed.
 *
 * @param isMaxWordLength {@code true} to use maximum-word-length segmentation
 */
public static void setMaxWordLength(boolean isMaxWordLength) {
    IKQueryParser.isMaxWordLength = isMaxWordLength;
}
/**
 * Collapses a list of queries into one query, avoiding an extra level of
 * nesting when the list has fewer than two elements.
 *
 * @param queries the queries to combine
 * @return {@code null} for an empty list, the single element for a
 *         one-element list, otherwise a BooleanQuery holding every element
 *         as a MUST clause
 */
private static Query optimizeQueries(List<Query> queries) {
    final int count = queries.size();
    if (count == 0) {
        return null;
    }
    if (count == 1) {
        return queries.get(0);
    }
    BooleanQuery conjunction = new BooleanQuery();
    for (int i = 0; i < count; i++) {
        conjunction.add(queries.get(i), Occur.MUST);
    }
    return conjunction;
}
/**
 * Returns the calling thread's parse cache, creating and registering it on
 * first use.
 *
 * @return the cache map bound to the current thread (never {@code null})
 */
private static Map<String, TokenBranch> getTheadLocalCache() {
    Map<String, TokenBranch> cache = keywordCacheThreadLocal.get();
    if (cache != null) {
        return cache;
    }
    // First access on this thread: install a small map.
    cache = new HashMap<String, TokenBranch>(4);
    keywordCacheThreadLocal.set(cache);
    return cache;
}
/**
 * Looks up a previously cached token-branch tree in the current thread's cache.
 *
 * @param query the cache key
 * @return the cached tree, or {@code null} when nothing is stored under the key
 */
private static TokenBranch getCachedTokenBranch(String query) {
    return getTheadLocalCache().get(query);
}
/**
 * Stores a token-branch tree in the current thread's cache.
 *
 * @param query the cache key
 * @param tb    the tree to remember under that key
 */
private static void cachedTokenBranch(String query, TokenBranch tb) {
    getTheadLocalCache().put(query, tb);
}
/**
 * Analyses one contiguous keyword (no embedded whitespace expected) against a
 * single field.
 *
 * <p>The keyword is segmented with IKSegmentation, the lexemes are arranged
 * into a TokenBranch tree, and the tree is converted to a query. Trees are
 * cached per thread. The cache key combines the segmentation mode with the
 * trimmed keyword, so a tree built under one mode is never served after
 * {@link #setMaxWordLength(boolean)} switched to the other.
 *
 * <p>NOTE(review): nothing in this class evicts cache entries, so the
 * per-thread map grows with the number of distinct keywords seen.
 *
 * @param field the document field name; must not be {@code null}
 * @param query the keyword; {@code null} or blank yields a TermQuery on a
 *              Term built from the field alone
 * @return the resulting query, or {@code null} when segmentation produced no
 *         lexemes
 * @throws IOException propagated from the segmenter
 * @throws IllegalArgumentException if {@code field} is {@code null}
 */
private static Query _parse(String field, String query) throws IOException {
    if (field == null) {
        throw new IllegalArgumentException("parameter \"field\" is null");
    }
    if (query == null || "".equals(query.trim())) {
        return new TermQuery(new Term(field));
    }
    final String keyword = query.trim();
    // Read the flag once so the cache key and the segmenter agree on the mode.
    final boolean maxWordLength = isMaxWordLength;
    final String cacheKey = (maxWordLength ? "max:" : "min:") + keyword;

    TokenBranch root = getCachedTokenBranch(cacheKey);
    if (root == null) {
        root = new TokenBranch(null);
        // Segment the keyword and feed each lexeme into the branch tree.
        IKSegmentation ikSeg = new IKSegmentation(new StringReader(keyword), maxWordLength);
        for (Lexeme lexeme = ikSeg.next(); lexeme != null; lexeme = ikSeg.next()) {
            root.accept(lexeme);
        }
        cachedTokenBranch(cacheKey, root);
    }
    return optimizeQueries(root.toQueries(field));
}
/**
 * Parses an IK simple query expression by handing it to a fresh
 * ExpressionParser.
 *
 * @param ikQueryExp the expression text
 * @return the query produced by {@code ExpressionParser.parserExp}
 */
public static Query parse(String ikQueryExp) {
    return new ExpressionParser().parserExp(ikQueryExp);
}
/**
 * Single-condition, single-field query analysis.
 *
 * <p>The query text is split on single whitespace characters. When more than
 * one piece results, each non-empty piece is analysed on its own and the
 * non-empty results are combined as SHOULD clauses; otherwise the text is
 * analysed as one keyword.
 *
 * @param field the document field name; must not be {@code null}
 * @param query the keyword text; {@code null} is treated like a blank keyword
 *              instead of failing with a NullPointerException
 * @return the resulting query; may be {@code null} when a single keyword
 *         yields no lexemes
 * @throws IOException propagated from keyword analysis
 * @throws IllegalArgumentException if {@code field} is {@code null}
 */
public static Query parse(String field, String query) throws IOException {
    if (field == null) {
        throw new IllegalArgumentException("parameter \"field\" is null");
    }
    if (query == null) {
        // _parse already defines the result for a null keyword; splitting null would throw.
        return _parse(field, null);
    }
    String[] qParts = query.split("\\s");
    if (qParts.length <= 1) {
        return _parse(field, query);
    }
    BooleanQuery resultQuery = new BooleanQuery();
    for (String q : qParts) {
        // Consecutive whitespace produces empty pieces; skip them.
        if ("".equals(q)) {
            continue;
        }
        Query partQuery = _parse(field, q);
        // Drop pieces that produced nothing or an empty boolean query.
        if (partQuery != null
                && (!(partQuery instanceof BooleanQuery)
                        || ((BooleanQuery) partQuery).getClauses().length > 0)) {
            resultQuery.add(partQuery, Occur.SHOULD);
        }
    }
    return resultQuery;
}
/**
 * Multi-field, single-condition query analysis: the same keyword text is
 * parsed against every non-null field and the usable results are combined
 * as SHOULD clauses.
 *
 * @param fields document field names; {@code null} entries are skipped
 * @param query  the keyword text
 * @return a BooleanQuery holding one SHOULD clause per usable field result
 * @throws IOException propagated from keyword analysis
 * @throws IllegalArgumentException if {@code fields} is {@code null}
 */
public static Query parseMultiField(String[] fields, String query) throws IOException {
    if (fields == null) {
        throw new IllegalArgumentException("parameter \"fields\" is null");
    }
    BooleanQuery combined = new BooleanQuery();
    for (int i = 0; i < fields.length; i++) {
        final String field = fields[i];
        if (field == null) {
            continue;
        }
        final Query sub = parse(field, query);
        if (sub == null) {
            continue;
        }
        // An empty boolean query contributes nothing, so leave it out.
        final boolean emptyBoolean = sub instanceof BooleanQuery
                && ((BooleanQuery) sub).getClauses().length == 0;
        if (!emptyBoolean) {
            combined.add(sub, Occur.SHOULD);
        }
    }
    return combined;
}
/**
 * Multi-field, single-condition query analysis with one Occur flag per field.
 *
 * @param fields document field names; {@code null} entries are skipped
 * @param query  the keyword text, parsed against every field
 * @param flags  the Occur to use for the clause of the field at the same index
 * @return a BooleanQuery holding one clause per usable field result
 * @throws IOException propagated from keyword analysis
 * @throws IllegalArgumentException if {@code fields} or {@code flags} is
 *         {@code null}, or their lengths differ
 */
public static Query parseMultiField(String[] fields, String query, BooleanClause.Occur[] flags) throws IOException {
    if (fields == null) {
        throw new IllegalArgumentException("parameter \"fields\" is null");
    }
    if (flags == null) {
        throw new IllegalArgumentException("parameter \"flags\" is null");
    }
    if (flags.length != fields.length) {
        throw new IllegalArgumentException("flags.length != fields.length");
    }
    BooleanQuery combined = new BooleanQuery();
    int index = 0;
    for (String field : fields) {
        final int current = index++;
        if (field == null) {
            continue;
        }
        final Query sub = parse(field, query);
        if (sub == null) {
            continue;
        }
        final boolean emptyBoolean = sub instanceof BooleanQuery
                && ((BooleanQuery) sub).getClauses().length == 0;
        if (!emptyBoolean) {
            combined.add(sub, flags[current]);
        }
    }
    return combined;
}
/**
 * Multi-field, multi-condition query analysis: {@code queries[i]} is parsed
 * against {@code fields[i]} and the usable results are combined as SHOULD
 * clauses.
 *
 * @param fields  document field names; {@code null} entries are skipped
 * @param queries keyword texts, one per field
 * @return a BooleanQuery holding one SHOULD clause per usable result
 * @throws IOException propagated from keyword analysis
 * @throws IllegalArgumentException if either array is {@code null} or their
 *         lengths differ
 */
public static Query parseMultiField(String[] fields, String[] queries) throws IOException {
    if (fields == null) {
        throw new IllegalArgumentException("parameter \"fields\" is null");
    }
    if (queries == null) {
        throw new IllegalArgumentException("parameter \"queries\" is null");
    }
    if (queries.length != fields.length) {
        throw new IllegalArgumentException("queries.length != fields.length");
    }
    BooleanQuery combined = new BooleanQuery();
    for (int idx = 0; idx < fields.length; idx++) {
        if (fields[idx] == null) {
            continue;
        }
        final Query sub = parse(fields[idx], queries[idx]);
        if (sub == null) {
            continue;
        }
        final boolean emptyBoolean = sub instanceof BooleanQuery
                && ((BooleanQuery) sub).getClauses().length == 0;
        if (!emptyBoolean) {
            combined.add(sub, Occur.SHOULD);
        }
    }
    return combined;
}
/**
 * Multi-field, multi-condition query analysis with one Occur flag per pair:
 * {@code queries[i]} is parsed against {@code fields[i]} and added with
 * {@code flags[i]}.
 *
 * @param fields  document field names; {@code null} entries are skipped
 * @param queries keyword texts, one per field
 * @param flags   the Occur for the clause at the same index
 * @return a BooleanQuery holding one clause per usable result
 * @throws IOException propagated from keyword analysis
 * @throws IllegalArgumentException if any array is {@code null} or the three
 *         lengths are not all equal
 */
public static Query parseMultiField(String[] fields, String[] queries, BooleanClause.Occur[] flags) throws IOException {
    if (fields == null) {
        throw new IllegalArgumentException("parameter \"fields\" is null");
    }
    if (queries == null) {
        throw new IllegalArgumentException("parameter \"queries\" is null");
    }
    if (flags == null) {
        throw new IllegalArgumentException("parameter \"flags\" is null");
    }
    final boolean sameLength = queries.length == fields.length && queries.length == flags.length;
    if (!sameLength) {
        throw new IllegalArgumentException("queries, fields, and flags array have have different length");
    }
    BooleanQuery combined = new BooleanQuery();
    for (int idx = 0; idx < fields.length; idx++) {
        if (fields[idx] == null) {
            continue;
        }
        final Query sub = parse(fields[idx], queries[idx]);
        if (sub == null) {
            continue;
        }
        final boolean emptyBoolean = sub instanceof BooleanQuery
                && ((BooleanQuery) sub).getClauses().length == 0;
        if (!emptyBoolean) {
            combined.add(sub, flags[idx]);
        }
    }
    return combined;
}
/**
* 词元分支
* 当分词出现歧义时,采用词元分支容纳不同的歧义组合
* @author 林良益
*
*/
private static class TokenBranch{
// Result codes of checkAccept(): the lexeme cannot be taken by this branch.
private static final int REFUSED = -1;
// The lexeme belongs inside this branch (directly or via a child branch).
private static final int ACCEPTED = 0;
// The lexeme does not overlap this branch and is passed on to the next adjacent branch.
private static final int TONEXT = 1;
// Left border of this branch (begin position of its lexeme, when it has one).
private int leftBorder;
// Right border of this branch; widened in accept() as lexemes with a larger end position are taken.
private int rightBorder;
// Main lexeme of this branch; null for a root / non-overlapping container branch.
private Lexeme lexeme;
// Child branches accepted into this branch; created lazily, null until the first child arrives.
private List<TokenBranch> acceptedBranchs;
// The adjacent branch that follows this one; null until a non-overlapping lexeme arrives.
private TokenBranch nextBranch;
/**
 * Creates a branch for the given lexeme, or an empty container branch when
 * the lexeme is {@code null} (borders then stay at their default of 0).
 *
 * @param lexeme the main lexeme of the branch, or {@code null}
 */
TokenBranch(Lexeme lexeme) {
    if (lexeme == null) {
        return;
    }
    this.lexeme = lexeme;
    // The branch initially spans exactly its own lexeme.
    this.leftBorder = lexeme.getBeginPosition();
    this.rightBorder = lexeme.getEndPosition();
}
/** @return the left border of this branch */
@SuppressWarnings("unused")
public int getLeftBorder() {
    return this.leftBorder;
}
/** @return the current right border of this branch */
@SuppressWarnings("unused")
public int getRightBorder() {
    return this.rightBorder;
}
/** @return the main lexeme of this branch, or {@code null} for a container branch */
public Lexeme getLexeme() {
    return this.lexeme;
}
/** @return the child branches accepted so far, or {@code null} when there are none yet */
@SuppressWarnings("unused")
public List<TokenBranch> getAcceptedBranchs() {
    return this.acceptedBranchs;
}
/** @return the adjacent following branch, or {@code null} when none has been created */
@SuppressWarnings("unused")
public TokenBranch getNextBranch() {
    return this.nextBranch;
}
/**
 * Hash derived from the main lexeme.
 *
 * @return 0 for a branch without a lexeme, otherwise the lexeme's hash times 37
 */
@Override
public int hashCode() {
    return (this.lexeme == null) ? 0 : this.lexeme.hashCode() * 37;
}
/**
 * Two distinct branches are equal only when both carry a lexeme and those
 * lexemes are equal; a branch is always equal to itself.
 *
 * @param o the object to compare with
 * @return whether {@code o} is an equal branch
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    // instanceof is false for null, so this also covers the null case.
    if (!(o instanceof TokenBranch)) {
        return false;
    }
    final Lexeme theirs = ((TokenBranch) o).getLexeme();
    return this.lexeme != null && theirs != null && this.lexeme.equals(theirs);
}
/**
 * Offers a lexeme to this branch and places it according to checkAccept():
 * refused, taken into this branch (preferring existing children), or
 * forwarded to the adjacent next branch.
 *
 * @param _lexeme the lexeme to place
 * @return {@code false} only when the lexeme is refused by this branch
 */
boolean accept(Lexeme _lexeme) {
    final int verdict = checkAccept(_lexeme);
    if (verdict == REFUSED) {
        return false;
    }
    if (verdict == TONEXT) {
        // Lazily create the non-overlapping neighbour, then hand the lexeme over.
        if (this.nextBranch == null) {
            this.nextBranch = new TokenBranch(null);
        }
        this.nextBranch.accept(_lexeme);
        return true;
    }
    if (verdict == ACCEPTED) {
        boolean takenByChild = false;
        if (acceptedBranchs == null) {
            acceptedBranchs = new ArrayList<TokenBranch>(2);
        } else {
            // Every child gets the chance to take the lexeme, not just the first taker.
            for (TokenBranch child : acceptedBranchs) {
                if (child.accept(_lexeme)) {
                    takenByChild = true;
                }
            }
        }
        // No child took it: it becomes a new direct child of this branch.
        if (!takenByChild) {
            acceptedBranchs.add(new TokenBranch(_lexeme));
        }
        // Widen this branch so it covers the newly accepted lexeme.
        if (_lexeme.getEndPosition() > this.rightBorder) {
            this.rightBorder = _lexeme.getEndPosition();
        }
    }
    return true;
}
/**
 * Converts this branch, its children and its following branches into
 * queries on the given field.
 *
 * <p>The list contains, in order: a TermQuery for this branch's own lexeme
 * (if any), one query for the children (the only child's query, or a
 * BooleanQuery of SHOULD alternatives when there are several), and then the
 * queries of the next branch.
 *
 * @param fieldName the field the term queries are built on
 * @return the queries for this branch; possibly empty, never {@code null}
 */
List<Query> toQueries(String fieldName) {
    List<Query> result = new ArrayList<Query>(1);
    // Own lexeme.
    if (lexeme != null) {
        result.add(new TermQuery(new Term(fieldName, lexeme.getLexemeText())));
    }
    // Children.
    final int childCount = (acceptedBranchs == null) ? 0 : acceptedBranchs.size();
    if (childCount == 1) {
        Query only = optimizeQueries(acceptedBranchs.get(0).toQueries(fieldName));
        if (only != null) {
            result.add(only);
        }
    } else if (childCount > 1) {
        BooleanQuery alternatives = new BooleanQuery();
        for (TokenBranch child : acceptedBranchs) {
            Query childQuery = optimizeQueries(child.toQueries(fieldName));
            if (childQuery != null) {
                alternatives.add(childQuery, Occur.SHOULD);
            }
        }
        if (alternatives.getClauses().length > 0) {
            result.add(alternatives);
        }
    }
    // Following adjacent branch.
    if (nextBranch != null) {
        result.addAll(nextBranch.toQueries(fieldName));
    }
    return result;
}
/**
 * Decides how this branch relates to the given lexeme.
 *
 * @param _lexeme the lexeme to classify; must not be {@code null}
 * @return REFUSED, ACCEPTED or TONEXT
 * @throws IllegalArgumentException if {@code _lexeme} is {@code null}
 */
private int checkAccept(Lexeme _lexeme) {
    if (_lexeme == null) {
        throw new IllegalArgumentException("parameter:lexeme is null");
    }
    if (this.lexeme == null) {
        // Container (root / non-overlapping) branch: rightBorder > 0 means it
        // already holds something; a lexeme starting at or past that border
        // goes to the next branch.
        final boolean startsBeyond = this.rightBorder > 0
                && _lexeme.getBeginPosition() >= this.rightBorder;
        return startsBeyond ? TONEXT : ACCEPTED;
    }
    // Branch with its own lexeme: anything starting before the lexeme, or
    // inside it, is refused.
    if (_lexeme.getBeginPosition() < this.lexeme.getBeginPosition()
            || _lexeme.getBeginPosition() < this.lexeme.getEndPosition()) {
        return REFUSED;
    }
    // Starts after the own lexeme: accepted while still within the branch
    // span, otherwise forwarded to the next branch.
    if (_lexeme.getBeginPosition() < this.rightBorder) {
        return ACCEPTED;
    }
    return TONEXT;
}
}
/**
* 查询表达式解析
* alpha版本
* 自定义lucene查询表达式
* 表达式例子 :
* (id='1231231' && title:'monkey') || (content:'你好吗' || url='www.ik.com') - name:'helloword'
*
* @author linliangyi
* May 20, 2010
*/
static class ExpressionParser {
//public static final String LUCENE_SPECIAL_CHAR = "&&||-()':=";
// Lexical elements produced by splitElements(), in expression order.
private List<Element> elements = new ArrayList<Element>();
// Operand stack: sub-queries built so far during parseSyntax().
private Stack<Query> querys = new Stack<Query>();
// Operator stack: pending operators and opening parentheses during parseSyntax().
private Stack<Element> operates = new Stack<Element>();
/** Creates a parser with empty element, query and operator collections. */
public ExpressionParser(){
}
/**
 * Parses a query expression into a Lucene Query: tokenises it, evaluates the
 * tokens, and expects exactly one query to remain on the operand stack. The
 * parser's internal collections are cleared afterwards in every case.
 *
 * @param expression the expression text
 * @return the single query left on the operand stack
 * @throws IllegalStateException if the number of remaining queries is not
 *         exactly one, or the expression is otherwise malformed
 */
public Query parserExp(String expression) {
    try {
        // Lexical pass, then syntactic pass.
        this.splitElements(expression);
        this.parseSyntax();
        if (this.querys.size() != 1) {
            throw new IllegalStateException("表达式异常: 缺少逻辑操作符");
        }
        return this.querys.pop();
    } finally {
        elements.clear();
        querys.clear();
        operates.clear();
    }
}
/**
 * Lexical pass: splits the expression into elements and appends them to
 * {@code elements}.
 *
 * <p>Element types: {@code '&'} and {@code '|'} (completed by a second
 * identical character), the single-character tokens {@code - ( ) : =},
 * {@code '\''} for a quoted value (quotes themselves are not stored), and
 * {@code 'F'} for any other run of characters. Inside a quoted value every
 * character except the closing quote is taken literally. A space outside
 * quotes only terminates the element in progress.
 *
 * @param expression the expression text; {@code null} produces no elements
 */
private void splitElements(String expression) {
    if (expression == null) {
        return;
    }
    Element pending = null;
    final int length = expression.length();
    for (int pos = 0; pos < length; pos++) {
        final char ch = expression.charAt(pos);

        // Inside a quoted value: only the closing quote is special.
        if (pending != null && pending.type == '\'') {
            if (ch == '\'') {
                this.elements.add(pending);
                pending = null;
            } else {
                pending.append(ch);
            }
            continue;
        }

        if (ch == '&' || ch == '|') {
            if (pending != null && pending.type == ch) {
                // Second character of a doubled operator completes it.
                pending.append(ch);
                this.elements.add(pending);
                pending = null;
            } else {
                if (pending != null) {
                    this.elements.add(pending);
                }
                pending = new Element();
                pending.type = ch;
                pending.append(ch);
            }
        } else if (ch == '-' || ch == '(' || ch == ')' || ch == ':' || ch == '=') {
            // Single-character token: flush what is pending, emit the token at once.
            if (pending != null) {
                this.elements.add(pending);
            }
            Element single = new Element();
            single.type = ch;
            single.append(ch);
            this.elements.add(single);
            pending = null;
        } else if (ch == ' ') {
            if (pending != null) {
                this.elements.add(pending);
                pending = null;
            }
        } else if (ch == '\'') {
            // Opening quote: flush what is pending and start a quoted value.
            if (pending != null) {
                this.elements.add(pending);
            }
            pending = new Element();
            pending.type = '\'';
        } else {
            if (pending != null && pending.type == 'F') {
                pending.append(ch);
            } else {
                if (pending != null) {
                    this.elements.add(pending);
                }
                pending = new Element();
                pending.type = 'F';
                pending.append(ch);
            }
        }
    }
    // Whatever is still in progress at end of input becomes the last element.
    if (pending != null) {
        this.elements.add(pending);
    }
}
/**
 * Syntactic pass: walks {@code elements} and evaluates them with an operand
 * stack ({@code querys}) and an operator stack ({@code operates}).
 *
 * <p>A field element must be followed by {@code =} or {@code :} and then a
 * quoted value; {@code =} builds a TermQuery on the raw value, {@code :}
 * analyses the value through {@link IKQueryParser#parse(String, String)}.
 * Parentheses and operators are resolved by precedence via
 * {@code compare} and {@code toQuery}.
 *
 * <p>A field element at the very end of the expression (missing operator or
 * value) is reported with the same IllegalStateException as any other
 * malformed field condition instead of an IndexOutOfBoundsException. An
 * IOException from keyword analysis is rethrown as IllegalStateException
 * with the cause attached, rather than being printed and leaving the
 * operand stack one entry short.
 *
 * @throws IllegalStateException if the expression is malformed or keyword
 *         analysis fails
 */
private void parseSyntax() {
    for (int i = 0; i < this.elements.size(); i++) {
        Element e = this.elements.get(i);
        if ('F' == e.type) {
            // field (= | :) 'value'
            if (i + 1 >= this.elements.size()) {
                throw new IllegalStateException("表达式异常: = 或 : 号丢失");
            }
            Element e2 = this.elements.get(i + 1);
            if ('=' != e2.type && ':' != e2.type) {
                throw new IllegalStateException("表达式异常: = 或 : 号丢失");
            }
            if (i + 2 >= this.elements.size()) {
                throw new IllegalStateException("表达式异常:匹配值丢失");
            }
            Element e3 = this.elements.get(i + 2);
            if ('\'' != e3.type) {
                throw new IllegalStateException("表达式异常:匹配值丢失");
            }
            i += 2;
            if ('=' == e2.type) {
                this.querys.push(new TermQuery(new Term(e.toString(), e3.toString())));
            } else {
                try {
                    this.querys.push(IKQueryParser.parse(e.toString(), e3.toString()));
                } catch (IOException ioe) {
                    throw new IllegalStateException("表达式异常: 关键字解析失败", ioe);
                }
            }
        } else if ('(' == e.type) {
            this.operates.push(e);
        } else if (')' == e.type) {
            // Reduce operators back to the matching opening parenthesis.
            boolean doPop = true;
            while (doPop && !this.operates.empty()) {
                Element op = this.operates.pop();
                if ('(' == op.type) {
                    doPop = false;
                } else {
                    Query q = toQuery(op);
                    this.querys.push(q);
                }
            }
        } else {
            // Operator: reduce stacked operators until this one may be pushed.
            boolean pushed = false;
            while (!pushed && !this.operates.isEmpty()) {
                Element eleOnTop = this.operates.peek();
                if ('(' == eleOnTop.type || compare(e, eleOnTop) == 1) {
                    this.operates.push(e);
                    pushed = true;
                } else {
                    // Equal or lower precedence: evaluate the stacked operator first.
                    Query q = toQuery(eleOnTop);
                    this.operates.pop();
                    this.querys.push(q);
                }
            }
            if (!pushed) {
                this.operates.push(e);
            }
        }
    }
    // Evaluate whatever operators remain.
    while (!this.operates.isEmpty()) {
        Element eleOnTop = this.operates.pop();
        Query q = toQuery(eleOnTop);
        this.querys.push(q);
    }
}
/**
 * Pops the two topmost sub-queries and combines them into a BooleanQuery
 * according to the operator.
 *
 * <ul>
 * <li>{@code &} - both operands become MUST; an operand that is already a
 *     BooleanQuery whose first clause is MUST is flattened into the result.</li>
 * <li>{@code |} - same with SHOULD.</li>
 * <li>{@code -} - the left operand is added as MUST (a BooleanQuery is
 *     flattened clause by clause), the right operand as MUST_NOT.</li>
 * </ul>
 *
 * <p>Operands that are {@code null} are skipped, and for {@code &} and
 * {@code |} so are BooleanQuery operands without clauses. This mirrors the
 * filtering done in {@link IKQueryParser#parse(String, String)} and avoids
 * the NullPointerException / ArrayIndexOutOfBoundsException such operands
 * would otherwise trigger. For any other operator type the two operands are
 * consumed and an empty BooleanQuery is returned.
 *
 * @param op the operator element
 * @return the combined query
 * @throws IllegalStateException if fewer than two sub-queries are available
 */
private Query toQuery(Element op) {
    BooleanQuery resultQuery = new BooleanQuery();
    if (this.querys.size() < 2) {
        throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");
    }
    // The right operand is on top of the stack.
    Query q2 = this.querys.pop();
    Query q1 = this.querys.pop();
    if ('&' == op.type) {
        addOperand(resultQuery, q1, Occur.MUST);
        addOperand(resultQuery, q2, Occur.MUST);
    } else if ('|' == op.type) {
        addOperand(resultQuery, q1, Occur.SHOULD);
        addOperand(resultQuery, q2, Occur.SHOULD);
    } else if ('-' == op.type) {
        if (q1 instanceof BooleanQuery) {
            for (BooleanClause c : ((BooleanQuery) q1).getClauses()) {
                resultQuery.add(c);
            }
        } else if (q1 != null) {
            resultQuery.add(q1, Occur.MUST);
        }
        if (q2 != null) {
            resultQuery.add(q2, Occur.MUST_NOT);
        }
    }
    return resultQuery;
}

/**
 * Adds one operand to the target with the given occur, flattening a
 * BooleanQuery whose first clause already uses that occur; null operands
 * and clause-less BooleanQuery operands are ignored.
 */
private void addOperand(BooleanQuery target, Query operand, Occur occur) {
    if (operand == null) {
        return;
    }
    if (operand instanceof BooleanQuery) {
        BooleanClause[] clauses = ((BooleanQuery) operand).getClauses();
        if (clauses.length == 0) {
            return;
        }
        if (clauses[0].getOccur() == occur) {
            for (BooleanClause c : clauses) {
                target.add(c);
            }
            return;
        }
    }
    target.add(operand, occur);
}
/**
 * Compares the precedence of two operator elements.
 *
 * <p>With {@code e1} being {@code &}: 0 against {@code &}, otherwise 1.
 * With {@code e1} being {@code |}: -1 against {@code &}, 0 against
 * {@code |}, otherwise 1. For any other {@code e1}: 0 against {@code -},
 * otherwise -1.
 *
 * @param e1 the incoming operator
 * @param e2 the operator it is compared with
 * @return 1, 0 or -1 as described above
 */
private int compare(Element e1, Element e2) {
    switch (e1.type) {
        case '&':
            return ('&' == e2.type) ? 0 : 1;
        case '|':
            if ('&' == e2.type) {
                return -1;
            }
            return ('|' == e2.type) ? 0 : 1;
        default:
            return ('-' == e2.type) ? 0 : -1;
    }
}
/**
* 表达式元素
*
* @author linliangyi
* May 20, 2010
*/
private class Element{
char type = 0;
StringBuffer eleTextBuff;
public Element(){
eleTextBuff = new StringBuffer();
}
public void append(char c){
this.eleTextBuff.append(c);
}
public String toString(){
return this.eleTextBuff.toString();
}
}
}
}