/**
 * Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved.
 * EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 * http://www.ewcms.com
 */
package com.ewcms.content.document.util.analyzer.lucene;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;

import com.ewcms.content.document.util.analyzer.IKSegmentation;
import com.ewcms.content.document.util.analyzer.Lexeme;

/**
 * IK query parser.
 * <ul>
 * <li>Builds non-conflicting combinations out of ambiguous segmentation results,</li>
 * <li>which improves search hits for ambiguous keywords,</li>
 * <li>and is an optimized implementation targeting IK Analyzer V3.</li>
 * </ul>
 *
 * @author 吴智俊
 */
public final class IKQueryParser {

    /**
     * Per-thread cache of already parsed keywords (cache key -> token tree).
     * NOTE(review): the cache is never evicted, so a long-lived thread that sees many
     * distinct keywords keeps every token tree alive - consider bounding it.
     */
    private static final ThreadLocal<Map<String, TokenBranch>> keywordCacheThreadLocal =
            new ThreadLocal<Map<String, TokenBranch>>();

    /** Whether segmentation uses the "maximum word length" strategy. */
    private static boolean isMaxWordLength = false;

    /**
     * Sets the segmentation strategy.
     *
     * @param isMaxWordLength {@code true} to segment with the maximum word length strategy
     */
    public static void setMaxWordLength(boolean isMaxWordLength) {
        IKQueryParser.isMaxWordLength = isMaxWordLength;
    }

    /**
     * Collapses a list of queries into a single query, avoiding needless nesting.
     *
     * @param queries queries that must all match
     * @return {@code null} for an empty list, the single element for a one-element list,
     *         otherwise a {@link BooleanQuery} containing every element as a MUST clause
     */
    private static Query optimizeQueries(List<Query> queries) {
        if (queries.isEmpty()) {
            return null;
        }
        if (queries.size() == 1) {
            return queries.get(0);
        }
        BooleanQuery mustQueries = new BooleanQuery();
        for (Query q : queries) {
            mustQueries.add(q, Occur.MUST);
        }
        return mustQueries;
    }

    /**
     * Tells whether a query carries nothing worth adding to a parent query.
     *
     * @param query query to inspect, may be {@code null}
     * @return {@code true} when {@code query} is {@code null} or a {@link BooleanQuery}
     *         without any clause
     */
    private static boolean isEmptyQuery(Query query) {
        if (query == null) {
            return true;
        }
        return query instanceof BooleanQuery && ((BooleanQuery) query).getClauses().length == 0;
    }

    /**
     * Returns the parse cache of the current thread, creating it on first use.
     *
     * @return the thread-local keyword cache, never {@code null}
     */
    private static Map<String, TokenBranch> getTheadLocalCache() {
        Map<String, TokenBranch> keywordCache = keywordCacheThreadLocal.get();
        if (keywordCache == null) {
            keywordCache = new HashMap<String, TokenBranch>(4);
            keywordCacheThreadLocal.set(keywordCache);
        }
        return keywordCache;
    }

    /**
     * Parses one continuous keyword (no whitespace) against one field.
     *
     * @param field document field name, must not be {@code null}
     * @param query keyword; {@code null} or blank yields a {@link TermQuery} on the bare field
     * @return the resulting query, or {@code null} when segmentation produced no lexeme
     * @throws IOException if the segmenter fails to read the keyword
     * @throws IllegalArgumentException if {@code field} is {@code null}
     */
    private static Query _parse(String field, String query) throws IOException {
        if (field == null) {
            throw new IllegalArgumentException("parameter \"field\" is null");
        }
        if (query == null || "".equals(query.trim())) {
            return new TermQuery(new Term(field));
        }

        // Read the flag once so the cache key and the segmenter agree on the mode.
        final boolean maxWordLength = isMaxWordLength;
        // The token tree depends on the segmentation mode, so the mode is part of the key;
        // otherwise a tree built before setMaxWordLength(...) would be served afterwards.
        final String cacheKey = (maxWordLength ? "M|" : "S|") + query;

        Map<String, TokenBranch> keywordCache = getTheadLocalCache();
        TokenBranch root = keywordCache.get(cacheKey);
        if (root == null) {
            root = new TokenBranch(null);
            // Segment the keyword and feed every lexeme into the token tree.
            IKSegmentation ikSeg = new IKSegmentation(new StringReader(query.trim()), maxWordLength);
            for (Lexeme lexeme = ikSeg.next(); lexeme != null; lexeme = ikSeg.next()) {
                root.accept(lexeme);
            }
            // Only cache once the tree is complete.
            keywordCache.put(cacheKey, root);
        }
        return optimizeQueries(root.toQueries(field));
    }

    /**
     * Parses an IK simple query expression.
     *
     * @param ikQueryExp the expression text
     * @return the Lucene query described by the expression
     */
    public static Query parse(String ikQueryExp) {
        return new ExpressionParser().parserExp(ikQueryExp);
    }

    /**
     * Parses a single condition against a single field. Whitespace separated parts of
     * {@code query} are parsed individually and combined as SHOULD clauses.
     *
     * @param field -- Document field name
     * @param query -- keyword; {@code null} is treated like a blank keyword
     * @return the resulting query
     * @throws IOException if the segmenter fails to read the keyword
     * @throws IllegalArgumentException if {@code field} is {@code null}
     */
    public static Query parse(String field, String query) throws IOException {
        if (field == null) {
            throw new IllegalArgumentException("parameter \"field\" is null");
        }
        if (query == null) {
            // _parse already defines the result for a missing keyword; avoid the NPE on split().
            return _parse(field, null);
        }
        String[] qParts = query.split("\\s");
        if (qParts.length <= 1) {
            return _parse(field, query);
        }
        BooleanQuery resultQuery = new BooleanQuery();
        for (String q : qParts) {
            // Consecutive whitespace produces empty parts - skip them.
            if ("".equals(q)) {
                continue;
            }
            Query partQuery = _parse(field, q);
            if (!isEmptyQuery(partQuery)) {
                resultQuery.add(partQuery, Occur.SHOULD);
            }
        }
        return resultQuery;
    }

    /**
     * Parses {@code query} against {@code field} and appends the result to {@code target}
     * unless the field is {@code null} or the parsed query is empty.
     */
    private static void addFieldQuery(BooleanQuery target, String field, String query, Occur occur)
            throws IOException {
        if (field == null) {
            return;
        }
        Query partQuery = parse(field, query);
        if (!isEmptyQuery(partQuery)) {
            target.add(partQuery, occur);
        }
    }

    /**
     * Parses a single condition against several fields (combined as SHOULD clauses).
     *
     * @param fields -- Document fields name; {@code null} entries are skipped
     * @param query -- keyword
     * @return the resulting query
     * @throws IOException if the segmenter fails to read the keyword
     * @throws IllegalArgumentException if {@code fields} is {@code null}
     */
    public static Query parseMultiField(String[] fields, String query) throws IOException {
        if (fields == null) {
            throw new IllegalArgumentException("parameter \"fields\" is null");
        }
        BooleanQuery resultQuery = new BooleanQuery();
        for (String field : fields) {
            addFieldQuery(resultQuery, field, query, Occur.SHOULD);
        }
        return resultQuery;
    }

    /**
     * Parses a single condition against several fields, each with its own occur flag.
     *
     * @param fields -- Document fields name; {@code null} entries are skipped
     * @param query -- keyword
     * @param flags -- BooleanClause occur flag per field
     * @return the resulting query
     * @throws IOException if the segmenter fails to read the keyword
     * @throws IllegalArgumentException if an array is {@code null} or the lengths differ
     */
    public static Query parseMultiField(String[] fields, String query, BooleanClause.Occur[] flags)
            throws IOException {
        if (fields == null) {
            throw new IllegalArgumentException("parameter \"fields\" is null");
        }
        if (flags == null) {
            throw new IllegalArgumentException("parameter \"flags\" is null");
        }
        if (flags.length != fields.length) {
            throw new IllegalArgumentException("flags.length != fields.length");
        }
        BooleanQuery resultQuery = new BooleanQuery();
        for (int i = 0; i < fields.length; i++) {
            addFieldQuery(resultQuery, fields[i], query, flags[i]);
        }
        return resultQuery;
    }

    /**
     * Parses several conditions against several fields (pairwise, combined as SHOULD clauses).
     *
     * @param fields document field names; {@code null} entries are skipped
     * @param queries keyword per field
     * @return the resulting query
     * @throws IOException if the segmenter fails to read a keyword
     * @throws IllegalArgumentException if an array is {@code null} or the lengths differ
     */
    public static Query parseMultiField(String[] fields, String[] queries) throws IOException {
        if (fields == null) {
            throw new IllegalArgumentException("parameter \"fields\" is null");
        }
        if (queries == null) {
            throw new IllegalArgumentException("parameter \"queries\" is null");
        }
        if (queries.length != fields.length) {
            throw new IllegalArgumentException("queries.length != fields.length");
        }
        BooleanQuery resultQuery = new BooleanQuery();
        for (int i = 0; i < fields.length; i++) {
            addFieldQuery(resultQuery, fields[i], queries[i], Occur.SHOULD);
        }
        return resultQuery;
    }

    /**
     * Parses several conditions against several fields, each pair with its own occur flag.
     *
     * @param fields document field names; {@code null} entries are skipped
     * @param queries keyword per field
     * @param flags occur flag per field
     * @return the resulting query
     * @throws IOException if the segmenter fails to read a keyword
     * @throws IllegalArgumentException if an array is {@code null} or the lengths differ
     */
    public static Query parseMultiField(String[] fields, String[] queries, BooleanClause.Occur[] flags)
            throws IOException {
        if (fields == null) {
            throw new IllegalArgumentException("parameter \"fields\" is null");
        }
        if (queries == null) {
            throw new IllegalArgumentException("parameter \"queries\" is null");
        }
        if (flags == null) {
            throw new IllegalArgumentException("parameter \"flags\" is null");
        }
        if (!(queries.length == fields.length && queries.length == flags.length)) {
            throw new IllegalArgumentException("queries, fields, and flags array have have different length");
        }
        BooleanQuery resultQuery = new BooleanQuery();
        for (int i = 0; i < fields.length; i++) {
            addFieldQuery(resultQuery, fields[i], queries[i], flags[i]);
        }
        return resultQuery;
    }

    /**
     * Token branch.
     * When segmentation is ambiguous, token branches hold the different ambiguous combinations.
     *
     * @author 林良益
     */
    private static class TokenBranch {

        private static final int REFUSED = -1;
        private static final int ACCEPTED = 0;
        private static final int TONEXT = 1;

        /** Left border of this branch. */
        private int leftBorder;
        /** Right border of this branch (grows as lexemes are accepted). */
        private int rightBorder;
        /** Main lexeme of this branch; {@code null} for a non-overlapping (root) branch. */
        private Lexeme lexeme;
        /** Child branches merged into this branch. */
        private List<TokenBranch> acceptedBranchs;
        /** The adjacent branch following this one. */
        private TokenBranch nextBranch;

        TokenBranch(Lexeme lexeme) {
            if (lexeme != null) {
                this.lexeme = lexeme;
                // Initialise the borders from the lexeme.
                this.leftBorder = lexeme.getBeginPosition();
                this.rightBorder = lexeme.getEndPosition();
            }
        }

        @SuppressWarnings("unused")
        public int getLeftBorder() {
            return leftBorder;
        }

        @SuppressWarnings("unused")
        public int getRightBorder() {
            return rightBorder;
        }

        public Lexeme getLexeme() {
            return lexeme;
        }

        @SuppressWarnings("unused")
        public List<TokenBranch> getAcceptedBranchs() {
            return acceptedBranchs;
        }

        @SuppressWarnings("unused")
        public TokenBranch getNextBranch() {
            return nextBranch;
        }

        @Override
        public int hashCode() {
            return this.lexeme == null ? 0 : this.lexeme.hashCode() * 37;
        }

        /**
         * Two distinct branches are equal only when both have a lexeme and the lexemes are equal.
         */
        @Override
        public boolean equals(Object o) {
            if (o == null) {
                return false;
            }
            if (this == o) {
                return true;
            }
            if (!(o instanceof TokenBranch)) {
                return false;
            }
            TokenBranch other = (TokenBranch) o;
            if (this.lexeme == null || other.getLexeme() == null) {
                return false;
            }
            return this.lexeme.equals(other.getLexeme());
        }

        /**
         * Merges a lexeme into this branch.
         *
         * @param _lexeme the lexeme to merge
         * @return {@code false} when this branch refuses the lexeme, {@code true} when it was
         *         accepted here or handed over to the adjacent branch
         */
        boolean accept(Lexeme _lexeme) {
            /*
             * REFUSED  - cannot be accepted
             * ACCEPTED - accepted by this branch (or one of its children)
             * TONEXT   - handled by the adjacent branch
             */
            switch (checkAccept(_lexeme)) {
            case REFUSED:
                return false;

            case ACCEPTED:
                if (acceptedBranchs == null) {
                    // No child yet: the lexeme becomes the first child.
                    acceptedBranchs = new ArrayList<TokenBranch>(2);
                    acceptedBranchs.add(new TokenBranch(_lexeme));
                } else {
                    // Offer the lexeme to every child first (no short-circuit on purpose).
                    boolean acceptedByChild = false;
                    for (TokenBranch childBranch : acceptedBranchs) {
                        acceptedByChild = childBranch.accept(_lexeme) || acceptedByChild;
                    }
                    // No child took it: it becomes a new child of this branch.
                    if (!acceptedByChild) {
                        acceptedBranchs.add(new TokenBranch(_lexeme));
                    }
                }
                // Track the furthest right border.
                if (_lexeme.getEndPosition() > this.rightBorder) {
                    this.rightBorder = _lexeme.getEndPosition();
                }
                break;

            case TONEXT:
                // Hand over to the adjacent branch, creating a non-overlapping one on demand.
                if (this.nextBranch == null) {
                    this.nextBranch = new TokenBranch(null);
                }
                this.nextBranch.accept(_lexeme);
                break;
            }
            return true;
        }

        /**
         * Converts this branch (own lexeme, children, adjacent branch) into queries.
         *
         * @param fieldName field the term queries are built for
         * @return the queries of this branch, possibly empty
         */
        List<Query> toQueries(String fieldName) {
            List<Query> queries = new ArrayList<Query>(1);

            // Query for the lexeme of this branch.
            if (lexeme != null) {
                queries.add(new TermQuery(new Term(fieldName, lexeme.getLexemeText())));
            }

            // Queries for the child branches: one child is used directly, several are OR-ed.
            if (acceptedBranchs != null && acceptedBranchs.size() > 0) {
                if (acceptedBranchs.size() == 1) {
                    Query onlyOneQuery = optimizeQueries(acceptedBranchs.get(0).toQueries(fieldName));
                    if (onlyOneQuery != null) {
                        queries.add(onlyOneQuery);
                    }
                } else {
                    BooleanQuery orQuery = new BooleanQuery();
                    for (TokenBranch childBranch : acceptedBranchs) {
                        Query childQuery = optimizeQueries(childBranch.toQueries(fieldName));
                        if (childQuery != null) {
                            orQuery.add(childQuery, Occur.SHOULD);
                        }
                    }
                    if (orQuery.getClauses().length > 0) {
                        queries.add(orQuery);
                    }
                }
            }

            // Queries for the adjacent branch.
            if (nextBranch != null) {
                queries.addAll(nextBranch.toQueries(fieldName));
            }
            return queries;
        }

        /**
         * Decides how this branch can take the given lexeme.
         *
         * @param _lexeme the lexeme to check
         * @return one of {@link #REFUSED}, {@link #ACCEPTED}, {@link #TONEXT}
         * @throws IllegalArgumentException if {@code _lexeme} is {@code null}
         */
        private int checkAccept(Lexeme _lexeme) {
            if (_lexeme == null) {
                throw new IllegalArgumentException("parameter:lexeme is null");
            }
            final int begin = _lexeme.getBeginPosition();

            if (this.lexeme == null) {
                // Non-overlapping (root) branch: rightBorder > 0 means it already holds a lexeme;
                // a lexeme starting at or after the border does not intersect this branch.
                if (this.rightBorder > 0 && begin >= this.rightBorder) {
                    return TONEXT;
                }
                return ACCEPTED;
            }

            // Overlapping branch.
            if (begin < this.lexeme.getBeginPosition() || begin < this.lexeme.getEndPosition()) {
                // Starts before this lexeme (should not happen) or intersects it.
                return REFUSED;
            }
            if (begin < this.rightBorder) {
                // Does not intersect the lexeme itself but does intersect the branch.
                return ACCEPTED;
            }
            // Does not intersect the branch at all.
            return TONEXT;
        }
    }

    /**
     * Query expression parser (alpha version) for a custom Lucene query expression.
     * Example expression:
     * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword'
     *
     * @author linliangyi
     * May 20, 2010
     */
    static class ExpressionParser {

        //public static final String LUCENE_SPECIAL_CHAR = "&&||-()':=";

        private List<Element> elements = new ArrayList<Element>();

        private Stack<Query> querys = new Stack<Query>();

        private Stack<Element> operates = new Stack<Element>();

        public ExpressionParser() {
        }

        /**
         * Parses a query expression into a Lucene query.
         *
         * @param expression the expression text
         * @return the parsed query
         * @throws IllegalStateException if the expression is malformed
         */
        public Query parserExp(String expression) {
            try {
                // Lexical analysis.
                this.splitElements(expression);
                // Syntax analysis.
                this.parseSyntax();
                if (this.querys.size() != 1) {
                    throw new IllegalStateException("表达式异常: 缺少逻辑操作符");
                }
                return this.querys.pop();
            } finally {
                // The parser instance keeps no state between calls.
                elements.clear();
                querys.clear();
                operates.clear();
            }
        }

        /**
         * Lexical analysis: splits the expression into {@link Element}s appended to
         * {@link #elements}. Element types are the operator character itself
         * ({@code & | - ( ) : =}), {@code '\''} for a quoted value and {@code 'F'} for a field name.
         *
         * @param expression the expression text, {@code null} produces no element
         */
        private void splitElements(String expression) {
            if (expression == null) {
                return;
            }
            Element current = null;
            for (char c : expression.toCharArray()) {
                // Inside a quoted value every character is literal until the closing quote.
                if (current != null && current.type == '\'') {
                    if (c == '\'') {
                        this.elements.add(current);
                        current = null;
                    } else {
                        current.append(c);
                    }
                    continue;
                }

                switch (c) {
                case '&':
                case '|':
                    // Two-character operators: the second identical character completes the element.
                    if (current != null && current.type == c) {
                        current.append(c);
                        this.elements.add(current);
                        current = null;
                    } else {
                        if (current != null) {
                            this.elements.add(current);
                        }
                        current = new Element(c);
                        current.append(c);
                    }
                    break;

                case '-':
                case '(':
                case ')':
                case ':':
                case '=':
                    // Single-character operators terminate the pending element and stand alone.
                    if (current != null) {
                        this.elements.add(current);
                    }
                    current = new Element(c);
                    current.append(c);
                    this.elements.add(current);
                    current = null;
                    break;

                case ' ':
                    if (current != null) {
                        this.elements.add(current);
                        current = null;
                    }
                    break;

                case '\'':
                    // Opening quote: the quote character itself is not part of the value.
                    if (current != null) {
                        this.elements.add(current);
                    }
                    current = new Element('\'');
                    break;

                default:
                    // Any other character belongs to a field name.
                    if (current == null || current.type != 'F') {
                        if (current != null) {
                            this.elements.add(current);
                        }
                        current = new Element('F');
                    }
                    current.append(c);
                }
            }

            if (current != null) {
                this.elements.add(current);
            }
        }

        /**
         * Syntax analysis: walks {@link #elements}, pushing operand queries onto
         * {@link #querys} and resolving operators through {@link #operates}.
         *
         * @throws IllegalStateException if the expression is malformed or a keyword cannot be analyzed
         */
        private void parseSyntax() {
            final int size = this.elements.size();
            for (int i = 0; i < size; i++) {
                Element e = this.elements.get(i);
                if ('F' == e.type) {
                    // A field must be followed by '=' or ':' and then a quoted value.
                    Element e2 = (i + 1 < size) ? this.elements.get(i + 1) : null;
                    if (e2 == null || ('=' != e2.type && ':' != e2.type)) {
                        throw new IllegalStateException("表达式异常: = 或 : 号丢失");
                    }
                    Element e3 = (i + 2 < size) ? this.elements.get(i + 2) : null;
                    if (e3 == null || '\'' != e3.type) {
                        throw new IllegalStateException("表达式异常:匹配值丢失");
                    }
                    i += 2;
                    if ('=' == e2.type) {
                        // '=' is an exact term match.
                        this.querys.push(new TermQuery(new Term(e.toString(), e3.toString())));
                    } else {
                        // ':' runs the value through the IK segmenter.
                        try {
                            this.querys.push(IKQueryParser.parse(e.toString(), e3.toString()));
                        } catch (IOException ioe) {
                            // Swallowing this would leave the operand stack one entry short and
                            // surface later as an unrelated error - fail here with the real cause.
                            throw new IllegalStateException(
                                    "expression error: failed to analyze value of field \"" + e + "\"", ioe);
                        }
                    }

                } else if ('(' == e.type) {
                    this.operates.push(e);

                } else if (')' == e.type) {
                    // Resolve operators back to the matching '('.
                    while (!this.operates.empty()) {
                        Element op = this.operates.pop();
                        if ('(' == op.type) {
                            break;
                        }
                        this.querys.push(toQuery(op));
                    }

                } else {
                    // Operator: resolve stacked operators of equal or higher priority first.
                    boolean pushed = false;
                    while (!pushed && !this.operates.isEmpty()) {
                        Element eleOnTop = this.operates.peek();
                        if ('(' == eleOnTop.type || compare(e, eleOnTop) == 1) {
                            this.operates.push(e);
                            pushed = true;
                        } else {
                            Query q = toQuery(eleOnTop);
                            this.operates.pop();
                            this.querys.push(q);
                        }
                    }
                    if (!pushed) {
                        this.operates.push(e);
                    }
                }
            }

            // Resolve whatever operators remain.
            while (!this.operates.isEmpty()) {
                Element eleOnTop = this.operates.pop();
                this.querys.push(toQuery(eleOnTop));
            }
        }

        /**
         * Adds one operand of an '&' / '|' operation to {@code target}. A {@link BooleanQuery}
         * operand whose first clause already uses {@code occur} is flattened into
         * {@code target}; any other operand is added as a single clause. Empty operands
         * ({@code null} or clause-less) are skipped.
         */
        private static void addOperand(BooleanQuery target, Query operand, Occur occur) {
            if (isEmptyQuery(operand)) {
                return;
            }
            if (operand instanceof BooleanQuery) {
                BooleanClause[] clauses = ((BooleanQuery) operand).getClauses();
                if (clauses[0].getOccur() == occur) {
                    for (BooleanClause c : clauses) {
                        target.add(c);
                    }
                    return;
                }
            }
            target.add(operand, occur);
        }

        /**
         * Builds a {@link BooleanQuery} from the two topmost operand queries and a logical operator.
         *
         * @param op the operator element ('&', '|' or '-'); any other type yields an empty query
         * @return the combined query
         * @throws IllegalStateException if fewer than two operand queries are available
         */
        private Query toQuery(Element op) {
            BooleanQuery resultQuery = new BooleanQuery();

            if (this.querys.size() < 2) {
                throw new IllegalStateException("表达式异常:SubQuery 个数不匹配");
            }

            Query q2 = this.querys.pop();
            Query q1 = this.querys.pop();
            if ('&' == op.type) {
                addOperand(resultQuery, q1, Occur.MUST);
                addOperand(resultQuery, q2, Occur.MUST);
            } else if ('|' == op.type) {
                addOperand(resultQuery, q1, Occur.SHOULD);
                addOperand(resultQuery, q2, Occur.SHOULD);
            } else if ('-' == op.type) {
                // Left side: a boolean query is always flattened, anything else must match.
                if (q1 instanceof BooleanQuery) {
                    for (BooleanClause c : ((BooleanQuery) q1).getClauses()) {
                        resultQuery.add(c);
                    }
                } else if (q1 != null) {
                    resultQuery.add(q1, Occur.MUST);
                }
                // Right side is excluded; an empty right side excludes nothing.
                if (!isEmptyQuery(q2)) {
                    resultQuery.add(q2, Occur.MUST_NOT);
                }
            }
            return resultQuery;
        }

        /**
         * Compares operator priority.
         *
         * @param e1 the incoming operator
         * @param e2 the operator on top of the stack
         * @return 1 if {@code e1} has higher priority, 0 if equal, -1 if lower
         */
        private int compare(Element e1, Element e2) {
            if ('&' == e1.type) {
                return ('&' == e2.type) ? 0 : 1;
            }
            if ('|' == e1.type) {
                if ('&' == e2.type) {
                    return -1;
                }
                return ('|' == e2.type) ? 0 : 1;
            }
            return ('-' == e2.type) ? 0 : -1;
        }

        /**
         * Expression element: a type tag plus the collected text.
         *
         * @author linliangyi
         * May 20, 2010
         */
        private static class Element {
            char type = 0;
            StringBuilder eleTextBuff;

            public Element() {
                eleTextBuff = new StringBuilder();
            }

            Element(char type) {
                this();
                this.type = type;
            }

            public void append(char c) {
                this.eleTextBuff.append(c);
            }

            @Override
            public String toString() {
                return this.eleTextBuff.toString();
            }
        }
    }
}