/*
 * Copyright 2010 Bizosys Technologies Limited
 *
 * Licensed to the Bizosys Technologies Limited (Bizosys) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The Bizosys licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bizosys.hsearch.outpipe;

import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.bizosys.hsearch.inpipe.util.StopwordManager;
import com.bizosys.hsearch.query.HQuery;
import com.bizosys.hsearch.query.QueryContext;
import com.bizosys.hsearch.query.QueryPlanner;
import com.bizosys.hsearch.query.QueryTerm;
import com.bizosys.hsearch.query.ReserveQueryWord;
import com.bizosys.hsearch.util.LuceneConstants;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.conf.Configuration;
import com.bizosys.oneline.pipes.PipeOut;
import com.bizosys.oneline.util.StringUtils;

/**
 * Output pipe that turns the raw query string of an {@link HQuery} into
 * entries of its {@link QueryPlanner} and {@link QueryContext}.
 * <p>
 * The query is lower-cased, double-quoted spans are kept together as
 * phrases, the remaining text is split on spaces and every plain word is
 * further broken up with Lucene's {@link StandardTokenizer}. The words
 * <code>and</code>, <code>or</code>, <code>not</code> and the prefixes
 * <code>+</code>, <code>-</code>, <code>!</code> steer whether a term is
 * registered as must / optional and whether it is negated. Words whose
 * type maps to a reserve word are handed to the query context instead of
 * the planner.
 *
 * @author karan
 */
public class HQueryParser implements PipeOut {

    public HQueryParser() {
    }

    /**
     * Parses the query string carried by the given {@link HQuery}.
     *
     * @param objQuery    the query; cast to {@link HQuery}
     * @param multiWriter not used by this pipe
     * @throws ApplicationFault if the context or its query string is null,
     *                          or if tokenizing fails
     */
    public void visit(Object objQuery, boolean multiWriter)
            throws ApplicationFault, SystemFault {

        HQuery query = (HQuery) objQuery;
        QueryContext ctx = query.ctx;
        QueryPlanner planner = query.planner;

        if (null == ctx || null == ctx.queryString) {
            throw new ApplicationFault("Blank Query ");
        }

        if (OutpipeLog.l.isDebugEnabled()) {
            OutpipeLog.l.debug("Query String = " + ctx.queryString);
        }
        parse(ctx.queryString, planner, ctx);
    }

    /** No-op: this pipe has nothing to commit. */
    public void commit(boolean multiWriter) throws ApplicationFault, SystemFault {
    }

    /** Returns this same object; the pipe keeps no instance fields. */
    public PipeOut getInstance() {
        return this;
    }

    /** No-op: this pipe reads nothing from the configuration. */
    public void init(Configuration conf) throws ApplicationFault, SystemFault {
    }

    public String getName() {
        return "HQueryParser";
    }

    /**
     * Walks the words of the query and registers them on the planner or
     * the query context.
     *
     * @param text    raw query string (lower-cased here before processing)
     * @param planner receives phrases, must terms and optional terms
     * @param hq      receives reserve-word values; its <code>totalTerms</code>
     *                is incremented once per planner term
     */
    private static void parse(String text, QueryPlanner planner, QueryContext hq)
            throws ApplicationFault, SystemFault {

        text = text.toLowerCase();
        List<Section> splits = quotedText(text, '"');
        List<String> words = tokenize(text, splits);

        // Terms produced by the most recent word; "and" / "or" re-classify them.
        List<QueryTerm> lastTerms = new ArrayList<QueryTerm>(3);
        boolean optionalMode = true;
        boolean isNot = false;
        Set<String> stopwords = StopwordManager.getInstance().getStopwords();

        for (String rawWord : words) {
            String word = rawWord.trim();
            if (word.length() == 0) continue;

            if ("and".equals(word)) {
                // Promote the previous word's terms from optional to must.
                if (0 != lastTerms.size() && null != planner.optionalTerms) {
                    for (QueryTerm lastTerm : lastTerms) {
                        if (planner.optionalTerms.contains(lastTerm)) {
                            planner.optionalTerms.remove(lastTerm);
                            planner.addMustTerm(lastTerm);
                            OutpipeLog.l.trace(lastTerm);
                        }
                    }
                }
                optionalMode = false;
                continue;
            }

            if ("or".equals(word)) {
                // Demote the previous word's terms from must to optional.
                if (0 != lastTerms.size() && null != planner.mustTerms) {
                    for (QueryTerm lastTerm : lastTerms) {
                        if (planner.mustTerms.contains(lastTerm)) {
                            planner.mustTerms.remove(lastTerm);
                            planner.addOptionalTerm(lastTerm);
                        }
                    }
                }
                optionalMode = true;
                continue;
            }

            if ("not".equals(word)) {
                isNot = true;
                continue;
            }

            // Don't proceed if this is a stopword.
            if (stopwords.contains(word)) continue;

            switch (word.charAt(0)) {
                case '+':
                    optionalMode = false;
                    word = word.substring(1);
                    break;
                case '-':
                    optionalMode = true;
                    word = word.substring(1);
                    break;
                case '!':
                    isNot = true;
                    word = word.substring(1);
                    break;
                default:
                    break;
            }

            // A bare "+", "-" or "!" leaves nothing to search for; skip it and
            // let the modifier apply to the next word instead of registering
            // an empty phrase.
            if (word.length() == 0) continue;

            lastTerms.clear();

            // Make the base term
            QueryTerm reserveTerm = new QueryTerm(word, isNot);
            int reserveWord = ReserveQueryWord.getInstance()
                    .mapReserveWord(reserveTerm.termType);

            if (ReserveQueryWord.NO_RESERVE_WORD != reserveWord) {
                hq.populate(reserveWord, reserveTerm.wordOrig);
                lastTerms.add(reserveTerm);
            } else {
                planner.addPhrase(reserveTerm);
                for (String aWord : standardTokenizer(reserveTerm.wordOrig)) {
                    QueryTerm term = new QueryTerm(aWord, isNot);
                    term.setTermType(reserveTerm.termType);
                    if (optionalMode) {
                        planner.addOptionalTerm(term);
                    } else {
                        planner.addMustTerm(term);
                    }
                    hq.totalTerms++;
                    lastTerms.add(term);
                }
            }

            // Refresh the settings for the next word
            optionalMode = true;
            isNot = false;
        }

        // A query consisting of one single optional term is treated as a must term.
        if (null != planner.optionalTerms && null == planner.mustTerms
                && planner.optionalTerms.size() == 1) {
            planner.addMustTerm(planner.optionalTerms.get(0));
            planner.optionalTerms.clear();
            planner.optionalTerms = null;
        }

        if (OutpipeLog.l.isDebugEnabled()) {
            OutpipeLog.l.debug("Planner: " + planner.toString());
        }
    }

    /**
     * Splits a word into the tokens produced by Lucene's
     * {@link StandardTokenizer}.
     *
     * @param word text to tokenize
     * @return the tokens in stream order; never null
     * @throws ApplicationFault wrapping any exception raised while tokenizing
     */
    private static List<String> standardTokenizer(String word) throws ApplicationFault {
        Reader reader = new StringReader(word);
        StandardTokenizer fil = new StandardTokenizer(LuceneConstants.version, reader);
        List<String> lstWord = new ArrayList<String>(3);
        try {
            CharTermAttribute termA =
                    (CharTermAttribute) fil.getAttribute(CharTermAttribute.class);
            fil.reset();
            while (fil.incrementToken()) {
                lstWord.add(termA.toString());
            }
            // Completes the stream as the TokenStream consumer contract expects.
            fil.end();
        } catch (Exception ex) {
            throw new ApplicationFault(ex);
        } finally {
            // Release tokenizer and reader on failure paths as well.
            try {
                fil.close();
                reader.close();
            } catch (Exception closeEx) {
                // Best effort: the source is an in-memory StringReader.
                OutpipeLog.l.debug("Tokenizer close failed: " + closeEx);
            }
        }
        return lstWord;
    }

    /**
     * Breaks the query text into words, keeping every quoted section as a
     * single entry. A quoted section that is glued to neighbouring text
     * (for example <code>title:"abc def"</code>) is merged with that
     * neighbour.
     *
     * @param text   the query text
     * @param splits quoted sections of <code>text</code>, in order, as
     *               produced by {@link #quotedText(String, char)}
     * @return the words in query order; never null, empty when the text is empty
     */
    private static List<String> tokenize(String text, List<Section> splits) {
        List<String> words = new ArrayList<String>(splits.size() * 2 + 1);
        int textLastIndex = text.length() - 1;
        int lastIndex = 0;
        // Quoted text still waiting to be glued to the word that follows it.
        String phrase = null;

        for (Section is : splits) {
            int openQuote = is.start - 1;

            // Not a phrase start, take the unquoted section first
            if (lastIndex != openQuote) {
                List<String> splittedWords = spaceTokenizer(
                        text.substring(lastIndex, openQuote).trim(), ' ');
                if (null != phrase) {
                    appendPhrase(phrase, splittedWords);
                    phrase = null;
                }
                words.addAll(splittedWords);
            }

            // Extract the phrase
            phrase = text.substring(is.start, is.end);
            boolean isIsolatedStart = (openQuote == 0)
                    || (is.start > 1 && text.charAt(is.start - 2) == ' ');
            boolean isIsolatedEnd = (is.end == textLastIndex)
                    || (text.charAt(is.end + 1) == ' ');

            if (isIsolatedStart && isIsolatedEnd) {
                words.add(phrase);
                phrase = null;
            } else if (isIsolatedEnd) {
                // Glued to the preceding word: merge with it.
                if (words.isEmpty()) {
                    words.add(phrase);
                } else {
                    int lst = words.size() - 1;
                    words.set(lst, words.get(lst) + phrase);
                }
                phrase = null;
            }
            lastIndex = is.end + 1;
        }

        // Remaining last section
        if (lastIndex <= textLastIndex) {
            List<String> splittedWords = StringUtils.fastSplit(
                    text.substring(lastIndex), ' ');
            appendPhrase(phrase, splittedWords);
            words.addAll(splittedWords);
        } else if (null != phrase) {
            words.add(phrase);
        }
        return words;
    }

    /**
     * Prefixes the first entry of <code>splittedWords</code> with
     * <code>phrase</code>, or adds the phrase as the only entry when the
     * list is empty. Does nothing when <code>phrase</code> is null.
     */
    private static void appendPhrase(String phrase, List<String> splittedWords) {
        if (null == phrase) return;
        if (splittedWords.isEmpty()) {
            splittedWords.add(phrase);
        } else {
            splittedWords.set(0, phrase + splittedWords.get(0));
        }
    }

    /**
     * Locates the sections of <code>text</code> enclosed between pairs of
     * <code>separator</code> characters.
     *
     * @param text      text to scan
     * @param separator the quote character
     * @return one {@link Section} per closed pair; <code>start</code> is the
     *         index just after the opening separator and <code>end</code>
     *         is the index of the closing separator. An unmatched opening
     *         separator is ignored.
     */
    public static List<Section> quotedText(final String text, char separator) {
        final List<Section> result = new ArrayList<Section>();
        final int lastIndex = text.length() - 1;

        int open = text.indexOf(separator);
        // An opening separator at the very last position cannot be closed.
        while (open >= 0 && open < lastIndex) {
            int close = text.indexOf(separator, open + 1);
            if (-1 == close) break;
            result.add(new Section(open + 1, close));
            open = text.indexOf(separator, close + 1);
        }
        return result;
    }

    /**
     * Splits <code>text</code> on every occurrence of <code>separator</code>.
     * Consecutive separators yield empty tokens; a trailing separator does
     * not yield a trailing empty token. Text without any separator is
     * returned as a single token.
     *
     * @param text      text to split
     * @param separator the delimiter character
     * @return the tokens in order; never null
     */
    public static List<String> spaceTokenizer(final String text, char separator) {
        final List<String> result = new ArrayList<String>();
        int index1 = 0;
        int index2 = text.indexOf(separator);

        if (index2 == -1) {
            result.add(text);
            return result;
        }

        while (index2 >= 0) {
            result.add(text.substring(index1, index2));
            index1 = index2 + 1;
            index2 = text.indexOf(separator, index1);
        }

        // Keep the remainder even when it is a single character.
        if (index1 < text.length()) {
            result.add(text.substring(index1));
        }
        return result;
    }

    /**
     * Position of a quoted span inside the query text: <code>start</code>
     * is the index of its first character and <code>end</code> is the
     * index of the closing quote.
     */
    public static class Section {
        int start;
        int end;

        public Section(int start, int end) {
            this.start = start;
            this.end = end;
        }

        @Override
        public String toString() {
            return this.start + "-" + this.end;
        }
    }
}