HQueryParser.java example

Explorer
hsearch-obsolete-master
- src
/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.outpipe;

import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.bizosys.hsearch.inpipe.util.StopwordManager;
import com.bizosys.hsearch.query.HQuery;
import com.bizosys.hsearch.query.QueryContext;
import com.bizosys.hsearch.query.QueryPlanner;
import com.bizosys.hsearch.query.QueryTerm;
import com.bizosys.hsearch.query.ReserveQueryWord;
import com.bizosys.hsearch.util.LuceneConstants;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.conf.Configuration;
import com.bizosys.oneline.pipes.PipeOut;
import com.bizosys.oneline.util.StringUtils;

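/**
 * Output pipe that parses the query string of a {@link HQuery}.
 * The query is lower-cased and split into words by this class itself
 * (double-quoted phrases first, then on spaces); Lucene's
 * StandardTokenizer is applied only to words that are not reserve words.
 * While walking the words, the and / or / not keywords, the + - ! prefixes
 * and the reserve words are used to build the query execution plan.
 * @author karan
 *
 */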
public class HQueryParser implements PipeOut{
	
	/**
	 * Creates a parser. There is nothing to set up here.
	 */
	public HQueryParser() {
		super();
	}

	/**
	 * Parses the query string carried by the given query object and fills
	 * its planner and context.
	 *
	 * @param objQuery the query to process; it is cast to {@link HQuery}
	 * @param multiWriter not used by this implementation
	 * @throws ApplicationFault if the query has no context or no query string,
	 *         or if parsing fails
	 * @throws SystemFault declared by the parsing step
	 */
	public void visit(Object objQuery, boolean multiWriter) throws ApplicationFault, SystemFault {

		final HQuery hQuery = (HQuery) objQuery;
		final QueryContext context = hQuery.ctx;
		final QueryPlanner queryPlanner = hQuery.planner;

		final boolean nothingToParse = (null == context) || (null == context.queryString);
		if ( nothingToParse ) {
			throw new ApplicationFault("Blank Query ");
		}

		if ( OutpipeLog.l.isDebugEnabled() ) {
			OutpipeLog.l.debug("Query String = " + context.queryString);
		}

		parse(context.queryString, queryPlanner, context);
	}
	
	/**
	 * Does nothing in this implementation.
	 *
	 * @param multiWriter not used
	 */
	public void commit(boolean multiWriter) throws ApplicationFault, SystemFault {
		// Intentionally empty.
	}

	/**
	 * Hands back this very object; no new parser is created.
	 *
	 * @return <code>this</code>
	 */
	public PipeOut getInstance() {
		final PipeOut self = this;
		return self;
	}

	/**
	 * Does nothing in this implementation; the configuration is not read.
	 *
	 * @param conf not used
	 */
	public void init(Configuration conf) throws ApplicationFault, SystemFault {
		// Intentionally empty.
	}
	
	/**
	 * Reports the name of this pipe.
	 *
	 * @return the constant <code>"HQueryParser"</code>
	 */
	public String getName() {
		final String pipeName = "HQueryParser";
		return pipeName;
	}
	
	
	/**
	 * Turns the raw query text into planner terms and query-context settings.
	 * <p>
	 * The text is lower-cased, split into words (double-quoted phrases are kept
	 * together) and walked left to right:
	 * <ul>
	 * <li><code>and</code> - moves the previous word's terms from the optional
	 *     list to the must list; the next word is added as a must term.</li>
	 * <li><code>or</code> - moves the previous word's terms from the must list
	 *     to the optional list; the next word is added as an optional term.</li>
	 * <li><code>not</code> - the next word is created as a negated term.</li>
	 * <li>Any other word found in the stopword set is skipped.</li>
	 * <li>A leading <code>+</code> (must), <code>-</code> (optional) or
	 *     <code>!</code> (negated) is stripped and applied to that word.</li>
	 * </ul>
	 * A word whose term type maps to a reserve word is handed to
	 * <code>hq.populate</code>. Every other word is registered as a phrase and
	 * then broken up with the standard tokenizer; each resulting term is added
	 * to the planner and counted in <code>hq.totalTerms</code>.
	 * <p>
	 * Finally, a plan consisting of exactly one optional term and no must
	 * terms is rewritten so that the single term becomes a must term.
	 *
	 * @param text the query string, must not be null
	 * @param planner receives the must / optional terms and phrases
	 * @param hq receives reserve word settings and the term count
	 * @throws ApplicationFault if the standard tokenizer fails on a word
	 * @throws SystemFault declared for callers; not thrown directly here
	 */
	private static void parse(String text, 
		QueryPlanner planner, QueryContext hq) throws ApplicationFault, SystemFault {
		
		// An explicit locale keeps the lower-casing independent of the JVM default
		// (under a Turkish default locale "I" would otherwise become a dotless i,
		// and keywords or reserve words containing it would no longer match).
		text = text.toLowerCase(Locale.ENGLISH);
		List<Section> splits = quotedText(text, '"');
		List<String> words = tokenize(text, splits);
		
		// Guard against a null word list (nothing to parse, e.g. an empty query)
		if ( null == words ) words = new ArrayList<String>(0);
		
		// Terms produced by the most recent regular word; "and"/"or" act on them
		List<QueryTerm> lastTerms = new ArrayList<QueryTerm>(3);
		
		boolean optionalMode = true;
		boolean isNot = false;
		Set<String> stopwords = StopwordManager.getInstance().getStopwords();
		for (String rawWord : words) {
			String word = rawWord.trim();
			if ( word.length() == 0 ) continue;
			
			if ( "and".equals(word) ) {
				if ( 0 != lastTerms.size() && null != planner.optionalTerms) {
					for (QueryTerm lastTerm : lastTerms) {
						if ( planner.optionalTerms.contains(lastTerm) ) {
							planner.optionalTerms.remove(lastTerm);
							planner.addMustTerm(lastTerm);
							OutpipeLog.l.trace(lastTerm);
						}
					}
				}
				optionalMode = false;
				
			} else if ( "or".equals(word) ) {
				if ( 0 != lastTerms.size() && null != planner.mustTerms) {
					for (QueryTerm lastTerm : lastTerms) {
						if ( planner.mustTerms.contains(lastTerm) ) {
							planner.mustTerms.remove(lastTerm);
							planner.addOptionalTerm(lastTerm);
						}
					}
				}
				optionalMode = true;
			} else if ( "not".equals(word) ) {
				isNot = true;
			} else {
				// Don't proceed if this is a stopword
				if ( stopwords.contains(word) ) continue;
				
				// A one-character prefix overrides the mode for this word
				char firstChar = word.charAt(0);
				switch(firstChar) {
					case '+' :
						optionalMode = false;
						word = word.substring(1);
						break;
					case '-' :
						optionalMode = true;
						word = word.substring(1);
						break;
					case '!' :
						isNot = true;
						word = word.substring(1);
						break;
					default:
				}
				
				lastTerms.clear();
				
				//Make the base term
				QueryTerm reserveTerm =  new QueryTerm(word,isNot);
				int reserveWord = ReserveQueryWord.getInstance().
				mapReserveWord(reserveTerm.termType);		  
				if ( ReserveQueryWord.NO_RESERVE_WORD != reserveWord) {
					hq.populate(reserveWord, reserveTerm.wordOrig);
					lastTerms.add(reserveTerm);
				} else {
					planner.addPhrase(reserveTerm);
					List<String> lstWord = standardTokenizer(reserveTerm.wordOrig);
					for (String aWord : lstWord) {
						QueryTerm term =  new QueryTerm(aWord,isNot);
						term.setTermType(reserveTerm.termType);
						if (optionalMode) planner.addOptionalTerm(term);
						else planner.addMustTerm(term);
						hq.totalTerms++;
						lastTerms.add(term);
					}
				}

				//Refresh the settings
				optionalMode = true;
				isNot = false;
			}
		}
		
		// A lone optional term with no must terms is promoted to a must term
		if ( null != planner.optionalTerms && null == planner.mustTerms && 
			planner.optionalTerms.size() == 1 ) {
			planner.addMustTerm(planner.optionalTerms.get(0));
			planner.optionalTerms.clear();
			planner.optionalTerms = null;
		}

		if ( OutpipeLog.l.isDebugEnabled() ) {
			OutpipeLog.l.debug("Planner: " + planner.toString());
		}
	}

	/**
	 * Splits a word or phrase into tokens using Lucene's StandardTokenizer.
	 *
	 * @param word the text to tokenize
	 * @return the token texts in the order the tokenizer produced them
	 * @throws ApplicationFault wrapping any exception raised while tokenizing
	 */
	private static List<String> standardTokenizer(String word) throws ApplicationFault {
		Reader reader = new StringReader(word);
		StandardTokenizer fil = new StandardTokenizer(LuceneConstants.version, reader);
		List<String> lstWord = new ArrayList<String>(3);
		try {
			CharTermAttribute termA = (CharTermAttribute)fil.getAttribute(CharTermAttribute.class);
			fil.reset();
			
			while ( fil.incrementToken()) {
				lstWord.add(termA.toString());
			}
			// Complete the token stream workflow before closing
			fil.end();
		} catch ( Exception ex) {
			throw new ApplicationFault(ex);
		} finally {
			// Release the tokenizer and the reader on the failure path as well
			try {
				fil.close();
				reader.close();
			} catch ( Exception ignored) {
				// Source is an in-memory string; a close failure must not
				// replace the tokenization result or the original fault.
			}
		}
		return lstWord;
	}

	/**
	 * Splits the query text into words, keeping each quoted section together.
	 * <p>
	 * Text outside the quoted sections is split on spaces. A quoted section is
	 * "isolated" at its start when its opening quote is the first character or
	 * is preceded by a space, and at its end when its closing quote is the last
	 * character or is followed by a space:
	 * <ul>
	 * <li>isolated on both sides - the phrase becomes a word of its own;</li>
	 * <li>isolated only at the end - the phrase is appended to the word before
	 *     it (so <code>title:"a b"</code> yields <code>title:a b</code>);</li>
	 * <li>not isolated at the end - the phrase is carried forward and prefixed
	 *     to the first word of the text that follows it.</li>
	 * </ul>
	 *
	 * @param text the (already lower-cased) query text
	 * @param splits quoted sections of <code>text</code>; <code>start</code> is
	 *        expected to be the index just after the opening quote and
	 *        <code>end</code> the index of the closing quote
	 * @return the words in order of appearance; never null, empty when nothing
	 *         was found
	 */
	private static List<String> tokenize(String text, List<Section> splits) {
		int lastIndex = 0;
		// Created up front so that callers never receive null
		List<String> words = new ArrayList<String>(splits.size() * 2 + 1);
		int textLastIndex = text.length() - 1;
		String phrase = null;
		
		for (Section is : splits) {
			
			//Not a phrase start, Take the section
			if ( lastIndex != (is.start - 1)) {
				List<String> splittedWords = spaceTokenizer(
					text.substring(lastIndex, is.start - 1).trim(), ' ');
				if ( null != phrase ) {
					// A phrase left over from the previous section is glued
					// to the first word of this stretch of text
					appendPhrase(phrase, splittedWords);
					phrase = null;
				}
				words.addAll(splittedWords);
			}
			
			//Extract the phrase
			phrase = text.substring(is.start, is.end );
			boolean isIsolatedStart = ( (is.start -1) == 0 || 
				( is.start > 1 && text.charAt(is.start - 2) == ' '));
			boolean isIsolatedEnd = (is.end == textLastIndex) ||
					(text.charAt(is.end + 1) == ' ');

			if ( isIsolatedStart && isIsolatedEnd ) {
				words.add(phrase);
				phrase = null;
			} else if (isIsolatedEnd) {
				if ( words.isEmpty() ) {
					words.add(phrase);
				} else {
					// Phrase is attached to the preceding word: merge them
					int lst = words.size() - 1;
					words.set(lst, words.get(lst) + phrase);
				}
				phrase = null;
			}
			
			lastIndex = is.end + 1;
		}
		
		//Remaining last section
		if  ( lastIndex <= textLastIndex) {
			List<String> splittedWords = StringUtils.fastSplit(
					text.substring(lastIndex), ' ');
			if ( null != phrase ) {
				appendPhrase(phrase, splittedWords);
				phrase = null;
			}
			words.addAll(splittedWords);
		} else if (null != phrase) {
			words.add(phrase);
			phrase = null;
		}
		return words;
	}
	
	/**
	 * Glues a phrase onto the front of a word list, modifying the list in place.
	 * With a non-empty list the first word is replaced by
	 * <code>phrase + firstWord</code>; with an empty list the phrase becomes
	 * the only word. A null phrase leaves the list untouched.
	 *
	 * @param phrase the text to prefix, may be null
	 * @param splittedWords the list to modify
	 */
	private static void appendPhrase(String phrase, List<String> splittedWords) {
		if ( null == phrase ) return;

		if ( splittedWords.isEmpty() ) {
			splittedWords.add(phrase);
			return;
		}

		String firstWord = splittedWords.get(0);
		splittedWords.remove(0);
		splittedWords.add(0, phrase + firstWord);
	}
	
	  /**
	   * Locates the sections of the text enclosed between pairs of the
	   * separator character. Separators are paired in order of appearance;
	   * a trailing separator without a partner is ignored.
	   *
	   * @param text the text to scan
	   * @param separator the quoting character
	   * @return one Section per pair, where <code>start</code> is the index
	   *         just after the opening separator and <code>end</code> is the
	   *         index of the closing separator; empty when there is no pair
	   */
	  public static List<Section> quotedText(final String text, char separator) {

		  final List<Section> sections = new ArrayList<Section>();
		  final int lastPos = text.length() - 1;

		  int open = text.indexOf(separator);
		  // An opening separator on the very last position cannot be closed
		  while (open >= 0 && open < lastPos) {
			  final int close = text.indexOf(separator, open + 1);
			  if ( close < 0 ) break;
			  sections.add(new Section(open + 1, close));
			  open = text.indexOf(separator, close + 1);
		  }
		  return sections;
	  }	
	  
	  /**
	   * Splits the text on every occurrence of the separator character.
	   * Empty tokens between adjacent separators (and before a leading
	   * separator) are kept; an empty token after a trailing separator is
	   * dropped. Text without any separator is returned as a single token,
	   * even when it is empty.
	   *
	   * @param text the text to split, must not be null
	   * @param separator the character to split on
	   * @return the tokens in order of appearance, never empty
	   */
	  public static List<String> spaceTokenizer(final String text, char separator) {
		  final List<String> result = new ArrayList<String>();
		  int tokenStart = 0;
		  int separatorAt = text.indexOf(separator);
		  if ( separatorAt == -1) {
			  result.add(text);
			  return result;
		  }
		  
		  while (separatorAt >= 0) {
			  result.add(text.substring(tokenStart, separatorAt));
			  tokenStart = separatorAt + 1;
			  separatorAt = text.indexOf(separator, tokenStart);
		  }
	            
		  // Compare against length(), not length() - 1: otherwise a final
		  // token that is a single character long is silently lost.
		  if (tokenStart < text.length()) {
			  result.add(text.substring(tokenStart));
		  }
		  return result;
	  }	  
	  
	  /**
	   * A pair of positions marking a span of text.
	   */
	  public static class Section {
		  /** First position of the span. */
		  int start;
		  /** Last position of the span. */
		  int end;
		  
		  /**
		   * @param start value stored in {@link #start}
		   * @param end value stored in {@link #end}
		   */
		  public Section(int start, int end) {
			  this.start = start;
			  this.end = end;
		  }
		  
		  /**
		   * @return the two positions joined by a dash, e.g. <code>3-7</code>
		   */
		  @Override
		  public String toString() {
			  StringBuilder text = new StringBuilder();
			  text.append(this.start).append("-").append(this.end);
			  return text.toString();
		  }
	  }
}