QueryOptimizer.java example

Explorer
zoodb-master
/*
 * Copyright 2009-2016 Tilmann Zaeschke. All rights reserved.
 * 
 * This file is part of ZooDB.
 * 
 * ZooDB is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * ZooDB is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with ZooDB.  If not, see <http://www.gnu.org/licenses/>.
 * 
 * See the README and COPYING files for further information. 
 */
package org.zoodb.internal.query;

import java.util.Comparator;
import java.util.Date;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import org.zoodb.api.impl.ZooPC;
import org.zoodb.internal.ZooClassDef;
import org.zoodb.internal.ZooFieldDef;
import org.zoodb.internal.query.QueryParser.FNCT_OP;
import org.zoodb.internal.server.index.BitTools;
import org.zoodb.internal.util.DBLogger;

public class QueryOptimizer {
	
	/** Class definition whose fields are checked for available indexes. */
	private final ZooClassDef clsDef;
	
	/**
	 * A lookup map for all characters that indicate a (non-indexable) regex String.
	 * The table is indexed by the character code and covers codes 0..255 only, 
	 * a lookup is therefore only valid for characters below 256. 
	 */
	private static final boolean[] REGEX_CHARS = new boolean[256];
	static {
		//'(', '{' and '^' are regex metacharacters as well. If they are not recognized, a 
		//pattern such as "ab(c|d)" would be cut at '|' and "ab(c" would wrongly be treated 
		//as a literal prefix. Flagging more characters is always safe, it only results in 
		//a shorter prefix and thus in a wider index range.
		char[] regexChars = {'.', '\\', '+', '*', '[', '|', '$', '?', '(', '{', '^'};
		for (char c: regexChars) {
			REGEX_CHARS[c] = true;
		}
	}
	
	/**
	 * Creates an optimizer for queries on the given class.
	 * 
	 * @param clsDef The class definition whose fields are inspected for indexes.
	 */
	public QueryOptimizer(ZooClassDef clsDef) {
		this.clsDef = clsDef;
	}
	
	/**
	 * Works out which index, if any, should be used to execute a query.
	 * 
	 * Procedure:
	 * 1) Look for indexed fields. If there are none, no further analysis is done (for now)
	 *    -> Query rewriting may still be able to optimize really stupid queries.
	 * 2) Split the query into sub-queries.
	 * 3) Analyse the sub-queries to find the best index for each. The outcome may be that an
	 *    index is pointless (the whole index range is required). This also happens if one 
	 *    sub-query uses no index at all; using an index for the remaining ones would slightly
	 *    increase disk access (index read) but reduce CPU needs (only the sub-query needs
	 *    processing, not the whole query).
	 * 4a) Per sub-query, pick the index with the smallest range/density.
	 * 4b) Check for required sorting. A matching index can be advantageous even if its range 
	 *    is larger.
	 * 5) Merge queries that use the same index and have overlapping ranges.
	 * 6) Merge results.
	 * 
	 * @param queryTree The query to analyse.
	 * @return The advices to execute. A single advice is returned if no field is indexed
	 * or if one of the sub-queries would have to traverse the complete key range.
	 */
	public List<QueryAdvice> determineIndexToUse(QueryTreeNode queryTree) {
		List<QueryAdvice> result = new LinkedList<QueryAdvice>();

		// step 1: is any field indexed at all?
		boolean anyIndexed = false;
		for (ZooFieldDef field: clsDef.getAllFields()) {
			if (field.isIndexed()) {
				anyIndexed = true;
				break;
			}
		}
		if (!anyIndexed) {
			//no index usage
			result.add(new QueryAdvice(queryTree));
			return result;
		}

		//step 2 - sub-queries
		//The query tree is split at every OR, such that each sub-query contains the full
		//query but only one side of every OR. All ORs are removed.
		//-> Optimization: We remove only (and split only at) ORs where at least one branch
		//   uses an index. TODO
		List<QueryTreeNode> parts = new LinkedList<QueryTreeNode>();
		parts.add(queryTree);
		queryTree.createSubs(parts);

		for (QueryTreeNode part: parts) {
			optimize(part);
		}
		//TODO filter out terms that cannot become true.
		//if none is left, return empty set.

		//The two maps are scratch space that is reused (and reset) for every sub-query.
		IdentityHashMap<ZooFieldDef, Long> lowerBounds = new IdentityHashMap<ZooFieldDef, Long>();
		IdentityHashMap<ZooFieldDef, Long> upperBounds = new IdentityHashMap<ZooFieldDef, Long>();
		for (QueryTreeNode part: parts) {
			QueryAdvice advice = determineIndexToUseSub(part, lowerBounds, upperBounds);
			result.add(advice);
			lowerBounds.clear();
			upperBounds.clear();
		}

		//TODO merge queries
		//E.g.:
		// - if none uses an index (or at least one doesn't), return only the full query
		// - if ranges overlap, try to merge?

		//TODO optimisation: merge queries
		//for example the following query returns two identical sub-queries:
		//"_int == 123 || _int == 123" --> This is bad and should be avoided.

		//check for show-stoppers
		//-> in their case, we simply run the un-split query on the full type extent.
		for (QueryAdvice candidate: result) {
			//assuming that the term is not an empty term (contradicting sub-terms)
			//TODO instead of fixed values, use min/max of index.
			boolean coversEverything = candidate == null 
					|| (candidate.getMin() <= Long.MIN_VALUE 
							&& candidate.getMax() >= Long.MAX_VALUE);
			if (coversEverything) {
				//ah, one of them iterates over the whole result set.
				result.clear();
				result.add(candidate);
				return result;
			}
		}

		//check for overlapping / global min/max
		mergeAdvices(result);

		return result;
	}
	
	/**
	 * Orders advices by their minimum key first and by their maximum key second. 
	 * Two advices with identical minimum and maximum are considered equal.
	 */
	private static class AdviceComparator implements Comparator<QueryAdvice> {
		@Override
		public int compare(QueryAdvice o1, QueryAdvice o2) {
			int byMin = order(o1.getMin(), o2.getMin());
			if (byMin != 0) {
				return byMin;
			}
			return order(o1.getMax(), o2.getMax());
		}

		/** @return -1, 0 or 1 if 'a' is smaller than, equal to or larger than 'b'. */
		private static int order(long a, long b) {
			return a < b ? -1 : (a > b ? 1 : 0);
		}
	}
	
	
	/**
	 * Merges advices that use the same index and whose key ranges overlap. This avoids 
	 * duplicate loading effort and duplicate results. Advices that do not overlap are 
	 * left untouched.
	 * 
	 * The list is modified in place. It is rebuilt (grouped by index and sorted by min/max)
	 * only if at least one pair of advices was merged, otherwise it remains unchanged.
	 * 
	 * @param advices The advices to merge.
	 */
	private void mergeAdvices(List<QueryAdvice> advices) {
		if (advices.size() < 2) {
			//shortcut
			return;
		}
		IdentityHashMap<ZooFieldDef, TreeSet<QueryAdvice>> map = 
				new IdentityHashMap<ZooFieldDef, TreeSet<QueryAdvice>>();
		//sort QAs by index and by minValue
		//NOTE(review): advices with identical min and max on the same index compare as 
		//equal, so the TreeSet keeps only the first one of them. Confirm that this is intended.
		for (QueryAdvice qa: advices) {
			TreeSet<QueryAdvice> subList = map.get(qa.getIndex());
			if (subList == null) {
				subList = new TreeSet<QueryAdvice>(new AdviceComparator());
				map.put(qa.getIndex(), subList);
			}
			subList.add(qa);
		}

		//merge, one pass per index is sufficient because the sets are sorted by min value
		boolean merged = false;
		for (TreeSet<QueryAdvice> subList: map.values()) {
			Iterator<QueryAdvice> iter = subList.iterator();
			//every set contains at least the advice that caused its creation
			QueryAdvice prev = iter.next();
			while (iter.hasNext()) {
				QueryAdvice current = iter.next();
				if (prev.getMax() >= current.getMin()) {
					//'current' may lie completely inside 'prev', for example [1,100] and
					//[5,10]. The merged range must never be smaller than 'prev'.
					prev.setMax(Math.max(prev.getMax(), current.getMax()));
					iter.remove();
					merged = true;
				} else {			
					prev = current;
				}
			}
		}

		if (merged) {
			advices.clear();
			for (TreeSet<QueryAdvice> subList: map.values()) {
				advices.addAll(subList);
			}
		}
	}

	/**
	 * Determines the index and the key range to use for a single sub-query.
	 * 
	 * The method visits all terms of the sub-query. Every term that compares an indexed 
	 * field with a constant narrows the min/max key that is recorded for that field. 
	 * Finally, one of the collected ranges is turned into a QueryAdvice.
	 * 
	 * @param queryTree This is a sub-query that does not contain OR operands.
	 * @param minMap Scratch map that receives the minimum key per indexed field.
	 * @param maxMap Scratch map that receives the maximum key per indexed field. 
	 * @return QueryAdvice
	 * @throws IllegalArgumentException if a field type or an operator is not supported. 
	 */
	private QueryAdvice determineIndexToUseSub(QueryTreeNode queryTree, 
			IdentityHashMap<ZooFieldDef, Long> minMap, 
			IdentityHashMap<ZooFieldDef, Long> maxMap) {
		//TODO determine the Lists directly by assigning ZooFields to term during parsing?
		QueryTreeIterator iter = queryTree.termIterator();
		while (iter.hasNext()) {
			QueryTerm term = iter.next();
			if (!term.isRhsFixed() || term.isLhsFunction()) {
				//ignore terms with variable rhs and functions on the LHS
				//TODO we currently support only indexes on references, not on paths
				if (term.isLhsFunction()) {
					determineIndexToUseSubForQueryFunctions(minMap, maxMap, term.getLhsFunction());
				}
				continue;
			}
			ZooFieldDef f = term.getLhsFieldDef();
			if (f == null || !f.isIndexed()) {
				//ignore fields that are not indexed
				continue;
			}
			
			if (minMap.get(f) == null) {
				//needs initialization
				//even if we don't narrow the values, min/max allow ordered traversal
				minMap.put(f, f.getMinValue());
				maxMap.put(f, f.getMaxValue());
			}
			
			Object termVal = term.getValue(null);
			//TODO if(term.isRef())?!?!?!
			//TODO implement term.isIndexable() ?!?!?
			//TODO swap left/right side of query term such that indexed field is always on the left
			//     and the constant is on the right.
			
			//translate the constant into the 'long' key that is used by the index
			Long value;
			
			switch (f.getJdoType()) {
			case PRIMITIVE:
				switch (f.getPrimitiveType()) {
				case BOOLEAN:   
					//pointless..., well pretty much, unless someone uses this to distinguish
					//very few 'true' from many 'false' or vice versa.
					continue;
				case DOUBLE: 
					//going through Number yields the same result for Double and Float
					//constants and also accepts other numeric constants
					value = BitTools.toSortableLong(((Number)termVal).doubleValue()); 
					break;
				case FLOAT: 
					value = BitTools.toSortableLong(((Number)termVal).floatValue()); 
					break;
				case CHAR: 
					value = (long)((Character)termVal).charValue();
					//The 'break' is essential. Without it we fall through into the cast to
					//Number below, which fails for Character with a ClassCastException.
					break;
				case BYTE:
				case INT:
				case LONG:
				case SHORT:	
					value = ((Number)termVal).longValue(); 
					break;
				default: 				
					throw new IllegalArgumentException("Type: " + f.getPrimitiveType());
				}
				break;
			case STRING:
				value = BitTools.toSortableLong(
						termVal == QueryTerm.NULL ? null : (String)termVal); 
				break;
			case REFERENCE:
				value = (termVal == QueryTerm.NULL ? 
						BitTools.NULL : ((ZooPC)termVal).jdoZooGetOid());
				break;
			case DATE:
				value = (termVal == QueryTerm.NULL ? 0 : ((Date)termVal).getTime()); 
				break;
			default:
				throw new IllegalArgumentException("Type: " + f.getJdoType());
			}

			//narrow the range according to the operator
			switch (term.getOp()) {
			case EQ: {
				//TODO check range and exit if EQ does not fit in remaining range
				minMap.put(f, value);
				maxMap.put(f, value);
				break;
			}
			case L:
				if (value < maxMap.get(f)) {
					maxMap.put(f, value - 1); //TODO does this work with floats?
				}
				break;
			case LE: 				
				if (value < maxMap.get(f)) {
					maxMap.put(f, value);
				}
				break;
			case A: 
				if (value > minMap.get(f)) {
					minMap.put(f, value + 1); //TODO does this work with floats?
				}
				break;
			case AE:
				if (value > minMap.get(f)) {
					minMap.put(f, value);
				}
				break;
			case NE:
			case STR_matches:
			case STR_contains_NON_JDO:
			case STR_endsWith:
				//ignore
				break;
			case STR_startsWith:
				setKeysForStringStartsWith((String) term.getValue(null), f, minMap, maxMap);
				break;
			default: 
				throw new IllegalArgumentException("Name: " + term.getOp());
			}
			
			//TODO take into account not-operators (x>1 && x<10) && !(x>5 && X <6) ??
			// -> Hopefully this optimization is marginal and negligible.
			//But it may break everything!
		}
		return createQueryAdvice(minMap, maxMap, queryTree);
	}
	
	/**
	 * Narrows the key range of an indexed String field for the query functions
	 * startsWith() and matches(). All other functions are ignored, as are functions 
	 * that do not operate on an indexed field of the main object or whose argument 
	 * is not a constant.
	 * 
	 * For matches(), only the leading literal part of the pattern is used, it is treated 
	 * like a startsWith() query. A pattern without any regex character is treated as 
	 * exact match.
	 * 
	 * @param minMap Receives the minimum key per indexed field.
	 * @param maxMap Receives the maximum key per indexed field.
	 * @param fn The function to analyse.
	 */
	private void determineIndexToUseSubForQueryFunctions( 
			IdentityHashMap<ZooFieldDef, Long> minMap, 
			IdentityHashMap<ZooFieldDef, Long> maxMap,
			QueryFunction fn) {
		
		//we can use indexes only for startsWith() and matches() 
		if (!FNCT_OP.STR_startsWith.equals(fn.op()) && !FNCT_OP.STR_matches.equals(fn.op())) {
			return;
		}
		
		//we can use index only when operating on a local field 
		QueryFunction f0 = fn.getParams()[0];
		if (!FNCT_OP.FIELD.equals(f0.op())) {
			return;
		}
		
		if (f0.getParams()[0].op() != FNCT_OP.THIS) {
			//TODO we don't support path queries yet, i.e. the string field must belong to
			//the currently evaluated main-object, not to a referenced object.
			return;
		}
		
		ZooFieldDef f = f0.getFieldDef();
		if (f == null || !f.isIndexed()) {
			//ignore fields that are not indexed
			return;
		}
		
		QueryFunction f1 = fn.getParams()[1];
		if (!f1.isConstant()) {
			return;
		}
		Object param1 = f1.evaluate(null, null);   
		
		if (minMap.get(f) == null) {
			//needs initialization
			//even if we don't narrow the values, min/max allow ordered traversal
			minMap.put(f, f.getMinValue());
			maxMap.put(f, f.getMaxValue());
		}

		
		switch (fn.op()) {
		case STR_matches:
			String str = (String) param1;
			int prefixLen = literalPrefixLength(str);
			if (prefixLen < str.length()) {
				//if we have a regex that does not simply result in full match we
				//simply use the leading part for a startsWith() query.
				if (prefixLen == 0) {
					DBLogger.info("Ignoring index on String query because of regex characters.");
				}
				setKeysForStringStartsWith(str.substring(0, prefixLen), f, minMap, maxMap);
				return;
			}
			//no regex characters: the pattern can only match this exact String
			long key = BitTools.toSortableLong(str);
			if (key > minMap.get(f)) {
				minMap.put(f, key);
			}
			if (key < maxMap.get(f)) {
				maxMap.put(f, key);
			}
			break;
		case STR_startsWith:
			setKeysForStringStartsWith((String) param1, f, minMap, maxMap);
			break;
		default: //nothing
		}
	}

	/**
	 * Calculates how many leading characters of a regex are literals that every matching 
	 * String has to start with.
	 * 
	 * @param regex The regex pattern.
	 * @return Length of the literal prefix. This is the length of the pattern if the 
	 * pattern does not contain any regex character.
	 */
	private static int literalPrefixLength(String regex) {
		if (regex.indexOf('|') >= 0) {
			//Alternation: a match may stem from any of the branches, the leading part of 
			//the pattern is therefore not a prefix of all matches, see "abc|def". 
			return 0;
		}
		for (int i = 0; i < regex.length(); i++) {
			char c = regex.charAt(i);
			//The lookup table covers only chars < 256, other chars are never regex chars
			if (c < REGEX_CHARS.length && REGEX_CHARS[c]) {
				if (i > 0 && (c == '*' || c == '?' || c == '{')) {
					//These quantifiers allow zero occurrences of the preceding character, 
					//"abc*" matches "ab". The preceding character is not part of the prefix.
					return i - 1;
				}
				return i;
			}
		}
		return regex.length();
	}

	/**
	 * Restricts the key range of a field to the keys that are possible for Strings 
	 * with the given prefix. An existing range is only ever narrowed, never widened.
	 * Both maps must already contain an entry for the field.
	 * 
	 * @param prefix The prefix the Strings have to start with.
	 * @param f The indexed field.
	 * @param minMap The minimum key per field.
	 * @param maxMap The maximum key per field.
	 */
	private void setKeysForStringStartsWith(String prefix, ZooFieldDef f,
			IdentityHashMap<ZooFieldDef, Long> minMap, 
			IdentityHashMap<ZooFieldDef, Long> maxMap) {
		long lowestKey = BitTools.toSortableLongPrefixMinHash(prefix);
		long highestKey = BitTools.toSortableLongPrefixMaxHash(prefix);

		//raise the lower bound
		long currentMin = minMap.get(f);
		if (currentMin < lowestKey) {
			minMap.put(f, lowestKey);
		}

		//reduce the upper bound
		long currentMax = maxMap.get(f);
		if (currentMax > highestKey) {
			maxMap.put(f, highestKey);
		}
	}
	
	/**
	 * Creates the advice for a sub-query from the key ranges that were collected per 
	 * indexed field. If several fields are available, the one with the narrowest range 
	 * is chosen. 
	 * 
	 * @param minMap The minimum key per indexed field.
	 * @param maxMap The maximum key per indexed field.
	 * @param queryTree The sub-query the advice is created for.
	 * @return The advice. It has no index assigned if the maps are empty. 
	 */
	private QueryAdvice createQueryAdvice(
			IdentityHashMap<ZooFieldDef, Long> minMap, 
			IdentityHashMap<ZooFieldDef, Long> maxMap, 
			QueryTreeNode queryTree) {
		if (minMap.isEmpty()) {
			//return default query
			return new QueryAdvice(queryTree);
		}
		
		//the advised index to use...
		// start with first
		ZooFieldDef def = minMap.keySet().iterator().next();
		QueryAdvice qa = new QueryAdvice(queryTree);
		qa.setIndex( def );
		qa.setMin( minMap.get(def) );
		qa.setMax( maxMap.get(def) );
		
		//only one index left? -> Easy!!!
		//TODO well, better not use it if it covers the whole range? Maybe for sorting?
		//NOTE(review): this returns before the String range extension at the end of 
		//the method is applied. Confirm that this is intended.
		if (minMap.size() == 1) {
			return qa;
		}
		
		for (Map.Entry<ZooFieldDef, Long> me2: minMap.entrySet()) {
			long min2 = me2.getValue();
			long max2 = maxMap.get(me2.getKey());
			//A plain 'max - min' overflows for very wide ranges, the full 'long' range 
			//would result in -1 and would be preferred over any narrow range.
			if (rangeWidth(min2, max2) < rangeWidth(qa.getMin(), qa.getMax())) {
				qa.setIndex( me2.getKey() );
				qa.setMin( min2 );
				qa.setMax( max2 );
			}
		}
		
		if (qa.getIndex().isString()) {
			//For String we have to extend the range because of the trailing hashcode
			qa.setMin(BitTools.getMinPosInPage(qa.getMin()));
			qa.setMax(BitTools.getMaxPosInPage(qa.getMax()));
		}

		return qa;
	}

	/**
	 * Calculates 'max - min' without overflow. 
	 * 
	 * @param min Lower end of the range.
	 * @param max Upper end of the range.
	 * @return 'max - min', or Long.MAX_VALUE / Long.MIN_VALUE if the exact difference does
	 * not fit into a 'long'. The result is negative for empty ranges (max smaller than min).
	 */
	private static long rangeWidth(long min, long max) {
		long width = max - min;
		if (max >= min && width < 0) {
			//positive overflow
			return Long.MAX_VALUE;
		}
		if (max < min && width >= 0) {
			//negative overflow
			return Long.MIN_VALUE;
		}
		return width;
	}

	/**
	 * Simplifies a (sub-)query tree in place. Currently this only strips unary nodes.
	 * 
	 * @param q The root of the (sub-)query tree.
	 */
	private void optimize(QueryTreeNode q) {
		stripUnaryNodes(q);
	}

	/**
	 * Removes superfluous unary nodes from the tree below (and including) the given node.
	 * The tree is modified in place and processed recursively.
	 * 
	 * @param q The node to start with.
	 */
	private void stripUnaryNodes(QueryTreeNode q) {
		while (q.isUnary() && q.n1 != null) {
			//this is a unary root node that shouldn't be one
			//-> pull the content of the only child up into this node
			q.op = q.n1.op;
			q.n2 = q.n1.n2;
			q.t2 = q.n1.t2;
			q.t1 = q.n1.t1;
			q.n1 = q.n1.n1;
			//presumably this lets the adopted children point to 'q' as their parent
			q.relateToChildren();
		}
		//check unary nodes if they are not root / pull down leaf-unaries
		//-> the content of 'q' replaces 'q' in the slot that 'q' occupied in its parent
		if (q.isUnary() && q.p != null) {
			if (q.p.n1 == q) {
				q.p.n1 = q.n1;
				q.p.t1 = q.t1;
				if (q.n1 != null) {
					q.n1.p = q.p;
				}
			} else {
				q.p.n2 = q.n1;
				q.p.t2 = q.t1;
				//It is 'n1' that has been attached to the parent, so it is the parent 
				//reference of 'n1' (not of 'n2') that needs updating.
				if (q.n1 != null) {
					q.n1.p = q.p;
				}
			}
		}
		if (q.n1 != null) {
			stripUnaryNodes(q.n1);
		}
		if (q.n2 != null) {
			stripUnaryNodes(q.n2);
		}
	}

}