ExtPhrase.java example

Explorer
eXist-1.4.x-master
/*
 * eXist Open Source Native XML Database
 * Copyright (C) 2005-2009 The eXist Project
 * http://exist-db.org
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *  
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *  
 *  $Id$
 */
package org.exist.xquery.functions;

import org.apache.log4j.Logger;

import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.exist.EXistException;
import org.exist.dom.ExtArrayNodeSet;
import org.exist.dom.Match;
import org.exist.dom.NodeProxy;
import org.exist.dom.NodeSet;
import org.exist.numbering.NodeId;
import org.exist.storage.NativeTextEngine;
import org.exist.storage.TextSearchEngine;
import org.exist.storage.analysis.TextToken;
import org.exist.storage.analysis.Tokenizer;
import org.exist.util.GlobToRegex;
import org.exist.xquery.Constants;
import org.exist.xquery.XPathException;
import org.exist.xquery.XQueryContext;
import org.exist.xquery.util.ExpressionDumper;
import org.exist.xquery.value.Sequence;

/**
 *  phrase() function : search an exact phrase in a NodeSet
 *
 *@author     Bruno Chatel <bcha@chadocs.com> (March 30, 2005)
 */
public class ExtPhrase extends ExtFulltext {
    private static final Logger logger = Logger.getLogger(ExtPhrase.class);
	/**
	 * 
	 * @param context
	 */
	public ExtPhrase(XQueryContext context) {
		super(context, Constants.FULLTEXT_AND);
	}
    
    /**
     * 
     * @param searchArg
     * @param nodes 
     */
    public Sequence evalQuery(String searchArg,	NodeSet nodes) throws XPathException {
        String[] terms;
        try {
			terms = getSearchTerms(searchArg);
		} catch (EXistException e) {
            logger.error(e.getMessage());
			throw new XPathException(this, e.getMessage(), e);
		}
		NodeSet hits = processQuery(terms, nodes);
		if (hits == null)
			return Sequence.EMPTY_SEQUENCE;
        
        // Can return NPE ; prevents issue exact matches
        TextSearchEngine engine = context.getBroker().getTextEngine();
        if(engine==null){
            throw new XPathException("The legacy fulltext indexing has been disabled by "
                    + "default from version 1.4.1. Please consider migrating to "
                    + "the new full text indexing.."); 
        }
        
		boolean hasWildcards = false;
		for(int i = 0; i < terms.length; i++) {
			hasWildcards |=	NativeTextEngine.containsWildcards(terms[i]);
		}
		return hasWildcards	? patternMatch(context, terms, hits) : exactMatch(context, terms, hits);
	}

	/**
	 * 
	 * @param context
	 * @param result
	 */
	private Sequence exactMatch(XQueryContext context, String[] terms, NodeSet result) {
		TextToken token;
	
		NodeSet r = new ExtArrayNodeSet();
		final Tokenizer tok = context.getBroker().getTextEngine().getTokenizer();
           
		// define search phrase for matches
		String matchTerm = "";
		for (int k = 0; k < terms.length ; k++) {
			matchTerm = matchTerm + terms[k];
			if (k != terms.length - 1)
				matchTerm = matchTerm + "\\W*"; 
		}
		// iterate on results
		for (Iterator i = result.iterator(); i.hasNext();) {            
			Vector matchGid = new Vector();
            NodeProxy current = (NodeProxy) i.next();
			// get first match
            Match nextMatch = current.getMatches();
			// remove previously found matches on current
			current.setMatches(null);
			// iterate on attach matches, with unicity of related nodeproxy gid
			String term;    
			while(nextMatch != null) {
                NodeId nodeId= nextMatch.getNodeId();
				// if current gid has not been previously processed
				if(!matchGid.contains(nodeId)) {
					NodeProxy mcurrent = new NodeProxy(current.getDocument(), nodeId);
                    Match match = null;
                    int firstOffset = -1;
					// add it in gid array
					matchGid.add(nodeId);
					String value = mcurrent.getNodeValue();
					tok.setText(value);
                    int j = 0;
					if (j < terms.length)
						term = terms[j];
					else
						break;
                    int frequency = 0;
					while ((token = tok.nextToken()) != null) {
                        String word = token.getText().toLowerCase();
						if (word.equalsIgnoreCase(term)) {
							j++;
							if (j == terms.length) {
								// all terms found
                                if (match == null)
                                    match = nextMatch.createInstance(getExpressionId(), nodeId, matchTerm);
                                if (firstOffset < 0)
                                    firstOffset = token.startOffset();
                                match.addOffset(firstOffset, token.endOffset() - firstOffset);
								frequency++;
								// start again on fist term
								j = 0;
								term = terms[j];
								continue;
							} else {
								term = terms[j];
                                if (firstOffset < 0)
                                    firstOffset = token.startOffset();
                            }
						} else if (j > 0 && word.equalsIgnoreCase(terms[0])) {
							// first search term found: start again
							j = 1;
							term = terms[j];
                            firstOffset = token.startOffset();
							continue;
						} else {
							//	reset
							j = 0;
                            firstOffset = -1;
							term = terms[j];
						}
					}
					// if phrase found
					if(frequency != 0) {
						// add new match to current
						current.addMatch(match);
						// add current to result
						r.add(current);
						// reset frequency
						frequency=0;
					}				
				}
				// process next match
				nextMatch = nextMatch.getNextMatch();
			}
		}
//		LOG.debug("found " + r.getLength());
		return r;
	}

	/**
	 * 
	 * @param context
	 * @param result
	 * @return
	 */
	private Sequence patternMatch(XQueryContext context, String[] terms, NodeSet result) {
		// generate list of search term patterns
	    Pattern patterns[] = new Pattern[terms.length];
        Matcher matchers[] = new Matcher[terms.length];
        for (int i = 0; i < patterns.length; i++) {
            try {
                patterns[i] = Pattern.compile(GlobToRegex.globToRegexp(terms[i]), 
                        Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
                matchers[i] = patterns[i].matcher("");
            } catch (PatternSyntaxException e) {
                logger.error("malformed pattern" + e.getMessage());
                return Sequence.EMPTY_SEQUENCE;
            }
        }
		
		// walk through hits 
		ExtArrayNodeSet r = new ExtArrayNodeSet();		
		final Tokenizer tok = context.getBroker().getTextEngine().getTokenizer();
        Matcher matcher;
		for (Iterator i = result.iterator(); i.hasNext();) {
			Match nextMatch;
			Vector matchGid = new Vector();
            NodeProxy current = (NodeProxy) i.next();
			// get first match
			nextMatch = current.getMatches();
			// remove previously found matches on current
			current.setMatches(null);
			// iterate on attach matches, with unicity of related nodeproxy gid
			while(nextMatch != null) {
				Hashtable matchTable = new Hashtable();
                NodeId nodeId = nextMatch.getNodeId(); 
				// if current gid has not been previously processed
				if(!matchGid.contains(nodeId)) {
					NodeProxy mcurrent = new NodeProxy(current.getDocument(), nodeId);
					// add it in gid array
					matchGid.add(nodeId);
                    String value = mcurrent.getNodeValue();
					tok.setText(value);
                    int j = 0;
					if (j < patterns.length) {
                        //Pattern term = patterns[j];
                        matcher = matchers[j];
                    } else
						break;
					String matchTerm = null;
                    TextToken token;
					while ((token = tok.nextToken()) != null) {
                        String word = token.getText().toLowerCase();
                        matcher.reset(word);
                        matchers[0].reset(word);
						if (matcher.matches()) {
							j++;
							if(matchTerm == null)
								matchTerm=word;
							else
								matchTerm = matchTerm + "\\W*" + word;  
							if (j == patterns.length) {
								// all terms found
								if(matchTable.containsKey(matchTerm)) {
									// previously found matchTerm
									Match match = (Match)(matchTable.get(matchTerm));
                                    match.addOffset(token.startOffset(), matchTerm.length());
								} else {
									Match match = nextMatch.createInstance(getExpressionId(), nodeId, matchTerm);
                                    match.addOffset(token.startOffset(), matchTerm.length());
									matchTable.put(matchTerm,match);
								}
								// start again on fist term
								j=0;
                                //Pattern term = patterns[j];
                                matcher = matchers[j];
								matchTerm = null;
								continue;
							} else {
                                //Pattern term = patterns[j];
                                matcher = matchers[j];
                            }
						} else if (j > 0 && matchers[0].matches()) {
							// first search term found: start again
							j=1;
                            //Pattern term = patterns[j];
                            matcher = matchers[j];
							matchTerm = word;
							continue;
						} else {
							// reset
							j = 0;
                            //Pattern term = patterns[j];
                            matcher = matchers[j];
							matchTerm = null;
							continue;
						}
					}
					// one or more match found
					if(matchTable.size()!=0) {
						Enumeration eMatch = matchTable.elements();
						while(eMatch.hasMoreElements()){
							Match match = (Match)(eMatch.nextElement());
							current.addMatch(match);
					     }
						// add current to result
						r.add(current);
					}			
				}
				// process next match
				nextMatch = nextMatch.getNextMatch();
			}
		}
		return r;	
	}

	/* (non-Javadoc)
     * @see org.exist.xquery.functions.ExtFulltext#dump(org.exist.xquery.util.ExpressionDumper)
     */
    public void dump(ExpressionDumper dumper) {
        dumper.display("phrase(");
        path.dump(dumper);
        dumper.display(", ");
        searchTerm.dump(dumper);
        dumper.display(")");
    }

    public String toString() {
        StringBuilder buf = new StringBuilder();
        buf.append("phrase(");
        buf.append(path);
        buf.append(", ");
        buf.append(searchTerm);
        buf.append(")");
        return buf.toString();
    }    

}