TextSearchEngine.java example

Explorer
eXist-1.4.x-master

/* eXist Open Source Native XML Database
 * Copyright (C) 2001/02,  Wolfgang M. Meier (meier@ifs.tu-darmstadt.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 * 
 * $Id$
 */
package org.exist.storage;

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.util.Observable;
import java.util.TreeSet;

import org.apache.log4j.Logger;
import org.exist.collections.Collection;
import org.exist.dom.*;
import org.exist.fulltext.ElementContent;
import org.exist.security.PermissionDeniedException;
import org.exist.storage.analysis.SimpleTokenizer;
import org.exist.storage.analysis.Tokenizer;
import org.exist.storage.btree.DBException;
import org.exist.storage.serializers.Serializer;
import org.exist.util.Configuration;
import org.exist.util.Occurrences;
import org.exist.util.PorterStemmer;
import org.exist.xquery.TerminatedException;
import org.exist.xquery.XQueryContext;

/**
 * This is the base class for all classes providing access to the fulltext index.
 * 
 * The class has methods to add text and attribute nodes to the fulltext index,
 * or to search for nodes matching selected search terms.
 *  
 * @author wolf
 */
public abstract class TextSearchEngine extends Observable {

	protected final static Logger LOG =
		Logger.getLogger(TextSearchEngine.class);
		
    protected TreeSet stoplist = new TreeSet();
	protected DBBroker broker = null;
	protected Tokenizer tokenizer;
	protected Configuration config;
	protected boolean indexNumbers = false ;
	protected boolean stem = false ;
	protected boolean termFreq = true;
	protected PorterStemmer stemmer = null;
	protected int trackMatches = Serializer.TAG_ELEMENT_MATCHES;
	
	public final static String INDEX_NUMBERS_ATTRIBUTE = "parseNumbers";
	public final static String STEM_ATTRIBUTE = "stemming";
	public final static String STORE_TERM_FREQUENCY_ATTRIBUTE = "track-term-freq";
	public final static String TOKENIZER_ATTRIBUTE = "tokenizer";
	public static final String CONFIGURATION_STOPWORDS_ELEMENT_NAME = "stopwords";
	public final static String STOPWORD_FILE_ATTRIBUTE = "file";
	
	public final static String PROPERTY_INDEX_NUMBERS = "indexer.indexNumbers";
	public final static String PROPERTY_STEM = "indexer.stem";
	public final static String PROPERTY_STORE_TERM_FREQUENCY = "indexer.store-term-freq";
	public final static String PROPERTY_TOKENIZER = "indexer.tokenizer";
	public final static String PROPERTY_STOPWORD_FILE = "stopwords";
	
	/**
	 * Construct a new instance and configure it.
	 * 
	 * @param broker
	 * @param conf
	 */
	public TextSearchEngine(DBBroker broker, Configuration conf) {
		this.broker = broker;
		this.config = conf;
		String stopword, tokenizerClass;
		Boolean num, stemming, termFrequencies;
		if ((num = (Boolean) config.getProperty(PROPERTY_INDEX_NUMBERS)) != null)
			indexNumbers = num.booleanValue();
		if ((stemming = (Boolean) config.getProperty(PROPERTY_STEM)) != null)
			stem = stemming.booleanValue();
		if((termFrequencies = (Boolean) config.getProperty(PROPERTY_STORE_TERM_FREQUENCY)) != null)
			termFreq = termFrequencies.booleanValue();
		String track = (String) config.getProperty(Serializer.PROPERTY_TAG_MATCHING_ELEMENTS);
		if (track != null)
			trackMatches = track.equalsIgnoreCase("yes")
			? Serializer.TAG_ELEMENT_MATCHES
					: Serializer.TAG_NONE;
		track = (String) config.getProperty(Serializer.PROPERTY_TAG_MATCHING_ATTRIBUTES);
		if (track != null && track.equalsIgnoreCase("yes"))
			trackMatches = trackMatches | Serializer.TAG_ATTRIBUTE_MATCHES;
		
		if ((tokenizerClass = (String) config.getProperty(PROPERTY_TOKENIZER)) != null) {
			try {
				Class tokClass = Class.forName(tokenizerClass);
				tokenizer = (Tokenizer) tokClass.newInstance();
				LOG.debug("using tokenizer: " + tokenizerClass);
			} catch (ClassNotFoundException e) {
				LOG.debug(e);
			} catch (InstantiationException e) {
				LOG.debug(e);
			} catch (IllegalAccessException e) {
				LOG.debug(e);
			}
		}
		if (tokenizer == null) {
			LOG.debug("using simple tokenizer");
			tokenizer = new SimpleTokenizer();
		}

		if (stem)
			stemmer = new PorterStemmer();
		tokenizer.setStemming(stem);
		if ((stopword = (String) config.getProperty(PROPERTY_STOPWORD_FILE)) != null) {
			try {
				FileReader in = new FileReader(stopword);
				StreamTokenizer tok = new StreamTokenizer(in);
				int next = tok.nextToken();
				while (next != StreamTokenizer.TT_EOF) {
					if (next != StreamTokenizer.TT_WORD)
						continue;
					stoplist.add(tok.sval);
					next = tok.nextToken();
				}
			} catch (FileNotFoundException e) {
				LOG.debug(e);
			} catch (IOException e) {
				LOG.debug(e);
			}
		}
	}

    /**
	 * Returns the Tokenizer used for tokenizing strings into
	 * words.
	 * 
	 * @return tokenizer
	 */
	public Tokenizer getTokenizer() {
		return tokenizer;
	}
    
    /**
	 * Tokenize and index the given text node.
	 * 
	 * @param indexSpec
	 * @param node
	 */
	public abstract void storeText(CharacterDataImpl node, int indexingHint, FulltextIndexSpec indexSpec, boolean remove);
    public abstract void storeText(StoredNode parent, ElementContent text, int indexingHint, FulltextIndexSpec indexSpec, boolean remove);

	public abstract void flush();
	public abstract boolean close() throws DBException;

	public int getTrackMatches() {
		return trackMatches;
	}
	
	public void setTrackMatches(int flags) {
		trackMatches = flags;
	}

    public NodeSet getNodesContaining(XQueryContext context, DocumentSet docs,
	        NodeSet contextSet, int axis, QName qname, String expr, int type) throws TerminatedException {
        return getNodesContaining(context, docs, contextSet, axis, qname, expr, type, true);
    }

    /**
	 * For each of the given search terms and each of the documents in the
	 * document set, return a node-set of matching nodes.
	 *
	 * The type-argument indicates if search terms should be compared using
	 * a regular expression. Valid values are DBBroker.MATCH_EXACT or
	 * DBBroker.MATCH_REGEXP.
	 */
	public abstract NodeSet getNodesContaining(XQueryContext context, DocumentSet docs,
	        NodeSet contextSet, int axis, QName qname, String expr, int type, boolean matchAll) throws TerminatedException;
	
	public abstract NodeSet getNodes(XQueryContext context, DocumentSet docs, NodeSet contextSet, int axis, QName qname,
	        TermMatcher matcher, CharSequence startTerm) throws TerminatedException;
	
	/**
	 * Queries the fulltext index to retrieve information on indexed words contained
	 * in the index for the current collection. Returns a list of {@link Occurrences} for all 
	 * words contained in the index. If param end is null, all words starting with 
	 * the string sequence param start are returned. Otherwise, the method 
	 * returns all words that come after start and before end in lexical order.
	 */
	public abstract Occurrences[] scanIndexTerms(
		DocumentSet docs,
		NodeSet contextSet,
		String start,
		String end) throws PermissionDeniedException;

    public abstract Occurrences[] scanIndexTerms(
		DocumentSet docs,
		NodeSet contextSet,
        QName[] qnames,
        String start,
		String end) throws PermissionDeniedException;
    
    public abstract String[] getIndexTerms(DocumentSet docs, TermMatcher matcher);
	
	/**
	 * Remove index entries for an entire collection.
	 * 
	 * @param collection
	 */
	public abstract void dropIndex(Collection collection);
	
	/**
	 * Remove all index entries for the given document.
	 * 
	 * @param doc
	 */
	public abstract void dropIndex(DocumentImpl doc);
	
}