IndexTask.java example

Explorer
solrcene-master
package org.apache.lucene.ant;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Vector;
import java.lang.reflect.Constructor;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.DynamicConfigurator;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.Task;
import org.apache.tools.ant.types.EnumeratedAttribute;
import org.apache.tools.ant.types.FileSet;
import org.apache.tools.ant.types.Resource;
import org.apache.tools.ant.types.ResourceCollection;
import org.apache.tools.ant.types.resources.FileResource;

/**
 *  Ant task to index files with Lucene.
 *
 *  <p>Files are supplied through nested resource collections (for example
 *  a <code>fileset</code>). Each readable file is handed to the configured
 *  {@link DocumentHandler}; the resulting {@link Document} gets a stored,
 *  un-analyzed "path" field and "modified" field added before it is written
 *  to the index. When an existing index is updated, files whose stored
 *  "modified" value equals the file's current last-modified time are
 *  skipped, and changed files replace their previous document.
 */
public class IndexTask extends Task {
  /**
   *  resources
   */
  protected Vector<ResourceCollection> rcs = new Vector<ResourceCollection>();

  /**
   *  overwrite index?
   */
  private boolean overwrite = false;

  /**
   *  index path
   */
  private File indexDir;

  /**
   *  document handler classname
   */
  private String handlerClassName =
    FileExtensionDocumentHandler.class.getName();

  /**
   *  document handler instance
   */
  private DocumentHandler handler;

  /**
   *  analyzer classname
   */
  private String analyzerClassName =
    StandardAnalyzer.class.getName();

  /**
   *  analyzer instance
   */
  private Analyzer analyzer;

  /**
   *  Lucene merge factor
   */
  private int mergeFactor = 20;

  /**
   *  optional custom properties for a configurable document handler
   */
  private HandlerConfig handlerConfig;

  /**
   *  write the index in compound format?
   */
  private boolean useCompoundIndex = true;


  /**
   *  Creates new instance
   */
  public IndexTask() {
  }


  /**
   *  Specifies the directory where the index will be stored
   *
   *@param  indexDir  the index directory
   */
  public void setIndex(File indexDir) {
    this.indexDir = indexDir;
  }


  /**
   *  Sets the mergeFactor attribute of the IndexTask object
   *
   *@param  mergeFactor  The new mergeFactor value
   */
  public void setMergeFactor(int mergeFactor) {
    this.mergeFactor = mergeFactor;
  }


  /**
   *  Sets the overwrite attribute of the IndexTask object
   *
   *@param  overwrite  The new overwrite value
   */
  public void setOverwrite(boolean overwrite) {
    this.overwrite = overwrite;
  }


  /**
   * If creating a new index and this is set to true, the
   * index will be created in compound format.
   */
  public void setUseCompoundIndex(boolean useCompoundIndex) {
    this.useCompoundIndex = useCompoundIndex;
  }

  /**
   *  Sets the documentHandler attribute of the IndexTask object
   *
   *@param  classname  The new documentHandler value
   */
  public void setDocumentHandler(String classname) {
    handlerClassName = classname;
  }

  /**
   * Sets the analyzer based on the builtin Lucene analyzer types.
   *
   * TODO: Enforce analyzer and analyzerClassName to be mutually exclusive
   */
  public void setAnalyzer(AnalyzerType type) {
    analyzerClassName = type.getClassname();
  }

  /**
   * Sets the analyzer by fully qualified class name.
   *
   *@param  classname  name of an {@link Analyzer} subclass
   */
  public void setAnalyzerClassName(String classname) {
    analyzerClassName = classname;
  }

  /**
   *  Adds a set of files (nested fileset attribute).
   *
   *@param  set  FileSet to be added
   */
  public void addFileset(FileSet set) {
    add(set);
  }

  /**
   * Add a collection of files to index.
   * @param res a resource collection to index.
   * @since Ant 1.7
   */
  public void add(ResourceCollection res) {
    rcs.add(res);
  }

  /**
   * Sets custom properties for a configurable document handler.
   *
   *@exception  BuildException  if more than one config element is given
   */
  public void addConfig(HandlerConfig config) throws BuildException {
    if (handlerConfig != null) {
      throw new BuildException("Only one config element allowed");
    }

    handlerConfig = config;
  }

  /**
   * Instantiates the named analyzer class reflectively.
   *
   *@param  className  fully qualified name of an {@link Analyzer} subclass
   *@return  a new analyzer instance
   *@exception  Exception  if the class cannot be loaded, is not an
   *      Analyzer, or cannot be instantiated
   */
  private static Analyzer createAnalyzer(String className) throws Exception {
    final Class<? extends Analyzer> clazz = Class.forName(className).asSubclass(Analyzer.class);
    try {
      // first try to use a ctor with version parameter (needed for many new
      // Analyzers that have no default one anymore)
      Constructor<? extends Analyzer> cnstr = clazz.getConstructor(Version.class);
      return cnstr.newInstance(Version.LUCENE_CURRENT);
    } catch (NoSuchMethodException nsme) {
      // otherwise use default ctor
      return clazz.newInstance();
    }
  }

  /**
   *  Begins the indexing
   *
   *@exception  BuildException  If the index attribute is missing, the
   *      handler or analyzer cannot be created, or an error occurs
   *      indexing the fileset
   */
  @Override
  public void execute() throws BuildException {
    // Fail with a readable message instead of a NullPointerException later on.
    if (indexDir == null) {
      throw new BuildException("The index attribute is required");
    }

    // construct handler and analyzer dynamically
    try {
      handler = Class.forName(handlerClassName).asSubclass(DocumentHandler.class).newInstance();

      analyzer = IndexTask.createAnalyzer(analyzerClassName);
    } catch (Exception e) {
      throw new BuildException(e);
    }

    log("Document handler = " + handler.getClass(), Project.MSG_VERBOSE);
    log("Analyzer = " + analyzer.getClass(), Project.MSG_VERBOSE);

    if (handler instanceof ConfigurableDocumentHandler) {
      // The nested config element is optional; without it the handler is
      // configured with an empty property set rather than dereferencing null.
      Properties handlerProps =
        (handlerConfig != null) ? handlerConfig.getProperties() : new Properties();
      ((ConfigurableDocumentHandler) handler).configure(handlerProps);
    }

    try {
      indexDocs();
    } catch (IOException e) {
      throw new BuildException(e);
    }
  }


  /**
   * Index the resources collected by this task.
   *
   *@exception  IOException if Lucene I/O exception
   */
  private void indexDocs() throws IOException {
    Date start = new Date();

    // mkdirs() returns true only when it actually created the directory;
    // in that case there is no index to append to, so force create mode.
    boolean create = indexDir.mkdirs() || overwrite;

    FSDirectory dir = FSDirectory.open(indexDir);
    try {
      Searcher searcher = null;
      if (!create) {
        try {
          searcher = new IndexSearcher(dir, true);
        } catch (IOException ioe) {
          log("IOException: " + ioe.getMessage());
          // Ignored on purpose: without a searcher every document is indexed.
        }
      }
      boolean checkLastModified = (searcher != null);

      log("checkLastModified = " + checkLastModified, Project.MSG_VERBOSE);

      int totalFiles = 0;
      int totalIndexed = 0;
      int totalIgnored = 0;

      // The searcher is opened before the writer, so it is released in an
      // outer finally: it gets closed even if creating or closing the writer fails.
      try {
        IndexWriterConfig conf = new IndexWriterConfig(
            Version.LUCENE_CURRENT, analyzer).setOpenMode(
            create ? OpenMode.CREATE : OpenMode.APPEND);
        LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy();
        lmp.setUseCompoundFile(useCompoundIndex);
        lmp.setUseCompoundDocStore(useCompoundIndex);
        lmp.setMergeFactor(mergeFactor);

        IndexWriter writer = new IndexWriter(dir, conf);
        try {
          for (ResourceCollection rc : rcs) {
            if (!rc.isFilesystemOnly()) {
              continue;
            }

            Iterator<?> resources = rc.iterator();
            while (resources.hasNext()) {
              Resource r = (Resource) resources.next();
              if (!r.isExists() || !(r instanceof FileResource)) {
                continue;
              }

              totalFiles++;

              File file = ((FileResource) r).getFile();

              if (!file.exists() || !file.canRead()) {
                throw new BuildException("File \"" +
                                         file.getAbsolutePath()
                                         + "\" does not exist or is not readable.");
              }

              Term pathTerm = new Term("path", file.getPath());

              // don't index if the stored copy is up to date
              if (checkLastModified && isUpToDate(searcher, pathTerm, file)) {
                continue;
              }

              log("Indexing " + file.getPath(), Project.MSG_VERBOSE);
              Document doc = buildDocument(file);
              if (doc == null) {
                totalIgnored++;
                continue;
              }

              if (create) {
                writer.addDocument(doc);
              } else {
                // Appending to an existing index: replace any document
                // already stored for this path so a changed file does not
                // end up indexed twice.
                writer.updateDocument(pathTerm, doc);
              }
              totalIndexed++;
            }
          }

          writer.optimize();
        } finally {
          writer.close();
        }
      } finally {
        if (searcher != null) {
          searcher.close();
        }
      }

      Date end = new Date();

      log(totalIndexed + " out of " + totalFiles + " indexed (" +
          totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) +
          " milliseconds");
    } finally {
      dir.close();
    }
  }

  /**
   * Checks whether the index already holds a current copy of a file.
   *
   *@param  searcher  searcher over the existing index
   *@param  pathTerm  "path" term identifying the file in the index
   *@param  file      the file on disk
   *@return  true if a document for the path exists and its stored
   *      "modified" value parses to the file's last-modified time; false
   *      if there is no such document, the field is missing, or the
   *      stored value cannot be parsed
   *@exception  IOException if Lucene I/O exception
   */
  private boolean isUpToDate(Searcher searcher, Term pathTerm, File file) throws IOException {
    ScoreDoc[] hits = searcher.search(new TermQuery(pathTerm), null, 1).scoreDocs;
    if (hits.length == 0) {
      return false;
    }

    // Null-check before trimming: the field may be absent from the document.
    String indexModified = searcher.doc(hits[0].doc).get("modified");
    if (indexModified == null) {
      return false;
    }

    try {
      return DateTools.stringToTime(indexModified.trim()) == file.lastModified();
    } catch (ParseException e) {
      // An unparsable timestamp cannot prove the copy is current: re-index.
      return false;
    }
  }

  /**
   * Builds the Lucene document for a file using the document handler and
   * adds the "path" and "modified" fields.
   *
   *@param  file  the file to convert
   *@return  the document to index, or null if the handler returned no
   *      document for this file
   *@exception  BuildException  if the document handler fails
   */
  private Document buildDocument(File file) throws BuildException {
    Document doc;
    try {
      doc = handler.getDocument(file);
    } catch (DocumentHandlerException e) {
      throw new BuildException(e);
    }

    if (doc == null) {
      return null;
    }

    // Add the path of the file as a field named "path".  Use a Keyword field, so
    // that the index stores the path, and so that the path is searchable
    doc.add(new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Add the last modified date of the file a field named "modified".  Use a
    // Keyword field, so that it's searchable, but so that no attempt is made
    // to tokenize the field into words.
    doc.add(new Field("modified", DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND), Field.Store.YES, Field.Index.NOT_ANALYZED));

    return doc;
  }

  /**
   * Nested config element: collects arbitrary attributes as properties
   * for a configurable document handler.
   */
  public static class HandlerConfig implements DynamicConfigurator {
    Properties props = new Properties();

    /**
     * Stores the attribute as a property.
     */
    public void setDynamicAttribute(String attributeName, String value) throws BuildException {
      props.setProperty(attributeName, value);
    }

    /**
     * Nested elements are not supported.
     *
     *@exception  BuildException  always
     */
    public Object createDynamicElement(String elementName) throws BuildException {
      throw new BuildException("Sub elements not supported");
    }

    /**
     * @return the properties collected from the element's attributes
     */
    public Properties getProperties() {
      return props;
    }
  }

  /**
   * Enumerated attribute mapping short analyzer names to the class names
   * of the builtin Lucene analyzers.
   */
  public static class AnalyzerType extends EnumeratedAttribute {
    private static final Map<String,String> analyzerLookup = new HashMap<String,String>();

    static {
      analyzerLookup.put("simple", SimpleAnalyzer.class.getName());
      analyzerLookup.put("standard", StandardAnalyzer.class.getName());
      analyzerLookup.put("stop", StopAnalyzer.class.getName());
      analyzerLookup.put("whitespace", WhitespaceAnalyzer.class.getName());
    }

    /**
     * @see EnumeratedAttribute#getValues
     */
    @Override
    public String[] getValues() {
      Set<String> keys = analyzerLookup.keySet();
      return keys.toArray(new String[0]);
    }

    /**
     * @return the analyzer class name registered for the current value
     */
    public String getClassname() {
      return analyzerLookup.get(getValue());
    }
  }
}