Indexer.java example

Explorer

NewSumServer-master
- src
  - org
    - scify
      - NewSumServer
        Server
        Comms
        Communicator.java
        MachineLearning
        INSECTDBWithDir.java
        classificationModule.java
        dataSets.java
        labelTagging.java
        util.java
        vector.java
        writeToFile.java
        OCR
        TextHandler.java
        Searching
        Indexer.java
        Searcher.java
        Sources
        BlogParser.java
        ISourceParser.java
        RSSSources.java
        RssParser.java
        Storage
        IDataStorage.java
        InsectFileIO.java
        SimpleFileIO.java
        Structures
        Article.java
        Sentence.java
        Topic.java
        UnlabeledArticle.java
        User.java
        Summarisation
        ArticleClusterer.java
        RedundancyRemover.java
        Summariser.java
        dumpClusterer.java
        Utils
        Main.java
        Utilities.java

/*
 * Copyright 2013 SciFY NPO <info@scify.org>.
 *
 * This product is part of the NewSum Free Software.
 * For more information about NewSum visit
 * 
 * 	http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * If this code or its output is used, extended, re-engineered, integrated, 
 * or embedded to any extent in another software or hardware, there MUST be 
 * an explicit attribution to this work in the resulting source code, 
 * the packaging (where such packaging exists), or user interface 
 * (where such an interface exists). 
 * The attribution must be of the form "Powered by NewSum, SciFY"
 */ 

package org.scify.NewSumServer.Server.Searching;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.sql.Time;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LimitTokenCountAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.scify.NewSumServer.Server.Utils.Main;
import org.scify.NewSumServer.Server.Utils.Utilities;

/**
 * Builds a Lucene index over the files found under a directory tree.
 * <p>
 * Every regular file below the source directory becomes one Lucene
 * {@link Document} with two stored fields: {@code "file"} (the bare file
 * name, not analyzed) and {@code "text"} (the file contents as returned by
 * {@code Utilities.readFromFile}, analyzed with a locale-dependent analyzer).
 *
 * @author George K. <gkiom@scify.org>
 */
public class Indexer {

    private static final String FILE_FIELD = "file";
    private static final String TEXT_FIELD = "text";
    /**
     * ISO 639 language code that selects the Greek analyzer
     */
    private static final String GREEK_LANGUAGE = "el";
    /**
     * The Directory containing the Index Files.
     * Kept per instance, so that several Indexers with different index paths
     * do not overwrite each other's directory.
     */
    private final File indexDir;
    /**
     * The Absolute path to the Directory where the Files to be indexed are
     */
    private final String sFilesPath;
    /**
     * The Absolute path to the Directory where the Indexed Files are stored
     */
    private final String sIndexPath;
    /**
     * The locale that selects the analyzer used for indexing
     */
    private final Locale lLoc;
    /**
     * The analyzer used by the last call to {@link #createIndex()}
     */
    private Analyzer analyzer;
    /**
     * The Global Logger Class.
     */
    protected final static Logger LOGGER = Main.getLogger();

    /**
     *
     * @param sFilesPath The Absolute path to the Directory where the Files to be indexed are
     * @param sIndexPath The Absolute path to the Directory where the Indexed Files are stored
     * @param loc The locale that the files will be indexed with
     */
    public Indexer(String sFilesPath, String sIndexPath, Locale loc) {
        this.sFilesPath = sFilesPath;
        this.sIndexPath = sIndexPath;
        this.lLoc       = loc;
        // The dir the Index files will be saved in
        this.indexDir   = new File(this.sIndexPath);
    }

    /**
     * The Main method of the Indexer Class.
     * Traverses the files directory and (re)creates the index from scratch
     * ({@code OpenMode.CREATE}), replacing any index already present in the
     * index directory.
     * <p>
     * If anything fails before the final commit, the writer is rolled back,
     * so that neither a half-built index is committed nor the index write
     * lock is left behind.
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    public void createIndex()
            throws CorruptIndexException,
            LockObtainFailedException, IOException {
        // The dir containing the Files to Index
        File docDir = new File(this.sFilesPath);
        Directory fsDir = FSDirectory.open(indexDir);
        try {
            // init the Analyzer, according to locale
            analyzer = createAnalyzer();
            // The configuration for the Index Writer
            IndexWriterConfig conf =
                    new IndexWriterConfig(Version.LUCENE_36, analyzer);
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            // The Index Writer
            IndexWriter indexWriter = new IndexWriter(fsDir, conf);
            boolean committed = false;
            try {
                // For each File in the dir, create a Document
                for (File file : getFilesFromFirstLeverSubdirs(docDir)) {
                    indexWriter.addDocument(toDocument(file));
                }
                // the index will be merged down into a single segment, resulting in
                // a smaller index with better search performance. Costly Operation,
                // DO NOT USE on large dirs or when low disk space (needs (2-3)*DirSize)
                indexWriter.forceMerge(1);
                // Syncs All referenced Index Files.
                // At this point old indexes will be deleted, freeing up space
                indexWriter.commit();
                committed = true;
            } finally {
                if (committed) {
                    // Terminate the Writer appropriately
                    indexWriter.close();
                } else {
                    try {
                        // Discard the partial index and release the write lock
                        indexWriter.rollback();
                    } catch (IOException ignored) {
                        // The exception that caused the rollback is the one
                        // worth reporting; it keeps propagating from the try block.
                    }
                }
            }
        } finally {
            fsDir.close();
        }
    }

    /**
     * Selects the analyzer according to the language of the locale.
     * Greek ({@code "el"}, with or without a country part) gets the
     * {@link GreekAnalyzer}; every other locale, including a {@code null} one,
     * gets the {@link StandardAnalyzer} wrapped so that no tokens are dropped.
     * @return The analyzer to index with, never {@code null}
     */
    private Analyzer createAnalyzer() {
        String language = (lLoc == null) ? "" : lLoc.getLanguage();
        if (GREEK_LANGUAGE.equals(language)) {
            return new GreekAnalyzer(Version.LUCENE_36);
        }
        // In order to index all the text in a field,
        // however long that field may be
        return new LimitTokenCountAnalyzer(
                new StandardAnalyzer(Version.LUCENE_36), Integer.MAX_VALUE);
    }

    /**
     * Creates the lucene Document that represents a file.
     * @param file The file to represent
     * @return A Document holding the file name and the file text
     */
    private Document toDocument(File file) {
        String text = Utilities.readFromFile(file.getAbsolutePath(), " ");
        Document doc = new Document();
        // Add the "filename" field
        doc.add(new Field(FILE_FIELD, file.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Add The "Text" Field
        doc.add(new Field(TEXT_FIELD, text,
                Field.Store.YES, Field.Index.ANALYZED));
        return doc;
    }

    /**
     *
     * @return The Directory that the Index Files are in
     */
    public File getIndexDirectory() {
        return indexDir;
    }

    /**
     * Get the (non-directory) files of a given directory and, recursively,
     * of all its subdirectories at any depth. The files directly inside a
     * directory are listed before the files of its subdirectories.
     * If the given file is not a directory, or the directory cannot be
     * listed, return an empty list.
     * @param fDir The directory to analyze.
     * @return The list of (non-directory) files in the directory tree.
     */
    protected List<File> getFilesFromFirstLeverSubdirs(File fDir) {
        List<File> lRes = new ArrayList<File>();
        // Only analyze, if a directory
        if (fDir == null || !fDir.isDirectory()) {
            return lRes;
        }
        // listFiles() returns null when the directory cannot be read
        File[] children = fDir.listFiles();
        if (children == null) {
            return lRes;
        }
        // Plain files first, subdirs are kept aside for later
        List<File> lSubDirs = new ArrayList<File>();
        for (File child : children) {
            if (child.isDirectory()) {
                lSubDirs.add(child);
            } else {
                lRes.add(child);
            }
        }
        // For every subdir, get children files (Recursion)
        for (File fSubDir : lSubDirs) {
            lRes.addAll(getFilesFromFirstLeverSubdirs(fSubDir));
        }
        // Return result
        return lRes;
    }
}
//final class LuceneUtil {
//
//  private LuceneUtil() {}
//
//  public static List<String> tokenizeString(Analyzer analyzer, String string) {
//    List<String> result = new ArrayList<String>();
//    try {//or 'field' instead of null
//      TokenStream stream  = analyzer.tokenStream(null, new StringReader(string));
//      while (stream.incrementToken()) {
//        result.add(stream.getAttribute(CharTermAttribute.class).toString());
//      }
//    } catch (IOException e) {
//      // not thrown b/c we're using a string reader...
//      throw new RuntimeException(e);
//    }
//    return result;
//  }
//
//}