package edu.wiki.index;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.sql.Blob;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Performs indexing with Lucene.
 * Keeps term frequency vectors for further use.
 *
 * Usage: ESAWikipediaIndexer <Lucene index location>
 *
 * @author Cagatay Calli <ccalli@gmail.com>
 */
public class ESAWikipediaIndexer {

    private IndexWriter writer;

    static Connection connection = null;
    static Statement stmtArticle;
    static PreparedStatement pstmt;
    static Statement stmtLimit;

    // Fetches article ids, titles and raw text for ids in the range [?, ?).
    static String strArticleQuery = "SELECT a.id,a.title,t.old_text FROM article a, text t WHERE ? <= a.id AND a.id < ? AND a.id = t.old_id";
    static String strLimitQuery = "SELECT MAX(id) FROM article;";
    // Currently unused: pagerank-based scoring is disabled (see the commented-out prScore below).
    static String strPrQuery = "SELECT MAX(score) FROM pagerank;";

    static int limitID;

    int addCount = 0;

    public static void initDB() throws ClassNotFoundException, SQLException, IOException {
        // Load the JDBC driver
        String driverName = "com.mysql.jdbc.Driver"; // MySQL Connector
        Class.forName(driverName);

        // Read the DB config: four lines (server, database, username, password)
        InputStream is = ESAWikipediaIndexer.class.getResourceAsStream("/config/db.conf");
        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        String serverName = br.readLine();
        String mydatabase = br.readLine();
        String username = br.readLine();
        String password = br.readLine();
        br.close();

        // Create a connection to the database
        String url = "jdbc:mysql://" + serverName + "/" + mydatabase; // a JDBC url
        connection = DriverManager.getConnection(url, username, password);

        pstmt = connection.prepareStatement(strArticleQuery);
        pstmt.setFetchSize(200);

        // Find the highest article id so we know when to stop paging
        stmtLimit = connection.createStatement();
        ResultSet res = stmtLimit.executeQuery(strLimitQuery);
        res.next();
        limitID = res.getInt(1);
        stmtLimit.close();
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException {
        if (args.length < 1) {
            System.out.println("Usage: ESAWikipediaIndexer <index path>");
            System.exit(-1);
        }

        String s = args[0];

        initDB();

        ESAWikipediaIndexer indexer = null;
        try {
            Directory fsdir = FSDirectory.open(new File(s));
            indexer = new ESAWikipediaIndexer(fsdir);
        } catch (Exception ex) {
            System.out.println("Cannot create index..." + ex.getMessage());
            System.exit(-1);
        }

        indexer.indexDB();

        //===================================================
        // After adding, we always have to call closeIndex();
        // otherwise the index is not created.
        //===================================================
        indexer.closeIndex();
    }

    /**
     * Constructor
     * @param indexDir the directory in which the index should be created
     * @throws java.io.IOException
     */
    ESAWikipediaIndexer(Directory indexDir) throws IOException {
        // the boolean true parameter means to create a new index every time,
        // potentially overwriting any existing files there.
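        // Note (assumption about the build environment): this IndexWriter
        // constructor and IndexWriter.MaxFieldLength belong to the Lucene
        // 2.9/3.x API; both were removed in Lucene 4.0.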
        writer = new IndexWriter(indexDir, new WikipediaAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
    }

    /**
     * Indexes all Wikipedia articles read from the database,
     * paging over article ids in blocks of 400.
     * @throws java.io.IOException
     * @throws SQLException
     */
    public void indexDB() throws IOException, SQLException {
        int originalNumDocs = writer.numDocs();

        int id = 0;
        String title;
        Blob text_blob;
        // float prScore;

        writer.setSimilarity(new ESASimilarity());

        // <= so the last batch is not skipped when kid lands exactly on limitID
        for (int kid = 0; kid <= limitID;) {
            id = 0;
            pstmt.setInt(1, kid);
            pstmt.setInt(2, kid + 400);
            ResultSet res = pstmt.executeQuery();

            while (res.next()) {    // there are articles to process
                id = res.getInt(1);
                title = new String(res.getBytes(2), "UTF-8"); // titles are stored as raw bytes
                text_blob = res.getBlob(3);
                // prScore = res.getFloat(4);

                try {
                    Document doc = new Document();

                    //===================================================
                    // first field - article text, streamed from the DB,
                    // indexed with term vectors (offsets included)
                    //===================================================
                    doc.add(new Field("contents", new InputStreamReader(text_blob.getBinaryStream()), Field.TermVector.WITH_OFFSETS));

                    // ===
                    // second field - id
                    // ===
                    doc.add(new Field("id", String.valueOf(id), Field.Store.YES, Field.Index.NOT_ANALYZED));

                    // ====
                    // third field - title
                    // ====
                    doc.add(new Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED));

                    writer.addDocument(doc);
                    // System.out.println("Added: " + id);
                    addCount++;
                } catch (Exception e) {
                    System.out.println("Could not add: " + id);
                }
            }
            res.close();

            // Advance past the last id seen; if the block was empty, skip ahead
            if (id > 0) {
                kid = id + 1;
            } else {
                kid = kid + 400;
            }
            System.out.println("Added: " + addCount);
        }

        int newNumDocs = writer.numDocs();
        System.out.println("");
        System.out.println("************************");
        System.out.println((newNumDocs - originalNumDocs) + " documents added.");
        System.out.println("************************");
    }

    /**
     * Close the index.
     * @throws java.io.IOException
     */
    public void closeIndex() throws IOException {
        writer.optimize();
        writer.close();
    }
}
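
/*
 * Usage sketch (a minimal example; assumes the article and text tables have
 * already been populated in MySQL by the project's preprocessing step, and
 * "esa.jar" is a hypothetical name for the compiled classpath):
 *
 *   java -cp esa.jar edu.wiki.index.ESAWikipediaIndexer /path/to/lucene-index
 *
 * Expected /config/db.conf format, one value per line as read by initDB()
 * (host, database, user and password below are placeholders):
 *
 *   localhost:3306
 *   wikidb
 *   dbuser
 *   dbpassword
 */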