package edu.wiki.index;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.sql.Blob;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Performs indexing with Lucene.
 * Scans the MySQL `article`/`text` tables in fixed-size id windows and adds
 * one Lucene document per article, keeping term frequency vectors (with
 * offsets) for further use by the ESA pipeline.
 *
 * Usage: ESAWikipediaIndexer <Lucene index location>
 *
 * @author Cagatay Calli <ccalli@gmail.com>
 *
 */
public class ESAWikipediaIndexer {

	/** Number of article ids covered by each batch query window. */
	private static final int BATCH_SIZE = 400;

	private IndexWriter writer;

	static Connection connection = null;
	static Statement stmtArticle;	// NOTE(review): never assigned/used here — kept for compatibility
	static PreparedStatement pstmt;
	static Statement stmtLimit;

	// Range query: all articles with kid <= id < kid + BATCH_SIZE, joined to their text.
	static String strArticleQuery = "SELECT a.id,a.title,t.old_text FROM article a, text t WHERE ? <= a.id AND a.id < ? AND a.id = t.old_id";
	static String strLimitQuery = "SELECT MAX(id) FROM article;";
	static String strPrQuery = "SELECT MAX(score) FROM pagerank;";	// NOTE(review): unused here — kept for compatibility

	/** Highest article id present in the database (inclusive scan bound). */
	static int limitID;

	/** Number of documents successfully added so far. */
	int addCount = 0;

	/**
	 * Loads the MySQL JDBC driver, reads connection settings from the
	 * classpath resource /config/db.conf (four lines: host, database,
	 * user, password), opens the connection, prepares the article query
	 * and determines the maximum article id.
	 *
	 * @throws ClassNotFoundException if the JDBC driver is not on the classpath
	 * @throws SQLException on any database error
	 * @throws IOException if the config resource is missing or unreadable
	 */
	public static void initDB() throws ClassNotFoundException, SQLException, IOException {
		// Load the JDBC driver
		String driverName = "com.mysql.jdbc.Driver"; // MySQL Connector
		Class.forName(driverName);

		// read DB config — fail with a clear message instead of a bare NPE
		InputStream is = ESAWikipediaIndexer.class.getResourceAsStream("/config/db.conf");
		if (is == null) {
			throw new IOException("DB configuration /config/db.conf not found on classpath");
		}
		String serverName;
		String mydatabase;
		String username;
		String password;
		BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
		try {
			serverName = br.readLine();
			mydatabase = br.readLine();
			username = br.readLine();
			password = br.readLine();
		} finally {
			br.close();	// also closes the underlying stream on error paths
		}

		// Create a connection to the database
		String url = "jdbc:mysql://" + serverName + "/" + mydatabase; // a JDBC url
		connection = DriverManager.getConnection(url, username, password);

		pstmt = connection.prepareStatement(strArticleQuery);
		pstmt.setFetchSize(200);

		stmtLimit = connection.createStatement();
		try {
			ResultSet res = stmtLimit.executeQuery(strLimitQuery);
			res.next();
			limitID = res.getInt(1);
		} finally {
			stmtLimit.close();	// closing the statement also closes its result set
		}
	}

	/**
	 * Entry point: connects to the database, builds the index in the given
	 * directory, then closes it.
	 *
	 * @param args args[0] is the folder in which the Lucene index is created
	 */
	public static void main(String[] args) throws IOException, ClassNotFoundException, SQLException {
		if (args.length < 1) {
			System.out.println("Usage: ESAWikipediaIndexer <index path>");
			System.exit(-1);
		}

		String s = args[0];

		initDB();

		ESAWikipediaIndexer indexer = null;
		try {
			Directory fsdir = FSDirectory.open(new File(s));
			indexer = new ESAWikipediaIndexer(fsdir);
		} catch (Exception ex) {
			System.out.println("Cannot create index..." + ex.getMessage());
			System.exit(-1);
		}

		indexer.indexDB();

		//===================================================
		// after adding, we always have to call closeIndex,
		// otherwise the index is not created
		//===================================================
		indexer.closeIndex();
	}

	/**
	 * Constructor
	 * @param indexDir the directory in which the index should be created
	 * @throws java.io.IOException
	 */
	ESAWikipediaIndexer(Directory indexDir) throws IOException {
		// the boolean true parameter means to create a new index every time,
		// potentially overwriting any existing files there.
		writer = new IndexWriter(indexDir, new WikipediaAnalyzer(), true, IndexWriter.MaxFieldLength.LIMITED);
	}

	/**
	 * Indexes every article in the database.
	 *
	 * Scans the id space in non-overlapping windows of BATCH_SIZE ids.
	 * For each article a document with three fields is added: "contents"
	 * (analyzed, term vectors with offsets), "id" and "title" (both stored,
	 * not analyzed). Articles that fail to index are reported and skipped.
	 *
	 * @throws java.io.IOException on index write failure
	 * @throws SQLException on database error
	 */
	public void indexDB() throws IOException, SQLException {
		int originalNumDocs = writer.numDocs();
		int id = 0;
		String title;
		Blob text_blob;

		writer.setSimilarity(new ESASimilarity());

		// Advance by exactly one window per query: the range query already
		// returns every row in [kid, kid + BATCH_SIZE), so jumping to the
		// last seen id (as before) could re-process rows when the result set
		// is not id-ordered. The bound is inclusive (<=) so the article whose
		// id equals limitID is indexed even when limitID falls exactly on a
		// window boundary.
		for (int kid = 0; kid <= limitID; kid += BATCH_SIZE) {
			pstmt.setInt(1, kid);
			pstmt.setInt(2, kid + BATCH_SIZE);
			ResultSet res = pstmt.executeQuery();
			try {
				while (res.next()) { // there are articles to process
					id = res.getInt(1);
					// decode explicitly as UTF-8 (Wikipedia dump encoding)
					// instead of the platform default charset
					title = new String(res.getBytes(2), "UTF-8");
					text_blob = res.getBlob(3);

					try {
						Document doc = new Document();

						//===================================================
						// first field - article contents, with term vectors
						// and offsets kept for ESA
						//===================================================
						doc.add(new Field("contents", new InputStreamReader(text_blob.getBinaryStream(), "UTF-8"), Field.TermVector.WITH_OFFSETS));

						// ===
						// second field - id (stored, not analyzed)
						// ===
						doc.add(new Field("id", String.valueOf(id),
								Field.Store.YES,
								Field.Index.NOT_ANALYZED));

						// ====
						// third field - title (stored, not analyzed)
						// ====
						doc.add(new Field("title", title,
								Field.Store.YES,
								Field.Index.NOT_ANALYZED));

						writer.addDocument(doc);
						addCount++;
					} catch (Exception e) {
						// best effort: report the failure (with cause) and
						// continue with the next article
						System.out.println("Could not add: " + id + " (" + e + ")");
					}
				}
			} finally {
				res.close();
			}
			System.out.println("Added: " + addCount);
		}

		int newNumDocs = writer.numDocs();
		System.out.println("");
		System.out.println("************************");
		System.out.println((newNumDocs - originalNumDocs) + " documents added.");
		System.out.println("************************");
	}

	/**
	 * Close the index, committing all pending documents.
	 * @throws java.io.IOException
	 */
	public void closeIndex() throws IOException {
		writer.optimize();
		writer.close();
	}
}