package com.personalityextractor.store;
import java.io.File;
import java.io.IOException;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import com.personalityextractor.entity.WikipediaEntity;
import com.personalityextractor.evaluation.PerfMetrics;
import com.personalityextractor.evaluation.PerfMetrics.Metric;
/**
 * Singleton store serving Wikipedia page and category lookups from two Lucene
 * indices (copied fully into RAM for low query latency), and able to (re)build
 * those indices from the backing "wikiminer" MySQL database.
 *
 * <p>NOTE(review): the package-visible {@code query}/{@code hits} scratch
 * fields make concurrent searches unsafe — confirm single-threaded use, or
 * localize that state once no package-mate depends on it.
 */
public class LuceneStore {

    private static LuceneStore instance = null;

    // Machine-specific absolute paths — consider externalizing to configuration.
    private static final String PAGE_INDEX_PATH = "/Users/semanticvoid/projects/PE/indicies/pages";
    private static final String CATEGORY_INDEX_PATH = "/Users/semanticvoid/projects/PE/indicies/categories";

    // StandardAnalyzer is reusable across queries; one shared instance avoids
    // re-creating an analyzer for every search and every index build.
    private static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_30);

    private static MysqlStore db = null;

    static {
        try {
            // Best effort: only the index-building methods need the database;
            // the search-only path works without it.
            db = new MysqlStore("localhost", "root", "", "wikiminer");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // RAM-backed searchers over the page and category indices; null until
    // loadIndices() succeeds (or if the index directory is absent).
    static IndexSearcher pgSearcher = null;
    static IndexSearcher catSearcher = null;

    private LuceneStore() {
    }

    /**
     * Returns the singleton instance, creating it on first use.
     * Synchronized so concurrent first calls cannot race the null check.
     */
    public static synchronized LuceneStore getInstance() {
        if (instance == null) {
            instance = new LuceneStore();
        }
        return instance;
    }

    /**
     * Loads the page and category indices from disk into RAM-backed searchers.
     * Safe to call repeatedly: already-open searchers are kept as-is. The load
     * time is recorded under {@code Metric.LOAD}.
     */
    public void loadIndices() {
        long start = System.currentTimeMillis();
        try {
            if (pgSearcher == null) {
                System.err.print("Loading page index...\t");
                pgSearcher = openRamSearcher(PAGE_INDEX_PATH);
                System.err.print("[ DONE ]\n");
            }
            if (catSearcher == null) {
                System.err.print("Loading category index...\t");
                catSearcher = openRamSearcher(CATEGORY_INDEX_PATH);
                System.err.print("[ DONE ]\n");
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        PerfMetrics.getInstance().addToMetrics(Metric.LOAD,
                System.currentTimeMillis() - start);
    }

    /**
     * Opens the index at {@code path} into a RAM-backed searcher, or returns
     * null when the directory does not exist.
     */
    private static IndexSearcher openRamSearcher(String path)
            throws CorruptIndexException, IOException {
        File dir = new File(path);
        if (!dir.exists()) {
            return null;
        }
        // Copying the FSDirectory into a RAMDirectory trades startup time and
        // heap for much lower per-query latency.
        return new IndexSearcher(new RAMDirectory(FSDirectory.open(dir)));
    }

    // Scratch state retained for backward compatibility (package-visible);
    // see the thread-safety note on the class.
    Query query = null; // the Query created by the QueryParser
    TopDocs hits = null; // the search results

    /**
     * Full-text search over page titles ("text" field). Fetches up to 100
     * hits, sorts them via WikipediaEntity's natural ordering, and returns at
     * most the top 20. Returns an empty list on any failure (including
     * indices not loaded). Time is recorded under {@code Metric.SEARCHPAGE}.
     *
     * @param terms query string, parsed with Lucene QueryParser syntax
     */
    public List<WikipediaEntity> search(String terms) {
        long start = System.currentTimeMillis();
        List<WikipediaEntity> entities = new ArrayList<WikipediaEntity>();
        try {
            if (pgSearcher == null) {
                // Previously this path died with an anonymous NPE; fail with
                // a descriptive message instead (still swallowed below, so
                // callers keep getting an empty list).
                throw new IllegalStateException(
                        "page index not loaded; call loadIndices() first");
            }
            QueryParser qp = new QueryParser(Version.LUCENE_30, "text", ANALYZER);
            query = qp.parse(terms);
            hits = pgSearcher.search(query, null, 100);
            int numResults = Math.min(hits.totalHits, 100);
            for (int i = 0; i < numResults; i++) {
                Document doc = pgSearcher.doc(hits.scoreDocs[i].doc);
                entities.add(new WikipediaEntity(doc.get("text"),
                        doc.get("id"), Integer.valueOf(doc.get("type")),
                        doc.get("inlinks")));
            }
            Collections.sort(entities);
            // Keep only the top 20 after sorting.
            if (numResults > 20) {
                numResults = 20;
            }
            entities = entities.subList(0, numResults);
        } catch (Exception e) {
            e.printStackTrace();
        }
        PerfMetrics.getInstance().addToMetrics(Metric.SEARCHPAGE,
                System.currentTimeMillis() - start);
        return entities;
    }

    /**
     * Looks up a single page by its id ("id" field).
     *
     * @param id Wikipedia page id, as stored in the index
     * @return the matching entity, or null when not found or on any error
     */
    public WikipediaEntity searchPageId(String id) {
        try {
            if (pgSearcher == null) {
                throw new IllegalStateException(
                        "page index not loaded; call loadIndices() first");
            }
            QueryParser qp = new QueryParser(Version.LUCENE_30, "id", ANALYZER);
            query = qp.parse(id);
            hits = pgSearcher.search(query, 1);
            if (hits.totalHits != 0) {
                Document doc = pgSearcher.doc(hits.scoreDocs[0].doc);
                return new WikipediaEntity(doc.get("text"), doc.get("id"),
                        Integer.valueOf(doc.get("type")), doc.get("inlinks"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Resolves the parent categories of the given page to full entities.
     * Categories whose pages cannot be found are skipped. Time is recorded
     * under {@code Metric.GETCATEGORIES}.
     *
     * @param id page id; null yields an empty list
     */
    public List<WikipediaEntity> getCategories(String id) {
        long start = System.currentTimeMillis();
        List<WikipediaEntity> categories = new ArrayList<WikipediaEntity>();
        if (id != null) {
            for (String cid : getCategoryIds(id)) {
                WikipediaEntity we = searchPageId(cid);
                if (we != null) {
                    categories.add(we);
                }
            }
        }
        PerfMetrics.getInstance().addToMetrics(Metric.GETCATEGORIES,
                System.currentTimeMillis() - start);
        return categories;
    }

    /**
     * Returns up to 50 parent-category ids for the page whose id matches the
     * "child" field of the category index. Empty on any failure.
     */
    private List<String> getCategoryIds(String id) {
        List<String> categoryIds = new ArrayList<String>();
        try {
            if (catSearcher == null) {
                throw new IllegalStateException(
                        "category index not loaded; call loadIndices() first");
            }
            QueryParser qp = new QueryParser(Version.LUCENE_30, "child", ANALYZER);
            query = qp.parse(id);
            hits = catSearcher.search(query, 50);
            int numResults = Math.min(hits.totalHits, 50);
            for (int i = 0; i < numResults; i++) {
                Document doc = catSearcher.doc(hits.scoreDocs[i].doc);
                categoryIds.add(doc.get("parent"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return categoryIds;
    }

    /**
     * Category-overlap similarity of two pages: 2*|A∩B| / (|A|+|B|) over
     * their parent-category id sets (a Dice-style coefficient).
     *
     * @return similarity in [0, 1]; 0 when either id is null/empty or the
     *         pages share no categories
     */
    public double compare(String id1, String id2) {
        double sim = 0;
        if (id1 == null || id2 == null || id1.equals("") || id2.equals("")) {
            return sim;
        }
        List<String> categories1 = getCategoryIds(id1);
        List<String> categories2 = getCategoryIds(id2);
        double intersection = 0;
        for (String cid : categories1) {
            if (categories2.contains(cid)) {
                intersection++;
            }
        }
        // intersection > 0 implies both lists are non-empty, so the
        // denominator is always positive here.
        if (intersection > 0) {
            sim = intersection * 2 / (categories1.size() + categories2.size());
        }
        return sim;
    }

    /**
     * Builds the page index from the page_indexed table in batches of 1000
     * rows, starting at the given row offset and stopping at 6,700,000.
     * Each document stores text/id (searchable) plus inlinks/type (stored
     * only). The writer is always closed, even when indexing fails.
     *
     * @param index starting row offset into page_indexed
     */
    public void indexPages(int index) {
        ResultSet rs = null;
        IndexWriter iwriter = null;
        try {
            // mkdirs: also create any missing parent directories.
            new File(PAGE_INDEX_PATH).mkdirs();
            Directory directory = new SimpleFSDirectory(new File(
                    PAGE_INDEX_PATH));
            iwriter = new IndexWriter(directory, ANALYZER, true,
                    MaxFieldLength.UNLIMITED);
            do {
                // Offset pagination; `index` is a batch offset, not user input.
                // (Renamed from `query` — it shadowed the Query field above.)
                String sql = "SELECT * from page_indexed LIMIT " + index
                        + ", 1000";
                try {
                    rs = db.execute(sql);
                    while (rs.next()) {
                        String id = rs.getString("page_id");
                        String data = rs.getString("page_title");
                        String inlinks = rs.getString("inlinks");
                        String type = rs.getString("page_type");
                        try {
                            Document doc = new Document();
                            doc.add(new Field("text", data, Field.Store.YES,
                                    Field.Index.ANALYZED));
                            doc.add(new Field("id", id, Field.Store.YES,
                                    Field.Index.ANALYZED));
                            doc.add(new Field("inlinks", inlinks,
                                    Field.Store.YES, Field.Index.NO));
                            doc.add(new Field("type", type, Field.Store.YES,
                                    Field.Index.NO));
                            iwriter.addDocument(doc);
                        } catch (IOException e) {
                            // Skip the bad row but keep indexing the batch.
                            e.printStackTrace();
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                } finally {
                    closeResultSet(rs);
                }
                System.out.println(index);
                index += 1000;
                // (A periodic optimize() gated on 500,000 batches was dead
                // code — unreachable within the 6.7M-row loop — and removed.)
            } while (index < 6700000);
            iwriter.optimize();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Previously the writer leaked when an exception escaped above.
            closeWriter(iwriter);
        }
    }

    /**
     * Builds the category index from the categorylink table in batches of
     * 5000 rows, starting at the given row offset and stopping at 9,526,832.
     * Each document stores a searchable parent/child id pair. The writer is
     * always closed, even when indexing fails.
     *
     * @param index starting row offset into categorylink
     */
    public void indexCategories(int index) {
        ResultSet rs = null;
        IndexWriter iwriter = null;
        try {
            new File(CATEGORY_INDEX_PATH).mkdirs();
            Directory directory = new SimpleFSDirectory(new File(
                    CATEGORY_INDEX_PATH));
            iwriter = new IndexWriter(directory, ANALYZER, true,
                    MaxFieldLength.UNLIMITED);
            do {
                String sql = "SELECT * from categorylink LIMIT " + index
                        + ", 5000";
                try {
                    rs = db.execute(sql);
                    while (rs.next()) {
                        String parent = rs.getString("cl_parent");
                        String child = rs.getString("cl_child");
                        try {
                            Document doc = new Document();
                            doc.add(new Field("parent", parent,
                                    Field.Store.YES, Field.Index.ANALYZED));
                            doc.add(new Field("child", child, Field.Store.YES,
                                    Field.Index.ANALYZED));
                            iwriter.addDocument(doc);
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                } finally {
                    closeResultSet(rs);
                }
                System.out.println(index);
                index += 5000;
            } while (index < 9526832);
            iwriter.optimize();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeWriter(iwriter);
        }
    }

    /** Quietly closes a result set and its underlying statement. */
    private static void closeResultSet(ResultSet rs) {
        if (rs != null) {
            try {
                rs.close();
                db.closeStmt();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /** Quietly closes an index writer; null-safe. */
    private static void closeWriter(IndexWriter iwriter) {
        if (iwriter != null) {
            try {
                iwriter.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /** Ad-hoc smoke test: loads the indices and times a sample search. */
    public static void main(String[] args) {
        LuceneStore s = new LuceneStore();
        // s.indexPages(0);
        // s.indexCategories(0);
        s.loadIndices();
        long start = System.currentTimeMillis();
        List<WikipediaEntity> results = s.search("google");
        for (WikipediaEntity e : results) {
            System.out.println("\t" + e.getWikiminerID() + "\t"
                    + e.getCommonness());
        }
        System.out.println(System.currentTimeMillis() - start);
    }
}