/*
* DrakkarKeel - An Enterprise Collaborative Search Platform
*
* The contents of this file are subject under the terms described in the
* DRAKKARKEEL_LICENSE file included in this distribution; you may not use this
* file except in compliance with the License.
*
* 2013-2014 DrakkarKeel Platform.
*/
package drakkar.mast.retrieval;
import drakkar.oar.DocumentMetaData;
import drakkar.oar.facade.event.FacadeDesktopListener;
import static drakkar.oar.util.KeyField.*;
import static drakkar.oar.util.KeyMessage.*;
import drakkar.oar.util.KeySearchable;
import drakkar.oar.util.OutputMonitor;
import drakkar.oar.util.Utilities;
import drakkar.mast.IndexException;
import drakkar.mast.SearchException;
import drakkar.mast.recommender.CollectionInfo;
import drakkar.mast.recommender.DocInfo;
import drakkar.mast.recommender.DocTermInfo;
import drakkar.mast.recommender.TermInfo;
import drakkar.mast.retrieval.analysis.NGramAnalyzer;
import drakkar.mast.retrieval.analysis.NGramAnalyzerCaseSensitive;
import drakkar.mast.retrieval.analysis.NGramQuery;
import drakkar.mast.retrieval.analysis.StopStemAnalyzer;
import drakkar.mast.retrieval.analysis.StopStemAnalyzerCaseSensitive;
import drakkar.mast.retrieval.analysis.WikiAnalyzer;
import drakkar.mast.retrieval.analysis.WikiCaseSensitiveAnalyzer;
import drakkar.mast.retrieval.parser.JavaParser;
import drakkar.mast.retrieval.parser.PdfParser;
import com.thoughtworks.qdox.parser.ParseException;
import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
* Search engine context backed by Apache Lucene.
*/
public class LuceneContext extends AdvEngineContext {
// Searcher built over the currently opened reader; the search methods use it to run queries.
private IndexSearcher searcher;
// Set by safeToBuildIndex: true only when new files are appended to an existing index (ADD_INDEX).
private boolean appendIndex;
// NOTE(review): presumably the writers for the main index and the LSI index; not used in the visible code - confirm.
private IndexWriter writer;
private IndexWriter writerLSI;
// NOTE(review): presumably the backing fields of the get/setFieldAnalyzer and get/setFieldAnalyzerCS accessors - confirm.
private PerFieldAnalyzerWrapper fieldAnalyzer;
private PerFieldAnalyzerWrapper fieldAnalyzerCS;
// Hits (topDocs.scoreDocs) of the most recent query.
private ScoreDoc[] scoreDoc;
private ScoreDoc scoreDocObj;
// Most recently built query, and the per-field queries that are combined into it.
private Query queryq;
private Query[] queryall;
// Index directory and reader opened from indexPath by the search methods.
private Directory directory;
private IndexReader reader;
// Raw result of the most recent searcher.search(...) call.
private TopDocs topDocs;
// NOTE(review): the three fields below are not used in the visible code; presumably highlighting helpers - confirm.
private Highlighter hg;
private Document docum = null;
private TokenStream tokens;
/**
 * Creates a context that points at the default Lucene index locations
 * and starts with LSI disabled.
 */
public LuceneContext() {
    this.applyLSI = false;
    this.defaultIndexLSIPath = "./index/lsi/lucene/";
    this.defaultIndexPath = "./index/lucene/";
}
/**
 * Creates a context bound to a listener, pointing at the default Lucene
 * index locations and starting with LSI disabled.
 *
 * @param listener listener of the processes carried out by this engine
 */
public LuceneContext(FacadeDesktopListener listener) {
    super(listener);
    this.applyLSI = false;
    this.defaultIndexLSIPath = "./index/lsi/lucene/";
    this.defaultIndexPath = "./index/lucene/";
}
/**
 * {@inheritDoc}
 * <p>
 * Builds one {@link NGramQuery} per searched field (all comments, all source,
 * document text, name and book), combines them and requests the top 1000
 * hits. When {@code caseSensitive} is set, the case-sensitive field names and
 * analyzers are used instead.
 *
 * @param query         text to search for
 * @param caseSensitive {@code true} to search the case-sensitive fields
 * @return the results after {@code deleteRepeated}; an empty list (never
 *         {@code null}) when no index exists at the configured index path
 * @throws SearchException if the index cannot be opened or read
 */
@Override
public ArrayList<DocumentMetaData> search(String query, boolean caseSensitive) throws SearchException {
    this.finalMetaResult = new ArrayList<DocumentMetaData>();
    // Start from the empty list so that a missing index yields "no results"
    // instead of a NullPointerException on the final size() call.
    ArrayList<DocumentMetaData> finalResultsList = this.finalMetaResult;
    this.scoreDoc = null;
    this.queryq = null;
    this.queryall = new Query[5];
    final int[] searchedFields = {FIELD_CODE_ALL_COMMENTS, FIELD_CODE_ALL_SOURCE, FIELD_DOC_TEXT, FIELD_NAME, FIELD_DOC_BOOK};
    try {
        setStartTimeOfSearch(new Date());
        // Open the directory once and reuse it for the existence check and the reader.
        Directory indexDirectory = FSDirectory.open(this.indexPath);
        if (IndexReader.indexExists(indexDirectory)) {
            this.directory = indexDirectory;
            this.reader = IndexReader.open(this.directory);
            this.searcher = new IndexSearcher(this.reader);
            try {
                PerFieldAnalyzerWrapper analyzer;
                if (caseSensitive) {
                    this.setFieldAnalyzerCS(new PerFieldAnalyzerWrapper(new StopStemAnalyzerCaseSensitive()));
                    analyzer = this.getFieldAnalyzerCS();
                    analyzer.addAnalyzer(getDocumentFieldCS(FIELD_CODE_ALL_SOURCE), new NGramAnalyzerCaseSensitive());
                    analyzer.addAnalyzer(getDocumentFieldCS(FIELD_NAME), new NGramAnalyzerCaseSensitive());
                } else {
                    this.setFieldAnalyzer(new PerFieldAnalyzerWrapper(new StopStemAnalyzer()));
                    analyzer = this.getFieldAnalyzer();
                    analyzer.addAnalyzer(getDocumentField(FIELD_CODE_ALL_SOURCE), new NGramAnalyzer());
                    analyzer.addAnalyzer(getDocumentField(FIELD_NAME), new NGramAnalyzer());
                }
                for (int i = 0; i < searchedFields.length; i++) {
                    String fieldToProcess = caseSensitive
                            ? getDocumentFieldCS(searchedFields[i])
                            : getDocumentField(searchedFields[i]);
                    this.queryq = new NGramQuery(analyzer, query, fieldToProcess);
                    this.queryall[i] = this.queryq;
                    // NOTE(review): queryall still holds null slots for the fields not
                    // processed yet when combine() is called; presumably combine
                    // tolerates that - confirm.
                    this.queryq = this.queryq.combine(this.queryall);
                }
                // Finds the top n hits for query, applying filter if non-null.
                this.topDocs = this.searcher.search(this.queryq, null, 1000);
                int totalHits = this.topDocs.totalHits;
                this.retrievedDocsCount = totalHits;
                this.scoreDoc = this.topDocs.scoreDocs;
                setEndTimeOfSearch(new Date());
                String message = "Lucene retrieved " + totalHits + " document(s) (in " + getSearchTime() + " milliseconds) that matched query '" + query + "'.";
                OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
                this.notifyTaskProgress(INFORMATION_MESSAGE, message);
                // Save results, then drop repeated entries.
                this.finalMetaResult = saveResults(this.scoreDoc, caseSensitive, this.queryq);
                if (this.finalMetaResult.size() > 1) {
                    deleteRepeated(this.finalMetaResult);
                }
                finalResultsList = this.finalMetaResult;
            } catch (IOException ex) {
                String message = "Class: SearchEngineLucene." + " Method: searchFiles(String query)." + " Error: " + ex.getMessage();
                this.notifyTaskProgress(ERROR_MESSAGE, message);
                throw new SearchException(ex.getMessage());
            }
        } else {
            OutputMonitor.printLine("Index path incorrect", OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, "Index path incorrect");
        }
    } catch (IOException ex) {
        // CorruptIndexException is an IOException and was handled identically.
        this.notifyTaskProgress(ERROR_MESSAGE, ex.getMessage());
        throw new SearchException(ex.getMessage());
    }
    this.retrievedDocsCount += finalResultsList.size();
    return finalResultsList;
}
/**
 * {@inheritDoc}
 * <p>
 * Runs the unrestricted search over the whole collection and then keeps only
 * the documents accepted by {@code filterMetaDocuments} for {@code docType}.
 *
 * @param query         text to search for
 * @param docType       document type used to filter the results
 * @param caseSensitive {@code true} to search the case-sensitive fields
 * @return the filtered results; an empty list (never {@code null}) when no
 *         index exists at the configured index path
 * @throws SearchException if the index cannot be opened or read
 */
@Override
public ArrayList<DocumentMetaData> search(String query, String docType, boolean caseSensitive) throws SearchException {
    // Empty by default so a missing index no longer ends in a NullPointerException.
    ArrayList<DocumentMetaData> finalResultsList = new ArrayList<DocumentMetaData>();
    try {
        setStartTimeOfSearch(new Date());
        if (IndexReader.indexExists(FSDirectory.open(this.indexPath))) {
            // The delegated search opens its own directory/reader/searcher; opening
            // them here as well only leaked a reader that was replaced right away.
            ArrayList<DocumentMetaData> tempList = search(query, caseSensitive); // searches the whole document collection
            finalResultsList = this.filterMetaDocuments(docType, tempList);
            this.finalMetaResult = finalResultsList;
            setEndTimeOfSearch(new Date());
            String message = "Lucene retrieved " + finalResultsList.size() + " document(s) (in " + getSearchTime() + " milliseconds) that matched query '" + query + "'. for doctype " + docType;
            OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
            this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        } else {
            OutputMonitor.printLine("Index path incorrect", OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, "Index path incorrect");
        }
    } catch (IOException ex) {
        // CorruptIndexException is an IOException and was handled identically.
        OutputMonitor.printStream("", ex);
        this.notifyTaskProgress(ERROR_MESSAGE, ex.getMessage());
        throw new SearchException(ex.getMessage());
    }
    this.retrievedDocsCount += finalResultsList.size();
    return finalResultsList;
}
/**
 * {@inheritDoc}
 * <p>
 * Runs the single-doctype search once per entry of {@code docTypes} and
 * merges the partial results into one list.
 *
 * @param query         text to search for
 * @param docTypes      document types to search; {@code null} yields an empty result
 * @param caseSensitive {@code true} to search the case-sensitive fields
 * @return the merged results after {@code deleteRepeated}, never {@code null}
 * @throws SearchException if any of the delegated searches fails
 */
@Override
public ArrayList<DocumentMetaData> search(String query, String[] docTypes, boolean caseSensitive) throws SearchException {
    // The accumulator was never instantiated before, so the first addAll() always failed.
    ArrayList<DocumentMetaData> finalResultsList = new ArrayList<DocumentMetaData>();
    setStartTimeOfSearch(new Date());
    if (docTypes != null) {
        for (String docType : docTypes) {
            ArrayList<DocumentMetaData> partial = search(query, docType, caseSensitive);
            if (partial != null) {
                finalResultsList.addAll(partial);
            }
        }
    }
    if (finalResultsList.size() > 1) {
        // De-duplicate the merged list itself, not the unrelated finalMetaResult field.
        deleteRepeated(finalResultsList);
    }
    // Close the timing window before the elapsed time is read for the message.
    setEndTimeOfSearch(new Date());
    String message = "Lucene retrieved " + finalResultsList.size() + " document(s) (in " + getSearchTime() + " milliseconds) that matched query '" + query + "'. for doctypes ";
    OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
    this.notifyTaskProgress(INFORMATION_MESSAGE, message);
    this.retrievedDocsCount += finalResultsList.size();
    return finalResultsList;
}
/**
 * {@inheritDoc}
 * <p>
 * Builds a single {@link NGramQuery} over the index field mapped to
 * {@code field} and requests the top 1000 hits. When {@code caseSensitive}
 * is set, the case-sensitive field name and analyzers are used instead.
 *
 * @param query         text to search for
 * @param field         field constant, translated through
 *                      {@code getDocumentField}/{@code getDocumentFieldCS}
 * @param caseSensitive {@code true} to search the case-sensitive field
 * @return the results after {@code deleteRepeated}; an empty list (never
 *         {@code null}) when no index exists at the configured index path
 * @throws SearchException if the index cannot be opened or read
 */
@Override
public ArrayList<DocumentMetaData> search(String query, int field, boolean caseSensitive) throws SearchException {
    // Empty by default so a missing index no longer ends in a NullPointerException.
    ArrayList<DocumentMetaData> finalResultsList = new ArrayList<DocumentMetaData>();
    try {
        // Open the directory once and reuse it for the existence check and the reader.
        Directory indexDirectory = FSDirectory.open(this.indexPath);
        if (IndexReader.indexExists(indexDirectory)) {
            this.directory = indexDirectory;
            this.reader = IndexReader.open(this.directory);
            this.searcher = new IndexSearcher(this.reader);
            this.finalMetaResult = new ArrayList<DocumentMetaData>();
            this.queryq = null;
            setStartTimeOfSearch(new Date());
            try {
                if (caseSensitive) {
                    this.setFieldAnalyzerCS(new PerFieldAnalyzerWrapper(new StopStemAnalyzerCaseSensitive()));
                    this.getFieldAnalyzerCS().addAnalyzer(getDocumentFieldCS(FIELD_CODE_ALL_SOURCE), new NGramAnalyzerCaseSensitive());
                    this.getFieldAnalyzerCS().addAnalyzer(getDocumentFieldCS(FIELD_NAME), new NGramAnalyzerCaseSensitive());
                    String fieldToProcess = getDocumentFieldCS(field);
                    this.queryq = new NGramQuery(this.getFieldAnalyzerCS(), query, fieldToProcess);
                } else {
                    this.setFieldAnalyzer(new PerFieldAnalyzerWrapper(new StopStemAnalyzer()));
                    this.getFieldAnalyzer().addAnalyzer(getDocumentField(FIELD_CODE_ALL_SOURCE), new NGramAnalyzer());
                    this.getFieldAnalyzer().addAnalyzer(getDocumentField(FIELD_NAME), new NGramAnalyzer());
                    String fieldToProcess = getDocumentField(field);
                    this.queryq = new NGramQuery(this.getFieldAnalyzer(), query, fieldToProcess);
                }
                // Finds the top n hits for query, applying filter if non-null.
                this.topDocs = this.searcher.search(this.queryq, null, 1000);
                int totalHits = this.topDocs.totalHits;
                this.retrievedDocsCount = totalHits;
                this.scoreDoc = this.topDocs.scoreDocs;
                setEndTimeOfSearch(new Date());
                String message = "Lucene retrieved " + totalHits + " document(s) (in " + getSearchTime() + " milliseconds) that matched query '" + query + "'.";
                OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
                this.notifyTaskProgress(INFORMATION_MESSAGE, message);
                this.finalMetaResult = saveResults(this.scoreDoc, caseSensitive, this.queryq);
                if (this.finalMetaResult.size() > 1) {
                    deleteRepeated(this.finalMetaResult);
                }
                finalResultsList = this.finalMetaResult;
            } catch (IOException ex) {
                String message = "Class: SearchEngineLucene." + " Method: searchFiles(String query)." + " Error: " + ex.getMessage();
                OutputMonitor.printStream(message, ex);
                this.notifyTaskProgress(ERROR_MESSAGE, message);
                throw new SearchException(ex.getMessage());
            }
        } else {
            OutputMonitor.printLine("Index path incorrect", OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, "Index path incorrect");
        }
    } catch (IOException ex) {
        // CorruptIndexException is an IOException and was handled identically.
        OutputMonitor.printStream("", ex);
        this.notifyTaskProgress(ERROR_MESSAGE, ex.getMessage());
        throw new SearchException(ex.getMessage());
    }
    this.retrievedDocsCount += finalResultsList.size();
    return finalResultsList;
}
/**
 * {@inheritDoc}
 * <p>
 * Runs the single-field search once per entry of {@code fields} and merges
 * the partial results into one list.
 *
 * @param query         text to search for
 * @param fields        field constants to search
 * @param caseSensitive {@code true} to search the case-sensitive fields
 * @return the merged results after {@code deleteRepeated}
 * @throws SearchException if any of the delegated searches fails
 */
@Override
public ArrayList<DocumentMetaData> search(String query, int[] fields, boolean caseSensitive) throws SearchException {
    final ArrayList<DocumentMetaData> merged = new ArrayList<DocumentMetaData>();
    setStartTimeOfSearch(new Date());
    for (int currentField : fields) {
        merged.addAll(this.search(query, currentField, caseSensitive));
    }
    if (merged.size() > 1) {
        deleteRepeated(merged);
    }
    setEndTimeOfSearch(new Date());
    final String summary = "Lucene retrieved " + merged.size() + " document(s) (in " + getSearchTime() + " milliseconds) that matched query '" + query + "'. for fields ";
    OutputMonitor.printLine(summary, OutputMonitor.INFORMATION_MESSAGE);
    this.notifyTaskProgress(INFORMATION_MESSAGE, summary);
    this.retrievedDocsCount += merged.size();
    return merged;
}
/**
 * {@inheritDoc}
 * <p>
 * Runs the single-field search and then keeps only the documents accepted by
 * {@code filterMetaDocuments} for {@code docType}.
 *
 * @param query         text to search for
 * @param docType       document type used to filter the results
 * @param field         field constant to search
 * @param caseSensitive {@code true} to search the case-sensitive field
 * @return the filtered results; an empty list when no index exists or the
 *         index location cannot be inspected
 * @throws SearchException if the delegated search fails
 */
@Override
public ArrayList<DocumentMetaData> search(String query, String docType, int field, boolean caseSensitive) throws SearchException {
    ArrayList<DocumentMetaData> finalResultsList = new ArrayList<DocumentMetaData>();
    setStartTimeOfSearch(new Date());
    try {
        if (IndexReader.indexExists(FSDirectory.open(this.indexPath))) {
            // The delegated search opens its own directory/reader/searcher; opening
            // them here as well only leaked a reader that was replaced right away.
            ArrayList<DocumentMetaData> docsResult = search(query, field, caseSensitive);
            finalResultsList = this.filterMetaDocuments(docType, docsResult);
            this.finalMetaResult = finalResultsList;
            setEndTimeOfSearch(new Date());
            String message = "Lucene retrieved " + finalResultsList.size() + " document(s) (in " + getSearchTime() + " milliseconds) that matched query '" + query + "'for field and docType";
            OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
            this.notifyTaskProgress(INFORMATION_MESSAGE, message);
            this.retrievedDocsCount += finalResultsList.size();
        } else {
            // Reported as an error, in line with the other search overloads.
            OutputMonitor.printLine("Index path incorrect", OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, "Index path incorrect");
        }
    } catch (IOException ex) {
        // Kept as best-effort: the failure is reported and an empty list is returned.
        OutputMonitor.printStream("", ex);
        this.notifyTaskProgress(ERROR_MESSAGE, ex.getMessage());
    }
    return finalResultsList;
}
/**
 * {@inheritDoc}
 * <p>
 * Walks the registered documental sources. For a source equal (ignoring
 * case) to {@code docType}, every non-zero entry of {@code fields} is
 * searched and the results are accumulated; when no fields are given, the
 * doctype-only search is returned directly. For a non-matching source with
 * a {@code null} {@code docType}, the unrestricted search is returned directly.
 *
 * @param query         text to search for
 * @param docType       document type to search, may be {@code null}
 * @param fields        field constants to search, may be {@code null} or empty
 * @param caseSensitive {@code true} to search the case-sensitive fields
 * @return the accumulated results, or the result of the delegated search
 * @throws SearchException if the index cannot be opened or a delegated search fails
 */
@Override
public ArrayList<DocumentMetaData> search(String query, String docType, int[] fields, boolean caseSensitive) throws SearchException {
    final ArrayList<DocumentMetaData> documents = new ArrayList<DocumentMetaData>();
    try {
        setStartTimeOfSearch(new Date());
        if (!IndexReader.indexExists(FSDirectory.open(this.indexPath))) {
            OutputMonitor.printLine("Index path incorrect", OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, "Index path incorrect");
        } else {
            this.directory = FSDirectory.open(this.indexPath);
            this.reader = IndexReader.open(this.directory);
            this.searcher = new IndexSearcher(this.reader);
            for (int idx = 0; idx < this.documentalSource.size(); idx++) {
                final String source = this.documentalSource.get(idx);
                if (!source.equalsIgnoreCase(docType)) {
                    if (docType == null) {
                        // No doctype requested: fall back to the unrestricted search.
                        final ArrayList<DocumentMetaData> everything = search(query, caseSensitive);
                        this.retrievedDocsCount += everything.size();
                        return everything;
                    }
                    continue;
                }
                if (fields == null || fields.length == 0) {
                    // No fields requested: search by doctype only.
                    final ArrayList<DocumentMetaData> byType = search(query, docType, caseSensitive);
                    this.retrievedDocsCount += byType.size();
                    return byType;
                }
                for (int field : fields) {
                    if (field == 0) {
                        continue;
                    }
                    final ArrayList<DocumentMetaData> partial = search(query, docType, field, caseSensitive);
                    if (partial != null) {
                        documents.addAll(partial);
                    }
                }
                // Drop the repeated documents gathered across fields.
                this.deleteRepeated(documents);
            }
        }
    } catch (IOException ex) {
        // CorruptIndexException is an IOException and was handled identically.
        OutputMonitor.printStream("", ex);
        this.notifyTaskProgress(ERROR_MESSAGE, ex.getMessage());
        throw new SearchException(ex.getMessage());
    }
    setEndTimeOfSearch(new Date());
    final String summary = "Lucene retrieved " + documents.size() + " document(s) (in " + getSearchTime() + " milliseconds) that matched query '" + query + "'for fields and doctype";
    OutputMonitor.printLine(summary, OutputMonitor.INFORMATION_MESSAGE);
    this.notifyTaskProgress(INFORMATION_MESSAGE, summary);
    this.retrievedDocsCount += documents.size();
    return documents;
}
/**
 * {@inheritDoc}
 * <p>
 * For each entry of {@code docTypes}: the literal type {@code "documents"}
 * is searched by doctype only, every other type is searched by doctype and
 * {@code field}. The partial results are merged into one list.
 *
 * @param query         text to search for
 * @param docTypes      document types to search
 * @param field         field constant to search
 * @param caseSensitive {@code true} to search the case-sensitive fields
 * @return the merged results after {@code deleteRepeated}
 * @throws SearchException if any of the delegated searches fails
 */
@Override
public ArrayList<DocumentMetaData> search(String query, String[] docTypes, int field, boolean caseSensitive) throws SearchException {
    ArrayList<DocumentMetaData> documents = new ArrayList<DocumentMetaData>();
    setStartTimeOfSearch(new Date());
    for (String docType : docTypes) {
        ArrayList<DocumentMetaData> partial;
        // Constant-first comparison: a null entry no longer throws here.
        if ("documents".equals(docType)) {
            partial = search(query, docType, caseSensitive);
        } else {
            partial = this.search(query, docType, field, caseSensitive);
        }
        if (partial != null) {
            documents.addAll(partial);
        }
    }
    if (documents.size() > 1) {
        deleteRepeated(documents);
    }
    // Close the timing window before the elapsed time is read for the message.
    setEndTimeOfSearch(new Date());
    String message = "Lucene retrieved " + documents.size() + " document(s) (in " + getSearchTime() + " milliseconds) that matched query '" + query + "'. for doctypes and field ";
    OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
    this.notifyTaskProgress(INFORMATION_MESSAGE, message);
    this.retrievedDocsCount += documents.size();
    return documents;
}
/**
 * {@inheritDoc}
 * <p>
 * For each entry of {@code docTypes}: the literal type {@code "documents"}
 * is searched by doctype only, every other type is searched by doctype and
 * {@code fields}. The partial results are merged into one list.
 *
 * @param query         text to search for
 * @param docTypes      document types to search
 * @param fields        field constants to search
 * @param caseSensitive {@code true} to search the case-sensitive fields
 * @return the merged results after {@code deleteRepeated}
 * @throws SearchException if any of the delegated searches fails
 */
@Override
public ArrayList<DocumentMetaData> search(String query, String[] docTypes, int[] fields, boolean caseSensitive) throws SearchException {
    ArrayList<DocumentMetaData> documents = new ArrayList<DocumentMetaData>();
    setStartTimeOfSearch(new Date());
    for (String docType : docTypes) {
        ArrayList<DocumentMetaData> partial;
        // Constant-first comparison: a null entry no longer throws here.
        if ("documents".equals(docType)) {
            partial = search(query, docType, caseSensitive);
        } else {
            partial = this.search(query, docType, fields, caseSensitive);
        }
        if (partial != null) {
            documents.addAll(partial);
        }
    }
    if (documents.size() > 1) {
        deleteRepeated(documents);
    }
    // Close the timing window before the elapsed time is read for the message.
    setEndTimeOfSearch(new Date());
    String message = "Lucene retrieved " + documents.size() + " document(s) (in " + getSearchTime() + " milliseconds) that matched query '" + query + "'. for doctypes and fields ";
    OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
    this.notifyTaskProgress(INFORMATION_MESSAGE, message);
    this.retrievedDocsCount += documents.size();
    return documents;
}
/**
 * {@inheritDoc}
 * <p>
 * Builds the index from the default collection path into the default index
 * path (and sets the default LSI index path when LSI is enabled).
 *
 * @return the number of indexed files reported by the build
 * @throws IndexException if the default collection path does not exist, is
 *                        not a readable directory or is empty, or if the build fails
 */
@Override
public long makeIndex() throws IndexException {
    this.indexPath = new File(this.defaultIndexPath);
    if (applyLSI) {
        this.indexLSIPath = new File(this.defaultIndexLSIPath);
    }
    this.collectionPath = new File(this.defaultCollectionPath);
    // listFiles() returns null when the path is missing, is not a directory or
    // cannot be read, so one null check covers all of those without an NPE.
    File[] entries = this.collectionPath.listFiles();
    if (entries == null || entries.length == 0) {
        String message = this.collectionPath + " does not exist or is empty";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    }
    long indexedFiles = this.build(MAKE_INDEX);
    this.indexedDocsCount += indexedFiles;
    return indexedFiles;
}
/**
 * {@inheritDoc}
 * <p>
 * Builds the index from {@code collectionPath} into the default index path
 * (and sets the default LSI index path when LSI is enabled).
 *
 * @param collectionPath directory holding the documents to index
 * @return the number of indexed files reported by the build
 * @throws IndexException if {@code collectionPath} is {@code null}, does not
 *                        exist, is not a readable directory or is empty, or
 *                        if the build fails
 */
@Override
public long makeIndex(File collectionPath) throws IndexException {
    this.indexPath = new File(this.defaultIndexPath);
    if (applyLSI) {
        this.indexLSIPath = new File(this.defaultIndexLSIPath);
    }
    this.collectionPath = collectionPath;
    // listFiles() returns null when the path is missing, is not a directory or
    // cannot be read, so one null check covers all of those without an NPE.
    File[] entries = (collectionPath == null) ? null : collectionPath.listFiles();
    if (entries == null || entries.length == 0) {
        String message = collectionPath + " does not exist or is empty";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    }
    long indexedFiles = this.build(MAKE_INDEX);
    this.indexedDocsCount += indexedFiles;
    return indexedFiles;
}
/**
 * {@inheritDoc}
 * <p>
 * Builds the index from the given files into the default index path. The
 * default LSI index path is always set by this overload.
 *
 * @param collectionPath files to index
 * @return the number of indexed files reported by the build
 * @throws IndexException if the list is empty or the build fails
 */
@Override
public long makeIndex(List<File> collectionPath) throws IndexException {
    this.indexPath = new File(this.defaultIndexPath);
    this.indexLSIPath = new File(this.defaultIndexLSIPath);
    if (collectionPath.isEmpty()) {
        final String problem = "The collection does not have files";
        OutputMonitor.printLine(problem, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, problem);
        throw new IndexException(problem);
    }
    final long built = (this.indexPath != null) ? this.build(collectionPath, MAKE_INDEX) : 0;
    this.indexedDocsCount += built;
    return built;
}
/**
 * {@inheritDoc}
 * <p>
 * Builds the index from {@code collectionPath} into {@code indexPath}
 * (and sets the default LSI index path when LSI is enabled).
 *
 * @param collectionPath directory holding the documents to index
 * @param indexPath      directory where the index is written
 * @return the number of indexed files reported by the build
 * @throws IndexException if {@code collectionPath} is {@code null}, does not
 *                        exist, is not a readable directory or is empty, if
 *                        {@code indexPath} is {@code null}, or if the build fails
 */
@Override
public long makeIndex(File collectionPath, File indexPath) throws IndexException {
    this.indexPath = indexPath;
    if (applyLSI) {
        this.indexLSIPath = new File(this.defaultIndexLSIPath);
    }
    this.collectionPath = collectionPath;
    // listFiles() returns null when the path is missing, is not a directory or
    // cannot be read, so one null check covers all of those without an NPE.
    File[] entries = (collectionPath == null) ? null : collectionPath.listFiles();
    if (entries == null || entries.length == 0) {
        String message = collectionPath + " does not exist or is empty";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    }
    if (indexPath == null) {
        String message = "indexPath is null";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    }
    long indexedFiles = this.build(MAKE_INDEX);
    this.indexedDocsCount += indexedFiles;
    return indexedFiles;
}
/**
 * {@inheritDoc}
 * <p>
 * Builds the index from the given files into {@code indexPath}
 * (and sets the default LSI index path when LSI is enabled).
 *
 * @param collectionPath files to index
 * @param indexPath      directory where the index is written
 * @return the number of indexed files reported by the build
 * @throws IndexException if the list is empty, {@code indexPath} is
 *                        {@code null}, or the build fails
 */
@Override
public long makeIndex(List<File> collectionPath, File indexPath) throws IndexException {
    this.indexPath = indexPath;
    if (applyLSI) {
        this.indexLSIPath = new File(this.defaultIndexLSIPath);
    }
    if (collectionPath.isEmpty()) {
        final String problem = "The collection does not have files";
        OutputMonitor.printLine(problem, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, problem);
        throw new IndexException(problem);
    }
    if (this.indexPath == null) {
        final String problem = "indexPath is null";
        OutputMonitor.printLine(problem, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, problem);
        throw new IndexException(problem);
    }
    final long built = this.build(collectionPath, MAKE_INDEX);
    this.indexedDocsCount += built;
    return built;
}
/**
 * {@inheritDoc}
 * <p>
 * Adds the documents found in {@code collectionPath} to the index located
 * at the default index path.
 *
 * @param collectionPath directory holding the documents to add
 * @return the number of indexed files reported by the build
 * @throws IndexException if {@code collectionPath} is {@code null}, does not
 *                        exist, is not a readable directory or is empty, or
 *                        if the build fails
 */
@Override
public long updateIndex(File collectionPath) throws IndexException {
    this.indexPath = new File(this.defaultIndexPath);
    this.collectionPath = collectionPath;
    // listFiles() returns null when the path is missing, is not a directory or
    // cannot be read, so one null check covers all of those without an NPE.
    File[] entries = (collectionPath == null) ? null : collectionPath.listFiles();
    if (entries == null || entries.length == 0) {
        String message = collectionPath + " does not exist or is empty";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    }
    long indexedFiles = this.build(ADD_INDEX);
    this.indexedDocsCount += indexedFiles;
    return indexedFiles;
}
/**
 * {@inheritDoc}
 * <p>
 * Adds the given files to the index located at the default index path.
 *
 * @param collectionPath files to add
 * @return the number of indexed files reported by the build
 * @throws IndexException if the list is empty or the build fails
 */
@Override
public long updateIndex(List<File> collectionPath) throws IndexException {
    this.indexPath = new File(this.defaultIndexPath);
    if (collectionPath.isEmpty()) {
        final String problem = "The collection does not have files";
        OutputMonitor.printLine(problem, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, problem);
        throw new IndexException(problem);
    }
    final long added = (this.indexPath != null) ? this.build(collectionPath, ADD_INDEX) : 0;
    this.indexedDocsCount += added;
    return added;
}
/**
 * {@inheritDoc}
 * <p>
 * Adds the documents found in {@code collectionPath} to the index located
 * at {@code indexPath}.
 *
 * @param collectionPath directory holding the documents to add
 * @param indexPath      directory of the index to update
 * @return the number of indexed files reported by the build
 * @throws IndexException if {@code collectionPath} is {@code null}, does not
 *                        exist, is not a readable directory or is empty, if
 *                        {@code indexPath} is {@code null}, or if the build fails
 */
@Override
public long updateIndex(File collectionPath, File indexPath) throws IndexException {
    this.indexPath = indexPath;
    this.collectionPath = collectionPath;
    // listFiles() returns null when the path is missing, is not a directory or
    // cannot be read, so one null check covers all of those without an NPE.
    File[] entries = (collectionPath == null) ? null : collectionPath.listFiles();
    if (entries == null || entries.length == 0) {
        String message = collectionPath + " does not exist or is empty";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    }
    if (indexPath == null) {
        String message = "IndexPath is null";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    }
    long indexedFiles = this.build(ADD_INDEX);
    this.indexedDocsCount += indexedFiles;
    return indexedFiles;
}
/**
 * {@inheritDoc}
 * <p>
 * Adds the given files to the index located at {@code indexPath}.
 *
 * @param collectionPath files to add
 * @param indexPath      directory of the index to update
 * @return the number of indexed files reported by the build
 * @throws IndexException if the list is empty, {@code indexPath} is
 *                        {@code null}, or the build fails
 */
@Override
public long updateIndex(List<File> collectionPath, File indexPath) throws IndexException {
    this.indexPath = indexPath;
    if (collectionPath.isEmpty()) {
        final String problem = "The collection does not have files";
        OutputMonitor.printLine(problem, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, problem);
        throw new IndexException(problem);
    }
    if (this.indexPath == null) {
        final String problem = "indexPath is null";
        OutputMonitor.printLine(problem, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, problem);
        throw new IndexException(problem);
    }
    final long added = this.build(collectionPath, ADD_INDEX);
    this.indexedDocsCount += added;
    return added;
}
/**
 * {@inheritDoc}
 * <p>
 * Verifies that a Lucene index exists at the default index path, reports its
 * document count to the listener, selects it as the index used for searching
 * and calls {@code initLSIManager()}.
 *
 * @return {@code true} when the index was loaded
 * @throws IndexException if no index is found at the default path or it
 *                        cannot be read
 */
@Override
public boolean loadIndex() throws IndexException {
    this.reader = null;
    File defaultFile = new File(this.defaultIndexPath);
    if (applyLSI) {
        this.indexLSIPath = new File(this.defaultIndexLSIPath);
    }
    String message;
    try {
        // isDirectory() is false for a missing path, so a separate exists() test
        // (and the old null test made after dereferencing) is not needed.
        if (!defaultFile.isDirectory() || !IndexReader.indexExists(FSDirectory.open(defaultFile))) {
            message = "Not found index in default index path";
            OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
            throw new IndexException(message);
        }
        // The reader is only needed to count the documents.
        this.reader = IndexReader.open(FSDirectory.open(defaultFile));
        int cant = this.reader.numDocs();
        this.reader.close();
        message = "Loading Lucene...";
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            OutputMonitor.printStream("", ex);
            // Restore the interrupt status so callers can still observe it.
            Thread.currentThread().interrupt();
        }
        message = "Total of documents of the index: " + cant;
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        this.notifyLoadedDocument(cant);
        // Set the path used for searching.
        this.indexPath = defaultFile;
        initLSIManager();
    } catch (IOException ex) {
        // CorruptIndexException is an IOException and was handled identically.
        message = "Class: SearchEngineLucene" + " Method: LoadIndex" + " Error: " + ex.getMessage();
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    }
    return true;
}
/**
 * {@inheritDoc}
 * <p>
 * Verifies that a Lucene index exists at {@code indexPath}, reports its
 * document count to the listener and selects it as the index used for
 * searching.
 *
 * @param indexPath directory expected to contain the index
 * @return {@code true} when the index was loaded
 * @throws IndexException if {@code indexPath} is {@code null}, holds no
 *                        index or cannot be read
 */
@Override
public boolean loadIndex(File indexPath) throws IndexException {
    String message;
    // Reject null up front: the old null test ran after indexPath had already
    // been dereferenced, so a null argument ended in a NullPointerException.
    if (indexPath == null) {
        message = "Not found index in this directory: null";
        throw new IndexException(message);
    }
    try {
        if (applyLSI) {
            this.indexLSIPath = new File(this.defaultIndexLSIPath);
        }
        this.reader = null;
        // isDirectory() is false for a missing path, so exists() is implied.
        if (!indexPath.isDirectory() || !IndexReader.indexExists(FSDirectory.open(indexPath))) {
            message = "Not found index in this directory: " + indexPath.getAbsolutePath();
            throw new IndexException(message);
        }
        // The reader is only needed to count the documents.
        this.reader = IndexReader.open(FSDirectory.open(indexPath));
        int cant = this.reader.numDocs();
        this.reader.close();
        message = "Loading Lucene...";
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            OutputMonitor.printStream("", ex);
            // Restore the interrupt status so callers can still observe it.
            Thread.currentThread().interrupt();
        }
        message = "Total of documents of the index: " + cant;
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        this.notifyLoadedDocument(cant);
        // Set the path used for searching.
        this.indexPath = indexPath;
        return true;
    } catch (IOException ex) {
        // CorruptIndexException is an IOException and was handled identically.
        message = "Class: SearchEngineLucene" + " Method: LoadIndex" + " Error: " + ex.getMessage();
        OutputMonitor.printStream(message, ex);
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    }
}
/**
 * {@inheritDoc}
 * <p>
 * Prepares {@code indexPath} for an index build:
 * <ul>
 * <li>creates the parent folder when it does not exist yet;</li>
 * <li>when an index already exists, deletes its files for
 * {@code MAKE_INDEX} or switches to append mode for {@code ADD_INDEX};</li>
 * <li>rejects {@code ADD_INDEX} when there is no index to append to.</li>
 * </ul>
 *
 * @param indexPath directory of the index to build
 * @param operation {@code MAKE_INDEX} or {@code ADD_INDEX}
 * @return {@code true} when the build may proceed
 * @throws IndexException if the parent folder cannot be created, the
 *                        operation is unknown for an existing index, or
 *                        {@code ADD_INDEX} is requested without an index
 */
@Override
public boolean safeToBuildIndex(File indexPath, int operation) throws IndexException {
    try {
        this.appendIndex = false;
        String idxpath = indexPath.getPath();
        File dir = indexPath.getParentFile();
        if (dir == null) {
            // A single-component relative path such as "index" has no parent;
            // resolve it against the working directory instead of failing with an NPE.
            dir = indexPath.getAbsoluteFile().getParentFile();
        }
        String message;
        if (dir != null && !dir.exists()) {
            // Ensure that the index folder exists.
            if (!dir.mkdirs()) {
                message = "Could not create the index folders at: " + dir.getPath() + ".\n" + "Aborting indexing process.";
                OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
                this.notifyTaskProgress(ERROR_MESSAGE, message);
                throw new IndexException(message);
            }
        } else if (IndexReader.indexExists(FSDirectory.open(indexPath))) {
            switch (operation) {
                case MAKE_INDEX:
                    message = "Overwriting index " + idxpath + "\n";
                    OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
                    this.notifyTaskProgress(INFORMATION_MESSAGE, message);
                    deleteFiles(indexPath);
                    break;
                case ADD_INDEX:
                    message = "Appending new files to index " + idxpath + "\n";
                    OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
                    this.notifyTaskProgress(INFORMATION_MESSAGE, message);
                    this.appendIndex = true;
                    break;
                default:
                    message = "Not building index " + idxpath + "\n";
                    OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
                    this.notifyTaskProgress(ERROR_MESSAGE, message);
                    throw new IndexException(message);
            }
        } else if (operation == ADD_INDEX) {
            // Separator added so the path no longer runs into the text.
            message = "Not exist Lucene index in this address " + indexPath;
            this.notifyTaskProgress(ERROR_MESSAGE, message);
            throw new IndexException(message);
        }
    } catch (IOException ex) {
        // NOTE(review): an I/O failure while inspecting the index is only logged and
        // the method still answers true, as before - confirm this is intended.
        OutputMonitor.printStream("", ex);
    }
    // Every failing path above throws, so reaching this point means "go ahead".
    return true;
}
/**
 * {@inheritDoc}
 * <p>
 * Maps a field constant to the name of the corresponding index field.
 *
 * @param field field constant
 * @return the index field name, or {@code null} for an unknown constant
 */
@Override
public String getDocumentField(int field) {
    String fieldName;
    switch (field) {
        case FIELD_FILEPATH:
            fieldName = "filepath";
            break;
        case FIELD_NAME:
            fieldName = "name";
            break;
        case FIELD_CODE_PACKAGE:
            fieldName = "package";
            break;
        case FIELD_CODE_CLASSES_NAMES:
            fieldName = "classesnames";
            break;
        case FIELD_CODE_METHODS_NAMES:
            fieldName = "methodsnames";
            break;
        case FIELD_CODE_ALL_COMMENTS:
            fieldName = "allcomments";
            break;
        case FIELD_CODE_ALL_SOURCE:
            // whole content of the source code
            fieldName = "allsource";
            break;
        case FIELD_CODE_VARIABLES_NAMES:
            fieldName = "classesvariables";
            break;
        case FIELD_CODE_JAVADOCS:
            fieldName = "javadocs";
            break;
        case FIELD_DOC_TEXT:
            // whole content of the text document
            fieldName = "content";
            break;
        case FIELD_DOC_BOOK:
            fieldName = "book";
            break;
        case AUTHOR_DOCUMENTS:
            fieldName = "author";
            break;
        case LAST_MODIFIED_DOCUMENTS:
            fieldName = "lastModified";
            break;
        default:
            fieldName = null;
            break;
    }
    return fieldName;
}
/**
 * Translates a field key into the name of the corresponding
 * case-sensitive index document field (the regular name with a
 * {@code cs} suffix).
 *
 * @param field one of the field key constants handled below
 * @return the case-sensitive field name, or {@code null} for a key that
 * has no case-sensitive variant
 */
public String getDocumentFieldCS(int field) {
    String fieldName;
    switch (field) {
        case FIELD_FILEPATH:
            fieldName = "filepathcs";
            break;
        case FIELD_NAME:
            fieldName = "namecs";
            break;
        case FIELD_CODE_PACKAGE:
            fieldName = "packagecs";
            break;
        case FIELD_CODE_CLASSES_NAMES:
            fieldName = "classesnamescs";
            break;
        case FIELD_CODE_METHODS_NAMES:
            fieldName = "methodsnamescs";
            break;
        case FIELD_CODE_ALL_COMMENTS:
            fieldName = "allcommentscs";
            break;
        case FIELD_CODE_ALL_SOURCE:
            fieldName = "allsourcecs";
            break;
        case FIELD_CODE_VARIABLES_NAMES:
            fieldName = "classesvariablescs";
            break;
        case FIELD_CODE_JAVADOCS:
            fieldName = "javadocscs";
            break;
        case FIELD_DOC_TEXT:
            fieldName = "contentcs";
            break;
        case FIELD_DOC_BOOK:
            fieldName = "bookcs";
            break;
        default:
            fieldName = null;
            break;
    }
    return fieldName;
}
/**
 * Builds the index from the default collection ({@code this.collectionPath}).
 * <p>
 * Opens the main writer (and the LSI writer when {@code applyLSI} is set)
 * in append or create mode depending on what {@code safeToBuildIndex}
 * decided, indexes the collection, optimizes and closes the writers, and
 * finally triggers the LSI manager initialisation.
 *
 * @param operation operation to perform: {@code MAKE_INDEX} or {@code ADD_INDEX}
 * @return the number of files reported as indexed by {@code indexDocs}
 * @throws IndexException if the index location is not usable or an I/O
 * error occurs while writing the index
 */
private long build(int operation) throws IndexException {
    long indexedFiles = 0;
    String message = "Lucene index will be created at [" + this.indexPath + "]";
    OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
    this.notifyTaskProgress(INFORMATION_MESSAGE, message);
    // Writers opened by this call that still have to be closed.
    IndexWriter pendingWriter = null;
    IndexWriter pendingWriterLSI = null;
    // start indexing
    try {
        if (safeToBuildIndex(this.indexPath, operation)) {
            setStartTimeOfIndexation(new Date());
            // NOTE: the field analyzer is not reset here (changed 2012-11-12);
            // the writer uses whatever getFieldAnalyzer() currently returns.
            // appendIndex == true  -> add new docs to the existing index
            // appendIndex == false -> create or overwrite the index
            final boolean create = !this.appendIndex;
            pendingWriter = new IndexWriter(FSDirectory.open(this.indexPath), this.getFieldAnalyzer(), create, IndexWriter.MaxFieldLength.UNLIMITED);
            this.writer = pendingWriter;
            if (applyLSI) {
                pendingWriterLSI = new IndexWriter(FSDirectory.open(this.indexLSIPath), new PerFieldAnalyzerWrapper(new StopStemAnalyzer()), create, IndexWriter.MaxFieldLength.UNLIMITED);
                this.writerLSI = pendingWriterLSI;
            }
            indexedFiles = indexDocs(this.writer, this.writerLSI, this.collectionPath, operation);
            message = "Optimizing...";
            OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
            this.notifyTaskProgress(INFORMATION_MESSAGE, message);
            this.writer.optimize();
            this.writer.close();
            pendingWriter = null;
            if (applyLSI) {
                this.writerLSI.optimize();
                this.writerLSI.close();
                pendingWriterLSI = null;
            }
            setEndTimeOfIndexation(new Date());
            message = "Indexation Time " + this.getIndexationTime() + " milliseconds.";
            OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
            this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        }
    } catch (IOException e) {
        message = " caught a " + e.getClass() + "\n with message: " + e.getMessage() + ".";
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    } finally {
        // If indexing was aborted, still close the writers so the index
        // write locks are released. A failure while closing is only logged
        // so that it does not mask the exception that aborted the build.
        for (IndexWriter pending : new IndexWriter[]{pendingWriter, pendingWriterLSI}) {
            if (pending != null) {
                try {
                    pending.close();
                } catch (IOException closeEx) {
                    OutputMonitor.printStream("", closeEx);
                }
            }
        }
    }
    initLSIManager(); // initialise the LSI matrix
    return indexedFiles;
}
/**
 * When LSI is enabled, loads the collection information on a separate
 * thread and hands it to the LSI manager. A failure while reading the
 * index is only logged.
 */
private void initLSIManager() {
    if (!applyLSI) {
        return;
    }
    Runnable loader = new Runnable() {
        public void run() {
            try {
                lsiManager.setInitValues(getCollectionInfo());
            } catch (IndexException ex) {
                OutputMonitor.printStream("Reading LSI index.", ex);
            }
        }
    };
    new Thread(loader).start();
}
/**
 * Builds the index from an explicit list of files.
 * <p>
 * Resets the field analyzer (n-gram by default, stop/stem for the
 * all-comments field), opens the main writer (and the LSI writer when
 * {@code applyLSI} is set) in append or create mode depending on what
 * {@code safeToBuildIndex} decided, indexes the files, optimizes and
 * closes the writers, and finally triggers the LSI manager initialisation.
 *
 * @param collectionPath list of files that make up the collection
 * @param operation operation to perform: {@code MAKE_INDEX} or {@code ADD_INDEX}
 * @return the number of files indexed
 * @throws IndexException if the index location is not usable or an I/O
 * error occurs while writing the index
 */
private long build(List<File> collectionPath, int operation) throws IndexException {
    long indexedFiles = 0;
    String message = "Indexing to directory '" + this.indexPath + "'...";
    OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
    this.notifyTaskProgress(INFORMATION_MESSAGE, message);
    // Writers opened by this call that still have to be closed.
    IndexWriter pendingWriter = null;
    IndexWriter pendingWriterLSI = null;
    // start indexing
    try {
        setStartTimeOfIndexation(new Date());
        if (safeToBuildIndex(this.indexPath, operation)) {
            this.setFieldAnalyzer(new PerFieldAnalyzerWrapper(new NGramAnalyzer()));
            this.getFieldAnalyzer().addAnalyzer(getDocumentField(FIELD_CODE_ALL_COMMENTS), new StopStemAnalyzer());
            // appendIndex == true  -> add docs to an existing index
            // appendIndex == false -> create or overwrite the index
            final boolean create = !this.appendIndex;
            pendingWriter = new IndexWriter(FSDirectory.open(this.indexPath), this.getFieldAnalyzer(), create, IndexWriter.MaxFieldLength.UNLIMITED);
            this.writer = pendingWriter;
            if (applyLSI) {
                pendingWriterLSI = new IndexWriter(FSDirectory.open(this.indexLSIPath), new PerFieldAnalyzerWrapper(new StopStemAnalyzer()), create, IndexWriter.MaxFieldLength.UNLIMITED);
                this.writerLSI = pendingWriterLSI;
            }
            indexedFiles = indexDocs(this.writer, this.writerLSI, collectionPath, operation);
            message = "Optimizing...";
            OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
            this.notifyTaskProgress(INFORMATION_MESSAGE, message);
            this.writer.optimize();
            this.writer.close();
            pendingWriter = null;
            if (applyLSI) {
                this.writerLSI.optimize();
                this.writerLSI.close();
                pendingWriterLSI = null;
            }
            setEndTimeOfIndexation(new Date());
            message = "Indexation Time " + this.getIndexationTime() + " milliseconds.";
            OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
            this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        }
    } catch (IOException e) {
        message = " caught a " + e.getClass() + "\n with message: " + e.getMessage() + ".";
        this.notifyTaskProgress(ERROR_MESSAGE, message);
        throw new IndexException(message);
    } finally {
        // If indexing was aborted, still close the writers so the index
        // write locks are released. A failure while closing is only logged
        // so that it does not mask the exception that aborted the build.
        for (IndexWriter pending : new IndexWriter[]{pendingWriter, pendingWriterLSI}) {
            if (pending != null) {
                try {
                    pending.close();
                } catch (IOException closeEx) {
                    OutputMonitor.printStream("", closeEx);
                }
            }
        }
    }
    initLSIManager();
    return indexedFiles;
}
/**
 * Indexes every supported file of the given list.
 * <p>
 * A file is supported when its name ends with {@code .java}, {@code .pdf},
 * {@code .txt} or {@code .xml}; any other file is only reported through
 * the output monitor and the task-progress notification.
 *
 * @param writer writer of the main index
 * @param writerLSI writer of the LSI index
 * @param list files to index
 * @param operation operation being performed, forwarded to {@code indexFile}
 * @return the number of supported files handed to {@code indexFile}
 * @throws IndexException if indexing a file fails
 */
private int indexDocs(IndexWriter writer, IndexWriter writerLSI, List<File> list, int operation) throws IndexException {
    int docCount = 0;
    for (File candidate : list) {
        final String fileName = candidate.getName();
        final boolean supported = fileName.endsWith(".java")
                || fileName.endsWith(".pdf")
                || fileName.endsWith(".txt")
                || fileName.endsWith(".xml");
        String message;
        if (!supported) {
            message = "There are files in the collection that are not: .java, pdf, txt o xml documents" + "\n" + "so, they could not be indexed.";
        } else {
            indexFile(writer, writerLSI, candidate, operation);
            message = "Adding: " + candidate;
            docCount++;
        }
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
    }
    return docCount;
}
/**
 * Indexes a file, or recursively every supported file below a directory.
 * <p>
 * A file is supported when its name ends with {@code .java}, {@code .pdf},
 * {@code .txt} or {@code .xml}; any other file is only reported.
 * Unreadable files and directories are silently skipped.
 *
 * @param writer writer of the main index
 * @param writerLSI writer of the LSI index
 * @param file file or directory to index
 * @param operation operation being performed, forwarded to {@code indexFile}
 * @return the number of supported files handed to {@code indexFile},
 * including those found in sub-directories
 * @throws IndexException if indexing a file fails
 */
private int indexDocs(IndexWriter writer, IndexWriter writerLSI, File file, int operation) throws IndexException {
    int docCount = 0;
    String message;
    if (file.canRead()) {
        if (file.isDirectory()) {
            // File.list() returns null on an I/O error, so check before
            // touching the array (the length used to be read first).
            String[] files = file.list();
            if (files != null) {
                this.indexedDocsCount = files.length;
                for (String child : files) {
                    // accumulate the recursive result; it used to be dropped,
                    // which made directory collections always report 0
                    docCount += indexDocs(writer, writerLSI, new File(file, child), operation);
                }
            }
        } else if (file.getName().endsWith(".java") || file.getName().endsWith(".pdf") || file.getName().endsWith(".txt") || file.getName().endsWith(".xml")) {
            indexFile(writer, writerLSI, file, operation);
            message = "Adding: " + file;
            docCount++;
            OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
            this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        } else {
            message = "There are files in the collection that are not: .java, pdf, txt or xml documents" + "\n" + "so, they could not be indexed.";
            OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
            this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        }
    }
    return docCount;
}
/**
 * Indexes a single file, field by field.
 * <p>
 * For every file two documents are added to the main index: one with the
 * regular field names (case-insensitive analyzers) and one with the
 * {@code cs} field names (case-sensitive analyzers). When {@code applyLSI}
 * is set, a path document and a content document are also added to the
 * LSI index. Hidden, missing or unreadable files are ignored, and so are
 * files whose extension is none of {@code .pdf}, {@code .java},
 * {@code .txt}, {@code .xml}.
 *
 * @param writer writer of the main index
 * @param writerLSI writer of the LSI index (only used when {@code applyLSI} is set)
 * @param f file to index
 * @param operation {@code ADD_INDEX} or {@code MAKE_INDEX}; selects which
 * notification is sent after the file has been indexed
 * @throws IndexException declared for callers; I/O errors raised while
 * indexing are logged and notified here rather than rethrown
 */
private void indexFile(IndexWriter writer, IndexWriter writerLSI, File f, int operation) throws IndexException {
    if (f.isHidden() || !f.exists() || !f.canRead()) {
        return;
    }
    DocumentLucene doc = null; // case-insensitive document
    DocumentLucene doccs = null; // case-sensitive document
    DocumentLucene contentDocLSI = null; // LSI document holding the content
    DocumentLucene pathDocLSI = null; // LSI document holding path and name
    try {
        // every file starts from fresh n-gram analyzers; the branches below
        // register per-field overrides where needed
        setFieldAnalyzer(new PerFieldAnalyzerWrapper(new NGramAnalyzer()));
        setFieldAnalyzerCS(new PerFieldAnalyzerWrapper(new NGramAnalyzerCaseSensitive()));
        if (f.getPath().endsWith(".pdf")) {
            doc = new DocumentLucene();
            doccs = new DocumentLucene();
            if (applyLSI) {
                contentDocLSI = new DocumentLucene();
                pathDocLSI = new DocumentLucene();
            }
            PdfParser pdfp = new PdfParser();
            try {
                pdfp.divideTextforLucene(f, doccs, doc, contentDocLSI);
            } catch (Exception e) {
                // Best effort: the file is still indexed with its metadata,
                // but the failure is no longer silently discarded.
                OutputMonitor.printStream("", e);
            }
            pdfp.analyzePdfDocument(f);
            doc.addField(getDocumentField(AUTHOR_DOCUMENTS), pdfp.getAuthor());
            doccs.addField(getDocumentField(AUTHOR_DOCUMENTS), pdfp.getAuthor());
            try {
                String date = DateFormat.getDateInstance().format(pdfp.getCalModification().getTime());
                doc.addField(getDocumentField(LAST_MODIFIED_DOCUMENTS), date);
                doccs.addField(getDocumentField(LAST_MODIFIED_DOCUMENTS), date);
            } catch (Exception ex) {
                // no usable modification date: store an empty value instead
                doc.addField(getDocumentField(LAST_MODIFIED_DOCUMENTS), "");
                doccs.addField(getDocumentField(LAST_MODIFIED_DOCUMENTS), "");
            }
        } else if (f.getPath().endsWith(".java")) {
            getFieldAnalyzer().addAnalyzer(getDocumentField(FIELD_CODE_ALL_COMMENTS), new StopStemAnalyzer());
            getFieldAnalyzerCS().addAnalyzer(getDocumentFieldCS(FIELD_CODE_ALL_COMMENTS), new StopStemAnalyzerCaseSensitive());
            ArrayList<String> comment = new ArrayList<String>();
            ArrayList<String> javadocs = new ArrayList<String>();
            JavaParser jp = new JavaParser();
            jp.AnalyzeDocument(f);
            doc = new DocumentLucene();
            doccs = new DocumentLucene();
            doc.addField(getDocumentField(FIELD_FILEPATH), f.getCanonicalPath());
            doccs.addField(getDocumentFieldCS(FIELD_FILEPATH), f.getCanonicalPath());
            doc.addField(getDocumentField(FIELD_NAME), f.getName());
            doccs.addField(getDocumentFieldCS(FIELD_NAME), f.getName());
            doc.addField(getDocumentField(FIELD_CODE_ALL_SOURCE), jp.getAllSource());
            doccs.addField(getDocumentFieldCS(FIELD_CODE_ALL_SOURCE), jp.getAllSource());
            if (jp.getClassPackage() != null) {
                doc.addField(getDocumentField(FIELD_CODE_PACKAGE), jp.getClassPackage());
                doccs.addField(getDocumentFieldCS(FIELD_CODE_PACKAGE), jp.getClassPackage());
            }
            for (int i = 0; i < jp.getClassNumber(); i++) {
                if (jp.getClassesComments(i) != null) {
                    comment.add(jp.getClassesComments(i));
                }
                if (jp.getClassesJDocs(i) != null) {
                    javadocs.add(jp.getClassesJDocs(i));
                }
                doc.addField(getDocumentField(FIELD_CODE_CLASSES_NAMES), jp.getClassesNames(i));
                doccs.addField(getDocumentFieldCS(FIELD_CODE_CLASSES_NAMES), jp.getClassesNames(i));
                for (int l = 0; l < jp.getClassVariableNumber(); l++) {
                    doc.addField(getDocumentField(FIELD_CODE_VARIABLES_NAMES), jp.getClassesVarName(i, l));
                    doccs.addField(getDocumentFieldCS(FIELD_CODE_VARIABLES_NAMES), jp.getClassesVarName(i, l));
                    if (jp.getClassesCommentVariables(i, l) != null) {
                        comment.add(jp.getClassesCommentVariables(i, l));
                    }
                    if (jp.getVariablesJDocs(i, l) != null) {
                        javadocs.add(jp.getVariablesJDocs(i, l));
                    }
                }
                for (int j = 0; j < jp.getClassesMethods(i); j++) {
                    if (jp.getClassesMethodComment(i, j) != null) {
                        comment.add(jp.getClassesMethodComment(i, j));
                    }
                    if (jp.getClassesMethodJDocs(i, j) != null) {
                        javadocs.add(jp.getClassesMethodJDocs(i, j));
                    }
                    doc.addField(getDocumentField(FIELD_CODE_METHODS_NAMES), jp.getClassesMethodsName(i, j));
                    doccs.addField(getDocumentFieldCS(FIELD_CODE_METHODS_NAMES), jp.getClassesMethodsName(i, j));
                }
            }
            // join the comments
            doc.addField(getDocumentField(FIELD_CODE_ALL_COMMENTS), joinData(comment));
            doccs.addField(getDocumentFieldCS(FIELD_CODE_ALL_COMMENTS), joinData(comment));
            // join the javadocs
            doc.addField(getDocumentField(FIELD_CODE_JAVADOCS), joinData(javadocs));
            doccs.addField(getDocumentFieldCS(FIELD_CODE_JAVADOCS), joinData(javadocs));
            if (applyLSI) {
                contentDocLSI = new DocumentLucene();
                contentDocLSI.addField(getDocumentField(FIELD_CODE_ALL_SOURCE), jp.getAllSource());
                pathDocLSI = new DocumentLucene();
                pathDocLSI.addField(getDocumentField(FIELD_FILEPATH), f.getCanonicalPath());
                pathDocLSI.addField(getDocumentField(FIELD_NAME), f.getName());
            }
        } else if (f.getPath().endsWith(".txt")) {
            doc = new DocumentLucene();
            doccs = new DocumentLucene();
            doc.addField(getDocumentField(FIELD_FILEPATH), f.getCanonicalPath());
            doccs.addField(getDocumentFieldCS(FIELD_FILEPATH), f.getCanonicalPath());
            doc.addField(getDocumentField(FIELD_NAME), f.getName());
            doccs.addField(getDocumentFieldCS(FIELD_NAME), f.getName());
            String textFile = Utilities.readFile(f);
            doc.addField(getDocumentField(FIELD_DOC_TEXT), textFile);
            doccs.addField(getDocumentFieldCS(FIELD_DOC_TEXT), textFile);
            if (applyLSI) {
                contentDocLSI = new DocumentLucene();
                contentDocLSI.addField(getDocumentField(FIELD_DOC_TEXT), textFile);
                pathDocLSI = new DocumentLucene();
                pathDocLSI.addField(getDocumentField(FIELD_FILEPATH), f.getCanonicalPath());
                pathDocLSI.addField(getDocumentField(FIELD_NAME), f.getName());
            }
        } else if (f.getPath().endsWith(".xml")) {
            getFieldAnalyzer().addAnalyzer(getDocumentField(FIELD_DOC_TEXT), new WikiAnalyzer());
            getFieldAnalyzerCS().addAnalyzer(getDocumentFieldCS(FIELD_DOC_TEXT), new WikiCaseSensitiveAnalyzer());
            doc = new DocumentLucene();
            doccs = new DocumentLucene();
            doc.addField(getDocumentField(FIELD_FILEPATH), f.getCanonicalPath());
            doccs.addField(getDocumentFieldCS(FIELD_FILEPATH), f.getCanonicalPath());
            doc.addField(getDocumentField(FIELD_NAME), f.getName());
            doccs.addField(getDocumentFieldCS(FIELD_NAME), f.getName());
            String textFile = Utilities.readFile(f);
            doc.addField(getDocumentField(FIELD_DOC_TEXT), textFile);
            doccs.addField(getDocumentFieldCS(FIELD_DOC_TEXT), textFile);
            if (applyLSI) {
                contentDocLSI = new DocumentLucene();
                contentDocLSI.addField(getDocumentField(FIELD_DOC_TEXT), textFile);
                pathDocLSI = new DocumentLucene();
                pathDocLSI.addField(getDocumentField(FIELD_FILEPATH), f.getCanonicalPath());
                pathDocLSI.addField(getDocumentField(FIELD_NAME), f.getName());
            }
        }
        if (doc == null || doccs == null) {
            // No branch matched the extension: nothing was built, so there
            // is nothing to add (this used to end in a NullPointerException).
            return;
        }
        writer.addDocument(doc.getDoc(), getFieldAnalyzer());
        writer.addDocument(doccs.getDoc(), getFieldAnalyzerCS());
        if (applyLSI) {
            // NOTE(review): this replaces the shared field analyzer with the
            // LSI one until the next indexFile call resets it. Confirm no
            // other code relies on getFieldAnalyzer() in between.
            setFieldAnalyzer(new PerFieldAnalyzerWrapper(new StopStemAnalyzer()));
            writerLSI.addDocument(pathDocLSI.getDoc(), getFieldAnalyzer());
            writerLSI.addDocument(contentDocLSI.getDoc(), getFieldAnalyzer());
        }
        if (operation == ADD_INDEX) {
            this.notifyAddedDocument();
        } else if (operation == MAKE_INDEX) {
            this.notifyIndexedDocument();
        }
    } catch (IOException ex) {
        OutputMonitor.printStream("", ex);
        this.notifyTaskProgress(ERROR_MESSAGE, ex.getMessage());
    }
}
/**
 * Joins a list of text fragments (variable, method and class comments, or
 * javadocs) into one string.
 * <p>
 * The result starts with a single space and every non-null fragment is
 * appended preceded by another space, so an empty list yields {@code " "}.
 *
 * @param aa fragments to join; {@code null} entries are skipped and a
 * {@code null} list is treated as empty
 * @return the joined text, never {@code null}
 */
private String joinData(ArrayList<String> aa) {
    // StringBuilder avoids re-copying the accumulated text on every fragment
    StringBuilder joined = new StringBuilder(" ");
    if (aa != null) {
        for (String fragment : aa) {
            if (fragment != null) {
                joined.append(' ').append(fragment);
            }
        }
    }
    return joined.toString();
}
/**
 * Builds the list of {@link DocumentMetaData} for the given search hits.
 * <p>
 * For every hit the stored document is loaded and its path, name, author
 * and last-modified values are read (path and name from the {@code cs}
 * fields when {@code caseS} is set). A summary is produced by highlighting
 * the query over the stored text of the hit: the all-source field for
 * {@code java} files, the book field for {@code pdf} files and the text
 * field for {@code txt} files; other types get a blank summary.
 * <p>
 * Side effects visible in this method: {@code scoreDocObj} and
 * {@code docum} are overwritten per hit and the field analyzer is replaced.
 *
 * @param sd hits returned by the search
 * @param caseS {@code true} when the search was case sensitive
 * @param q query used for highlighting the summary
 * @return one metadata entry per hit whose stored document could be
 * loaded, in hit order
 */
private ArrayList<DocumentMetaData> saveResults(ScoreDoc[] sd, boolean caseS, Query q) {
    ArrayList<DocumentMetaData> docsfound = new ArrayList<DocumentMetaData>();
    this.setFieldAnalyzer(null);
    if (sd == null) {
        return docsfound;
    }
    for (int k = 0; k < sd.length; k++) {
        DocumentMetaData metaDoc = new DocumentMetaData();
        this.scoreDocObj = sd[k];
        // scoreDocObj.doc is the number that identifies the document in the search results
        int iddoc = this.scoreDocObj.doc;
        float score = this.scoreDocObj.score;
        // converted through its decimal text, as before, rather than widened directly
        double ss = Double.valueOf(String.valueOf(score));
        this.docum = null;
        try {
            this.docum = this.searcher.doc(iddoc);
        } catch (IOException ex) {
            // also covers CorruptIndexException, which is an IOException
            OutputMonitor.printStream("", ex);
            this.notifyTaskProgress(ERROR_MESSAGE, ex.getMessage());
        }
        if (this.docum == null) {
            // The failure was already reported; skip this hit instead of
            // failing with a NullPointerException on the next line.
            continue;
        }
        String filepath;
        String name;
        String author = this.docum.get(getDocumentField(AUTHOR_DOCUMENTS));
        String lastModified = this.docum.get(getDocumentField(LAST_MODIFIED_DOCUMENTS));
        if (!caseS) {
            filepath = this.docum.get(getDocumentField(FIELD_FILEPATH));
            name = this.docum.get(getDocumentField(FIELD_NAME));
            this.setFieldAnalyzer(new PerFieldAnalyzerWrapper(new NGramAnalyzer()));
            this.getFieldAnalyzer().addAnalyzer(getDocumentField(FIELD_CODE_ALL_COMMENTS), new StopStemAnalyzer());
        } else {
            filepath = this.docum.get(getDocumentFieldCS(FIELD_FILEPATH));
            name = this.docum.get(getDocumentFieldCS(FIELD_NAME));
            this.setFieldAnalyzer(new PerFieldAnalyzerWrapper(new NGramAnalyzerCaseSensitive()));
            this.getFieldAnalyzer().addAnalyzer(getDocumentFieldCS(FIELD_CODE_ALL_COMMENTS), new StopStemAnalyzerCaseSensitive());
        }
        String filetype = getFileExtension(filepath);
        long size = new File(filepath).length();
        // Declared per hit: these used to live outside the loop, so a hit
        // of an unhandled type inherited the previous hit's text and summary.
        String field = null;
        String textfield = null;
        if (filetype.equalsIgnoreCase("java")) {
            field = getDocumentField(FIELD_CODE_ALL_SOURCE);
            textfield = this.docum.get(field);
        } else if (filetype.equalsIgnoreCase("pdf")) {
            field = getDocumentField(FIELD_DOC_BOOK);
            textfield = this.docum.get(field);
        } else if (filetype.equalsIgnoreCase("txt")) {
            field = getDocumentField(FIELD_DOC_TEXT);
            textfield = this.docum.get(field);
        }
        String summary;
        if (textfield != null) {
            String temp = getHighlighter(q, this.getFieldAnalyzer(), textfield, field);
            summary = filterTags(filterTags(temp, "<B>"), "</B>");
        } else {
            summary = " ";
        }
        metaDoc.setName(name);
        metaDoc.setPath(filepath);
        metaDoc.setIndex(iddoc);
        metaDoc.setAuthor(author);
        metaDoc.setLastModified(lastModified);
        metaDoc.setSynthesis(summary);
        metaDoc.setSize(size);
        metaDoc.setType(filetype);
        metaDoc.setScore(ss);
        metaDoc.setSearcher(KeySearchable.LUCENE_SEARCH_ENGINE);
        docsfound.add(metaDoc);
    }
    return docsfound;
}
/**
 * Produces the summary of a text by highlighting the query terms.
 * <p>
 * Uses fragments of 20 characters, analyzes at most 600 characters of the
 * text and joins up to 20 best fragments with {@code "..."}. Highlighting
 * failures are logged and result in a blank summary.
 *
 * @param q query whose terms are highlighted
 * @param a analyzer used to tokenize the text
 * @param text text to summarize
 * @param field name of the field the text belongs to
 * @return the highlighted fragments, or {@code " "} when none could be produced
 */
private String getHighlighter(Query q, Analyzer a, String text, String field) {
    this.hg = new Highlighter(new QueryTermScorer(q));
    this.hg.setTextFragmenter(new SimpleFragmenter(20));
    this.hg.setMaxDocCharsToAnalyze(600);
    String fragments = null;
    try {
        this.tokens = TokenSources.getTokenStream(field, text, a);
        fragments = this.hg.getBestFragments(this.tokens, text, 20, "...");
    } catch (IOException ex) {
        OutputMonitor.printStream("IO", ex);
    } catch (InvalidTokenOffsetsException ex) {
        OutputMonitor.printStream("", ex);
    }
    return (fragments != null) ? fragments : " ";
}
/**
 * Removes every occurrence of a tag produced in the summary by the
 * highlighter (for example {@code <B>} or {@code </B>}).
 * <p>
 * The mark is matched literally. A text that does not contain the mark is
 * returned unchanged; it used to be replaced by an empty string, which
 * discarded the whole summary.
 *
 * @param text text to clean; {@code null} is treated as empty
 * @param mark tag to remove
 * @return the text without the mark, never {@code null}
 */
public String filterTags(String text, String mark) {
    if (text == null) {
        return "";
    }
    return text.replace(mark, "");
}
/**
 * Reads the collection information from the LSI index stored in the given
 * directory. The directory is remembered as the default LSI index path
 * for later calls.
 *
 * @param indexDirectory path of the directory that holds the LSI index
 * @return the collection information read from that index
 * @throws IndexException if no index exists there or it cannot be read
 */
public CollectionInfo getCollectionInfo(String indexDirectory) throws IndexException {
    this.defaultIndexLSIPath = indexDirectory;
    final CollectionInfo info = this.getCollectionInfo();
    return info;
}
/**
 * Obtains the term/document occurrence relation from the LSI index located
 * at {@code defaultIndexLSIPath}.
 * <p>
 * The code reads name and path from the even-numbered documents and takes
 * term postings only from the odd-numbered ones; each odd document number
 * is mapped to the position of its preceding even document in the
 * resulting document list.
 *
 * @return the collection information: postings per term, the term list
 * and the document list
 * @throws IndexException if no index exists in the directory or an error
 * occurs while reading the terms of the collection
 */
public CollectionInfo getCollectionInfo() throws IndexException {
    try {
        this.indexLSIPath = new File(defaultIndexLSIPath);
        this.directory = FSDirectory.open(this.indexLSIPath);
        // verify that an index exists in the specified directory
        if (!IndexReader.indexExists(this.directory)) {
            throw new IndexException("Index invalid. Not exist index in the directory: " + defaultIndexLSIPath);
        }
        // NOTE(review): the reader is kept in a field and is not closed
        // here. Confirm who owns its lifecycle.
        this.reader = IndexReader.open(this.directory);
        int docsCount = this.reader.numDocs();
        List<DocInfo> docInfoList = new ArrayList<DocInfo>(docsCount);
        // odd document number -> position in docInfoList
        Map<Integer, Integer> docsMap = new HashMap<Integer, Integer>();
        int index = 0;
        for (int i = 0; i < docsCount; i += 2) {
            Document doc = this.reader.document(i);
            docInfoList.add(new DocInfo(doc.get("name"), doc.get("filepath")));
            docsMap.put(i + 1, index);
            index++;
        }
        Map<TermInfo, List<DocTermInfo>> termsMap = new HashMap<TermInfo, List<DocTermInfo>>();
        List<String> termsList = new ArrayList<String>();
        int termsCount = 0;
        TermEnum terms = null;
        TermDocs docs = null;
        try {
            terms = this.reader.terms(); // all the terms of the collection index
            docs = this.reader.termDocs(); // one postings cursor, re-positioned per term
            while (terms.next()) {
                Term termItem = terms.term();
                List<DocTermInfo> list = new ArrayList<DocTermInfo>();
                docs.seek(termItem);
                while (docs.next()) {
                    int docNum = docs.doc();
                    if (docNum % 2 != 0) {
                        Integer position = docsMap.get(docNum);
                        // skip postings of documents that were not mapped
                        // above instead of failing while unboxing null
                        if (position != null) {
                            list.add(new DocTermInfo(position, docs.freq()));
                        }
                    }
                }
                if (!list.isEmpty()) {
                    termsMap.put(new TermInfo(termsCount, termItem.text(), reader.docFreq(termItem)), list);
                    termsList.add(termItem.text());
                    termsCount++;
                }
            }
        } finally {
            // release the enumerations even when reading fails
            try {
                if (terms != null) {
                    terms.close();
                }
            } finally {
                if (docs != null) {
                    docs.close();
                }
            }
        }
        return new CollectionInfo(termsMap, "Apache Lucene", termsList, docInfoList, singularValue);
    } catch (IOException ex) {
        throw new IndexException(ex.getMessage());
    }
}
/**
 * Returns the per-field analyzer wrapper currently held by this engine.
 *
 * @return the current field analyzer; may be {@code null} if none is set
 */
public PerFieldAnalyzerWrapper getFieldAnalyzer() {
    return this.fieldAnalyzer;
}
/**
* Replaces the per-field analyzer wrapper held by this engine.
*
* @param fieldAnalyzer the analyzer wrapper to store; the value is stored
* as given, including {@code null}
*/
public void setFieldAnalyzer(PerFieldAnalyzerWrapper fieldAnalyzer) {
this.fieldAnalyzer = fieldAnalyzer;
}
/**
 * Returns the case-sensitive per-field analyzer wrapper currently held by
 * this engine.
 *
 * @return the current case-sensitive field analyzer; may be {@code null}
 * if none is set
 */
public PerFieldAnalyzerWrapper getFieldAnalyzerCS() {
    return this.fieldAnalyzerCS;
}
/**
* Replaces the case-sensitive per-field analyzer wrapper held by this
* engine.
*
* @param fieldAnalyzerCS the case-sensitive analyzer wrapper to store; the
* value is stored as given
*/
public void setFieldAnalyzerCS(PerFieldAnalyzerWrapper fieldAnalyzerCS) {
this.fieldAnalyzerCS = fieldAnalyzerCS;
}
}