/*******************************************************************************
* Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package tml.storage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.sql.SQLException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Timer;
import javax.swing.event.EventListenerList;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import tml.Configuration;
import tml.annotators.Annotator;
import tml.corpus.CorpusParameters;
import tml.corpus.RepositoryCorpus;
import tml.corpus.TextDocument;
import tml.sql.DbConnection;
import tml.storage.importers.AbstractImporter;
import tml.storage.importers.Importer;
import tml.storage.importers.TextImporter;
import tml.vectorspace.NoDocumentsInCorpusException;
/**
* <p>
* This class represents a document repository. Documents can be inserted,
* deleted and searched from a Repository. All documents that were successfully
* inserted in a repository can later be used to create a {@link Corpus}
* and perform operations on them.
* </p>
* <p>
* At the heart of a repository lies a {@link TextDocument}, which represents a
* text document and is accessible using any id of your choice (e.g. from a
* database, or from the filesystem). The content of a new document is expected
* to be just plain text. Importers for different formats will be provided in
* time; for the moment we have only a Wiki cleaner.
* </p>
* <p>
* All the documents, once inserted in the Repository can then be searched using
* the searchTextDocuments method. Queries are made using the syntax from
* Apache's Lucene.
* </p>
* <p>
* <em>Code examples</em>
* </p>
* <p>
* Initialising a {@link Repository}:
* </p>
*
* <pre>
* Repository repository = new Repository("path/to/repository/folder");
* </pre>
* <p>
* Obtaining all the documents in a Repository
* </p>
*
* <pre>
* ...
* List<TextDocument> documents = repository.getAllTextDocuments();
* for(TextDocument doc : documents) {
* System.out.println("Document:" + doc.getTitle());
* }
* ...
* </pre>
* <p>
* Inserting a document
* </p>
*
* <pre>
* String content = "The content of my document";
* String title = "A title";
* String url = "http://www/mydoc.txt";
* String id = "TheIdOfMyDoc";
* repository.addDocument(id, content, title, url);
* </pre>
* <p>
* Obtaining a document from the repository
* </p>
*
* <pre>
* String id = "TheIdOfMyDoc";
* TextDocument doc = repository.getTextDocument(id);
* </pre>
* <p>
* Removing a document from the repository
* </p>
*
* <pre>
* TextDocument doc = repository.getTextDocument("someId");
* repository.deleteTextDocument(doc);
* </pre>
* <p>
* Searching for documents containing "foo"
* </p>
*
* <pre>
* String query = "foo";
* List<TextDocument> documents = repository.searchTextDocuments(query);
* for (TextDocument doc : documents) {
* System.out.println("Document found:" + doc.getTitle());
* }
* </pre>
*
* @see TextDocument
* @see Corpus
* @author Jorge Villalon
*
*/
public class Repository {
/**
 * Normalizes an id (typically a file name) before it is used as the external
 * id of a document in the Lucene index.
 * <p>
 * Spaces and underscores are removed. NOTE(review): the last replacement
 * looks for the literal two-character sequence backslash + dot, because
 * {@link String#replace(CharSequence, CharSequence)} does not interpret
 * regular expressions; plain dots are therefore kept. Removing them would
 * change the ids of documents that are already indexed, so confirm before
 * changing it.
 * </p>
 *
 * @param id
 *            the external id of a document, must not be <code>null</code>
 * @return the id without spaces and underscores
 */
public static String cleanIdForLucene(String id) {
    return id.replace(" ", "")
            .replace("_", "")
            .replace("\\.", "");
}
/**
 * Empties the storage of a {@link Repository}: the metadata storage is
 * cleaned through a new {@link DbConnection} and the Lucene index found in
 * the given folder is replaced by a new, empty one.
 *
 * @param indexPath
 *            the path to the folder where the Lucene index files are stored
 * @throws IOException
 *             if the index folder cannot be opened or written
 * @throws LockObtainFailedException
 *             if the index write lock cannot be obtained
 * @throws CorruptIndexException
 *             if the index is corrupt
 * @throws SQLException
 *             declared for the metadata storage cleanup
 */
public static void cleanStorage(String indexPath)
        throws CorruptIndexException, LockObtainFailedException,
        IOException, SQLException {
    DbConnection metadataConnection = new DbConnection();
    metadataConnection.cleanMetaDataStorage();

    File indexFolder = new File(indexPath);
    IndexWriter emptyIndexWriter = new IndexWriter(
            SimpleFSDirectory.open(indexFolder),
            new StandardAnalyzer(Version.LUCENE_29),
            true, // create: start a new, empty index
            IndexWriter.MaxFieldLength.UNLIMITED);
    emptyIndexWriter.close(true);
}
/**
 * Normalizes typographic characters that usually come from text pasted from
 * a word processor, so that parsers can recognize quotations and dashes.
 * <p>
 * Typographic single quotes, double quotes and dashes become <code>'</code>,
 * <code>"</code> and <code>-</code>. A CR+LF pair is removed, any remaining
 * CR or LF becomes a space, byte order marks (U+FEFF) are removed and the
 * result is trimmed.
 * </p>
 *
 * @param word
 *            the text to normalize, must not be <code>null</code>
 * @return the normalized, trimmed text
 */
public static String cleanWord(String word) {
    StringBuilder mapped = new StringBuilder(word.length());
    for (int i = 0; i < word.length(); i++) {
        char current = word.charAt(i);
        switch (current) {
        case '\u0060':
        case '\u2018':
        case '\u2019':
        case '\u201A':
        case '\u201B':
        case '\u2032':
        case '\u2035':
            mapped.append('\'');
            break;
        case '\u201C':
        case '\u201D':
        case '\u201E':
        case '\u201F':
        case '\u2033':
        case '\u2036':
            mapped.append('"');
            break;
        case '\u2010':
        case '\u2012':
        case '\u2013':
        case '\u2014':
        case '\u2015':
            mapped.append('-');
            break;
        default:
            mapped.append(current);
            break;
        }
    }
    // The order of the next steps matters: CR+LF pairs disappear first,
    // lone line breaks become spaces, and only then are the BOMs dropped.
    String cleaned = mapped.toString().replace("\r\n", "");
    cleaned = cleaned.replace('\r', ' ').replace('\n', ' ');
    cleaned = cleaned.replace("\uFEFF", "");
    return cleaned.trim();
}
/**
 * Reads a text file into a String. The file is read line by line, every
 * line goes through {@link #cleanWord(String)} (which also trims it) and is
 * written back followed by a single <code>\n</code>, so no <code>\r</code>
 * survives.
 *
 * @param file
 *            the file to read
 * @param charset
 *            the name of the charset used to decode the file
 * @return the cleaned content of the file
 * @throws IOException
 *             if the file cannot be read or the charset is not supported
 */
public static String getFileContent(File file, String charset) throws IOException {
    StringBuilder buffer = new StringBuilder();
    InputStream input = new FileInputStream(file);
    try {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(input, charset));
        String line;
        while ((line = reader.readLine()) != null) {
            buffer.append(cleanWord(line));
            buffer.append('\n');
        }
    } finally {
        // Closing the stream here releases the file handle even when the
        // charset is unsupported or reading fails half way
        input.close();
    }
    return buffer.toString();
}
/** The Lucene writer used to modify the index; closed by closeIndexWriter() */
private IndexWriter luceneIndexWriter = null;
/** The Lucene reader returned by getIndexReader(), created on first use */
private IndexReader luceneIndexReader = null;
// General attributes
/** The logger for log4j */
private static Logger logger = Logger.getLogger(Repository.class);
/** Timer for indexing */
private Timer indexerTimer;
/** Timer for annotations */
private Timer annotatorTimer;
/** Timer for cleanup */
private Timer cleanupTimer;
/** The language for the documents in the repository */
private Locale locale;
/** The character encoding used to read files from the filesystem */
private String encoding = "UTF-8";
/** The parser for the content before inserting into the index */
private Importer defaultImporter = null;
// Lucene specific attributes
/** The standard Lucene analyser for this repository */
private Analyzer analyzer;
/** The folder where the repository is kept */
private String indexPath;
/** Path to the storage of calculated SVDs */
private String svdStoragePath;
/** Path to the folder for temporary files */
private String tmpPath;
/**
 * @return the path to the folder used for temporary files
 */
public String getTmpPath() {
    return this.tmpPath;
}
/** Path to the folder for processed files */
private String processedPath;
/**
 * @return the path to the folder used for processed files
 */
public String getProcessedPath() {
    return this.processedPath;
}
/** The stopwords */
private String[] stopwords;
/** The field that contains the content of a document */
private String luceneContentField = "contents";
/** The field that contains the title of a document */
private String luceneTitleField = "title";
/** The field that contains the url of a document */
private String luceneUrlField = "url";
/** The field that contains the external ID of a document */
private String luceneExternalIdField = "externalid";
/** The field that contains the ID of the direct parent of a passage (the paragraph for a sentence, the document for a paragraph) */
private String luceneParentField = "reference";
/** The field that contains the ID of the document a passage belongs to */
private String luceneParentDocumentField = "parent";
/** The path to the execution folder */
private String execPath = "";
/**
 * @return the path to the execution folder
 */
public String getExecPath() {
    return this.execPath;
}
/**
 * Changes the path to the execution folder.
 *
 * @param execPath
 *            the new path to the execution folder
 */
public void setExecPath(String execPath) {
    this.execPath = execPath;
}
/** The connection to the metadata storage, created by the constructor */
private DbConnection dbConnection = null;
/**
 * @return the connection to the metadata storage used by this repository
 */
public DbConnection getDbConnection() {
    return this.dbConnection;
}
/**
 * Gets the name of the field used by the underlying Lucene index for the
 * id of the document a passage belongs to
 *
 * @return the name of the parent document field
 */
public String getLuceneParentDocumentField() {
    return this.luceneParentDocumentField;
}
/** The field that contains the PennTree bank parse */
private String lucenePenntreeField = "penntree";
/** The field that contains the type of the passage */
private String luceneTypeField = "type";
/** The maximum number of documents to index every time the indexing is called */
private int maxDocumentsToIndex = -1;
// Metadata annotations specific attributes
/** The list of annotators that will be used on indexing */
private List<Annotator> annotators = null;
/** The listeners that receive the {@link RepositoryEvent}s fired by this repository */
private EventListenerList listeners = null;
/**
 * Creates a new instance of the class {@link Repository} using the
 * "lucene" folder inside the TML folder given by
 * {@link Configuration#getTmlFolder()} as the index path.
 *
 * @throws IOException
 * @throws SQLException
 */
public Repository() throws IOException, SQLException {
this(Configuration.getTmlFolder() + "/lucene");
}
/**
 * Creates a new instance of the class {@link Repository} for documents
 * written in English.
 *
 * @param luceneIndexPath
 *            an absolute path to the folder that stores the Lucene Index
 * @throws IOException
 * @throws SQLException
 */
public Repository(String luceneIndexPath) throws IOException, SQLException {
    this(luceneIndexPath, Locale.ENGLISH);
}
/**
 * Creates a new instance of the class {@link Repository} on an existing
 * folder, for documents written in the language of the given locale.
 * <p>
 * The initialization loads the TML properties, checks that the Lucene index
 * can be opened (recreating the storage when it cannot), creates the "svd",
 * "tmp" and "processed" folders inside the TML folder, loads the stopwords
 * for the language, creates a {@link SnowballAnalyzer}, opens the metadata
 * connection, loads the annotators listed in the "tml.annotators" property
 * and starts the timers enabled in the properties.
 * </p>
 *
 * @param luceneIndexPath
 *            an absolute path to the folder that stores the Lucene Index,
 *            the folder must exist
 * @param locale
 *            the language of the documents in the repository
 * @throws IOException
 *             if the repository folder doesn't exist, no stopwords can be
 *             found or the index cannot be opened for writing
 * @throws SQLException
 *             declared for the metadata storage operations
 */
public Repository(String luceneIndexPath, Locale locale) throws IOException, SQLException {
    assert (luceneIndexPath != null);
    assert (locale != null);
    // Read default properties and initialize log4j
    Configuration.getTmlProperties(true);
    this.indexPath = luceneIndexPath;
    File folder = new File(this.indexPath);
    if (!folder.exists()) {
        String message = "Repository folder doesn't exist ["
                + this.indexPath + "]";
        logger.error(message);
        throw new IOException(message);
    }
    this.locale = locale;
    this.defaultImporter = new TextImporter();
    this.annotators = new ArrayList<Annotator>();
    logger.info("TML initialization");
    logger.debug("Context Path:\t\t" + Configuration.getContextPath());
    logger.info("Repository path:\t" + this.indexPath);

    // Opening a searcher is only a probe to find out if the index is usable
    IndexSearcher probe = null;
    try {
        probe = new IndexSearcher(SimpleFSDirectory.open(new File(luceneIndexPath)), true);
        logger.info("Repository:\t\tLucene initialized");
    } catch (Exception e1) {
        logger.warn("Repository:\t\tLucene index corrupt or inexistent, recreating");
        Repository.cleanStorage(luceneIndexPath);
    } finally {
        // Closed outside the try so a failure on close never triggers the
        // recreation of the storage
        if (probe != null) {
            probe.close();
        }
    }

    this.svdStoragePath = Configuration.getTmlFolder() + "/svd";
    File svdFolder = new File(svdStoragePath);
    if (!svdFolder.exists() && !svdFolder.mkdir()) {
        logger.warn("Cache:\t\t\tCould not create folder " + this.svdStoragePath);
    }
    logger.debug("Cache:\t\t\tSVDs stored in " + this.svdStoragePath);

    this.tmpPath = Configuration.getTmlFolder() + "/tmp";
    File tmpFolder = new File(this.tmpPath);
    if (!tmpFolder.exists() && !tmpFolder.mkdir()) {
        logger.warn("Temp:\t\t\tCould not create folder " + this.tmpPath);
    }
    logger.debug("Temp:\t\t\tTemporary files in " + this.tmpPath);

    this.processedPath = Configuration.getTmlFolder() + "/processed";
    File processedFolder = new File(this.processedPath);
    if (!processedFolder.exists() && !processedFolder.mkdir()) {
        logger.warn("Indexer:\t\tCould not create folder " + this.processedPath);
    }
    logger.debug("Indexer:\t\tProcessed files in " + this.processedPath);

    // Stopwords: a file in the TML folder wins over the bundled resources
    File stopWordsFile = new File(Configuration.getTmlFolder() + "/stopwords/stopwords_" + this.locale.getLanguage() + ".txt");
    if (!stopWordsFile.exists()) {
        InputStream stream = this.getClass().getResourceAsStream(
                "/tml/stopwords_" + this.locale.getLanguage() + ".txt");
        if (stream == null) {
            logger.info("Failed to load stopwords for language "
                    + this.locale.getLanguage()
                    + ", falling to english");
            stream = this.getClass().getResourceAsStream("/tml/stopwords.txt");
        }
        if (stream == null) {
            String message = "No stopwords found for language "
                    + this.locale.getLanguage()
                    + " and the default stopwords resource is missing";
            logger.error(message);
            throw new IOException(message);
        }
        this.stopwords = getStopWordsFromBufferedReader(new BufferedReader(
                new InputStreamReader(stream)));
    } else {
        this.stopwords = getStopWordsFromFile(stopWordsFile);
    }
    logger.debug("Stopwords:\t\tUsing " + this.locale.getDisplayLanguage(Locale.ENGLISH) + " (" + this.stopwords.length + " stopwords)");

    String snowballLang = this.locale.getDisplayLanguage(Locale.ENGLISH);
    this.analyzer = new SnowballAnalyzer(
            Version.LUCENE_29,
            snowballLang,
            this.stopwords);
    logger.debug("Stemming:\t\tUsing " + this.analyzer.toString() + " " + snowballLang);
    // TODO: Recognize when and how to analyze Korean, Chinese or some other languages
    // this.analyzer = new CJKAnalyzer(this.stopwords);

    // Check DB connection for metadata
    this.dbConnection = new DbConnection();

    // Loads default annotators
    String annotatorNames = Configuration.getTmlProperties().getProperty(
            "tml.annotators");
    if (annotatorNames != null && annotatorNames.length() > 0) {
        logger.debug("Annotators:\t\tLoading defaults");
        for (String annotatorName : annotatorNames.split(",")) {
            // Names may be separated by ", " so the class name must be trimmed
            String className = annotatorName.trim();
            if (className.length() == 0) {
                continue;
            }
            try {
                Class<?> classDefinition = Class.forName("tml.annotators." + className);
                Annotator annotator = (Annotator) classDefinition.newInstance();
                // Initialize first, so an annotator that fails to
                // initialize is never registered
                annotator.init();
                this.annotators.add(annotator);
            } catch (Exception e) {
                logger.error("Default annotator not found! " + annotatorName);
                logger.error(e);
            }
        }
    }

    this.listeners = new EventListenerList();

    // Makes sure the index can be opened for writing
    try {
        this.openIndexWriter();
        this.closeIndexWriter();
    } catch (IOException e) {
        logger.error("Repository:\t\tThe Lucene index can't be opened for writing", e);
        throw e;
    }

    // A missing property means the corresponding timer is not started
    if ("true".equals(Configuration.getTmlProperties().getProperty("tml.indexer.run"))) {
        initializeIndexerTimer();
    }
    if ("true".equals(Configuration.getTmlProperties().getProperty("tml.annotator.run"))) {
        initializeAnnotatorTimer();
    }
    if ("true".equals(Configuration.getTmlProperties().getProperty("tml.cleanup.run"))) {
        initializeCleanupTimer();
    }
    logger.info("TML initialized");
}
/**
 * Obtains the documents registered in the metadata storage.
 *
 * @return what {@link DbConnection#getDocuments()} returns, or
 *         <code>null</code> if the query fails (the error is logged)
 */
public String[][] getAllDocuments() {
    String[][] documents = null;
    try {
        documents = this.getDbConnection().getDocuments();
    } catch (Exception e) {
        e.printStackTrace();
        logger.error(e);
    }
    return documents;
}
/**
 * Initializes an annotator and adds it to the repository, unless an
 * annotator with the same field name was already loaded.
 *
 * @param annotator
 *            the annotator
 */
public void addAnnotator(Annotator annotator) {
    if (this.containsAnnotator(annotator)) {
        logger.debug("Annotator " + annotator.getFieldName() + " already loaded!");
        return;
    }
    annotator.init();
    this.annotators.add(annotator);
}
/**
 * Checks if an annotator with the same field name is already loaded.
 *
 * @param annotator
 *            the annotator to look for
 * @return <code>true</code> if a loaded annotator has the same field name
 */
private boolean containsAnnotator(Annotator annotator) {
    for (int i = 0; i < this.annotators.size(); i++) {
        String wantedField = annotator.getFieldName();
        if (wantedField.equals(this.annotators.get(i).getFieldName())) {
            return true;
        }
    }
    return false;
}
/**
 * Registers a listener that will receive the events fired by the
 * Repository while it processes documents.
 *
 * @param l
 *            the listener to add
 */
public void addRepositoryListener(RepositoryListener l) {
    listeners.add(RepositoryListener.class, l);
}
/**
 * Unregisters a listener that was previously added.
 *
 * @param l
 *            the listener to remove
 */
public void removeRepositoryListener(RepositoryListener l) {
    listeners.remove(RepositoryListener.class, l);
}
/**
 * Delivers an event to every registered {@link RepositoryListener}.
 *
 * @param evt
 *            the event object
 */
private void doRepositoryAction(RepositoryEvent evt) {
    RepositoryListener[] registered = this.listeners.getListeners(RepositoryListener.class);
    for (int i = 0; i < registered.length; i++) {
        registered[i].repositoryAction(evt);
    }
}
/**
 * Adds a new document to the repository. The index writer is opened for
 * the operation and closed afterwards, even if adding the document fails.
 *
 * @param externalId
 *            an external id to identify the document
 * @param content
 *            the raw content of the document
 * @param title
 *            the title of the document
 * @param url
 *            a url to find the document (optional)
 * @param importer
 *            an importer (how to decode the content), if <code>null</code>
 *            the default importer of the repository is used
 * @throws IOException
 * @throws SQLException
 */
public void addDocument(String externalId, String content, String title,
        String url, Importer importer) throws IOException, SQLException {
    logger.debug("Adding document " + title + " with id:" + externalId);
    this.openIndexWriter();
    try {
        // The content is passed raw: addDocumentToOpenIndex applies the
        // importer (or the default one), cleaning it here as well would
        // run the importer twice on the same text
        this.addDocumentToOpenIndex(externalId, content, title, url, importer);
    } finally {
        this.closeIndexWriter();
    }
}
/**
 * Adds the files of a folder into the Lucene Index, without any limit on
 * the number of files.
 *
 * @param folder
 *            an absolute path to the folder that contains the files
 * @throws IOException
 */
public void addDocumentsInFolder(String folder) throws IOException {
    final int noLimit = -1;
    this.addDocumentsInFolder(folder, noLimit);
}
/**
 * Adds the files of a folder into the Lucene Index, up to a maximum. Files
 * whose name starts with a dot are skipped, the rest are sorted by name
 * and handed to {@link #addDocumentsInList(File[])}.
 *
 * @param folder
 *            an absolute path to the folder that contains the files
 * @param maxDocs
 *            the maximum number of files to add, zero or a negative number
 *            means no limit
 * @throws FileNotFoundException
 *             if the folder doesn't exist or is not a directory
 * @throws IOException
 *             if the folder cannot be listed or the index cannot be written
 */
public void addDocumentsInFolder(String folder, int maxDocs) throws IOException {
    logger.debug("Adding text files from " + folder);
    File corpusFile = new File(folder);
    if (!corpusFile.exists() || !corpusFile.isDirectory()) {
        throw new FileNotFoundException(
                "Invalid corpus folder, it doesn't exists! (" + folder + ")");
    }
    String[] names = corpusFile.list(new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return !name.startsWith(".");
        }
    });
    // File.list returns null when the folder cannot be read
    if (names == null) {
        throw new IOException("Could not list the files in folder " + folder);
    }
    // Sort the file names so the files are always added in the same order
    List<String> files = new ArrayList<String>(names.length);
    Collections.addAll(files, names);
    Collections.sort(files);
    if (maxDocs > 0 && files.size() > maxDocs) {
        files = files.subList(0, maxDocs);
    }
    // Create the array of files from the list of file names
    File[] fileList = new File[files.size()];
    for (int i = 0; i < fileList.length; i++) {
        fileList[i] = new File(folder + "/" + files.get(i));
    }
    this.addDocumentsInList(fileList);
}
/**
 * Adds all the files in the list to the repository. Directories and files
 * starting with a dot "." are ignored. The file extension selects the
 * importer through {@link AbstractImporter#createImporter(String)}, files
 * without a known importer are skipped.
 * <p>
 * A file that can't be read or indexed is logged and skipped. A database
 * error aborts the whole operation. The index writer is closed in both
 * cases. Listeners receive an "addingDocument" event after every processed
 * file.
 * </p>
 *
 * @param fileList
 *            the files to add
 * @throws CorruptIndexException
 * @throws IOException
 *             if the index cannot be written or the database fails
 */
public void addDocumentsInList(File[] fileList) throws CorruptIndexException, IOException {
    long time = System.currentTimeMillis();
    this.openIndexWriter();
    // Files processed so far (added or failed), used for the progress events
    int count = 0;
    // Files that really made it into the index
    int added = 0;
    try {
        logger.debug("Adding files using encoding " + this.encoding);
        doRepositoryAction(new RepositoryEvent(this, "addingDocument", 0, fileList.length));
        for (File f : fileList) {
            String fileName = f.getName();
            if (f.isDirectory() || fileName.startsWith(".")) {
                logger.debug("Ignoring document " + fileName);
                continue;
            }
            // Calculating the file extension (e.g. txt or html)
            String[] pieces = fileName.split("\\.");
            String extension = pieces[pieces.length - 1];
            // We use the file extension to get an importer
            Importer importer = AbstractImporter.createImporter(extension);
            if (importer == null) {
                logger.info("Don't know how to parse ." + extension
                        + " files, ignoring " + fileName);
                continue;
            }
            logger.debug("Using importer " + importer.getClass().getName());
            try {
                String content = getFileContent(f, this.encoding);
                // Only the trailing extension is removed, a name such as
                // "notes.txt.txt" keeps its inner ".txt"
                String suffix = "." + extension;
                String title = fileName;
                if (fileName.endsWith(suffix)) {
                    title = fileName.substring(0, fileName.length() - suffix.length());
                }
                String url = f.getAbsolutePath();
                String externalid = cleanIdForLucene(title);
                logger.debug("Adding document " + count + ":" + fileName);
                this.addDocumentToOpenIndex(externalid, content, title, url,
                        importer);
                added++;
            } catch (IOException e) {
                logger.error("Failed to load content or adding document to index for file " + f, e);
            } catch (SQLException e) {
                logger.error("Fatal error insterting documents in the database", e);
                throw new IOException(e);
            } finally {
                count++;
                doRepositoryAction(new RepositoryEvent(this, "addingDocument", count, fileList.length));
            }
        }
    } finally {
        // Always release the index writer, also when a database error
        // aborts the loop
        this.closeIndexWriter();
    }
    time = System.currentTimeMillis() - time;
    doRepositoryAction(new RepositoryEvent(this, "addingDocument", fileList.length, fileList.length));
    logger.info("Successfully added " + added + " documents in " + time
            + " ms");
}
/**
 * Cleans the content of a document with the given importer (or the default
 * importer when none is given) and adds the document, together with its
 * paragraphs and sentences, to the index that is currently open.
 *
 * @param externalId
 *            an external id to identify the document
 * @param content
 *            the raw content of the document
 * @param title
 *            the title of the document
 * @param url
 *            a url to find the document
 * @param importer
 *            the importer to clean the content, can be <code>null</code>
 * @throws IOException
 * @throws SQLException
 */
private void addDocumentToOpenIndex(String externalId, String content,
        String title, String url, Importer importer)
        throws IOException, SQLException {
    Importer effectiveImporter = (importer != null) ? importer : this.defaultImporter;
    String cleanContent = content;
    if (effectiveImporter != null) {
        cleanContent = effectiveImporter.getCleanContent(content);
    }
    if (cleanContent == null) {
        cleanContent = "";
    }
    // A top level document has no parents, the text "null" is stored instead
    Document doc = this.createDocument(cleanContent, "document", "null", "null",
            externalId, title, url);
    this.addSegmentsInDocument(cleanContent, doc, externalId);
    this.addDocumentToOpenIndex(doc);
}
/**
 * Chops a content in pieces and adds a new Lucene document for each piece
 * into the index. Every line with at least two characters is added with
 * the type "paragraph", and every sentence of the line with at least two
 * characters is added with the type "sentence". Sentences are found using
 * a {@link BreakIterator} for the locale of the repository.
 * <p>
 * The last sentence added for a paragraph gets "last" as its url, all the
 * other passages get "N/A". Processing stops at the first line recognized
 * as the title of the bibliography.
 * </p>
 *
 * @param content
 *            the content of the document to chop
 * @param document
 *            the Lucene Document, used to obtain the title
 * @param docId
 *            the id of the document
 * @throws IOException
 * @throws SQLException
 */
private void addSegmentsInDocument(String content, Document document,
        String docId) throws IOException, SQLException {
    String title = document.get(this.getLuceneTitleField());
    logger.debug("Adding segments to document " + docId + "[" + title
            + "]");
    BufferedReader strReader = new BufferedReader(new StringReader(content));
    // A single iterator is enough, setText() resets it for every line
    BreakIterator iterator = BreakIterator.getSentenceInstance(this.locale);
    int sentenceNumber = 0;
    int paragraphNumber = 0;
    int ignoredLines = 0;
    int ignoredSentences = 0;
    logger.debug("Parsing text with " + this.locale);
    String line;
    while ((line = strReader.readLine()) != null) {
        if (line.length() < 2) {
            ignoredLines++;
            continue;
        }
        String lowLine = line.trim().toLowerCase().replaceAll("\\W", "");
        if (isBibliographyTitle(lowLine)) {
            // Nothing after the bibliography title is indexed
            break;
        }
        paragraphNumber++;
        String paragraphExtId = "p" + paragraphNumber + "d" + docId;
        this.addTextPassageToOpenIndex(
                line,
                "paragraph",
                docId,
                docId,
                paragraphExtId,
                "Paragraph " + paragraphNumber + " of " + title,
                "N/A");

        // Collect the sentences worth indexing before adding any of them,
        // so the last one can be recognized even when shorter fragments
        // are skipped
        List<String> sentences = new ArrayList<String>();
        iterator.setText(line);
        int start = iterator.first();
        for (int end = iterator.next(); end != BreakIterator.DONE; end = iterator.next()) {
            String sentence = line.substring(start, end);
            start = end;
            if (sentence.length() >= 2) {
                sentences.add(sentence);
            } else {
                ignoredSentences++;
            }
        }

        int totalSentences = sentences.size();
        doRepositoryAction(new RepositoryEvent(this, "addingSentence", 0, totalSentences));
        for (int i = 0; i < totalSentences; i++) {
            int numSentence = i + 1;
            sentenceNumber++;
            doRepositoryAction(new RepositoryEvent(this, "addingSentence", numSentence, totalSentences));
            String url = (numSentence == totalSentences) ? "last" : "N/A";
            String sentenceExtId = "s" + sentenceNumber + "d" + docId;
            this.addTextPassageToOpenIndex(
                    sentences.get(i),
                    "sentence",
                    paragraphExtId,
                    docId,
                    sentenceExtId,
                    "Sentence " + sentenceNumber + " of " + title,
                    url);
        }
    }
    doRepositoryAction(new RepositoryEvent(this, "addingSentence", 100, 100));
    logger.debug("Added " + paragraphNumber + " paragraphs and "
            + sentenceNumber + " sentences.");
    logger.debug("Ignored " + ignoredLines + " paragraphs and "
            + ignoredSentences + " sentences.");
}
/**
 * Builds the Lucene document for a text passage, registers it in the
 * metadata storage and writes it to the open index, replacing any document
 * indexed with the same external id.
 *
 * @param content
 *            the content of the passage
 * @param type
 *            the type of the passage ("document", "sentence" or
 *            "paragraph")
 * @param parent
 *            the id of the direct parent of the passage
 * @param parentDocument
 *            the id of the document the passage belongs to
 * @param externalId
 *            the external id of the passage
 * @param title
 *            the title of the passage
 * @param url
 *            the url of the passage
 * @return the Lucene Document that was just added
 * @throws IOException
 * @throws SQLException
 */
private Document addTextPassageToOpenIndex(String content, String type,
        String parent, String parentDocument, String externalId, String title, String url) throws IOException, SQLException {
    // The fields are added in this exact order
    Field[] passageFields = {
            new Field(this.getLuceneContentField(), content,
                    Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS),
            new Field(this.getLuceneExternalIdField(), externalId,
                    Store.YES, Index.NOT_ANALYZED, TermVector.NO),
            new Field(this.getLuceneTitleField(), title,
                    Store.YES, Index.NOT_ANALYZED, TermVector.NO),
            new Field(this.getLuceneUrlField(), url,
                    Store.YES, Index.NOT_ANALYZED, TermVector.NO),
            new Field("indexdate", Calendar.getInstance().getTime().toString(),
                    Store.YES, Index.NOT_ANALYZED, TermVector.NO),
            new Field(this.getLuceneParentField(), parent,
                    Store.YES, Index.NOT_ANALYZED, TermVector.NO),
            new Field("type", type,
                    Store.YES, Index.NOT_ANALYZED, TermVector.NO),
            new Field("parent", parentDocument,
                    Store.YES, Index.NOT_ANALYZED, TermVector.NO)
    };
    Document passage = new Document();
    for (Field passageField : passageFields) {
        passage.add(passageField);
    }
    this.getDbConnection().insertDocument(this, passage);
    luceneIndexWriter.updateDocument(new Term("externalid", externalId), passage);
    return passage;
}
/**
 * Registers an already built Lucene document in the metadata storage and
 * writes it to the open index, replacing any document indexed with the
 * same external id.
 *
 * @param document
 *            the Lucene Document to add, it must contain the external id
 *            field
 * @return the same Lucene Document
 * @throws IOException
 * @throws SQLException
 */
private Document addDocumentToOpenIndex(Document document) throws IOException, SQLException {
    this.getDbConnection().insertDocument(this, document);
    String externalId = document.get(this.getLuceneExternalIdField());
    luceneIndexWriter.updateDocument(new Term("externalid", externalId), document);
    return document;
}
/**
 * Builds the Lucene document for a text passage, without adding it to the
 * index. The content is stored and analyzed with term vectors including
 * positions, every other field is stored as is and not analyzed.
 *
 * @param content
 *            the content of the passage
 * @param type
 *            the type of the passage ("document", "sentence" or
 *            "paragraph")
 * @param parent
 *            the id of the direct parent of the passage
 * @param parentDocument
 *            the id of the document the passage belongs to
 * @param externalId
 *            the external id of the passage
 * @param title
 *            the title of the passage
 * @param url
 *            the url of the passage
 * @return the new Lucene Document
 * @throws IOException
 */
private Document createDocument(String content, String type,
        String parent, String parentDocument, String externalId, String title, String url) throws IOException {
    final Store stored = Store.YES;
    final Index verbatim = Index.NOT_ANALYZED;
    final TermVector noVector = TermVector.NO;

    Document created = new Document();
    created.add(new Field(this.getLuceneContentField(), content,
            stored, Index.ANALYZED, TermVector.WITH_POSITIONS));
    created.add(new Field(this.getLuceneExternalIdField(), externalId, stored, verbatim, noVector));
    created.add(new Field(this.getLuceneTitleField(), title, stored, verbatim, noVector));
    created.add(new Field(this.getLuceneUrlField(), url, stored, verbatim, noVector));
    String indexDate = Calendar.getInstance().getTime().toString();
    created.add(new Field("indexdate", indexDate, stored, verbatim, noVector));
    created.add(new Field(this.getLuceneParentField(), parent, stored, verbatim, noVector));
    created.add(new Field("type", type, stored, verbatim, noVector));
    created.add(new Field("parent", parentDocument, stored, verbatim, noVector));
    return created;
}
/**
 * Starts a {@link DocumentAnnotator} for this repository in a new thread.
 *
 * @return the thread, already started
 */
public Thread annotateDocuments() {
    Thread annotatorThread = new Thread(new DocumentAnnotator(this));
    annotatorThread.start();
    return annotatorThread;
}
/**
 * Commits, optimizes and closes the Lucene index writer. It does nothing
 * if there is no writer. The writer is closed and forgotten even when the
 * commit or the optimization fails, so its resources are always released
 * and calling this method twice is harmless.
 *
 * @throws CorruptIndexException
 * @throws IOException
 */
private void closeIndexWriter() throws CorruptIndexException, IOException {
    if (luceneIndexWriter == null) {
        return;
    }
    try {
        luceneIndexWriter.commit();
        luceneIndexWriter.optimize(true);
    } finally {
        try {
            luceneIndexWriter.close(true);
        } finally {
            luceneIndexWriter = null;
        }
    }
}
/**
 * Deletes a document from the index, together with every passage that has
 * it as parent document. A TextDocument object must be used, so the
 * document must first be obtained from the repository. The index writer is
 * closed even if the deletion fails.
 *
 * @param document
 *            the document to delete
 * @throws IOException
 */
public void deleteTextDocument(TextDocument document) throws IOException {
    logger.info("Deleting document " + document);
    String externalId = document.getExternalId();
    this.openIndexWriter();
    try {
        // The document itself
        luceneIndexWriter.deleteDocuments(
                new Term(this.luceneExternalIdField, externalId));
        // The paragraphs and sentences that belong to the document
        luceneIndexWriter.deleteDocuments(
                new Term(this.luceneParentDocumentField, externalId));
    } finally {
        this.closeIndexWriter();
    }
}
/**
 * Loads a {@link RepositoryCorpus} without reduction parameters on this
 * repository and returns a {@link TextDocument} for each of its passages.
 *
 * @return a list of {@link TextDocument}, empty if the corpus has no
 *         documents
 * @throws Exception
 *             if the corpus cannot be loaded or a document cannot be
 *             retrieved
 */
public List<TextDocument> getAllTextDocuments() throws Exception {
    List<TextDocument> allDocuments = new ArrayList<TextDocument>();
    RepositoryCorpus fullCorpus = new RepositoryCorpus();
    try {
        fullCorpus.setParameters(CorpusParameters.getNoReductionParameters());
        fullCorpus.load(this);
    } catch (NoDocumentsInCorpusException emptyCorpus) {
        // Nothing to list, any other exception goes to the caller
        return allDocuments;
    }
    for (String passageId : fullCorpus.getPassages()) {
        String luceneId = cleanIdForLucene(passageId);
        allDocuments.add(getTextDocument(luceneId));
    }
    return allDocuments;
}
/**
 * Gets the Lucene analyzer created for this {@link Repository}
 *
 * @return the {@link Analyzer}
 */
public Analyzer getAnalyzer() {
    return this.analyzer;
}
/**
 * @return the list of annotators loaded in this repository (the internal
 *         list, not a copy)
 */
public List<Annotator> getAnnotators() {
    return this.annotators;
}
/**
 * Reads the value of a field from the Lucene document that has the given
 * external id.
 *
 * @param externalId
 *            the id of the document
 * @param fieldname
 *            the name of the field to retrieve
 * @return the content of the field
 * @throws IOException
 *             if the document cannot be found or the index cannot be read
 */
public String getDocumentField(String externalId, String fieldname) throws IOException {
    return getLuceneDocument(externalId).get(fieldname);
}
/**
 * @return the character encoding TML uses to read files
 */
public String getEncoding() {
    return this.encoding;
}
/**
 * @return the path to the folder of the Lucene index
 */
public String getIndexPath() {
    return this.indexPath;
}
/**
 * Obtains an IndexReader of the Lucene index. The reader is kept between
 * calls and a new read-only one is opened only if there is none yet or the
 * kept one is no longer current.
 *
 * @return the IndexReader
 * @throws IOException
 */
public IndexReader getIndexReader() throws IOException {
    if (luceneIndexReader != null && luceneIndexReader.isCurrent()) {
        return luceneIndexReader;
    }
    // NOTE(review): the replaced reader is not closed here; searchers
    // handed out earlier may still use it - confirm who should close it
    File indexFolder = new File(this.indexPath);
    luceneIndexReader = FilterIndexReader.open(SimpleFSDirectory.open(indexFolder), true);
    return luceneIndexReader;
}
/**
 * Creates a new IndexSearcher on the reader returned by
 * {@link #getIndexReader()}
 *
 * @return the IndexSearcher
 * @throws IOException
 */
public IndexSearcher getIndexSearcher() throws IOException {
    return new IndexSearcher(getIndexReader());
}
/**
 * @return the {@link Locale} of the documents in this repository
 */
public Locale getLocale() {
    return this.locale;
}
/**
 * Gets the name of the Lucene field that stores the content of a document
 *
 * @return the name of the content field
 */
public String getLuceneContentField() {
    return this.luceneContentField;
}
/**
 * Loads the Lucene {@link Document} stored under the given external id.
 *
 * @param externalId the id of the document
 * @return the stored Lucene document
 * @throws IOException if no document matches the id or the index cannot
 *             be read
 */
private Document getLuceneDocument(String externalId) throws IOException {
    TopDocs matches = getLuceneDocumentHits(externalId);
    if (matches != null) {
        // The lookup only returns non-null when exactly one document matched.
        IndexSearcher searcher = getIndexSearcher();
        return searcher.doc(matches.scoreDocs[0].doc);
    }
    throw new IOException("Document " + externalId + " not found!");
}
/**
 * Searches the index for the single document whose external id matches.
 *
 * NOTE(review): the query uses the literal field name "externalid";
 * presumably this equals {@link #getLuceneExternalIdField()} - confirm.
 * The id is inserted into the query string unescaped, so callers are
 * expected to pass an id that is already safe for the Lucene query syntax.
 *
 * @param externalId the id of the document
 * @return the hits containing exactly one document, or <code>null</code>
 *         if the id cannot be parsed as a query or nothing matches
 * @throws IOException if more than one document matches or the index
 *             cannot be searched
 */
private TopDocs getLuceneDocumentHits(String externalId) throws IOException {
    QueryParser parser = new QueryParser(Version.LUCENE_29,
            this.getLuceneContentField(),
            new KeywordAnalyzer());
    String query = "externalid:" + externalId;
    Query documentsQuery;
    try {
        documentsQuery = parser.parse(query);
    } catch (ParseException e) {
        // Log through the repository logger (with the cause) instead of
        // dumping the stack trace to stderr.
        logger.error("Invalid externalId:" + externalId, e);
        return null;
    }
    // Two results are enough to tell "exactly one" from "more than one":
    // totalHits reports the full match count regardless of the limit.
    TopDocs hits = this.getIndexSearcher().search(documentsQuery, 2);
    if (hits.totalHits < 1) {
        return null;
    }
    if (hits.totalHits > 1) {
        throw new IOException("The query returned more than one document");
    }
    return hits;
}
/**
 * Returns the name of the Lucene index field that holds the external id
 * of a document.
 *
 * @return the name of the external id field
 */
public String getLuceneExternalIdField() {
    return this.luceneExternalIdField;
}
/**
 * Returns the name of the Lucene index field that holds the parent of a
 * document.
 *
 * @return the name of the parent field
 */
public String getLuceneParentField() {
    return this.luceneParentField;
}
/**
 * Returns the name of the Lucene index field that stores the PennTree
 * bank string.
 *
 * @return the name of the field used to store the PennTree bank string
 */
public String getLucenePenntreeField() {
    return this.lucenePenntreeField;
}
/**
 * Returns the name of the Lucene index field that holds the document
 * title.
 *
 * @return the name of the title field
 */
public String getLuceneTitleField() {
    return this.luceneTitleField;
}
/**
 * Returns the name of the Lucene index field that records the type of a
 * Lucene document (document, paragraph or sentence).
 *
 * @return the name of the type field
 */
public String getLuceneTypeField() {
    return this.luceneTypeField;
}
/**
 * Returns the name of the Lucene index field that holds the document
 * url.
 *
 * @return the name of the url field
 */
public String getLuceneUrlField() {
    return this.luceneUrlField;
}
/**
 * Returns the configured value of <code>maxDocumentsToIndex</code>.
 *
 * @return the maxDocumentsToIndex
 */
public int getMaxDocumentsToIndex() {
    return this.maxDocumentsToIndex;
}
/**
 * Returns the default {@link Importer} of this {@link Repository}, which
 * is used to transform content before it is inserted.
 *
 * @return the {@link Importer} being used by TML
 */
public Importer getParser() {
    return this.defaultImporter;
}
/**
 * Returns the stopwords of this repository. The array handed back is the
 * repository's own array, not a copy.
 *
 * @return the list of stopwords used to analyse and parse documents
 */
public String[] getStopwords() {
    return this.stopwords;
}
/**
 * Reads every remaining line from the reader and returns the lines, in
 * order, as an array. The reader is left open for the caller to close.
 *
 * @param reader the source of the stopwords, one per line
 * @return an array with one entry per line read
 * @throws IOException if reading from the reader fails
 */
private String[] getStopWordsFromBufferedReader(BufferedReader reader)
        throws IOException {
    List<String> collected = new ArrayList<String>();
    for (String current = reader.readLine(); current != null; current = reader.readLine()) {
        collected.add(current);
    }
    return collected.toArray(new String[collected.size()]);
}
/**
 * Processes a file and returns each line in an array. It's useful to
 * transform a stopwords file into the list that Lucene needs. When no
 * file is given, the <code>stopwords.txt</code> resource is loaded from
 * the classpath instead.
 *
 * @param file
 *            an absolute path to the stopwords file, or <code>null</code>
 *            to use the bundled <code>stopwords.txt</code> resource
 * @return an array of stop words, one per line of the source
 * @throws IOException
 *             if the file or the bundled resource cannot be found or read
 */
private String[] getStopWordsFromFile(File file) throws IOException {
    BufferedReader reader;
    if (file != null) {
        reader = new BufferedReader(new FileReader(file));
    } else {
        // ClassLoader resource names must not start with '/'; with the
        // leading slash the lookup returns null and the reader
        // construction fails with a NullPointerException.
        InputStream bundled = ClassLoader.getSystemResourceAsStream("stopwords.txt");
        if (bundled == null) {
            throw new FileNotFoundException(
                    "Default stopwords resource stopwords.txt not found on the classpath");
        }
        reader = new BufferedReader(new InputStreamReader(bundled));
    }
    try {
        return getStopWordsFromBufferedReader(reader);
    } finally {
        // Always release the underlying file or resource stream.
        reader.close();
    }
}
/**
 * Returns the configured value of <code>svdStoragePath</code>.
 *
 * @return the svdStoragePath
 */
public String getSvdStoragePath() {
    return this.svdStoragePath;
}
/**
 * Gets a document from the repository by its external id. Returns a
 * {@link TextDocument} object with basic information about the document,
 * like title and url. In order to perform operations on the documents, it
 * must be loaded, which means that a {@link Corpus} and its inner
 * {@link SemanticSpace} will be created.
 *
 * @param externalId
 *            the id of the document
 * @return a {@link TextDocument} built from the Lucene document id and
 *         the stored title, url and content fields
 * @throws IOException
 *             if no document matches the id, more than one matches, or
 *             the index cannot be read
 */
public TextDocument getTextDocument(String externalId) throws IOException {
    TopDocs hits = getLuceneDocumentHits(externalId);
    if (hits == null) {
        // The lookup returns null for unknown or unparsable ids; report
        // it the same way getLuceneDocument does instead of failing with
        // a NullPointerException.
        throw new IOException("Document " + externalId + " not found!");
    }
    int luceneId = hits.scoreDocs[0].doc;
    Document doc = this.getIndexSearcher().doc(luceneId);
    return new TextDocument(luceneId,
            doc.get(getLuceneTitleField()),
            doc.get(getLuceneUrlField()),
            externalId,
            doc.get(getLuceneContentField()));
}
/**
 * Decides whether a sentence looks like the heading of a references
 * section. A sentence qualifies when it has fewer than four
 * whitespace-separated words and at least one of them, ignoring case, is
 * "resource(s)", "reference(s)", "bibliography" or "note(s)", optionally
 * prefixed by digits.
 *
 * @param sentence
 *            the sentence to evaluate; <code>null</code> is treated as
 *            not being a title
 * @return if the sentence corresponds to the title of the references
 *         section
 */
public boolean isBibliographyTitle(String sentence) {
    if (sentence == null) {
        return false;
    }
    // Split on runs of whitespace so that repeated or surrounding blanks
    // do not produce empty tokens that count towards the word limit.
    String[] words = sentence.trim().split("\\s+");
    if (words.length >= 4) {
        return false;
    }
    for (String word : words) {
        // Fixed locale: the keywords are English, and the default locale
        // may lower-case 'I' differently (e.g. Turkish dotless i).
        String normalized = word.toLowerCase(Locale.ENGLISH);
        if (normalized.matches(
                "(\\d+)?\\s*((resources?)|(references?)|(bibliography)|(notes?))\\s*")) {
            return true;
        }
    }
    return false;
}
/**
 * Opens the Lucene {@link IndexWriter} on the index directory using the
 * repository analyzer and an unlimited field length. If the index lock
 * cannot be obtained, the lock is forcibly released and the writer is
 * opened a second time.
 *
 * NOTE(review): forcing the unlock assumes no other live writer holds
 * the lock - confirm this holds for every deployment.
 *
 * @throws LockObtainFailedException if the lock still cannot be obtained
 *             after it was released
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException if the directory or the writer cannot be opened
 */
private void openIndexWriter() throws LockObtainFailedException, CorruptIndexException, IOException {
    Directory dir;
    try {
        dir = SimpleFSDirectory.open(new File(indexPath));
    } catch (IOException e) {
        // Report through the repository logger rather than stderr.
        logger.error("Could not open index directory " + indexPath, e);
        throw e;
    }
    try {
        luceneIndexWriter = new IndexWriter(
                dir,
                this.analyzer,
                IndexWriter.MaxFieldLength.UNLIMITED);
    } catch (LockObtainFailedException e) {
        logger.error("Index is locked! Trying to unlock.");
        IndexWriter.unlock(dir);
        luceneIndexWriter = new IndexWriter(
                dir,
                this.analyzer,
                IndexWriter.MaxFieldLength.UNLIMITED);
    } catch (IOException e) {
        // Also covers CorruptIndexException, which is an IOException.
        logger.error("Could not open index writer on " + indexPath, e);
        throw e;
    }
}
/**
 * Removes an annotator from the repository.
 *
 * @param annotator the annotator to remove
 */
public void removeAnnotator(Annotator annotator) {
    annotators.remove(annotator);
}
/**
 * Sets the character encoding that will be used in this repository. The
 * current encoding is kept, and a message is logged, when the given name
 * is <code>null</code>, not a legal charset name, or not supported by
 * this JVM.
 *
 * @param encoding
 *            the name of the charset to use
 */
public void setEncoding(String encoding) {
    boolean supported;
    try {
        supported = Charset.isSupported(encoding);
    } catch (IllegalArgumentException e) {
        // Charset.isSupported throws for null and for syntactically
        // illegal names (IllegalCharsetNameException is a subclass)
        // instead of returning false; treat both as "not supported".
        supported = false;
    }
    if (supported) {
        this.encoding = encoding;
    } else {
        logger.info("Invalid encoding or not supported");
    }
}
/**
 * Stores a new value for <code>maxDocumentsToIndex</code>.
 *
 * @param maxDocumentsToIndex the maxDocumentsToIndex to set
 */
public void setMaxDocumentsToIndex(int maxDocumentsToIndex) {
    final int newLimit = maxDocumentsToIndex;
    this.maxDocumentsToIndex = newLimit;
}
/**
 * Looks up an annotation through the repository's database connection.
 *
 * @param documentId the id of the document
 * @param fieldName the name of the annotation field
 * @return whatever the database connection returns for that document and
 *         field
 */
public String getAnnotations(String documentId, String fieldName) {
    return getDbConnection().getAnnotation(documentId, fieldName);
}
/**
 * Starts a {@link DocumentCleanup} for this repository on a new thread.
 *
 * @return the thread, already started, that runs the cleanup
 */
public Thread cleanup() {
    Thread worker = new Thread(new DocumentCleanup(this));
    worker.start();
    return worker;
}
/**
 * Schedules the periodic {@link TmlCleanupTask} for this repository. The
 * period is read, in seconds, from the <code>tml.cleanup.interval</code>
 * property; 300 seconds is used when the property is missing, not a
 * number, or not positive. The first run is scheduled for the current
 * time.
 *
 * @throws IOException declared by the original signature
 */
private void initializeCleanupTimer() throws IOException {
    final int defaultSeconds = 300;
    // Read the configuration before the timer is created so that a
    // failure here does not leave an unused timer behind.
    String configured = Configuration.getTmlProperties()
            .getProperty("tml.cleanup.interval");
    int seconds = defaultSeconds;
    try {
        seconds = Integer.parseInt(configured);
    } catch (NumberFormatException e) {
        logger.error("Cleanup interval not set or invalid " + configured, e);
    }
    if (seconds <= 0) {
        // Timer.schedule rejects non-positive periods.
        logger.error("Cleanup interval not set or invalid " + configured);
        seconds = defaultSeconds;
    }
    TmlCleanupTask task = new TmlCleanupTask(this);
    cleanupTimer = new Timer();
    logger.info("TML cleanup started every " + seconds + " seconds");
    // Multiply as long so large intervals cannot overflow int.
    cleanupTimer.schedule(task, new Date(), seconds * 1000L);
}
/**
 * Schedules the periodic {@link TmlAnnotatorTask} for this repository,
 * unless the repository has no annotators. The period is read, in
 * seconds, from the <code>tml.annotator.interval</code> property; 300
 * seconds is used when the property is missing, not a number, or not
 * positive. The first run is scheduled for the current time.
 *
 * @throws IOException declared by the original signature
 */
private void initializeAnnotatorTimer() throws IOException {
    if (this.getAnnotators().isEmpty()) {
        logger.info("There are no annotators, no need to run.");
        return;
    }
    final int defaultSeconds = 300;
    // Read the configuration before the timer is created so that a
    // failure here does not leave an unused timer behind.
    String configured = Configuration.getTmlProperties()
            .getProperty("tml.annotator.interval");
    int seconds = defaultSeconds;
    try {
        seconds = Integer.parseInt(configured);
    } catch (NumberFormatException e) {
        logger.error("Annotator interval not set or invalid " + configured, e);
    }
    if (seconds <= 0) {
        // Timer.schedule rejects non-positive periods.
        logger.error("Annotator interval not set or invalid " + configured);
        seconds = defaultSeconds;
    }
    TmlAnnotatorTask task = new TmlAnnotatorTask(this);
    annotatorTimer = new Timer();
    logger.info("TML annotator started every " + seconds + " seconds");
    // Multiply as long so large intervals cannot overflow int.
    annotatorTimer.schedule(task, new Date(), seconds * 1000L);
}
/**
 * Schedules the periodic {@link TmlIndexerTask} for this repository. The
 * task is configured to process one file per run from the "upload"
 * folder under the TML folder. The period is read, in seconds, from the
 * <code>tml.indexer.interval</code> property; 300 seconds is used when
 * the property is missing, not a number, or not positive. The first run
 * is scheduled for the current time.
 *
 * @throws IOException declared by the original signature
 */
private void initializeIndexerTimer() throws IOException {
    final int defaultSeconds = 300;
    // Read the configuration before the timer is created so that a
    // failure here does not leave an unused timer behind.
    String configured = Configuration.getTmlProperties()
            .getProperty("tml.indexer.interval");
    int seconds = defaultSeconds;
    try {
        seconds = Integer.parseInt(configured);
    } catch (NumberFormatException e) {
        logger.error("Indexer interval not set or invalid " + configured, e);
    }
    if (seconds <= 0) {
        // Timer.schedule rejects non-positive periods.
        logger.error("Indexer interval not set or invalid " + configured);
        seconds = defaultSeconds;
    }
    TmlIndexerTask task = new TmlIndexerTask(this);
    task.setMaxFilesToProcess(1);
    task.setUploadFolder(Configuration.getTmlFolder() + "upload");
    indexerTimer = new Timer();
    logger.info("TML indexer started every " + seconds + " seconds");
    // Multiply as long so large intervals cannot overflow int.
    indexerTimer.schedule(task, new Date(), seconds * 1000L);
}
}