Repository.java example

Explorer
tml-master
- tml
  - src
    - main
      - java
        SemanticSpace.java
        TmlCommandLine.java
        tml
        Configuration.java
        annotators
        AbstractAnnotator.java
        Annotator.java
        AnnotatorManager.java
        PennTreeAnnotator.java
        corpus
        Corpus.java
        CorpusParameters.java
        Dictionary.java
        ParagraphCorpus.java
        RepositoryCorpus.java
        SearchResultsCorpus.java
        SentenceCorpus.java
        SimpleCorpus.java
        Term.java
        TextDocument.java
        TextPassage.java
        sql
        DbConnection.java
        storage
        DocumentAnnotator.java
        DocumentCleanup.java
        Repository.java
        RepositoryEvent.java
        RepositoryListener.java
        TmlAnnotatorTask.java
        TmlCleanupTask.java
        TmlIndexerTask.java
        importers
        AbstractImporter.java
        HtmlImporter.java
        Importer.java
        PdfImporter.java
        TextImporter.java
        test
        AbstractTmlIndexingTest.java
        utils
        DBUtils.java
        DistanceLib.java
        Highlighting.java
        JDBCUtils.java
        LanczosSVDLIBCUtils.java
        LanczosSVDPACKCUtils.java
        LuceneUtils.java
        MatrixUtils.java
        RegexUtils.java
        StanfordUtils.java
        Stats.java
        WordNetUtils.java
        vectorspace
        EmptyTextPassageException.java
        NoDocumentsInCorpusException.java
        NotEnoughTermsInCorpusException.java
        SVD.java
        SemanticSpace.java
        TermWeighting.java
        TermWeightingException.java
        factorisation
        MatrixFactorisation.java
        MultiDimensionalScalingNR.java
        NonnegativeMatrixFactorisationED.java
        NonnegativeMatrixFactorisationKL.java
        PrincipalCoordinateAnalysis.java
        ProbabilisticLatentSemanticAnalysis.java
        SingularValueDecomposition.java
        SpaceDecomposition.java
        operations
        AbstractOperation.java
        ClassDiscovery.java
        CompoundNounsSummarized.java
        ConceptExtraction.java
        FactorAnalysisPlot.java
        LastPassage.java
        LexiconAnalysis.java
        Operation.java
        OperationEvent.java
        OperationListener.java
        ParagraphCoherenceIndex.java
        PassageDistances.java
        PassageExtractionSummarization.java
        PassagesSimilarity.java
        RapidAutomaticKeywordExtraction.java
        Readability.java
        RelationshipExtraction.java
        Summary.java
        TagClouds.java
        TermExtractionSummarization.java
        results
        AbstractResult.java
        FactorAnalysisPlotResult.java
        LastPassageResult.java
        LexiconAnalysisResult.java
        NullResult.java
        ParagraphCoherenceIndexResult.java
        PassageClusteringLingoResult.java
        PassageDistancesResult.java
        PassageExtractionSummarizationResult.java
        PassageSimilarityResult.java
        RapidAutomaticKeywordExtractionResult.java
        ReadabilityResult.java
        RelationshipExtractionResult.java
        Summary.java
        SummaryResult.java
        TagCloudsResult.java
        TermRankedResult.java
        TermsExtractionSummarizationResult.java
        summarization
        AbstractSummarizationOperation.java
        LatentSemanticAnalysisSummarization.java
        SummarizationOperation.java
        VectorLengthSummarization.java
        visualizations
        AbstractVisualization.java
        TagClouds.java
        Visualization.java
    - test
      - java
        tml
        test
        DbConnectionTest.java
        FactorAnalysisPlotTest.java
        IndexingDocumentsTest.java
        IndexingHtmlTest.java
        IndexingInvalidDocumentsTest.java
        IndexingPlainTextTest.java
        LanczosTest.java
        LuceneSearchTest.java
        NonNegativeMatrixFactorizationTest.java
        RapidAutomaticKeywordExtractionTest.java
        ReadabilityTest.java
        SimpleCorpusTest.java
        StemmingTest.java
        TagCloudsTest.java
        ValidateBerryDumaisTest.java
        ValidateDistancesTest.java
        ValidateHandbookOfLSATest.java
        ValidateIntroToLSATest.java
        ValidateSameDistancesAllDimensions.java
/*******************************************************************************
 *  Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License. 
 *  You may obtain a copy of the License at 
 *  
 *  	http://www.apache.org/licenses/LICENSE-2.0 
 *  	
 *  Unless required by applicable law or agreed to in writing, software 
 *  distributed under the License is distributed on an "AS IS" BASIS, 
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 *  See the License for the specific language governing permissions and 
 *  limitations under the License.
 *******************************************************************************/
package tml.storage;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.sql.SQLException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Timer;

import javax.swing.event.EventListenerList;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

import tml.Configuration;
import tml.annotators.Annotator;
import tml.corpus.CorpusParameters;
import tml.corpus.RepositoryCorpus;
import tml.corpus.TextDocument;
import tml.sql.DbConnection;
import tml.storage.importers.AbstractImporter;
import tml.storage.importers.Importer;
import tml.storage.importers.TextImporter;
import tml.vectorspace.NoDocumentsInCorpusException;

/**
 * <p>
 * This class represents a documents repository. Documents can be inserted,
 * deleted and searched from a Repository. All documents that were successfully
 * inserted in a repository can then later be used to create a {@link Corpus}
 * and perform operations on them.
 * </p>
 * <p>
 * At the heart of a repository lies a {@link TextDocument}, that represents a
 * text document and is accessible using any id of your choice (e.g. from a
 * database, or from the filesystem). The content of a new document is expected
 * to be just plain text. Importers for different formats will be provided in
 * time; for the moment we have only a Wiki cleaner.
 * </p>
 * <p>
 * All the documents, once inserted in the Repository can then be searched using
 * the searchTextDocuments method. Queries are made using the syntax from
 * Apache's Lucene.
 * </p>
 * <p>
 * <em>Code examples</em>
 * </p>
 * <p>
 * Initialising a {@link Repository}:
 * </p>
 * 
 * <pre>
 * Repository repository = new Repository("path/to/repository/folder");
 * </pre>
 * <p>
 * Obtaining all the documents in a Repository
 * </p>
 * 
 * <pre>
 * ...
 * List<TextDocument> documents = repository.getAllTextDocuments();
 * for(TextDocument doc : documents) {
 *   System.out.println("Document:" + doc.getTitle());
 * }
 * ...
 * </pre>
 * <p>
 * Inserting a document
 * </p>
 * 
 * <pre>
 * String content = "The content of my document";
 * String title = "A title";
 * String url = "http://www/mydoc.txt";
 * String id = "TheIdOfMyDoc";
 * repository.addDocument(id, content, title, url);
 * </pre>
 * <p>
 * Obtaining a document from the repository
 * </p>
 * 
 * <pre>
 * String id = "TheIdOfMyDoc";
 * TextDocument doc = repository.getTextDocument(id);
 * </pre>
 * <p>
 * Removing a document from the repository
 * </p>
 * 
 * <pre>
 * TextDocument doc = repository.getTextDocument("someId");
 * repository.deleteDocument(doc);
 * </pre>
 * <p>
 * Searching for documents containing "foo"
 * </p>
 * 
 * <pre>
 * String query = "foo";
 * List<TextDocument> documents = repository.searchTextDocuments(query);
 * for (TextDocument doc : documents) {
 * 	System.out.println("Document found:" + doc.getTitle());
 * }
 * </pre>
 * 
 * @see TextDocument
 * @see Corpus
 * @author Jorge Villalon
 * 
 */
public class Repository {

	/**
	 * Cleans an id (typically a file name) so that it suits the syntax of
	 * Lucene. Spaces, underscores and dots are removed from the id.
	 *
	 * @param id
	 *            the external id of a document
	 * @return the id without spaces, underscores or dots
	 */
	public static String cleanIdForLucene(String id) {
		// String.replace works on literal text (it is not a regular
		// expression), so the dot has to be given as "." to be removed.
		return id.replace(" ", "").replace("_", "").replace(".", "");
	}

	/**
	 * Resets the storage of a {@link Repository}: asks a new
	 * {@link DbConnection} to clean its metadata storage and then replaces
	 * the Lucene index found at the given path with a new, empty one.
	 *
	 * @param indexPath
	 *            the path to the folder where the Lucene index files are stored
	 * @throws CorruptIndexException
	 *             propagated from the Lucene index writer
	 * @throws LockObtainFailedException
	 *             propagated from the Lucene index writer
	 * @throws IOException
	 *             propagated from the Lucene index writer
	 * @throws SQLException
	 *             propagated from the metadata clean-up
	 */
	public static void cleanStorage(String indexPath)
			throws CorruptIndexException, LockObtainFailedException,
			IOException, SQLException {

		new DbConnection().cleanMetaDataStorage();

		// The "create" flag is true, so the writer starts a brand new index;
		// closing it immediately leaves that empty index on disk.
		final boolean createNewIndex = true;
		IndexWriter emptyIndexWriter = new IndexWriter(
				SimpleFSDirectory.open(new File(indexPath)),
				new StandardAnalyzer(Version.LUCENE_29),
				createNewIndex,
				IndexWriter.MaxFieldLength.UNLIMITED);
		emptyIndexWriter.close(true);
	}

	/**
	 * Normalises typographic characters that usually show up in UTF-8 text
	 * pasted from a word processor, so that parsers can recognise quotations
	 * and dashes.
	 * <p>
	 * Single quote variants become <code>'</code>, double quote variants
	 * become <code>"</code> and dash variants become <code>-</code>. A CR+LF
	 * pair is removed, a lone CR or LF becomes a space, the byte order mark
	 * character is removed and the result is trimmed.
	 * </p>
	 *
	 * @param word
	 *            the text to clean
	 * @return the cleaned text
	 */
	public static String cleanWord(String word) {
		final char[] singleQuoteVariants = {
				'\u0060', '\u2018', '\u2019', '\u201A', '\u201B', '\u2032', '\u2035' };
		final char[] doubleQuoteVariants = {
				'\u201C', '\u201D', '\u201E', '\u201F', '\u2033', '\u2036' };
		final char[] dashVariants = {
				'\u2010', '\u2012', '\u2013', '\u2014', '\u2015' };

		String cleaned = word;
		for (char variant : singleQuoteVariants) {
			cleaned = cleaned.replace(variant, '\'');
		}
		for (char variant : doubleQuoteVariants) {
			cleaned = cleaned.replace(variant, '"');
		}
		for (char variant : dashVariants) {
			cleaned = cleaned.replace(variant, '-');
		}

		// Line breaks: a CR+LF pair disappears, a lone CR or LF turns into a space
		cleaned = cleaned.replace("\r\n", "");
		cleaned = cleaned.replace('\r', ' ').replace('\n', ' ');

		// Drop the byte order mark before trimming
		cleaned = cleaned.replace("\uFEFF", "");

		return cleaned.trim();
	}

	/**
	 * Obtains the content of a text file. The file is read line by line, every
	 * line is passed through {@link #cleanWord(String)} and is terminated with
	 * a single \n, so any \r is removed to make further processing easier.
	 * 
	 * @param file
	 *            the file to read
	 * @param charset
	 *            the name of the charset used to decode the file
	 * @return the cleaned content of the file, one \n after every line
	 * @throws IOException
	 *             if the file cannot be opened or read, or the charset is not
	 *             supported
	 */
	public static String getFileContent(File file, String charset) throws IOException {
		StringBuilder content = new StringBuilder();

		FileInputStream stream = new FileInputStream(file);
		try {
			BufferedReader reader = new BufferedReader(
					new InputStreamReader(stream, charset));

			String line = null;
			while ((line = reader.readLine()) != null) {
				// Remove special Unicode characters
				content.append(cleanWord(line));
				content.append('\n');
			}
		} finally {
			// Release the file handle even when decoding or reading fails
			stream.close();
		}

		return content.toString();
	}

	/** The Lucene index writer used by the methods that add documents to the index */
	private IndexWriter luceneIndexWriter = null;
	/** A Lucene index reader (its usage is outside this part of the file) */
	private IndexReader luceneIndexReader = null;
	// General attributes
	/** The logger for log4j */
	private static Logger logger = Logger.getLogger(Repository.class);
	/** Timer for indexing */
	private Timer indexerTimer;
	/** Timer for annotations */
	private Timer annotatorTimer;
	/** Timer for cleanup */
	private Timer cleanupTimer;
	/** The language for the documents in the repository */
	private Locale locale;
	/** The character encoding used to read files from the filesystem */
	private String encoding = "UTF-8";
	/** The parser for the content before inserting into the index */
	private Importer defaultImporter = null;
	// Lucene specific attributes
	/** The standard Lucene analyser for this repository */
	private Analyzer analyzer;
	/** The folder where the repository is kept */
	private String indexPath;
	/** Path to the storage of calculated SVDs */
	private String svdStoragePath;
	/** Path to the folder for temporary files (the "tmp" folder inside the TML folder) */
	private String tmpPath;
	/**
	 * @return the path to the folder for temporary files
	 */
	public String getTmpPath() {
		return tmpPath;
	}
	/** Path to the folder for processed files (the "processed" folder inside the TML folder) */
	private String processedPath;
	/**
	 * @return the path to the folder for processed files
	 */
	public String getProcessedPath() {
		return processedPath;
	}
	/** The stopwords */
	private String[] stopwords;
	/** The field that contains the content of a document */
	private String luceneContentField = "contents";
	/** The field that contains the title of a document */
	private String luceneTitleField = "title";
	/** The field that contains the url of a document */
	private String luceneUrlField = "url";
	/** The field that contains the external ID of a document */
	private String luceneExternalIdField = "externalid";
	/** The field that contains the ID of the direct parent of a passage */
	private String luceneParentField = "reference";
	/** The field that contains the ID of the document a passage belongs to */
	private String luceneParentDocumentField = "parent";
	/** The path to the execution folder */
	private String execPath = "";

	/**
	 * @return the path to the execution folder
	 */
	public String getExecPath() {
		return execPath;
	}

	/**
	 * @param execPath the path to the execution folder
	 */
	public void setExecPath(String execPath) {
		this.execPath = execPath;
	}
	/** The connection to the database that stores the metadata */
	private DbConnection dbConnection = null;

	/**
	 * @return the connection to the metadata database
	 */
	public DbConnection getDbConnection() {
		return dbConnection;
	}

	/**
	 * @return the luceneParentDocumentField
	 */
	public String getLuceneParentDocumentField() {
		return luceneParentDocumentField;
	}
	/** The field that contains the PennTree bank parse */
	private String lucenePenntreeField = "penntree";
	/** The field that contains the type of the passage */
	private String luceneTypeField = "type";
	/** The maximum number of documents to index every time the indexing is called */
	private int maxDocumentsToIndex = -1;
	// Metadata annotations specific attributes
	/** The list of annotators that will be used on indexing */
	private List<Annotator> annotators = null;
	/** The listeners that are notified of the events fired by the repository */
	private EventListenerList listeners = null;

	/**
	 * Creates a new instance of the class {@link Repository} whose Lucene
	 * index is kept in the "lucene" folder inside the TML folder given by
	 * {@link Configuration#getTmlFolder()}.
	 *
	 * @throws IOException
	 * @throws SQLException
	 */
	public Repository() throws IOException, SQLException {
		this(Configuration.getTmlFolder() + "/lucene");
	}

	/**
	 * Creates a new instance of the class {@link Repository} for documents in
	 * English: it delegates to {@link #Repository(String, Locale)} with the
	 * locale "en".
	 *
	 * @param luceneIndexPath
	 *            an absolute path to the folder that stores the Lucene Index
	 * @throws IOException
	 * @throws SQLException 
	 */
	public Repository(String luceneIndexPath) throws IOException, SQLException {
		this(luceneIndexPath, new Locale("en"));
	}

	/**
	 * Creates a new instance of the class {@link Repository} for documents
	 * written in the language of the given locale.
	 * <p>
	 * The constructor validates the Lucene index (recreating the storage when
	 * it cannot be opened), creates the "svd", "tmp" and "processed" folders
	 * inside the TML folder when they are missing, loads the stopwords for the
	 * language, builds a Snowball analyzer, opens the metadata database
	 * connection, loads the annotators listed in the property
	 * <code>tml.annotators</code> and starts the indexer, annotator and cleanup
	 * timers whose <code>tml.*.run</code> property equals "true".
	 * </p>
	 * 
	 * @param luceneIndexPath
	 *            the path to the folder that stores the Lucene index, it must
	 *            exist
	 * @param locale
	 *            the language of the documents in the repository
	 * @throws IOException
	 *             if the repository folder doesn't exist, no stopwords can be
	 *             loaded or the index cannot be opened
	 * @throws SQLException
	 */
	public Repository(String luceneIndexPath, Locale locale) throws IOException, SQLException {
		assert (luceneIndexPath != null);
		assert (locale != null);

		// Read default properties and initialize log4j
		Configuration.getTmlProperties(true);

		this.indexPath = luceneIndexPath;
		File folder = new File(this.indexPath);
		if (!folder.exists()) {
			String message = "Repository folder doesn't exist ["
					+ this.indexPath + "]";
			logger.error(message);
			throw new IOException(message);
		}

		this.locale = locale;
		this.defaultImporter = new TextImporter();
		this.annotators = new ArrayList<Annotator>();

		logger.info("TML initialization");
		logger.debug("Context Path:\t\t" + Configuration.getContextPath());
		logger.info("Repository path:\t" + this.indexPath);

		// The searcher is opened only to check that the index is readable, so
		// it is closed again as soon as the check is done.
		IndexSearcher probe = null;
		try {
			probe = new IndexSearcher(SimpleFSDirectory.open(new File(luceneIndexPath)), true);
			logger.info("Repository:\t\tLucene initialized");
		} catch (Exception e1) {
			logger.warn("Repository:\t\tLucene index corrupt or inexistent, recreating");
			Repository.cleanStorage(luceneIndexPath);
		} finally {
			if (probe != null) {
				probe.close();
			}
		}

		this.svdStoragePath = Configuration.getTmlFolder() + "/svd";
		File svdFolder = new File(svdStoragePath);
		if(!svdFolder.exists())
			svdFolder.mkdir();
		logger.debug("Cache:\t\t\tSVDs stored in " + this.svdStoragePath);

		this.tmpPath =  Configuration.getTmlFolder() + "/tmp";
		File tmpFolder = new File(this.tmpPath);
		if(!tmpFolder.exists())
			tmpFolder.mkdir();
		logger.debug("Temp:\t\t\tTemporary files in " + this.tmpPath);

		this.processedPath =  Configuration.getTmlFolder() + "/processed";
		File processedFolder = new File(this.processedPath);
		if(!processedFolder.exists())
			processedFolder.mkdir();
		logger.debug("Indexer:\t\tProcessed files in " + this.processedPath);

		File stopWordsFile = new File(Configuration.getTmlFolder() + "/stopwords/stopwords_" + this.locale.getLanguage() + ".txt");

		if (!stopWordsFile.exists()) {
			// No stopwords file in the TML folder, use the ones in the classpath
			InputStream stream = this.getClass().getResourceAsStream(
					"/tml/stopwords_" + this.locale.getLanguage() + ".txt");
			if (stream == null) {
				logger.info("Failed to load stopwords for language "
						+ this.locale.getLanguage()
						+ ", falling to english");
				stream = this.getClass().getResourceAsStream("/tml/stopwords.txt");
			}
			if (stream == null) {
				// Without this check the reader below fails with an
				// uninformative NullPointerException
				String message = "No stopwords found for language "
						+ this.locale.getLanguage()
						+ " and the default stopwords are missing";
				logger.error(message);
				throw new IOException(message);
			}
			this.stopwords = getStopWordsFromBufferedReader(new BufferedReader(
					new InputStreamReader(stream)));
		} else {
			this.stopwords = getStopWordsFromFile(stopWordsFile);
		}

		logger.debug("Stopwords:\t\tUsing " + this.locale.getDisplayLanguage(Locale.ENGLISH) + " (" + this.stopwords.length + " stopwords)");

		// The Snowball stemmer is selected by the English name of the language
		String snowballLang = this.locale.getDisplayLanguage(Locale.ENGLISH);

		this.analyzer = new SnowballAnalyzer(
				Version.LUCENE_29,
				snowballLang, 
				this.stopwords);

		logger.debug("Stemming:\t\tUsing " + this.analyzer.toString() + " " + snowballLang);

		// TODO: Recognize when and how to analyze Korean, Chinese or some other languages
		// this.analyzer = new CJKAnalyzer(this.stopwords);

		// Check DB connection for metadata
		this.dbConnection = new DbConnection();

		// Loads default annotators
		String annotatorNames = Configuration.getTmlProperties().getProperty(
				"tml.annotators");

		if(annotatorNames != null && annotatorNames.length() > 0) {
			logger.debug("Annotators:\t\tLoading defaults");
			for (String annotatorName : annotatorNames.split(",")) {
				if (annotatorName.trim().length() == 0) {
					continue;
				}

				try {
					Class<?> classDefinition = Class.forName("tml.annotators." + annotatorName);
					Annotator annotator = (Annotator) classDefinition.newInstance();
					// Initialise first, so an annotator that fails to
					// initialise is never kept in the list
					annotator.init();
					this.annotators.add(annotator);
				} catch (Exception e) {
					logger.error("Default annotator not found! " + annotatorName);
					logger.error(e);
					continue;
				}
			}
		}

		this.listeners = new EventListenerList();

		// Opening and closing the writer validates that the index is writable
		try {
			this.openIndexWriter();
			this.closeIndexWriter();
		} catch (IOException e) {
			logger.error("Failed to open the Lucene index writer", e);
			throw e;
		}

		// "true".equals(...) keeps a missing property from raising a
		// NullPointerException, a missing property means "don't run"
		if("true".equals(Configuration.getTmlProperties().getProperty("tml.indexer.run")))
			initializeIndexerTimer();

		if("true".equals(Configuration.getTmlProperties().getProperty("tml.annotator.run")))
			initializeAnnotatorTimer();

		if("true".equals(Configuration.getTmlProperties().getProperty("tml.cleanup.run")))
			initializeCleanupTimer();

		logger.info("TML initialized");
	}

	/**
	 * Obtains the documents registered in the metadata database, as returned
	 * by the database connection.
	 *
	 * @return the documents, or <code>null</code> if they could not be
	 *         obtained (the error is logged)
	 */
	public String[][] getAllDocuments() {
		String[][] documents = null;
		try {
			documents = this.getDbConnection().getDocuments();
		} catch (Exception failure) {
			failure.printStackTrace();
			logger.error(failure);
		}
		return documents;
	}

	/**
	 * Registers an annotator in the repository. The annotator is initialised
	 * and added only when no annotator with the same field name is registered
	 * yet, otherwise the call is just logged.
	 *
	 * @param annotator the annotator to register
	 */
	public void addAnnotator(Annotator annotator) {
		if (this.containsAnnotator(annotator)) {
			logger.debug("Annotator " + annotator.getFieldName() + " already loaded!");
			return;
		}

		annotator.init();
		this.annotators.add(annotator);
	}

	/**
	 * Tells if an annotator with the same field name is already registered.
	 *
	 * @param annotator the annotator to look for
	 * @return true if a registered annotator has the same field name
	 */
	private boolean containsAnnotator(Annotator annotator) {
		boolean alreadyLoaded = false;
		for (Annotator candidate : this.annotators) {
			if (annotator.getFieldName().equals(candidate.getFieldName())) {
				alreadyLoaded = true;
				break;
			}
		}
		return alreadyLoaded;
	}

	/**
	 * Registers a listener, so the Repository can report asynchronously
	 * the state of the processing.
	 *
	 * @param l the listener to register
	 */
	public void addRepositoryListener(RepositoryListener l) {
		listeners.add(RepositoryListener.class, l);
	}

	/**
	 * Unregisters a listener that was previously registered, if it exists.
	 *
	 * @param l the listener to unregister
	 */
	public void removeRepositoryListener(RepositoryListener l) {
		listeners.remove(RepositoryListener.class, l);
	}

	/**
	 * Notifies every registered {@link RepositoryListener} of an event of
	 * the Repository.
	 *
	 * @param evt the event object handed to the listeners
	 */
	private void doRepositoryAction(RepositoryEvent evt) {
		RepositoryListener[] registered = this.listeners.getListeners(RepositoryListener.class);
		for (int i = 0; i < registered.length; i++) {
			registered[i].repositoryAction(evt);
		}
	}

	/**
	 * Adds a new document to the repository. The index writer is opened for
	 * the operation and is always closed afterwards, also when adding the
	 * document fails.
	 *
	 * @param externalId
	 *            an external id to identify the document
	 * @param content
	 *            the raw content of the document
	 * @param title
	 *            the title of the document
	 * @param url
	 *            a url to find the document (optional)
	 * @param importer
	 *            an importer (how to decode the content), if it is null the
	 *            default importer of the repository is used
	 * @throws IOException
	 * @throws SQLException 
	 */
	public void addDocument(String externalId, String content, String title,
			String url, Importer importer) throws IOException, SQLException {
		logger.debug("Adding document " + title + " with id:" + externalId);

		// The content is not cleaned here: addDocumentToOpenIndex already runs
		// the importer (or the default one) on it, and cleaning it a second
		// time could damage content that is only safe to decode once.
		this.openIndexWriter();
		try {
			this.addDocumentToOpenIndex(externalId, content, title, url, importer);
		} finally {
			// Release the writer even if the document could not be added
			closeIndexWriter();
		}
	}

	/**
	 * Adds the files found in a folder into the Lucene Index, without a limit
	 * on the number of files. It is the same as calling
	 * {@link #addDocumentsInFolder(String, int)} with -1 as the maximum.
	 *
	 * @param folder
	 *            an absolute path to the folder that contains the files
	 * @throws IOException
	 */
	public void addDocumentsInFolder(String folder) throws IOException {
		final int noLimit = -1;
		this.addDocumentsInFolder(folder, noLimit);
	}

	/**
	 * Adds the files found in a folder into the Lucene Index, up to a maximum.
	 * Files whose name starts with a dot are ignored. The files are sorted by
	 * name and, when a maximum is given, only the first ones are added. The
	 * resulting list is handed to {@link #addDocumentsInList(File[])}.
	 *
	 * @param folder
	 *            an absolute path to the folder that contains the files
	 * @param maxDocs
	 *            the maximum number of documents to index, zero or a negative
	 *            number means no limit
	 * @throws FileNotFoundException
	 *             if the folder doesn't exist or is not a directory
	 * @throws IOException
	 *             if the folder cannot be listed or the files cannot be added
	 */
	public void addDocumentsInFolder(String folder, int maxDocs) throws IOException {

		logger.debug("Adding text files from " + folder);

		File corpusFile = new File(folder);

		if (!corpusFile.exists() || !corpusFile.isDirectory()) {
			throw new FileNotFoundException(
					"Invalid corpus folder, it doesn't exists! (" + folder + ")");
		}

		String[] names = corpusFile.list(new FilenameFilter() {

			public boolean accept(File dir, String name) {
				return !name.startsWith(".");
			}
		});
		// File.list returns null when the folder cannot be read
		if (names == null) {
			throw new IOException("Unable to list the files in folder " + folder);
		}

		// Sort the file names so they are always processed in the same order
		List<String> files = new ArrayList<String>(names.length);
		for (String name : names) {
			files.add(name);
		}
		Collections.sort(files);

		int total = files.size();
		if (maxDocs > 0 && maxDocs < total) {
			total = maxDocs;
		}

		// Create the list of files from the list of file names
		File[] fileList = new File[total];
		for (int i = 0; i < total; i++) {
			fileList[i] = new File(corpusFile, files.get(i));
		}

		this.addDocumentsInList(fileList);
	}

	/**
	 * Adds all the files in the list to the repository. The extension of each
	 * file is used to obtain an importer, files for which no importer is
	 * available are ignored, as well as folders and files starting with a dot
	 * ".". The title of each document is its file name without the extension
	 * and its external id is the title cleaned with
	 * {@link #cleanIdForLucene(String)}.
	 * <p>
	 * A file that cannot be read or indexed is logged and skipped. A database
	 * error stops the process. In every case the index writer is closed before
	 * returning.
	 * </p>
	 *
	 * @param fileList
	 *            the files to add
	 * @throws CorruptIndexException 
	 * @throws IOException
	 *             if the index cannot be opened or closed, or if the database
	 *             fails (the SQLException is the cause)
	 */
	public void addDocumentsInList(File[] fileList) throws CorruptIndexException, IOException {

		long time = System.currentTimeMillis();

		this.openIndexWriter();

		// processed drives the progress events, added counts only successes
		int processed = 0;
		int added = 0;
		try {
			logger.debug("Adding files using encoding " + this.encoding);

			doRepositoryAction(new RepositoryEvent(this, "addingDocument", 0, fileList.length));
			for (File f : fileList) {
				String fileName = f.getName();
				if (f.isDirectory() || fileName.startsWith(".")) {
					logger.debug("Ignoring document " + fileName);
					continue;
				}

				// Calculating the file extension (e.g. .txt or .html)
				String[] pieces = fileName.split("\\.");
				String extension = pieces[pieces.length - 1];

				// We use the file extension to get an importer
				Importer importer = AbstractImporter.createImporter(extension);
				if (importer == null) {
					logger.info("Don't know how to parse ." + extension
							+ " files, ignoring " + fileName);
					continue;
				}

				logger.debug("Using importer " + importer.getClass().getName());
				try {
					String content = getFileContent(f, this.encoding);
					// Only the trailing extension is removed, a name such as
					// "a.txt.old.txt" keeps the ".txt" in the middle
					String suffix = "." + extension;
					String title = fileName;
					if (fileName.endsWith(suffix)) {
						title = fileName.substring(0, fileName.length() - suffix.length());
					}
					String url = f.getAbsolutePath();
					String externalid = cleanIdForLucene(title);
					logger.debug("Adding document " + processed + ":" + fileName);
					this.addDocumentToOpenIndex(externalid, content, title, url,
							importer);
					added++;
				} catch (IOException e) {
					// A single unreadable file must not stop the whole batch
					logger.error("Failed to load content or adding document to index for file " + f, e);
				} catch (SQLException e) {
					logger.error("Fatal error insterting documents in the database", e);
					throw new IOException(e);
				} finally {
					processed++;
					doRepositoryAction(new RepositoryEvent(this, "addingDocument", processed, fileList.length));
				}
			}
		} finally {
			// Close the writer also when the batch is aborted, otherwise the
			// index stays open for writing
			this.closeIndexWriter();
		}

		time = System.currentTimeMillis() - time;

		doRepositoryAction(new RepositoryEvent(this, "addingDocument", fileList.length, fileList.length));
		logger.info("Successfully added " + added + " documents in " + time
				+ " ms");
	}

	/**
	 * Cleans the content of a document and adds it, together with its
	 * paragraphs and sentences, to the index that is currently open.
	 *
	 * @param externalId
	 *            the external id of the document
	 * @param content
	 *            the raw content of the document
	 * @param title
	 *            the title of the document
	 * @param url
	 *            the url of the document
	 * @param importer
	 *            the importer used to clean the content, if it is null the
	 *            default importer is used (when there is one)
	 * @throws IOException
	 * @throws SQLException
	 */
	private void addDocumentToOpenIndex(String externalId, String content,
			String title, String url, Importer importer)
					throws IOException, SQLException {

		Importer cleaner = (importer != null) ? importer : this.defaultImporter;

		String cleanContent = content;
		if (cleaner != null) {
			cleanContent = cleaner.getCleanContent(content);
		}
		if (cleanContent == null) {
			cleanContent = "";
		}

		// A whole document has no parent, so the text "null" is given for both
		Document luceneDocument = this.createDocument(cleanContent,
				"document",
				"null",
				"null",
				externalId,
				title,
				url);

		// The passages are added first, then the document itself
		this.addSegmentsInDocument(cleanContent, luceneDocument, externalId);
		this.addDocumentToOpenIndex(luceneDocument);
	}

	/**
	 * Chops a content in pieces and adds a new document for each piece into the
	 * Lucene Index. Every line with at least two characters is added as a
	 * passage of type "paragraph", and every sentence of the line with at
	 * least two characters is added as a passage of type "sentence" that
	 * refers to its paragraph. The process stops at the first line recognised
	 * as the title of the bibliography.
	 *
	 * @param content
	 *            the content of the document to chop
	 * @param document
	 *            the Lucene Document, its title is used to title the passages
	 * @param docId
	 *            the id of the document
	 * @throws IOException
	 * @throws SQLException 
	 */
	private void addSegmentsInDocument(String content, Document document,
			String docId) throws IOException, SQLException {

		String title = document.get(this.getLuceneTitleField());
		logger.debug("Adding segments to document " + docId + "[" + title
				+ "]");

		BufferedReader strReader = new BufferedReader(new StringReader(content));
		// One iterator is enough, setText() resets it for every paragraph
		BreakIterator iterator = BreakIterator.getSentenceInstance(this.locale);

		String line = null;
		int sentenceNumber = 0;
		int paragraphNumber = 0;
		int ignoredLines = 0;
		int ignoredSentences = 0;
		logger.debug("Parsing text with " + this.locale);
		while ((line = strReader.readLine()) != null) {
			if (line.length() < 2) {
				ignoredLines++;
				continue;
			}

			// Everything from the bibliography title on is left out
			String lowLine = line.trim().toLowerCase().replaceAll("\\W", "");
			if (isBibliographyTitle(lowLine)) {
				break;
			}

			// Split the paragraph in sentences
			iterator.setText(line);
			List<String> sentencesList = new ArrayList<String>();
			int start = iterator.first();
			for (int end = iterator.next(); end != BreakIterator.DONE; end = iterator.next()) {
				sentencesList.add(line.substring(start, end));
				start = end;
			}

			paragraphNumber++;
			String paragraphExtId = "p" + paragraphNumber + "d" + docId;
			this.addTextPassageToOpenIndex(
					line,
					"paragraph",
					docId,
					docId,
					paragraphExtId,
					"Paragraph " + paragraphNumber + " of " + title,
					"N/A");

			int numSentence = 0;
			doRepositoryAction(new RepositoryEvent(this, "addingSentence", 0, sentencesList.size()));
			for (String sentence : sentencesList) {
				if (sentence.length() < 2) {
					ignoredSentences++;
					continue;
				}
				numSentence++;
				sentenceNumber++;
				doRepositoryAction(new RepositoryEvent(this, "addingSentence", numSentence, sentencesList.size()));

				// The url marks the last sentence of the paragraph.
				// NOTE(review): numSentence skips ignored sentences, so no
				// sentence is marked "last" when one was ignored - confirm
				// whether that is intended.
				String url = "N/A";
				if (numSentence == sentencesList.size()) {
					url = "last";
				}
				String sentenceExtId = "s" + sentenceNumber + "d" + docId;
				this.addTextPassageToOpenIndex(
						sentence,
						"sentence",
						paragraphExtId,
						docId,
						sentenceExtId,
						"Sentence " + sentenceNumber + " of " + title,
						url);
			}
		}

		doRepositoryAction(new RepositoryEvent(this, "addingSentence", 100, 100));

		logger.debug("Added " + paragraphNumber + " paragraphs and "
				+ sentenceNumber + " sentences.");
		logger.debug("Ignored " + ignoredLines + " paragraphs and "
				+ ignoredSentences + " sentences.");
	}

	/**
	 * Inserts a new text passage into the Repository. The passage is
	 * registered in the metadata database and then written to the open Lucene
	 * index, replacing any document stored with the same external id.
	 *
	 * @param content
	 *            the content of the passage
	 * @param type
	 *            the type of the passage ("document", "sentence" or
	 *            "paragraph")
	 * @param parent
	 *            the id of the direct parent of the passage
	 * @param parentDocument
	 *            the id of the document the passage belongs to
	 * @param externalId
	 *            the external id of the passage
	 * @param title
	 *            the title of the passage
	 * @param url
	 *            the url of the passage
	 * @return the Lucene Document that was just added
	 * @throws IOException
	 * @throws SQLException 
	 */
	private Document addTextPassageToOpenIndex(String content, String type,
			String parent, String parentDocument, String externalId, String title, String url) throws IOException, SQLException {

		Document document = new Document();
		document.add(new Field(this.getLuceneContentField(), content,
				Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS));
		document.add(new Field(this.getLuceneExternalIdField(), externalId,
				Store.YES, Index.NOT_ANALYZED, TermVector.NO));
		document.add(new Field(this.getLuceneTitleField(), title, Store.YES,
				Index.NOT_ANALYZED, TermVector.NO));
		document.add(new Field(this.getLuceneUrlField(), url, Store.YES,
				Index.NOT_ANALYZED, TermVector.NO));
		document.add(new Field("indexdate", Calendar.getInstance().getTime().toString(), Store.YES, Index.NOT_ANALYZED, TermVector.NO));
		document.add(new Field(this.getLuceneParentField(), parent, Store.YES,
				Index.NOT_ANALYZED, TermVector.NO));
		// "type" and "parent" are the same names held by the fields
		// luceneTypeField and luceneParentDocumentField of this class
		document.add(new Field("type", type, Store.YES, Index.NOT_ANALYZED,
				TermVector.NO));
		document.add(new Field("parent", parentDocument, Store.YES, Index.NOT_ANALYZED,
				TermVector.NO));

		this.getDbConnection().insertDocument(this, document);

		// The term must use the same field name the external id was stored
		// under above, otherwise the previous version would never be replaced.
		// (assumes getLuceneExternalIdField() returns "externalid" by default,
		// as the field initialiser suggests)
		Term term = new Term(this.getLuceneExternalIdField(), externalId);

		luceneIndexWriter.updateDocument(term, document);

		return document;
	}

	/**
	 * Stores an already built Lucene document: it is first inserted through
	 * the database connection and then written to the open index, replacing
	 * any indexed document that carries the same external id.
	 *
	 * @param document
	 *            the Lucene document to store; its external id is read from
	 *            the field named by {@link #getLuceneExternalIdField()}
	 * @return the same document that was passed in
	 * @throws IOException
	 *             if the index update fails
	 * @throws SQLException
	 *             if the database insert fails
	 */
	private Document addDocumentToOpenIndex(Document document) throws IOException, SQLException {

		this.getDbConnection().insertDocument(this, document);

		// The update term must target the field the id is actually stored
		// under, otherwise the previous version of the document is never
		// replaced and duplicates accumulate in the index.
		String externalIdField = this.getLuceneExternalIdField();
		Term term = new Term(externalIdField, document.get(externalIdField));

		luceneIndexWriter.updateDocument(term, document);

		return document;
	}

	/**
	 * Assembles the Lucene document that represents a text passage, without
	 * storing it anywhere.
	 * <p>
	 * The content is the only analysed field (with positional term vectors);
	 * every other field is stored as a single, non-analysed value.
	 *
	 * @param content
	 *            the text of the passage
	 * @param type
	 *            the type of the passage ("document", "sentence" or
	 *            "paragraph")
	 * @param parent
	 *            the value stored in the field named by
	 *            {@link #getLuceneParentField()}
	 * @param parentDocument
	 *            the value stored in the "parent" field
	 * @param externalId
	 *            the external id of the passage
	 * @param title
	 *            the title of the passage
	 * @param url
	 *            the url of the passage
	 * @return the newly built Lucene Document
	 * @throws IOException
	 */
	private Document createDocument(String content, String type,
			String parent, String parentDocument, String externalId, String title, String url) throws IOException {

		Document passage = new Document();
		passage.add(new Field(this.getLuceneContentField(), content,
				Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS));

		// Remaining fields share the same storage options, in this order.
		String indexedAt = Calendar.getInstance().getTime().toString();
		String[][] keywordFields = {
				{ this.getLuceneExternalIdField(), externalId },
				{ this.getLuceneTitleField(), title },
				{ this.getLuceneUrlField(), url },
				{ "indexdate", indexedAt },
				{ this.getLuceneParentField(), parent },
				{ "type", type },
				{ "parent", parentDocument } };
		for (String[] nameAndValue : keywordFields) {
			passage.add(new Field(nameAndValue[0], nameAndValue[1],
					Store.YES, Index.NOT_ANALYZED, TermVector.NO));
		}

		return passage;
	}

	/**
	 * Starts a {@link DocumentAnnotator} for this repository on a new thread.
	 *
	 * @return the thread, already started, that runs the annotator
	 */
	public Thread annotateDocuments() {
		Thread annotationThread = new Thread(new DocumentAnnotator(this));
		annotationThread.start();
		return annotationThread;
	}

	/**
	 * Commits, optimizes and closes the current index writer. Does nothing
	 * when no writer has been opened.
	 *
	 * @throws CorruptIndexException
	 * @throws IOException
	 *             if committing, optimizing or closing the index fails
	 */
	private void closeIndexWriter() throws CorruptIndexException, IOException {
		if (luceneIndexWriter == null) {
			return;
		}

		try {
			luceneIndexWriter.commit();
			luceneIndexWriter.optimize(true);
		} finally {
			// Always close, even when commit/optimize fails, so the writer
			// does not keep holding the index write lock.
			luceneIndexWriter.close(true);
		}
	}

	/**
	 * Deletes a document from the repository index, together with every
	 * indexed entry whose parent-document field holds the document's external
	 * id. A TextDocument object must be used, so the document must first be
	 * obtained from the repository.
	 * <p>
	 * If one of the deletions fails, the pending changes are rolled back and
	 * the writer is closed, so the index write lock is always released.
	 *
	 * @param document
	 *            the document to delete
	 * @throws IOException
	 *             if the index cannot be opened, modified or closed
	 */
	public void deleteTextDocument(TextDocument document) throws IOException {
		logger.info("Deleting document " + document);
		String externalId = document.getExternalId();

		this.openIndexWriter();
		boolean deleted = false;
		try {
			luceneIndexWriter.deleteDocuments(
					new Term(this.luceneExternalIdField, externalId));
			luceneIndexWriter.deleteDocuments(
					new Term(this.luceneParentDocumentField, externalId));
			deleted = true;
		} finally {
			if (deleted) {
				this.closeIndexWriter();
			} else {
				// Discard the partial deletion; rollback also closes the
				// writer and releases the write lock.
				luceneIndexWriter.rollback();
			}
		}
	}

	/**
	 * Lists every document in the repository as a {@link TextDocument}.
	 * <p>
	 * A {@link RepositoryCorpus} is loaded with the no-reduction parameters
	 * and each of its passages is resolved through
	 * {@link #getTextDocument(String)}.
	 *
	 * @return a list of {@link TextDocument}, empty when loading the corpus
	 *         reports that there are no documents
	 * @throws Exception
	 *             any other failure while loading the corpus or retrieving a
	 *             document
	 */
	public List<TextDocument> getAllTextDocuments() throws Exception {
		RepositoryCorpus allPassages = new RepositoryCorpus();
		List<TextDocument> result = new ArrayList<TextDocument>();
		try {
			allPassages.setParameters(CorpusParameters.getNoReductionParameters());
			allPassages.load(this);
		} catch (NoDocumentsInCorpusException emptyRepository) {
			return result;
		}
		for (String passageId : allPassages.getPassages()) {
			String luceneId = cleanIdForLucene(passageId);
			result.add(getTextDocument(luceneId));
		}
		return result;
	}

	/**
	 * Returns the Lucene {@link Analyzer} used by this {@link Repository}.
	 *
	 * @return the analyzer
	 */
	public Analyzer getAnalyzer() {
		return this.analyzer;
	}

	/**
	 * Returns the annotators registered in this repository.
	 *
	 * @return the repository's own list of annotators (not a copy)
	 */
	public List<Annotator> getAnnotators() {
		return this.annotators;
	}

	/**
	 * Reads one stored field of a document identified by its external id.
	 *
	 * @param externalId the id of the document
	 * @param fieldname the name of the field to retrieve
	 * @return the content of the field as returned by the Lucene document
	 * @throws IOException if the document cannot be found or read
	 */
	public String getDocumentField(String externalId, String fieldname) throws IOException {
		return getLuceneDocument(externalId).get(fieldname);
	}

	/**
	 * Returns the character encoding configured for this repository.
	 *
	 * @return the encoding used by TML
	 */
	public String getEncoding() {
		return this.encoding;
	}

	/**
	 * Returns the location of the Lucene index on disk.
	 *
	 * @return the path to the Lucene index
	 */
	public String getIndexPath() {
		return this.indexPath;
	}

	/**
	 * Obtains a read-only IndexReader for the Lucene index. The cached reader
	 * is reused while it is still current; otherwise a new one is opened on
	 * the index path and cached.
	 *
	 * @return the IndexReader
	 * @throws IOException if the index cannot be opened
	 */
	public IndexReader getIndexReader() throws IOException {
		if (luceneIndexReader != null && luceneIndexReader.isCurrent()) {
			return luceneIndexReader;
		}
		// NOTE(review): a stale reader is replaced here without being closed;
		// confirm whether searchers handed out earlier still depend on it.
		File indexFolder = new File(this.indexPath);
		luceneIndexReader = IndexReader.open(SimpleFSDirectory.open(indexFolder), true);
		return luceneIndexReader;
	}

	/**
	 * Creates a new IndexSearcher on top of the reader returned by
	 * {@link #getIndexReader()}.
	 *
	 * @return the IndexSearcher
	 * @throws IOException if the index reader cannot be obtained
	 */
	public IndexSearcher getIndexSearcher() throws IOException {
		IndexReader currentReader = this.getIndexReader();
		return new IndexSearcher(currentReader);
	}

	/**
	 * Returns the locale configured for this repository.
	 *
	 * @return the {@link Locale} being used by TML
	 */
	public Locale getLocale() {
		return this.locale;
	}

	/**
	 * Returns the name of the Lucene index field that holds the content of a
	 * passage.
	 *
	 * @return the name of the content field
	 */
	public String getLuceneContentField() {
		return this.luceneContentField;
	}

	/**
	 * Retrieves the stored Lucene document that matches an external id.
	 *
	 * @param externalId the id of the document
	 * @return the Lucene document
	 * @throws IOException if no document matches the id, or the index cannot
	 *             be read
	 */
	private Document getLuceneDocument(String externalId) throws IOException {
		final TopDocs found = getLuceneDocumentHits(externalId);
		if (found != null) {
			int luceneId = found.scoreDocs[0].doc;
			return this.getIndexSearcher().doc(luceneId);
		}
		throw new IOException("Document " + externalId + " not found!");
	}

	/**
	 * Searches the index for the single document with the given external id.
	 *
	 * @param externalId the id to look for; it is placed as-is inside a
	 *            Lucene query string
	 * @return the search hits containing exactly one document, or
	 *         <code>null</code> when the id cannot be parsed as a query or no
	 *         document matches
	 * @throws IOException if the search fails or more than one document
	 *             matches the id
	 */
	private TopDocs getLuceneDocumentHits(String externalId) throws IOException {
		final String queryText = "externalid:" + externalId;
		QueryParser idParser = new QueryParser(Version.LUCENE_29,
				this.getLuceneContentField(),
				new KeywordAnalyzer());

		Query idQuery;
		try {
			idQuery = idParser.parse(queryText);
		} catch (ParseException e) {
			logger.error("Invalid externalId:" + externalId);
			e.printStackTrace();
			return null;
		}

		TopDocs matches = this.getIndexSearcher().search(idQuery, 9999);
		if (matches.totalHits > 1) {
			throw new IOException("The query returned more than one document");
		}
		return matches.totalHits < 1 ? null : matches;
	}

	/**
	 * Returns the name of the Lucene index field that holds the external id
	 * of a passage.
	 *
	 * @return the name of the external id field
	 */
	public String getLuceneExternalIdField() {
		return this.luceneExternalIdField;
	}

	/**
	 * Returns the name of the Lucene index field that holds the parent of a
	 * passage.
	 *
	 * @return the name of the parent field
	 */
	public String getLuceneParentField() {
		return this.luceneParentField;
	}

	/**
	 * Returns the name of the Lucene index field that holds the PennTree
	 * bank string.
	 *
	 * @return the name of the PennTree field
	 */
	public String getLucenePenntreeField() {
		return this.lucenePenntreeField;
	}

	/**
	 * Returns the name of the Lucene index field that holds the title of a
	 * passage.
	 *
	 * @return the name of the title field
	 */
	public String getLuceneTitleField() {
		return this.luceneTitleField;
	}

	/**
	 * Returns the name of the Lucene index field that holds the type of a
	 * Lucene document (document, paragraph or sentence).
	 *
	 * @return the name of the type field
	 */
	public String getLuceneTypeField() {
		return this.luceneTypeField;
	}

	/**
	 * Returns the name of the Lucene index field that holds the url of a
	 * passage.
	 *
	 * @return the name of the url field
	 */
	public String getLuceneUrlField() {
		return this.luceneUrlField;
	}

	/**
	 * Returns the configured value of <code>maxDocumentsToIndex</code>.
	 *
	 * @return the maxDocumentsToIndex
	 */
	public int getMaxDocumentsToIndex() {
		return this.maxDocumentsToIndex;
	}

	/**
	 * Returns the default {@link Importer} of this {@link Repository}, used
	 * to transform content before it is inserted.
	 *
	 * @return the default {@link Importer}
	 */
	public Importer getParser() {
		return this.defaultImporter;
	}

	/**
	 * Returns the stopwords used to analyse and parse documents.
	 *
	 * @return the repository's own stopwords array (not a copy)
	 */
	public String[] getStopwords() {
		return this.stopwords;
	}

	/**
	 * Reads every remaining line from a reader and returns them as an array,
	 * one entry per line and in reading order. The reader is not closed.
	 *
	 * @param reader
	 *            the source of the stopwords, one per line
	 * @return an array with the lines that were read
	 * @throws IOException
	 *             if reading from the reader fails
	 */
	private String[] getStopWordsFromBufferedReader(BufferedReader reader)
			throws IOException {
		List<String> collected = new ArrayList<String>();
		for (String current = reader.readLine(); current != null; current = reader.readLine()) {
			collected.add(current);
		}
		return collected.toArray(new String[collected.size()]);
	}

	/**
	 * Processes a stopwords file and returns each line in an array. It's
	 * useful to transform a stopwords file into the list that Lucene needs.
	 * <p>
	 * When no file is given, the <code>stopwords.txt</code> resource at the
	 * root of the classpath is read instead. The underlying reader is always
	 * closed before returning.
	 *
	 * @param file
	 *            the stopwords file, or <code>null</code> to use the bundled
	 *            resource
	 * @return an array of stop words
	 * @throws IOException
	 *             if the file cannot be read, or the bundled resource is
	 *             missing
	 */
	private String[] getStopWordsFromFile(File file) throws IOException {
		BufferedReader reader;
		if (file != null) {
			reader = new BufferedReader(new FileReader(file));
		} else {
			// ClassLoader resource names must not start with '/'; with the
			// leading slash the lookup yields null and the reader creation
			// fails with a NullPointerException.
			java.io.InputStream bundled = ClassLoader.getSystemResourceAsStream("stopwords.txt");
			if (bundled == null) {
				throw new IOException("Default stopwords resource stopwords.txt was not found on the classpath");
			}
			reader = new BufferedReader(new InputStreamReader(bundled));
		}
		try {
			return getStopWordsFromBufferedReader(reader);
		} finally {
			reader.close();
		}
	}

	/**
	 * Returns the configured value of <code>svdStoragePath</code>.
	 *
	 * @return the svdStoragePath
	 */
	public String getSvdStoragePath() {
		return this.svdStoragePath;
	}

	/**
	 * Gets a document from the repository by its external id. Returns a
	 * {@link TextDocument} object with basic information about the document,
	 * like title and url. In order to perform operations on the documents, it
	 * must be loaded, which means that a {@link Corpus} and its inner
	 * {@link SemanticSpace} will be created.
	 *
	 * @param externalId
	 *            the id of the document
	 * @return a {@link TextDocument} built from the Lucene id, title, url,
	 *         external id and content of the matching document
	 * @throws IOException
	 *             if no document matches the id, or the index cannot be read
	 */
	public TextDocument getTextDocument(String externalId) throws IOException {
		TopDocs hits = getLuceneDocumentHits(externalId);
		if (hits == null) {
			// Same failure mode as getLuceneDocument, instead of a
			// NullPointerException on the dereference below.
			throw new IOException("Document " + externalId + " not found!");
		}

		int luceneId = hits.scoreDocs[0].doc;
		Document doc = this.getIndexSearcher().doc(luceneId);
		return new TextDocument(luceneId, doc.get(getLuceneTitleField()), doc.get(getLuceneUrlField()),
				externalId, doc.get(getLuceneContentField()));
	}

	/**
	 * Decides whether a sentence looks like the title of a references
	 * section: it has fewer than four whitespace-separated words and one of
	 * them, ignoring case, is "resource(s)", "reference(s)", "bibliography"
	 * or "note(s)", optionally prefixed by digits.
	 *
	 * @param sentence
	 *            the sentence to evaluate
	 * @return if the sentence corresponds to the title of the references
	 *         section
	 */
	public boolean isBibliographyTitle(String sentence) {
		String[] words = sentence.split("\\s");
		if (words.length >= 4) {
			return false;
		}
		for (String word : words) {
			// Lower-case with a fixed locale: with the default locale the
			// match fails under e.g. Turkish, where 'I' becomes a dotless i.
			if (word.toLowerCase(Locale.ENGLISH).matches(
					"(\\d+)?\\s*((resources?)|(references?)|(bibliography)|(notes?))\\s*")) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Opens an index writer on the index path with the repository analyzer
	 * and no field length limit, and keeps it in
	 * <code>luceneIndexWriter</code>.
	 * <p>
	 * If the index is locked, the lock is forcibly removed and the writer is
	 * opened a second time. Any other failure has its stack trace printed and
	 * is rethrown.
	 *
	 * @throws LockObtainFailedException
	 *             if the index is still locked on the second attempt
	 * @throws CorruptIndexException
	 * @throws IOException
	 *             if the directory or the writer cannot be opened
	 */
	private void openIndexWriter() throws LockObtainFailedException, CorruptIndexException, IOException {
		Directory indexDirectory;
		try {
			indexDirectory = SimpleFSDirectory.open(new File(indexPath));
		} catch (IOException openFailure) {
			openFailure.printStackTrace();
			throw openFailure;
		}

		try {
			luceneIndexWriter = new IndexWriter(
					indexDirectory,
					this.analyzer,
					IndexWriter.MaxFieldLength.UNLIMITED);
		} catch (LockObtainFailedException staleLock) {
			logger.error("Index is locked! Trying to unlock.");
			IndexWriter.unlock(indexDirectory);
			luceneIndexWriter = new IndexWriter(
					indexDirectory,
					this.analyzer,
					IndexWriter.MaxFieldLength.UNLIMITED);
		} catch (IOException writerFailure) {
			// Also covers CorruptIndexException, which is an IOException.
			writerFailure.printStackTrace();
			throw writerFailure;
		}
	}

	/**
	 * Removes an annotator from the repository.
	 *
	 * @param annotator the annotator to remove
	 */
	public void removeAnnotator(Annotator annotator) {
		getAnnotators().remove(annotator);
	}

	/**
	 * Sets the character encoding that will be used in this repository. The
	 * current encoding is kept, and a message is logged, when the given name
	 * is <code>null</code>, is not a legal charset name, or is not supported
	 * by this JVM.
	 *
	 * @param encoding
	 *            the name of the charset to use
	 */
	public void setEncoding(String encoding) {
		boolean supported;
		try {
			supported = Charset.isSupported(encoding);
		} catch (IllegalArgumentException e) {
			// Charset.isSupported throws for null and for syntactically
			// illegal names instead of returning false.
			supported = false;
		}

		if (supported) {
			this.encoding = encoding;
		} else {
			logger.info("Invalid encoding or not supported");
		}
	}

	/**
	 * Stores a new value for <code>maxDocumentsToIndex</code>.
	 *
	 * @param maxDocumentsToIndex the maxDocumentsToIndex to set
	 */
	public void setMaxDocumentsToIndex(final int maxDocumentsToIndex) {
		this.maxDocumentsToIndex = maxDocumentsToIndex;
	}

	/**
	 * Looks up an annotation of a document through the database connection.
	 *
	 * @param documentId the id of the document
	 * @param fieldName the name of the annotation field
	 * @return the value returned by the database connection for that
	 *         document and field
	 */
	public String getAnnotations(String documentId,
			String fieldName) {
		String annotation = this.getDbConnection().getAnnotation(documentId, fieldName);
		return annotation;
	}

	/**
	 * Starts a {@link DocumentCleanup} for this repository on a new thread.
	 *
	 * @return the thread, already started, that runs the cleanup
	 */
	public Thread cleanup() {
		Thread cleanupThread = new Thread(new DocumentCleanup(this));
		cleanupThread.start();
		return cleanupThread;
	}

	/**
	 * Creates the cleanup timer and schedules a {@link TmlCleanupTask} that
	 * starts immediately and repeats every <code>tml.cleanup.interval</code>
	 * seconds. When that property is missing, not a number or not positive,
	 * an interval of 300 seconds is used.
	 *
	 * @throws IOException
	 */
	private void initializeCleanupTimer() throws IOException {

		cleanupTimer = new Timer();

		TmlCleanupTask task = new TmlCleanupTask(this);

		int seconds = 300;
		try {
			seconds = Integer.parseInt(Configuration.getTmlProperties()
					.getProperty("tml.cleanup.interval"));
		} catch (Exception e) {
			logger.error("Cleanup interval not set or invalid "
					+ Configuration.getTmlProperties().getProperty("tml.cleanup.interval"), e);
		}
		if (seconds <= 0) {
			// Timer.schedule rejects a non-positive period.
			logger.error("Cleanup interval must be positive but was " + seconds);
			seconds = 300;
		}
		logger.info("TML cleanup started every " + seconds + " seconds");
		// Multiply as long so large intervals do not overflow int.
		cleanupTimer.schedule(task, new Date(), seconds * 1000L);
	}

	/**
	 * Creates the annotator timer and schedules a {@link TmlAnnotatorTask}
	 * that starts immediately and repeats every
	 * <code>tml.annotator.interval</code> seconds. When that property is
	 * missing, not a number or not positive, an interval of 300 seconds is
	 * used. Nothing is scheduled when the repository has no annotators.
	 *
	 * @throws IOException
	 */
	private void initializeAnnotatorTimer() throws IOException {
		if (this.getAnnotators().size() == 0) {
			logger.info("There are no annotators, no need to run.");
			return;
		}

		annotatorTimer = new Timer();

		TmlAnnotatorTask task = new TmlAnnotatorTask(this);

		int seconds = 300;
		try {
			seconds = Integer.parseInt(Configuration.getTmlProperties()
					.getProperty("tml.annotator.interval"));
		} catch (Exception e) {
			logger.error("Annotator interval not set or invalid "
					+ Configuration.getTmlProperties().getProperty("tml.annotator.interval"), e);
		}
		if (seconds <= 0) {
			// Timer.schedule rejects a non-positive period.
			logger.error("Annotator interval must be positive but was " + seconds);
			seconds = 300;
		}
		logger.info("TML annotator started every " + seconds + " seconds");
		// Multiply as long so large intervals do not overflow int.
		annotatorTimer.schedule(task, new Date(), seconds * 1000L);
	}

	/**
	 * Creates the indexer timer and schedules a {@link TmlIndexerTask} that
	 * processes at most one file per run from the <code>upload</code> folder
	 * under the TML folder. The task starts immediately and repeats every
	 * <code>tml.indexer.interval</code> seconds. When that property is
	 * missing, not a number or not positive, an interval of 300 seconds is
	 * used.
	 *
	 * @throws IOException
	 */
	private void initializeIndexerTimer() throws IOException {
		indexerTimer = new Timer();

		TmlIndexerTask task = new TmlIndexerTask(this);
		task.setMaxFilesToProcess(1);
		task.setUploadFolder(Configuration.getTmlFolder() + "upload");

		int seconds = 300;
		try {
			seconds = Integer.parseInt(Configuration.getTmlProperties()
					.getProperty("tml.indexer.interval"));
		} catch (Exception e) {
			logger.error("Indexer interval not set or invalid "
					+ Configuration.getTmlProperties().getProperty("tml.indexer.interval"), e);
		}
		if (seconds <= 0) {
			// Timer.schedule rejects a non-positive period.
			logger.error("Indexer interval must be positive but was " + seconds);
			seconds = 300;
		}
		logger.info("TML indexer started every " + seconds + " seconds");
		// Multiply as long so large intervals do not overflow int.
		indexerTimer.schedule(task, new Date(), seconds * 1000L);
	}
}