/*******************************************************************************
* Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package tml.corpus;
import java.io.IOException;
import java.sql.SQLException;
import java.util.Random;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.store.LockObtainFailedException;
import tml.corpus.CorpusParameters.DimensionalityReduction;
import tml.storage.Repository;
import tml.vectorspace.NoDocumentsInCorpusException;
import tml.vectorspace.NotEnoughTermsInCorpusException;
import tml.vectorspace.TermWeightingException;
import tml.vectorspace.TermWeighting.GlobalWeight;
import tml.vectorspace.TermWeighting.LocalWeight;
/**
* SimpleCorpus is a simple corpus which contains a set of documents from a
* folder; it considers each document a vector. It automatically loads the
* documents and creates a weighted matrix.
*
* You can change the parameters for the term loading by accessing the internal
* corpus. See more details in {@link Corpus}.
*
* @author Jorge Villalon
* @see Corpus
*
*/
public class SimpleCorpus {

    /** Repository opened on a randomly named sub-folder of the repository path. */
    private final Repository repository;

    /** Corpus that does the actual work; exposed through {@link #getCorpus()}. */
    private final Corpus internalCorpus;

    /** Base folder under which the {@code lucene/<random>} storage is created. */
    private final String pathToRepository;

    /** Folder whose documents were added to the repository. */
    private final String pathToDocuments;

    /**
     * Creates the corpus from the documents in a folder and loads it
     * immediately. Equivalent to
     * {@code new SimpleCorpus(pathToDocuments, pathToRepository, true)}.
     *
     * @param pathToDocuments
     *            the folder containing the documents to add
     * @param pathToRepository
     *            the base folder under which the repository storage is created
     * @throws IOException
     *             if raised while the repository is set up or the corpus is
     *             loaded
     * @throws LockObtainFailedException
     *             if raised while the repository is set up or the corpus is
     *             loaded
     * @throws CorruptIndexException
     *             if raised while the repository is set up or the corpus is
     *             loaded
     * @throws ParseException
     *             if raised while the repository is set up or the corpus is
     *             loaded
     * @throws NoDocumentsInCorpusException
     *             if raised while the corpus is loaded
     * @throws NotEnoughTermsInCorpusException
     *             if raised while the corpus is loaded
     * @throws TermWeightingException
     *             if raised while the corpus is loaded
     * @throws SQLException
     *             if raised while the repository is set up
     */
    public SimpleCorpus(String pathToDocuments, String pathToRepository)
            throws CorruptIndexException, LockObtainFailedException,
            IOException, ParseException, NotEnoughTermsInCorpusException,
            NoDocumentsInCorpusException, TermWeightingException, SQLException {
        this(pathToDocuments, pathToRepository, true);
    }

    /**
     * Creates the corpus from the documents in a folder, optionally loading
     * it right away.
     *
     * The repository is created in
     * {@code pathToRepository + "/lucene/" + N}, where {@code N} is a random
     * number between 999 and 9998 (inclusive).
     *
     * @param pathToDocuments
     *            the folder containing the documents to add
     * @param pathToRepository
     *            the base folder under which the repository storage is created
     * @param load
     *            {@code true} to call {@link #load()} before returning,
     *            {@code false} to leave loading to the caller
     * @throws IOException
     *             if raised while the repository is set up or the corpus is
     *             loaded
     * @throws LockObtainFailedException
     *             if raised while the repository is set up or the corpus is
     *             loaded
     * @throws CorruptIndexException
     *             if raised while the repository is set up or the corpus is
     *             loaded
     * @throws ParseException
     *             if raised while the repository is set up or the corpus is
     *             loaded
     * @throws NoDocumentsInCorpusException
     *             if raised while the corpus is loaded
     * @throws NotEnoughTermsInCorpusException
     *             if raised while the corpus is loaded
     * @throws TermWeightingException
     *             if raised while the corpus is loaded
     * @throws SQLException
     *             if raised while the repository is set up
     */
    public SimpleCorpus(String pathToDocuments, String pathToRepository,
            boolean load) throws CorruptIndexException,
            LockObtainFailedException, IOException, ParseException,
            NotEnoughTermsInCorpusException, NoDocumentsInCorpusException,
            TermWeightingException, SQLException {
        // Remember both paths so the getters report what was actually used.
        // (pathToDocuments used to be left unset, so its getter returned null.)
        this.pathToDocuments = pathToDocuments;
        this.pathToRepository = pathToRepository;

        // Pick a random sub-folder name in the range 999..9998.
        Random rand = new Random();
        int randNum = 999 + rand.nextInt(9000);
        String randPath = this.pathToRepository + "/lucene/" + randNum;

        // NOTE(review): cleanStorage presumably wipes anything already stored
        // at randPath (e.g. after a name collision) - confirm in Repository.
        Repository.cleanStorage(randPath);
        this.repository = new Repository(randPath);
        this.repository.addDocumentsInFolder(pathToDocuments);

        this.internalCorpus = new RepositoryCorpus();
        if (load) {
            this.load();
        }
    }

    /**
     * @return the internal corpus
     */
    public Corpus getCorpus() {
        return internalCorpus;
    }

    /**
     * @return the list of documents (passages) in the corpus
     */
    public String[] getDocuments() {
        return this.internalCorpus.getPassages();
    }

    /**
     * Returns the terms/documents matrix of the internal corpus' semantic
     * space as a two-dimensional array.
     *
     * NOTE(review): presumably only meaningful after the corpus has been
     * loaded - confirm against {@link Corpus}.
     *
     * @return a double array with the weighted term/doc matrix
     */
    public double[][] getMatrix() {
        return this.internalCorpus.getSemanticSpace().getTermsDocuments()
                .getArray();
    }

    /**
     * @return the folder from where the documents were processed
     */
    public String getPathToDocuments() {
        return pathToDocuments;
    }

    /**
     * Returns the base repository folder passed to the constructor. The
     * storage itself lives in a randomly named folder under
     * {@code <path>/lucene/}.
     *
     * @return the folder where the Lucene index is stored
     */
    public String getPathToRepository() {
        return pathToRepository;
    }

    /**
     * @return the list of terms in the corpus
     */
    public String[] getTerms() {
        return this.internalCorpus.getTerms();
    }

    /**
     * Loads the corpus (if not loaded automatically). Dimensionality
     * reduction is switched off before loading.
     *
     * @throws NotEnoughTermsInCorpusException
     *             if raised by the internal corpus while loading
     * @throws IOException
     *             if raised by the internal corpus while loading
     * @throws NoDocumentsInCorpusException
     *             if raised by the internal corpus while loading
     * @throws TermWeightingException
     *             if raised by the internal corpus while loading
     */
    public void load() throws NotEnoughTermsInCorpusException, IOException,
            NoDocumentsInCorpusException, TermWeightingException {
        this.internalCorpus.getParameters().setDimensionalityReduction(
                DimensionalityReduction.NO);
        this.internalCorpus.load(this.repository);
    }

    /**
     * Configures the corpus to use TF local weighting and Idf global
     * weighting, without dimensionality reduction, and then loads it.
     *
     * @throws NotEnoughTermsInCorpusException
     *             if raised by the internal corpus while loading
     * @throws IOException
     *             if raised by the internal corpus while loading
     * @throws NoDocumentsInCorpusException
     *             if raised by the internal corpus while loading
     * @throws TermWeightingException
     *             if raised by the internal corpus while loading
     */
    public void loadTfIdfNormalised() throws NotEnoughTermsInCorpusException,
            IOException, NoDocumentsInCorpusException, TermWeightingException {
        CorpusParameters parameters = this.internalCorpus.getParameters();
        parameters.setTermWeightLocal(LocalWeight.TF);
        parameters.setTermWeightGlobal(GlobalWeight.Idf);
        // load() sets this as well; kept here so the full configuration is
        // applied even if load() is overridden.
        parameters.setDimensionalityReduction(DimensionalityReduction.NO);
        load();
    }
}