/*******************************************************************************
* Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package tml.corpus;
import java.io.IOException;
import org.apache.lucene.queryParser.ParseException;
import tml.corpus.CorpusParameters.DimensionalityReduction;
import tml.corpus.CorpusParameters.TermSelection;
import tml.storage.Repository;
import tml.vectorspace.NoDocumentsInCorpusException;
import tml.vectorspace.NotEnoughTermsInCorpusException;
import tml.vectorspace.TermWeightingException;
import tml.vectorspace.TermWeighting.GlobalWeight;
import tml.vectorspace.TermWeighting.LocalWeight;
/**
* <p>
* The TextDocument class represents a whole document, which comprises a
* content, a title and a url. Each document is identified by an id, known as
* the externalId. It also has an internal id, the Lucene Id, which identifies
* the document within the underlying Lucene index.
* </p>
* <p>
* A TextDocument contains two corpora, a sentence based {@link Corpus} and a
* paragraph based {@link Corpus}. The TextDocument is responsible for loading
* both and assigning the necessary parameters for their creation. This means
* that the construction of the {@link Corpus} and the {@link SemanticSpace} are
* defined on a per document basis.
* </p>
* <p>
* The TextDocument contains a duplicate of its content, which can cause
* scalability problems with long documents (more than 2000 terms, approx. 10000
* words).
* </p>
* <p>
* The most basic way to use a TextDocument is to perform operations to its
* corpora. Operations can be calculating semantic distances between sentences
* or extracting the most important paragraphs (based on variance) to give some
* examples.
* </p>
* <p>
* The following example shows how to obtain a {@link TextDocument} from a
* {@link Repository} and then how to extract the key sentences.
* </p>
*
* <pre>
* Repository repository = new Repository("path/to/repository");
* TextDocument document = repository.getTextDocument("foo");
* if (document != null) {
* System.out.println("Document " + document.getTitle() + " found");
* }
* </pre>
* <p>
* Now we are going to set the parameters to load the document's corpora and
* load them.
* </p>
*
* <pre>
* CorpusParameters parameters = document.getParameters();
* parameters.setTermSelectionCriterion(TermSelection.DF);
* parameters.setTermSelectionThreshold(1);
* parameters.setTermWeightLocal(LocalWeight.LOGTF);
* parameters.setTermWeightGlobal(GlobalWeight.Entropy);
* parameters.setDimensionalityReduction(DimensionalityReduction.PCT);
* parameters.setDimensionalityReductionThreshold(50);
* document.load(repository);
* </pre>
* <p>
* Finally we can perform an operation and show the results.
* </p>
*
* <pre>
* KeyTextPassages operation = new KeyTextPassages();
* operation.setCorpus(document.getSentenceCorpus());
* operation.start();
*
* for (KeyTextPassagesResult result : operation.getResults()) {
* System.out.println("Sentence id: " + result.getTextPassageId()
* + " from eigenvector:" + result.getEigenVectorIndex()
* + " with load:" + result.getLoad() + " content:"
* + result.getTextPassageContent());
* }
* </pre>
*
* @see Repository AbstractOperation Corpus
* @author Jorge Villalon
*
*/
public class TextDocument {

    /** Identifier of this document inside the Lucene index. */
    private int luceneId;

    /** Title given to the document. */
    private String title;

    /** Url given to the document. */
    private String url;

    /** Identifier supplied from outside (the external id). */
    private String externalId;

    /** Text content of the document. */
    private String content;

    /** Corpus built from the document's sentences; null until {@link #load(Repository)} creates it. */
    private SentenceCorpus sentenceCorpus;

    /** Corpus built from the document's paragraphs; null until {@link #load(Repository)} creates it. */
    private ParagraphCorpus paragraphCorpus;

    /** Parameters handed to both corpora when they are loaded. */
    private CorpusParameters parameters;

    /**
     * Creates a {@link TextDocument} from its identifying data and installs a
     * default set of {@link CorpusParameters}. According to the original
     * documentation, only the {@link Repository} is expected to call this.
     *
     * @param luceneId
     *            the id within the Lucene index
     * @param title
     *            the title of the document
     * @param url
     *            the url of the document
     * @param externalId
     *            the external id
     * @param content
     *            the content of the document
     */
    public TextDocument(int luceneId, String title, String url,
            String externalId, String content) {
        this.luceneId = luceneId;
        this.title = title;
        this.url = url;
        this.externalId = externalId;
        this.content = content;
        this.parameters = defaultParameters();
    }

    /**
     * Builds the parameters every new document starts with: DF term selection
     * with threshold 1, PCT dimensionality reduction with threshold 25, LOGTF
     * local weight and Entropy global weight.
     *
     * @return a freshly created and configured {@link CorpusParameters}
     */
    private static CorpusParameters defaultParameters() {
        CorpusParameters defaults = new CorpusParameters();
        defaults.setTermSelectionCriterion(TermSelection.DF);
        defaults.setTermSelectionThreshold(1);
        defaults.setDimensionalityReduction(DimensionalityReduction.PCT);
        defaults.setDimensionalityReductionThreshold(25);
        defaults.setTermWeightLocal(LocalWeight.LOGTF);
        defaults.setTermWeightGlobal(GlobalWeight.Entropy);
        return defaults;
    }

    /**
     * Returns the document's content.
     *
     * @return the content
     */
    public String getContent() {
        return this.content;
    }

    /**
     * Returns the external id of the document.
     *
     * @return the external id
     */
    public String getExternalId() {
        return this.externalId;
    }

    /**
     * Returns the id of the document within the Lucene index.
     *
     * @return the Lucene id
     */
    public int getLuceneId() {
        return this.luceneId;
    }

    /**
     * Returns the {@link ParagraphCorpus} of this document.
     *
     * @return the paragraph corpus, or null if {@link #load(Repository)} has
     *         not created it yet
     */
    public ParagraphCorpus getParagraphCorpus() {
        return this.paragraphCorpus;
    }

    /**
     * Returns the {@link SentenceCorpus} of this document.
     *
     * @return the sentence corpus, or null if {@link #load(Repository)} has
     *         not created it yet
     */
    public SentenceCorpus getSentenceCorpus() {
        return this.sentenceCorpus;
    }

    /**
     * Returns the document's title.
     *
     * @return the title
     */
    public String getTitle() {
        return this.title;
    }

    /**
     * Returns the document's url.
     *
     * @return the url
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * Creates and loads the two corpora of this document, first the sentence
     * corpus and then the paragraph corpus. Each corpus is named after the
     * document title, receives the document's current
     * {@link CorpusParameters} and is then loaded from the given repository.
     * Each corpus field is assigned as soon as its corpus is constructed, so
     * it stays set even if a later step throws.
     *
     * @param repository
     *            the repository passed to each corpus' load call
     * @throws Exception
     *             propagated from building or loading either corpus
     * @throws IOException
     *             propagated from building or loading either corpus
     * @throws ParseException
     *             propagated from building or loading either corpus
     * @throws NotEnoughTermsInCorpusException
     *             propagated from building or loading either corpus
     * @throws NoDocumentsInCorpusException
     *             propagated from building or loading either corpus
     * @throws TermWeightingException
     *             propagated from building or loading either corpus
     */
    public void load(Repository repository) throws Exception, IOException, ParseException,
            NotEnoughTermsInCorpusException, NoDocumentsInCorpusException,
            TermWeightingException {
        SentenceCorpus sentences = new SentenceCorpus(this);
        this.sentenceCorpus = sentences;
        sentences.setName("Sentences from " + this.title);
        sentences.setParameters(this.parameters);
        sentences.load(repository);

        ParagraphCorpus paragraphs = new ParagraphCorpus(this);
        this.paragraphCorpus = paragraphs;
        paragraphs.setName("Paragraphs from " + this.title);
        paragraphs.setParameters(this.parameters);
        paragraphs.load(repository);
    }

    /**
     * Returns the parameters that {@link #load(Repository)} hands to both
     * corpora.
     *
     * @return the parameters
     */
    public CorpusParameters getParameters() {
        return this.parameters;
    }

    /**
     * Replaces the parameters that {@link #load(Repository)} hands to both
     * corpora.
     *
     * @param parameters
     *            the parameters to set
     */
    public void setParameters(CorpusParameters parameters) {
        this.parameters = parameters;
    }

    /**
     * A TextDocument is rendered as its title.
     *
     * @return the title of the document
     */
    @Override
    public String toString() {
        return this.title;
    }
}