TextDocument.java example

Explorer

tml-master
- tml
  - src
    - main
      - java
        SemanticSpace.java
        TmlCommandLine.java
        tml
        Configuration.java
        annotators
        AbstractAnnotator.java
        Annotator.java
        AnnotatorManager.java
        PennTreeAnnotator.java
        corpus
        Corpus.java
        CorpusParameters.java
        Dictionary.java
        ParagraphCorpus.java
        RepositoryCorpus.java
        SearchResultsCorpus.java
        SentenceCorpus.java
        SimpleCorpus.java
        Term.java
        TextDocument.java
        TextPassage.java
        sql
        DbConnection.java
        storage
        DocumentAnnotator.java
        DocumentCleanup.java
        Repository.java
        RepositoryEvent.java
        RepositoryListener.java
        TmlAnnotatorTask.java
        TmlCleanupTask.java
        TmlIndexerTask.java
        importers
        AbstractImporter.java
        HtmlImporter.java
        Importer.java
        PdfImporter.java
        TextImporter.java
        test
        AbstractTmlIndexingTest.java
        utils
        DBUtils.java
        DistanceLib.java
        Highlighting.java
        JDBCUtils.java
        LanczosSVDLIBCUtils.java
        LanczosSVDPACKCUtils.java
        LuceneUtils.java
        MatrixUtils.java
        RegexUtils.java
        StanfordUtils.java
        Stats.java
        WordNetUtils.java
        vectorspace
        EmptyTextPassageException.java
        NoDocumentsInCorpusException.java
        NotEnoughTermsInCorpusException.java
        SVD.java
        SemanticSpace.java
        TermWeighting.java
        TermWeightingException.java
        factorisation
        MatrixFactorisation.java
        MultiDimensionalScalingNR.java
        NonnegativeMatrixFactorisationED.java
        NonnegativeMatrixFactorisationKL.java
        PrincipalCoordinateAnalysis.java
        ProbabilisticLatentSemanticAnalysis.java
        SingularValueDecomposition.java
        SpaceDecomposition.java
        operations
        AbstractOperation.java
        ClassDiscovery.java
        CompoundNounsSummarized.java
        ConceptExtraction.java
        FactorAnalysisPlot.java
        LastPassage.java
        LexiconAnalysis.java
        Operation.java
        OperationEvent.java
        OperationListener.java
        ParagraphCoherenceIndex.java
        PassageDistances.java
        PassageExtractionSummarization.java
        PassagesSimilarity.java
        RapidAutomaticKeywordExtraction.java
        Readability.java
        RelationshipExtraction.java
        Summary.java
        TagClouds.java
        TermExtractionSummarization.java
        results
        AbstractResult.java
        FactorAnalysisPlotResult.java
        LastPassageResult.java
        LexiconAnalysisResult.java
        NullResult.java
        ParagraphCoherenceIndexResult.java
        PassageClusteringLingoResult.java
        PassageDistancesResult.java
        PassageExtractionSummarizationResult.java
        PassageSimilarityResult.java
        RapidAutomaticKeywordExtractionResult.java
        ReadabilityResult.java
        RelationshipExtractionResult.java
        Summary.java
        SummaryResult.java
        TagCloudsResult.java
        TermRankedResult.java
        TermsExtractionSummarizationResult.java
        summarization
        AbstractSummarizationOperation.java
        LatentSemanticAnalysisSummarization.java
        SummarizationOperation.java
        VectorLengthSummarization.java
        visualizations
        AbstractVisualization.java
        TagClouds.java
        Visualization.java
    - test
      - java
        tml
        test
        DbConnectionTest.java
        FactorAnalysisPlotTest.java
        IndexingDocumentsTest.java
        IndexingHtmlTest.java
        IndexingInvalidDocumentsTest.java
        IndexingPlainTextTest.java
        LanczosTest.java
        LuceneSearchTest.java
        NonNegativeMatrixFactorizationTest.java
        RapidAutomaticKeywordExtractionTest.java
        ReadabilityTest.java
        SimpleCorpusTest.java
        StemmingTest.java
        TagCloudsTest.java
        ValidateBerryDumaisTest.java
        ValidateDistancesTest.java
        ValidateHandbookOfLSATest.java
        ValidateIntroToLSATest.java
        ValidateSameDistancesAllDimensions.java

/*******************************************************************************
 *  Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License. 
 *  You may obtain a copy of the License at 
 *  
 *  	http://www.apache.org/licenses/LICENSE-2.0 
 *  	
 *  Unless required by applicable law or agreed to in writing, software 
 *  distributed under the License is distributed on an "AS IS" BASIS, 
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 *  See the License for the specific language governing permissions and 
 *  limitations under the License.
 *******************************************************************************/
package tml.corpus;

import java.io.IOException;

import org.apache.lucene.queryParser.ParseException;

import tml.corpus.CorpusParameters.DimensionalityReduction;
import tml.corpus.CorpusParameters.TermSelection;
import tml.storage.Repository;
import tml.vectorspace.NoDocumentsInCorpusException;
import tml.vectorspace.NotEnoughTermsInCorpusException;
import tml.vectorspace.TermWeightingException;
import tml.vectorspace.TermWeighting.GlobalWeight;
import tml.vectorspace.TermWeighting.LocalWeight;



/**
 * <p>
 * The TextDocument class represents a whole document, which comprises a
 * content, a title and a url. Each document is identified by an id, known as
 * the externalId. It also has an internal id, the Lucene Id, which identifies
 * the document within the underlying Lucene index.
 * </p>
 * <p>
 * A TextDocument contains two corpora, a sentence based {@link Corpus} and a
 * paragraph based {@link Corpus}. The TextDocument is responsible for loading
 * both and assigning the necessary parameters for their creation. This means
 * that the construction of the {@link Corpus} and the {@link SemanticSpace} are
 * defined on a per document basis.
 * </p>
 * <p>
 * The TextDocument contains a duplicate of its content, which can cause
 * scalability problems with long documents (more than 2000 terms, approx.
 * 10000 words).
 * </p>
 * <p>
 * The most basic way to use a TextDocument is to perform operations to its
 * corpora. Operations can be calculating semantic distances between sentences
 * or extracting the most important paragraphs (based on variance) to give some
 * examples.
 * </p>
 * <p>
 * The following example shows how to obtain a {@link TextDocument} from a
 * {@link Repository} and then how to extract the key sentences.
 * </p>
 * 
 * <pre>
 * Repository repository = new Repository("path/to/repository");
 * TextDocument document = repository.getTextDocument("foo");
 * if (document != null) {
 * 	System.out.println("Document " + document.getTitle() + " found");
 * }
 * </pre>
 * <p>
 * Now we are going to set the parameters to load the document's corpora and
 * load them.
 * </p>
 * 
 * <pre>
 * CorpusParameters parameters = document.getParameters();
 * parameters.setTermSelectionCriterion(TermSelection.DF);
 * parameters.setTermSelectionThreshold(1);
 * parameters.setTermWeightLocal(LocalWeight.LOGTF);
 * parameters.setTermWeightGlobal(GlobalWeight.Entropy);
 * parameters.setDimensionalityReduction(DimensionalityReduction.PCT);
 * parameters.setDimensionalityReductionThreshold(50);
 * document.load(repository);
 * </pre>
 * <p>
 * Finally we can perform an operation and show the results.
 * </p>
 * 
 * <pre>
 * KeyTextPassages operation = new KeyTextPassages();
 * operation.setCorpus(document.getSentenceCorpus());
 * operation.start();
 * 
 * for (KeyTextPassagesResult result : operation.getResults()) {
 * 	System.out.println("Sentence id: " + result.getTextPassageId()
 * 			+ " from eigenvector:" + result.getEigenVectorIndex()
 * 			+ " with load:" + result.getLoad() + " content:"
 * 			+ result.getTextPassageContent());
 * }
 * </pre>
 * 
 * @see Repository AbstractOperation Corpus
 * @author Jorge Villalon
 * 
 */
public class TextDocument {

	/** The Lucene id of the document; fixed at construction. */
	private final int luceneId;
	/** The title of the document; fixed at construction, may be null. */
	private final String title;
	/** The url of the document; fixed at construction. */
	private final String url;
	/** The external id of the document; fixed at construction. */
	private final String externalId;
	/** The content of the document; fixed at construction. */
	private final String content;

	/** The sentence corpus of the document, null until {@link #load(Repository)} is called. */
	private SentenceCorpus sentenceCorpus = null;
	/** The paragraph corpus of the document, null until {@link #load(Repository)} is called. */
	private ParagraphCorpus paragraphCorpus = null;

	/** The parameters handed to both corpora when they are loaded. */
	private CorpusParameters parameters = null;

	/**
	 * Constructor of {@link TextDocument}. It creates a new instance of a
	 * TextDocument with a default set of {@link CorpusParameters}. It should
	 * be used only by the {@link Repository}.
	 * 
	 * @param luceneId
	 *            the id within the Lucene index
	 * @param title
	 *            the title of the document
	 * @param url
	 *            the url of the document
	 * @param externalId
	 *            the external id
	 * @param content
	 *            the content of the document
	 */
	public TextDocument(int luceneId, String title, String url,
			String externalId, String content) {
		this.luceneId = luceneId;
		this.title = title;
		this.url = url;
		this.externalId = externalId;
		this.content = content;
		this.parameters = createDefaultParameters();
	}

	/**
	 * Builds the parameters every new document starts with: term selection by
	 * {@link TermSelection#DF} with threshold 1, dimensionality reduction by
	 * {@link DimensionalityReduction#PCT} with threshold 25, and
	 * {@link LocalWeight#LOGTF} / {@link GlobalWeight#Entropy} term weights.
	 * 
	 * @return a freshly created, independently modifiable parameters object
	 */
	private static CorpusParameters createDefaultParameters() {
		CorpusParameters defaults = new CorpusParameters();
		defaults.setTermSelectionCriterion(TermSelection.DF);
		defaults.setTermSelectionThreshold(1);
		defaults.setDimensionalityReduction(DimensionalityReduction.PCT);
		defaults.setDimensionalityReductionThreshold(25);
		defaults.setTermWeightLocal(LocalWeight.LOGTF);
		defaults.setTermWeightGlobal(GlobalWeight.Entropy);
		return defaults;
	}

	/**
	 * Gets the content of the document
	 * 
	 * @return the content
	 */
	public String getContent() {
		return content;
	}

	/**
	 * Gets the external id used when the document was inserted.
	 * 
	 * @return the external id
	 */
	public String getExternalId() {
		return externalId;
	}

	/**
	 * Gets the Lucene internal id of the document
	 * 
	 * @return the Lucene id
	 */
	public int getLuceneId() {
		return luceneId;
	}

	/**
	 * Gets the {@link ParagraphCorpus} created with the paragraphs of the
	 * {@link TextDocument}
	 * 
	 * @return A {@link ParagraphCorpus} object, or null if
	 *         {@link #load(Repository)} has not created it yet.
	 */
	public ParagraphCorpus getParagraphCorpus() {
		return paragraphCorpus;
	}

	/**
	 * Gets the {@link SentenceCorpus} created with the sentences of the
	 * {@link TextDocument}
	 * 
	 * @return A {@link SentenceCorpus} object, or null if
	 *         {@link #load(Repository)} has not been called yet.
	 */
	public SentenceCorpus getSentenceCorpus() {
		return sentenceCorpus;
	}

	/**
	 * Gets the title of the document
	 * 
	 * @return the title
	 */
	public String getTitle() {
		return title;
	}

	/**
	 * Gets the url of the document
	 * 
	 * @return the url
	 */
	public String getUrl() {
		return url;
	}

	/**
	 * Loads the corpora for the {@link TextDocument} with all the parameters
	 * that the document has set. To load the term frequency vectors, a pointer
	 * to the repository is necessary.
	 * <p>
	 * The sentence corpus is created and loaded first, then the paragraph
	 * corpus. Each corpus is stored in its field before it is loaded, so if
	 * loading throws, the corresponding getter returns the corpus instance
	 * whose load did not complete. Both corpora receive the same
	 * {@link CorpusParameters} reference.
	 * </p>
	 * 
	 * @param repository
	 *            the repository passed to each corpus' load method
	 * @throws Exception
	 *             may be propagated from loading either corpus
	 * @throws IOException
	 *             may be propagated from loading either corpus
	 * @throws ParseException
	 *             may be propagated from loading either corpus
	 * @throws NotEnoughTermsInCorpusException
	 *             may be propagated from loading either corpus
	 * @throws NoDocumentsInCorpusException
	 *             may be propagated from loading either corpus
	 * @throws TermWeightingException
	 *             may be propagated from loading either corpus
	 */
	public void load(Repository repository) throws Exception, IOException, ParseException,
			NotEnoughTermsInCorpusException, NoDocumentsInCorpusException,
			TermWeightingException {
		this.sentenceCorpus = new SentenceCorpus(this);
		this.sentenceCorpus.setName("Sentences from " + title);
		this.sentenceCorpus.setParameters(parameters);
		this.sentenceCorpus.load(repository);

		this.paragraphCorpus = new ParagraphCorpus(this);
		this.paragraphCorpus.setName("Paragraphs from " + title);
		this.paragraphCorpus.setParameters(parameters);
		this.paragraphCorpus.load(repository);
	}

	/**
	 * Gets the parameters that will be handed to both corpora on the next
	 * call to {@link #load(Repository)}. The returned object is the
	 * document's own instance, not a copy.
	 * 
	 * @return the parameters
	 */
	public CorpusParameters getParameters() {
		return parameters;
	}

	/**
	 * Replaces the parameters that will be handed to both corpora on the next
	 * call to {@link #load(Repository)}. Corpora that were already loaded are
	 * not touched by this method.
	 * 
	 * @param parameters the parameters to set
	 */
	public void setParameters(CorpusParameters parameters) {
		this.parameters = parameters;
	}

	/**
	 * The default view of a TextDocument is its title
	 * 
	 * @return the title exactly as given to the constructor (null if the
	 *         title was null)
	 */
	@Override
	public String toString() {
		return this.title;
	}
}