/*
* File: TextPipelineExample.java
* Authors: Justin Basilico
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright March 02, 2009, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government. Export
* of this program may require a license from the United States Government.
* See CopyrightHistory.txt for complete details.
*
*/
package examples;

import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.text.Textual;
import gov.sandia.cognition.text.convert.CommonDocumentTextualConverterFactory;
import gov.sandia.cognition.text.convert.DocumentSingleFieldConverter;
import gov.sandia.cognition.text.document.Document;
import gov.sandia.cognition.text.document.Field;
import gov.sandia.cognition.text.document.extractor.TextDocumentExtractor;
import gov.sandia.cognition.text.term.DefaultTermIndex;
import gov.sandia.cognition.text.term.Term;
import gov.sandia.cognition.text.term.TermOccurrence;
import gov.sandia.cognition.text.term.filter.LowerCaseTermFilter;
import gov.sandia.cognition.text.term.filter.TermFilter;
import gov.sandia.cognition.text.term.vector.BagOfWordsTransform;
import gov.sandia.cognition.text.term.vector.weighter.CommonTermWeighterFactory;
import gov.sandia.cognition.text.term.vector.weighter.CompositeLocalGlobalTermWeighter;
import gov.sandia.cognition.text.token.LetterNumberTokenizer;
import gov.sandia.cognition.text.token.Token;
import gov.sandia.cognition.text.token.Tokenizer;

/**
 * An example of a typical text processing pipeline. It loads a document,
 * extracts the body field, tokenizes it, filters the terms, converts them
 * to counts (bag-of-words), and finally applies a TF-IDF weighting.
 *
 * @author Justin Basilico
 * @since 3.0
 */
public class TextPipelineExample
extends Object
{
/**
* Runs the example.
*
* @param args Ignored. The example has no arguments.
* @throws Exception If there is an error.
*/
public static void main(
final String[] args)
throws Exception
{
// To start, we locate the example file on the classpath as a URI.
final String exampleFileName = "examples/example.txt";
final URL exampleFileURL = ClassLoader.getSystemResource(exampleFileName);
if (exampleFileURL == null)
{
throw new IllegalArgumentException("Missing resource: " + exampleFileName);
}
final URI exampleFileURI = exampleFileURL.toURI();
// Now we extract the document from the URI.
final TextDocumentExtractor extractor = new TextDocumentExtractor();
final Document document = extractor.extractDocument(exampleFileURI);
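// For a plain-text file like this one, the extractor typically yields a
// body field holding the file contents, plus metadata fields such as the
// title (an assumption about this extractor, not verified here).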
// Here we can inspect all of the fields that were extracted.
System.out.println("Fields:");
for (Field field : document.getFields())
{
System.out.println(" Field: " + field.getName());
System.out.println(" Content: " + field.getText());
}
// Next we convert the document to a textual representation by
// extracting the text field.
final DocumentSingleFieldConverter converter =
CommonDocumentTextualConverterFactory.createBodyConverter();
final Textual text = converter.convert(document);
// Next we tokenize the text into runs of letters and digits.
final Tokenizer tokenizer = new LetterNumberTokenizer();
final Iterable<Token> tokens = tokenizer.tokenize(text);
// Here we can look at all the tokens.
System.out.println("Tokens:");
for (Token token : tokens)
{
System.out.println(" " + token.getText());
}
// Next we filter the tokens. Note that tokens are an extension of term
// occurrences.
Iterable<? extends TermOccurrence> terms = tokens;
final TermFilter[] filters =
{
new LowerCaseTermFilter()
};
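// More filters could be chained here, for example a stop-word filter. A
// sketch, left commented out because the exact stop-list class and package
// names are assumptions, not verified against this release:
//
// final DefaultStopList stopList = new DefaultStopList();
// stopList.add("the");
// ... and the filters array would also include new StopListFilter(stopList).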
for (TermFilter filter : filters)
{
terms = filter.filterTerms(terms);
}
System.out.println("Term Occurences:");
for (TermOccurrence term : terms)
{
System.out.println(" " + term.getTerm());
}
// Next we index the terms.
final DefaultTermIndex termIndex = new DefaultTermIndex();
termIndex.addAll(terms);
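// The index assigns each distinct term a stable integer id, which defines
// the dimensions of the vectors built below.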
// Next we transform the terms to a bag of words.
final BagOfWordsTransform bagOfWords = new BagOfWordsTransform(termIndex);
final Vector counts = bagOfWords.convertToVector(terms);
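// The resulting vector has one dimension per term in the index, holding
// that term's number of occurrences in the document.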
System.out.println("Term Counts:");
for (int i = 0; i < termIndex.getTermCount(); i++)
{
final Term term = termIndex.getTerm(i);
final double count = counts.getElement(i);
System.out.println(" " + i + " (" + term + "): " + count);
}
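// The same transform can vectorize any other term stream against this
// fixed index; terms not in the index contribute nothing. A sketch,
// assuming a second tokenized-and-filtered stream named otherTerms
// (hypothetical):
//
// final Vector otherCounts = bagOfWords.convertToVector(otherTerms);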
// Next we apply a TF-IDF weighting: a local term-frequency weight combined
// with a global inverse-document-frequency (IDF) weight.
final CompositeLocalGlobalTermWeighter tfidf = CommonTermWeighterFactory.createTFIDFWeighter();
tfidf.getGlobalWeighter().add(counts);
// The global (IDF) weighter needs more than one document: with a single
// document, every term's document frequency equals the corpus size, so all
// IDF weights would be zero. Adding an empty second document makes the
// global weights non-zero.
final Vector zeroDocument = counts.clone();
zeroDocument.zero();
tfidf.getGlobalWeighter().add(zeroDocument);
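// Concretely, each term now appears in 1 of the 2 known documents, so its
// inverse document frequency is log(2 / 1) ~= 0.693, assuming a natural-log
// IDF; with only the single document it would be log(1) = 0.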
final Vector weighted = tfidf.evaluate(counts);
System.out.println("Term Weights:");
for (int i = 0; i < termIndex.getTermCount(); i++)
{
final Term term = termIndex.getTerm(i);
final double weight = weighted.getElement(i);
System.out.println(" " + i + " (" + term + "): " + weight);
}
}
}