/*
 * File:                TextPipelineExample.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 *
 * Copyright March 02, 2009, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 * license for use of this work by or on behalf of the U.S. Government. Export
 * of this program may require a license from the United States Government.
 * See CopyrightHistory.txt for complete details.
 *
 */

package examples;

import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.text.Textual;
import gov.sandia.cognition.text.convert.CommonDocumentTextualConverterFactory;
import gov.sandia.cognition.text.convert.DocumentSingleFieldConverter;
import gov.sandia.cognition.text.document.Document;
import gov.sandia.cognition.text.document.Field;
import gov.sandia.cognition.text.document.extractor.TextDocumentExtractor;
import gov.sandia.cognition.text.term.DefaultTermIndex;
import gov.sandia.cognition.text.term.Term;
import gov.sandia.cognition.text.term.TermOccurrence;
import gov.sandia.cognition.text.term.filter.LowerCaseTermFilter;
import gov.sandia.cognition.text.term.filter.TermFilter;
import gov.sandia.cognition.text.term.vector.BagOfWordsTransform;
import gov.sandia.cognition.text.term.vector.weighter.CommonTermWeighterFactory;
import gov.sandia.cognition.text.term.vector.weighter.CompositeLocalGlobalTermWeighter;
import gov.sandia.cognition.text.token.LetterNumberTokenizer;
import gov.sandia.cognition.text.token.Token;
import gov.sandia.cognition.text.token.Tokenizer;
import java.net.URI;
import java.net.URL;

/**
 * An example of a typical text processing pipeline. It loads a document,
 * pulls out the body field, tokenizes it, filters the resulting terms,
 * converts them to term counts (bag-of-words), and then applies a TF-IDF
 * weighting.
 *
 * @author Justin Basilico
 * @since 3.0
 */
public class TextPipelineExample
    extends Object
{

    /**
     * Runs the example.
     *
     * @param args Ignored. The example has no arguments.
     * @throws Exception If there is an error.
     */
    public static void main(
        final String[] args)
        throws Exception
    {
        // To start out with, we get the example file as a URI.
        final String exampleFileName = "examples/example.txt";
        final URL exampleFileURL =
            ClassLoader.getSystemResource(exampleFileName);
        final URI exampleFileURI = exampleFileURL.toURI();

        // Now we extract the document from the URI.
        final TextDocumentExtractor extractor = new TextDocumentExtractor();
        final Document document = extractor.extractDocument(exampleFileURI);

        // Here we can inspect all of the fields that were extracted.
        System.out.println("Fields:");
        for (Field field : document.getFields())
        {
            System.out.println("    Field: " + field.getName());
            System.out.println("    Content: " + field.getText());
        }

        // Next we convert the document to a textual representation by
        // extracting the body field.
        final DocumentSingleFieldConverter converter =
            CommonDocumentTextualConverterFactory.createBodyConverter();
        final Textual text = converter.convert(document);

        // Next we tokenize the text.
        final Tokenizer tokenizer = new LetterNumberTokenizer();
        final Iterable<Token> tokens = tokenizer.tokenize(text);

        // Here we can look at all of the tokens.
        System.out.println("Tokens:");
        for (Token token : tokens)
        {
            System.out.println("    " + token.getText());
        }

        // Next we filter the tokens. Note that tokens are an extension of
        // term occurrences.
        Iterable<? extends TermOccurrence> terms = tokens;
        final TermFilter[] filters = { new LowerCaseTermFilter() };
        for (TermFilter filter : filters)
        {
            terms = filter.filterTerms(terms);
        }

        System.out.println("Term Occurrences:");
        for (TermOccurrence term : terms)
        {
            System.out.println("    " + term.getTerm());
        }

        // Next we index the terms.
        final DefaultTermIndex termIndex = new DefaultTermIndex();
        termIndex.addAll(terms);

        // Next we transform the terms to a bag-of-words count vector.
        final BagOfWordsTransform bagOfWords =
            new BagOfWordsTransform(termIndex);
        final Vector counts = bagOfWords.convertToVector(terms);

        System.out.println("Term Counts:");
        for (int i = 0; i < termIndex.getTermCount(); i++)
        {
            final Term term = termIndex.getTerm(i);
            final double count = counts.getElement(i);
            System.out.println("    " + i + " (" + term + "): " + count);
        }

        // Next we do a TF-IDF transform.
        final CompositeLocalGlobalTermWeighter tfidf =
            CommonTermWeighterFactory.createTFIDFWeighter();
        tfidf.getGlobalWeighter().add(counts);

        // This just makes the global weights non-zero. See the note after
        // the class for why an extra empty document is needed.
        final Vector zeroDocument = counts.clone();
        zeroDocument.zero();
        tfidf.getGlobalWeighter().add(zeroDocument);

        final Vector weighted = tfidf.evaluate(counts);

        System.out.println("Term Weights:");
        for (int i = 0; i < termIndex.getTermCount(); i++)
        {
            final Term term = termIndex.getTerm(i);
            final double weight = weighted.getElement(i);
            System.out.println("    " + i + " (" + term + "): " + weight);
        }
    }

}
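/*
 * A note on the zero-document trick above, stated in terms of the standard
 * TF-IDF formulation (the library's exact logarithm base or smoothing may
 * differ; this is the textbook form, not a guarantee of the implementation):
 *
 *     weight(t, d) = tf(t, d) * log(N / df(t))
 *
 * Here tf(t, d) is the frequency of term t in document d, N is the number of
 * documents the global weighter has seen, and df(t) is the number of those
 * documents containing t. With only the single example document,
 * N == df(t) == 1 for every term, so every inverse-document-frequency factor
 * would be log(1) == 0 and all weights would vanish. Adding the all-zero
 * "document" raises N to 2 while leaving each df(t) at 1, giving each term a
 * non-zero weight proportional to tf * log(2).
 */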