/*
* File: NGramFilter.java
* Authors: Justin Basilico
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright April 30, 2009, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government. Export
* of this program may require a license from the United States Government.
* See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.text.term.filter;
import gov.sandia.cognition.text.term.DefaultTermOccurrence;
import gov.sandia.cognition.text.term.DefaultTermNGram;
import gov.sandia.cognition.text.term.Term;
import gov.sandia.cognition.text.term.TermOccurrence;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
/**
 * A term filter that creates n-grams of terms. A window of {@code size}
 * consecutive terms is slid over the input one occurrence at a time. The
 * window starts with only its last slot filled and ends with only its first
 * slot filled; slots that fall before the first or after the last input
 * occurrence hold {@code null} terms. An input of k occurrences (k &gt; 0)
 * therefore produces k + size - 1 n-grams, and an empty input produces none.
 *
 * @author Justin Basilico
 * @since 3.0
 */
public class NGramFilter
    extends AbstractCloneableSerializable
    implements TermFilter
{

    /** The default is a bigram. */
    public static final int DEFAULT_SIZE = 2;

    /** The size of the n-gram. Also known as the value of n. */
    protected int size;

    /**
     * Creates a new {@code NGramFilter} with the default size.
     */
    public NGramFilter()
    {
        this(DEFAULT_SIZE);
    }

    /**
     * Creates a new {@code NGramFilter} with the given size.
     *
     * @param   size
     *      The size of the n-grams to create. Must be greater than 1.
     * @throws  IllegalArgumentException
     *      If size is not greater than 1.
     */
    public NGramFilter(
        final int size)
    {
        super();

        this.setSize(size);
    }

    /**
     * Creates a copy of this filter by delegating to the superclass clone.
     *
     * @return
     *      The clone, cast to {@code NGramFilter}.
     */
    @Override
    public NGramFilter clone()
    {
        return (NGramFilter) super.clone();
    }

    /**
     * Converts a sequence of term occurrences into a sequence of n-gram
     * occurrences. Each resulting occurrence wraps a
     * {@code DefaultTermNGram} of exactly {@code size} terms, where positions
     * before the start or after the end of the input are {@code null}. The
     * start and length of each resulting occurrence span from the start of
     * the first real occurrence in the n-gram to the end of the last real
     * occurrence in it.
     *
     * @param   terms
     *      The term occurrences to turn into n-grams, in order. Elements are
     *      expected to be non-null.
     * @return
     *      A new collection of n-gram occurrences in input order. It is empty
     *      if the input is empty.
     */
    public Collection<TermOccurrence> filterTerms(
        final Iterable<? extends TermOccurrence> terms)
    {
        final LinkedList<TermOccurrence> result =
            new LinkedList<TermOccurrence>();

        final Iterator<? extends TermOccurrence> it = terms.iterator();
        if (!it.hasNext())
        {
            // No input means no n-grams; not even padded ones.
            return result;
        }

        final int n = this.size;

        // The sliding window of occurrences that make up the current n-gram.
        // A null slot is padding: either before the first occurrence or
        // after the last one. Deriving both the terms and the span from this
        // one window keeps them consistent, including for inputs that have
        // fewer occurrences than n.
        final TermOccurrence[] window = new TermOccurrence[n];

        // Once the input runs out, the last occurrence still has to move
        // through the remaining n - 1 positions of the window.
        int paddingLeft = n - 1;

        while (it.hasNext() || paddingLeft > 0)
        {
            final TermOccurrence occurrence;
            if (it.hasNext())
            {
                occurrence = it.next();
            }
            else
            {
                occurrence = null;
                paddingLeft--;
            }

            // Slide the window one slot to the left and append the new
            // occurrence (or padding) at the end.
            System.arraycopy(window, 1, window, 0, n - 1);
            window[n - 1] = occurrence;

            // Build a fresh term array each time so that each n-gram has a
            // unique array, and find the first and last real occurrences in
            // the window to compute the span.
            final Term[] currentTerms = new Term[n];
            TermOccurrence first = null;
            TermOccurrence last = null;
            for (int i = 0; i < n; i++)
            {
                final TermOccurrence slot = window[i];
                if (slot != null)
                {
                    currentTerms[i] = slot.getTerm();
                    if (first == null)
                    {
                        first = slot;
                    }
                    last = slot;
                }
            }

            // Create the n-gram from the terms.
            final DefaultTermNGram nGram = new DefaultTermNGram(currentTerms);

            // The span runs from the start of the first real occurrence to
            // the end of the last real occurrence.
            final int start = first.getStart();
            final int end = last.getStart() + last.getLength();
            final int length = end - start;

            // Add the term occurrence.
            result.add(new DefaultTermOccurrence(nGram, start, length));
        }

        return result;
    }

    /**
     * Gets the size of the n-gram created by the filter. Also known as the
     * value of n.
     *
     * @return
     *      The size of the n-gram created by the filter.
     */
    public int getSize()
    {
        return this.size;
    }

    /**
     * Sets the size of the n-gram created by the filter. Also known as the
     * value of n.
     *
     * @param   size
     *      The size of the n-gram created by the filter. Must be greater than
     *      1.
     * @throws  IllegalArgumentException
     *      If size is not greater than 1.
     */
    public void setSize(
        final int size)
    {
        if (size <= 1)
        {
            throw new IllegalArgumentException("size must be greater than 1");
        }

        this.size = size;
    }

}