/*
* Copyright 2011
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.ngrams.util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
 * Creates an n-gram iterable from a list of tokens. It does not detect any sentence boundaries.
 * Thus, one should make sure to only add lists that reflect a sentence or a phrase.
 * <p>
 * All n-grams are computed eagerly when the object is constructed. They are ordered by length
 * (shortest first) and, within one length, by their start position in the token sequence. Each
 * n-gram is an independent list, so changing one n-gram never affects another one.
 */
public class NGramStringListIterable
    implements Iterable<List<String>>
{
    /** All n-grams, computed once in the constructor. */
    final List<List<String>> nGramList;

    /**
     * @param tokens
     *            An iterable of tokens.
     * @param minN
     *            minimum n-gram length. Values smaller than 1 are treated as 1.
     * @param maxN
     *            maximum n-gram length.
     * @throws IllegalArgumentException
     *             if {@code minN} is greater than {@code maxN}.
     */
    public NGramStringListIterable(Iterable<String> tokens, int minN, int maxN)
    {
        this.nGramList = createNGramList(tokens, minN, maxN);
    }

    /**
     * @param tokens
     *            An array of tokens.
     * @param minN
     *            minimum n-gram length. Values smaller than 1 are treated as 1.
     * @param maxN
     *            maximum n-gram length.
     * @throws IllegalArgumentException
     *             if {@code minN} is greater than {@code maxN}.
     */
    public NGramStringListIterable(String[] tokens, int minN, int maxN)
    {
        this.nGramList = createNGramList(Arrays.asList(tokens), minN, maxN);
    }

    /**
     * @return an iterator over the precomputed n-grams.
     */
    @Override
    public Iterator<List<String>> iterator()
    {
        return nGramList.iterator();
    }

    /**
     * Builds all n-grams with a length between {@code minN} and {@code maxN} (inclusive).
     * Lengths that exceed the number of tokens yield no n-grams.
     *
     * @param tokens
     *            the tokens to build the n-grams from.
     * @param minN
     *            minimum n-gram length. Values smaller than 1 are treated as 1.
     * @param maxN
     *            maximum n-gram length.
     * @return the n-grams, ordered by length and then by start position.
     * @throws IllegalArgumentException
     *             if {@code minN} is greater than {@code maxN}.
     */
    private static List<List<String>> createNGramList(Iterable<String> tokens, int minN,
            int maxN)
    {
        if (minN > maxN) {
            throw new IllegalArgumentException("minN needs to be smaller or equal than maxN.");
        }

        // Copy the tokens into a random-access list so that windows can be addressed by index.
        List<String> tokenList = new ArrayList<String>();
        for (String t : tokens) {
            tokenList.add(t);
        }

        List<List<String>> nGrams = new ArrayList<List<String>>();

        // An n-gram needs at least one token. Starting below 1 would emit empty lists (n = 0)
        // or ask for a window with a negative length (n < 0).
        int firstN = Math.max(minN, 1);
        for (int k = firstN; k <= maxN; k++) {
            // Longer n-grams cannot fit either once k exceeds the number of tokens.
            if (tokenList.size() < k) {
                break;
            }
            nGrams.addAll(getNGrams(tokenList, k));
        }

        return nGrams;
    }

    /**
     * Collects every contiguous window of exactly {@code k} tokens.
     *
     * @param tokenList
     *            the tokens to slide the window over.
     * @param k
     *            the window (n-gram) length.
     * @return one list per window, in order of start position.
     */
    private static List<List<String>> getNGrams(List<String> tokenList, int k)
    {
        List<List<String>> nGrams = new ArrayList<List<String>>();

        int size = tokenList.size();
        for (int i = 0; i < (size + 1 - k); i++) {
            // subList() only returns a view on tokenList. Copy it so that the n-grams do not
            // share state with each other.
            nGrams.add(new ArrayList<String>(tokenList.subList(i, i + k)));
        }

        return nGrams;
    }
}