/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.ngrams;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.text.AnnotationFS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.NGram;
/**
 * An {@link Iterable} over the {@link NGram} annotations derived from a sequence of tokens.
 * <p>
 * All n-grams are computed eagerly when the instance is created. For a maximum size {@code n},
 * the result holds every 1-gram, then every 2-gram, and so on up to size {@code n} (or up to the
 * number of tokens, if that is smaller). A trailing token whose covered text is exactly
 * {@code "."}, {@code "!"} or {@code "?"} is dropped before the n-grams are built.
 *
 * @param <T>
 *            the annotation type of the tokens
 */
public class NGramIterable<T extends AnnotationFS>
    implements Iterable<NGram>
{
    // The n-grams computed in the constructor, ordered by increasing size.
    List<NGram> nGramList;

    private NGramIterable(Iterable<T> tokens, int n)
    {
        this.nGramList = createNGramList(tokens, n);
    }

    /**
     * Builds an iterable over all n-grams of size 1 to {@code n} for the given tokens.
     *
     * @param tokens
     *            the tokens to combine, in order
     * @param n
     *            the maximum n-gram size
     * @return a new iterable over the resulting n-grams
     */
    public static <T extends AnnotationFS> NGramIterable<T> create(Iterable<T> tokens, int n)
    {
        return new NGramIterable<T>(tokens, n);
    }

    @Override
    public Iterator<NGram> iterator()
    {
        return nGramList.iterator();
    }

    /**
     * Collects the n-grams of every size from 1 up to {@code n}, smallest size first.
     */
    private List<NGram> createNGramList(Iterable<T> tokens, int n)
    {
        // Copy the tokens into a random-access list.
        List<T> buffered = new ArrayList<T>();
        Iterator<T> source = tokens.iterator();
        while (source.hasNext()) {
            buffered.add(source.next());
        }

        // Drop the final token if it is a single sentence-ending punctuation mark.
        if (!buffered.isEmpty()) {
            int lastIndex = buffered.size() - 1;
            if (isSentenceEndMark(buffered.get(lastIndex).getCoveredText())) {
                buffered.remove(lastIndex);
            }
        }

        // No n-gram can be longer than the number of remaining tokens.
        int largestSize = Math.min(n, buffered.size());

        List<NGram> collected = new ArrayList<NGram>();
        for (int size = 1; size <= largestSize; size++) {
            collected.addAll(getNGrams(buffered, size));
        }
        return collected;
    }

    /**
     * Tells whether the text is exactly one of the characters '.', '!' or '?'.
     */
    private boolean isSentenceEndMark(String text)
    {
        if (text.length() != 1) {
            return false;
        }
        char mark = text.charAt(0);
        return mark == '.' || mark == '!' || mark == '?';
    }

    /**
     * Creates one n-gram for every window of {@code k} consecutive tokens. Each n-gram spans from
     * the begin offset of the window's first token to the end offset of its last token.
     *
     * @throws IllegalStateException
     *             if the JCas cannot be obtained from a token's CAS
     */
    private List<NGram> getNGrams(List<T> tokenList, int k)
    {
        List<NGram> windows = new ArrayList<NGram>();
        int lastStart = tokenList.size() - k;
        try {
            for (int first = 0; first <= lastStart; first++) {
                int last = first + k - 1;
                T head = tokenList.get(first);
                T tail = tokenList.get(last);

                NGram ngram = new NGram(head.getCAS().getJCas(), head.getBegin(), tail.getEnd());
                ngram.setText(getTokenText(tokenList, first, last));
                windows.add(ngram);
            }
        }
        catch (CASException e) {
            throw new IllegalStateException(e);
        }
        return windows;
    }

    /**
     * Joins the covered texts of the tokens at positions {@code start} to {@code end} (both
     * inclusive) with a single space between them.
     */
    private String getTokenText(List<T> tokenList, int start, int end)
    {
        List<String> parts = new ArrayList<String>();
        int position = start;
        while (position <= end) {
            parts.add(tokenList.get(position).getCoveredText());
            position++;
        }
        return StringUtils.join(parts, " ");
    }
}