/* * Copyright 2011 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.ngrams.util; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.commons.lang.StringUtils; /** * Creates a character NGram iterable from a list of tokens. * * */ public class CharacterNGramStringIterable implements Iterable<String> { List<String> nGramList; /** * @param token * A token * @param minN * the minimal n-gram length. * @param maxN * the maximal n-gram length. */ public CharacterNGramStringIterable(String token, int minN, int maxN) { this.nGramList = createNGramList(token, minN, maxN); } @Override public Iterator<String> iterator() { return nGramList.iterator(); } private List<String> createNGramList(String token, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<String> nGrams = new ArrayList<String>(); // fill character list List<String> charList = new ArrayList<String>(); for (char c : token.toCharArray()) { charList.add(Character.toString(c)); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (charList.size() < k) { break; } nGrams.addAll(getNGrams(charList, k)); } return nGrams; } private List<String> getNGrams(List<String> tokenList, int k) { List<String> nGrams = new ArrayList<String>(); int size = tokenList.size(); for (int i = 0; i < (size + 1 - k); i++) { nGrams.add(StringUtils.join(tokenList.subList(i, i + k), "")); } return nGrams; } }