package com.cse10.duplicateDetector;

import weka.core.tokenizers.NGramTokenizer;

import java.util.ArrayList;
import java.util.List;

/**
 * Segments a document into full (single) words.
 * Created by Chamath on 1/2/2015.
 */
public class FullWordSegmenter extends WordSegmenter {

    private NGramTokenizer nGramTokenizer;

    public FullWordSegmenter() {
        // An n-gram size of exactly 1 yields individual words rather than multi-word n-grams.
        nGramTokenizer = new NGramTokenizer();
        nGramTokenizer.setNGramMaxSize(1);
        nGramTokenizer.setNGramMinSize(1);
    }

    /**
     * Tokenizes the given string into lower-cased words.
     *
     * @param document the text to tokenize
     * @return the words in the order they appear in the document
     */
    @Override
    protected List<String> getWords(String document) {
        List<String> words = new ArrayList<>();
        nGramTokenizer.tokenize(document);
        while (nGramTokenizer.hasMoreElements()) {
            String token = (String) nGramTokenizer.nextElement();
            words.add(token.toLowerCase());
        }
        return words;
    }
}
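
// A minimal usage sketch, assuming Weka is on the classpath and the caller sits in the same
// com.cse10.duplicateDetector package (so the protected getWords method is accessible).
// The DemoFullWordSegmenter class name and the sample sentence are illustrative only:
//
//   public class DemoFullWordSegmenter {
//       public static void main(String[] args) {
//           FullWordSegmenter segmenter = new FullWordSegmenter();
//           // With Weka's default delimiters this should print:
//           // [weka, tokenizes, this, sentence, into, single, words]
//           System.out.println(segmenter.getWords("Weka tokenizes this sentence into single words"));
//       }
//   }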