/*
 * Encog(tm) Core v3.4 - Java Version
 * http://www.heatonresearch.com/encog/
 * https://github.com/encog/encog-java-core
 * Copyright 2008-2016 Heaton Research, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * For more information on Heaton Research copyrights, licenses
 * and trademarks visit:
 * http://www.heatonresearch.com/copyright
 */
package org.encog.util.text;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

/**
 * Counts how often each word occurs in the text handed to it.
 *
 * <p>Text passed to {@link #process(String)} is either split into words
 * (the default) or counted as one single entry, depending on
 * {@link #isBreakSpaces()}. When {@link #isIgnoreCase()} is set (the default)
 * words are lower-cased before they are stored or looked up.</p>
 *
 * <p>{@link #probability(String)} applies additive smoothing using the
 * constant {@code k} supplied to the constructor and the value of
 * {@link #getLaplaceClasses()}.</p>
 */
public class BagOfWords {

    /** Occurrence count per (possibly lower-cased) word. */
    private final Map<String, Integer> words = new HashMap<String, Integer>();

    /** When true, {@link #process(String)} splits its input into words. */
    private boolean breakSpaces = true;

    /** When true, words are lower-cased before being stored or looked up. */
    private boolean ignoreCase = true;

    /** Number of word occurrences counted so far. */
    private int totalWords;

    /** Smoothing constant added to every count in {@link #probability(String)}. */
    private final int k;

    /** Multiplier for {@code k} in the denominator of {@link #probability(String)}. */
    private int laplaceClasses;

    /**
     * Create a bag that uses the given smoothing constant.
     *
     * @param laplace the value of {@code k} used by {@link #probability(String)}
     */
    public BagOfWords(int laplace) {
        this.k = laplace;
    }

    /**
     * Create a bag with a smoothing constant of zero.
     */
    public BagOfWords() {
        this(0);
    }

    /**
     * Add the contents of a string to the bag. If {@code breakSpaces} is set
     * the string is split into words first; otherwise the whole string is
     * counted as a single word.
     *
     * @param str the text to add
     */
    public void process(String str) {
        if (breakSpaces) {
            processSpaces(str);
        } else {
            increase(str);
        }
    }

    /**
     * Split a string into words and count each one. A word is a maximal run
     * of letters, digits and apostrophes; every other character acts as a
     * separator and is discarded.
     *
     * @param str the text to split
     */
    private void processSpaces(String str) {
        StringBuilder word = new StringBuilder();

        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            boolean wordChar = ch == '\'' || Character.isLetterOrDigit(ch);

            if (wordChar) {
                word.append(ch);
            } else if (word.length() > 0) {
                // a separator ends the word collected so far
                increase(word.toString());
                word.setLength(0);
            }
        }

        // flush a word that runs up to the end of the input
        if (word.length() > 0) {
            increase(word.toString());
        }
    }

    /**
     * Apply the current case setting to a word.
     *
     * @param word the raw word
     * @return the word lower-cased with {@link Locale#ROOT} when
     *         {@code ignoreCase} is set, otherwise the word unchanged
     */
    private String normalize(String word) {
        // Locale.ROOT keeps the folding independent of the JVM default locale
        return this.ignoreCase ? word.toLowerCase(Locale.ROOT) : word;
    }

    /**
     * Count one occurrence of a word. Also increments the total word count
     * and the Laplace class count by one.
     *
     * @param word the word to count
     */
    public void increase(String word) {
        this.totalWords++;
        this.laplaceClasses++;

        String key = normalize(word);
        Integer current = this.words.get(key);
        this.words.put(key, current == null ? 1 : current + 1);
    }

    /**
     * @return the breakSpaces
     */
    public boolean isBreakSpaces() {
        return breakSpaces;
    }

    /**
     * @param breakSpaces the breakSpaces to set
     */
    public void setBreakSpaces(boolean breakSpaces) {
        this.breakSpaces = breakSpaces;
    }

    /**
     * @return the ignoreCase
     */
    public boolean isIgnoreCase() {
        return ignoreCase;
    }

    /**
     * @param ignoreCase the ignoreCase to set
     */
    public void setIgnoreCase(boolean ignoreCase) {
        this.ignoreCase = ignoreCase;
    }

    /**
     * @return the words; this is the internal map itself, not a copy
     */
    public Map<String, Integer> getWords() {
        return words;
    }

    /**
     * Remove every word and reset the counters that {@link #increase(String)}
     * accumulates (the total word count and the Laplace class count), so the
     * bag can be refilled from scratch. The case and splitting settings and
     * the smoothing constant are left untouched.
     */
    public void clear() {
        this.words.clear();
        this.totalWords = 0;
        this.laplaceClasses = 0;
    }

    /**
     * List the words in sorted order, one {@code word,count} pair per line.
     *
     * @return the textual listing; empty if the bag holds no words
     */
    @Override
    public String toString() {
        StringBuilder result = new StringBuilder();

        // a TreeSet yields the keys in their natural (sorted) order
        Set<String> sorted = new TreeSet<String>(this.words.keySet());

        for (String key : sorted) {
            result.append(key);
            result.append(",");
            result.append(this.words.get(key).intValue());
            result.append("\n");
        }

        return result.toString();
    }

    /**
     * Check whether a word has been counted. The word is case-folded in the
     * same way as in {@link #increase(String)} and {@link #getCount(String)}.
     *
     * @param word the word to look for
     * @return true if the bag holds at least one occurrence of the word
     */
    public boolean contains(String word) {
        return this.words.containsKey(normalize(word));
    }

    /**
     * @return the smoothing constant supplied to the constructor
     */
    public int getK() {
        return this.k;
    }

    /**
     * @return the totalWords
     */
    public int getTotalWords() {
        return totalWords;
    }

    /**
     * Look up how many times a word has been counted.
     *
     * @param word the word to look for
     * @return the number of occurrences, or zero if the word is unknown
     */
    public int getCount(String word) {
        Integer count = this.words.get(normalize(word));
        return count == null ? 0 : count.intValue();
    }

    /**
     * Estimate the probability of a word as
     * {@code (count + k) / (totalWords + k * laplaceClasses)}.
     *
     * <p>No guard is applied to the denominator: if it is zero (for example an
     * empty bag with {@code k == 0}) the result is NaN or infinite.</p>
     *
     * @param word the word to evaluate
     * @return the smoothed relative frequency of the word
     */
    public double probability(String word) {
        double n = (double) getCount(word) + (double) this.k;
        // widen before multiplying so a large k * laplaceClasses cannot overflow int
        double d = (double) getTotalWords() + (double) this.k * (double) this.laplaceClasses;
        return n / d;
    }

    /**
     * @return the laplaceClasses
     */
    public int getLaplaceClasses() {
        return laplaceClasses;
    }

    /**
     * @param laplaceClasses the laplaceClasses to set
     */
    public void setLaplaceClasses(int laplaceClasses) {
        this.laplaceClasses = laplaceClasses;
    }

    /**
     * @param totalWords the totalWords to set
     */
    public void setTotalWords(int totalWords) {
        this.totalWords = totalWords;
    }

    /**
     * @return the number of distinct words in the bag
     */
    public int getUniqueWords() {
        return this.words.size();
    }
}