package com.knowledgebooks.nlp; import com.knowledgebooks.nlp.util.NameValue; import com.knowledgebooks.nlp.util.NoiseWords; import com.knowledgebooks.nlp.util.Tokenizer; import com.knowledgebooks.public_domain.Stemmer; import java.util.*; /** * License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt) */ /** * The purpose of this class is to determine a small set of search terms that are likely to * yield the same page using a search engine. Lots of overlapping code with AutoTagger but I * decided to keep this class separate because I need this functionality in a few cases * where I don't care about tagging text. */ public class ExtractSearchTerms { public ExtractSearchTerms(String text) { // this code is not so efficient since I first need to get the // best tags for the input text, then go back and keep track of // which words provide the most evidence for selecting these tags. List<NameValue<String, Float>> tagResults = new AutoTagger().getTags(text); Map<String, Float> tagRelevance = new HashMap<String, Float>(); for (NameValue<String, Float> nv : tagResults) { tagRelevance.put(nv.getName(), nv.getValue()); } List<String> words = Tokenizer.wordsToList(text); int number_of_words = words.size(); Stemmer stemmer = new Stemmer(); List<String> stems = new ArrayList<String>(number_of_words); for (String word : words) stems.add(stemmer.stemOneWord(word)); int number_of_tag_types = AutoTagger.tagClassNames.length; float[] scores = new float[number_of_words]; for (int w = 0; w < number_of_words; w++) { if (NoiseWords.checkFor(stems.get(w)) == false) { for (int i = 0; i < number_of_tag_types; i++) { Float f = AutoTagger.hashes.get(i).get(stems.get(w)); if (f != null) { Float tag_relevance_factor = tagRelevance.get(AutoTagger.tagClassNames[i]); if (tag_relevance_factor != null) { scores[w] += f * tag_relevance_factor; } } } } } float max_score = 0.001f; for (int i = 0; i < number_of_words; i++) if (max_score < scores[i]) max_score = scores[i]; float cutoff = 0.2f * max_score; for (int i = 0; i < number_of_words; i++) { if (NoiseWords.checkFor(stems.get(i)) == false) { if (scores[i] > cutoff) if (!bestSearchTerms.contains(words.get(i))) bestSearchTerms.add(words.get(i)); } } } public List<String> getBest() { return bestSearchTerms; } private List<String> bestSearchTerms = new ArrayList<String>(); }