package lia.advsearching;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;
import lia.common.TestUtil;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

// From chapter 5
/**
 * Categorizes a free-text subject by comparing it against per-category
 * term-frequency vectors.
 *
 * <p>{@link #setUp()} reads every live document of the book index, groups the
 * documents by their "category" field and sums the term frequencies of their
 * "subject" term vectors. A subject string is then assigned to the category
 * whose vector forms the smallest angle with the subject's words.
 */
public class CategorizerTest extends TestCase {

  /** Category name mapped to that category's term -> summed frequency vector. */
  Map<String, Map<String, Integer>> categoryMap;

  /** Rebuilds the category vectors from the book index before each test. */
  @Override
  protected void setUp() throws Exception {
    super.setUp();
    categoryMap = new TreeMap<String, Map<String, Integer>>();

    buildCategoryVectors();
    // Call dumpCategoryVectors() here to print the vectors while debugging.
  }

  public void testCategorization() throws Exception {
    assertEquals("/technology/computers/programming/methodology",
        getCategory("extreme agile methodology"));
    assertEquals("/education/pedagogy",
        getCategory("montessori education philosophy"));
  }

  /** Debugging aid: prints every category followed by its term frequencies. */
  private void dumpCategoryVectors() {
    for (Map.Entry<String, Map<String, Integer>> categoryEntry : categoryMap.entrySet()) {
      System.out.println("Category " + categoryEntry.getKey());

      for (Map.Entry<String, Integer> termEntry : categoryEntry.getValue().entrySet()) {
        System.out.println("    " + termEntry.getKey() + " = " + termEntry.getValue());
      }
    }
  }

  /**
   * Populates {@link #categoryMap} from the book index.
   *
   * <p>Deleted documents, documents without a "category" value and documents
   * without a stored "subject" term vector contribute nothing.
   *
   * @throws IOException if the index cannot be opened or read
   */
  private void buildCategoryVectors() throws IOException {
    IndexReader reader = IndexReader.open(TestUtil.getBookIndexDirectory());
    try {
      int maxDoc = reader.maxDoc();

      for (int i = 0; i < maxDoc; i++) {
        if (reader.isDeleted(i)) {
          continue;
        }

        Document doc = reader.document(i);
        String category = doc.get("category");
        if (category == null) {
          // A TreeMap with natural ordering rejects null keys.
          continue;
        }

        Map<String, Integer> vectorMap = categoryMap.get(category);
        if (vectorMap == null) {
          vectorMap = new TreeMap<String, Integer>();
          categoryMap.put(category, vectorMap);
        }

        TermFreqVector termFreqVector = reader.getTermFreqVector(i, "subject");
        if (termFreqVector != null) {
          // null means no term vector was stored for this document's subject.
          addTermFreqToMap(vectorMap, termFreqVector);
        }
      }
    } finally {
      // Release the index files even when reading fails part-way through.
      reader.close();
    }
  }

  /**
   * Adds each term frequency of {@code termFreqVector} to the running total
   * kept for that term in {@code vectorMap}.
   *
   * @param vectorMap      term -> accumulated frequency; updated in place
   * @param termFreqVector terms and frequencies of one document field
   */
  private void addTermFreqToMap(Map<String, Integer> vectorMap,
                                TermFreqVector termFreqVector) {
    String[] terms = termFreqVector.getTerms();
    int[] freqs = termFreqVector.getTermFrequencies();

    for (int i = 0; i < terms.length; i++) {
      String term = terms[i];
      Integer previous = vectorMap.get(term);
      vectorMap.put(term, previous == null ? freqs[i] : previous + freqs[i]);
    }
  }

  /**
   * Finds the category whose term vector is closest in angle to the subject.
   *
   * @param subject space-separated words to categorize
   * @return the best matching category, or {@code null} when no category
   *         contains any of the words
   */
  private String getCategory(String subject) {
    String[] words = subject.split(" ");

    double bestAngle = Double.MAX_VALUE;
    String bestCategory = null;

    for (String category : categoryMap.keySet()) {
      double angle = computeAngle(words, category);
      // A NaN angle (no shared terms) never compares as smaller, so such
      // categories are skipped.
      if (angle < bestAngle) {
        bestAngle = angle;
        bestCategory = category;
      }
    }

    return bestCategory;
  }

  /**
   * Computes the angle, in radians, between the query words and the term
   * vector of {@code category}. Every query word is given frequency 1.
   *
   * @param words    the query words
   * @param category a key of {@link #categoryMap}
   * @return the angle in radians, or {@link Double#NaN} when the category
   *         contains none of the words
   */
  private double computeAngle(String[] words, String category) {
    Map<String, Integer> vectorMap = categoryMap.get(category);

    // long accumulators so that squaring a large frequency cannot overflow.
    long dotProduct = 0;
    long sumOfSquares = 0;
    for (String word : words) {
      Integer stored = vectorMap.get(word);
      long categoryWordFreq = (stored == null) ? 0 : stored.intValue();

      dotProduct += categoryWordFreq;  // each query word has frequency 1
      sumOfSquares += categoryWordFreq * categoryWordFreq;
    }

    if (sumOfSquares == 0) {
      // No shared terms: the angle is undefined (0/0).
      return Double.NaN;
    }

    double denominator;
    if (sumOfSquares == words.length) {
      // Shortcut: sqrt(n) * sqrt(n) is not always exactly n in floating point.
      denominator = sumOfSquares;
    } else {
      denominator = Math.sqrt(sumOfSquares) * Math.sqrt(words.length);
    }

    // Rounding can push the cosine slightly above 1, which would make
    // Math.acos return NaN and silently drop the best-matching category.
    double ratio = Math.min(1.0, dotProduct / denominator);
    return Math.acos(ratio);
  }
}