package at.lux.retrieval.vectorspace; import org.jdom.Document; import org.jdom.JDOMException; import org.jdom.input.SAXBuilder; import org.apache.lucene.index.IndexReader; import org.apache.lucene.document.Field; import junit.framework.TestCase; import java.io.File; import java.io.IOException; /* * This file is part of Caliph & Emir. * * Caliph & Emir is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Caliph & Emir is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Caliph & Emir; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Copyright statement: * -------------------- * (c) 2002-2006 by Mathias Lux (mathias@juggle.at) * http://www.juggle.at, http://www.SemanticMetadata.net */ /** * This file is part of Caliph & Emir * Date: 16.03.2006 * Time: 22:03:46 * * @author Mathias Lux, mathias@juggle.at */ public class ElementTextVectorSimilarityTest extends TestCase { Document d1, d2; String doc1 = "testdata/I-Know 02/iknow_008.mp7.xml"; String doc2 = "testdata/I-Know 02/iknow_010.mp7.xml"; private SAXBuilder saxBuilder; /** * Sets up the fixture, for example, open a network connection. * This method is called before a test is executed. */ protected void setUp() throws Exception { super.setUp(); saxBuilder = new SAXBuilder(); d1 = saxBuilder.build(new File(doc1)); d2 = saxBuilder.build(new File(doc2)); } public void testSimilarity() throws IOException, JDOMException { ElementTextVectorSimilarity sim = new ElementTextVectorSimilarity(); double distance = sim.getSimilarity(d1, d1); System.out.println("distance = " + distance); distance = sim.getSimilarity(d1, d2); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d1); System.out.println("distance = " + distance); IndexReader reader = IndexReader.open("testdata/idx_paths"); System.out.println("Loading documents and adding them to corpus ..."); for (int i = 0; i < reader.numDocs(); i++) { // Graph g_idx = new Graph(reader.document(i).getField("graph").stringValue()); Field[] files = reader.document(i).getFields("file"); for (Field file : files) { Document d = saxBuilder.build(file.stringValue()); sim.addToCorpus(d); } } System.out.println(""); distance = sim.getSimilarity(d1, d1, ElementTextVectorSimilarity.WeightType.TfIdf); System.out.println("distance = " + distance); distance = sim.getSimilarity(d1, d2, ElementTextVectorSimilarity.WeightType.TfIdf); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d1, ElementTextVectorSimilarity.WeightType.TfIdf); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d2, ElementTextVectorSimilarity.WeightType.TfIdf); System.out.println("distance = " + distance); System.out.println(""); distance = sim.getSimilarity(d1, d1, ElementTextVectorSimilarity.WeightType.BM25); System.out.println("distance = " + distance); distance = sim.getSimilarity(d1, d2, ElementTextVectorSimilarity.WeightType.BM25); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d1, ElementTextVectorSimilarity.WeightType.BM25); System.out.println("distance = " + distance); distance = sim.getSimilarity(d2, d2, ElementTextVectorSimilarity.WeightType.BM25); System.out.println("distance = " + distance); } }