package at.lux.retrieval.graphisomorphism.metrics;

import at.lux.retrieval.graphisomorphism.NodeDistanceFunction;
import at.lux.fotoretrieval.RetrievalToolkit;
import org.jdom.Element;
import org.jdom.Namespace;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

import java.util.*;
import java.io.StringReader;
import java.io.IOException;

/*
 * This file is part of Caliph & Emir.
 *
 * Caliph & Emir is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Caliph & Emir is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Caliph & Emir; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2006 by Mathias Lux (mathias@juggle.at)
 * http://www.juggle.at, http://www.SemanticMetadata.net
 */

/**
 * Node distance function that compares two MPEG-7 semantic description nodes
 * by the term vectors of their textual content (labels, free text annotations
 * and type-specific fields such as agent names or addresses). Terms are
 * produced by a Lucene {@link SimpleAnalyzer}; the distance is 1 minus the
 * selected similarity coefficient, so it lies in [0, 1].
 * <p/>
 * This file is part of Caliph &amp; Emir.
 * Date: 18.02.2006, Time: 15:56:07
 *
 * @author Mathias Lux, mathias@juggle.at
 */
public class TermVectorNodeDistanceFunction implements NodeDistanceFunction {

    /** Namespace of the {@code xsi:type} attribute used to dispatch on node type. */
    Namespace xsi = Namespace.getNamespace("xsi", "http://www.w3.org/2001/XMLSchema-instance");
    // Note: field name keeps its historical spelling ("analyer") for
    // compatibility with possible same-package accesses.
    Analyzer analyer = new SimpleAnalyzer();
    Type type = Type.BagOfWords;

    /** Supported similarity coefficients; Dice and Jaccard are not implemented yet. */
    public enum Type {BagOfWords, CosinusCoefficient, DiceCoefficient, JaccardCoefficient}

    /**
     * Creates a distance function using the given similarity type.
     *
     * @param type the coefficient to base the distance on
     */
    public TermVectorNodeDistanceFunction(Type type) {
        this.type = type;
    }

    /**
     * Computes the distance between the term vectors of two semantic nodes.
     *
     * @param node1 first semantic description element
     * @param node2 second semantic description element
     * @return a distance in [0, 1]; 0 means identical term vectors
     * @throws UnsupportedOperationException for Dice and Jaccard (not implemented)
     */
    public float getDistance(Element node1, Element node2) {
        HashMap<String, Integer> termVector1 = createTermVector(node1);
        HashMap<String, Integer> termVector2 = createTermVector(node2);
        int maxSize = Math.max(termVector1.size(), termVector2.size());
        // Two empty term vectors are trivially identical. This guard also
        // prevents 0/0 = NaN in both coefficient computations below.
        if (maxSize == 0) {
            return 0f;
        }
        // Union of all terms occurring in either vector (the vector space dimensions).
        HashSet<String> terms = new HashSet<String>(termVector1.size() + termVector2.size());
        terms.addAll(termVector1.keySet());
        terms.addAll(termVector2.keySet());
        if (type == Type.BagOfWords) {
            // Bag of words: fraction of shared terms relative to the bigger vector.
            int intersectionCardinality = 0;
            for (String term : terms) {
                if (termVector1.containsKey(term) && termVector2.containsKey(term)) {
                    intersectionCardinality++;
                }
            }
            return 1f - ((float) intersectionCardinality / (float) maxSize);
        } else if (type == Type.CosinusCoefficient) {
            // Cosine similarity: dot product over the product of vector norms.
            float dotProduct = 0f;
            long normSquared1 = 0;  // long accumulators avoid int overflow for large counts
            long normSquared2 = 0;
            for (String dim : terms) {
                Integer entry1 = termVector1.get(dim);
                Integer entry2 = termVector2.get(dim);
                if (entry1 != null) {
                    normSquared1 += (long) entry1 * entry1;
                }
                if (entry2 != null) {
                    normSquared2 += (long) entry2 * entry2;
                }
                if (entry1 != null && entry2 != null) {
                    dotProduct += (float) entry1 * entry2;
                }
            }
            float denominator = (float) Math.sqrt((double) normSquared1 * (double) normSquared2);
            // One vector empty, the other not: no overlap possible -> maximum
            // distance (the original code divided by zero here, yielding NaN).
            if (denominator == 0f) {
                return 1f;
            }
            return 1f - dotProduct / denominator;
        } else if (type == Type.DiceCoefficient) {
            throw new UnsupportedOperationException("Not implemented yet!");
        } else if (type == Type.JaccardCoefficient) {
            throw new UnsupportedOperationException("Not implemented yet!");
        } else {
            throw new UnsupportedOperationException("Type " + type.name() + " unknown!");
        }
    }

    /**
     * Returns the maximum distance for this function.
     * Used for normalization and algorithmic issues
     *
     * @return the maximum distance.
     */
    public float getMaxDistance() {
        return 1f;
    }

    /**
     * Builds a term frequency vector from the textual content of a semantic
     * node. Label and free text are always included; additional fields are
     * harvested depending on the node's {@code xsi:type}.
     *
     * @param node the semantic description element to index
     * @return map from term to its occurrence count (never null)
     */
    private HashMap<String, Integer> createTermVector(Element node) {
        HashMap<String, Integer> result = new HashMap<String, Integer>(32);
        // Common to all node types: labels and free text annotations.
        addTerms(result, getTextFromXPath(node, "Label/Name"));
        addTerms(result, getTextFromXPath(node, "Definition/FreeTextAnnotation"));
        // Constant-first equals() makes a missing xsi:type attribute harmless
        // (the original code NPE'd on nodes without a type attribute).
        String nodeType = node.getAttributeValue("type", xsi);
        if ("AgentObjectType".equals(nodeType)) {
            addTerms(result, getTextFromXPath(node, "Agent/Name/GivenName"));
            addTerms(result, getTextFromXPath(node, "Agent/Name/FamilyName"));
            addTerms(result, getTextFromXPath(node, "Agent/Affiliation/Organization/Name"));
            addTerms(result, getTextFromXPath(node, "Agent/Address/PostalAddress/AddressLine"));
            addTerms(result, getTextFromXPath(node, "Agent/ElectronicAddress/Email"));
            addTerms(result, getTextFromXPath(node, "Agent/ElectronicAddress/Url"));
        } else if ("EventType".equals(nodeType)) {
            addTerms(result, getTextFromXPath(node, "SemanticPlace/Label/Name"));
            addTerms(result, getTextFromXPath(node, "SemanticPlace/Place/PostalAddress/AddressLine"));
            addTerms(result, getTextFromXPath(node, "SemanticTime/Label/Name"));
        } else if ("SemanticPlaceType".equals(nodeType)) {
            addTerms(result, getTextFromXPath(node, "Place/PostalAddress/AddressLine"));
        }
        // SemanticTimeType and any other/unknown types: only label and free text contribute.
        return result;
    }

    /**
     * Collects the trimmed text of all elements matching the given XPath,
     * joined with single spaces.
     *
     * @param node  context element for the XPath query
     * @param xPath relative XPath expression selecting elements
     * @return the concatenated text, possibly empty (never null)
     */
    private String getTextFromXPath(Element node, String xPath) {
        StringBuilder sb = new StringBuilder(128);
        List labels = RetrievalToolkit.xpathQuery(node, xPath, null);
        for (Iterator iterator = labels.iterator(); iterator.hasNext();) {
            Element e = (Element) iterator.next();
            sb.append(e.getTextTrim());
            if (iterator.hasNext()) sb.append(' ');
        }
        return sb.toString();
    }

    /**
     * Tokenizes the given text with the configured analyzer and increments
     * the count of each resulting term in the given vector.
     *
     * @param terms term-to-frequency map to update in place
     * @param text  raw text to tokenize
     */
    private void addTerms(HashMap<String, Integer> terms, String text) {
        TokenStream tokenStream = analyer.tokenStream("tmp", new StringReader(text));
        try {
            Token token;
            while ((token = tokenStream.next()) != null) {
                String s = token.termText();
                // Single lookup + single put per token (the original did
                // get/put/get/put for previously unseen terms).
                Integer count = terms.get(s);
                terms.put(s, count == null ? 1 : count + 1);
            }
        } catch (IOException e) {
            // Best effort: an analyzer failure simply contributes no further terms.
            e.printStackTrace();
        }
    }
}