package at.lux.retrieval.vectorspace;
import at.lux.fotoretrieval.RetrievalToolkit;
import at.lux.fotoretrieval.lucene.Relation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.Namespace;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
/*
* This file is part of Caliph & Emir.
*
* Caliph & Emir is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Caliph & Emir is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Caliph & Emir; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Copyright statement:
* --------------------
* (c) 2002-2006 by Mathias Lux (mathias@juggle.at)
* http://www.juggle.at, http://www.SemanticMetadata.net
*/
/**
 * Computes a cosine similarity between two documents based on the text found in
 * their <code>//Semantic/SemanticBase</code> elements and, optionally, the types of
 * their <code>//Semantic/Graph/Relation</code> elements.
 * <p>
 * Each document is turned into a term-frequency vector (text is tokenized with a
 * Lucene {@link SimpleAnalyzer}). The vector entries can be used as they are
 * ({@link WeightType#Unweighted}) or weighted with corpus statistics
 * ({@link WeightType#TfIdf}, {@link WeightType#BM25}). The weighted variants require
 * that the compared documents were registered with {@link #addToCorpus(Document)}
 * beforehand.
 * <p>
 * Instances hold mutable corpus statistics and perform no synchronization.
 * <p>
 * This file is part of Caliph & Emir
 * Date: 16.03.2006
 * Time: 21:32:35
 *
 * @author Mathias Lux, mathias@juggle.at
 */
public class ElementTextVectorSimilarity {
    private static final String SEMANTIC_BASE_XPATH = "//Semantic/SemanticBase";
    private static final String SEMANTIC_RELATION_XPATH = "//Semantic/Graph/Relation";
    private static final String NOT_IN_CORPUS_MESSAGE = "Document has to be added to corpus first!";

    /** BM25 term-frequency saturation parameter. */
    private static final double BM25_K1 = 1.5;
    /** BM25 document-length normalization parameter. */
    private static final double BM25_B = 0.5;

    private final Namespace xsi = Namespace.getNamespace("xsi", "http://www.w3.org/2001/XMLSchema-instance");
    private final Analyzer analyzer = new SimpleAnalyzer();
    /** Maps a term to the number of corpus documents that contain it (document frequency). */
    private final HashMap<String, Integer> documentFrequency = new HashMap<String, Integer>(128);
    /** Number of documents added through {@link #addToCorpus(Document)}. */
    private int numDocs = 0;
    /** Sum of the term-vector lengths (total term occurrences) of all corpus documents. */
    private int allDocsLength = 0;
    /** Whether relation types are added to the term vectors; never changed within this class. */
    private boolean useRelations = true;

    /**
     * Weighting schemes that can be applied to the term vectors before the cosine is computed.
     */
    public enum WeightType {
        TfIdf,
        BM25,
        Unweighted
    }

    /**
     * Computes the similarity of two documents using raw term frequencies
     * ({@link WeightType#Unweighted}).
     *
     * @param mpeg7Document1 first document
     * @param mpeg7Document2 second document
     * @return cosine of the two term vectors, or 0 if it is undefined
     */
    public double getSimilarity(Document mpeg7Document1, Document mpeg7Document2) {
        return getSimilarity(mpeg7Document1, mpeg7Document2, WeightType.Unweighted);
    }

    /**
     * Computes the cosine similarity of the (optionally weighted) term vectors of two documents.
     *
     * @param mpeg7Document1 first document
     * @param mpeg7Document2 second document
     * @param type           weighting scheme; anything other than {@link WeightType#TfIdf} or
     *                       {@link WeightType#BM25} (including <code>null</code>) uses raw term frequencies
     * @return the cosine of the two vectors; 0 if either vector has zero length (for example
     *         when a document yields no terms), where the cosine is undefined
     * @throws UnsupportedOperationException if a weighted scheme is requested and a term of either
     *                                       document is unknown to the corpus statistics
     */
    public double getSimilarity(Document mpeg7Document1, Document mpeg7Document2, WeightType type) {
        HashMap<String, Integer> termVector1 = createTermVector(mpeg7Document1);
        HashMap<String, Integer> termVector2 = createTermVector(mpeg7Document2);

        // The vector space is spanned by the union of the terms of both documents.
        HashSet<String> terms = new HashSet<String>(termVector1.size() + termVector2.size());
        terms.addAll(termVector1.keySet());
        terms.addAll(termVector2.keySet());

        double avgDocLength = 0.0;
        double docLength1 = 0.0;
        double docLength2 = 0.0;
        if (type == WeightType.BM25) {
            avgDocLength = (double) allDocsLength / (double) numDocs;
            // Document length is measured in term occurrences, the same unit that
            // addToCorpus accumulates into allDocsLength, so that dl / avdl is meaningful.
            docLength1 = getDocumentLength(termVector1);
            docLength2 = getDocumentLength(termVector2);
        }

        double dotProduct = 0.0;
        double squaredNorm1 = 0.0;
        double squaredNorm2 = 0.0;
        for (String term : terms) {
            double weight1 = 0.0;
            double weight2 = 0.0;
            Integer frequency1 = termVector1.get(term);
            if (frequency1 != null) {
                weight1 = getWeight(type, term, frequency1, avgDocLength, docLength1);
                squaredNorm1 += weight1 * weight1;
            }
            Integer frequency2 = termVector2.get(term);
            if (frequency2 != null) {
                weight2 = getWeight(type, term, frequency2, avgDocLength, docLength2);
                squaredNorm2 += weight2 * weight2;
            }
            dotProduct += weight1 * weight2;
        }

        // Stay in double precision; a zero norm would otherwise yield NaN (0 / 0).
        double norm = Math.sqrt(squaredNorm1 * squaredNorm2);
        if (norm == 0.0) {
            return 0.0;
        }
        return dotProduct / norm;
    }

    /**
     * Returns the weight of a single term-vector entry under the given scheme.
     *
     * @throws UnsupportedOperationException if the scheme needs corpus statistics and the
     *                                       term has no document frequency recorded
     */
    private double getWeight(WeightType type, String term, int termFreq, double avgDocLength, double docLength) {
        if (type != WeightType.TfIdf && type != WeightType.BM25) {
            return termFreq;
        }
        Integer docFreq = documentFrequency.get(term);
        if (docFreq == null) {
            throw new UnsupportedOperationException(NOT_IN_CORPUS_MESSAGE);
        }
        if (type == WeightType.TfIdf) {
            return termFreq * Math.log((double) numDocs / (double) docFreq);
        }
        return getBm25Weight(BM25_K1, BM25_B, termFreq, docFreq, avgDocLength, docLength);
    }

    /**
     * Computes the BM25 weight
     * <code>((k1 + 1) * tf) / (k1 * ((1 - b) + b * dl / avdl) + tf) * log((N - df + 0.5) / (df + 0.5))</code>
     * where N is the number of documents in the corpus. The logarithm, and therefore the
     * weight, is negative for terms contained in more than half of the corpus documents.
     *
     * @param k1           term-frequency saturation parameter
     * @param b            length normalization parameter
     * @param termFreq     frequency of the term in the document (tf)
     * @param docFreq      number of corpus documents containing the term (df)
     * @param avgDocLength average document length in the corpus (avdl)
     * @param docLength    length of the document (dl)
     * @return the BM25 weight of the term
     */
    private double getBm25Weight(double k1, double b, double termFreq, double docFreq, double avgDocLength, double docLength) {
        assert (numDocs >= docFreq);
        double lengthNormalization = (1 - b) + b * docLength / avgDocLength;
        double termFrequencyPart = ((k1 + 1.0) * termFreq) / (k1 * lengthNormalization + termFreq);
        double inverseDocumentFrequency = Math.log((numDocs - docFreq + 0.5) / (docFreq + 0.5));
        return termFrequencyPart * inverseDocumentFrequency;
    }

    /**
     * Adds a document to the corpus statistics (document count, document frequencies and
     * accumulated document length) used by the weighted similarity variants.
     *
     * @param mpeg7Document document to register
     */
    public void addToCorpus(Document mpeg7Document) {
        // Build the vector first so that a failure leaves the statistics untouched.
        HashMap<String, Integer> termVector = createTermVector(mpeg7Document);
        numDocs++;
        allDocsLength += getDocumentLength(termVector);
        for (String term : termVector.keySet()) {
            increment(documentFrequency, term);
        }
    }

    /** Returns the sum of all term frequencies in the given term vector. */
    private static int getDocumentLength(HashMap<String, Integer> termVector) {
        int length = 0;
        for (Integer frequency : termVector.values()) {
            length += frequency;
        }
        return length;
    }

    /** Adds one to the counter stored under <code>key</code>, starting at 1 for unseen keys. */
    private static void increment(HashMap<String, Integer> counts, String key) {
        Integer current = counts.get(key);
        counts.put(key, Integer.valueOf(current == null ? 1 : current.intValue() + 1));
    }

    /** Queries the semantic nodes and relations of a document and builds its term vector. */
    private HashMap<String, Integer> createTermVector(Document mpeg7Document) {
        Element root = mpeg7Document.getRootElement();
        List<?> semanticBase = RetrievalToolkit.xpathQuery(root, SEMANTIC_BASE_XPATH, null);
        List<?> semanticRelations = RetrievalToolkit.xpathQuery(root, SEMANTIC_RELATION_XPATH, null);
        return createTermVector(semanticBase, semanticRelations);
    }

    /**
     * Builds a term-frequency vector from semantic nodes and, if relations are enabled,
     * from the relation types.
     *
     * @param semanticBaseDoc      list of {@link Element}s representing semantic nodes
     * @param semanticRelationsDoc list of {@link Element}s representing relations
     * @return map from term to its number of occurrences
     */
    private HashMap<String, Integer> createTermVector(List<?> semanticBaseDoc, List<?> semanticRelationsDoc) {
        HashMap<String, Integer> termVector = new HashMap<String, Integer>(32);
        // nodes ...
        for (Object node : semanticBaseDoc) {
            addToTermVector((Element) node, termVector);
        }
        // relations ...
        if (useRelations) {
            for (Object node : semanticRelationsDoc) {
                String rawType = ((Element) node).getAttributeValue("type");
                if (rawType == null) {
                    // A relation without a type attribute contributes no term.
                    continue;
                }
                String relationType = getRelationType(rawType);
                // Types missing from the mapping are replaced by their inverted type.
                if (!Relation.relationMapping.containsKey(relationType)) {
                    relationType = Relation.invertRelationType(relationType);
                }
                increment(termVector, relationType);
            }
        }
        return termVector;
    }

    /**
     * Adds the terms of a semantic node to the term vector: label and free text for every
     * node, plus type specific fields selected by the node's <code>xsi:type</code> attribute.
     * Nodes without that attribute only contribute label and free text.
     */
    private void addToTermVector(Element node, HashMap<String, Integer> result) {
        // add labels:
        addTerms(result, getTextFromXPath(node, "Label/Name"));
        // add free text:
        addTerms(result, getTextFromXPath(node, "Definition/FreeTextAnnotation"));

        // May be null if the attribute is absent, hence the constant-first comparisons.
        String nodeType = node.getAttributeValue("type", xsi);
        if ("AgentObjectType".equals(nodeType)) {
            addTerms(result, getTextFromXPath(node, "Agent/Name/GivenName"));
            addTerms(result, getTextFromXPath(node, "Agent/Name/FamilyName"));
            addTerms(result, getTextFromXPath(node, "Agent/Affiliation/Organization/Name"));
            addTerms(result, getTextFromXPath(node, "Agent/Address/PostalAddress/AddressLine"));
            addTerms(result, getTextFromXPath(node, "Agent/ElectronicAddress/Email"));
            addTerms(result, getTextFromXPath(node, "Agent/ElectronicAddress/Url"));
        } else if ("EventType".equals(nodeType)) {
            addTerms(result, getTextFromXPath(node, "SemanticPlace/Label/Name"));
            addTerms(result, getTextFromXPath(node, "SemanticPlace/Place/PostalAddress/AddressLine"));
            addTerms(result, getTextFromXPath(node, "SemanticTime/Label/Name"));
        } else if ("SemanticPlaceType".equals(nodeType)) {
            addTerms(result, getTextFromXPath(node, "Place/PostalAddress/AddressLine"));
        }
        // "SemanticTimeType" and any other type: nothing more to do.
    }

    /**
     * Concatenates the trimmed text of all elements matched by the XPath expression,
     * separated by single spaces.
     */
    private String getTextFromXPath(Element node, String xPath) {
        StringBuilder sb = new StringBuilder(128);
        List<?> labels = RetrievalToolkit.xpathQuery(node, xPath, null);
        for (Iterator<?> iterator = labels.iterator(); iterator.hasNext();) {
            Element e = (Element) iterator.next();
            sb.append(e.getTextTrim());
            if (iterator.hasNext()) {
                sb.append(' ');
            }
        }
        return sb.toString();
    }

    /**
     * Tokenizes the text with the analyzer and counts every token in the term vector.
     *
     * @throws IllegalStateException if the token stream reports an I/O error; the text is read
     *                               from an in-memory reader, so this is not expected, and a
     *                               partially filled vector would silently skew the similarity
     */
    private void addTerms(HashMap<String, Integer> terms, String text) {
        TokenStream tokenStream = analyzer.tokenStream("tmp", new StringReader(text));
        try {
            Token token;
            while ((token = tokenStream.next()) != null) {
                increment(terms, token.termText());
            }
        } catch (IOException e) {
            throw new IllegalStateException("Could not tokenize text: " + text, e);
        }
    }

    /**
     * Strips everything up to and including the last ':' from a relation type and
     * returns the rest; a string without ':' is returned unchanged.
     */
    private static String getRelationType(String relationType) {
        int index = relationType.lastIndexOf(':');
        return relationType.substring(index + 1);
    }
}