/*
 * This file is part of Caliph & Emir.
 *
 * Caliph & Emir is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Caliph & Emir is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Caliph & Emir; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
 * http://www.juggle.at, http://caliph-emir.sourceforge.net
 */
package at.lux.fotoretrieval.retrievalengines;

import at.lux.components.StatusBar;
import at.lux.fotoretrieval.FileOperations;
import at.lux.fotoretrieval.ResultListEntry;
import at.lux.fotoretrieval.RetrievalToolkit;
import at.lux.fotoretrieval.lucene.Graph;
import at.lux.fotoretrieval.lucene.Node;
import at.lux.fotoretrieval.lucene.Relation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;

import javax.swing.*;
import java.io.*;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * Date: 13.10.2004
 * Time: 21:47:58
 *
 * @author Mathias Lux, mathias@juggle.at
 */
public class LuceneRetrievalEngine extends AbstractRetrievalEngine {

    private int maxResults = 40;
    private static Namespace xsi = Namespace.getNamespace("xsi", "http://www.w3.org/2001/XMLSchema-instance");
    /** Maps each relation type to its inverse, so relations can be normalized to one canonical direction. */
    public static final HashMap<String, String> relationMapping;

    public LuceneRetrievalEngine(int maxResults) {
        this.maxResults = maxResults;
    }

    static {
        relationMapping = new HashMap<String, String>(27);
        relationMapping.put("key", "keyFor");
        relationMapping.put("annotates", "annotatedBy");
        relationMapping.put("shows", "appearsIn");
        relationMapping.put("references", "referencedBy");
        relationMapping.put("quality", "qualityOf");
        relationMapping.put("symbolizes", "symbolizedBy");
        relationMapping.put("location", "locationOf");
        relationMapping.put("source", "sourceOf");
        relationMapping.put("destination", "destinationOf");
        relationMapping.put("path", "pathOf");
        relationMapping.put("time", "timeOf");
        relationMapping.put("depicts", "depictedBy");
        relationMapping.put("represents", "representedBy");
        relationMapping.put("context", "contextFor");
        relationMapping.put("interprets", "interpretedBy");
        relationMapping.put("agent", "agentOf");
        relationMapping.put("patient", "patientOf");
        relationMapping.put("experiencer", "experiencerOf");
        relationMapping.put("stimulus", "stimulusOf");
        relationMapping.put("causer", "causerOf");
        relationMapping.put("goal", "goalOf");
        relationMapping.put("beneficiary", "beneficiaryOf");
        relationMapping.put("theme", "themeOf");
        relationMapping.put("result", "resultOf");
        relationMapping.put("instrument", "instrumentOf");
        relationMapping.put("accompanier", "accompanierOf");
        relationMapping.put("summarizes", "summarizedBy");
        relationMapping.put("specializes", "generalizes");
        relationMapping.put("exemplifies", "exemplifiedBy");
        relationMapping.put("part", "partOf");
        relationMapping.put("property", "propertyOf");
        relationMapping.put("user", "userOf");
        relationMapping.put("component", "componentOf");
        relationMapping.put("substance", "substanceOf");
        relationMapping.put("entails", "entailedBy");
        relationMapping.put("manner", "mannerOf");
        relationMapping.put("state", "stateOf");
        relationMapping.put("influences", "dependsOn");
    }
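
    /*
     * How the mapping is used (worked example, derived from eliminateInverse()
     * below): a relation agent(source=1, target=2) is normalized to its
     * canonical direction agentOf(source=2, target=1), because "agent" is a
     * key in relationMapping and "agentOf" is its inverse. Types that appear
     * only as values (e.g. "agentOf") are left untouched.
     */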

    /**
     * In this case we can search for images with a String query defining
     * a graph, where the nodes are built from search queries in square brackets
     * and are referenced in relations by their position, starting with number 1.
     * The relations follow the nodes, each given as its type followed by the
     * positions of the source node and the target node in the list:
     * <code>
     * query := node+ relation+
     * node := [term+]
     * relation := type source target
     * term := String
     * type := String
     * source := Integer
     * target := Integer
     * </code>
     * e.g. "[Mathias Lux] [Talking] agentOf 1 2"
     *
     * @param xPath         the query string as described above
     * @param objects       can be set to <code>null</code>
     * @param whereToSearch base directory of the index
     * @param recursive     search recursively
     * @param progress      progress bar for feedback, may be <code>null</code>
     */
    public List<ResultListEntry> getImagesBySemantics(String xPath, Vector objects, String whereToSearch,
                                                      boolean recursive, JProgressBar progress) {
        List<String> nodeQueries = new LinkedList<String>();
        StringTokenizer st = new StringTokenizer(xPath, "]");
        String relationString = "";
        List<Relation> relations = new LinkedList<Relation>();
        while (st.hasMoreTokens()) {
            String s = st.nextToken().trim();
            if (s.startsWith("[")) {
                s = s.substring(1);
                nodeQueries.add(s);
            } else {
                relationString = s;
            }
        }
        if (relationString.length() > 1) {
            // there are relations, go ahead and parse them:
            StringTokenizer sr = new StringTokenizer(relationString);
            Relation currentRelation = null;
            while (sr.hasMoreTokens()) {
                String s = sr.nextToken();
                try {
                    int i = Integer.parseInt(s);
                    // guard against malformed queries giving a number before any relation type:
                    if (currentRelation == null) continue;
                    if (currentRelation.getSource() < 0) {
                        currentRelation.setSource(i);
                    } else if (currentRelation.getTarget() < 0) {
                        currentRelation.setTarget(i);
                        currentRelation.eliminateInverse();
                        relations.add(currentRelation);
                        currentRelation = null;
                    }
                } catch (NumberFormatException e) {
                    // it's not a number, so it is the relation type :)
                    currentRelation = new Relation(-1, -1, s.trim());
                }
            }
        }
        // so for now do the retrieval for the nodes:
        int numOfNodes = nodeQueries.size();
        List<List<Node>> nodeResults = new LinkedList<List<Node>>();
        for (int i = 0; i < numOfNodes; i++) {
            String queryString = nodeQueries.get(i);
            List<Node> nodes = getNodes(queryString, whereToSearch);
            nodeResults.add(nodes);
        }
        // now we can expand our query on retrieved nodes:
        List<Graph> graphList = getExpandedGraphsFromResults(nodeResults, relations, 3);
        LinkedList<ResultListEntry> results = new LinkedList<ResultListEntry>();
        if (progress != null) {
            progress.setMinimum(0);
            progress.setMaximum(graphList.size());
            progress.setValue(0);
            progress.setString("Querying expanded graphs");
        }
        int countGraph = 0;
        for (Iterator<Graph> iterator = graphList.iterator(); iterator.hasNext();) {
            Graph graph = iterator.next();
            results.addAll(searchForGraph(graph, whereToSearch));
            countGraph++;
            if (progress != null) progress.setValue(countGraph);
        }
        // now eliminate the duplicates, keeping the entry with the highest relevance per file:
        if (progress != null) {
            progress.setMinimum(0);
            progress.setMaximum(results.size());
            progress.setValue(0);
            progress.setString("Removing double entries");
        }
        countGraph = 0;
        HashMap<String, ResultListEntry> gegencheck = new HashMap<String, ResultListEntry>();
        for (Iterator<ResultListEntry> iterator = results.iterator(); iterator.hasNext();) {
            ResultListEntry resultListEntry = iterator.next();
            String file = resultListEntry.getFilePath();
            double relevance = resultListEntry.getRelevance();
            if (gegencheck.containsKey(file)) {
                double rel = gegencheck.get(file).getRelevance();
                if (rel < relevance) {
                    gegencheck.put(file, resultListEntry);
                }
            } else {
                gegencheck.put(file, resultListEntry);
            }
            countGraph++;
            if (progress != null) progress.setValue(countGraph);
        }
        results.clear();
        results.addAll(gegencheck.values());
        Collections.sort(results);
        return results;
    }
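
    /*
     * Worked example (illustrative): the query "[Mathias Lux] [Talking] agentOf 1 2"
     * from the Javadoc above is split at ']' into the node queries
     * {"Mathias Lux", "Talking"} and the relation string "agentOf 1 2", which
     * parses to one Relation of type "agentOf" with source 1 and target 2.
     * Each node query is then resolved against the index via getNodes(..)
     * before the graphs are expanded and matched.
     */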

    private List<Graph> getExpandedGraphsFromResults(List<List<Node>> nodeResults, List<Relation> relations, int depth) {
        List<List<Node>> expanded = getExpandedSets(nodeResults, depth);
        // System.out.println("Expanding to " + expanded.size() + " graphs");
        List<Graph> results = new LinkedList<Graph>();
        for (Iterator<List<Node>> iterator = expanded.iterator(); iterator.hasNext();) {
            List<Node> nodes = iterator.next();
            Graph g = getGraphFromResults(nodes, relations);
            results.add(g);
        }
        // if there are any relations without type we have to
        // create a reverse relation each, otherwise we won't
        // get all our results:
        // List<Graph> additionalResults = new LinkedList<Graph>();
        // for (Iterator<Graph> iterator = results.iterator(); iterator.hasNext();) {
        //     Graph graph = iterator.next();
        //     expandUntypedRelations(graph, additionalResults);
        // }
        return results;
    }

    private List<List<Node>> getExpandedSets(List<List<Node>> nodeResults, int depth) {
        if (nodeResults.size() > 1) {
            List<Node> firstNodesResults = nodeResults.get(0);
            // count the leading perfect matches (weight >= 1) ...
            int numLevels = 0;
            for (Iterator<Node> iterator = firstNodesResults.iterator(); iterator.hasNext();) {
                Node node = iterator.next();
                if (node.getWeight() < 1f) break;
                numLevels++;
            }
            // ... and take up to depth additional candidates:
            numLevels += depth;
            if (firstNodesResults.size() < depth) {
                numLevels = firstNodesResults.size();
            }
            List<List<Node>> tmpNodeResults = new LinkedList<List<Node>>(nodeResults);
            tmpNodeResults.remove(0);
            List<List<Node>> results = getExpandedSets(tmpNodeResults, depth);
            List<List<Node>> endResult = new LinkedList<List<Node>>();
            for (int i = 0; i < numLevels && i < firstNodesResults.size(); i++) {
                for (int j = 0; j < results.size(); j++) {
                    List<Node> nodeList = new LinkedList<Node>(results.get(j));
                    nodeList.add(0, firstNodesResults.get(i));
                    endResult.add(nodeList);
                }
            }
            return endResult;
        } else {
            List<List<Node>> endResult = new LinkedList<List<Node>>();
            List<Node> firstNodesResults = nodeResults.get(0);
            int numLevels = 0;
            for (Iterator<Node> iterator = firstNodesResults.iterator(); iterator.hasNext();) {
                Node node = iterator.next();
                if (node.getWeight() < 1f) break;
                numLevels++;
            }
            numLevels += depth;
            for (int i = 0; i < numLevels && i < firstNodesResults.size(); i++) {
                List<Node> nodeList = new LinkedList<Node>();
                nodeList.add(firstNodesResults.get(i));
                endResult.add(nodeList);
            }
            return endResult;
        }
    }

    private Graph getGraphFromResults(List<Node> nodeResults, List<Relation> relations) {
        HashMap<Integer, Integer> idReplacementTable = new HashMap<Integer, Integer>(nodeResults.size());
        List<Node> nodes = new LinkedList<Node>();
        List<Relation> myRelations = new LinkedList<Relation>();
        for (int i = 0; i < nodeResults.size(); i++) {
            Node node = nodeResults.get(i);
            idReplacementTable.put(i + 1, node.getNodeID());
            nodes.add(node);
        }
        // Create the relations with the real IDs:
        for (Iterator<Relation> iterator = relations.iterator(); iterator.hasNext();) {
            Relation r = iterator.next();
            int src = (idReplacementTable.get(r.getSource()));
            int tgt = (idReplacementTable.get(r.getTarget()));
            myRelations.add(new Relation(src, tgt, r.getType()));
        }
        // now we can create the graph we want to search for:
        Graph g = new Graph(nodes, myRelations);
        return g;
    }
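
    /*
     * Expansion example (illustrative): with two node queries, depth = 3 and,
     * say, one perfect match (weight >= 1) followed by three weaker candidates
     * in each result list, numLevels is 1 + 3 = 4, so all four candidates of
     * each list are combined: 4 x 4 = 16 node sets, yielding 16 candidate graphs.
     */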

    private List<ResultListEntry> searchForGraph(Graph g, String whereToSearch) {
        // System.out.println("Querying for graph: " + g.toString());
        // for (Iterator<Node> iterator = g.getNodes().iterator(); iterator.hasNext();) {
        //     Node node = iterator.next();
        //     System.out.println(node.getLabel() + ": " + node.getNodeID() + " (" + node.getWeight() + ") ");
        // }
        // and we search for it in the text file:
        String indexFile;
        // create regex string:
        // as all nodes and relations are surrounded with square brackets this is easy.
        // between the relations there may be various literals: '.*'
        String regexInsert = ".*";
        StringBuilder graphSearch = new StringBuilder(g.toString().length() * 2);
        StringTokenizer stok = new StringTokenizer(g.toString(), "[");
        String s = "";
        graphSearch.append(regexInsert);
        LinkedList<String> graphSearchList = new LinkedList<String>();
        while (stok.hasMoreTokens()) {
            StringBuilder regexItem = new StringBuilder(32);
            s = stok.nextToken().trim();
            s = s.substring(0, s.length() - 1);
            // and there may be other nodes & relations:
            regexItem.append(regexInsert);
            // opening bracket '['
            regexItem.append("\\x5B");
            // the actual content (node or relation)
            regexItem.append(s);
            // closing bracket ']'
            regexItem.append("\\x5D");
            // and there may be other nodes & relations:
            regexItem.append(regexInsert);
            String regex = regexItem.toString();
            if (regex.indexOf("\\w*") > -1) {
                // here we create support for relation wildcards:
                regex = expandUntypedRelation(regex);
            }
            graphSearchList.add(regex);
            /*
            // opening bracket '['
            graphSearch.append("\\x5B");
            // the actual content (node or relation)
            graphSearch.append(s);
            // closing bracket ']'
            graphSearch.append("\\x5D");
            // and there may be other nodes & relations:
            graphSearch.append(regexInsert);
            */
        }
        List<ResultListEntry> resultList = new LinkedList<ResultListEntry>();
        SAXBuilder builder = new SAXBuilder();
        if (!whereToSearch.endsWith(File.separator)) {
            indexFile = whereToSearch + File.separator + "idx_graphs.list";
        } else {
            indexFile = whereToSearch + "idx_graphs.list";
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(indexFile))));
            String line = null;
            // String regex = graphSearch.toString();
            // String oldRegex = regex;
            // if (regex.indexOf("\\w*") > -1) {
            //     // here we create support for relation wildcards:
            //     regex = expandUntypedRelation(regex);
            // }
            // System.out.println("REGEX: " + regex);
            while ((line = br.readLine()) != null) {
                // a line matches only if it matches the regex of every node and relation:
                boolean match = true;
                for (Iterator<String> iterator = graphSearchList.iterator(); iterator.hasNext();) {
                    String regex = iterator.next();
                    if (!line.matches(regex)) {
                        match = false;
                        break; // one failing pattern is enough
                    }
                }
                if (match) {
                    // we found a graph:
                    System.out.println("FOUND: " + line);
                    StringTokenizer st = new StringTokenizer(line, "|");
                    String graphString = st.nextToken();
                    Graph theGraph = new Graph(graphString);
                    float similarity = g.getMcsSimilarity(theGraph);
                    while (st.hasMoreTokens()) {
                        String fileName = st.nextToken();
                        Element e = builder.build(new FileInputStream(fileName)).getRootElement();
                        ResultListEntry entry = new ResultListEntry((double) similarity, e, fileName);
                        resultList.add(entry);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return resultList;
    }
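
    /*
     * Regex construction example (illustrative, assuming Graph.toString()
     * renders nodes by id): for a graph containing the tokens "[42]" and
     * "[agentOf 42 43]", the per-token patterns are ".*\x5B42\x5D.*" and
     * ".*\x5BagentOf 42 43\x5D.*"; an index line is accepted only if it
     * matches all of them, independent of the order in which the tokens
     * appear on the line.
     */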

    private String expandUntypedRelation(String regex) {
        // split the pattern "...\w* <source> <target>\x5D..." around the two numbers:
        String behind = regex.substring(regex.indexOf("\\w*") + 4);
        String before = regex.substring(0, regex.indexOf("\\w*") + 4);
        String firstNum = behind.substring(0, behind.indexOf(' '));
        String secondNum = behind.substring(behind.indexOf(' ') + 1, behind.indexOf('\\'));
        behind = behind.substring(behind.indexOf('\\'));
        // allow source and target in either order, as the relation has no type to fix its direction:
        regex = before + "((" + firstNum + " " + secondNum + ")|(" + secondNum + " " + firstNum + "))" + behind;
        return regex;
    }

    /**
     * Not implemented ... please use the method of the engine
     * {@link at.lux.fotoretrieval.retrievalengines.FileSystemRetrieval}
     *
     * @param VisualDescriptor
     * @param whereToSearch
     * @param recursive
     * @param progress
     */
    public List<ResultListEntry> getSimilarImages(Element VisualDescriptor, String whereToSearch,
                                                  boolean recursive, JProgressBar progress) {
        return null;
    }

    /** Not implemented. */
    public List<ResultListEntry> getSimilarImages_fromSet(Set<Element> VisualDescriptorSet, String whereToSearch,
                                                          boolean recursive, JProgressBar progress) {
        return null;
    }

    public List<ResultListEntry> getImagesByXPathSearch(String xPath, String whereToSearch,
                                                        boolean recursive, JProgressBar progress) {
        ArrayList<ResultListEntry> results = new ArrayList<ResultListEntry>(maxResults);
        if (progress != null) progress.setString("Searching through index");
        SAXBuilder builder = new SAXBuilder();
        try {
            QueryParser qParse = new QueryParser("all", new StandardAnalyzer());
            IndexSearcher searcher = new IndexSearcher(parseFulltextIndexDirectory(whereToSearch));
            Query query = qParse.parse(xPath);
            Hits hits = searcher.search(query);
            int hitsCount = hits.length();
            if (hitsCount > maxResults) hitsCount = maxResults;
            if (progress != null) {
                progress.setMinimum(0);
                progress.setMaximum(hitsCount);
                progress.setValue(0);
                progress.setString("Reading results from disk");
            }
            for (int i = 0; i < hitsCount; i++) {
                Document d = hits.doc(i);
                Element e = builder.build(new FileInputStream(d.get("file"))).getRootElement();
                results.add(new ResultListEntry(hits.score(i), e, d.get("file")));
                if (progress != null) progress.setValue(i);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            System.err.println("XPath was: " + xPath);
            e.printStackTrace();
        } catch (JDOMException e) {
            e.printStackTrace();
        }
        return results;
    }
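
    /*
     * Usage note (illustrative): despite the parameter name "xPath", the query
     * passed to getImagesByXPathSearch(..) is parsed as a Lucene query against
     * the "all" field, e.g. a plain term like "Graz" or a fielded query such
     * as "Who:Mathias" (field names as indexed in indexFiles(..) below).
     */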

    /**
     * In general the base path of the search is taken as the pathToIndex parameter;
     * the index directory is then added below it and the index is created there.
     *
     * @param pathToIndex base directory holding the descriptions to index
     * @param statusBar   status bar for progress output, may be <code>null</code>
     */
    public void indexFiles(String pathToIndex, StatusBar statusBar) {
        // parsing and eventually creating the directory for the index ...
        String indexDir = parseFulltextIndexDirectory(pathToIndex);
        Analyzer analyzer = new StandardAnalyzer();
        boolean createFlag = true;
        SAXBuilder builder = new SAXBuilder();
        String prefix = "Creating fulltext index: ";
        try {
            IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
            String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
            if (descriptions == null) return;
            float numAllDocsPercent = (float) descriptions.length / 100f;
            DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
            df.setMaximumFractionDigits(1);
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                    Document idxDocument = new Document();
                    // adding the file itself ...
                    idxDocument.add(new Field("file", descriptions[i], Field.Store.YES, Field.Index.NO));
                    // adding all given names
                    StringBuilder all = new StringBuilder(255);
                    List l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                    // System.out.println("NumberOfRelations: " + l.size());
                    addToDocument(idxDocument, e, "//Agent/Name/GivenName", "GivenName", all);
                    addToDocument(idxDocument, e, "//Agent/Name/FamilyName", "FamilyName", all);
                    addToDocument(idxDocument, e, "//Label/Name", "Label", all);
                    addToDocument(idxDocument, e, "//FreeTextAnnotation", "FreeTextAnnotation", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/Who/Name", "Who", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/Where/Name", "Where", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/How/Name", "How", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/Why/Name", "Why", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/When/Name", "When", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/WhatObject/Name", "WhatObjects", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/WhatAction/Name", "WhatAction", all);
                    idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));
                    writer.addDocument(idxDocument);
                    if (statusBar != null) {
                        StringBuilder status = new StringBuilder(13).append(prefix);
                        status.append(df.format(((float) i) / numAllDocsPercent));
                        status.append('%');
                        statusBar.setStatus(status.toString());
                    }
                } catch (Exception e1) {
                    System.err.println("Error with file " + descriptions[i] + " (" + e1.getMessage() + ")");
                }
            }
            writer.optimize();
            writer.close();
            if (statusBar != null) {
                statusBar.setStatus("Indexing finished");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a path from the base directory to an index directory for storing
     * the fulltext index.
     *
     * @param pathToIndex directory where the index dir should be created
     * @return path to the index directory for use with Lucene
     */
    public static String parseFulltextIndexDirectory(String pathToIndex) {
        String indexDir = pathToIndex;
        if (!indexDir.endsWith(System.getProperty("file.separator")))
            indexDir += System.getProperty("file.separator");
        indexDir += "idx_fulltext";
        File indexDirFile = new File(indexDir);
        if (!indexDirFile.exists()) indexDirFile.mkdir();
        return indexDir;
    }
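
    /*
     * Example (illustrative): parseFulltextIndexDirectory("/data/photos")
     * returns "/data/photos/idx_fulltext" on a platform with '/' as file
     * separator, creating the directory if it does not exist yet.
     */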

    /**
     * Creates a path from the base directory to an index directory for storing
     * the index of semantic objects.
     *
     * @param pathToIndex directory where the index dir should be created
     * @return path to the index directory for use with Lucene
     */
    public static String parseSemanticIndexDirectory(String pathToIndex) {
        String indexDir = pathToIndex;
        if (!indexDir.endsWith(System.getProperty("file.separator")))
            indexDir += System.getProperty("file.separator");
        indexDir += "idx_semantic";
        File indexDirFile = new File(indexDir);
        if (!indexDirFile.exists()) indexDirFile.mkdir();
        return indexDir;
    }

    private void addToDocument(Document document, Element root, String xPath, String fieldName, StringBuilder allContents) {
        List l = RetrievalToolkit.xpathQuery(root, xPath, null);
        StringWriter sw = new StringWriter(128);
        for (Iterator iterator = l.iterator(); iterator.hasNext();) {
            Element e = (Element) iterator.next();
            sw.append(e.getTextTrim());
            sw.append(" ");
            allContents.append(e.getTextTrim());
            allContents.append(" ");
        }
        document.add(new Field(fieldName, sw.toString(), Field.Store.YES, Field.Index.TOKENIZED));
    }

    public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
        if (statusBar != null) statusBar.setStatus("Creating index from semantic annotations");
        SAXBuilder builder = new SAXBuilder();
        XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));
        try {
            String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
            if (descriptions == null) return;
            float numAllDocsPercent = (float) descriptions.length / 100f;
            DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
            df.setMaximumFractionDigits(1);
            // Preparing objects for the index:
            HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
            HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(descriptions.length);
            // in the first run we identify the semantic objects that we want to index and build
            // a table where we can relate them to the documents (identified by their path)
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                    List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element semanticElement = (Element) iterator.next();
                        String xmlString = outputter.outputString(semanticElement).trim().replaceAll("id=\"id_[0-9]*\"", "");
                        // check if the element is already there; the indicator is its string representation.
                        if (!elementMap.keySet().contains(xmlString)) {
                            // it's not there, put it in.
                            elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
                            // System.out.println(xmlString);
                        }
                        // now get the unified element
                        semanticElement = elementMap.get(xmlString).semanticElement;
                        // and check if there is an entry in the table for where to find the element
                        if (!element2document.keySet().contains(semanticElement)) {
                            element2document.put(semanticElement, new LinkedList<String>());
                        }
                        // and add the found document if it is not already there:
                        List documentList = element2document.get(semanticElement);
                        if (!documentList.contains(descriptions[i])) documentList.add(descriptions[i]);
                    }
                    if (statusBar != null)
                        statusBar.setStatus("Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
                } catch (JDOMException e1) {
                    System.err.println("Exception in document #" + i + ": " + e1.getMessage());
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
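
            /*
             * Deduplication example (illustrative): two SemanticBase elements that
             * differ only in their document-local ids, e.g. id="id_3" vs. id="id_17",
             * produce the same key after replaceAll("id=\"id_[0-9]*\"", "") and are
             * therefore unified to a single ElementEntry shared by both documents.
             */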
            // read stats:
            // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");
            // Now we can add the nodes to a lucene index:
            // fields: label, id, type, files (separated by '|'), xml, all
            // -------------------------------------------
            // opening the index for writing:
            boolean createFlag = true;
            String indexDir = parseSemanticIndexDirectory(pathToIndex);
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
            if (statusBar != null)
                statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");
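
            /*
             * Document layout example (illustrative values): each deduplicated node
             * becomes one Lucene document, e.g.
             *   label = "Mathias Lux"
             *   type  = the xsi:type of the SemanticBase element (e.g. "AgentObjectType")
             *   files = "photo1.mp7.xml|photo2.mp7.xml"
             *   id    = the running number assigned in the first pass
             *   xml   = the raw XML of the element
             *   all   = the concatenated text content, indexed but not stored
             */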
            // iterating through the nodes and storing them:
            for (Iterator<Element> iterator = element2document.keySet().iterator(); iterator.hasNext();) {
                Element semElement = iterator.next();
                // needed for the later XPath :( otherwise everything in the whole document is retrieved.
                String fileList = getFileListFromNode(element2document.get(semElement));
                Document idxDocument = new Document();
                // adding the file itself ...
                idxDocument.add(new Field("files", fileList, Field.Store.YES, Field.Index.NO));
                // System.out.println(((Element) o).getTextTrim());
                StringBuilder all = new StringBuilder(255);
                // adding the label
                // addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
                String elementLabel = semElement.getChild("Label", semElement.getNamespace()).getChildTextTrim("Name", semElement.getNamespace());
                idxDocument.add(new Field("label", elementLabel, Field.Store.YES, Field.Index.TOKENIZED));
                // adding the type:
                String elementType = semElement.getAttribute("type", xsi).getValue().trim();
                idxDocument.add(new Field("type", elementType, Field.Store.YES, Field.Index.NO));
                // adding the XML contents:
                String xmlString = outputter.outputString(semElement);
                idxDocument.add(new Field("xml", xmlString, Field.Store.YES, Field.Index.NO));
                // adding the id:
                idxDocument.add(new Field("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + "", Field.Store.YES, Field.Index.NO));
                // adding all, unstored, for retrieval only
                List l = RetrievalToolkit.xpathQuery(semElement, "*//*", null);
                for (Iterator it3 = l.iterator(); it3.hasNext();) {
                    Element e = (Element) it3.next();
                    all.append(e.getTextTrim());
                    all.append(" ");
                }
                idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));
                writer.addDocument(idxDocument);
            }
            // now optimize and close the index:
            // todo: open index for appending and/or updating
            writer.optimize();
            writer.close();
            // Now we can create the powerset for each existing graph
            // (based on sorted node ids) and store
            // all resulting graphs within an index.
            // ----------------------------------------------------------
            if (statusBar != null) statusBar.setStatus("Creating and merging powersets of available graphs");
            HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(descriptions.length);
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                    List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                    HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                    LinkedList<Relation> relations = new LinkedList<Relation>();
                    LinkedList<Integer> nodes = new LinkedList<Integer>();
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element semanticElement = (Element) iterator.next();
                        String xmlString = outputter.outputString(semanticElement);
                        int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                        String docID = semanticElement.getAttribute("id").getValue();
                        docID2overallID.put(docID, id);
                        nodes.add(id);
                    }
                    // get all relations with global ids and eliminate inverse relations
                    l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element relation = (Element) iterator.next();
                        int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                        int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                        String type = relation.getAttribute("type").getValue();
                        type = type.substring(type.lastIndexOf(':') + 1);
                        Relation r = eliminateInverse(new Relation(source, target, type));
                        relations.add(r);
                    }
                    // now create a graph object
                    Collections.sort(nodes);
                    Collections.sort(relations);
                    LinkedList<Node> nodeList = new LinkedList<Node>();
                    for (Iterator<Integer> iterator = nodes.iterator(); iterator.hasNext();) {
                        nodeList.add(new Node(iterator.next()));
                    }
                    Graph g = new Graph(nodeList, relations);
                    // List<Graph> powerSet = new LinkedList<Graph>();
                    // powerSet.add(g);
                    HashSet<String> docs = new HashSet<String>(1);
                    docs.add(descriptions[i]);
                    graph2document.put(g, docs);
                    /*
                    // add all these subgraphs and the reference to the document to
                    // a data structure:
                    for (Iterator<Graph> iterator = powerSet.iterator(); iterator.hasNext();) {
                        Graph graph = iterator.next();
                        // List<Graph> relationsPowerSet = graph.getPowerSetOfRelations();
                        // for (Iterator<Graph> iterator1 = relationsPowerSet.iterator(); iterator1.hasNext();) {
                        //     Graph graph1 = iterator1.next();
                        // }
                        // add graph if not trivial:
                        if (graph.getNodes().size() > 1) {
                            // containsKey for Graph does not match my needs -
                            // different graph objects reference the same graph!
                            if (string2graph.containsKey(graph.toString())) {
                                graph = string2graph.get(graph.toString());
                                graph2document.get(graph).add(descriptions[i]);
                            } else {
                                HashSet<String> docs = new HashSet<String>(1);
                                docs.add(descriptions[i]);
                                graph2document.put(graph, docs);
                            }
                        }
                    }
                    */
                } catch (JDOMException e1) {
                    System.err.println("Exception in document #" + i + ": " + e1.getMessage());
                }
            }
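
            /*
             * Index line format (illustrative): each graph is written below as one
             * gzipped text line of the form
             *   <graph.toString()>|/path/to/first.mp7.xml|/path/to/second.mp7.xml
             * which is exactly what searchForGraph(..) tokenizes by '|' when matching.
             */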
            HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
            HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);
            /*
             * For now we reduce the number of graphs by identifying and merging
             * duplicates and removing redundant entries:
             */
            for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
                Graph g = iterator.next();
                if (str2graph.containsKey(g.toString())) {
                    g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
                } else {
                    str2graph.put(g.toString(), g);
                    g2d.put(g, graph2document.get(g));
                }
            }
            graph2document = g2d;
            System.out.println(graph2document.size() + " non-trivial distinct graphs were found");
            // now put all the available graphs into an index:
            // -----------------------------------------------
            // todo: create a real, fast, storable index of subgraphs instead of a file :-) possible candidate: a trie
            // for now we will store a simple text file:
            if (statusBar != null) statusBar.setStatus("Storing powersets of available graphs as file");
            String indexFile;
            if (!pathToIndex.endsWith(File.separator)) {
                indexFile = pathToIndex + File.separator + "idx_graphs.list";
            } else {
                indexFile = pathToIndex + "idx_graphs.list";
            }
            File f = new File(indexFile);
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(f, false))));
            for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
                Graph g = iterator.next();
                bw.write(g.toString());
                for (Iterator<String> iterator1 = graph2document.get(g).iterator(); iterator1.hasNext();) {
                    String s = iterator1.next();
                    bw.write("|" + s);
                }
                bw.write("\n");
            }
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
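
    /**
     * Minimal usage sketch (illustrative only, not part of the original class):
     * builds the fulltext index for a directory of MPEG-7 descriptions and runs
     * a sample query against it. The directory and the query term are assumptions.
     */
    public static void main(String[] args) {
        String baseDir = args.length > 0 ? args[0] : ".";
        LuceneRetrievalEngine engine = new LuceneRetrievalEngine(40);
        // index first; the StatusBar may be null:
        engine.indexFiles(baseDir, null);
        // then query the "all" field for a term:
        List<ResultListEntry> results = engine.getImagesByXPathSearch("Mathias", baseDir, true, null);
        for (Iterator<ResultListEntry> it = results.iterator(); it.hasNext();) {
            ResultListEntry entry = it.next();
            System.out.println(entry.getFilePath() + " (" + entry.getRelevance() + ")");
        }
    }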

    /**
     * Searches for all available nodes matching the given query String.
     *
     * @param queryString   query like "Mathias Lux" or some text inside a node.
     * @param whereToSearch defines the base directory for the search
     * @return a List of matching nodes with their associated weights
     */
    public static List<Node> getNodes(String queryString, String whereToSearch) {
        return ((LucenePathIndexRetrievalEngine) RetrievalEngineFactory.getPathIndexRetrievalEngine()).getNodes(queryString, whereToSearch);
    }

    private String getFileListFromNode(List<String> list) {
        // concatenates the file names, separated by '|':
        StringBuilder files = new StringBuilder(64);
        for (Iterator<String> it2 = list.iterator(); it2.hasNext();) {
            files.append(it2.next());
            if (it2.hasNext()) {
                files.append('|');
            }
        }
        return files.toString();
    }

    /**
     * Eliminates all inverse relations to simplify retrieval.
     *
     * @param relation the relation to normalize
     * @return the normalized relation
     */
    public static Relation eliminateInverse(Relation relation) {
        Relation result = relation;
        if (relationMapping.containsKey(relation.getType())) {
            result = new Relation(relation.getTarget(), relation.getSource(), relationMapping.get(relation.getType()));
        }
        return result;
    }
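
    /*
     * Note (reconstruction): ElementEntry is used by indexFilesSemantically(..)
     * above, but its definition is not contained in this excerpt. A minimal
     * holder matching the observed usage (constructor ElementEntry(Element, int)
     * and the fields semanticElement / id) is assumed to look like this:
     */
    private static class ElementEntry {
        Element semanticElement;
        int id;

        ElementEntry(Element semanticElement, int id) {
            this.semanticElement = semanticElement;
            this.id = id;
        }
    }
}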