/*
 * This file is part of Caliph & Emir.
 *
 * Caliph & Emir is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Caliph & Emir is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Caliph & Emir; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
 * http://www.juggle.at, http://caliph-emir.sourceforge.net
 */
package at.lux.fotoretrieval.retrievalengines;

import at.lux.components.StatusBar;
import at.lux.fotoretrieval.FileOperations;
import at.lux.fotoretrieval.ResultListEntry;
import at.lux.fotoretrieval.RetrievalToolkit;
import at.lux.fotoretrieval.lucene.Graph;
import at.lux.fotoretrieval.lucene.Node;
import at.lux.fotoretrieval.lucene.Relation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.Namespace;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;

import javax.swing.*;
import java.io.*;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

/**
 * Date: 13.10.2004
 * Time: 21:47:58
 *
 * @author Mathias Lux, mathias@juggle.at
 */
public class LuceneRetrievalEngine extends AbstractRetrievalEngine {

    private int maxResults = 40;
    private static Namespace xsi = Namespace.getNamespace("xsi", "http://www.w3.org/2001/XMLSchema-instance");
    /** Maps each relation type to its inverse, so relations can be normalized to one canonical direction. */
    public static final HashMap<String, String> relationMapping;

    public LuceneRetrievalEngine(int maxResults) {
        this.maxResults = maxResults;
    }

    static {
        relationMapping = new HashMap<String, String>(27);
        relationMapping.put("key", "keyFor");
        relationMapping.put("annotates", "annotatedBy");
        relationMapping.put("shows", "appearsIn");
        relationMapping.put("references", "referencedBy");
        relationMapping.put("quality", "qualityOf");
        relationMapping.put("symbolizes", "symbolizedBy");
        relationMapping.put("location", "locationOf");
        relationMapping.put("source", "sourceOf");
        relationMapping.put("destination", "destinationOf");
        relationMapping.put("path", "pathOf");
        relationMapping.put("time", "timeOf");
        relationMapping.put("depicts", "depictedBy");
        relationMapping.put("represents", "representedBy");
        relationMapping.put("context", "contextFor");
        relationMapping.put("interprets", "interpretedBy");
        relationMapping.put("agent", "agentOf");
        relationMapping.put("patient", "patientOf");
        relationMapping.put("experiencer", "experiencerOf");
        relationMapping.put("stimulus", "stimulusOf");
        relationMapping.put("causer", "causerOf");
        relationMapping.put("goal", "goalOf");
        relationMapping.put("beneficiary", "beneficiaryOf");
        relationMapping.put("theme", "themeOf");
        relationMapping.put("result", "resultOf");
        relationMapping.put("instrument", "instrumentOf");
        relationMapping.put("accompanier", "accompanierOf");
        relationMapping.put("summarizes", "summarizedBy");
        relationMapping.put("specializes", "generalizes");
        relationMapping.put("exemplifies", "exemplifiedBy");
        relationMapping.put("part", "partOf");
        relationMapping.put("property", "propertyOf");
        relationMapping.put("user", "userOf");
        relationMapping.put("component", "componentOf");
        relationMapping.put("substance", "substanceOf");
        relationMapping.put("entails", "entailedBy");
        relationMapping.put("manner", "mannerOf");
        relationMapping.put("state", "stateOf");
        relationMapping.put("influences", "dependsOn");
    }
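
    /*
     * How the mapping is used (worked example, derived from eliminateInverse()
     * below): a relation agent(source=1, target=2) is normalized to its
     * canonical direction agentOf(source=2, target=1), because "agent" is a
     * key in relationMapping and "agentOf" is its inverse. Types that appear
     * only as values (e.g. "agentOf") are left untouched.
     */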

    /**
     * In this case we can search for images with a String query defining
     * a graph, where the nodes are built from search queries in square brackets
     * and are referenced in relations by their position, starting with number 1.
     * The relations follow the nodes, each given as its type followed by the
     * positions of the source node and the target node in the list:
     * <code>
     * query := node+ relation+
     * node := [term+]
     * relation := type source target
     * term := String
     * type := String
     * source := Integer
     * target := Integer
     * </code>
     * e.g. "[Mathias Lux] [Talking] agentOf 1 2"
     *
     * @param xPath         the query string as described above
     * @param objects       can be set to <code>null</code>
     * @param whereToSearch base directory of the index
     * @param recursive     search recursively
     * @param progress      progress bar for feedback, may be <code>null</code>
     */
    public List<ResultListEntry> getImagesBySemantics(String xPath, Vector objects, String whereToSearch,
                                                      boolean recursive, JProgressBar progress) {
        List<String> nodeQueries = new LinkedList<String>();
        StringTokenizer st = new StringTokenizer(xPath, "]");
        String relationString = "";
        List<Relation> relations = new LinkedList<Relation>();
        while (st.hasMoreTokens()) {
            String s = st.nextToken().trim();
            if (s.startsWith("[")) {
                s = s.substring(1);
                nodeQueries.add(s);
            } else {
                relationString = s;
            }
        }
        if (relationString.length() > 1) {
            // there are relations, go ahead and parse them:
            StringTokenizer sr = new StringTokenizer(relationString);
            Relation currentRelation = null;
            while (sr.hasMoreTokens()) {
                String s = sr.nextToken();
                try {
                    int i = Integer.parseInt(s);
                    // guard against malformed queries giving a number before any relation type:
                    if (currentRelation == null) continue;
                    if (currentRelation.getSource() < 0) {
                        currentRelation.setSource(i);
                    } else if (currentRelation.getTarget() < 0) {
                        currentRelation.setTarget(i);
                        currentRelation.eliminateInverse();
                        relations.add(currentRelation);
                        currentRelation = null;
                    }
                } catch (NumberFormatException e) {
                    // it's not a number, so it is the relation type :)
                    currentRelation = new Relation(-1, -1, s.trim());
                }
            }
        }
        // so for now do the retrieval for the nodes:
        int numOfNodes = nodeQueries.size();
        List<List<Node>> nodeResults = new LinkedList<List<Node>>();
        for (int i = 0; i < numOfNodes; i++) {
            String queryString = nodeQueries.get(i);
            List<Node> nodes = getNodes(queryString, whereToSearch);
            nodeResults.add(nodes);
        }
        // now we can expand our query on retrieved nodes:
        List<Graph> graphList = getExpandedGraphsFromResults(nodeResults, relations, 3);
        LinkedList<ResultListEntry> results = new LinkedList<ResultListEntry>();
        if (progress != null) {
            progress.setMinimum(0);
            progress.setMaximum(graphList.size());
            progress.setValue(0);
            progress.setString("Querying expanded graphs");
        }
        int countGraph = 0;
        for (Iterator<Graph> iterator = graphList.iterator(); iterator.hasNext();) {
            Graph graph = iterator.next();
            results.addAll(searchForGraph(graph, whereToSearch));
            countGraph++;
            if (progress != null) progress.setValue(countGraph);
        }
        // now eliminate the duplicates, keeping the entry with the highest relevance per file:
        if (progress != null) {
            progress.setMinimum(0);
            progress.setMaximum(results.size());
            progress.setValue(0);
            progress.setString("Removing double entries");
        }
        countGraph = 0;
        HashMap<String, ResultListEntry> gegencheck = new HashMap<String, ResultListEntry>();
        for (Iterator<ResultListEntry> iterator = results.iterator(); iterator.hasNext();) {
            ResultListEntry resultListEntry = iterator.next();
            String file = resultListEntry.getFilePath();
            double relevance = resultListEntry.getRelevance();
            if (gegencheck.containsKey(file)) {
                double rel = gegencheck.get(file).getRelevance();
                if (rel < relevance) {
                    gegencheck.put(file, resultListEntry);
                }
            } else {
                gegencheck.put(file, resultListEntry);
            }
            countGraph++;
            if (progress != null) progress.setValue(countGraph);
        }
        results.clear();
        results.addAll(gegencheck.values());
        Collections.sort(results);
        return results;
    }
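
    /*
     * Worked example (illustrative): the query "[Mathias Lux] [Talking] agentOf 1 2"
     * from the Javadoc above is split at ']' into the node queries
     * {"Mathias Lux", "Talking"} and the relation string "agentOf 1 2", which
     * parses to one Relation of type "agentOf" with source 1 and target 2.
     * Each node query is then resolved against the index via getNodes(..)
     * before the graphs are expanded and matched.
     */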

    private List<Graph> getExpandedGraphsFromResults(List<List<Node>> nodeResults, List<Relation> relations, int depth) {
        List<List<Node>> expanded = getExpandedSets(nodeResults, depth);
        // System.out.println("Expanding to " + expanded.size() + " graphs");
        List<Graph> results = new LinkedList<Graph>();
        for (Iterator<List<Node>> iterator = expanded.iterator(); iterator.hasNext();) {
            List<Node> nodes = iterator.next();
            Graph g = getGraphFromResults(nodes, relations);
            results.add(g);
        }
        // if there are any relations without type we have to
        // create a reverse relation each, otherwise we won't
        // get all our results:
        // List<Graph> additionalResults = new LinkedList<Graph>();
        // for (Iterator<Graph> iterator = results.iterator(); iterator.hasNext();) {
        //     Graph graph = iterator.next();
        //     expandUntypedRelations(graph, additionalResults);
        // }
        return results;
    }

    private List<List<Node>> getExpandedSets(List<List<Node>> nodeResults, int depth) {
        if (nodeResults.size() > 1) {
            List<Node> firstNodesResults = nodeResults.get(0);
            // count the leading perfect matches (weight >= 1) ...
            int numLevels = 0;
            for (Iterator<Node> iterator = firstNodesResults.iterator(); iterator.hasNext();) {
                Node node = iterator.next();
                if (node.getWeight() < 1f) break;
                numLevels++;
            }
            // ... and take up to depth additional candidates:
            numLevels += depth;
            if (firstNodesResults.size() < depth) {
                numLevels = firstNodesResults.size();
            }
            List<List<Node>> tmpNodeResults = new LinkedList<List<Node>>(nodeResults);
            tmpNodeResults.remove(0);
            List<List<Node>> results = getExpandedSets(tmpNodeResults, depth);
            List<List<Node>> endResult = new LinkedList<List<Node>>();
            for (int i = 0; i < numLevels && i < firstNodesResults.size(); i++) {
                for (int j = 0; j < results.size(); j++) {
                    List<Node> nodeList = new LinkedList<Node>(results.get(j));
                    nodeList.add(0, firstNodesResults.get(i));
                    endResult.add(nodeList);
                }
            }
            return endResult;
        } else {
            List<List<Node>> endResult = new LinkedList<List<Node>>();
            List<Node> firstNodesResults = nodeResults.get(0);
            int numLevels = 0;
            for (Iterator<Node> iterator = firstNodesResults.iterator(); iterator.hasNext();) {
                Node node = iterator.next();
                if (node.getWeight() < 1f) break;
                numLevels++;
            }
            numLevels += depth;
            for (int i = 0; i < numLevels && i < firstNodesResults.size(); i++) {
                List<Node> nodeList = new LinkedList<Node>();
                nodeList.add(firstNodesResults.get(i));
                endResult.add(nodeList);
            }
            return endResult;
        }
    }

    private Graph getGraphFromResults(List<Node> nodeResults, List<Relation> relations) {
        HashMap<Integer, Integer> idReplacementTable = new HashMap<Integer, Integer>(nodeResults.size());
        List<Node> nodes = new LinkedList<Node>();
        List<Relation> myRelations = new LinkedList<Relation>();
        for (int i = 0; i < nodeResults.size(); i++) {
            Node node = nodeResults.get(i);
            idReplacementTable.put(i + 1, node.getNodeID());
            nodes.add(node);
        }
        // Create the relations with the real IDs:
        for (Iterator<Relation> iterator = relations.iterator(); iterator.hasNext();) {
            Relation r = iterator.next();
            int src = (idReplacementTable.get(r.getSource()));
            int tgt = (idReplacementTable.get(r.getTarget()));
            myRelations.add(new Relation(src, tgt, r.getType()));
        }
        // now we can create the graph we want to search for:
        Graph g = new Graph(nodes, myRelations);
        return g;
    }
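
    /*
     * Expansion example (illustrative): with two node queries, depth = 3 and,
     * say, one perfect match (weight >= 1) followed by three weaker candidates
     * in each result list, numLevels is 1 + 3 = 4, so all four candidates of
     * each list are combined: 4 x 4 = 16 node sets, yielding 16 candidate graphs.
     */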

    private List<ResultListEntry> searchForGraph(Graph g, String whereToSearch) {
        // System.out.println("Querying for graph: " + g.toString());
        // for (Iterator<Node> iterator = g.getNodes().iterator(); iterator.hasNext();) {
        //     Node node = iterator.next();
        //     System.out.println(node.getLabel() + ": " + node.getNodeID() + " (" + node.getWeight() + ") ");
        // }
        // and we search for it in the text file:
        String indexFile;
        // create regex string:
        // as all nodes and relations are surrounded with square brackets this is easy.
        // between the relations there may be various literals: '.*'
        String regexInsert = ".*";
        StringBuilder graphSearch = new StringBuilder(g.toString().length() * 2);
        StringTokenizer stok = new StringTokenizer(g.toString(), "[");
        String s = "";
        graphSearch.append(regexInsert);
        LinkedList<String> graphSearchList = new LinkedList<String>();
        while (stok.hasMoreTokens()) {
            StringBuilder regexItem = new StringBuilder(32);
            s = stok.nextToken().trim();
            s = s.substring(0, s.length() - 1);
            // and there may be other nodes & relations:
            regexItem.append(regexInsert);
            // opening bracket '['
            regexItem.append("\\x5B");
            // the actual content (node or relation)
            regexItem.append(s);
            // closing bracket ']'
            regexItem.append("\\x5D");
            // and there may be other nodes & relations:
            regexItem.append(regexInsert);
            String regex = regexItem.toString();
            if (regex.indexOf("\\w*") > -1) {
                // here we create support for relation wildcards:
                regex = expandUntypedRelation(regex);
            }
            graphSearchList.add(regex);
            /*
            // opening bracket '['
            graphSearch.append("\\x5B");
            // the actual content (node or relation)
            graphSearch.append(s);
            // closing bracket ']'
            graphSearch.append("\\x5D");
            // and there may be other nodes & relations:
            graphSearch.append(regexInsert);
            */
        }
        List<ResultListEntry> resultList = new LinkedList<ResultListEntry>();
        SAXBuilder builder = new SAXBuilder();
        if (!whereToSearch.endsWith(File.separator)) {
            indexFile = whereToSearch + File.separator + "idx_graphs.list";
        } else {
            indexFile = whereToSearch + "idx_graphs.list";
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(indexFile))));
            String line = null;
            // String regex = graphSearch.toString();
            // String oldRegex = regex;
            // if (regex.indexOf("\\w*") > -1) {
            //     // here we create support for relation wildcards:
            //     regex = expandUntypedRelation(regex);
            // }
            // System.out.println("REGEX: " + regex);
            while ((line = br.readLine()) != null) {
                // a line matches only if it matches the regex of every node and relation:
                boolean match = true;
                for (Iterator<String> iterator = graphSearchList.iterator(); iterator.hasNext();) {
                    String regex = iterator.next();
                    if (!line.matches(regex)) {
                        match = false;
                        break; // one failing pattern is enough
                    }
                }
                if (match) {
                    // we found a graph:
                    System.out.println("FOUND: " + line);
                    StringTokenizer st = new StringTokenizer(line, "|");
                    String graphString = st.nextToken();
                    Graph theGraph = new Graph(graphString);
                    float similarity = g.getMcsSimilarity(theGraph);
                    while (st.hasMoreTokens()) {
                        String fileName = st.nextToken();
                        Element e = builder.build(new FileInputStream(fileName)).getRootElement();
                        ResultListEntry entry = new ResultListEntry((double) similarity, e, fileName);
                        resultList.add(entry);
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return resultList;
    }
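
    /*
     * Regex construction example (illustrative, assuming Graph.toString()
     * renders nodes by id): for a graph containing the tokens "[42]" and
     * "[agentOf 42 43]", the per-token patterns are ".*\x5B42\x5D.*" and
     * ".*\x5BagentOf 42 43\x5D.*"; an index line is accepted only if it
     * matches all of them, independent of the order in which the tokens
     * appear on the line.
     */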

    private String expandUntypedRelation(String regex) {
        // split the pattern "...\w* <source> <target>\x5D..." around the two numbers:
        String behind = regex.substring(regex.indexOf("\\w*") + 4);
        String before = regex.substring(0, regex.indexOf("\\w*") + 4);
        String firstNum = behind.substring(0, behind.indexOf(' '));
        String secondNum = behind.substring(behind.indexOf(' ') + 1, behind.indexOf('\\'));
        behind = behind.substring(behind.indexOf('\\'));
        // allow source and target in either order, as the relation has no type to fix its direction:
        regex = before + "((" + firstNum + " " + secondNum + ")|(" + secondNum + " " + firstNum + "))" + behind;
        return regex;
    }

    /**
     * Not implemented ... please use the method of the engine
     * {@link at.lux.fotoretrieval.retrievalengines.FileSystemRetrieval}
     *
     * @param VisualDescriptor
     * @param whereToSearch
     * @param recursive
     * @param progress
     */
    public List<ResultListEntry> getSimilarImages(Element VisualDescriptor, String whereToSearch,
                                                  boolean recursive, JProgressBar progress) {
        return null;
    }

    /** Not implemented. */
    public List<ResultListEntry> getSimilarImages_fromSet(Set<Element> VisualDescriptorSet, String whereToSearch,
                                                          boolean recursive, JProgressBar progress) {
        return null;
    }

    public List<ResultListEntry> getImagesByXPathSearch(String xPath, String whereToSearch,
                                                        boolean recursive, JProgressBar progress) {
        ArrayList<ResultListEntry> results = new ArrayList<ResultListEntry>(maxResults);
        if (progress != null) progress.setString("Searching through index");
        SAXBuilder builder = new SAXBuilder();
        try {
            QueryParser qParse = new QueryParser("all", new StandardAnalyzer());
            IndexSearcher searcher = new IndexSearcher(parseFulltextIndexDirectory(whereToSearch));
            Query query = qParse.parse(xPath);
            Hits hits = searcher.search(query);
            int hitsCount = hits.length();
            if (hitsCount > maxResults) hitsCount = maxResults;
            if (progress != null) {
                progress.setMinimum(0);
                progress.setMaximum(hitsCount);
                progress.setValue(0);
                progress.setString("Reading results from disk");
            }
            for (int i = 0; i < hitsCount; i++) {
                Document d = hits.doc(i);
                Element e = builder.build(new FileInputStream(d.get("file"))).getRootElement();
                results.add(new ResultListEntry(hits.score(i), e, d.get("file")));
                if (progress != null) progress.setValue(i);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            System.err.println("XPath was: " + xPath);
            e.printStackTrace();
        } catch (JDOMException e) {
            e.printStackTrace();
        }
        return results;
    }
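
    /*
     * Usage note (illustrative): despite the parameter name "xPath", the query
     * passed to getImagesByXPathSearch(..) is parsed as a Lucene query against
     * the "all" field, e.g. a plain term like "Graz" or a fielded query such
     * as "Who:Mathias" (field names as indexed in indexFiles(..) below).
     */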

    /**
     * In general the base path of the search is taken as the pathToIndex parameter;
     * the index directory is then added below it and the index is created there.
     *
     * @param pathToIndex base directory holding the descriptions to index
     * @param statusBar   status bar for progress output, may be <code>null</code>
     */
    public void indexFiles(String pathToIndex, StatusBar statusBar) {
        // parsing and eventually creating the directory for the index ...
        String indexDir = parseFulltextIndexDirectory(pathToIndex);
        Analyzer analyzer = new StandardAnalyzer();
        boolean createFlag = true;
        SAXBuilder builder = new SAXBuilder();
        String prefix = "Creating fulltext index: ";
        try {
            IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
            String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
            if (descriptions == null) return;
            float numAllDocsPercent = (float) descriptions.length / 100f;
            DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
            df.setMaximumFractionDigits(1);
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                    Document idxDocument = new Document();
                    // adding the file itself ...
                    idxDocument.add(new Field("file", descriptions[i], Field.Store.YES, Field.Index.NO));
                    // adding all given names
                    StringBuilder all = new StringBuilder(255);
                    List l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                    // System.out.println("NumberOfRelations: " + l.size());
                    addToDocument(idxDocument, e, "//Agent/Name/GivenName", "GivenName", all);
                    addToDocument(idxDocument, e, "//Agent/Name/FamilyName", "FamilyName", all);
                    addToDocument(idxDocument, e, "//Label/Name", "Label", all);
                    addToDocument(idxDocument, e, "//FreeTextAnnotation", "FreeTextAnnotation", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/Who/Name", "Who", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/Where/Name", "Where", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/How/Name", "How", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/Why/Name", "Why", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/When/Name", "When", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/WhatObject/Name", "WhatObjects", all);
                    addToDocument(idxDocument, e, "//StructuredAnnotation/WhatAction/Name", "WhatAction", all);
                    idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));
                    writer.addDocument(idxDocument);
                    if (statusBar != null) {
                        StringBuilder status = new StringBuilder(13).append(prefix);
                        status.append(df.format(((float) i) / numAllDocsPercent));
                        status.append('%');
                        statusBar.setStatus(status.toString());
                    }
                } catch (Exception e1) {
                    System.err.println("Error with file " + descriptions[i] + " (" + e1.getMessage() + ")");
                }
            }
            writer.optimize();
            writer.close();
            if (statusBar != null) {
                statusBar.setStatus("Indexing finished");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a path from the base directory to an index directory for storing
     * the fulltext index.
     *
     * @param pathToIndex directory where the index dir should be created
     * @return path to the index directory for use with Lucene
     */
    public static String parseFulltextIndexDirectory(String pathToIndex) {
        String indexDir = pathToIndex;
        if (!indexDir.endsWith(System.getProperty("file.separator")))
            indexDir += System.getProperty("file.separator");
        indexDir += "idx_fulltext";
        File indexDirFile = new File(indexDir);
        if (!indexDirFile.exists()) indexDirFile.mkdir();
        return indexDir;
    }
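
    /*
     * Example (illustrative): parseFulltextIndexDirectory("/data/photos")
     * returns "/data/photos/idx_fulltext" on a platform with '/' as file
     * separator, creating the directory if it does not exist yet.
     */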

    /**
     * Creates a path from the base directory to an index directory for storing
     * the index of semantic objects.
     *
     * @param pathToIndex directory where the index dir should be created
     * @return path to the index directory for use with Lucene
     */
    public static String parseSemanticIndexDirectory(String pathToIndex) {
        String indexDir = pathToIndex;
        if (!indexDir.endsWith(System.getProperty("file.separator")))
            indexDir += System.getProperty("file.separator");
        indexDir += "idx_semantic";
        File indexDirFile = new File(indexDir);
        if (!indexDirFile.exists()) indexDirFile.mkdir();
        return indexDir;
    }

    private void addToDocument(Document document, Element root, String xPath, String fieldName, StringBuilder allContents) {
        List l = RetrievalToolkit.xpathQuery(root, xPath, null);
        StringWriter sw = new StringWriter(128);
        for (Iterator iterator = l.iterator(); iterator.hasNext();) {
            Element e = (Element) iterator.next();
            sw.append(e.getTextTrim());
            sw.append(" ");
            allContents.append(e.getTextTrim());
            allContents.append(" ");
        }
        document.add(new Field(fieldName, sw.toString(), Field.Store.YES, Field.Index.TOKENIZED));
    }

    public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
        if (statusBar != null) statusBar.setStatus("Creating index from semantic annotations");
        SAXBuilder builder = new SAXBuilder();
        XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));
        try {
            String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
            if (descriptions == null) return;
            float numAllDocsPercent = (float) descriptions.length / 100f;
            DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
            df.setMaximumFractionDigits(1);
            // Preparing objects for the index:
            HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
            HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(descriptions.length);
            // in the first run we identify the semantic objects that we want to index and build
            // a table where we can relate them to the documents (identified by their path)
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                    List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element semanticElement = (Element) iterator.next();
                        String xmlString = outputter.outputString(semanticElement).trim().replaceAll("id=\"id_[0-9]*\"", "");
                        // check if the element is already there; the indicator is its string representation.
                        if (!elementMap.keySet().contains(xmlString)) {
                            // it's not there, put it in.
                            elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
                            // System.out.println(xmlString);
                        }
                        // now get the unified element
                        semanticElement = elementMap.get(xmlString).semanticElement;
                        // and check if there is an entry in the table for where to find the element
                        if (!element2document.keySet().contains(semanticElement)) {
                            element2document.put(semanticElement, new LinkedList<String>());
                        }
                        // and add the found document if it is not already there:
                        List documentList = element2document.get(semanticElement);
                        if (!documentList.contains(descriptions[i])) documentList.add(descriptions[i]);
                    }
                    if (statusBar != null)
                        statusBar.setStatus("Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
                } catch (JDOMException e1) {
                    System.err.println("Exception in document #" + i + ": " + e1.getMessage());
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
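
            /*
             * Deduplication example (illustrative): two SemanticBase elements that
             * differ only in their document-local ids, e.g. id="id_3" vs. id="id_17",
             * produce the same key after replaceAll("id=\"id_[0-9]*\"", "") and are
             * therefore unified to a single ElementEntry shared by both documents.
             */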
            // read stats:
            // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");
            // Now we can add the nodes to a lucene index:
            // fields: label, id, type, files (separated by '|'), xml, all
            // -------------------------------------------
            // opening the index for writing:
            boolean createFlag = true;
            String indexDir = parseSemanticIndexDirectory(pathToIndex);
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
            if (statusBar != null)
                statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");
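
            /*
             * Document layout example (illustrative values): each deduplicated node
             * becomes one Lucene document, e.g.
             *   label = "Mathias Lux"
             *   type  = the xsi:type of the SemanticBase element (e.g. "AgentObjectType")
             *   files = "photo1.mp7.xml|photo2.mp7.xml"
             *   id    = the running number assigned in the first pass
             *   xml   = the raw XML of the element
             *   all   = the concatenated text content, indexed but not stored
             */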
            // iterating through the nodes and storing them:
            for (Iterator<Element> iterator = element2document.keySet().iterator(); iterator.hasNext();) {
                Element semElement = iterator.next();
                // needed for the later XPath :( otherwise everything in the whole document is retrieved.
                String fileList = getFileListFromNode(element2document.get(semElement));
                Document idxDocument = new Document();
                // adding the file itself ...
                idxDocument.add(new Field("files", fileList, Field.Store.YES, Field.Index.NO));
                // System.out.println(((Element) o).getTextTrim());
                StringBuilder all = new StringBuilder(255);
                // adding the label
                // addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
                String elementLabel = semElement.getChild("Label", semElement.getNamespace()).getChildTextTrim("Name", semElement.getNamespace());
                idxDocument.add(new Field("label", elementLabel, Field.Store.YES, Field.Index.TOKENIZED));
                // adding the type:
                String elementType = semElement.getAttribute("type", xsi).getValue().trim();
                idxDocument.add(new Field("type", elementType, Field.Store.YES, Field.Index.NO));
                // adding the XML contents:
                String xmlString = outputter.outputString(semElement);
                idxDocument.add(new Field("xml", xmlString, Field.Store.YES, Field.Index.NO));
                // adding the id:
                idxDocument.add(new Field("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + "", Field.Store.YES, Field.Index.NO));
                // adding all, unstored, for retrieval only
                List l = RetrievalToolkit.xpathQuery(semElement, "*//*", null);
                for (Iterator it3 = l.iterator(); it3.hasNext();) {
                    Element e = (Element) it3.next();
                    all.append(e.getTextTrim());
                    all.append(" ");
                }
                idxDocument.add(new Field("all", all.toString(), Field.Store.NO, Field.Index.TOKENIZED));
                writer.addDocument(idxDocument);
            }
            // now optimize and close the index:
            // todo: open index for appending and/or updating
            writer.optimize();
            writer.close();
            // Now we can create the powerset for each existing graph
            // (based on sorted node ids) and store
            // all resulting graphs within an index.
            // ----------------------------------------------------------
            if (statusBar != null) statusBar.setStatus("Creating and merging powersets of available graphs");
            HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(descriptions.length);
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(new FileInputStream(descriptions[i])).getRootElement();
                    List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                    HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                    LinkedList<Relation> relations = new LinkedList<Relation>();
                    LinkedList<Integer> nodes = new LinkedList<Integer>();
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element semanticElement = (Element) iterator.next();
                        String xmlString = outputter.outputString(semanticElement);
                        int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                        String docID = semanticElement.getAttribute("id").getValue();
                        docID2overallID.put(docID, id);
                        nodes.add(id);
                    }
                    // get all relations with global ids and eliminate inverse relations
                    l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element relation = (Element) iterator.next();
                        int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                        int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                        String type = relation.getAttribute("type").getValue();
                        type = type.substring(type.lastIndexOf(':') + 1);
                        Relation r = eliminateInverse(new Relation(source, target, type));
                        relations.add(r);
                    }
                    // now create a graph object
                    Collections.sort(nodes);
                    Collections.sort(relations);
                    LinkedList<Node> nodeList = new LinkedList<Node>();
                    for (Iterator<Integer> iterator = nodes.iterator(); iterator.hasNext();) {
                        nodeList.add(new Node(iterator.next()));
                    }
                    Graph g = new Graph(nodeList, relations);
                    // List<Graph> powerSet = new LinkedList<Graph>();
                    // powerSet.add(g);
                    HashSet<String> docs = new HashSet<String>(1);
                    docs.add(descriptions[i]);
                    graph2document.put(g, docs);
                    /*
                    // add all these subgraphs and the reference to the document to
                    // a data structure:
                    for (Iterator<Graph> iterator = powerSet.iterator(); iterator.hasNext();) {
                        Graph graph = iterator.next();
                        // List<Graph> relationsPowerSet = graph.getPowerSetOfRelations();
                        // for (Iterator<Graph> iterator1 = relationsPowerSet.iterator(); iterator1.hasNext();) {
                        //     Graph graph1 = iterator1.next();
                        // }
                        // add graph if not trivial:
                        if (graph.getNodes().size() > 1) {
                            // containsKey for Graph does not match my needs -
                            // different graph objects reference the same graph!
                            if (string2graph.containsKey(graph.toString())) {
                                graph = string2graph.get(graph.toString());
                                graph2document.get(graph).add(descriptions[i]);
                            } else {
                                HashSet<String> docs = new HashSet<String>(1);
                                docs.add(descriptions[i]);
                                graph2document.put(graph, docs);
                            }
                        }
                    }
                    */
                } catch (JDOMException e1) {
                    System.err.println("Exception in document #" + i + ": " + e1.getMessage());
                }
            }
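
            /*
             * Index line format (illustrative): each graph is written below as one
             * gzipped text line of the form
             *   <graph.toString()>|/path/to/first.mp7.xml|/path/to/second.mp7.xml
             * which is exactly what searchForGraph(..) tokenizes by '|' when matching.
             */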
            HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
            HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);
            /*
             * For now we reduce the number of graphs by identifying and merging
             * duplicates and removing redundant entries:
             */
            for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
                Graph g = iterator.next();
                if (str2graph.containsKey(g.toString())) {
                    g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
                } else {
                    str2graph.put(g.toString(), g);
                    g2d.put(g, graph2document.get(g));
                }
            }
            graph2document = g2d;
            System.out.println(graph2document.size() + " non-trivial distinct graphs were found");
            // now put all the available graphs into an index:
            // -----------------------------------------------
            // todo: create a real, fast, storable index of subgraphs instead of a file :-) possible candidate: a trie
            // for now we will store a simple text file:
            if (statusBar != null) statusBar.setStatus("Storing powersets of available graphs as file");
            String indexFile;
            if (!pathToIndex.endsWith(File.separator)) {
                indexFile = pathToIndex + File.separator + "idx_graphs.list";
            } else {
                indexFile = pathToIndex + "idx_graphs.list";
            }
            File f = new File(indexFile);
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(f, false))));
            for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
                Graph g = iterator.next();
                bw.write(g.toString());
                for (Iterator<String> iterator1 = graph2document.get(g).iterator(); iterator1.hasNext();) {
                    String s = iterator1.next();
                    bw.write("|" + s);
                }
                bw.write("\n");
            }
            bw.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
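
    /**
     * Minimal usage sketch (illustrative only, not part of the original class):
     * builds the fulltext index for a directory of MPEG-7 descriptions and runs
     * a sample query against it. The directory and the query term are assumptions.
     */
    public static void main(String[] args) {
        String baseDir = args.length > 0 ? args[0] : ".";
        LuceneRetrievalEngine engine = new LuceneRetrievalEngine(40);
        // index first; the StatusBar may be null:
        engine.indexFiles(baseDir, null);
        // then query the "all" field for a term:
        List<ResultListEntry> results = engine.getImagesByXPathSearch("Mathias", baseDir, true, null);
        for (Iterator<ResultListEntry> it = results.iterator(); it.hasNext();) {
            ResultListEntry entry = it.next();
            System.out.println(entry.getFilePath() + " (" + entry.getRelevance() + ")");
        }
    }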

    /**
     * Searches for all available nodes matching the given query String.
     *
     * @param queryString   query like "Mathias Lux" or some text inside a node.
     * @param whereToSearch defines the base directory for the search
     * @return a List of matching nodes with their associated weights
     */
    public static List<Node> getNodes(String queryString, String whereToSearch) {
        return ((LucenePathIndexRetrievalEngine) RetrievalEngineFactory.getPathIndexRetrievalEngine()).getNodes(queryString, whereToSearch);
    }

    private String getFileListFromNode(List<String> list) {
        // concatenates the file names, separated by '|':
        StringBuilder files = new StringBuilder(64);
        for (Iterator<String> it2 = list.iterator(); it2.hasNext();) {
            files.append(it2.next());
            if (it2.hasNext()) {
                files.append('|');
            }
        }
        return files.toString();
    }

    /**
     * Eliminates all inverse relations to simplify retrieval.
     *
     * @param relation the relation to normalize
     * @return the normalized relation
     */
    public static Relation eliminateInverse(Relation relation) {
        Relation result = relation;
        if (relationMapping.containsKey(relation.getType())) {
            result = new Relation(relation.getTarget(), relation.getSource(), relationMapping.get(relation.getType()));
        }
        return result;
    }
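
    /*
     * Note (reconstruction): ElementEntry is used by indexFilesSemantically(..)
     * above, but its definition is not contained in this excerpt. A minimal
     * holder matching the observed usage (constructor ElementEntry(Element, int)
     * and the fields semanticElement / id) is assumed to look like this:
     */
    private static class ElementEntry {
        Element semanticElement;
        int id;

        ElementEntry(Element semanticElement, int id) {
            this.semanticElement = semanticElement;
            this.id = id;
        }
    }
}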