/* * Copyright (c) 2013, University of Toronto. * * Licensed under the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. You may obtain * a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package edu.toronto.cs.xml2rdf.mapping.generator; import java.io.IOException; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPathExpressionException; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import edu.toronto.cs.xml2rdf.freebase.FreeBaseLinker; import edu.toronto.cs.xml2rdf.mapping.Entity; import edu.toronto.cs.xml2rdf.opencyc.OpenCycOntology; import edu.toronto.cs.xml2rdf.string.StringMetric; import edu.toronto.cs.xml2rdf.utils.DependencyDAG; import edu.toronto.cs.xml2rdf.utils.DisjointSet; import edu.toronto.cs.xml2rdf.utils.LogUtils; import edu.toronto.cs.xml2rdf.xml.XMLUtils; import java.util.concurrent.ConcurrentHashMap; //import org.xeustechnologies.googleapi.spelling.SpellChecker; //import org.xeustechnologies.googleapi.spelling.SpellResponse; /* * This is a "dummy" implementation of MappingGenerator interface. * TODO: Perhaps we can implement a brand new (not so dummy) class * that modularize each mapping step. */ /** * @author Soheil Hassas Yeganeh <soheil@cs.toronto.edu> */ public class DummyMappingGenerator implements MappingGenerator { // Flag for printing debugging information static boolean debug = true; // Ceilings private final int maxElement; private final int maxOnotlogyLookup; // Mapping essentials Map<String, Schema> schemas = new ConcurrentHashMap<String, Schema>(); private final List<MappingStep> enabledSteps; // Metrics private final StringMetric stringMetric; private final SchemaSimilarityMetic schemaSimMetric; // All thresholds private final double ontologyMatchingThreshold; private final double schemaSimThreshold; private int leafPromotionThreshold = 5; private double matchThreshold = 0.75; private double ignoredNumbers = 0.25; private int minimumNumberOfAttributeToMerges = 2; private final double intralinkingThreshold; /* * Constructor that initialize all threshold parameters. * TODO: Design algorithms to estimate the thresholds so that they do not * need to be manually assigned. */ public DummyMappingGenerator(double ontologyMatchingThreshold, StringMetric stringMetric, double schemaSimThreshold, SchemaSimilarityMetic schemaSimMetric, int leafPromotionThreshold, double matchThreshold, int maxElement, int maxOntologyLookup, double ignoredNumbers, int minimumNumberOfAttributeToMerges, double internalLinkingThreshold, MappingStep... enabledSteps) { this.ontologyMatchingThreshold = ontologyMatchingThreshold; this.schemaSimMetric = schemaSimMetric; this.stringMetric = stringMetric; this.schemaSimThreshold = schemaSimThreshold; this.matchThreshold = matchThreshold; this.leafPromotionThreshold = leafPromotionThreshold; this.maxElement = maxElement; this.maxOnotlogyLookup = maxOntologyLookup; this.ignoredNumbers = ignoredNumbers; this.minimumNumberOfAttributeToMerges = minimumNumberOfAttributeToMerges; this.intralinkingThreshold = internalLinkingThreshold; this.enabledSteps = Arrays.asList( enabledSteps == null || enabledSteps.length == 0 ? MappingStep.values() : enabledSteps); } /* * The root/mother function that calls all mapping step functions. * TODO: Modularize this function. */ @Override public Document generateMapping(Element rootDoc, String typePrefix) { // Capture time at each step long start; long end; // The organization of the XML files should have "clinical_studies" as the // very root document element (which is passed in as rootDoc), with many // "clinical_study" child nodes, which is the children variable below. NodeList children = rootDoc.getChildNodes(); System.out.println(children.getLength()); // Step 1. Merge the child element nodes and their associated schemas start = System.currentTimeMillis(); // Iterate through all child nodes or up to the maximum number specified, // and process (merge) ONLY child nodes that are elements. for (int i = 0; i < children.getLength() && (maxElement == -1 || i < maxElement); i++) { if (children.item(i) instanceof Element) { // Get the child element node. Element child = (Element) children.item(i); String name = child.getNodeName(); // Create a schema for this child element node if one with the same node name does not exist. // Consequently, there will be only one schema for each unique node name. // The path of the schema is the ABSOLUTE path to the child element node, starting with "/" // and the root element node name, such as "/clinical_studies/clinical_study". Schema schema = schemas.get(name); if (schema == null) { // Eric: What if child nodes have the same name but at different layers of the // XML file and thus different path? Only the first path is used? schema = new Schema(null, child, "/" + rootDoc.getNodeName() + "/" + name); schemas.put(name, schema); } // Merge the child element node with its schema, that is, the schema of the same name try { mergeWithSchema(child, schema); } catch (Exception e) { if (debug) { e.printStackTrace(); } } } } end = System.currentTimeMillis(); System.out.println("Execution time of step 1 : schema merge was " + (end - start) + " ms."); cacheInstances(schemas, rootDoc); // Step 2. Flatten the schema start = System.currentTimeMillis(); try { // Eric: the flat threshold is always 1? flattenSchema(rootDoc.getOwnerDocument(), 1); } catch (XPathExpressionException e1) { if (debug) { e1.printStackTrace(); } } end = System.currentTimeMillis(); System.out.println("Execution time of step 2 : schema flatten was " + (end - start) + " ms."); // Step 3. Remove duplicates start = System.currentTimeMillis(); removeDuplicates(); end = System.currentTimeMillis(); System.out.println("Execution time of step 3 : duplicate remove was " + (end - start) + " ms."); // Step 4. Find a possible key for each identified schema start = System.currentTimeMillis(); try { for (Schema schema : schemas.values()) { try { // Eric: uniqunessThreshold is always 0.0d? // TODO: find a way to automate this. findKeysForSchema(schema, rootDoc.getOwnerDocument(), 0.0d); } catch (XPathExpressionException e) { if (debug) { e.printStackTrace(); } } } } catch (java.util.ConcurrentModificationException e1) { if (debug) { e1.printStackTrace(); } } end = System.currentTimeMillis(); System.out.println("Execution time of step 4 : key identification was " + (end - start) + " ms."); // Step 5. Intra-link schemas start = System.currentTimeMillis(); try { intralinkSchemas(rootDoc.getOwnerDocument(), intralinkingThreshold); } catch (XPathExpressionException e1) { if (debug) { e1.printStackTrace(); } } end = System.currentTimeMillis(); System.out.println("Execution time of step 5 : intra-linking was " + (end - start) + " ms."); DependencyDAG<Schema> dependecyDAG = new DependencyDAG<Schema>(); for (Schema schema : schemas.values()) { dependecyDAG.addNode(schema); // TODO(oktie): Haaji this is duplicate of the code on line 904. for (Relation rel : schema.getRelations()) { if (!schemas.containsKey(rel.getSchema())) { LogUtils.error(DummyMappingGenerator.class, "ERRRRRRRRRRRRR! " + rel.getSchema() + " Does not exist. " + rel); } } } for (Schema schema : schemas.values()) { for (Relation rel : schema.getRelations()) { dependecyDAG.addDependency(schema, rel.getSchema()); } } Document mappingRoot = null; try { mappingRoot = DocumentBuilderFactory.newInstance() .newDocumentBuilder().newDocument(); Element rootElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "mapping"); mappingRoot.appendChild(rootElement); while (dependecyDAG.size() != 0) { Schema schema = dependecyDAG.removeElementWithNoDependency(); addEntities(schema, mappingRoot, "", typePrefix); } } catch (ParserConfigurationException e) { if (debug) { e.printStackTrace(); } } return mappingRoot; } private void cacheInstances(Map<String, Schema> schemas, Element rootDoc) { for (Map.Entry<String, Schema> entry : schemas.entrySet()) { String name = entry.getKey(); entry.getValue(); } } /* * Checking if the mapping step is toggled by the user. */ public boolean isStepEnabled(MappingStep step) { return enabledSteps.contains(step); } SchemaInstance createSchemaInstance(Element element, Schema schema) { SchemaInstance instance = null; try { instance = new SchemaInstance(element); schema.instances.add(instance); } catch (IOException e) { } return instance; } /* * Step 1. Merge the schemas */ private SchemaInstance mergeWithSchema(Element element, Schema schema) throws SchemaException, XPathExpressionException { // Cache the instance. SchemaInstance instance = createSchemaInstance(element, schema); // Set the schema name, if null, to the name of the element; // or check if the two names are the same, as they should be // Eric: I believe this is unnecessary and should be removed String schemaName = schema.getName(); if (schemaName == null) { schema.setName(element.getNodeName()); } else { if (!schema.getName().equals(element.getNodeName())) { throw new SchemaException("Schema element names do not match."); } } // Never merge leaf element nodes. // // Eric: Technically, this "if statement" will always be true because // if the element if a leaf, then "mergeWithSchema" function will never // be called on this leaf element in the first place if (!XMLUtils.isLeaf(element)) { // Get all the (immediate next level) child nodes of the given element node NodeList children = element.getChildNodes(); // Iterate through all child nodes, but process // ONLY those that are elements for (int i = 0; i < children.getLength(); i++) { if (children.item(i) instanceof Element) { // Process child element node that is NOT a leaf element node. if (!XMLUtils.isLeaf(children.item(i))) { // Get the non-leaf child element node, which means it has // leaf (and possibly non-leaf) child element nodes under it Element child = (Element) children.item(i); // The boolean value to indicate if a previous instance of this // non-leaf child element with the same name has already been // processed/merged. boolean found = false; // Find out if this non-leaf child element already exists // in parent element's relations, meaning that a previous // instance of the non-leaf child element with the same name // has already been processed and put into the relations of // the parent element. // // If so, merge this instance of the non-leaf child element // with the already consolidated associated schema, during // which new relations or attributes might be added to this // schema for (Relation childRelation : schema.getRelations()) { if (childRelation.getName().equals(child.getNodeName())) { SchemaInstance childInstance = mergeWithSchema(child, childRelation.getSchema()); createRelationInstnace(childRelation, instance, childInstance); found = true; break; } } // This is the first encounter of the non-leaf child element // with this node name if (!found) { // Get the name of the non-leaf child element node String name = child.getNodeName(); // Create the path, which is the ABSOLUTE path to this // non-leaf child element node, starting with "/" String path = schema.getPath() + "/" + name; // Create a schema for this non-leaf child element node, // if none exists yet Schema childSchema = schemas.get(name); if (childSchema == null) { // Eric: Why not set the parent to the current schema? childSchema = new Schema(null, child, path); schemas.put(child.getNodeName(), childSchema); } // Merge this non-leaf child element node first before // further processing this node SchemaInstance childInstance = mergeWithSchema(child, childSchema); // Create the lookupKeys for the creation of relation later // This is essentially a list of all leaf elements that // exist under the current child node Set<Attribute> lookupKeys = new HashSet<Attribute>(); // Get the list of RELATIVE path to all leaf element nodes // of the current non-leaf child element node, with path // starting with the name of the current non-leaf child // element node (and not "/"), and ending with the name // of the leaf element nodes List<String> leaves = XMLUtils.getAllLeaves(child); // Iterate through all paths to the leaf element nodes for (String leafPath : leaves) { // Get the name of the current LEAF element node int lastNodeIndex = leafPath.lastIndexOf('/'); String lastNodeName = leafPath.substring(lastNodeIndex + 1); // Create leafName by simply replacing all "/" with "." String leafName = leafPath.replace('/', '.'); // Append ".name" to the end of leafName if the current // leaf element node has been promoted and has an // OntologyLink schema associated with it // // Eric: Is it correct to say that the ONLY case where // lastNodeSchema is NOT null is when the child node has // been promoted, which means lastNodeSchema is ALWAYS // an OntologyLink schema? Schema lastNodeSchema = schemas.get(lastNodeName); if (lastNodeSchema instanceof OntologyLink) { // Eric: Why ".name"? What's the meaning behind this? leafName += ".name"; } // Create leafPath through removing the name of the parent non-leaf // element node at the beginning, along with the "/", and then append // "/text()" at the end of the leafPath. // // This is essentially the RELATIVE path to the TEXT VALUE of the // current leaf element node under the parent non-leaf element node, // and this path will be understood correctly by XPath leafPath = leafPath.replaceAll("^" + child.getNodeName() + "/?", ""); // Eric: Why would leafPath ever be empty anyways? It must at least // contain the name of the LAEF node. leafPath = leafPath.length() > 0 ? leafPath + "/text()" : "text()"; // Create an entry to the lookupKeys, which keeps track of the parent // non-leaf element node's schema, the name and the RELATIVE path to // all the TEXT VALUES of the leaf element nodes under it, and whether // these element nodes are keys or not // // Eric: I'm still unclear about the answer to the email question // regarding the lookupKeys (Question 1.2). lookupKeys.add(new Attribute(schema, leafName, leafPath, false)); } // Eric: Why is path (the third parameter) set to name? // Set the parent-child (schema-childSchema) relation, with lookupKeys essentially // a list of LEAF nodes of the child (childSchema) and their parent is set to // schema // One can think of the path to the childSchema as schema.getPath() + "/" + name // (name is the name of the childSchema) Relation relation = new Relation(schema, name, name, childSchema, lookupKeys); schema.addRelation(relation); createRelationInstnace(relation, instance, childInstance); } } // Process child element node that IS INDEED a leaf element node else { // Get the leaf child element and its name Element child = (Element) children.item(i); String name = child.getNodeName(); // Get the ABSOLUTE path to the leaf child element, // starting with "/" String path = schema.getPath() + "/" + name; // Find out if a previous instance of the leaf child element // with the same name has already been added to the attributes // or relations. Since the leaf child element has no children, // the previous instance will be exactly the same as the current // instance (structure-wise), the current instance does not need // to be processed anymore. boolean found = false; for (Attribute childAttribute : schema.getAttributes()) { if (childAttribute.getName().equals(child.getNodeName())) { found = true; break; } } for (Relation childRelation : schema.getRelations()) { if (childRelation.getSchema() instanceof OntologyLink && childRelation.getName().equals(child.getNodeName())) { found = true; break; } } // If no previous instance has found, which means this is the first // encounter of the leaf child node with this name if (!found) { LogUtils.debug(this.getClass(), "searching in ontology for " + path); // values contains all the text values of the elements with the same ABSOLUTE path Set<String> values = new HashSet<String>(); // types contains all typeIDs (above threshold) based on the above text values Set<String> types = findTypeInOntology(path, element.getOwnerDocument(), values, matchThreshold, ignoredNumbers); // If types contains some typeIDs and values contains enough text values // Eric: What's the significance of values.size() >= leafPromotionThreshold since // values are merely the different text values of the current leaf node? if (types != null && types.size() > 0 && values.size() >= leafPromotionThreshold) { LogUtils.debug(this.getClass(), "Types found for " + element + " " + types); // Find out if a previous instance of the leaf child // element with the same name has already been processed found = false; // If a previous instance of the leaf child element has already been // processed and added to parent's relation, merge the current instance // of the leaf child node // // Eric: This leaf child node will NEVER get merged because mergeWithSchema // function only process non-leaf elements. Is this correct? for (Relation childRelation : schema.getRelations()) { if (childRelation.getName().equals(child.getNodeName())) { mergeWithSchema(child, childRelation.getSchema()); found = true; break; } } // If no previous instance of the leaf child element is found // and this is the first encounter of the leaf child element with // this node name if (!found) { // Create a schema for the current leaf child element, // if none exists yet OntologyLink childSchema = (OntologyLink) schemas.get(child.getNodeName()); if (childSchema == null) { childSchema = new OntologyLink(null, child, path, types); schemas.put(child.getNodeName(), childSchema); } // Merge the current leaf child element before further processing // // Eric: Again, this leaf child element will NEVER get merged // because mergeWithSchema function only process non-leaf elements. // Is this correct? SchemaInstance childInstance = mergeWithSchema(child, childSchema); // Eric: Because the current child element is a leaf, it does NOT contain // other child elements, which means the list leaves contains ONLY ONE // string, which is the name of the current child element, and consequently // the set lookupKeys contains ONLY ONE attribute, with its path being "text()". // Is this the correct understanding? Once again, I'm not sure why lookupKeys // are needed. // // Eric: The following relation creation process is exactly the same as before, // which I believe should and must be simplified for the reasoning above. Set<Attribute> lookupKeys = new HashSet<Attribute>(); // Eric: Would this just return child itself as it is the leaf element? List<String> leaves = XMLUtils.getAllLeaves(child); for (String leafPath : leaves) { int lastNodeIndex = leafPath.lastIndexOf('/'); String lastNodeName = leafPath.substring(lastNodeIndex + 1); // Eric: Here, the lastNodeSchema is actually just the schema // of the CURRENT child, which is the OntologyLink just created // above. Is this the intention? // FIXME: The following code can be simplified for the reasoning above. Schema lastNodeSchema = schemas.get(lastNodeName); String leafName = leafPath.replace('/', '.'); if (lastNodeSchema instanceof OntologyLink) { leafName += ".name"; } leafPath = leafPath.replaceAll("^" + child.getNodeName() + "/?", ""); leafPath = leafPath.length() > 0 ? leafPath + "/text()" : "text()"; lookupKeys.add(new Attribute(schema, leafName, leafPath, false)); } Relation relation = new Relation(schema, name, name, childSchema, lookupKeys); createRelationInstnace(relation, instance, childInstance); } } // If the current leaf child node is not promoted, make it an attribute else { // The attribute is created with path being the ABSOLUTE path // to the TEXT VALUE of the leaf child node // // Eric: Why use "setPath" when name and path can be set when // the attribute is initialized Attribute attribute = new Attribute(schema, name, path, false); attribute.setName(child.getNodeName()); attribute.setPath(child.getNodeName() + "/text()"); schema.addAttribute(attribute); createAttributeInstance(attribute, instance, child); // ????? if (types != null && types.size() != 0) { LogUtils.debug(this.getClass(), "Types found for " + element + " " + types); attribute.setTypeURIs(types); } } } } } } } return instance; } private AttributeInstance createAttributeInstance(Attribute attribute, SchemaInstance schemaInstance, Element attributeElement) { AttributeInstance instance = null; try { instance = new AttributeInstance(schemaInstance, attributeElement); attribute.addInstance(instance); } catch (IOException e) { } return instance; } private RelationInstance createRelationInstnace(Relation relation, SchemaInstance from, SchemaInstance to) { RelationInstance instance = new RelationInstance(from, to); relation.addInstance(instance); return instance; } /* * Step 1. Merge the schemas - Helper Function */ private Set<String> findTypeInOntology(String path, Document doc, Set<String> visitedTerms, double matchThreshold, double ignoredNumebers) throws XPathExpressionException { // Perform ontology finding only if "INTRALINKING" is enabled if (!isStepEnabled(MappingStep.INTRALINKING)) { return new HashSet<String>(); } // OpenCycOntology currently is NOT in use OpenCycOntology ontology = OpenCycOntology.getInstance(); // Instantiate FreeBaseLinker FreeBaseLinker freebase = new FreeBaseLinker(); // Get all instances of the nodes with the same ABSOLUTE path. // This means all the nodes have the same node name and they must // be all leaf element nodes since findTypeInOntology only calls // on leaf element nodes NodeList nl = XMLUtils.getNodesByPath(path, null, doc); // For each String typeIDs, count how many Integer times they are // returned from freebase Map<String, Integer> commonTypes = new HashMap<String, Integer>(); // Count how many times no typeIDs is returned for a text value int count = 0; // Iterate through all instances of nodes with the same ABSOLUTE path for (int i = 0; i < nl.getLength() && (maxOnotlogyLookup == -1 || i < maxOnotlogyLookup); i++) { // Break all iterations if there are too many times (count) where no typeIDs is returned, // or enough different typeIDs have already returned. if (count > 100 && commonTypes.size() < 100) { break; } // Skip the current iteration if the text value of the // current instance has already been processed // Eric: This "term" is incorrect for the Patent data String term = nl.item(i).getTextContent(); // Eric: The above "term" is wrong because it includes in-tag attributes, // and the "term" ends up being "Candy holderd2e53", which should've been "Candy holder". // The following line of code fixes the problem. // String term = nl.item(i).getChildNodes().item(0).getNodeValue(); if (visitedTerms.contains(term)) { continue; } // If not, the text value of the current instance is // added to the visistedTerms, and these added terms // are not processed, such as having digits removed, etc visitedTerms.add(term); // Skip the current iteration if the text value of the current instance is empty, // longer than 50 characters, or consists entirely of digits if (term.trim().length() == 0 || term.length() > 50 || term.matches("^\\d+$")) { continue; } // Remove all digits from the text value of the current instance String withoutNumbers = term.trim().replaceAll("\\d", ""); // Skip the current iteration (once again) if the ratio of the length of digits over // the total length of the text value is too high if ((term.length() - withoutNumbers.length()) / (double) term.length() >= ignoredNumebers) { continue; } // A set that holds all freebase typeIDs that look something // like "http://rdf.freebase.com/rdf/music.release" Set<String> types = new HashSet<String>(); //ontology.findTypesForResource(term, stringMetric, ontologyMatchingThreshold); // Get the list of typeIDs based on the text value, and the typeIDs look like the following: // "http://rdf.freebase.com/rdf/music.release" Set<String> freebaseTypes = freebase.findTypesForResource(term, stringMetric, ontologyMatchingThreshold); // Add all typeIDs if freebaseTypes is not null if (freebaseTypes != null) { types.addAll(freebaseTypes); } // If no typeIds is added and the length of the term is less than 20, // which could mean that there might be spelling mistakes if (types.size() == 0 && term.length() < 20) { // Get the Google spell checker and get the spell response // SpellChecker checker = new SpellChecker(); // SpellResponse spellResponse = checker.check(term); // If there are spell corrections // if (spellResponse.getCorrections() != null // && spellResponse.getCorrections().length > 0) { // Get the spell checked text value // // Eric: It seems like only one word is returned for text values of any length, // so for example, "Daniel Aradi MD PhD" is spell checked as "Abadi", which is // obviously wrong // term = ""; // for (int j = 0; j < spellResponse.getCorrections().length; j++) { // term += spellResponse.getCorrections()[j].getValue().split("\t")[0]; // } // Try add typeIDs based on the new spell-checked text value if (term.length() > 0) { types = new HashSet<String>(); //.findTypesForResource(term, stringMetric, ontologyMatchingThreshold); freebaseTypes = freebase.findTypesForResource(term, stringMetric, ontologyMatchingThreshold); if (freebaseTypes != null) { types.addAll(freebaseTypes); } } // } } // Skip the current iteration if still no typeIDs is found if (types.size() == 0) { count++; continue; } // Count for each typeID, the number of times it has occurred for (String type : types) { Integer typeCount = commonTypes.get(type); if (typeCount == null) { typeCount = 0; } typeCount++; commonTypes.put(type, typeCount); } // if (commonTypes == null) { // commonTypes = types; // } else { // Set<String> tempCommonTypes = SetUtils.intersection(commonTypes, types); // if (tempCommonTypes.size() == 0) { // count++; // } else { // commonTypes = tempCommonTypes; // } // } } // double ratio = (visitedTerms.size() - count) / visitedTerms.size(); // Add typeIDs to types this typeID has occurred enough times (over the threshold) Set<String> types = new HashSet<String>(); for (Map.Entry<String, Integer> entry : commonTypes.entrySet()) { // System.out.println("Score for " + entry.getKey() + " is " + entry.getValue() / (double) visitedTerms.size()); if (entry.getValue() / (double) visitedTerms.size() >= matchThreshold) { types.add(entry.getKey()); } } // System.err.println("returning " + types + " for " + path); return types; } /* * Step 2. Flatten the schemas */ private void flattenSchema(Document doc, double flatThreshold) throws XPathExpressionException { // Only perform schema flattening if enabled if (!isStepEnabled(MappingStep.SCHEMA_FLATTENING)) { return; } DependencyDAG<Schema> dependecyDAG = new DependencyDAG<Schema>(); for (Schema schema : schemas.values()) { dependecyDAG.addNode(schema); } for (Schema schema : schemas.values()) { for (Relation rel : schema.getRelations()) { dependecyDAG.addDependency(schema, rel.getSchema()); } } while (dependecyDAG.size() != 0) { Schema schema = dependecyDAG.removeElementWithNoDependency(); // Eric: The line below serves no purpose!!! schema.getAttributes(); Set<Relation> oneToOneRelations = findOneToOneRelations(doc, schema); for (Relation rel : oneToOneRelations) { LogUtils.debug(getClass(), "is one to one : " + schema + " . " + rel); flattenRelation(schema, rel); } } } /* * Step 2. Flatten the schemas - Helper Function */ private Set<Relation> findOneToOneRelations(Document doc, Schema schema) throws XPathExpressionException { // If there is no relation in the schema, just return an empty set. if (schema.getRelations().size() == 0) { return new HashSet<Relation>(); } Set<Relation> oneToOneRelations = new HashSet<Relation>(); for (Relation rel : schema.getRelations()) { if (rel.isOneToOne()) { oneToOneRelations.add(rel); } } return oneToOneRelations; } /* * Step 2. Flatten the schemas - Helper Function */ private void flattenRelation(Schema schema, Relation rel) { Schema targetSchema = rel.getSchema(); // The promoted (relational) leaf node is now demoted // back to an attribute because of one-to-one'ness if (targetSchema instanceof OntologyLink) { // Eric: Serious?! String name = "name"; name = targetSchema.getName(); String path = rel.getPath() + "/text()"; Attribute attr = new Attribute(schema, name, path, false); attr.setTypeURIs(targetSchema.getTypeURIs()); schema.addAttribute(attr); attr.setParent(schema); } // Add attributes of the relation to the schema, // with modification to attributes' name, path, and // parent schema for (Attribute attr : targetSchema.getAttributes()) { String name = attr.getName(); name = targetSchema.getName() + "_" + name; attr.setName(name); String path = attr.getPath(); path = rel.getPath() + "/" + path; attr.setPath(path); schema.addAttribute(attr); attr.setParent(schema); } // Add relations of the relation to the schema, // with modification to relations' name, path, and // lookupKey attributes for (Relation targetRel : targetSchema.getRelations()) { String path = rel.getPath() + "/" + targetRel.getName(); targetRel.setPath(path); String name = targetRel.getName(); name = targetSchema.getName() + "_" + name; targetRel.setName(name); schema.addRelation(targetRel); targetRel.setParent(schema); // Eric: Shouldn't we also update the parent schema to the new one? for (Attribute lookupKey : targetRel.getLookupKeys()) { lookupKey.setPath(rel.getPath() + "/" + lookupKey.getPath()); lookupKey.setName(lookupKey.getName().replace(rel.getName() + ".", rel.getName() + "_")); } } // Now that we port over all relations and attributes of the relation // to its one-to-one parent schema, remove this relation and complete // the flatten process schema.getRelations().remove(rel); // Remove the relation schema altogether, iff this schema is not a // relation of any other schemas maybeRemoveSchema(targetSchema); } /* * Step 3. Remove duplicates */ private void removeDuplicates() { if (!isStepEnabled(MappingStep.DUPLICATE_REMOVAL)) { return; } // The value of dSets is a hierarchy set of schemas that are considered to be // similar or duplicates Map<Schema, DisjointSet<Schema>> dSets = new HashMap<Schema, DisjointSet<Schema>>(); for (Schema schema : schemas.values()) { DisjointSet<Schema> set = new DisjointSet<Schema>(schema); dSets.put(schema, set); } // TODO: Better way to detect duplicate pairs? For example, instead of iterating all // possible schema pairs, only compare those that are at the same level because its // unlikely the very top schema will be similar to that of almost leaf schemas for (Schema schema1 : schemas.values()) { for (Schema schema2 : schemas.values()) { // Skip the current schema pair if they are the same, if they do have enough // attributes, or if schema1 name > schema2 name to avoid inspecting // <schema1, schema2> and <schema2, schema1> if (schema1 == schema2 || schema1.getAttributes().size() < minimumNumberOfAttributeToMerges || schema2.getAttributes().size() < minimumNumberOfAttributeToMerges || schema1.getName().compareTo(schema2.getName()) > 0) { continue; } // TODO: better similarity schema double similarity = schemaSimMetric.getSimiliarity(schema1, schema2); if (similarity >= schemaSimThreshold) { dSets.get(schema1).union(dSets.get(schema2)); LogUtils.info(this.getClass(), "Merging " + schema1 + " with " + schema2); } } } while (dSets.size() > 0) { Set<Schema> listOfSchemas = new HashSet<Schema>(); Schema schema = dSets.keySet().iterator().next(); listOfSchemas.add(schema); DisjointSet<Schema> dset = dSets.remove(schema); DisjointSet<Schema> root = dset.find(); for (DisjointSet<Schema> set : root.getChildren()) { Schema similarSchema = set.getData(); if (!schema.equals(similarSchema)) { listOfSchemas.add(similarSchema); dSets.remove(similarSchema); } } if (listOfSchemas.size() > 1) { Schema newSchema = mergeSchemas(listOfSchemas); // Replace old relation schema with the merged one for (Schema oldSchema : schemas.values()) { for (Relation rel : oldSchema.getRelations()) { if (listOfSchemas.contains(rel.getSchema())) { rel.setSchema(newSchema); } } } // Remove all the pre-merged schemas for (Schema s : listOfSchemas) { for (Attribute attr : s.getAttributes()) { attr.setParent(newSchema); } schemas.remove(s.getName()); } // Place the new merged schema schemas.put(newSchema.getName(), newSchema); } } } /* * Step 3. Remove duplicates - Helper Function */ private Schema mergeSchemas(Set<Schema> listOfSchemas) { String path = ""; String name = ""; Set<Attribute> attributes = new HashSet<Attribute>(); Set<Relation> relations = new HashSet<Relation>(); for (Schema s : listOfSchemas) { attributes.addAll(s.getAttributes()); relations.addAll(s.getRelations()); path += s.getPath() + "|"; name += s.getName() + "_or_"; } path = path.substring(0, path.length() - 1); name = name.substring(0, name.length() - 4); Schema schema = new Schema(null, name, path); schema.setAttributes(attributes); schema.setRelations(relations); return schema; } /* * Step 4. Find a possible key for each identified schema */ private void findKeysForSchema(Schema schema, Document doc, double uniqunessThreshold) throws XPathExpressionException { // A set of maps, with each map corresponds to one instance of // the input schema. The keys of the map are the names of all // relations and attributes of the input schema, and the values // of the map are the text values of relations or attributes. Set<Map<String, String>> entities = new HashSet<Map<String, String>>(); if (schema instanceof OntologyLink) { return; } // A set of schema names (either relation schemas or attribute // schemas of the current schema) that are NOT keys Set<String> bannedKeys = new HashSet<String>(); // Get all instances of the input schema, such as all // instances of "/clinical_studies/clinical_study" NodeList entityNL = XMLUtils.getNodesByPath(schema.getPath(), null, doc); // Iterate through all instances of the input schema, inspect its // attributes and relations, find those that cannot be keys (that is, // one schema instance has more than one attribute/relation schemas // with the same name), and fill variable entities defined above // TODO: This loop can be made faster!!! Think!!! for (int i = 0; i < entityNL.getLength(); i++) { // The keys of the map are the names of all relations and // attributes of the current instance of the input schema, and // the values of the map are the text values of these relations // or attributes. HashMap<String, String> instance = new HashMap<String, String>(); // Get the current instance of the input schema Element element = (Element) entityNL.item(i); // For each attribute, find all its instances under the current // instance of the input schema, identify those that cannot be // keys, and fill the variable instance defined above for (Attribute attr : schema.getAttributes()) { // Skip if the current attribute is already banned if (bannedKeys.contains(attr.getName())) { continue; } // Identify if the current attribute should be banned NodeList attributeNL = XMLUtils.getNodesByPath(attr.getPath(), element, doc); if (attributeNL.getLength() != 1) { bannedKeys.add(attr.getName()); attr.setKey(false); } // Fill the variable instance defined above with the text value of the attribute // Eric: Only adds one text value when more than one instances can occur. WRONG? (Map // can only perserve one value per key, that's why). // Eric: Should the banned instances be added as well? I don't think they should. instance.put(attr.getName(), XMLUtils.getStringByPath(attr.getPath(), element, doc)); } // For each relation, find all its instances under the current // instance of the input schema, identify those that cannot be // keys, and fill the variable instance defined above for (Relation rel : schema.getRelations()) { // Skip if the current relation is already banned if (bannedKeys.contains(rel.getName())) { continue; } // Identify if the current attribute should be banned // NodeList relNL = XMLUtils.getNodesByPath(rel.getPath() + "/text()", element, doc); NodeList relNL = XMLUtils.getNodesByPath(rel.getPath(), element, doc); if (relNL.getLength() != 1) { bannedKeys.add(rel.getName()); continue; } // Fill the variable instance defined above with the text value of the relation, // which the text values of all its leaf children // Eric: Only adds one text value when more than one instances can occur. WRONG? (Map // can only perserve one value per key, that's why). // Eric: Should the banned instances be added as well? I don't think they should. instance.put(rel.getName(), XMLUtils.getStringByPath(rel.getPath(), element, doc)); } entities.add(instance); } // Find possible keys among all attributes of the input schema for (Attribute attr : schema.getAttributes()) { if (bannedKeys.contains(attr.getName())) { continue; } // For the current attribute, count for each of its text // value, how many such text value has occurred across // all maps (entities) Map<String, Integer> valueMap = new HashMap<String, Integer>(); for (Map<String, String> instance : entities) { String value = instance.get(attr.getName()); Integer count = valueMap.get(value); if (count == null) { count = 1; } else { count++; } valueMap.put(value, count); } // For the current attribute, count the number of text values // that have occurred more than once, in another word, for the // current attribute, count the number of its text values that // have occurred more than once across all instances of the input // schema // // Eric: Is this the right schema? Let's say there's 200 instances // of the schema, and the attribute has 100 unique values. 1 particular // attribute value has occurred in 101 instances of the schema, but the // other 99 attribute values occurred only once. This attribute will // be considered as a key, but is this correct? int nonUnique = 0; for (Map.Entry<String, Integer> entry : valueMap.entrySet()) { Integer count = entry.getValue(); if (count != 1) { nonUnique++; } } // Consider the attribute as a key if the attribute's text value // is unique "enough" (passing the threshold) int total = valueMap.size(); if (nonUnique / (double) total <= uniqunessThreshold) { attr.setKey(true); LogUtils.debug(this.getClass(), schema.getName() + "." + attr.getName() + " is unique"); } } Set<Relation> depromotedRels = new HashSet<Relation>(); for (Relation rel : schema.getRelations()) { if (!(rel.getSchema() instanceof OntologyLink) || bannedKeys.contains(rel.getName())) { continue; } // For the current relation, count for each of its text // value, how many such text value has occurred across // all maps (entities) Map<String, Integer> valueMap = new HashMap<String, Integer>(); for (Map<String, String> instance : entities) { String value = instance.get(rel.getName()); Integer count = valueMap.get(value); if (count == null) { count = 1; } else { count++; } valueMap.put(value, count); } // For the current relation, count the number of text values // that have occurred more than once, in another word, for the // current relation, count the number of its text values that // have occurred more than once across all instances of the input // schema // Eric: Considering relation's text value is the aggregate text // values of all its leaf children, it seems EXTREMELY LIKELY // that relations will almost always be considered as keys. SO IS // THIS RIGHT? WHY CONSIDER RELATIONS AS KEYS? int nonUnique = 0; for (Map.Entry<String, Integer> entry : valueMap.entrySet()) { Integer count = entry.getValue(); if (count != 1) { nonUnique++; } } // Consider the relation as a key if the text value of relation // is unique "enough" (passing the threshold), then convert // such relation to attribute (NOT CLEAR!!! WHAT ABOUT ITS CHILDREN) int total = valueMap.size(); if (nonUnique / (double) total <= uniqunessThreshold) { // Eric: Why are you so sure this is an ontology schema? OntologyLink promotedLeafSchema = (OntologyLink) rel.getSchema(); Set<String> typeURIs = promotedLeafSchema.getTypeURIs(); depromotedRels.add(rel); schema.setTypeURIs(typeURIs); Attribute attr = new Attribute(schema, promotedLeafSchema.getName(), rel.getPath(), true); schema.addAttribute(attr); LogUtils.debug(getClass(), schema.getName() + "." + attr.getName() + " is unique"); } } // Remove such identified key relation from the input, // schema and possibly remove such relation altogether for (Relation rel : depromotedRels) { schema.getRelations().remove(rel); maybeRemoveSchema(rel.getSchema()); } } /* * Helper Function to remove a schema iff this schema * is not a relation of any other schemas */ private void maybeRemoveSchema(Schema schemaToBeRemoved) { for (Schema schema : schemas.values()) { // The if-continue just skips the schema of the // same name if (schema.equals(schemaToBeRemoved)) { continue; } // Now we know the current schema has a // different name for (Relation relation : schema.relations) { if (relation.schema.equals(schemaToBeRemoved)) { return; } } } // Only remove the schema if it's not a relation // of any other schemas // Eric: WRONG, should be schemaToBeRemoved.getName() // schemas.remove(schemaToBeRemoved); schemas.remove(schemaToBeRemoved.getName()); } /* * Step 5. Intra-link schemas */ private void intralinkSchemas(Document doc, double linkingThreshold) throws XPathExpressionException { // Eric: Why is INTERLINKING used under intralinkSchemas()? if (!isStepEnabled(MappingStep.INTERLINKING)) { return; } for (Schema schema : schemas.values()) { // Get all instances of the schema NodeList nl = XMLUtils.getNodesByPath(schema.getPath(), null, doc); for (Attribute attr : schema.getAttributes()) { List<Attribute> matchedAttributes = new LinkedList<Attribute>(); // Eric: THIS IS WRONG FOR "facility", for example, because // the path of the attribute includes "facility"? Design choice? Set<String> propertyValues = XMLUtils.getStringsByPath( schema.getPath() + "/" + attr.getPath(), null, doc); for (Schema targetSchema : schemas.values()) { // Skip current iteration when the two schemas are the same if (targetSchema.equals(attr.getParent())) { continue; } for (Attribute targetAttribute : targetSchema.getAttributes()) { if (!targetAttribute.isKey()) { continue; } if (targetAttribute.equals(attr)) { continue; } Set<String> targetPropertyValues = XMLUtils.getStringsByPath( targetSchema.getPath() + "/" + targetAttribute.getPath(), null, doc); Set<String> sharedValues = org.openjena.atlas.lib.SetUtils.intersection(propertyValues, targetPropertyValues); if (sharedValues.size() / (double) propertyValues.size() >= linkingThreshold) { matchedAttributes.add(targetAttribute); } } } for (Attribute matchedAttribute : matchedAttributes) { Schema taregetSchema = matchedAttribute.getParent(); Set<Attribute> lookupKeys = new HashSet<Attribute>(); // Eric: What's the point of this add? lookupKeys.add(new Attribute(schema, matchedAttribute.getName(), attr.getPath(), false)); Relation rel = new Relation(schema, attr.getName() + "_to_" + matchedAttribute.getName() + "_internal_relation", attr.getPath(), taregetSchema, lookupKeys); schema.addRelation(rel); } } } } private void addEntities(Schema schema, Document mappingRoot, String path, String typePrefix) { if (schema instanceof OntologyLink) { Element entityElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "entity"); entityElement.setAttribute("path", schema.getPath()); entityElement.setAttribute("type", typePrefix + schema.getName()); mappingRoot.getDocumentElement().appendChild(entityElement); Element idElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "id"); idElement.setTextContent(typePrefix + "${" + Entity.AUTO_GENERATED + "}"); entityElement.appendChild(idElement); Element attributeElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "property"); attributeElement.setAttribute("path", "text()"); attributeElement.setAttribute("name", typePrefix + "name_property"); attributeElement.setAttribute("key", "true"); entityElement.appendChild(attributeElement); for (String ontologyURI : ((OntologyLink) schema).getTypeURIs()) { String label = OpenCycOntology.getInstance() .getLabelForResource(ontologyURI); Element ontologyElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "ontology-link"); ontologyElement.setAttribute("uri", ontologyURI); ontologyElement.setAttribute("label", label); entityElement.appendChild(ontologyElement); } } else { Element entityElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "entity"); entityElement.setAttribute("path", schema.getPath()); entityElement.setAttribute("type", typePrefix + schema.getName()); mappingRoot.getDocumentElement().appendChild(entityElement); Element idElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "id"); idElement.setTextContent(typePrefix + "${" + Entity.AUTO_GENERATED + "}"); entityElement.appendChild(idElement); // TODO: reload attributes for (String ontologyURI : schema.getTypeURIs()) { String label = OpenCycOntology.getInstance().getLabelForResource(ontologyURI); Element ontologyElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "ontology-link"); ontologyElement.setAttribute("uri", ontologyURI); ontologyElement.setAttribute("label", label); entityElement.appendChild(ontologyElement); } for (Attribute attribute : schema.getAttributes()) { Element attributeElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "property"); attributeElement.setAttribute("path", attribute.getPath()); attributeElement.setAttribute("name", typePrefix + attribute.getName() + "_property"); attributeElement.setAttribute("key", String.valueOf(attribute.isKey())); for (String ontologyURI : attribute.getTypeURIs()) { Element ontologyElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "ontology-link"); String label = OpenCycOntology.getInstance().getLabelForResource(ontologyURI); ontologyElement.setAttribute("uri", ontologyURI); ontologyElement.setAttribute("label", label); attributeElement.appendChild(ontologyElement); } entityElement.appendChild(attributeElement); } for (Relation relation : schema.getRelations()) { Element relationElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "relation"); relationElement.setAttribute("path", relation.getPath()); relationElement.setAttribute("targetEntity", typePrefix + relation.getSchema().getName()); relationElement.setAttribute("name", typePrefix + relation.getName() + "_rel"); entityElement.appendChild(relationElement); Element lookupElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "lookupkey"); for (Attribute attr : relation.getLookupKeys()) { Element targetPropertyElement = mappingRoot.createElementNS( "http://www.cs.toronto.edu/xml2rdf/mapping/v1", "target-property"); targetPropertyElement.setAttribute("path", attr.getPath()); String name = attr.getName(); String[] nameSplitted = name.split("\\."); String newName = nameSplitted[0]; for (int i = 1; i < nameSplitted.length - 1; i++) { newName += "." + nameSplitted[i] + "_rel"; } if (nameSplitted.length == 1) { newName += "_prop"; } else { newName += nameSplitted[nameSplitted.length - 1] + "_prop"; } targetPropertyElement.setAttribute("name", typePrefix + attr.getName()); lookupElement.appendChild(targetPropertyElement); } relationElement.appendChild(lookupElement); } } } //////////////////////////////////////////////////////////////////////////// // Old Functions no longer in use //////////////////////////////////////////////////////////////////////////// /* * The functions below are NOT used currently and possibly obsolete! */ private void intralinkSchemasOld(Document doc, double linkingThreshold) throws XPathExpressionException { for (Schema schema : schemas.values()) { NodeList nl = XMLUtils.getNodesByPath(schema.getPath(), null, doc); for (Attribute attr : schema.getAttributes()) { Map<Attribute, Integer> attributeMatchMap = new HashMap<Attribute, Integer>(); attributeLoop: for (int i = 0; i < nl.getLength(); i++) { if (nl.item(i) instanceof Element) { Element entityElement = (Element) nl.item(i); Set<String> propertyValues = XMLUtils.getStringsByPath( attr.getPath(), entityElement, doc); for (Schema targetSchema : schemas.values()) { for (Attribute targetAttribute : targetSchema.getAttributes()) { if (!targetAttribute.isKey()) { continue; } if (targetAttribute.equals(attr)) { continue; } NodeList valueNodeList = XMLUtils.getNodesByPath( targetSchema.getPath() + "/" + targetAttribute.getPath(), null, doc); for (int j = 0; j < valueNodeList.getLength(); j++) { Node node = valueNodeList.item(j); if (propertyValues.contains(node.getTextContent().trim())) { Integer count = attributeMatchMap.get(targetAttribute); if (count == null) { count = 0; } attributeMatchMap.put(targetAttribute, count + 1); continue attributeLoop; } } } } } } Attribute matchedAttribute = null; for (Map.Entry<Attribute, Integer> entry : attributeMatchMap.entrySet()) { if (entry.getValue() / (double) nl.getLength() >= linkingThreshold) { matchedAttribute = entry.getKey(); break; } } if (matchedAttribute != null) { Schema taregetSchema = matchedAttribute.getParent(); Set<Attribute> lookupKeys = new HashSet<Attribute>(); lookupKeys.add(new Attribute(schema, matchedAttribute.getName(), attr.getPath(), false)); Relation rel = new Relation(schema, attr.getName() + "_interanl_relation", attr.getPath(), taregetSchema, lookupKeys); schema.addRelation(rel); } } } } private boolean isRelationOneToOne(Document doc, Schema schema, Relation rel) throws XPathExpressionException { Map<Set<String>, Set<Set<String>>> relMap = new HashMap<Set<String>, Set<Set<String>>>(); Map<Set<String>, Set<Set<String>>> reverseRelMap = new HashMap<Set<String>, Set<Set<String>>>(); String path = schema.getPath(); NodeList entitiesNL = XMLUtils.getNodesByPath(path, null, doc); for (int i = 0; i < entitiesNL.getLength(); i++) { Element entityElement = (Element) entitiesNL.item(i); Set<String> entityValue = new HashSet<String>(XMLUtils.getAllLeaveValues(entityElement)); NodeList relationsNL = XMLUtils.getNodesByPath(rel.getPath(), entityElement, doc); for (int j = 0; j < relationsNL.getLength(); j++) { Set<String> relValue = new HashSet<String>( XMLUtils.getAllLeaveValues((Element) relationsNL.item(j))); Set<Set<String>> entitySet = relMap.get(relValue); if (entitySet == null) { entitySet = new HashSet<Set<String>>(); relMap.put(relValue, entitySet); } entitySet.add(entityValue); Set<Set<String>> relSet = reverseRelMap.get(entityValue); if (relSet == null) { relSet = new HashSet<Set<String>>(); reverseRelMap.put(entityValue, relSet); } relSet.add(relValue); if (entitySet.size() > 1 || relSet.size() > 1) { LogUtils.debug(getClass(), schema + " . " + rel + " is not one to one because of " + relValue); return false; } } } // for (Map.Entry<Set<String>, Set<Set<String>>> relEntry: relMap.entrySet()) { // if (relEntry.getValue().size() > 1) { // LogUtils.debug(getClass(), schema + " . " + rel + " is not one to one because of " + relEntry); // return false; // } // } // // for (Map.Entry<Set<String>, Set<Set<String>>> entityEntry: reverseRelMap.entrySet()) { // if (entityEntry.getValue().size() > 1) { // LogUtils.debug(getClass(), schema + " . " + rel + " is not one to one because of " + entityEntry); // return false; // } // } return relMap.size() > 0 && reverseRelMap.size() > 0; } private double getEntropyOfRelation(Document doc, Schema schema, Relation rel) throws XPathExpressionException { Map<Set<String>, Set<Set<String>>> relMap = new HashMap<Set<String>, Set<Set<String>>>(); String path = schema.getPath(); NodeList entitiesNL = XMLUtils.getNodesByPath(path, null, doc); for (int i = 0; i < entitiesNL.getLength(); i++) { Element entityElement = (Element) entitiesNL.item(i); Set<String> entityValue = new HashSet<String>( XMLUtils.getAllLeaveValues(entityElement)); NodeList relationsNL = XMLUtils.getNodesByPath(rel.getPath(), entityElement, doc); for (int j = 0; j < relationsNL.getLength(); j++) { Set<String> relValue = new HashSet<String>( XMLUtils.getAllLeaveValues((Element) relationsNL.item(j))); Set<Set<String>> entitySet = relMap.get(relValue); if (entitySet == null) { entitySet = new HashSet<Set<String>>(); relMap.put(relValue, entitySet); } entitySet.add(entityValue); } } int sum = 0; for (Map.Entry<Set<String>, Set<Set<String>>> entry : relMap.entrySet()) { sum += entry.getValue().size(); } double entropy = 0; for (Map.Entry<Set<String>, Set<Set<String>>> entry : relMap.entrySet()) { double p = entry.getValue().size() / (double) sum; entropy += -p * Math.log(p); } return entropy; } }