/*
* Copyright (c) 2013, University of Toronto.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package edu.toronto.cs.xml2rdf.mapping.generator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xeustechnologies.googleapi.spelling.SpellChecker;
import org.xeustechnologies.googleapi.spelling.SpellResponse;
import edu.toronto.cs.xml2rdf.freebase.FreeBaseLinker;
import edu.toronto.cs.xml2rdf.mapping.Entity;
import edu.toronto.cs.xml2rdf.mapping.generator.MappingGenerator.MappingStep;
import edu.toronto.cs.xml2rdf.opencyc.OpenCycOntology;
import edu.toronto.cs.xml2rdf.string.StringMetric;
import edu.toronto.cs.xml2rdf.utils.DependencyDAG;
import edu.toronto.cs.xml2rdf.utils.DisjointSet;
import edu.toronto.cs.xml2rdf.utils.LogUtils;
import edu.toronto.cs.xml2rdf.xml.XMLUtils;
public class DemoMappingGenerator implements MappingGenerator {
// attributeMap = Map<"attribute name", "Map of unique attribute instances">
// "Map of unique attribute instances" = Map<ID, "unique attribute text value">
//
// A map of attributes, with the key being the name of the
// attribute, and the value being a map of "unique" instances
// of the attribute. The uniqueness of the instance is defined
// by the uniqueness of its text value. The instance map has
// unique ID as its key and the attribute text value as its value.
private Map<String, Map<Integer, String>> attributeMap = new HashMap<String, Map<Integer, String>>();
// attributeMapR = Map<"attribute name", "Map of unique attribute instances">
// "Map of unique attribute instances" = Map<"unique attribute text value", ID>
//
// The same as above, except the key and the value of the instance map are switched.
// This is done so to quickly look up the unique ID of an attribute text value.
private Map<String, Map<String, Integer>> attributeMapR = new HashMap<String, Map<String, Integer>>();
// "entity" here means non-attribute elements.
// relationsMap = Map<"entity name", "Map of unique entity instances">
// "Map of unique entity instances" = Map<ID, "Map of unique combination of entities and attributes">
// "Map of unique combination of entities and attributes" = Map<"attribute/entity name", "A set of unique instance ID's">
//
// The uniqueness of an entity instance is defined by the unique combination of its child entity and attribute instances,
// which is in turn defined by the uniqueness of each child entity or attribute instance.
private Map<String, Map<Integer, Map<String, Set<Integer>>>> relationsMap = new HashMap<String, Map<Integer, Map<String, Set<Integer>>>>();
// The same as above, except the key and the value of the entity map are switched.
// This is done so to quickly look up the unique ID of an entity instance.
private Map<String, Map<Map<String, Set<Integer>>, Integer>> relationsMapR = new HashMap<String, Map<Map<String, Set<Integer>>, Integer>>();
// Flag for printing debugging information
static boolean debug = false;
// Ceilings
private int maxElement;
private int maxOnotlogyLookup;
// Mapping essentials
Map<String, Schema> schemas = new HashMap<String, Schema>();
private List<MappingStep> enabledSteps;
// Metrics
private StringMetric stringMetric;
private SchemaSimilarityMetic schemaSimMetric;
// All thresholds
private double ontologyMatchingThreshold;
private double schemaSimThreshold;
private int leafPromotionThreshold = 5;
private double matchThreshold = 0.75;
private double ignoredNumbers = 0.25;
private int minimumNumberOfAttributeToMerges = 2;
private double intralinkingThreshold;
/*
* Constructor that initialize all threshold parameters.
* TODO: Design algorithms to estimate the thresholds so that they do not
* need to be manually assigned.
*/
public DemoMappingGenerator(double ontologyMatchingThreshold,
StringMetric stringMetric, double schemaSimThreshold,
SchemaSimilarityMetic schemaSimMetric,
int leafPromotionThreshold, double matchThreshold,
int maxElement, int maxOntologyLookup,
double ignoredNumbers,
int minimumNumberOfAttributeToMerges,
double internalLinkingThreshold, MappingStep... enabledSteps) {
this.ontologyMatchingThreshold = ontologyMatchingThreshold;
this.schemaSimMetric = schemaSimMetric;
this.stringMetric = stringMetric;
this.schemaSimThreshold = schemaSimThreshold;
this.matchThreshold = matchThreshold;
this.leafPromotionThreshold = leafPromotionThreshold;
this.maxElement = maxElement;
this.maxOnotlogyLookup = maxOntologyLookup;
this.ignoredNumbers = ignoredNumbers;
this.minimumNumberOfAttributeToMerges = minimumNumberOfAttributeToMerges;
this.intralinkingThreshold = internalLinkingThreshold;
this.enabledSteps = Arrays.asList(
enabledSteps == null || enabledSteps.length == 0
? MappingStep.values() : enabledSteps);
}
@Override
public Document generateMapping(Element rootDoc, String typePrefix) {
// The organization of the XML files should have "clinical_studies" as the
// very root document element (which is passed in as rootDoc), with many
// "clinical_study" child nodes, which is the children variable below.
NodeList children = rootDoc.getChildNodes();
// Iterate through all child nodes or up to the maximum number specified,
// and process (merge) ONLY child nodes that are elements.
for (int i = 0; i < children.getLength() && (maxElement == -1 || i < maxElement); i++) {
if (children.item(i) instanceof Element) {
// Get the child element instance
Element child = (Element) children.item(i);
// Merge the child element instance
mergeWithSchema(child);
}
}
// The function is not flattening the schema just yet.
// It is now only returning the One-to-One relations of the schema.
Map<String, Set<String>> OTOMap = flattenSchema();
// Print the One-to-One relation
for (String key : OTOMap.keySet()) {
System.out.println(key + "\t" + OTOMap.get(key));
}
// Debug code for printing maps of maps of maps. Please ignore.
//
// for (String k1 : relationsMap.keySet()) {
// if (k1.equals("clinical_study")){
// System.out.println(k1);
// Map<Integer, Map<String, Set<Integer>>> v1 = relationsMap.get(k1);
// for (Integer k2 : v1.keySet()) {
// System.out.println("\t" + k2);
// Map<String, Set<Integer>> v2 = v1.get(k2);
// for (String k3 : v2.keySet()) {
// System.out.println("\t\t" + k3);
// Set<Integer> v3 = v2.get(k3);
// System.out.println("\t\t\t" + v3);
// }
// }
// }
// }
//
// System.out.println("----------");
//
// for (String k1 : attributeMap.keySet()) {
// System.out.println(k1);
// Map<Integer, String> v1 = attributeMap.get(k1);
// for (Integer k2 : v1.keySet()) {
// System.out.println("\t" + k2 + "\t" + v1.get(k2));
// }
// }
//
// System.out.println(relationsMapR.get("intervention_browse"));
// System.out.println(attributeMapR.get("mesh_term").get("Omeprazole"));
return null;
}
// Generate a schematic/instance view of the document
//
// Example:
//
// XML Document:
//
// <clinical_study>
// <location>
// <facility>
// <name>Eric Yao</name>
// <name>Jia Xian Yao</name>
// <phone>123456</phone>
// </facility>
// <country>Canada</country>
// </location>
// </clinical_study>
//
// <clinical_study>
// <location>
// <facility>
// <name>Oktie Hassanzadeh</name>
// <phone>654321</phone>
// </facility>
// <country>Canada</country>
// </location>
// <location>
// <facility>
// <name>Soheil Hassas Yeganeh</name>
// <phone>123456</phone>
// </facility>
// <country>Canada</country>
// </location>
// </clinical_study>
//
// Generate Maps by Java
//
// attributeMap = {
// [ "name", ( <1, "Eric Yao">, <2, "Jia Xian Yao">, <3, "Oktie Hassanzadeh">, <4, "Soheil Hassas Yeganeh"> ) ],
// [ "phone", ( <1, "123456">, <2, "654321"> ) ],
// [ "country", ( <1, "Canada"> ) ]
// }
//
// relationsMap = {
// [ "facility",
// ( < 1, { ["A^name", (1, 2)], ["A^phone", (1)] } >,
// < 2, { ["A^name", (3) ], ["A^phone", (2)] } >,
// < 3, { ["A^name", (4) ], ["A^phone", (1)] } >
// )
// ],
// [ "location",
// ( < 1, { ["R^facility", (1)], ["A^country", (1)] } >,
// < 2, { ["R^facility", (2)], ["A^country", (2)] } >,
// < 3, { ["R^facility", (3)], ["A^country", (1)] } >
// )
// ],
// [ "clinical_study",
// ( < 1, { ["R^location", (1)] } >,
// < 2, { ["R^location", (2, 3)] } >
// )
// ]
// }
//
// A couple of things to note here:
//
// (1) Notice only unique attribute text values are stored, see <country> or <phone> as an examples.
// (2) An entity can have child combination of only entities <clinical_study>, only attributes <facility>, and both <location>.
// (3) All instances of the child elements are stored in Set<Integer>, see how two instances of <location> are stored under
// <clinical_study>, or two instances of <name> are stored under <facility>.
//
private String mergeWithSchema(Element element) {
if (XMLUtils.isLeaf(element)) {
// Base case, the element is a leaf node and thus an attribute
// Get the attribute name and attribute text value
String name = element.getNodeName();
String value = element.getTextContent();
// The unique ID of the attribute text value
int id;
// The attribute instance map and its reverse map of the
// current attribute
Map<Integer, String> attributeValues = attributeMap.get(name);
Map<String, Integer> attributeValuesR;
if (attributeValues == null) {
// It is the first encounter of an attribute with
// its name, no instance map has been created yet
// Create instance map and its reverse map
attributeValues = new HashMap<Integer, String>();
attributeValuesR = new HashMap<String, Integer>();
// Unique ID always starts at 1
id = 1;
// Initialize instance/reverse map
attributeValues.put(id, value);
attributeValuesR.put(value, id);
// Add attribute name (key) and the instance/reverse map (value)
// to their corresponding map
attributeMap.put(name, attributeValues);
attributeMapR.put(name, attributeValuesR);
} else {
// The attribute of its name has already been
// encountered with instance map retrieved.
// Retrieve the reverse instance map to check
// if the attribute text value has been added,
// and to look up its unique ID
attributeValuesR = attributeMapR.get(name);
if (!attributeValuesR.containsKey(value)) {
// The attribute text value is new and thus unique
// Assign the unique ID for the unique text value
id = attributeValuesR.size() + 1;
// Add the unique ID and text values to
// instance/reverse map
attributeValues.put(id, value);
attributeValuesR.put(value, id);
} else {
// The attribute text value already exists
// in the instance/reverse map
// Retrieve the unique ID
id = attributeValuesR.get(value);
}
}
// Return a string containing the attribute name and the unique instance ID
String retStr = "A^" + name + ":" + id; // "A^" stands for "Attribute".
return retStr;
} else {
// Recursive case, the element is not a leaf node
// Get the children of the current element, possibly
// a combination of attributes and entities
NodeList children = element.getChildNodes();
// childrenMap = Map<"attribute/entity name", "A set of its unique instance ID's">
// A map of the combination of the child attribute and entity instances
Map<String, Set<Integer>> childrenMap = new HashMap<String, Set<Integer>>();
// Iterate through all child nodes and process only those that are elements.
for (int i = 0; i < children.getLength(); i++) {
if (children.item(i) instanceof Element) {
// Get the child element instance
Element child = (Element) children.item(i);
// Merge the child element instance and get
// the a returning string of the form "A^/R^name:ID",
// with "A^/R^" indicating if the child element
// is an attribute or entity instance, and "ID" being
// the unique instance ID
String childStr = mergeWithSchema(child);
// Get the child element name and its unique instance ID
int index = childStr.indexOf(":");
String childName = childStr.substring(0, index);
int childId = Integer.parseInt(childStr.substring(index + 1));
// Get the set containing the unique instance ID's
// of the child element
Set<Integer> childSet = childrenMap.get(childName);
if (childSet == null) {
// This is the first encounter of the child element
// with its name. Create the empty set.
childSet = new HashSet<Integer>();
}
// Add the unique instance ID to the set and update the map
childSet.add(childId);
childrenMap.put(childName, childSet);
}
}
// Now that we have gone through all the child elements (attribute and entity
// instances), childrenMap now documents the different child attribute and entity
// elements that exists (schematic view), as well as all their unique instances
// (instance view)
// Get the name of the current entity element
String name = element.getNodeName();
// The unique ID of the current entity instance
int id;
// Get the entity instance map of the current entity element
Map<Integer, Map<String, Set<Integer>>> currRelationMap = relationsMap.get(name);
Map<Map<String, Set<Integer>>, Integer> currRelationMapR;
if (currRelationMap == null) {
// This is the first encounter of the entity with its name
// Create the entity instance/reverse map
currRelationMap = new HashMap<Integer, Map<String, Set<Integer>>>();
currRelationMapR = new HashMap<Map<String, Set<Integer>>, Integer>();
// The unique ID starts at 1
id = 1;
// Initialize entity instance/reverse map
currRelationMap.put(id, childrenMap);
currRelationMapR.put(childrenMap, id);
// Put them in their corresponding map
relationsMap.put(name, currRelationMap);
relationsMapR.put(name, currRelationMapR);
} else {
// Entity with its name has already been added, meaning
// there exists an entity instance map (perhaps waiting
// to be updated)
// Retrieve the reverse entity instance map to check
// if the current entity instance already exists, and
// to look up its unique ID
currRelationMapR = relationsMapR.get(name);
if (!currRelationMapR.containsKey(childrenMap)) {
// The combination of attribute and entity instances
// of the current entity instance has not been added
// Create the new unique ID
id = currRelationMapR.size() + 1;
// Update the entity instance/reverse map
currRelationMap.put(id, childrenMap);
currRelationMapR.put(childrenMap, id);
} else {
// The combination of attribute and entity instances
// of the current entity instance already exists, this
// means that, there is another instance of this entity
// that has EXACTLY THE SAME combination of attribute
// and entity instances (not just schematic, but also
// the SAME instances)
// Retrieve the unique ID of the already-existed entity instance
id = currRelationMapR.get(childrenMap);
}
}
// Return a string containing the entity name and the unique instance ID
String retStr = "R^" + name + ":" + id; // "R" stands for "Relation".
return retStr;
}
// Phew, DONE!
}
// For now, find one-to-one relation between entities.
//
// For a parent entity and its child entity (not child attribute) to have
// one-to-one relation, they must satisfy two rules.
//
// (1) For each unique parent entity instance, there can be only one unique
// child entity instance.
// (2) For each unique child entity instance, it must belong to only one
// unique parent entity instance. That is, if two unique parent entity
// instance both have the same unique child entity instance, they do
// not have the one-to-one relation.
//
private Map<String, Set<String>> flattenSchema() {
// OTOMap = Map<"parent entity name", "a set of its child entity name whose relation is one-to-one">
Map<String, Set<String>> OTOMap = new HashMap<String, Set<String>>();
// Iterate through all entity element names (schematic view)
for (String name : relationsMap.keySet()) {
// Get the entity instance map
Map<Integer, Map<String, Set<Integer>>> instances = relationsMap.get(name);
// A list of banned child entity names because they violate either of the two rules
List<String> bannedRels = new ArrayList<String>();
// OTORels = <"child entity name", "a set of its unique instance ID's">
//
// This is to check if the same instance of the child entity has appeared
// under different instances of the parent entity
Map<String, Set<Integer>> OTORels = new HashMap<String, Set<Integer>>();
// Iterate through all child entity instances
for (Integer id : instances.keySet()) {
Map<String, Set<Integer>> instance = instances.get(id);
// Get the child entity name (schematic view) and only
// process when it is not an attribute
for (String relName : instance.keySet()) {
if (relName.startsWith("R^")) {
// Only process when the entity element is not yet banned
if (!bannedRels.contains(relName)) {
// Get the number of instances this child entity element has
// occurred under this particular parent entity element
Set<Integer> relIds = instance.get(relName);
if (relIds.size() > 1) {
// More than one unique instance of the child entity
// element are found under the same parent entity element.
// Rule 1 is violated.
// Add the child entity name to the banned list
bannedRels.add(relName);
// Remove the child entity from the one-to-one relation map
OTORels.remove(relName);
} else {
// Only one unique instance of the child entity element
// is found under the same parent entity element.
// Get the set of child entity instances encountered so far
Set<Integer> OTOIds = OTORels.get(relName);
if (OTOIds == null) {
// This is the first encounter.
// Update the one-to-one relation map
OTORels.put(relName, relIds);
} else {
// The child entity element has been encountered before
if (!OTOIds.addAll(relIds)) {
// The same instance of the child entity element
// exists under another differnt instance of the
// parent entity element. Rule 2 is violated.
// Add the child entity name to the banned list
bannedRels.add(relName);
// Remove the child entity from the one-to-one relation map
OTORels.remove(relName);
}
}
}
}
}
}
}
// All those remained are child entity elements that share one-to-one relation
// with the current parent entity element.
if (!OTORels.isEmpty()) {
OTOMap.put(name, OTORels.keySet());
}
}
return OTOMap;
}
}