/*
* Copyright (c) 2013, University of Toronto.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may obtain
* a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package edu.toronto.cs.xml2rdf.mapping.generator;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import edu.toronto.cs.xml2rdf.freebase.FreeBaseLinker;
import edu.toronto.cs.xml2rdf.mapping.Entity;
import edu.toronto.cs.xml2rdf.opencyc.OpenCycOntology;
import edu.toronto.cs.xml2rdf.string.StringMetric;
import edu.toronto.cs.xml2rdf.utils.DependencyDAG;
import edu.toronto.cs.xml2rdf.utils.DisjointSet;
import edu.toronto.cs.xml2rdf.utils.LogUtils;
import edu.toronto.cs.xml2rdf.xml.XMLUtils;
import java.util.concurrent.ConcurrentHashMap;
//import org.xeustechnologies.googleapi.spelling.SpellChecker;
//import org.xeustechnologies.googleapi.spelling.SpellResponse;
/*
* This is a "dummy" implementation of MappingGenerator interface.
* TODO: Perhaps we can implement a brand new (not so dummy) class
* that modularize each mapping step.
*/
/**
* @author Soheil Hassas Yeganeh <soheil@cs.toronto.edu>
*/
public class DummyMappingGenerator implements MappingGenerator {
// Flag for printing debugging information
static boolean debug = true;
// Ceilings
// Maximum number of child elements processed in generateMapping;
// -1 means unlimited.
private final int maxElement;
// Maximum number of ontology lookups performed per leaf path;
// -1 means unlimited.
// NOTE(review): field name contains a typo ("Onotlogy") but is referenced
// elsewhere in this class, so it cannot be renamed in isolation.
private final int maxOnotlogyLookup;
// Mapping essentials
// One Schema per unique element node name, keyed by that name.
Map<String, Schema> schemas = new ConcurrentHashMap<String, Schema>();
// Mapping steps the user has enabled; see isStepEnabled().
private final List<MappingStep> enabledSteps;
// Metrics
private final StringMetric stringMetric;
private final SchemaSimilarityMetic schemaSimMetric;
// All thresholds
// Minimum string-similarity score for an ontology type match.
private final double ontologyMatchingThreshold;
// Minimum schema-similarity score for two schemas to be merged as duplicates.
private final double schemaSimThreshold;
// Minimum number of distinct leaf values required to promote a leaf to an
// OntologyLink schema.
private int leafPromotionThreshold = 5;
// Minimum fraction of looked-up terms that must share a type for that type
// to be accepted (see findTypeInOntology).
private double matchThreshold = 0.75;
// Maximum tolerated ratio of digits within a term before it is skipped.
private double ignoredNumbers = 0.25;
// Minimum attribute count before a schema participates in duplicate removal.
private int minimumNumberOfAttributeToMerges = 2;
// Threshold used by the intra-linking step (step 5).
private final double intralinkingThreshold;
/*
 * Constructor that initialize all threshold parameters.
 *
 * Parameters map one-to-one onto the fields documented above; when
 * enabledSteps is null or empty, ALL mapping steps are enabled.
 *
 * TODO: Design algorithms to estimate the thresholds so that they do not
 * need to be manually assigned.
 */
public DummyMappingGenerator(double ontologyMatchingThreshold,
StringMetric stringMetric, double schemaSimThreshold,
SchemaSimilarityMetic schemaSimMetric,
int leafPromotionThreshold, double matchThreshold,
int maxElement, int maxOntologyLookup,
double ignoredNumbers,
int minimumNumberOfAttributeToMerges,
double internalLinkingThreshold, MappingStep... enabledSteps) {
this.ontologyMatchingThreshold = ontologyMatchingThreshold;
this.schemaSimMetric = schemaSimMetric;
this.stringMetric = stringMetric;
this.schemaSimThreshold = schemaSimThreshold;
this.matchThreshold = matchThreshold;
this.leafPromotionThreshold = leafPromotionThreshold;
this.maxElement = maxElement;
this.maxOnotlogyLookup = maxOntologyLookup;
this.ignoredNumbers = ignoredNumbers;
this.minimumNumberOfAttributeToMerges = minimumNumberOfAttributeToMerges;
this.intralinkingThreshold = internalLinkingThreshold;
// Default to every step when the caller supplies none.
this.enabledSteps = Arrays.asList(
enabledSteps == null || enabledSteps.length == 0
? MappingStep.values() : enabledSteps);
}
/*
 * The root/mother function that calls all mapping step functions.
 *
 * Pipeline: (1) merge each child element of rootDoc into a per-name Schema,
 * (2) flatten one-to-one relations, (3) remove duplicate schemas,
 * (4) find keys for each schema, (5) intra-link schemas; finally the
 * schemas are emitted as a mapping Document in dependency order.
 * Returns null if a new Document cannot be created.
 *
 * TODO: Modularize this function.
 */
@Override
public Document generateMapping(Element rootDoc, String typePrefix) {
// Capture time at each step
long start;
long end;
// The organization of the XML files should have "clinical_studies" as the
// very root document element (which is passed in as rootDoc), with many
// "clinical_study" child nodes, which is the children variable below.
NodeList children = rootDoc.getChildNodes();
System.out.println(children.getLength());
// Step 1. Merge the child element nodes and their associated schemas
start = System.currentTimeMillis();
// Iterate through all child nodes or up to the maximum number specified,
// and process (merge) ONLY child nodes that are elements.
for (int i = 0; i < children.getLength() && (maxElement == -1 || i < maxElement); i++) {
if (children.item(i) instanceof Element) {
// Get the child element node.
Element child = (Element) children.item(i);
String name = child.getNodeName();
// Create a schema for this child element node if one with the same node name does not exist.
// Consequently, there will be only one schema for each unique node name.
// The path of the schema is the ABSOLUTE path to the child element node, starting with "/"
// and the root element node name, such as "/clinical_studies/clinical_study".
Schema schema = schemas.get(name);
if (schema == null) {
// Eric: What if child nodes have the same name but at different layers of the
// XML file and thus different path? Only the first path is used?
schema = new Schema(null, child, "/" + rootDoc.getNodeName() + "/" + name);
schemas.put(name, schema);
}
// Merge the child element node with its schema, that is, the schema of the same name
try {
mergeWithSchema(child, schema);
} catch (Exception e) {
if (debug) {
e.printStackTrace();
}
}
}
}
end = System.currentTimeMillis();
System.out.println("Execution time of step 1 : schema merge was " + (end - start) + " ms.");
// NOTE(review): cacheInstances currently only reads the map and has no
// visible effect — confirm whether it is a leftover stub.
cacheInstances(schemas, rootDoc);
// Step 2. Flatten the schema
start = System.currentTimeMillis();
try {
// Eric: the flat threshold is always 1?
flattenSchema(rootDoc.getOwnerDocument(), 1);
} catch (XPathExpressionException e1) {
if (debug) {
e1.printStackTrace();
}
}
end = System.currentTimeMillis();
System.out.println("Execution time of step 2 : schema flatten was " + (end - start) + " ms.");
// Step 3. Remove duplicates
start = System.currentTimeMillis();
removeDuplicates();
end = System.currentTimeMillis();
System.out.println("Execution time of step 3 : duplicate remove was " + (end - start) + " ms.");
// Step 4. Find a possible key for each identified schema
start = System.currentTimeMillis();
try {
for (Schema schema : schemas.values()) {
try {
// Eric: uniqunessThreshold is always 0.0d?
// TODO: find a way to automate this.
findKeysForSchema(schema, rootDoc.getOwnerDocument(), 0.0d);
} catch (XPathExpressionException e) {
if (debug) {
e.printStackTrace();
}
}
}
} catch (java.util.ConcurrentModificationException e1) {
if (debug) {
e1.printStackTrace();
}
}
end = System.currentTimeMillis();
System.out.println("Execution time of step 4 : key identification was " + (end - start) + " ms.");
// Step 5. Intra-link schemas
start = System.currentTimeMillis();
try {
intralinkSchemas(rootDoc.getOwnerDocument(), intralinkingThreshold);
} catch (XPathExpressionException e1) {
if (debug) {
e1.printStackTrace();
}
}
end = System.currentTimeMillis();
System.out.println("Execution time of step 5 : intra-linking was " + (end - start) + " ms.");
// Build a dependency DAG (schema depends on each of its relation targets)
// so entities can be emitted bottom-up.
DependencyDAG<Schema> dependecyDAG = new DependencyDAG<Schema>();
for (Schema schema : schemas.values()) {
dependecyDAG.addNode(schema);
// TODO(oktie): Haaji this is duplicate of the code on line 904.
for (Relation rel : schema.getRelations()) {
if (!schemas.containsKey(rel.getSchema())) {
LogUtils.error(DummyMappingGenerator.class,
"ERRRRRRRRRRRRR! " + rel.getSchema() + " Does not exist. " + rel);
}
}
}
for (Schema schema : schemas.values()) {
for (Relation rel : schema.getRelations()) {
dependecyDAG.addDependency(schema, rel.getSchema());
}
}
// Emit one <mapping> document; entities are added in topological order so
// that each schema's dependencies are emitted before the schema itself.
Document mappingRoot = null;
try {
mappingRoot = DocumentBuilderFactory.newInstance()
.newDocumentBuilder().newDocument();
Element rootElement = mappingRoot.createElementNS(
"http://www.cs.toronto.edu/xml2rdf/mapping/v1", "mapping");
mappingRoot.appendChild(rootElement);
while (dependecyDAG.size() != 0) {
Schema schema = dependecyDAG.removeElementWithNoDependency();
addEntities(schema, mappingRoot, "", typePrefix);
}
} catch (ParserConfigurationException e) {
if (debug) {
e.printStackTrace();
}
}
return mappingRoot;
}
/*
 * Intended hook for pre-caching schema instances after the merge step.
 *
 * Currently a no-op: the original implementation iterated the map, stored
 * each key into a dead local, and discarded the value — no state was read
 * beyond the entries and nothing was written. The dead loop has been
 * removed; behavior is unchanged.
 * TODO(review): either implement the instance caching or remove this hook
 * and its call site in generateMapping.
 */
private void cacheInstances(Map<String, Schema> schemas, Element rootDoc) {
// Fix: removed a loop whose body was a dead store (entry.getKey()) and a
// discarded getter call (entry.getValue()).
}
/*
 * Checking if the mapping step is toggled by the user.
 *
 * A step counts as enabled when it appears in the configured step list
 * (which defaults to all steps; see the constructor).
 */
public boolean isStepEnabled(MappingStep step) {
// indexOf uses the same equals-based search as contains.
return enabledSteps.indexOf(step) >= 0;
}
/*
 * Builds a SchemaInstance for the given element and registers it in the
 * schema's instance cache.
 *
 * Returns the new instance, or null when construction failed with an
 * IOException — callers must tolerate a null result.
 */
SchemaInstance createSchemaInstance(Element element, Schema schema) {
SchemaInstance instance = null;
try {
instance = new SchemaInstance(element);
schema.instances.add(instance);
} catch (IOException e) {
// Fix: the exception was silently swallowed; report it when debugging,
// consistent with the error handling used throughout this class.
if (debug) {
e.printStackTrace();
}
}
return instance;
}
/*
 * Step 1. Merge the schemas
 *
 * Recursively merges one XML element (and its subtree) into the given
 * schema. Non-leaf children become Relations whose target schemas are
 * merged recursively; leaf children become either plain Attributes or —
 * when enough distinct text values match an ontology type (see
 * findTypeInOntology) — promoted OntologyLink relations.
 *
 * Returns the SchemaInstance created for "element" (may be null when
 * instance creation failed; see createSchemaInstance). Throws
 * SchemaException when the element's node name does not match the schema's.
 */
private SchemaInstance mergeWithSchema(Element element, Schema schema)
throws SchemaException, XPathExpressionException {
// Cache the instance.
SchemaInstance instance = createSchemaInstance(element, schema);
// Set the schema name, if null, to the name of the element;
// or check if the two names are the same, as they should be
// Eric: I believe this is unnecessary and should be removed
String schemaName = schema.getName();
if (schemaName == null) {
schema.setName(element.getNodeName());
} else {
if (!schema.getName().equals(element.getNodeName())) {
throw new SchemaException("Schema element names do not match.");
}
}
// Never merge leaf element nodes.
//
// Eric: Technically, this "if statement" will always be true because
// if the element if a leaf, then "mergeWithSchema" function will never
// be called on this leaf element in the first place
if (!XMLUtils.isLeaf(element)) {
// Get all the (immediate next level) child nodes of the given element node
NodeList children = element.getChildNodes();
// Iterate through all child nodes, but process
// ONLY those that are elements
for (int i = 0; i < children.getLength(); i++) {
if (children.item(i) instanceof Element) {
// Process child element node that is NOT a leaf element node.
if (!XMLUtils.isLeaf(children.item(i))) {
// Get the non-leaf child element node, which means it has
// leaf (and possibly non-leaf) child element nodes under it
Element child = (Element) children.item(i);
// The boolean value to indicate if a previous instance of this
// non-leaf child element with the same name has already been
// processed/merged.
boolean found = false;
// Find out if this non-leaf child element already exists
// in parent element's relations, meaning that a previous
// instance of the non-leaf child element with the same name
// has already been processed and put into the relations of
// the parent element.
//
// If so, merge this instance of the non-leaf child element
// with the already consolidated associated schema, during
// which new relations or attributes might be added to this
// schema
for (Relation childRelation : schema.getRelations()) {
if (childRelation.getName().equals(child.getNodeName())) {
SchemaInstance childInstance
= mergeWithSchema(child, childRelation.getSchema());
createRelationInstnace(childRelation, instance, childInstance);
found = true;
break;
}
}
// This is the first encounter of the non-leaf child element
// with this node name
if (!found) {
// Get the name of the non-leaf child element node
String name = child.getNodeName();
// Create the path, which is the ABSOLUTE path to this
// non-leaf child element node, starting with "/"
String path = schema.getPath() + "/" + name;
// Create a schema for this non-leaf child element node,
// if none exists yet
Schema childSchema = schemas.get(name);
if (childSchema == null) {
// Eric: Why not set the parent to the current schema?
childSchema = new Schema(null, child, path);
schemas.put(child.getNodeName(), childSchema);
}
// Merge this non-leaf child element node first before
// further processing this node
SchemaInstance childInstance
= mergeWithSchema(child, childSchema);
// Create the lookupKeys for the creation of relation later
// This is essentially a list of all leaf elements that
// exist under the current child node
Set<Attribute> lookupKeys = new HashSet<Attribute>();
// Get the list of RELATIVE path to all leaf element nodes
// of the current non-leaf child element node, with path
// starting with the name of the current non-leaf child
// element node (and not "/"), and ending with the name
// of the leaf element nodes
List<String> leaves = XMLUtils.getAllLeaves(child);
// Iterate through all paths to the leaf element nodes
for (String leafPath : leaves) {
// Get the name of the current LEAF element node
int lastNodeIndex = leafPath.lastIndexOf('/');
String lastNodeName = leafPath.substring(lastNodeIndex + 1);
// Create leafName by simply replacing all "/" with "."
String leafName = leafPath.replace('/', '.');
// Append ".name" to the end of leafName if the current
// leaf element node has been promoted and has an
// OntologyLink schema associated with it
//
// Eric: Is it correct to say that the ONLY case where
// lastNodeSchema is NOT null is when the child node has
// been promoted, which means lastNodeSchema is ALWAYS
// an OntologyLink schema?
Schema lastNodeSchema = schemas.get(lastNodeName);
if (lastNodeSchema instanceof OntologyLink) {
// Eric: Why ".name"? What's the meaning behind this?
leafName += ".name";
}
// Create leafPath through removing the name of the parent non-leaf
// element node at the beginning, along with the "/", and then append
// "/text()" at the end of the leafPath.
//
// This is essentially the RELATIVE path to the TEXT VALUE of the
// current leaf element node under the parent non-leaf element node,
// and this path will be understood correctly by XPath
leafPath = leafPath.replaceAll("^" + child.getNodeName() + "/?", "");
// Eric: Why would leafPath ever be empty anyways? It must at least
// contain the name of the LAEF node.
leafPath = leafPath.length() > 0 ? leafPath + "/text()" : "text()";
// Create an entry to the lookupKeys, which keeps track of the parent
// non-leaf element node's schema, the name and the RELATIVE path to
// all the TEXT VALUES of the leaf element nodes under it, and whether
// these element nodes are keys or not
//
// Eric: I'm still unclear about the answer to the email question
// regarding the lookupKeys (Question 1.2).
lookupKeys.add(new Attribute(schema, leafName, leafPath, false));
}
// Eric: Why is path (the third parameter) set to name?
// Set the parent-child (schema-childSchema) relation, with lookupKeys essentially
// a list of LEAF nodes of the child (childSchema) and their parent is set to
// schema
// One can think of the path to the childSchema as schema.getPath() + "/" + name
// (name is the name of the childSchema)
Relation relation = new Relation(schema, name, name, childSchema, lookupKeys);
schema.addRelation(relation);
createRelationInstnace(relation, instance, childInstance);
}
} // Process child element node that IS INDEED a leaf element node
else {
// Get the leaf child element and its name
Element child = (Element) children.item(i);
String name = child.getNodeName();
// Get the ABSOLUTE path to the leaf child element,
// starting with "/"
String path = schema.getPath() + "/" + name;
// Find out if a previous instance of the leaf child element
// with the same name has already been added to the attributes
// or relations. Since the leaf child element has no children,
// the previous instance will be exactly the same as the current
// instance (structure-wise), the current instance does not need
// to be processed anymore.
boolean found = false;
for (Attribute childAttribute : schema.getAttributes()) {
if (childAttribute.getName().equals(child.getNodeName())) {
found = true;
break;
}
}
for (Relation childRelation : schema.getRelations()) {
if (childRelation.getSchema() instanceof OntologyLink
&& childRelation.getName().equals(child.getNodeName())) {
found = true;
break;
}
}
// If no previous instance has found, which means this is the first
// encounter of the leaf child node with this name
if (!found) {
LogUtils.debug(this.getClass(), "searching in ontology for " + path);
// values contains all the text values of the elements with the same ABSOLUTE path
Set<String> values = new HashSet<String>();
// types contains all typeIDs (above threshold) based on the above text values
Set<String> types = findTypeInOntology(path, element.getOwnerDocument(),
values, matchThreshold, ignoredNumbers);
// If types contains some typeIDs and values contains enough text values
// Eric: What's the significance of values.size() >= leafPromotionThreshold since
// values are merely the different text values of the current leaf node?
if (types != null && types.size() > 0 && values.size() >= leafPromotionThreshold) {
LogUtils.debug(this.getClass(), "Types found for " + element + " " + types);
// Find out if a previous instance of the leaf child
// element with the same name has already been processed
found = false;
// If a previous instance of the leaf child element has already been
// processed and added to parent's relation, merge the current instance
// of the leaf child node
//
// Eric: This leaf child node will NEVER get merged because mergeWithSchema
// function only process non-leaf elements. Is this correct?
for (Relation childRelation : schema.getRelations()) {
if (childRelation.getName().equals(child.getNodeName())) {
mergeWithSchema(child, childRelation.getSchema());
found = true;
break;
}
}
// If no previous instance of the leaf child element is found
// and this is the first encounter of the leaf child element with
// this node name
if (!found) {
// Create a schema for the current leaf child element,
// if none exists yet
OntologyLink childSchema = (OntologyLink) schemas.get(child.getNodeName());
if (childSchema == null) {
childSchema = new OntologyLink(null, child, path, types);
schemas.put(child.getNodeName(), childSchema);
}
// Merge the current leaf child element before further processing
//
// Eric: Again, this leaf child element will NEVER get merged
// because mergeWithSchema function only process non-leaf elements.
// Is this correct?
SchemaInstance childInstance
= mergeWithSchema(child, childSchema);
// Eric: Because the current child element is a leaf, it does NOT contain
// other child elements, which means the list leaves contains ONLY ONE
// string, which is the name of the current child element, and consequently
// the set lookupKeys contains ONLY ONE attribute, with its path being "text()".
// Is this the correct understanding? Once again, I'm not sure why lookupKeys
// are needed.
//
// Eric: The following relation creation process is exactly the same as before,
// which I believe should and must be simplified for the reasoning above.
Set<Attribute> lookupKeys = new HashSet<Attribute>();
// Eric: Would this just return child itself as it is the leaf element?
List<String> leaves = XMLUtils.getAllLeaves(child);
for (String leafPath : leaves) {
int lastNodeIndex = leafPath.lastIndexOf('/');
String lastNodeName = leafPath.substring(lastNodeIndex + 1);
// Eric: Here, the lastNodeSchema is actually just the schema
// of the CURRENT child, which is the OntologyLink just created
// above. Is this the intention?
// FIXME: The following code can be simplified for the reasoning above.
Schema lastNodeSchema = schemas.get(lastNodeName);
String leafName = leafPath.replace('/', '.');
if (lastNodeSchema instanceof OntologyLink) {
leafName += ".name";
}
leafPath = leafPath.replaceAll("^" + child.getNodeName() + "/?", "");
leafPath = leafPath.length() > 0
? leafPath + "/text()" : "text()";
lookupKeys.add(new Attribute(schema, leafName, leafPath,
false));
}
Relation relation = new Relation(schema, name, name, childSchema, lookupKeys);
createRelationInstnace(relation, instance, childInstance);
}
} // If the current leaf child node is not promoted, make it an attribute
else {
// The attribute is created with path being the ABSOLUTE path
// to the TEXT VALUE of the leaf child node
//
// Eric: Why use "setPath" when name and path can be set when
// the attribute is initialized
Attribute attribute = new Attribute(schema, name, path, false);
attribute.setName(child.getNodeName());
attribute.setPath(child.getNodeName() + "/text()");
schema.addAttribute(attribute);
createAttributeInstance(attribute, instance, child);
// Even when the leaf is not promoted to an OntologyLink, any type
// URIs discovered for it are still recorded on the attribute.
if (types != null && types.size() != 0) {
LogUtils.debug(this.getClass(),
"Types found for " + element + " " + types);
attribute.setTypeURIs(types);
}
}
}
}
}
}
}
return instance;
}
/*
 * Builds an AttributeInstance linking the attribute's element to its owning
 * schema instance, and registers it on the attribute.
 *
 * Returns the new instance, or null when construction failed with an
 * IOException — callers must tolerate a null result.
 */
private AttributeInstance createAttributeInstance(Attribute attribute,
SchemaInstance schemaInstance, Element attributeElement) {
AttributeInstance instance = null;
try {
instance = new AttributeInstance(schemaInstance, attributeElement);
attribute.addInstance(instance);
} catch (IOException e) {
// Fix: the exception was silently swallowed; report it when debugging,
// consistent with the error handling used throughout this class.
if (debug) {
e.printStackTrace();
}
}
return instance;
}
/*
 * Wraps the two endpoints in a RelationInstance, registers it on the
 * relation, and returns it.
 * NOTE: the method name's "Instnace" typo is kept because existing callers
 * in this class use it.
 */
private RelationInstance createRelationInstnace(Relation relation,
SchemaInstance from, SchemaInstance to) {
final RelationInstance relationInstance = new RelationInstance(from, to);
relation.addInstance(relationInstance);
return relationInstance;
}
/*
 * Step 1. Merge the schemas - Helper Function
 *
 * Looks up the distinct text values of every node matching the given
 * absolute path in FreeBase, and returns the set of type IDs that were
 * returned for at least matchThreshold of the visited terms. The distinct
 * terms encountered are accumulated into visitedTerms (an out-parameter
 * read by the caller for the promotion decision). Returns an empty set
 * when the INTRALINKING step is disabled.
 */
private Set<String> findTypeInOntology(String path, Document doc,
Set<String> visitedTerms, double matchThreshold, double ignoredNumebers)
throws XPathExpressionException {
// Perform ontology finding only if "INTRALINKING" is enabled
if (!isStepEnabled(MappingStep.INTRALINKING)) {
return new HashSet<String>();
}
// OpenCycOntology currently is NOT in use
OpenCycOntology ontology = OpenCycOntology.getInstance();
// Instantiate FreeBaseLinker
FreeBaseLinker freebase = new FreeBaseLinker();
// Get all instances of the nodes with the same ABSOLUTE path.
// This means all the nodes have the same node name and they must
// be all leaf element nodes since findTypeInOntology only calls
// on leaf element nodes
NodeList nl = XMLUtils.getNodesByPath(path, null, doc);
// For each String typeIDs, count how many Integer times they are
// returned from freebase
Map<String, Integer> commonTypes = new HashMap<String, Integer>();
// Count how many times no typeIDs is returned for a text value
int count = 0;
// Iterate through all instances of nodes with the same ABSOLUTE path
for (int i = 0; i < nl.getLength()
&& (maxOnotlogyLookup == -1 || i < maxOnotlogyLookup); i++) {
// Break all iterations if there are too many times (count) where no typeIDs is returned,
// or enough different typeIDs have already returned.
//
// NOTE(review): the && condition contradicts the comment above — as
// written, the loop only stops when BOTH more than 100 lookups failed
// AND fewer than 100 distinct types were found. Confirm whether "||"
// (or an inverted size check) was intended.
if (count > 100 && commonTypes.size() < 100) {
break;
}
// Skip the current iteration if the text value of the
// current instance has already been processed
// Eric: This "term" is incorrect for the Patent data
String term = nl.item(i).getTextContent();
// Eric: The above "term" is wrong because it includes in-tag attributes,
// and the "term" ends up being "Candy holderd2e53", which should've been "Candy holder".
// The following line of code fixes the problem.
// String term = nl.item(i).getChildNodes().item(0).getNodeValue();
if (visitedTerms.contains(term)) {
continue;
}
// If not, the text value of the current instance is
// added to the visistedTerms, and these added terms
// are not processed, such as having digits removed, etc
visitedTerms.add(term);
// Skip the current iteration if the text value of the current instance is empty,
// longer than 50 characters, or consists entirely of digits
if (term.trim().length() == 0 || term.length() > 50 || term.matches("^\\d+$")) {
continue;
}
// Remove all digits from the text value of the current instance
String withoutNumbers = term.trim().replaceAll("\\d", "");
// Skip the current iteration (once again) if the ratio of the length of digits over
// the total length of the text value is too high
if ((term.length() - withoutNumbers.length()) / (double) term.length()
>= ignoredNumebers) {
continue;
}
// A set that holds all freebase typeIDs that look something
// like "http://rdf.freebase.com/rdf/music.release"
Set<String> types = new HashSet<String>(); //ontology.findTypesForResource(term, stringMetric, ontologyMatchingThreshold);
// Get the list of typeIDs based on the text value, and the typeIDs look like the following:
// "http://rdf.freebase.com/rdf/music.release"
Set<String> freebaseTypes = freebase.findTypesForResource(term,
stringMetric, ontologyMatchingThreshold);
// Add all typeIDs if freebaseTypes is not null
if (freebaseTypes != null) {
types.addAll(freebaseTypes);
}
// If no typeIds is added and the length of the term is less than 20,
// which could mean that there might be spelling mistakes
if (types.size() == 0 && term.length() < 20) {
// Get the Google spell checker and get the spell response
// SpellChecker checker = new SpellChecker();
// SpellResponse spellResponse = checker.check(term);
// If there are spell corrections
// if (spellResponse.getCorrections() != null
// && spellResponse.getCorrections().length > 0) {
// Get the spell checked text value
//
// Eric: It seems like only one word is returned for text values of any length,
// so for example, "Daniel Aradi MD PhD" is spell checked as "Abadi", which is
// obviously wrong
// term = "";
// for (int j = 0; j < spellResponse.getCorrections().length; j++) {
// term += spellResponse.getCorrections()[j].getValue().split("\t")[0];
// }
// Try add typeIDs based on the new spell-checked text value
//
// NOTE(review): with the spell checker commented out, "term" is never
// rewritten, so this retry re-queries FreeBase with the SAME term.
if (term.length() > 0) {
types = new HashSet<String>(); //.findTypesForResource(term, stringMetric, ontologyMatchingThreshold);
freebaseTypes = freebase.findTypesForResource(term, stringMetric,
ontologyMatchingThreshold);
if (freebaseTypes != null) {
types.addAll(freebaseTypes);
}
}
// }
}
// Skip the current iteration if still no typeIDs is found
if (types.size() == 0) {
count++;
continue;
}
// Count for each typeID, the number of times it has occurred
for (String type : types) {
Integer typeCount = commonTypes.get(type);
if (typeCount == null) {
typeCount = 0;
}
typeCount++;
commonTypes.put(type, typeCount);
}
// if (commonTypes == null) {
// commonTypes = types;
// } else {
// Set<String> tempCommonTypes = SetUtils.intersection(commonTypes, types);
// if (tempCommonTypes.size() == 0) {
// count++;
// } else {
// commonTypes = tempCommonTypes;
// }
// }
}
// double ratio = (visitedTerms.size() - count) / visitedTerms.size();
// Keep only the typeIDs that occurred for at least matchThreshold of the
// visited terms.
Set<String> types = new HashSet<String>();
for (Map.Entry<String, Integer> entry : commonTypes.entrySet()) {
// System.out.println("Score for " + entry.getKey() + " is " + entry.getValue() / (double) visitedTerms.size());
if (entry.getValue() / (double) visitedTerms.size() >= matchThreshold) {
types.add(entry.getKey());
}
}
// System.err.println("returning " + types + " for " + path);
return types;
}
/*
 * Step 2. Flatten the schemas
 *
 * Builds a dependency DAG over all schemas (a schema depends on each of its
 * relation targets), then processes schemas in topological order, merging
 * every one-to-one relation into its parent via flattenRelation.
 * NOTE(review): the flatThreshold parameter is currently unused; the caller
 * always passes 1.
 */
private void flattenSchema(Document doc, double flatThreshold)
throws XPathExpressionException {
// Only perform schema flattening if enabled
if (!isStepEnabled(MappingStep.SCHEMA_FLATTENING)) {
return;
}
// Register every schema as a node first, then add the edges, so that both
// endpoints of every dependency already exist.
DependencyDAG<Schema> dependecyDAG = new DependencyDAG<Schema>();
for (Schema schema : schemas.values()) {
dependecyDAG.addNode(schema);
}
for (Schema schema : schemas.values()) {
for (Relation rel : schema.getRelations()) {
dependecyDAG.addDependency(schema, rel.getSchema());
}
}
// Flatten bottom-up: repeatedly take a schema with no remaining
// dependencies and collapse its one-to-one relations into it.
while (dependecyDAG.size() != 0) {
Schema schema = dependecyDAG.removeElementWithNoDependency();
// Fix: removed a stray "schema.getAttributes();" call whose result was
// discarded (flagged in review as serving no purpose).
Set<Relation> oneToOneRelations = findOneToOneRelations(doc, schema);
for (Relation rel : oneToOneRelations) {
LogUtils.debug(getClass(), "is one to one : " + schema + " . " + rel);
flattenRelation(schema, rel);
}
}
}
/*
 * Step 2. Flatten the schemas - Helper Function
 *
 * Collects every relation of the schema whose multiplicity is one-to-one.
 * Returns an empty set when the schema has no such relations.
 * NOTE: the doc parameter is unused but kept for interface compatibility.
 */
private Set<Relation> findOneToOneRelations(Document doc, Schema schema)
throws XPathExpressionException {
Set<Relation> result = new HashSet<Relation>();
// A schema with no relations simply falls through and yields the empty
// set, so no separate early-return is needed.
for (Relation candidate : schema.getRelations()) {
if (candidate.isOneToOne()) {
result.add(candidate);
}
}
return result;
}
/*
 * Step 2. Flatten the schemas - Helper Function
 *
 * Collapses the one-to-one relation "rel" into "schema": the target
 * schema's attributes and relations are re-parented onto "schema" (names
 * prefixed with the target schema's name, paths re-rooted through the
 * relation's path), the relation itself is removed from the schema, and
 * the now-orphaned target schema is dropped if no other schema refers to
 * it. An OntologyLink target is demoted back to a plain attribute that
 * carries the ontology type URIs.
 */
private void flattenRelation(Schema schema, Relation rel) {
Schema targetSchema = rel.getSchema();
// The promoted (relational) leaf node is now demoted
// back to an attribute because of one-to-one'ness
if (targetSchema instanceof OntologyLink) {
// Fix: removed the dead store (name was initialized to "name" and then
// immediately overwritten).
String name = targetSchema.getName();
String path = rel.getPath() + "/text()";
Attribute attr = new Attribute(schema, name, path, false);
attr.setTypeURIs(targetSchema.getTypeURIs());
schema.addAttribute(attr);
attr.setParent(schema);
}
// Add attributes of the relation to the schema,
// with modification to attributes' name, path, and
// parent schema
for (Attribute attr : targetSchema.getAttributes()) {
// Prefix the name with the flattened schema's name and re-root the
// path through the relation's path. (Arguments are evaluated before
// the setters run, so the original values are read.)
attr.setName(targetSchema.getName() + "_" + attr.getName());
attr.setPath(rel.getPath() + "/" + attr.getPath());
schema.addAttribute(attr);
attr.setParent(schema);
}
// Add relations of the relation to the schema,
// with modification to relations' name, path, and
// lookupKey attributes
for (Relation targetRel : targetSchema.getRelations()) {
// Path must be rewritten before the rename, since it is built from the
// relation's ORIGINAL name.
targetRel.setPath(rel.getPath() + "/" + targetRel.getName());
targetRel.setName(targetSchema.getName() + "_" + targetRel.getName());
schema.addRelation(targetRel);
targetRel.setParent(schema);
// Eric: Shouldn't we also update the parent schema to the new one?
for (Attribute lookupKey : targetRel.getLookupKeys()) {
lookupKey.setPath(rel.getPath() + "/" + lookupKey.getPath());
lookupKey.setName(lookupKey.getName().replace(rel.getName() + ".",
rel.getName() + "_"));
}
}
// Now that we port over all relations and attributes of the relation
// to its one-to-one parent schema, remove this relation and complete
// the flatten process
schema.getRelations().remove(rel);
// Remove the relation schema altogether, iff this schema is not a
// relation of any other schemas
maybeRemoveSchema(targetSchema);
}
/*
 * Step 3. Remove duplicates
 *
 * Groups similar schemas with a union-find (disjoint-set) structure: every
 * pair whose similarity meets schemaSimThreshold is unioned, then each
 * resulting group of two or more schemas is merged into a single schema
 * (see mergeSchemas), with all relations re-pointed at the merged schema.
 */
private void removeDuplicates() {
if (!isStepEnabled(MappingStep.DUPLICATE_REMOVAL)) {
return;
}
// The value of dSets is a hierarchy set of schemas that are considered to be
// similar or duplicates
Map<Schema, DisjointSet<Schema>> dSets = new HashMap<Schema, DisjointSet<Schema>>();
for (Schema schema : schemas.values()) {
DisjointSet<Schema> set = new DisjointSet<Schema>(schema);
dSets.put(schema, set);
}
// TODO: Better way to detect duplicate pairs? For example, instead of iterating all
// possible schema pairs, only compare those that are at the same level because its
// unlikely the very top schema will be similar to that of almost leaf schemas
for (Schema schema1 : schemas.values()) {
for (Schema schema2 : schemas.values()) {
// Skip the current schema pair if they are the same, if they do not have
// enough attributes, or if schema1 name > schema2 name to avoid inspecting
// <schema1, schema2> and <schema2, schema1>
if (schema1 == schema2
|| schema1.getAttributes().size() < minimumNumberOfAttributeToMerges
|| schema2.getAttributes().size() < minimumNumberOfAttributeToMerges
|| schema1.getName().compareTo(schema2.getName()) > 0) {
continue;
}
// TODO: better similarity schema
double similarity = schemaSimMetric.getSimiliarity(schema1, schema2);
if (similarity >= schemaSimThreshold) {
dSets.get(schema1).union(dSets.get(schema2));
LogUtils.info(this.getClass(), "Merging " + schema1 + " with " + schema2);
}
}
}
// Drain dSets group by group: pick any remaining schema, gather every
// schema in its union-find tree, and merge them if the group has size > 1.
while (dSets.size() > 0) {
Set<Schema> listOfSchemas = new HashSet<Schema>();
Schema schema = dSets.keySet().iterator().next();
listOfSchemas.add(schema);
DisjointSet<Schema> dset = dSets.remove(schema);
DisjointSet<Schema> root = dset.find();
for (DisjointSet<Schema> set : root.getChildren()) {
Schema similarSchema = set.getData();
if (!schema.equals(similarSchema)) {
listOfSchemas.add(similarSchema);
// Remove group members so they are not revisited by the outer loop.
dSets.remove(similarSchema);
}
}
if (listOfSchemas.size() > 1) {
Schema newSchema = mergeSchemas(listOfSchemas);
// Replace old relation schema with the merged one
for (Schema oldSchema : schemas.values()) {
for (Relation rel : oldSchema.getRelations()) {
if (listOfSchemas.contains(rel.getSchema())) {
rel.setSchema(newSchema);
}
}
}
// Remove all the pre-merged schemas
for (Schema s : listOfSchemas) {
for (Attribute attr : s.getAttributes()) {
attr.setParent(newSchema);
}
schemas.remove(s.getName());
}
// Place the new merged schema
schemas.put(newSchema.getName(), newSchema);
}
}
}
/*
* Step 3. Remove duplicates - Helper Function
*/
/**
 * Step 3 helper: merges the given schemas into one new schema whose
 * attribute and relation sets are the unions of the members', whose path
 * is the "|"-joined member paths, and whose name is the "_or_"-joined
 * member names. The merged schema is created with a {@code null} parent;
 * the caller is responsible for re-parenting attributes and registering
 * the result (see {@code removeDuplicates}).
 *
 * @param listOfSchemas the non-empty set of schemas to merge
 * @return the merged schema
 * @throws IllegalArgumentException if the set is empty (the original code
 *         would have thrown StringIndexOutOfBoundsException instead)
 */
private Schema mergeSchemas(Set<Schema> listOfSchemas) {
  if (listOfSchemas.isEmpty()) {
    throw new IllegalArgumentException("Cannot merge an empty set of schemas.");
  }
  Set<Attribute> attributes = new HashSet<Attribute>();
  Set<Relation> relations = new HashSet<Relation>();
  // Build path/name with StringBuilder instead of String += in a loop;
  // prepending the separator from the second member on yields exactly the
  // same strings as appending and trimming the trailing separator.
  StringBuilder path = new StringBuilder();
  StringBuilder name = new StringBuilder();
  for (Schema s : listOfSchemas) {
    attributes.addAll(s.getAttributes());
    relations.addAll(s.getRelations());
    if (path.length() > 0) {
      path.append("|");
    }
    path.append(s.getPath());
    if (name.length() > 0) {
      name.append("_or_");
    }
    name.append(s.getName());
  }
  Schema schema = new Schema(null, name.toString(), path.toString());
  schema.setAttributes(attributes);
  schema.setRelations(relations);
  return schema;
}
/*
* Step 4. Find a possible key for each identified schema
*/
/**
 * Step 4: inspects every instance of {@code schema} in {@code doc} and
 * marks as keys those attributes (and ontology-linked relations) whose
 * values are sufficiently unique across instances. A relation found to be
 * key-like is demoted to an attribute of the schema and removed as a
 * relation. OntologyLink schemas are skipped entirely.
 *
 * @param schema the schema whose attributes/relations are examined
 * @param doc the XML document containing the schema instances
 * @param uniqunessThreshold maximum allowed fraction of non-unique values
 *        for a field to still count as a key (0 = perfectly unique)
 * @throws XPathExpressionException if any of the XPath lookups fail
 */
private void findKeysForSchema(Schema schema, Document doc,
double uniqunessThreshold) throws XPathExpressionException {
// A set of maps, with each map corresponds to one instance of
// the input schema. The keys of the map are the names of all
// relations and attributes of the input schema, and the values
// of the map are the text values of relations or attributes.
Set<Map<String, String>> entities = new HashSet<Map<String, String>>();
if (schema instanceof OntologyLink) {
return;
}
// A set of schema names (either relation schemas or attribute
// schemas of the current schema) that are NOT keys, because some
// instance carried more than one occurrence of them.
Set<String> bannedKeys = new HashSet<String>();
// Get all instances of the input schema, such as all
// instances of "/clinical_studies/clinical_study"
NodeList entityNL = XMLUtils.getNodesByPath(schema.getPath(), null, doc);
// Iterate through all instances of the input schema, inspect its
// attributes and relations, find those that cannot be keys (that is,
// one schema instance has more than one attribute/relation schemas
// with the same name), and fill variable entities defined above
// TODO: This loop can be made faster!!! Think!!!
for (int i = 0; i < entityNL.getLength(); i++) {
// The keys of the map are the names of all relations and
// attributes of the current instance of the input schema, and
// the values of the map are the text values of these relations
// or attributes.
HashMap<String, String> instance = new HashMap<String, String>();
// Get the current instance of the input schema
Element element = (Element) entityNL.item(i);
// For each attribute, find all its instances under the current
// instance of the input schema, identify those that cannot be
// keys, and fill the variable instance defined above
for (Attribute attr : schema.getAttributes()) {
// Skip if the current attribute is already banned
if (bannedKeys.contains(attr.getName())) {
continue;
}
// Ban the attribute if the current instance does not have exactly
// one occurrence of it.
NodeList attributeNL = XMLUtils.getNodesByPath(attr.getPath(), element, doc);
if (attributeNL.getLength() != 1) {
bannedKeys.add(attr.getName());
attr.setKey(false);
}
// Fill the variable instance defined above with the text value of the attribute.
// NOTE(review): a Map keeps only one value per key, so only one text value
// is recorded even when several occur. Also, unlike the relation loop below,
// there is no `continue` after banning, so a just-banned attribute still gets
// recorded here — harmless, since banned names are skipped in the counting
// phase, but the asymmetry looks unintentional; confirm before changing.
instance.put(attr.getName(), XMLUtils.getStringByPath(attr.getPath(), element, doc));
}
// For each relation, find all its instances under the current
// instance of the input schema, identify those that cannot be
// keys, and fill the variable instance defined above
for (Relation rel : schema.getRelations()) {
// Skip if the current relation is already banned
if (bannedKeys.contains(rel.getName())) {
continue;
}
// Ban the relation if the current instance does not have exactly
// one occurrence of it (and skip recording its value).
// NodeList relNL = XMLUtils.getNodesByPath(rel.getPath() + "/text()", element, doc);
NodeList relNL = XMLUtils.getNodesByPath(rel.getPath(), element, doc);
if (relNL.getLength() != 1) {
bannedKeys.add(rel.getName());
continue;
}
// Fill the variable instance defined above with the text value of the relation,
// which is the text values of all its leaf children.
// NOTE(review): only one text value is recorded per relation name, because
// a Map keeps one value per key; see the matching note in the attribute loop.
instance.put(rel.getName(), XMLUtils.getStringByPath(rel.getPath(), element, doc));
}
entities.add(instance);
}
// Find possible keys among all attributes of the input schema
for (Attribute attr : schema.getAttributes()) {
if (bannedKeys.contains(attr.getName())) {
continue;
}
// For the current attribute, count for each of its text
// values how many times that text value occurred across
// all recorded instances (entities)
Map<String, Integer> valueMap = new HashMap<String, Integer>();
for (Map<String, String> instance : entities) {
String value = instance.get(attr.getName());
Integer count = valueMap.get(value);
if (count == null) {
count = 1;
} else {
count++;
}
valueMap.put(value, count);
}
// Count the number of DISTINCT text values that occurred more than
// once across all instances of the input schema.
//
// NOTE(review): this measures the fraction of distinct values that repeat,
// not the fraction of instances with repeated values. E.g. with 200
// instances and 100 distinct values where one value covers 101 instances
// and the other 99 appear once each, nonUnique/total = 1/100 and the
// attribute still passes as a key. Confirm this is the intended metric.
int nonUnique = 0;
for (Map.Entry<String, Integer> entry : valueMap.entrySet()) {
Integer count = entry.getValue();
if (count != 1) {
nonUnique++;
}
}
// Consider the attribute as a key if the attribute's text value
// is unique "enough" (passing the threshold)
int total = valueMap.size();
if (nonUnique / (double) total <= uniqunessThreshold) {
attr.setKey(true);
LogUtils.debug(this.getClass(), schema.getName() + "." + attr.getName() + " is unique");
}
}
// Relations (pointing to OntologyLink schemas only) that turn out to be
// key-like; they are demoted to attributes and removed below.
Set<Relation> depromotedRels = new HashSet<Relation>();
for (Relation rel : schema.getRelations()) {
// Only ontology-linked, non-banned relations are candidates.
if (!(rel.getSchema() instanceof OntologyLink)
|| bannedKeys.contains(rel.getName())) {
continue;
}
// For the current relation, count for each of its text
// values how many times that text value occurred across
// all recorded instances (entities)
Map<String, Integer> valueMap = new HashMap<String, Integer>();
for (Map<String, String> instance : entities) {
String value = instance.get(rel.getName());
Integer count = valueMap.get(value);
if (count == null) {
count = 1;
} else {
count++;
}
valueMap.put(value, count);
}
// Count the number of DISTINCT relation text values that occurred more
// than once across all instances of the input schema (same metric as
// for attributes above).
// NOTE(review): a relation's text value aggregates all its leaf values,
// which makes repeats unlikely and relations prone to passing as keys;
// confirm that treating relations as keys is intended.
int nonUnique = 0;
for (Map.Entry<String, Integer> entry : valueMap.entrySet()) {
Integer count = entry.getValue();
if (count != 1) {
nonUnique++;
}
}
// Consider the relation as a key if its text value is unique "enough"
// (passing the threshold), then demote it to an attribute of this
// schema, copying the ontology type URIs onto the schema itself.
// NOTE(review): the relation's own children are not carried over here;
// unclear whether that is intentional.
int total = valueMap.size();
if (nonUnique / (double) total <= uniqunessThreshold) {
// The cast is safe because of the instanceof filter at the top of
// this loop.
OntologyLink promotedLeafSchema = (OntologyLink) rel.getSchema();
Set<String> typeURIs = promotedLeafSchema.getTypeURIs();
depromotedRels.add(rel);
schema.setTypeURIs(typeURIs);
Attribute attr = new Attribute(schema, promotedLeafSchema.getName(), rel.getPath(), true);
schema.addAttribute(attr);
LogUtils.debug(getClass(), schema.getName() + "." + attr.getName() + " is unique");
}
}
// Remove each demoted relation from the schema, and drop its target
// schema entirely if no other schema still references it.
for (Relation rel : depromotedRels) {
schema.getRelations().remove(rel);
maybeRemoveSchema(rel.getSchema());
}
}
/*
* Helper Function to remove a schema iff this schema
* is not a relation of any other schemas
*/
/**
 * Removes {@code schemaToBeRemoved} from the schema registry if and only
 * if no other schema still references it through one of its relations.
 * Note that {@code schemas} is keyed by schema name, hence the
 * remove-by-name at the end.
 */
private void maybeRemoveSchema(Schema schemaToBeRemoved) {
  for (Schema owner : schemas.values()) {
    // A schema does not keep itself alive; only references from
    // differently-named schemas count.
    if (owner.equals(schemaToBeRemoved)) {
      continue;
    }
    for (Relation relation : owner.relations) {
      if (relation.schema.equals(schemaToBeRemoved)) {
        // Still the target of someone else's relation: keep it.
        return;
      }
    }
  }
  // Nobody references the candidate anymore; drop it by name.
  schemas.remove(schemaToBeRemoved.getName());
}
/*
* Step 5. Intra-link schemas
*/
/**
 * Step 5: links attributes of one schema to key attributes of other
 * schemas when a large-enough fraction ({@code linkingThreshold}) of the
 * attribute's values also appear as values of the target key attribute.
 * Each match becomes a new internal {@code Relation} on the source schema.
 *
 * Fixes vs. the original: removed an unused {@code NodeList nl} local
 * whose XPath evaluation over every schema instance was pure wasted work,
 * and corrected the misspelled local {@code taregetSchema}.
 *
 * @param doc the XML document the schemas were induced from
 * @param linkingThreshold minimum fraction of shared values required
 * @throws XPathExpressionException if any XPath lookup fails
 */
private void intralinkSchemas(Document doc, double linkingThreshold)
    throws XPathExpressionException {
  // NOTE(review): the INTERLINKING step flag guards this *intra*-linking
  // phase; the naming mismatch is preserved from the original code.
  if (!isStepEnabled(MappingStep.INTERLINKING)) {
    return;
  }
  for (Schema schema : schemas.values()) {
    for (Attribute attr : schema.getAttributes()) {
      List<Attribute> matchedAttributes = new LinkedList<Attribute>();
      // All text values of this attribute across every instance of the
      // schema in the document.
      Set<String> propertyValues = XMLUtils.getStringsByPath(
          schema.getPath() + "/" + attr.getPath(), null, doc);
      for (Schema targetSchema : schemas.values()) {
        // Never link an attribute back to its own schema.
        if (targetSchema.equals(attr.getParent())) {
          continue;
        }
        for (Attribute targetAttribute : targetSchema.getAttributes()) {
          // Only key attributes are valid link targets.
          if (!targetAttribute.isKey()) {
            continue;
          }
          if (targetAttribute.equals(attr)) {
            continue;
          }
          Set<String> targetPropertyValues = XMLUtils.getStringsByPath(
              targetSchema.getPath() + "/" + targetAttribute.getPath(), null, doc);
          Set<String> sharedValues
              = org.openjena.atlas.lib.SetUtils.intersection(propertyValues,
                  targetPropertyValues);
          // If propertyValues is empty this yields NaN, and NaN >= t is
          // false, so empty attributes simply never match.
          if (sharedValues.size() / (double) propertyValues.size() >= linkingThreshold) {
            matchedAttributes.add(targetAttribute);
          }
        }
      }
      // Materialize one internal relation per matched key attribute. The
      // lookup key re-describes the matched attribute under this schema's
      // path so the mapping can resolve the target entity.
      for (Attribute matchedAttribute : matchedAttributes) {
        Schema targetSchema = matchedAttribute.getParent();
        Set<Attribute> lookupKeys = new HashSet<Attribute>();
        lookupKeys.add(new Attribute(schema, matchedAttribute.getName(),
            attr.getPath(), false));
        Relation rel = new Relation(schema, attr.getName() + "_to_"
            + matchedAttribute.getName() + "_internal_relation", attr.getPath(),
            targetSchema, lookupKeys);
        schema.addRelation(rel);
      }
    }
  }
}
/**
 * Serializes {@code schema} into the mapping XML document as an
 * {@code <entity>} element (with {@code <id>}, {@code <property>},
 * {@code <ontology-link>}, {@code <relation>} and {@code <lookupkey>}
 * children) appended to the document root. OntologyLink schemas get a
 * single synthetic key property over their text content; regular schemas
 * emit one property per attribute and one relation element per relation.
 *
 * @param schema the schema to serialize
 * @param mappingRoot the mapping document being built
 * @param path unused; kept for interface compatibility with callers
 * @param typePrefix namespace-like prefix applied to all generated names
 */
private void addEntities(Schema schema, Document mappingRoot, String path,
    String typePrefix) {
  // The mapping vocabulary namespace, used for every created element.
  final String ns = "http://www.cs.toronto.edu/xml2rdf/mapping/v1";
  if (schema instanceof OntologyLink) {
    Element entityElement = mappingRoot.createElementNS(ns, "entity");
    entityElement.setAttribute("path", schema.getPath());
    entityElement.setAttribute("type", typePrefix + schema.getName());
    mappingRoot.getDocumentElement().appendChild(entityElement);
    Element idElement = mappingRoot.createElementNS(ns, "id");
    idElement.setTextContent(typePrefix + "${" + Entity.AUTO_GENERATED + "}");
    entityElement.appendChild(idElement);
    // Ontology-linked leaf schemas expose their own text content as a
    // single synthetic key property.
    Element attributeElement = mappingRoot.createElementNS(ns, "property");
    attributeElement.setAttribute("path", "text()");
    attributeElement.setAttribute("name", typePrefix + "name_property");
    attributeElement.setAttribute("key", "true");
    entityElement.appendChild(attributeElement);
    for (String ontologyURI : ((OntologyLink) schema).getTypeURIs()) {
      String label = OpenCycOntology.getInstance()
          .getLabelForResource(ontologyURI);
      Element ontologyElement = mappingRoot.createElementNS(ns, "ontology-link");
      ontologyElement.setAttribute("uri", ontologyURI);
      ontologyElement.setAttribute("label", label);
      entityElement.appendChild(ontologyElement);
    }
  } else {
    Element entityElement = mappingRoot.createElementNS(ns, "entity");
    entityElement.setAttribute("path", schema.getPath());
    entityElement.setAttribute("type", typePrefix + schema.getName());
    mappingRoot.getDocumentElement().appendChild(entityElement);
    Element idElement = mappingRoot.createElementNS(ns, "id");
    idElement.setTextContent(typePrefix + "${" + Entity.AUTO_GENERATED + "}");
    entityElement.appendChild(idElement);
    // TODO: reload attributes
    for (String ontologyURI : schema.getTypeURIs()) {
      String label
          = OpenCycOntology.getInstance().getLabelForResource(ontologyURI);
      Element ontologyElement = mappingRoot.createElementNS(ns, "ontology-link");
      ontologyElement.setAttribute("uri", ontologyURI);
      ontologyElement.setAttribute("label", label);
      entityElement.appendChild(ontologyElement);
    }
    // One <property> per attribute, carrying its path, prefixed name,
    // key flag, and any ontology links of its own.
    for (Attribute attribute : schema.getAttributes()) {
      Element attributeElement = mappingRoot.createElementNS(ns, "property");
      attributeElement.setAttribute("path", attribute.getPath());
      attributeElement.setAttribute("name",
          typePrefix + attribute.getName() + "_property");
      attributeElement.setAttribute("key", String.valueOf(attribute.isKey()));
      for (String ontologyURI : attribute.getTypeURIs()) {
        Element ontologyElement = mappingRoot.createElementNS(ns, "ontology-link");
        String label
            = OpenCycOntology.getInstance().getLabelForResource(ontologyURI);
        ontologyElement.setAttribute("uri", ontologyURI);
        ontologyElement.setAttribute("label", label);
        attributeElement.appendChild(ontologyElement);
      }
      entityElement.appendChild(attributeElement);
    }
    // One <relation> per relation, each with a <lookupkey> listing the
    // target properties used to resolve the linked entity.
    for (Relation relation : schema.getRelations()) {
      Element relationElement = mappingRoot.createElementNS(ns, "relation");
      relationElement.setAttribute("path", relation.getPath());
      relationElement.setAttribute("targetEntity", typePrefix
          + relation.getSchema().getName());
      relationElement.setAttribute("name", typePrefix + relation.getName() + "_rel");
      entityElement.appendChild(relationElement);
      Element lookupElement = mappingRoot.createElementNS(ns, "lookupkey");
      for (Attribute attr : relation.getLookupKeys()) {
        Element targetPropertyElement = mappingRoot.createElementNS(ns,
            "target-property");
        targetPropertyElement.setAttribute("path", attr.getPath());
        // NOTE(review): the original code computed a rewritten name here
        // (splitting attr.getName() on "." and appending "_rel"/"_prop"
        // suffixes) but never used the result — and the rewrite also
        // dropped a "." separator before the final segment. The dead
        // computation has been removed; the emitted name remains the raw
        // attribute name, exactly as before. If the rewritten form was
        // actually intended, restore it and fix the separator bug.
        targetPropertyElement.setAttribute("name", typePrefix + attr.getName());
        lookupElement.appendChild(targetPropertyElement);
      }
      relationElement.appendChild(lookupElement);
    }
  }
}
////////////////////////////////////////////////////////////////////////////
// Old Functions no longer in use
////////////////////////////////////////////////////////////////////////////
/*
* The functions below are NOT used currently and possibly obsolete!
*/
/**
 * OBSOLETE predecessor of {@code intralinkSchemas}: for each attribute of
 * each schema, counts how many schema instances have an attribute value
 * that also occurs under some key attribute of another schema, and creates
 * an internal relation to the FIRST target whose match ratio reaches
 * {@code linkingThreshold}. Kept for reference only; not called anymore.
 *
 * @param doc the XML document the schemas were induced from
 * @param linkingThreshold minimum fraction of matching instances required
 * @throws XPathExpressionException if any XPath lookup fails
 */
private void intralinkSchemasOld(Document doc, double linkingThreshold)
throws XPathExpressionException {
for (Schema schema : schemas.values()) {
NodeList nl = XMLUtils.getNodesByPath(schema.getPath(), null, doc);
for (Attribute attr : schema.getAttributes()) {
// Per-target-attribute count of schema instances that matched it.
Map<Attribute, Integer> attributeMatchMap
= new HashMap<Attribute, Integer>();
attributeLoop:
for (int i = 0; i < nl.getLength(); i++) {
if (nl.item(i) instanceof Element) {
Element entityElement = (Element) nl.item(i);
// Values of the attribute within this one schema instance.
Set<String> propertyValues = XMLUtils.getStringsByPath(
attr.getPath(), entityElement, doc);
for (Schema targetSchema : schemas.values()) {
for (Attribute targetAttribute : targetSchema.getAttributes()) {
if (!targetAttribute.isKey()) {
continue;
}
if (targetAttribute.equals(attr)) {
continue;
}
NodeList valueNodeList = XMLUtils.getNodesByPath(
targetSchema.getPath() + "/" + targetAttribute.getPath(),
null, doc);
for (int j = 0; j < valueNodeList.getLength(); j++) {
Node node = valueNodeList.item(j);
if (propertyValues.contains(node.getTextContent().trim())) {
Integer count = attributeMatchMap.get(targetAttribute);
if (count == null) {
count = 0;
}
attributeMatchMap.put(targetAttribute, count + 1);
// Each schema instance is credited to at most one target:
// jump to the next instance as soon as any target matches.
continue attributeLoop;
}
}
}
}
}
}
// Pick the first target whose fraction of matching instances reaches
// the threshold (map iteration order, so effectively arbitrary).
Attribute matchedAttribute = null;
for (Map.Entry<Attribute, Integer> entry : attributeMatchMap.entrySet()) {
if (entry.getValue() / (double) nl.getLength() >= linkingThreshold) {
matchedAttribute = entry.getKey();
break;
}
}
if (matchedAttribute != null) {
// NOTE(review): "taregetSchema" (local name) and "_interanl_relation"
// (runtime string) are misspelled in the original; the string is part
// of the generated relation name, so it is deliberately left as-is.
Schema taregetSchema = matchedAttribute.getParent();
Set<Attribute> lookupKeys = new HashSet<Attribute>();
lookupKeys.add(new Attribute(schema, matchedAttribute.getName(),
attr.getPath(), false));
Relation rel = new Relation(schema,
attr.getName() + "_interanl_relation", attr.getPath(),
taregetSchema, lookupKeys);
schema.addRelation(rel);
}
}
}
}
/**
 * Decides whether {@code rel} is a one-to-one relation of {@code schema}
 * by walking every schema instance in the document and checking that each
 * relation value maps to at most one entity value and vice versa. An
 * entity or relation instance is identified by the set of its leaf text
 * values. Returns {@code false} for relations with no instances at all.
 *
 * @throws XPathExpressionException if any XPath lookup fails
 */
private boolean isRelationOneToOne(Document doc, Schema schema, Relation rel)
    throws XPathExpressionException {
  // Forward map: relation value -> entity values carrying it.
  Map<Set<String>, Set<Set<String>>> entitiesByRelValue
      = new HashMap<Set<String>, Set<Set<String>>>();
  // Reverse map: entity value -> relation values it carries.
  Map<Set<String>, Set<Set<String>>> relValuesByEntity
      = new HashMap<Set<String>, Set<Set<String>>>();
  NodeList entityNodes = XMLUtils.getNodesByPath(schema.getPath(), null, doc);
  for (int i = 0; i < entityNodes.getLength(); i++) {
    Element entityElement = (Element) entityNodes.item(i);
    Set<String> entityValue
        = new HashSet<String>(XMLUtils.getAllLeaveValues(entityElement));
    NodeList relationNodes
        = XMLUtils.getNodesByPath(rel.getPath(), entityElement, doc);
    for (int j = 0; j < relationNodes.getLength(); j++) {
      Set<String> relValue = new HashSet<String>(
          XMLUtils.getAllLeaveValues((Element) relationNodes.item(j)));
      Set<Set<String>> entitySet = entitiesByRelValue.get(relValue);
      if (entitySet == null) {
        entitySet = new HashSet<Set<String>>();
        entitiesByRelValue.put(relValue, entitySet);
      }
      entitySet.add(entityValue);
      Set<Set<String>> relSet = relValuesByEntity.get(entityValue);
      if (relSet == null) {
        relSet = new HashSet<Set<String>>();
        relValuesByEntity.put(entityValue, relSet);
      }
      relSet.add(relValue);
      // The moment either direction maps to more than one counterpart,
      // the relation cannot be one-to-one — bail out early.
      if (entitySet.size() > 1 || relSet.size() > 1) {
        LogUtils.debug(getClass(), schema + " . " + rel
            + " is not one to one because of " + relValue);
        return false;
      }
    }
  }
  // One-to-one only if at least one association was actually observed.
  return entitiesByRelValue.size() > 0 && relValuesByEntity.size() > 0;
}
/**
 * Computes the Shannon entropy (natural log) of the distribution of
 * distinct relation values over the entities carrying them: each distinct
 * relation value contributes -p * ln(p), where p is the share of
 * (relation value -> entity) associations it accounts for. Entity and
 * relation instances are identified by the set of their leaf text values.
 *
 * @throws XPathExpressionException if any XPath lookup fails
 */
private double getEntropyOfRelation(Document doc, Schema schema, Relation rel)
    throws XPathExpressionException {
  // relation value -> set of entity values that carry it
  Map<Set<String>, Set<Set<String>>> entitiesByRelValue
      = new HashMap<Set<String>, Set<Set<String>>>();
  NodeList entityNodes = XMLUtils.getNodesByPath(schema.getPath(), null, doc);
  for (int i = 0; i < entityNodes.getLength(); i++) {
    Element entityElement = (Element) entityNodes.item(i);
    Set<String> entityValue = new HashSet<String>(
        XMLUtils.getAllLeaveValues(entityElement));
    NodeList relationNodes
        = XMLUtils.getNodesByPath(rel.getPath(), entityElement, doc);
    for (int j = 0; j < relationNodes.getLength(); j++) {
      Set<String> relValue = new HashSet<String>(
          XMLUtils.getAllLeaveValues((Element) relationNodes.item(j)));
      Set<Set<String>> carriers = entitiesByRelValue.get(relValue);
      if (carriers == null) {
        carriers = new HashSet<Set<String>>();
        entitiesByRelValue.put(relValue, carriers);
      }
      carriers.add(entityValue);
    }
  }
  // Total number of associations, used as the normalizing denominator.
  int total = 0;
  for (Set<Set<String>> carriers : entitiesByRelValue.values()) {
    total += carriers.size();
  }
  double entropy = 0;
  for (Set<Set<String>> carriers : entitiesByRelValue.values()) {
    double p = carriers.size() / (double) total;
    entropy += -p * Math.log(p);
  }
  return entropy;
}
}