/* * Copyright (c) 2013, University of Toronto. * * Licensed under the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. You may obtain * a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package edu.toronto.cs.xcurator.discoverer; import edu.toronto.cs.xcurator.common.DataDocument; import edu.toronto.cs.xcurator.mapping.Mapping; import edu.toronto.cs.xcurator.mapping.Attribute; import edu.toronto.cs.xcurator.mapping.Schema; import edu.toronto.cs.xcurator.mapping.Relation; import edu.toronto.cs.xcurator.common.NsContext; import edu.toronto.cs.xcurator.common.RdfUriBuilder; import edu.toronto.cs.xcurator.common.XmlParser; import edu.toronto.cs.xcurator.common.XmlUriBuilder; import edu.toronto.cs.xcurator.mapping.ValueAttribute; import java.util.List; import javax.xml.XMLConstants; import org.apache.log4j.Logger; import org.w3c.dom.Attr; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; public class BasicEntityDiscovery implements MappingDiscoveryStep { private final XmlParser parser; private final RdfUriBuilder rdfUriBuilder; private final XmlUriBuilder xmlUriBuilder; private boolean discoverRootLevelEntity; static final Logger logger = Logger.getLogger(BasicEntityDiscovery.class); public BasicEntityDiscovery(XmlParser parser, RdfUriBuilder rdfUriBuilder, XmlUriBuilder xmlUriBuilder) { this.parser = parser; this.rdfUriBuilder = rdfUriBuilder; this.xmlUriBuilder = xmlUriBuilder; this.discoverRootLevelEntity = false; } public BasicEntityDiscovery(XmlParser parser, RdfUriBuilder rdfUriBuilder, XmlUriBuilder xmlUriBuilder, boolean discoverRootLevelEntity) { this(parser, rdfUriBuilder, xmlUriBuilder); this.discoverRootLevelEntity = discoverRootLevelEntity; } @Override public void process(List<DataDocument> dataDocuments, Mapping mapping) { System.out.println("process BasicEntityDiscovery..."); logger.debug("datadoc#: " + dataDocuments.size()); for (DataDocument dataDoc : dataDocuments) { // logger.debug(""); // Create a root entity from the root element. Element root = dataDoc.Data.getDocumentElement(); NsContext rootNsContext = new NsContext(root); String rdfTypeUri = rdfUriBuilder.getRdfTypeUri(root); String xmlTypeUri = xmlUriBuilder.getXmlTypeUri(root); String path = getElementPath(root, "", "/", rootNsContext); Schema rootEntity = new Schema(rdfTypeUri, xmlTypeUri, rootNsContext); rootEntity.addPath(path); rootEntity.addInstance(root); // If for some specific type of XML document, the root element to child node // relation is significant, we should add the root level entity if (discoverRootLevelEntity) { mapping.addEntity(rootEntity); } // Merge the current document namespace context to the mapping's. // The document namespace cannot be overrided as a document cannot be // the child of another document. NsContext mappingNsContext = mapping.getBaseNamespaceContext(); mappingNsContext.merge(rootNsContext, false); mapping.setBaseNamespaceContext(mappingNsContext); // Discover entities in this document discoverEntitiesFromXmlElements(root, rootEntity, dataDoc, mapping); } // set the mapping as initialized when this step is completed. mapping.setInitialized(); } private void discoverEntitiesFromXmlElements(Element parent, Schema schema, DataDocument dataDoc, Mapping mapping) { NodeList children = parent.getChildNodes(); final int childscount = children.getLength(); logger.debug("childcount#: " + childscount); for (int i = 0; i < childscount; i++) { // final Node child2 = children.item(i); // logger.debug(">> " + child2.getLocalName()); if (children.item(i) instanceof Element) { Element child = (Element) children.item(i); final String tempcont = child.getTextContent().trim(); logger.debug(">> " + child.getNodeName() + " || " + tempcont.subSequence(0, Math.min(20, tempcont.length())) + " <<"); // If the child element has no attributes, then its an attribute of // its parent if (parser.isLeaf(child) && child.getAttributes().getLength() == 0) { logger.debug("Leaf!"); discoverAttributeFromLeafElement(child, schema); continue; } // We have found another entity, get its URI and check if we have seen it. String xmlTypeUri = xmlUriBuilder.getXmlTypeUri(child); // Create the RDF.type URI for this entity. String rdfTypeUri = rdfUriBuilder.getRdfTypeUri(child); logger.debug(xmlTypeUri + " " + rdfTypeUri); Schema childSchema = mapping.getEntity(xmlTypeUri); // Create a new namespace context by inheriting from the parent // and discovering overriding definitions. NsContext nsContext = new NsContext(schema.getNamespaceContext()); nsContext.discover(child); // Build the absolute path to this entity. String path = getElementPath(child, schema.getPath(), "/", nsContext); logger.debug("parentPath= " + schema.getPath()); logger.debug("NodeName=" + child.getNodeName()); logger.debug("path= " + path); if (childSchema == null) { // If we have seen not seen this entity, create new. childSchema = new Schema(rdfTypeUri, xmlTypeUri, nsContext, child.getLocalName()); childSchema.addPath(path); childSchema.addInstance(child); mapping.addEntity(childSchema); } else { // If we have seen this entity, simply merge the paths (if differ) childSchema.addPath(path); childSchema.addInstance(child); // We don't override namespace context here // We are assuming the input XML documents are following good practice // - using the same namespace prefixes definitions across documents // If the namespace prefixes are different, should consider generating // mapping for each one of them individually instead of together. // So overriding or not does not matter, as there should be no conflict childSchema.mergeNamespaceContext(nsContext, true); } // Create a relation about the parent and this entity // Use relative path for direct-descendent relation String relationPath = getElementPath(child, ".", "/", nsContext); String relationUri = rdfUriBuilder.getRdfRelationUriFromElements(parent, child); if (schema.hasRelation(relationUri)) { Relation relation = schema.getRelation(relationUri); relation.addPath(relationPath); } else { Relation relation = new Relation(schema, childSchema, relationUri); relation.addPath(relationPath); schema.addRelation(relation); } // During this step, only direct parent-child entity relations are // discovered. Relations based on reference keys should be discovered // in other steps // Discover the attributes of this entity from the XML attributes discoverAttributesFromXmlAttributes(child, childSchema); // Discover the value from the XML text node discoverValueFromTextContent(child, childSchema); // Recursively discover the related entities of this one discoverEntitiesFromXmlElements(child, childSchema, dataDoc, mapping); } } } private void discoverAttributeFromLeafElement(Element element, Schema schema) { // Transform a leaf element with no XML attributes // into an attribute of the schema String rdfUri = rdfUriBuilder.getRdfPropertyUri(element); String xmlUri = xmlUriBuilder.getXmlTypeUri(element); // The path is ./child_node/text(), with . being the parent node String path = getElementPath(element, ".", "/", schema.getNamespaceContext()) + "/text()"; addAttributeToSchema(schema, rdfUri, xmlUri, path, element.getTextContent()); } private void discoverAttributesFromXmlAttributes(Element element, Schema entity) { // Get attribtues from the XML attributes of the element List<Attr> xmlAttrs = parser.getAttributes(element); for (Attr xmlAttr : xmlAttrs) { String rdfUri = rdfUriBuilder.getRdfPropertyUri(xmlAttr); String xmlUri = xmlUriBuilder.getXmlTypeUri(xmlAttr); // Use relative path for attribute String path = getAttrPath(xmlAttr, "", "@"); addAttributeToSchema(entity, rdfUri, xmlUri, path, xmlAttr.getTextContent()); } } private void discoverValueFromTextContent(Element element, Schema entity) { if (!parser.isLeaf(element)) { return; } String textContent = element.getTextContent().trim(); if (!textContent.equals("")) { String rdfUri = rdfUriBuilder.getRdfPropertyUriForValue(element); Attribute attr = new ValueAttribute(entity, rdfUri); attr.addPath("text()"); entity.addAttribute(attr); } } private void addAttributeToSchema(Schema schema, String rdfUri, String xmlUri, String path, String instanceValue) { Attribute attr = new Attribute(schema, rdfUri, xmlUri); attr.addPath(path); attr.addInstance(instanceValue); schema.addAttribute(attr); } private String getElementPath(Node node, String parentPath, String separator, NsContext nsContext) { String prefix = node.getPrefix(); if (prefix != null) { // When there is a namespace used, juse use the full node name return parentPath + separator + node.getNodeName(); } else if (nsContext.getNamespaceURI(XMLConstants.DEFAULT_NS_PREFIX) .equals(XMLConstants.NULL_NS_URI)) { // When there is no default namespace defined, just use the local name return parentPath + separator + node.getLocalName(); } else { // When there is a default namespace defined for this node, // have it empty before : in the path. return parentPath + separator + ":" + node.getLocalName(); } } private String getAttrPath(Node node, String parentPath, String separator) { String prefix = node.getPrefix(); return parentPath + separator + (prefix != null ? prefix + ":" : "") + node.getLocalName(); } }