/******************************************************************************* * Copyright 2012 University of Southern California * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * This code was developed by the Information Integration Group as part * of the Karma project at the Information Sciences Institute of the * University of Southern California. For more information, publications, * and related projects, please see: http://www.isi.edu/integration ******************************************************************************/ package edu.isi.karma.modeling.semantictypes; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import edu.isi.karma.modeling.alignment.Alignment; import edu.isi.karma.modeling.ontology.OntologyManager; import edu.isi.karma.modeling.semantictypes.crfmodelhandler.CRFModelHandler; import edu.isi.karma.modeling.semantictypes.crfmodelhandler.CRFModelHandler.ColumnFeature; import edu.isi.karma.rep.HNodePath; import edu.isi.karma.rep.Node; import edu.isi.karma.rep.Worksheet; import edu.isi.karma.rep.alignment.Label; import edu.isi.karma.rep.alignment.SemanticType; import edu.isi.karma.rep.alignment.SemanticTypes; import edu.isi.karma.rep.metadata.Tag; import edu.isi.karma.webserver.ServletContextParameterMap; import edu.isi.karma.webserver.ServletContextParameterMap.ContextParameter; /** * This class provides various utility methods that can be used by the semantic * typing module. * * @author Shubham Gupta * */ public class SemanticTypeUtil { private static Logger logger = LoggerFactory .getLogger(SemanticTypeUtil.class); private final static int TRAINING_EXAMPLE_MAX_COUNT = Integer .parseInt(ServletContextParameterMap .getParameterValue(ContextParameter.TRAINING_EXAMPLE_MAX_COUNT)); /** * Prepares and returns a collection of training examples to be used in * semantic types training. Parameter TRAINING_EXAMPLE_MAX_COUNT specifies * the count of examples. The examples are randomly chosen to get a uniform * distribution of values across the column. Empty values are currently not * included in the set. * * @param worksheet * The target worksheet * @param path * Path to the target column * @return Collection of training examples */ public static ArrayList<String> getTrainingExamples(Worksheet worksheet, HNodePath path) { Collection<Node> nodes = new ArrayList<Node>(); worksheet.getDataTable().collectNodes(path, nodes); ArrayList<String> nodeValues = new ArrayList<String>(); for (Node n : nodes) { String nodeValue = n.getValue().asString(); if (nodeValue != null && !nodeValue.equals("")) nodeValues.add(nodeValue); } // Shuffling the values so that we get randomly chosen values to train Collections.shuffle(nodeValues); if (nodeValues.size() > TRAINING_EXAMPLE_MAX_COUNT) { ArrayList<String> subset = new ArrayList<String>(); // SubList method of ArrayList causes ClassCast exception for (int i = 0; i < TRAINING_EXAMPLE_MAX_COUNT; i++) subset.add(nodeValues.get(i)); return subset; } return nodeValues; } /** * This method predicts semantic types for all the columns in a worksheet * using CRF modeling technique developed by Aman Goel. It creates a * SemanticType object for each column and puts it inside the SemanticTypes * object for that worksheet. User-assigned semantic types are not replaced. * It also identifies nodes (table cells) that are outliers and are stored * in the outlierTag object. * * @param worksheet * The target worksheet * @param outlierTag * Tag object that stores outlier nodes * @param crfModelHandler * The CRF Model Handler to use * @return Returns a boolean value that shows if a semantic type object was * replaced or added for the worksheet. If nothing changed, false is * returned. */ public static boolean populateSemanticTypesUsingCRF(Worksheet worksheet, Tag outlierTag, CRFModelHandler crfModelHandler, OntologyManager ontMgr) { boolean semanticTypesChangedOrAdded = false; SemanticTypes types = worksheet.getSemanticTypes(); List<HNodePath> paths = worksheet.getHeaders().getAllPaths(); for (HNodePath path : paths) { boolean semanticTypeAdded = false; ArrayList<String> trainingExamples = getTrainingExamples(worksheet, path); if (trainingExamples.size() == 0) continue; Map<ColumnFeature, Collection<String>> columnFeatures = new HashMap<ColumnFeature, Collection<String>>(); // Prepare the column name feature String columnName = path.getLeaf().getColumnName(); Collection<String> columnNameList = new ArrayList<String>(); columnNameList.add(columnName); columnFeatures.put(ColumnFeature.ColumnHeaderName, columnNameList); // // Prepare the table name feature // String tableName = worksheetName; // Collection<String> tableNameList = new ArrayList<String>(); // tableNameList.add(tableName); // columnFeatures.put(ColumnFeature.TableName, tableNameList); // Stores the probability scores ArrayList<Double> scores = new ArrayList<Double>(); // Stores the predicted labels ArrayList<String> labels = new ArrayList<String>(); boolean predictResult = crfModelHandler.predictLabelForExamples( trainingExamples, 4, labels, scores, null, columnFeatures); if (!predictResult) { logger.debug("Error occured while predicting semantic type."); continue; } if (labels.size() == 0) { continue; } logger.debug("Examples: " + trainingExamples + " Type: " + labels + " ProbL " + scores); // Create and add the semantic type to the semantic types set of the // worksheet String topLabel = labels.get(0); String domain = ""; String type = topLabel; // Check if it contains domain information if (topLabel.contains("|")) { domain = topLabel.split("\\|")[0]; type = topLabel.split("\\|")[1]; } Label typeURI = ontMgr.getUriLabel(type); if(typeURI == null) { logger.error("Could not find the resource " + type + " in ontology model!"); continue; } Label domainURI = null; if (!domain.equals("")) domainURI = ontMgr.getUriLabel(domain); SemanticType semtype = new SemanticType(path.getLeaf().getId(),typeURI, domainURI, SemanticType.Origin.CRFModel,scores.get(0), false); // Check if the user already provided a semantic type manually SemanticType existingType = types.getSemanticTypeForHNodeId(path .getLeaf().getId()); if (existingType == null) { if (semtype.getConfidenceLevel() != SemanticType.ConfidenceLevel.Low) { worksheet.getSemanticTypes().addType(semtype); semanticTypeAdded = true; semanticTypesChangedOrAdded = true; } } else { if (existingType.getOrigin() != SemanticType.Origin.User) { worksheet.getSemanticTypes().addType(semtype); semanticTypeAdded = true; // Check if the new semantic type is different from the // older one if (!existingType.getType().equals(semtype.getType()) || !existingType.getDomain().equals( semtype.getDomain())) semanticTypesChangedOrAdded = true; } } // If the semantic type was added, then identify the outliers and // add the CRF model information for that column if (semanticTypeAdded) { // Identify the outliers identifyOutliers(worksheet, labels.get(0), path, outlierTag, columnFeatures, crfModelHandler); logger.debug("Outliers:" + outlierTag.getNodeIdList()); // Add the scores information to the Full CRF Model of the // worksheet CRFColumnModel columnModel = new CRFColumnModel(labels, scores); worksheet.getCrfModel().addColumnModel(path.getLeaf().getId(), columnModel); } } return semanticTypesChangedOrAdded; } /** * Identifies the outlier nodes (table cells) for a given column. * * @param worksheet * Target worksheet * @param predictedType * Type which was user-assigned or predicted by the CRF model for * the given column. If the type for a given node is different * from the predictedType, it is tagged as outlier and it's id is * stored in the outlier tag object * @param path * Path to the given column * @param outlierTag * The outlier tag object which stores all the outlier node ids. * @param columnFeatures * Features such as column name, table name that are required by * the CRF Model to predict the semantic type for a node (table * cell) * @param crfModelHandler */ public static void identifyOutliers(Worksheet worksheet, String predictedType, HNodePath path, Tag outlierTag, Map<ColumnFeature, Collection<String>> columnFeatures, CRFModelHandler crfModelHandler) { Collection<Node> nodes = new ArrayList<Node>(); worksheet.getDataTable().collectNodes(path, nodes); // Identify the top semantic type for each node // It it does not matches the predicted type, it is a outlier. Set<String> allNodeIds = new HashSet<String>(); Set<String> outlierNodeIds = new HashSet<String>(); int outlierCounter = 0; for (Node node : nodes) { allNodeIds.add(node.getId()); // Compute the semantic type for the node value List<String> examples = new ArrayList<String>(); List<String> predictedLabels = new ArrayList<String>(); List<Double> confidenceScores = new ArrayList<Double>(); String nodeVal = node.getValue().asString(); if (nodeVal != null && !nodeVal.equals("")) { examples.add(nodeVal); boolean result = crfModelHandler.predictLabelForExamples( examples, 1, predictedLabels, confidenceScores, null, columnFeatures); if (!result) { logger.error("Error while predicting type for " + nodeVal); continue; } // Check here if it is an outlier // System.out.println("Example: " + examples.get(0) + " Label: " + predictedLabels + " Score: " + confidenceScores); if (!predictedLabels.get(0).equalsIgnoreCase(predictedType)) { logger.info(nodeVal + ": " + predictedLabels + " Prob: " + confidenceScores); outlierCounter++; outlierNodeIds.add(node.getId()); } } } System.out.println("Total outliers: " + outlierCounter); // Remove the existing ones outlierTag.removeNodeIds(allNodeIds); // Add the new ones outlierTag.addNodeIds(outlierNodeIds); } /** * Removes the namespace from a given URI. It makes a assumption that the * namespace is until the last # or last '/' in the URI string, so it should * be used only for interface purposes and not for reasoning or logic. The * right way would be store the namespaces map in memory and use that to * remove the namespace from a URI. * * @param uri * Input URI * @return URI string with namespace removed */ public static String removeNamespace(String uri) { if (uri.contains("#")) uri = uri.split("#")[1]; else if (uri.contains("/")) uri = uri.substring(uri.lastIndexOf("/") + 1); return uri; } public static void computeSemanticTypesSuggestion(Worksheet worksheet, CRFModelHandler crfModelHandler, OntologyManager ontMgr, Alignment alignment) { List<HNodePath> paths = worksheet.getHeaders().getAllPaths(); for (HNodePath path : paths) { ArrayList<String> trainingExamples = getTrainingExamples(worksheet, path); Map<ColumnFeature, Collection<String>> columnFeatures = new HashMap<ColumnFeature, Collection<String>>(); // Prepare the column name feature String columnName = path.getLeaf().getColumnName(); Collection<String> columnNameList = new ArrayList<String>(); columnNameList.add(columnName); columnFeatures.put(ColumnFeature.ColumnHeaderName, columnNameList); // Stores the probability scores ArrayList<Double> scores = new ArrayList<Double>(); // Stores the predicted labels ArrayList<String> labels = new ArrayList<String>(); boolean predictResult = crfModelHandler.predictLabelForExamples(trainingExamples, 4, labels, scores, null, columnFeatures); if (!predictResult) { logger.debug("Error occured while predicting semantic type."); continue; } if (labels.size() == 0) { continue; } /** Remove the labels that are not in the ontology or are already used as the semantic type **/ List<String> removeLabels = new ArrayList<String>(); for (int i=0; i<labels.size(); i++) { String label = labels.get(i); SemanticType existingSemanticType = worksheet.getSemanticTypes().getSemanticTypeForHNodeId(path.getLeaf().getId()); /** Check if not in ontology **/ if (label.contains("|")) { Label domainUri = ontMgr.getUriLabel(label.split("\\|")[0]); Label typeUri = ontMgr.getUriLabel(label.split("\\|")[1]); // Remove from the list if URI not present in the model if (domainUri == null || typeUri == null) { removeLabels.add(label); continue; } // Check if it is being used as the semantic type already if (existingSemanticType != null && existsInSemanticTypesCollection(typeUri, domainUri, existingSemanticType)) { removeLabels.add(label); } } else { Label typeUri = ontMgr.getUriLabel(label); // Remove from the list if URI not present in the model if (typeUri == null) { removeLabels.add(label); continue; } // Check if it is being used as the semantic type already if (existingSemanticType != null && existsInSemanticTypesCollection(typeUri, null, existingSemanticType)) { removeLabels.add(label); } } } for (String removeLabel : removeLabels) { int idx = labels.indexOf(removeLabel); // System.out.println("Removing " + removeLabel); // logger.info("Removing " + removeLabel); labels.remove(removeLabel); scores.remove(idx); } if (labels.size() == 0) { continue; } CRFColumnModel columnModel = new CRFColumnModel(labels, scores); worksheet.getCrfModel().addColumnModel(path.getLeaf().getId(), columnModel); } } private static boolean existsInSemanticTypesCollection(Label typeLabel, Label domainLabel, SemanticType existingSemanticType) { if (typeLabel.getUri().equals(existingSemanticType.getType().getUri())) { if (domainLabel == null) { if(existingSemanticType.getDomain() == null) return true; return false; } if (existingSemanticType.getDomain() == null) return false; if (existingSemanticType.getDomain().getUri().equals(domainLabel.getUri())) return true; return false; } return false; } public static void computeSemanticTypesForAutoModel(Worksheet worksheet, CRFModelHandler crfModelHandler, OntologyManager ontMgr) { String autoModelURI = ServletContextParameterMap .getParameterValue(ContextParameter.AUTO_MODEL_URI); String topClassURI = autoModelURI + worksheet.getTitle(); List<HNodePath> paths = worksheet.getHeaders().getAllPaths(); for (HNodePath path : paths) { Map<ColumnFeature, Collection<String>> columnFeatures = new HashMap<ColumnFeature, Collection<String>>(); // Prepare the column name feature String columnName = path.getLeaf().getColumnName(); Collection<String> columnNameList = new ArrayList<String>(); columnNameList.add(columnName); columnFeatures.put(ColumnFeature.ColumnHeaderName, columnNameList); // Stores the probability scores ArrayList<Double> scores = new ArrayList<Double>(); // Stores the predicted labels ArrayList<String> labels = new ArrayList<String>(); String label = topClassURI+"#"+worksheet.getTitle()+"|"+topClassURI+"#"+columnName; labels.add(label); scores.add(1.0); CRFColumnModel columnModel = new CRFColumnModel(labels, scores); worksheet.getCrfModel().addColumnModel(path.getLeaf().getId(), columnModel); } } }