/******************************************************************************* * Copyright 2012 University of Southern California * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * This code was developed by the Information Integration Group as part * of the Karma project at the Information Sciences Institute of the * University of Southern California. For more information, publications, * and related projects, please see: http://www.isi.edu/integration ******************************************************************************/ package edu.isi.karma.modeling.semantictypes; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map.Entry; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.Condition; import java.util.Random; import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import edu.isi.karma.controller.command.selection.SuperSelection; import edu.isi.karma.modeling.ontology.OntologyManager; import edu.isi.karma.rep.HNodePath; import edu.isi.karma.rep.Node; import edu.isi.karma.rep.Worksheet; import edu.isi.karma.rep.Workspace; import edu.isi.karma.rep.alignment.ColumnNode; import edu.isi.karma.rep.alignment.Label; import edu.isi.karma.rep.alignment.SemanticType; import edu.isi.karma.rep.alignment.SemanticType.Origin; import edu.isi.karma.rep.metadata.Tag; import edu.isi.karma.webserver.ContextParametersRegistry; import edu.isi.karma.webserver.ServletContextParameterMap; import edu.isi.karma.webserver.ServletContextParameterMap.ContextParameter; /** * This class provides various utility methods that can be used by the semantic * typing module. * * @author Shubham Gupta * */ public class SemanticTypeUtil { private static Logger logger = LoggerFactory .getLogger(SemanticTypeUtil.class); private static TrainingFactory trainingFactory = null; private static boolean isSemanticTypeTrainingEnabled = true; /** * Prepares and returns a collection of training examples to be used in * semantic types training. Parameter TRAINING_EXAMPLE_MAX_COUNT specifies * the count of examples. The examples are randomly chosen to get a uniform * distribution of values across the column. Empty values are currently not * included in the set. * * @param worksheet * The target worksheet * @param path * Path to the target column * @return Collection of training examples */ public static ArrayList<String> getTrainingExamples(Workspace workspace, Worksheet worksheet, HNodePath path, SuperSelection sel) { if(!getSemanticTypeTrainingEnabled() || path == null) { return new ArrayList<>(); } final ServletContextParameterMap contextParameters = ContextParametersRegistry.getInstance().getContextParameters(workspace.getContextId()); int TRAINING_EXAMPLE_MAX_COUNT = Integer .parseInt(contextParameters .getParameterValue(ContextParameter.TRAINING_EXAMPLE_MAX_COUNT)); ArrayList<Node> nodes = new ArrayList<>(Math.max(100, worksheet.getDataTable().getNumRows())); worksheet.getDataTable().collectNodes(path, nodes, sel); Random r = new Random(); ArrayList<String> subset = new ArrayList<>(TRAINING_EXAMPLE_MAX_COUNT); if (nodes.size() > TRAINING_EXAMPLE_MAX_COUNT *2) { HashSet<Integer> seenValues = new HashSet<>(TRAINING_EXAMPLE_MAX_COUNT); // SubList method of ArrayList causes ClassCast exception int attempts = 0; while(subset.size() < TRAINING_EXAMPLE_MAX_COUNT && attempts < Math.min(nodes.size(), TRAINING_EXAMPLE_MAX_COUNT*2)) { int randValue = r.nextInt(nodes.size()); String nodeValue = nodes.get(randValue).getValue().asString(); if(seenValues.add(randValue) && (nodeValue != null && !nodeValue.isEmpty())) { subset.add(nodeValue); } attempts++; } } else { Collections.shuffle(nodes); for(int i = 0; i < nodes.size() && subset.size() < TRAINING_EXAMPLE_MAX_COUNT; i++) { String nodeValue = nodes.get(i).getValue().asString(); if (nodeValue != null && !nodeValue.equals("")) subset.add(nodeValue); } } return subset; } private class TrainingFactory extends Thread { private ArrayList<TrainingJob> tasks; private Lock lock; private Condition taskAvailable; TrainingFactory() { this.tasks = new ArrayList<>(); this.lock = new ReentrantLock(); this.taskAvailable = this.lock.newCondition(); this.start(); } void addTrainingJob(TrainingJob trainingJob) { this.lock.lock(); if (this.tasks.add(trainingJob)) { this.taskAvailable.signalAll(); } this.lock.unlock(); } public void run() { while (true) { this.lock.lock(); try { while (this.tasks.isEmpty()) { this.taskAvailable.await(); } TrainingJob trainingJob = this.tasks.remove(0); this.lock.unlock(); Workspace workspace = trainingJob.workspace; Worksheet worksheet = trainingJob.worksheet; ArrayList<SemanticType> newTypes = trainingJob.newTypes; SuperSelection sel = trainingJob.sel; if(newTypes.size() > 0) { HNodePath currentColumnPath = null; List<HNodePath> paths = worksheet.getHeaders().getAllPaths(); String hNodeId = newTypes.get(0).getHNodeId(); for (HNodePath path : paths) { if (path.getLeaf().getId().equals(hNodeId)) { currentColumnPath = path; break; } } ArrayList<String> examples = SemanticTypeUtil.getTrainingExamples(workspace, worksheet, currentColumnPath, sel); ISemanticTypeModelHandler modelHandler = workspace.getSemanticTypeModelHandler(); for(SemanticType newType : newTypes) { String label = newType.getModelLabelString(); modelHandler.addType(label, examples); } } } catch (InterruptedException ie) { ie.printStackTrace(); this.lock.unlock(); } } } } private class TrainingJob { public Workspace workspace; public Worksheet worksheet; public ArrayList<SemanticType> newTypes; public SuperSelection sel; TrainingJob(Workspace workspace, Worksheet worksheet, List<SemanticType> newTypes, SuperSelection sel) { this.workspace = workspace; this.worksheet = worksheet; this.newTypes = new ArrayList<>(); this.newTypes.addAll(newTypes); this.sel = sel; } } public void trainOnColumn(Workspace workspace, Worksheet worksheet, List<SemanticType> newTypes, SuperSelection sel) { trainingFactory = trainingFactory == null ? new TrainingFactory() : trainingFactory; trainingFactory.addTrainingJob(new TrainingJob(workspace, worksheet, newTypes, sel)); } public SemanticTypeColumnModel predictColumnSemanticType(Workspace workspace, Worksheet worksheet, String hNodeId, int numSuggestions, SuperSelection sel) { HNodePath currentColumnPath = null; List<HNodePath> paths = worksheet.getHeaders().getAllPaths(); for (HNodePath path : paths) { if (path.getLeaf().getId().equals(hNodeId)) { currentColumnPath = path; break; } } if(currentColumnPath != null) return predictColumnSemanticType(workspace, worksheet,currentColumnPath, numSuggestions, sel); return null; } public SemanticTypeColumnModel predictColumnSemanticType(Workspace workspace, Worksheet worksheet, HNodePath path, int numSuggestions, SuperSelection sel) { ArrayList<String> trainingExamples = SemanticTypeUtil.getTrainingExamples(workspace, worksheet, path, sel); if (trainingExamples.isEmpty()) return null; ISemanticTypeModelHandler modelHandler = workspace.getSemanticTypeModelHandler(); OntologyManager ontologyManager = workspace.getOntologyManager(); List<SemanticTypeLabel> result = modelHandler.predictType(trainingExamples, numSuggestions); if (result == null) { logger.debug("Error occured while predicting semantic type."); return null; } if (result.isEmpty()) { return null; } /** Remove the labels that are not in the ontology or are already used as the semantic type **/ List<SemanticTypeLabel> removeLabels = new ArrayList<>(); String domainUri, typeUri; Label domain, type; for (int i=0; i<result.size(); i++) { SemanticTypeLabel semLabel = result.get(i); String label = semLabel.getLabel(); /** Check if not in ontology **/ if (label.contains("|")) { domainUri = label.split("\\|")[0].trim(); typeUri = label.split("\\|")[1].trim(); domain = ontologyManager.getUriLabel(domainUri); type = ontologyManager.getUriLabel(typeUri); // Remove from the list if URI not present in the model if (domain == null || type == null) { removeLabels.add(semLabel); continue; } } else { domain = ontologyManager.getUriLabel(label); // Remove from the list if URI not present in the model if (domain == null) { removeLabels.add(semLabel); continue; } } } for (SemanticTypeLabel removeLabel : removeLabels) { result.remove(removeLabel); } if (result.isEmpty()) { return null; } return new SemanticTypeColumnModel(result); } public List<SemanticType> getSuggestedTypes(OntologyManager ontologyManager, ColumnNode columnNode, SemanticTypeColumnModel columnModel) { ArrayList<SemanticType> suggestedSemanticTypes = new ArrayList<>(); if (columnModel == null) return suggestedSemanticTypes; for (Entry<String, Double> entry : columnModel.getScoreMap().entrySet()) { String key = entry.getKey(); Double confidence = entry.getValue(); if (key == null || key.isEmpty()) continue; String[] parts = key.split("\\|"); if (parts == null || parts.length != 2) continue; String domainUri = parts[0].trim(); String propertyUri = parts[1].trim(); Label domainLabel = ontologyManager.getUriLabel(domainUri); if (domainLabel == null) continue; Label propertyLabel = ontologyManager.getUriLabel(propertyUri); if (propertyLabel == null) continue; SemanticType semanticType = new SemanticType(columnNode.getHNodeId(), propertyLabel, domainLabel, null, false, Origin.TfIdfModel, confidence); logger.info("\t" + propertyUri + " of " + domainUri + ": " + confidence); suggestedSemanticTypes.add(semanticType); } Collections.sort(suggestedSemanticTypes, Collections.reverseOrder()); return suggestedSemanticTypes; } public ArrayList<SemanticType> getColumnSemanticSuggestions(Workspace workspace, Worksheet worksheet, ColumnNode columnNode, int numSuggestions, SuperSelection sel) { ArrayList<SemanticType> suggestedSemanticTypes = new ArrayList<>(); logger.info("Column Semantic Suggestions for:" + columnNode.getColumnName()); if(workspace != null && worksheet != null) { OntologyManager ontologyManager = workspace.getOntologyManager(); String hNodeId = columnNode.getHNodeId(); SemanticTypeColumnModel columnModel = predictColumnSemanticType(workspace, worksheet, hNodeId, numSuggestions, sel); if (columnModel != null) { for (Entry<String, Double> entry : columnModel.getScoreMap().entrySet()) { String key = entry.getKey(); Double confidence = entry.getValue(); if (key == null || key.isEmpty()) continue; String[] parts = key.split("\\|"); if (parts == null || parts.length != 2) continue; String domainUri = parts[0].trim(); String propertyUri = parts[1].trim(); Label domainLabel = ontologyManager.getUriLabel(domainUri); if (domainLabel == null) continue; Label propertyLabel = ontologyManager.getUriLabel(propertyUri); if (propertyLabel == null) continue; SemanticType semanticType = new SemanticType(hNodeId, propertyLabel, domainLabel, null, false, Origin.TfIdfModel, confidence); logger.info("\t" + propertyUri + " of " + domainUri + ": " + confidence); suggestedSemanticTypes.add(semanticType); } } } Collections.sort(suggestedSemanticTypes, Collections.reverseOrder()); return suggestedSemanticTypes; } /** * Identifies the outlier nodes (table cells) for a given column. * * @param worksheet * Target worksheet * @param predictedType * Type which was user-assigned or predicted by the CRF model for * the given column. If the type for a given node is different * from the predictedType, it is tagged as outlier and it's id is * stored in the outlier tag object * @param path * Path to the given column * @param outlierTag * The outlier tag object which stores all the outlier node ids. * @param columnFeatures * Features such as column name, table name that are required by * the CRF Model to predict the semantic type for a node (table * cell) * @param crfModelHandler */ public static void identifyOutliers(Worksheet worksheet, String predictedType, HNodePath path, Tag outlierTag, ISemanticTypeModelHandler modelHandler, SuperSelection sel) { Collection<Node> nodes = new ArrayList<>(); worksheet.getDataTable().collectNodes(path, nodes, sel); // Identify the top semantic type for each node // It it does not matches the predicted type, it is a outlier. Set<String> allNodeIds = new HashSet<>(); Set<String> outlierNodeIds = new HashSet<>(); int outlierCounter = 0; for (Node node : nodes) { allNodeIds.add(node.getId()); // Compute the semantic type for the node value List<String> examples = new ArrayList<>(); String nodeVal = node.getValue().asString(); if (nodeVal != null && !nodeVal.equals("")) { examples.add(nodeVal); List<SemanticTypeLabel> result = modelHandler.predictType(examples, 1); if (result == null) { logger.error("Error while predicting type for " + nodeVal); continue; } // Check here if it is an outlier // System.out.println("Example: " + examples.get(0) + " Label: " + predictedLabels + " Score: " + confidenceScores); String predictedLabel = result.get(0).getLabel(); if (!predictedLabel.equalsIgnoreCase(predictedType)) { outlierCounter++; outlierNodeIds.add(node.getId()); } } } System.out.println("Total outliers: " + outlierCounter); // Remove the existing ones outlierTag.removeNodeIds(allNodeIds); // Add the new ones outlierTag.addNodeIds(outlierNodeIds); } /** * Removes the namespace from a given URI. It makes a assumption that the * namespace is until the last # or last '/' in the URI string, so it should * be used only for interface purposes and not for reasoning or logic. The * right way would be store the namespaces map in memory and use that to * remove the namespace from a URI. * * @param uri * Input URI * @return URI string with namespace removed */ public static String removeNamespace(String uri) { if (uri.contains("#")) uri = uri.split("#")[1]; else if (uri.contains("/")) uri = uri.substring(uri.lastIndexOf("/") + 1); return uri; } // public static void computeSemanticTypesForAutoModel(Worksheet worksheet, // ISemanticTypeModelHandler crfModelHandler, OntologyManager ontMgr) { // // String autoModelURI = ServletContextParameterMap // .getParameterValue(ContextParameter.AUTO_MODEL_URI); // String topClassURI = autoModelURI + worksheet.getTitle(); // // List<HNodePath> paths = worksheet.getHeaders().getAllPaths(); // for (HNodePath path : paths) { // // // Prepare the column name feature // String columnName = path.getLeaf().getColumnName(); // // // String label = topClassURI+"#"+worksheet.getTitle()+"|"+topClassURI+"#"+columnName; // ArrayList<SemanticTypeLabel> labels = new ArrayList<>(); // labels.add(new SemanticTypeLabel(label, 1.0f)); // // SemanticTypeColumnModel columnModel = new SemanticTypeColumnModel(labels); // worksheet.getSemanticTypeModel().addColumnModel(path.getLeaf().getId(), columnModel); // } // } public static void setSemanticTypeTrainingStatus(boolean status) { isSemanticTypeTrainingEnabled = status; } public static boolean getSemanticTypeTrainingEnabled() { return isSemanticTypeTrainingEnabled; } }