package edu.isi.karma.research.modeling;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.jgrapht.graph.AsUndirectedGraph;
import org.jgrapht.graph.DirectedWeightedMultigraph;
import org.jgrapht.graph.WeightedMultigraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;

import edu.isi.karma.modeling.alignment.GraphBuilder;
import edu.isi.karma.modeling.alignment.GraphVizLabelType;
import edu.isi.karma.modeling.alignment.GraphVizUtil;
import edu.isi.karma.modeling.alignment.ModelEvaluation;
import edu.isi.karma.modeling.alignment.SemanticModel;
import edu.isi.karma.modeling.alignment.SteinerTree;
import edu.isi.karma.modeling.alignment.TreePostProcess;
import edu.isi.karma.modeling.alignment.learner.ModelReader;
import edu.isi.karma.modeling.alignment.learner.SortableSemanticModel;
import edu.isi.karma.modeling.ontology.OntologyManager;
import edu.isi.karma.modeling.research.Params;
import edu.isi.karma.rep.alignment.ColumnNode;
import edu.isi.karma.rep.alignment.ColumnSemanticTypeStatus;
import edu.isi.karma.rep.alignment.DefaultLink;
import edu.isi.karma.rep.alignment.InternalNode;
import edu.isi.karma.rep.alignment.Label;
import edu.isi.karma.rep.alignment.LabeledLink;
import edu.isi.karma.rep.alignment.Node;
import edu.isi.karma.rep.alignment.SemanticType;
import edu.isi.karma.rep.alignment.SemanticType.Origin;
import edu.isi.karma.webserver.ContextParametersRegistry;
import edu.isi.karma.webserver.ServletContextParameterMap;
/**
 * Greedy LOD-pattern-based semantic model learner. Given a set of column
 * nodes with user-assigned semantic types, it: (1) finds the LOD patterns
 * that mention those types, (2) greedily picks a minimal set of patterns
 * covering the types, (3) merges the selected pattern graphs, (4) optionally
 * expands the merged graph with paths from the ontology, and (5) extracts a
 * Steiner tree connecting the matched internal nodes as the learned model.
 */
public class ModelLearner_LOD_Greedy {

    static Logger logger = LoggerFactory.getLogger(ModelLearner_LOD_Greedy.class);

    private Map<String, Pattern> patterns;          // patternId to pattern map
    private Map<String, Set<String>> patternIndex;  // type (class URI) to ids of patterns containing it
    private OntologyManager ontologyManager;

    /**
     * Imports the patterns from the given directory and builds an index from
     * semantic type to the patterns mentioning it.
     *
     * @param patternDirectoryPath directory holding the serialized patterns
     * @param ontologyManager      may be null; then hypothesize() skips the
     *                             ontology-path and Steiner-tree steps
     */
    public ModelLearner_LOD_Greedy(String patternDirectoryPath, OntologyManager ontologyManager) {
        if (ontologyManager == null) {
            logger.warn("ontology manager is null.");
        }
        this.ontologyManager = ontologyManager;

        logger.info("importing patterns ...");
        long timeStart = System.currentTimeMillis();
        patterns = PatternReader.importPatterns(patternDirectoryPath, null);
        long timeImportPatterns = System.currentTimeMillis();
        float elapsedTimeSec = (timeImportPatterns - timeStart) / 1000F;
        logger.info("time to import the patterns: " + elapsedTimeSec + "s");

        if (patterns == null) {
            logger.error("no pattern imported.");
            // patternIndex stays null; hypothesize() checks for this and bails out
            return;
        }

        logger.info("creating index over patterns ...");
        patternIndex = PatternReader.createPatternIndex(patterns.values());
        long timeIndexPatterns = System.currentTimeMillis();
        elapsedTimeSec = (timeIndexPatterns - timeImportPatterns) / 1000F;
        logger.info("time to create index for the patterns: " + elapsedTimeSec + "s");
    }

    /**
     * Returns all patterns that contain at least one of the given semantic
     * types, resolved through the type-to-patternId index.
     *
     * @return a (possibly empty) set; never null
     */
    private Set<Pattern> findRelatedPatterns(List<String> types, Map<String, Pattern> patterns, Map<String, Set<String>> patternIndex) {
        HashSet<Pattern> relatedPatterns = new HashSet<Pattern>();
        if (types == null || patternIndex == null) {
            logger.info("either list of the semantic types or the pattern index is empty");
            return relatedPatterns;
        }
        for (String t : types) {
            Set<String> patternIds = patternIndex.get(t);
            if (patternIds == null)
                continue;
            for (String patternId : patternIds) {
                Pattern p = patterns.get(patternId);
                // guard: an index entry without a matching pattern would
                // otherwise put a null into the result set
                if (p != null)
                    relatedPatterns.add(p);
            }
        }
        return relatedPatterns;
    }

    /**
     * Sorts the patterns by the PatternComparator criteria (relative to the
     * source types) so the greedy covering step considers the best ones first.
     */
    private List<Pattern> sortPatterns(Set<Pattern> patterns, List<String> types) {
        List<Pattern> patternsList = new ArrayList<Pattern>(patterns);
        Collections.sort(patternsList, new PatternComparator(types));
        return patternsList;
    }

    /**
     * Greedy set cover: walks the sorted patterns and keeps each pattern that
     * covers at least one not-yet-covered source type (with multiplicity),
     * stopping once every source type occurrence is covered.
     */
    private List<Pattern> findMinimalCoveringSet(List<Pattern> sortedPatterns, List<String> types) {
        List<Pattern> minimalSet = new LinkedList<Pattern>();
        // remaining uncovered type occurrences (a type may occur several times)
        Multiset<String> sourceTypes = HashMultiset.create(types);
        for (Pattern p : sortedPatterns) {
            Multiset<String> patternTypes = HashMultiset.create(p.getTypes());
            Multiset<String> patternCommonTypes = Multisets.intersection(patternTypes, sourceTypes);
            if (patternCommonTypes.isEmpty()) // this pattern does not cover any new source type
                continue;
            minimalSet.add(p);
            Multisets.removeOccurrences(sourceTypes, patternCommonTypes);
            if (sourceTypes.isEmpty()) // everything covered
                break;
        }
        return minimalSet;
    }

    /**
     * Merges the graphs of the given patterns into one directed weighted
     * multigraph. Nodes are deduplicated by id. The first link with a given
     * id is added as-is; later links with the same id (coming from other
     * patterns) are added as copies with a fresh id so that edge ids stay
     * unique in the multigraph.
     */
    private DirectedWeightedMultigraph<Node, LabeledLink> combinePatterns(List<Pattern> patterns) {
        DirectedWeightedMultigraph<Node, LabeledLink> graph =
                new DirectedWeightedMultigraph<Node, LabeledLink>(LabeledLink.class);
        Set<String> nodeIds = new HashSet<String>();
        Set<String> edgeIds = new HashSet<String>();
        Node source, target;
        // link id --> how many times that id has been seen so far
        HashMap<String, Integer> linkIdMap = new HashMap<String, Integer>();
        for (Pattern p : patterns) {
            DirectedWeightedMultigraph<Node, LabeledLink> g = p.getGraph();
            for (LabeledLink l : g.edgeSet()) {
                source = l.getSource();
                target = l.getTarget();
                if (!nodeIds.contains(source.getId())) {
                    graph.addVertex(source);
                    nodeIds.add(source.getId());
                }
                if (!nodeIds.contains(target.getId())) {
                    graph.addVertex(target);
                    nodeIds.add(target.getId());
                }
                if (!edgeIds.contains(l.getId())) {
                    graph.addEdge(source, target, l);
                    graph.setEdgeWeight(l, l.getWeight());
                    // FIX: the original never recorded the id, which made the
                    // duplicate-handling branch below unreachable and allowed
                    // colliding edge ids in the merged graph
                    edgeIds.add(l.getId());
                    linkIdMap.put(l.getId(), 1);
                } else {
                    Integer count = linkIdMap.get(l.getId());
                    linkIdMap.put(l.getId(), ++count);
                    LabeledLink newLink = l.copy(l.getId() + count);
                    graph.addEdge(source, target, newLink);
                    graph.setEdgeWeight(newLink, newLink.getWeight());
                }
            }
        }
        return graph;
    }

    /**
     * Copies the given graph into a {@link GraphBuilder} backed by the
     * ontology manager; the builder expands the graph with nodes and links
     * implied by the ontology as nodes are added.
     */
    public GraphBuilder addOntologyPaths(DirectedWeightedMultigraph<Node, LabeledLink> graph) {
        logger.info("graph nodes before using ontology: " + graph.vertexSet().size());
        logger.info("graph links before using ontology: " + graph.edgeSet().size());
        GraphBuilder graphBuilder = new GraphBuilder(ontologyManager, false);
        for (Node n : graph.vertexSet()) {
            graphBuilder.addNodeAndUpdate(n);
        }
        for (DefaultLink l : graph.edgeSet()) {
            graphBuilder.addLink(l.getSource(), l.getTarget(), l, l.getWeight());
        }
        logger.info("graph nodes after using ontology: " + graphBuilder.getGraph().vertexSet().size());
        logger.info("graph links after using ontology: " + graphBuilder.getGraph().edgeSet().size());
        return graphBuilder;
    }

    /**
     * Computes a Steiner tree over the undirected view of the builder's graph
     * connecting the given terminal nodes, then post-processes it to restore
     * link directions and labels.
     */
    public DirectedWeightedMultigraph<Node, LabeledLink> getSteinerTree(GraphBuilder graphBuilder, List<Node> steinerNodes) {
        SteinerTree steinerTree = new SteinerTree(
                new AsUndirectedGraph<Node, DefaultLink>(graphBuilder.getGraph()), steinerNodes);
        WeightedMultigraph<Node, DefaultLink> t = steinerTree.getDefaultSteinerTree();
        TreePostProcess treePostProcess = new TreePostProcess(graphBuilder, t);
        return treePostProcess.getTree();
    }

    /**
     * Learns a semantic model for the source described by the given column
     * nodes. Only columns whose semantic type was user-assigned are used;
     * for each, the domain URI of its first user semantic type is treated as
     * a source type.
     *
     * @return the learned model, or null when no model can be built (no
     *         patterns loaded, no usable types, or no related pattern)
     */
    public SemanticModel hypothesize(List<ColumnNode> columnNodes) {
        if (patterns == null || patternIndex == null) {
            logger.error("no pattern/patternIndex found.");
            return null;
        }
        if (columnNodes == null || columnNodes.isEmpty()) {
            logger.error("column nodes list is empty.");
            return null;
        }

        // domain URI --> column nodes with that domain (a domain can repeat)
        HashMultimap<String, ColumnNode> typeColumnNodes = HashMultimap.create();
        List<String> types = new LinkedList<String>();
        for (ColumnNode cn : columnNodes) {
            if (cn.getSemanticTypeStatus() != ColumnSemanticTypeStatus.UserAssigned)
                continue;
            // only the first user-assigned semantic type of a column is used
            SemanticType st = cn.getUserSemanticTypes().get(0);
            String domain = st.getDomain() == null ? null : st.getDomain().getUri();
            if (domain == null) {
                logger.warn("the domain is null for the semantic type: " + st.getModelLabelString());
                continue;
            }
            types.add(domain);
            typeColumnNodes.put(domain, cn);
        }
        if (types.isEmpty()) {
            logger.error("semantic type list is empty.");
            return null;
        }

        logger.info("finding related patterns ...");
        long start = System.currentTimeMillis();
        Set<Pattern> relatedPatterns = findRelatedPatterns(types, patterns, patternIndex);
        long timeFindRelatedPatterns = System.currentTimeMillis();
        float elapsedTimeSec = (timeFindRelatedPatterns - start) / 1000F;
        logger.info("time to find related patterns: " + elapsedTimeSec + "s");
        if (relatedPatterns == null || relatedPatterns.isEmpty()) {
            logger.info("no related pattern found for the source.");
            return null;
        }

        logger.info("sorting related patterns ...");
        List<Pattern> sortedPatterns = sortPatterns(relatedPatterns, types);
        long timeSortPatterns = System.currentTimeMillis();
        elapsedTimeSec = (timeSortPatterns - timeFindRelatedPatterns) / 1000F;
        logger.info("time to sort related patterns: " + elapsedTimeSec + "s");

        logger.info("finding minimal set of patterns with maximum covering of source types ...");
        List<Pattern> minimalSet = findMinimalCoveringSet(sortedPatterns, types);
        long timeFindMinimalCover = System.currentTimeMillis();
        elapsedTimeSec = (timeFindMinimalCover - timeSortPatterns) / 1000F;
        logger.info("time to find minimal set: " + elapsedTimeSec + "s");
        if (minimalSet == null || minimalSet.isEmpty()) {
            logger.error("cannot find a pattern set that covers at least one source type, this should not conceptually happen if we have even one related pattern");
            return null;
        }

        logger.info("combining patterns ...");
        DirectedWeightedMultigraph<Node, LabeledLink> graph = combinePatterns(minimalSet);
        long timeCombinePatterns = System.currentTimeMillis();
        elapsedTimeSec = (timeCombinePatterns - timeFindMinimalCover) / 1000F;
        logger.info("time to combine patterns: " + elapsedTimeSec + "s");
        if (graph == null || graph.edgeSet().isEmpty()) {
            logger.error("graph is null, this should not conceptually happen if we have even one related pattern");
            return null;
        }

        // Without an ontology manager we cannot add ontology paths or compute
        // a Steiner tree; return the combined pattern graph as the model.
        if (this.ontologyManager == null) {
            SemanticModel sm = new SemanticModel("", graph);
            return sm;
        }

        logger.info("add the paths from ontology ...");
        GraphBuilder graphBuilder = addOntologyPaths(graph);
        long timeAddOntologyPaths = System.currentTimeMillis();
        elapsedTimeSec = (timeAddOntologyPaths - timeCombinePatterns) / 1000F;
        // was mislabeled "time to combine patterns" (copy-paste)
        logger.info("time to add ontology paths: " + elapsedTimeSec + "s");

        logger.info("compute steiner tree ...");
        // Each internal node matching a (still unconsumed) source type becomes
        // a Steiner terminal; one column node is consumed per match so that a
        // type occurring k times selects up to k distinct internal nodes.
        List<Node> steinerNodes = new LinkedList<Node>();
        for (Node n : graph.vertexSet()) {
            if (n instanceof InternalNode) {
                if (typeColumnNodes.containsKey(n.getUri()) &&
                        typeColumnNodes.get(n.getUri()).iterator().hasNext()) {
                    ColumnNode cn = typeColumnNodes.get(n.getUri()).iterator().next();
                    typeColumnNodes.remove(n.getUri(), cn);
                    steinerNodes.add(n);
                }
            }
        }
        DirectedWeightedMultigraph<Node, LabeledLink> tree = getSteinerTree(graphBuilder, steinerNodes);
        long timeComputeSteinerNodes = System.currentTimeMillis();
        elapsedTimeSec = (timeComputeSteinerNodes - timeAddOntologyPaths) / 1000F;
        // was mislabeled "time to combine patterns" (copy-paste)
        logger.info("time to compute steiner tree: " + elapsedTimeSec + "s");

        // NOTE(review): the returned tree contains only internal nodes; the
        // column nodes are never attached back to it (the code that did so
        // was commented out) -- confirm this is intended.
        SemanticModel sm = new SemanticModel("", tree);
        return sm;
    }

    /**
     * Rounds d to at most k decimal places (trailing zeros dropped).
     * Uses US symbols so the decimal separator is always '.'; the original
     * formatted with the default locale and re-parsed with Double.valueOf,
     * which throws NumberFormatException in comma-decimal locales.
     */
    private static double roundDecimals(double d, int k) {
        StringBuilder pattern = new StringBuilder("#.");
        for (int i = 0; i < k; i++)
            pattern.append('#');
        DecimalFormat form = new DecimalFormat(pattern.toString(), new DecimalFormatSymbols(Locale.US));
        return Double.valueOf(form.format(d));
    }

    /**
     * Smoke test: builds column nodes for a few hard-coded CRM types, learns a
     * model without an ontology, and writes the result as a Graphviz file.
     */
    public static void simpleTest() {
        // get list of semantic types for a source
        List<SemanticType> types = new ArrayList<SemanticType>();
        SemanticType st1 = new SemanticType("", null, new Label("http://erlangen-crm.org/current/E39_Actor"), Origin.User, 1.0);
        SemanticType st2 = new SemanticType("", null, new Label("http://erlangen-crm.org/current/E22_Man-Made_Object"), Origin.User, 1.0);
        SemanticType st3 = new SemanticType("", null, new Label("http://erlangen-crm.org/current/E21_Person"), Origin.User, 1.0);
        SemanticType st4 = new SemanticType("", null, new Label("http://erlangen-crm.org/current/E55_Type"), Origin.User, 1.0);
        SemanticType st5 = new SemanticType("", null, new Label("http://www.w3.org/2004/02/skos/core#Concept"), Origin.User, 1.0);
        types.add(st1);
        types.add(st2);
        types.add(st3);
        types.add(st4);
        types.add(st5);

        List<ColumnNode> columnNodes = new LinkedList<ColumnNode>();
        for (SemanticType st : types) {
            ColumnNode cn = new ColumnNode(null, null, null, null);
            cn.assignUserType(st);
            columnNodes.add(cn);
        }

        ModelLearner_LOD_Greedy ml = new ModelLearner_LOD_Greedy(Params.PATTERNS_INPUT_DIR, null);
        SemanticModel sm = ml.hypothesize(columnNodes);
        String output = Params.RESULTS_DIR + "out.dot";
        if (sm == null) {
            logger.info("could not learn any model for the source");
            return;
        }
        try {
            sm.writeGraphviz(output, true, true);
        } catch (IOException e) {
            // log instead of printStackTrace so the failure reaches the app log
            logger.error("failed to write graphviz output to " + output, e);
        }
        logger.info("result is ready at: " + output);
    }

    /**
     * Batch evaluation: loads the ontology files, learns a model for every
     * source in the model directory, evaluates it against the correct model,
     * and writes per-source precision/recall/time to a CSV plus Graphviz
     * visualizations of the candidates.
     */
    public static void main(String[] args) throws Exception {
        ServletContextParameterMap contextParameters = ContextParametersRegistry.getInstance().getDefault();
        OntologyManager ontologyManager = new OntologyManager(contextParameters.getId());
        File ff = new File(Params.ONTOLOGY_DIR);
        File[] files = ff.listFiles();
        if (files == null) {
            logger.error("no ontology to import at " + ff.getAbsolutePath());
            return;
        }
        for (File f : files) {
            if (f.getName().endsWith(".owl") ||
                    f.getName().endsWith(".rdf") ||
                    f.getName().endsWith(".n3") ||
                    f.getName().endsWith(".ttl") ||
                    f.getName().endsWith(".xml")) {
                logger.info("Loading ontology file: " + f.getAbsolutePath());
                ontologyManager.doImport(f, "UTF-8");
            }
        }
        ontologyManager.updateCache();

        // NOTE(review): the ontology is loaded above but null is passed here,
        // so the ontology-path/Steiner-tree steps are skipped -- confirm this
        // is intentional (pass ontologyManager to enable them).
        ModelLearner_LOD_Greedy modelLearner = new ModelLearner_LOD_Greedy(Params.PATTERNS_INPUT_DIR, null);

        String outputPath = Params.OUTPUT_DIR;
        List<SemanticModel> semanticModels =
                ModelReader.importSemanticModelsFromJsonFiles(Params.MODEL_DIR, Params.MODEL_MAIN_FILE_EXT);

        boolean randomModel = false;
        boolean useCorrectType = true;
        int numberOfCRFCandidates = 1;

        String filePath = Params.RESULTS_DIR;
        String filename = "";
        filename += "results,k=" + numberOfCRFCandidates;
        filename += useCorrectType ? "-correct types" : "";
        filename += randomModel ? "-random" : "";
        filename += ".csv";

        // try-with-resources so the CSV is closed (and flushed) even when a
        // source in the loop throws; the original leaked the writer on error
        try (PrintWriter resultFile = new PrintWriter(new File(filePath + filename))) {
            resultFile.println("source \t p \t r \t t \n");
            for (int i = 0; i < semanticModels.size(); i++) {
                SemanticModel newSource = semanticModels.get(i);
                logger.info("======================================================");
                logger.info(newSource.getName() + "(#attributes:" + newSource.getColumnNodes().size() + ")");
                System.out.println(newSource.getName() + "(#attributes:" + newSource.getColumnNodes().size() + ")");
                logger.info("======================================================");

                SemanticModel correctModel = newSource;
                List<ColumnNode> columnNodes = correctModel.getColumnNodes();

                long start = System.currentTimeMillis();
                SemanticModel sm = modelLearner.hypothesize(columnNodes);
                long elapsedTimeMillis = System.currentTimeMillis() - start;
                float elapsedTimeSec = elapsedTimeMillis / 1000F;

                List<SortableSemanticModel> topHypotheses = new LinkedList<SortableSemanticModel>();
                if (sm != null)
                    topHypotheses.add(new SortableSemanticModel(sm, false));

                Map<String, SemanticModel> models = new TreeMap<String, SemanticModel>();
                models.put("1-correct model", correctModel);
                for (int k = 0; k < topHypotheses.size(); k++) {
                    SortableSemanticModel m = topHypotheses.get(k);
                    ModelEvaluation me = m.evaluate(correctModel);
                    String label = "candidate" + k +
                            (m.getSteinerNodes() == null ? "" : m.getSteinerNodes().getScoreDetailsString()) +
                            "cost:" + roundDecimals(m.getCost(), 6) +
                            "-precision:" + me.getPrecision() +
                            "-recall:" + me.getRecall();
                    models.put(label, m);
                    if (k == 0) { // first rank model
                        System.out.println("precision: " + me.getPrecision() +
                                ", recall: " + me.getRecall() +
                                ", time: " + elapsedTimeSec);
                        logger.info("precision: " + me.getPrecision() +
                                ", recall: " + me.getRecall() +
                                ", time: " + elapsedTimeSec);
                        String s = newSource.getName() + "\t" + me.getPrecision() + "\t" + me.getRecall() + "\t" + elapsedTimeSec;
                        resultFile.println(s);
                    }
                }

                String outName = outputPath + newSource.getName() + Params.GRAPHVIS_OUT_DETAILS_FILE_EXT;
                GraphVizUtil.exportSemanticModelsToGraphviz(
                        models,
                        newSource.getName(),
                        outName,
                        GraphVizLabelType.LocalId,
                        GraphVizLabelType.LocalUri,
                        true,
                        true);
            }
        }
    }
}