package edu.isi.karma.research.modeling;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.jgrapht.graph.DirectedWeightedMultigraph;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import edu.isi.karma.modeling.alignment.LinkIdFactory;
import edu.isi.karma.modeling.alignment.NodeIdFactory;
import edu.isi.karma.modeling.research.Params;
import edu.isi.karma.rep.alignment.InternalNode;
import edu.isi.karma.rep.alignment.Label;
import edu.isi.karma.rep.alignment.LabeledLink;
import edu.isi.karma.rep.alignment.Node;
import edu.isi.karma.rep.alignment.ObjectPropertyLink;
import edu.isi.karma.rep.alignment.ObjectPropertyType;
public class PatternGenerator {
List<Pattern> basicPatterns;
private HashMap<String,List<Connection>> possibleConnections;
private String outputDir;
private VirtuosoConnector virtuosoConnector;
enum Direction {IN , OUT, BOTH}
private class Connection {
String linkUri;
String nodeUri;
Direction direction;
public Connection(String linkUri, String nodeUri, Direction direction) {
this.linkUri = linkUri;
this.nodeUri = nodeUri;
this.direction = direction;
}
}
public PatternGenerator(String inputDir, String outputDir, VirtuosoConnector vc) {
this.basicPatterns = new LinkedList<Pattern>();
this.possibleConnections = new HashMap<String,List<Connection>>();
this.outputDir = outputDir;
this.virtuosoConnector = vc;
Map<String,Pattern> patternsLengthOne = PatternReader.importPatterns(inputDir, 1);
if (patternsLengthOne != null) {
this.basicPatterns.addAll(patternsLengthOne.values());
}
this.computePossibleConnections(Direction.OUT);
}
public void computePossibleConnections(Direction direction) {
for (Pattern p: this.basicPatterns) {
DirectedWeightedMultigraph<Node, LabeledLink> g = p.getGraph();
if (g == null)
continue;
if (g.edgeSet().size() == 0 ||
g.edgeSet().size() > 1)
continue;
for (Node n : g.vertexSet()) {
if (this.possibleConnections.get(n.getUri()) == null) {
this.possibleConnections.put(n.getUri(), new LinkedList<Connection>());
}
}
}
for (String s : this.possibleConnections.keySet()) {
for (Pattern p: this.basicPatterns) {
DirectedWeightedMultigraph<Node, LabeledLink> g = p.getGraph();
if (g == null)
continue;
if (g.edgeSet().size() == 0 ||
g.edgeSet().size() > 1)
continue;
LabeledLink l = g.edgeSet().iterator().next();
String sourceUri = l.getSource().getUri();
String targetUri = l.getTarget().getUri();
if (direction != Direction.IN && sourceUri.equalsIgnoreCase(s)) {
Connection c = new Connection(l.getUri(), l.getTarget().getUri(), Direction.OUT);
this.possibleConnections.get(s).add(c);
} else if (direction != Direction.OUT && targetUri.equalsIgnoreCase(s)) {
Connection c = new Connection(l.getUri(), l.getSource().getUri(), Direction.IN);
this.possibleConnections.get(s).add(c);
}
}
}
}
public Set<Node> getNodesWithUri(String uri, DirectedWeightedMultigraph<Node, LabeledLink> g) {
Set<Node> nodes = new HashSet<Node>();
if (uri == null || g == null || g.vertexSet().isEmpty())
return nodes;
for (Node n : g.vertexSet())
if (n.getUri().equalsIgnoreCase(uri))
nodes.add(n);
return nodes;
}
@SuppressWarnings("unchecked")
public List<Pattern> getPatterns(int length,
int instanceLimit,
boolean includeShorterPatterns,
List<Pattern> seeds,
int seedLength) {
List<Pattern> results = new LinkedList<Pattern>();
if (length <= 0 || length < seedLength) {
return results;
} else if (length == seedLength) {
return seeds;
} else {
List<Pattern> shorterPatterns = getPatterns(length-1, instanceLimit, includeShorterPatterns, seeds, seedLength);
if (includeShorterPatterns &&
(length-1) > seedLength &&
shorterPatterns != null)
results.addAll(shorterPatterns);
for (Pattern p : shorterPatterns) {
// try to connect patterns of length one to nodes of a pattern (join)
DirectedWeightedMultigraph<Node, LabeledLink> g = p.getGraph();
if (g == null)
continue;
Node source, target;
for (Node n : g.vertexSet()) {
List<Connection> connections = this.possibleConnections.get(n.getUri());
if (connections == null || connections.size() == 0)
continue;
for (Connection c : connections) {
NodeIdFactory nodeIdFactory = p.getNodeIdFactory().clone();
if (nodeIdFactory.lastIndexOf(c.nodeUri) == instanceLimit)
continue;
String newNodeId = nodeIdFactory.getNodeId(c.nodeUri);
Node newNode = new InternalNode(newNodeId, new Label(c.nodeUri));
DirectedWeightedMultigraph<Node, LabeledLink> newG =
(DirectedWeightedMultigraph<Node, LabeledLink>)g.clone();
newG.addVertex(newNode);
if (c.direction == Direction.OUT) {
source = n;
target = newNode;
} else {
source = newNode;
target = n;
}
String newLinkId = LinkIdFactory.getLinkId(c.linkUri, source.getId(), target.getId());
LabeledLink newLink = new ObjectPropertyLink(newLinkId, new Label(c.linkUri), ObjectPropertyType.None);
newG.addEdge(source, target, newLink);
Pattern newP = new Pattern(length, 0, newG, nodeIdFactory);
results.add(newP);
// add links to existing nodes in the pattern
Set<Node> nodesInPatternWithSameURI = this.getNodesWithUri(c.nodeUri, g);
for (Node existingNode : nodesInPatternWithSameURI) {
if (c.direction == Direction.OUT) {
source = n;
target = existingNode;
} else {
source = existingNode;
target = n;
}
newLinkId = LinkIdFactory.getLinkId(c.linkUri, source.getId(), target.getId());
newLink = new ObjectPropertyLink(newLinkId, new Label(c.linkUri), ObjectPropertyType.None);
if (g.containsEdge(newLink))
continue;
newG = (DirectedWeightedMultigraph<Node, LabeledLink>)g.clone();
newG.addEdge(source, target, newLink);
newP = new Pattern(length, 0, newG, p.getNodeIdFactory().clone());
results.add(newP);
}
}
}
}
System.out.println("***** " + "patterns with length " + length + " *****");
results = getValidPatterns(results);
printPatterns(length, results);
return results;
}
}
@SuppressWarnings("unchecked")
public List<Pattern> getPatterns(int length, int instanceLimit, boolean includeShorterPatterns) {
List<Pattern> results = new LinkedList<Pattern>();
if (length <= 0) {
return results;
} else if (length == 1) {
results = this.basicPatterns;
} else {
List<Pattern> shorterPatterns = getPatterns(length-1, instanceLimit, includeShorterPatterns);
if (includeShorterPatterns && shorterPatterns != null)
results.addAll(shorterPatterns);
for (Pattern p : shorterPatterns) {
// try to connect patterns of length one to nodes of a pattern (join)
DirectedWeightedMultigraph<Node, LabeledLink> g = p.getGraph();
if (g == null)
continue;
Node source, target;
for (Node n : g.vertexSet()) {
List<Connection> connections = this.possibleConnections.get(n.getUri());
if (connections == null || connections.size() == 0)
continue;
for (Connection c : connections) {
NodeIdFactory nodeIdFactory = p.getNodeIdFactory().clone();
if (nodeIdFactory.lastIndexOf(c.nodeUri) == instanceLimit)
continue;
String newNodeId = nodeIdFactory.getNodeId(c.nodeUri);
Node newNode = new InternalNode(newNodeId, new Label(c.nodeUri));
DirectedWeightedMultigraph<Node, LabeledLink> newG =
(DirectedWeightedMultigraph<Node, LabeledLink>)g.clone();
newG.addVertex(newNode);
if (c.direction == Direction.OUT) {
source = n;
target = newNode;
} else {
source = newNode;
target = n;
}
String newLinkId = LinkIdFactory.getLinkId(c.linkUri, source.getId(), target.getId());
LabeledLink newLink = new ObjectPropertyLink(newLinkId, new Label(c.linkUri), ObjectPropertyType.None);
newG.addEdge(source, target, newLink);
Pattern newP = new Pattern(length, 0, newG, nodeIdFactory);
results.add(newP);
// add links to existing nodes in the pattern
Set<Node> nodesInPatternWithSameURI = this.getNodesWithUri(c.nodeUri, g);
for (Node existingNode : nodesInPatternWithSameURI) {
if (c.direction == Direction.OUT) {
source = n;
target = existingNode;
} else {
source = existingNode;
target = n;
}
newLinkId = LinkIdFactory.getLinkId(c.linkUri, source.getId(), target.getId());
newLink = new ObjectPropertyLink(newLinkId, new Label(c.linkUri), ObjectPropertyType.None);
if (g.containsEdge(newLink))
continue;
newG = (DirectedWeightedMultigraph<Node, LabeledLink>)g.clone();
newG.addEdge(source, target, newLink);
newP = new Pattern(length, 0, newG, p.getNodeIdFactory().clone());
results.add(newP);
}
}
}
}
}
System.out.println("***** " + "patterns with length " + length + " *****");
results = getValidPatterns(results);
printPatterns(length, results);
return results;
}
private void printPatterns(int length, List<Pattern> patterns) {
File outRootDir = new File(outputDir);
if (outRootDir.exists() && outRootDir.isDirectory()) {
String outPath = outRootDir.getAbsolutePath() + "/" + length;
File outDir = new File(outPath);
if (outDir.exists()) {
try {
FileUtils.deleteDirectory(outDir);
} catch (IOException e) {
e.printStackTrace();
return;
}
}
if (!outDir.mkdir()) {
System.out.println("could not create the diretcory: " + outPath);
return;
}
String filename;
for (Pattern p : patterns) {
filename = outPath + "/" + p.getId() + ".json";
try {
p.writeJson(filename);
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
private List<Pattern> getValidPatterns(List<Pattern> patterns) {
List<Pattern> distinctPatterns, validPatterns;
System.out.println("number of patterns before removing duplicates: " + patterns.size());
distinctPatterns = removeDuplicates(patterns);
System.out.println("number of patterns after removing duplicates: " + distinctPatterns.size());
validPatterns = getPatternCount(distinctPatterns);
System.out.println("number of existing patterns in the data: " + validPatterns.size());
return validPatterns;
}
private List<Pattern> removeDuplicates(List<Pattern> patterns) {
List<Pattern> results = new LinkedList<Pattern>();
if (patterns == null || patterns.isEmpty())
return results;
HashSet<Integer> hashTable = new HashSet<Integer>();
int hash;
for (Pattern p :patterns) {
hash = p.getLabel().hashCode();
if (hashTable.contains(hash)) continue;
hashTable.add(hash);
results.add(p);
}
// Collections.sort(patterns, new PatternLabelComparator());
// for (int i = 0; i < patterns.size(); i++) {
// if (i > 0 && patterns.get(i).getLabel().equalsIgnoreCase(patterns.get(i-1).getLabel()))
// continue;
// results.add(patterns.get(i));
// }
return results;
}
private List<Pattern> getPatternCount(List<Pattern> patterns) {
List<Pattern> results = new LinkedList<Pattern>();
if (patterns == null || patterns.isEmpty())
return results;
if (this.virtuosoConnector == null)
return results;
VirtuosoManager vm = new VirtuosoManager(this.virtuosoConnector);
RepositoryConnection con;
try {
con = vm.getConnection();
} catch (RepositoryException e) {
e.printStackTrace();
return results;
}
int count;
int patternNumber = 1;
for (Pattern p : patterns) {
// System.out.println(p.toSparql(this.virtuosoConnector.getGraphIRI()));
count = vm.getPatternCount(con, p.toSparql(this.virtuosoConnector.getGraphIRI()));
if (count != 0) {
p.setFrequency(count);
results.add(p);
}
System.out.println("pattern " + patternNumber++ + "/" + patterns.size() +
", length: " + p.getLength() +
", count: " + count);
if (patternNumber % 5000 == 0) {
System.out.println("getting new connection ...");
try {
vm.closeConnection(con);
} catch (RepositoryException e) {
e.printStackTrace();
}
try {
con = vm.getConnection();
} catch (RepositoryException e) {
e.printStackTrace();
}
}
}
try {
vm.closeConnection(con);
} catch (RepositoryException e) {
e.printStackTrace();
return results;
}
return results;
}
private static void prunePatterns() throws IOException {
int length = 5;
int matches = 0;
Node domain, source, target;
String lodDSName = "ds29";
// String lodDSName = "saam";
// String lodDSName = "musicbrainz";
boolean chain = false;
boolean timespan = false;
boolean duplicate = false;
boolean removeUris = false;
boolean twoAggregatedCHOs = true;
File f = new File(Params.SOURCE_DIR);
File[] files = f.listFiles();
String sourcename, filename;
List<String> removeUriList = new LinkedList<String>();
removeUriList.add("http://purl.org/ontology/mo/Playlist");
removeUriList.add("http://www.w3.org/TR/owl-time/Interval");
for (int i = 0; i < files.length; i++)
{
if (lodDSName.equals("ds29")) {
File file = files[i];
filename = file.getName();
System.out.println("processing " + filename + " ...");
sourcename = filename.substring(0, filename.lastIndexOf("."));
} else {
System.out.println("processing " + lodDSName + " ...");
sourcename = lodDSName;
i = files.length;
}
matches = 0;
for (int j = 1; j <= length; j++) {
System.out.println("reading patterns with length " + j);
File f1 = new File(Params.LOD_DIR + sourcename + "/" + Params.PATTERNS_OUTPUT_DIR + "/" + j);
File[] patternFiles = f1.listFiles();
if (files != null)
for (File f2 : patternFiles) {
Pattern p = Pattern.readJson(f2.getAbsolutePath());
if (chain) {
for (Node n : p.getGraph().vertexSet()) {
if (p.getGraph().outDegreeOf(n) > 1) {
matches++;
// System.out.println(p.getPrintStr());
// if (!f2.delete())
// System.out.println("error in deleting the file " + f2.getAbsolutePath());
break;
}
}
}
if (twoAggregatedCHOs) {
boolean existAggregatedCHO = false;
for (LabeledLink l : p.getGraph().edgeSet()) {
source = l.getSource();
target = l.getTarget();
if (!existAggregatedCHO && target.getUri().equalsIgnoreCase("http://www.americanartcollaborative.org/ontology/CulturalHeritageObject")
&& l.getUri().equalsIgnoreCase("http://www.europeana.eu/schemas/edm/aggregatedCHO")) {
existAggregatedCHO = true;
} else if (target.getUri().equalsIgnoreCase("http://www.americanartcollaborative.org/ontology/CulturalHeritageObject")
&& l.getUri().equalsIgnoreCase("http://www.europeana.eu/schemas/edm/aggregatedCHO")) {
matches ++;
// System.out.println(p.getPrintStr());
if (!f2.delete())
System.out.println("error in deleting the file " + f2.getAbsolutePath());
}
}
}
if (timespan) {
domain = null;
for (LabeledLink l : p.getGraph().edgeSet()) {
source = l.getSource();
target = l.getTarget();
if (target.getUri().equalsIgnoreCase("http://erlangen-crm.org/current/E52_Time-Span")) {
if (domain == null) {
domain = source;
} else if (source.equals(domain)) {
matches ++;
// System.out.println(p.getPrintStr());
// if (!f2.delete())
// System.out.println("error in deleting the file " + f2.getAbsolutePath());
}
}
}
}
if (removeUris) {
for (Node n : p.getGraph().vertexSet()) {
for (String uri : removeUriList) {
if (n.getUri().equalsIgnoreCase(uri)) {
matches++;
// System.out.println(p.getPrintStr());
if (!f2.delete())
System.out.println("error in deleting the file " + f2.getAbsolutePath());
break;
}
}
}
}
boolean visited = false;
if (duplicate) {
for (Node n : p.getGraph().vertexSet()) {
if (!visited && n.getUri().equals("http://erlangen-crm.org/current/E12_Production")) {
visited = true;
} else if (n.getUri().equals("http://erlangen-crm.org/current/E12_Production")) {
matches++;
// System.out.println(p.getPrintStr());
// if (!f2.delete())
// System.out.println("error in deleting the file " + f2.getAbsolutePath());
break;
}
}
}
}
}
System.out.println(matches);
}
}
private static void generatePatternsFromSeeds() throws IOException {
int length = 5;
int seedLength = 4;
int instanceLimit = 2;
boolean includeShorterPatterns = false;
String instance = "fusionRepository.isi.edu";
int port = 1140;
// int port = 1300;
String username = "dba";
String password = "dba";
int queryTimeout = 1;
String sourcename = "musicbrainz";
String patternInputDirStr = Params.LOD_DIR + sourcename + "/" + Params.PATTERNS_INPUT_DIR;
String patternOutputDirStr = Params.LOD_DIR + sourcename + "/" + Params.PATTERNS_OUTPUT_DIR;
VirtuosoConnector vc = new VirtuosoConnector(instance, port, username, password);
vc.setQueryTimeout(queryTimeout);
PatternGenerator pg = new PatternGenerator(patternInputDirStr, patternOutputDirStr, vc);
System.out.println("reading patterns with length " + seedLength);
List<Pattern> seeds = new LinkedList<Pattern>();
File f = new File(patternOutputDirStr);
for (int i = 0; i < length; i++) {
File f1 = new File(f.getAbsoluteFile() + "/" + seedLength);
File[] files = f1.listFiles();
if (files != null)
for (File f2 : files) {
Pattern p = Pattern.readJson(f2.getAbsolutePath());
seeds.add(p);
}
}
System.out.println("finished reading patterns.");
long start = System.currentTimeMillis();
List<Pattern> patterns = pg.getPatterns(length, instanceLimit, includeShorterPatterns, seeds, seedLength);
long patternGeneraionTime = System.currentTimeMillis();
System.out.println("================================================================================");
System.out.println("time to generate patterns: " + (patternGeneraionTime - start)/1000F);
System.out.println("number of possible patterns: " + patterns.size());
System.out.println("================================================================================");
}
private static void generatePatterns() {
int length = 5;
int instanceLimit = 2;
boolean includeShorterPatterns = false;
String instance = "fusionRepository.isi.edu";
// int port = 1140;
// int port = 1300;
int port = 1500;
String username = "dba";
String password = "dba";
String baseGraph = "http://weapon-lod/";
int queryTimeout = 5;
String graphIRI;
File f = new File(Params.SOURCE_DIR);
File[] files = f.listFiles();
String sourcename, filename;
String patternInputDirStr, patternOutputDirStr;
// int i = 4; {
for (int i = 0; i < files.length; i++) {
File file = files[i];
filename = file.getName();
System.out.println("processing " + filename + " ...");
sourcename = filename.substring(0, filename.lastIndexOf("."));
patternInputDirStr = Params.LOD_DIR + sourcename + "/" + Params.PATTERNS_INPUT_DIR;
patternOutputDirStr = Params.LOD_DIR + sourcename + "/" + Params.PATTERNS_OUTPUT_DIR;
File patternOutputDir = new File(patternOutputDirStr);
if (!patternOutputDir.exists()) {
patternOutputDir.mkdir();
}
graphIRI = baseGraph + sourcename;
VirtuosoConnector vc = new VirtuosoConnector(instance, port, username, password, graphIRI);
vc.setQueryTimeout(queryTimeout);
PatternGenerator pg = new PatternGenerator(patternInputDirStr, patternOutputDirStr, vc);
long start = System.currentTimeMillis();
List<Pattern> patterns = pg.getPatterns(length, instanceLimit, includeShorterPatterns);
long patternGeneraionTime = System.currentTimeMillis();
System.out.println("================================================================================");
System.out.println("time to generate patterns: " + (patternGeneraionTime - start)/1000F);
System.out.println("number of possible patterns: " + patterns.size());
System.out.println("================================================================================");
System.out.println("done with " + filename + ".");
}
}
private static void generatePatterns2() {
int length = 4;
int instanceLimit = 2;
boolean includeShorterPatterns = false;
String instance = "fusionRepository.isi.edu";
int port = 1140;
// int port = 1300;
String username = "dba";
String password = "dba";
int queryTimeout = 1;
String sourcename = "musicbrainz";
String patternInputDirStr, patternOutputDirStr;
patternInputDirStr = Params.LOD_DIR + sourcename + "/" + Params.PATTERNS_INPUT_DIR;
patternOutputDirStr = Params.LOD_DIR + sourcename + "/" + Params.PATTERNS_OUTPUT_DIR;
File patternOutputDir = new File(patternOutputDirStr);
if (!patternOutputDir.exists()) {
patternOutputDir.mkdir();
}
VirtuosoConnector vc = new VirtuosoConnector(instance, port, username, password, null);
vc.setQueryTimeout(queryTimeout);
PatternGenerator pg = new PatternGenerator(patternInputDirStr, patternOutputDirStr, vc);
long start = System.currentTimeMillis();
List<Pattern> patterns = pg.getPatterns(length, instanceLimit, includeShorterPatterns);
long patternGeneraionTime = System.currentTimeMillis();
System.out.println("================================================================================");
System.out.println("time to generate patterns: " + (patternGeneraionTime - start)/1000F);
System.out.println("number of possible patterns: " + patterns.size());
System.out.println("================================================================================");
System.out.println("done.");
}
public static void main(String[] args) throws IOException {
boolean generatePatternsFromSeeds = false;
if (generatePatternsFromSeeds) generatePatternsFromSeeds();
else generatePatterns();
// two production URIs that have two time span
//uri1: nodeID://b4981707
//uri2: nodeID://b4988056
// prunePatterns();
System.out.println("done.");
}
}