package word2vec.corpuscreation; import java.io.FileNotFoundException; import java.io.PrintWriter; import java.util.Random; import java.util.Set; import org.jgrapht.UndirectedGraph; import org.jgrapht.graph.DefaultEdge; import org.jgrapht.graph.SimpleGraph; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.RDFNode; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.rdf.model.Statement; import com.hp.hpl.jena.rdf.model.StmtIterator; public class CreateRandomDBpediaModel { public static final int JUMPPROBABILITY = 10; public static final int STEPNR = 500000000; public static final String INFOBOXPROPERTIES = "/home/zwicklbauer/HDTGeneration/infobox_properties_en.nt"; public static final String MAPPINGPROPERTIES = "/home/zwicklbauer/HDTGeneration/mappingbased_properties_cleaned_en.nt"; public static final String ARTICLECATEGORIES = "/home/zwicklbauer/HDTGeneration/article_categories_en.nt"; public static final String SKOSBROADER = "/home/zwicklbauer/HDTGeneration/skos_categories_en.nt"; public static final String MODELPATH = "/home/zwicklbauer/word2vec/dbpediamodel_Categories.dat"; private Random random; private UndirectedGraph<String, DefaultEdge> graph; private String[] vertexes; public CreateRandomDBpediaModel() { super(); this.graph = new SimpleGraph<String, DefaultEdge>(DefaultEdge.class); this.random = new Random(); } public void createDBpediaGraph() { System.out.println("Create DBpediaGraph"); System.out.println("Add Facts"); addFactsToGraph(); System.out.println("Add Categories"); addCategoriesToGraph(); System.out.println("Add SkosBroader"); addSkosBroaderToGraph(); Set<String> v = graph.vertexSet(); this.vertexes = new String[v.size()]; this.vertexes = v.toArray(this.vertexes); } public void createWord2VecModel() { try { PrintWriter writer = new PrintWriter(MODELPATH); int counter = 0; String init = null; while (init == null) { init = performRandomJump(); } writer.write(init.replaceAll("http://dbpedia.org/resource/", "")); while (counter < STEPNR) { init = performNextStep(init); String output = init.replaceAll("http://dbpedia.org/resource/", ""); writer.write(output + " "); counter++; } writer.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } } private String performNextStep(String current) { String result = null; String randomjump = performRandomJump(); if (randomjump == null) { if (graph.containsVertex(current)) { Set<DefaultEdge> edgeSet = graph.edgesOf(current); DefaultEdge[] edges = new DefaultEdge[edgeSet.size()]; edges = edgeSet.toArray(edges); DefaultEdge def = edges[random.nextInt(edges.length)]; String source = graph.getEdgeSource(def); String target = graph.getEdgeTarget(def); String relevant = null; if (source.equalsIgnoreCase(current)) { relevant = target; } else { relevant = source; } result = relevant; if (relevant.contains("Category:")) { relevant = performNextStep(current); result = relevant; } } else { result = performSaveRandomJump(); } } else { result = randomjump; } if (result.contains("__")) { String[] splitter = result.split("__"); result = splitter[0]; } return result; } private String performRandomJump() { String result = null; int randomint = random.nextInt(100); if (randomint < JUMPPROBABILITY) { int jump = random.nextInt(vertexes.length); result = vertexes[jump]; while (result.contains("Category:")) { jump = random.nextInt(vertexes.length); result = vertexes[jump]; } } return result; } private String performSaveRandomJump() { String result = null; int jump = random.nextInt(vertexes.length); result = vertexes[jump]; while (result.contains("Category:")) { jump = random.nextInt(vertexes.length); result = vertexes[jump]; } return result; } private void addCategoriesToGraph() { Model m = ModelFactory.createDefaultModel(); m.read(ARTICLECATEGORIES); StmtIterator it = m.listStatements(); int counter = 0; while (it.hasNext()) { if (counter % 10000 == 0) { System.out.println(counter); } Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith( "http://dbpedia.org/resource/")) { if (!subject.getURI().equalsIgnoreCase(obj.getURI())) { graph.addVertex(subject.getURI()); graph.addVertex(obj.getURI()); graph.addEdge(subject.getURI(), obj.getURI()); } } } } counter++; } private void addSkosBroaderToGraph() { Model m = ModelFactory.createDefaultModel(); m.read(SKOSBROADER); StmtIterator it = m.listStatements(); while (it.hasNext()) { Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith( "http://dbpedia.org/resource/")) { if (!subject.getURI().equalsIgnoreCase(obj.getURI())) { graph.addVertex(subject.getURI()); graph.addVertex(obj.getURI()); graph.addEdge(subject.getURI(), obj.getURI()); } } } } } private void addFactsToGraph() { Model m = ModelFactory.createDefaultModel(); m.read(INFOBOXPROPERTIES); StmtIterator it = m.listStatements(); int counter = 0; while (it.hasNext()) { if (counter % 10000 == 0) { System.out.println(counter); } Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith( "http://dbpedia.org/resource/")) { if (!subject.getURI().equalsIgnoreCase(obj.getURI())) { graph.addVertex(subject.getURI()); graph.addVertex(obj.getURI()); graph.addEdge(subject.getURI(), obj.getURI()); } } } counter++; } m = ModelFactory.createDefaultModel(); m.read(MAPPINGPROPERTIES); it = m.listStatements(); counter = 0; while (it.hasNext()) { if (counter % 10000 == 0) { System.out.println(counter); } Statement s = it.next(); Resource subject = s.getSubject(); Property pra = s.getPredicate(); RDFNode object = s.getObject(); if (object.isResource()) { Resource obj = object.asResource(); if (pra.isResource() && obj.getURI().startsWith( "http://dbpedia.org/resource/")) { if (!subject.getURI().equalsIgnoreCase(obj.getURI())) { graph.addVertex(subject.getURI()); graph.addVertex(obj.getURI()); graph.addEdge(subject.getURI(), obj.getURI()); } } } counter++; } } public static void main(String[] args) { CreateRandomDBpediaModel model = new CreateRandomDBpediaModel(); model.createDBpediaGraph(); model.createWord2VecModel(); } }