package doser.word2vec.dbpediaGraphThinning; import java.io.File; import java.io.FileNotFoundException; import java.io.PrintWriter; import java.util.HashSet; import java.util.Set; import org.apache.log4j.Logger; import com.hp.hpl.jena.query.QueryException; import com.hp.hpl.jena.query.QueryExecution; import com.hp.hpl.jena.query.QueryExecutionFactory; import com.hp.hpl.jena.query.QueryFactory; import com.hp.hpl.jena.query.QuerySolution; import com.hp.hpl.jena.query.ResultSet; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import com.hp.hpl.jena.rdf.model.Property; import com.hp.hpl.jena.rdf.model.RDFNode; import com.hp.hpl.jena.rdf.model.Resource; import com.hp.hpl.jena.rdf.model.Statement; import com.hp.hpl.jena.rdf.model.StmtIterator; import word2vec.tools.Word2VecModel; public class DbpediaGraphModification { public final static String OUTPUTFILE = "/home/zwicklbauer/word2vec/MSEDbpediaCategories.txt"; private Model m; private Model skosModel; private Word2VecModel w2v; public DbpediaGraphModification() { super(); this.m = ModelFactory.createDefaultModel(); this.m.read("/home/zwicklbauer/HDTGeneration/article_categories_en.nt"); this.skosModel = ModelFactory.createDefaultModel(); this.skosModel.read("/home/zwicklbauer/HDTGeneration/skos_categories_en.nt"); this.w2v = Word2VecModel .createWord2VecModel("/mnt/ssd1/disambiguation/word2vec/wikientitymodel_min5.seq"); } public Set<String> initializeCategories() { Model model = ModelFactory.createDefaultModel(); model.read("/home/zwicklbauer/HDTGeneration/skos_categories_en.nt"); StmtIterator it = model.listStatements(); Set<String> set = new HashSet<String>(); System.out.println("Los gehts"); while (it.hasNext()) { Statement s = it.next(); Resource r = s.getSubject(); Property p = s.getPredicate(); RDFNode n = s.getObject(); if (p.getURI().equalsIgnoreCase( "http://www.w3.org/2004/02/skos/core#broader") && n.isResource()) { Resource target = n.asResource(); if(!hasSubCategory(target.getURI())) set.add(target.getURI()); if(!hasSubCategory(r.getURI())) set.add(r.getURI()); } } return set; } private boolean hasSubCategory(String uri) { final String query = "SELECT ?entities WHERE{ ?types <http://www.w3.org/2004/02/skos/core#broader> <" + uri + ">. }"; boolean hasSubtype = false; try { final com.hp.hpl.jena.query.Query cquery = QueryFactory .create(query); final QueryExecution qexec = QueryExecutionFactory .create(cquery, skosModel); final ResultSet results = qexec.execSelect(); while (results.hasNext()) { hasSubtype = true; break; } } catch (final QueryException e) { Logger.getRootLogger().error(e.getStackTrace()); } return hasSubtype; } private Set<String> queryEntitiesFromCategory(final String catUri) { Set<String> set = new HashSet<String>(); final String query = "SELECT ?entities WHERE{ ?entities <http://purl.org/dc/terms/subject> <" + catUri + ">. }"; try { final com.hp.hpl.jena.query.Query cquery = QueryFactory .create(query); final QueryExecution qexec = QueryExecutionFactory .create(cquery, m); final ResultSet results = qexec.execSelect(); while (results.hasNext()) { final QuerySolution sol = results.nextSolution(); set.add(sol.getResource("entities").getURI() .replaceAll("http://dbpedia.org/resource/", "")); } } catch (final QueryException e) { Logger.getRootLogger().error(e.getStackTrace()); } return set; } public float computeDistance(String[] words) { return this.w2v.computeMSE(words); } public static void main(String[] args) { File file = new File(OUTPUTFILE); PrintWriter writer = null; try { writer = new PrintWriter(file); DbpediaGraphModification mod = new DbpediaGraphModification(); Set<String> categories = mod.initializeCategories(); for (String cat : categories) { Set<String> entities = mod.queryEntitiesFromCategory(cat); String[] entArr = new String[entities.size()]; entArr = entities.toArray(entArr); float distance = mod.computeDistance(entArr); writer.println(distance+"\t"+cat); } writer.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } } }