package doser.word2vec.dbpediaGraphThinning; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.PrintWriter; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Random; import java.util.Set; import org.apache.log4j.Logger; import com.hp.hpl.jena.query.QueryException; import com.hp.hpl.jena.query.QueryExecution; import com.hp.hpl.jena.query.QueryExecutionFactory; import com.hp.hpl.jena.query.QueryFactory; import com.hp.hpl.jena.query.QuerySolution; import com.hp.hpl.jena.query.ResultSet; import com.hp.hpl.jena.rdf.model.Model; import com.hp.hpl.jena.rdf.model.ModelFactory; import word2vec.tools.Word2VecModel; public class FullyEvaluateCategories { public static final int NROFCHOSENCATEGORIES = 50000; private Word2VecModel w2c; private Random random; private Model m; public FullyEvaluateCategories() { super(); this.random = new Random(); this.m = ModelFactory.createDefaultModel(); this.m.read("/home/zwicklbauer/HDTGeneration/article_categories_en.nt"); this.w2c = Word2VecModel.createWord2VecModel("/mnt/ssd1/disambiguation/word2vec/wikientitymodel_min5.seq"); } public String[] createCategorySet() { File file = new File( "/home/zwicklbauer/word2vec/MSEDbpediaCategories.txt"); BufferedReader reader = null; String line = null; List<String> set = new LinkedList<String>(); int counter = 0; try { reader = new BufferedReader(new FileReader(file)); while ((line = reader.readLine()) != null) { String splitter[] = line.split("\\t"); float score = Float.valueOf(splitter[0]); if (score > 0 && score < 0.04) { set.add(splitter[1]); } if (score > 0) { counter++; } } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } } System.out.println("Category Relation: " + set.size() + " of " + counter); String[] arr = new String[set.size()]; arr = set.toArray(arr); return arr; } public void evaluate(String[] categories) { try { PrintWriter writer = new PrintWriter(new File( "/home/zwicklbauer/purityInCategories.dat")); int counter = 0; while (counter < NROFCHOSENCATEGORIES) { String cat = categories[this.random.nextInt(categories.length)]; Set<String> set = queryEntitiesFromCategory(cat); float sum = 0; float comp = 0; for(String e1 : set) { for(String e2 : set) { if(!e1.equalsIgnoreCase(e2)) { float score = w2c.computeSimilarity(e1, e2); if(score > -2) { sum += score; comp++; } } } } System.out.println(cat + "\t"+(sum/comp)); writer.println("0\t"+(sum/comp)); counter++; } writer.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } } private Set<String> queryEntitiesFromCategory(final String catUri) { Set<String> set = new HashSet<String>(); final String query = "SELECT ?entities WHERE{ ?entities <http://purl.org/dc/terms/subject> <" + catUri + ">. }"; try { final com.hp.hpl.jena.query.Query cquery = QueryFactory .create(query); final QueryExecution qexec = QueryExecutionFactory .create(cquery, m); final ResultSet results = qexec.execSelect(); while (results.hasNext()) { final QuerySolution sol = results.nextSolution(); set.add(sol.getResource("entities").getURI() .replaceAll("http://dbpedia.org/resource/", "")); } } catch (final QueryException e) { Logger.getRootLogger().error(e.getStackTrace()); } return set; } public static void main(String[] args) { FullyEvaluateCategories ev = new FullyEvaluateCategories(); ev.evaluate(ev.createCategorySet()); } }