/** * */ package com.personalityextractor.entity.graph; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Set; import net.sf.json.JSONObject; import com.personalityextractor.Runner; import com.personalityextractor.data.source.Wikiminer; import com.personalityextractor.entity.WikipediaEntity; import com.personalityextractor.entity.extractor.frequencybased.TopNNPHashTagsExtractor; import com.personalityextractor.entity.graph.ranking.IRanker; import com.personalityextractor.entity.graph.ranking.WeightGraphRanker; import cs224n.util.Counter; /** * @author semanticvoid * */ public class Graph { private static Set<String> superCategories = new HashSet<String>(); static { superCategories.add("Agriculture"); superCategories.add("Arts"); superCategories.add("Applied sciences"); superCategories.add("Belief"); superCategories.add("Business"); superCategories.add("Computers"); superCategories.add("Culture"); superCategories.add("Education"); superCategories.add("Environment"); superCategories.add("Geography"); superCategories.add("Health"); superCategories.add("History"); superCategories.add("Humanities"); superCategories.add("Language"); superCategories.add("Law"); superCategories.add("Mathematics"); superCategories.add("Nature"); superCategories.add("People"); superCategories.add("Politics"); superCategories.add("Science"); // superCategories.add("Society"); superCategories.add("Technology"); superCategories.add("Sports"); superCategories.add("Travel"); } private static double DECAY = 0.5; HashMap<String, Node> nodes; Node root = null; int edgeCount = 0; public Graph(List<WikipediaEntity> leafEntities) { this.root = new Node(new WikipediaEntity("__ROOT__", "-1", -1)); this.nodes = new HashMap<String, Node>(); for (WikipediaEntity we : leafEntities) { Node n = new Node(we); n.setLeaf(true); n.setWeight(we.count); nodes.put(n.getId(), n); } } private List<WikipediaEntity> getSuperCategory(String id, int depth) { WikipediaEntity entity = null; List<WikipediaEntity> supeCategories = new ArrayList<WikipediaEntity>(); int shortestPath = Integer.MAX_VALUE; int currentDepth = 0; int maxCount = -1; int minCount = 999; HashMap<String, WikipediaEntity> entities = new HashMap<String, WikipediaEntity>(); HashMap<String, Integer> entityCount = new HashMap<String, Integer>(); Set<String> prevCategories = new HashSet<String>(); Set<String> currCategories = null; prevCategories.add(id); int iter = 0; do { currCategories = new HashSet<String>(); for (String cid : prevCategories) { Node cNode = nodes.get(cid); List<WikipediaEntity> categories = null; if (iter == 0) { categories = Wikiminer.getCategories(cid); } else { categories = Wikiminer.getParentCategories(cid); } for (WikipediaEntity category : categories) { Node n2 = null; if (cid.equalsIgnoreCase(category.getWikiminerID())) { continue; } // if (!nodes.containsKey(category.getWikiminerID())) { if (!superCategories.contains(category.getText())) { currCategories.add(category.getWikiminerID()); } else { System.out.println("hit " + category.getText() + " at depth " + currentDepth); // if (currentDepth < shortestPath) { // entity = category; // } if (!entities.containsKey(category.getText())) { entities.put(category.getText(), category); } if (!entityCount.containsKey(category.getText())) { entityCount.put(category.getText(), (depth-currentDepth)); } else { entityCount.put(category.getText(), entityCount.get(category.getText()) + (depth-currentDepth)); } } // n2 = new Node(category); // nodes.put(n2.getId(), n2); // } else { // // n2 = nodes.get(category.getWikiminerID()); // } // formEdge(cNode, n2, cNode.getWeight()); } } iter++; prevCategories = currCategories; currentDepth++; // System.out.println("depth:\t" + currentDepth); } while (prevCategories.size() > 0 && currentDepth < depth); for (String k : entityCount.keySet()) { int count = entityCount.get(k); // System.out.println(k + "\t" + count); // supeCategories.add(entities.get(k)); if (count > maxCount) { maxCount = count; // entity = entities.get(k); } if (count < minCount) { minCount = count; } } System.out.println("maxcount: " + maxCount + "\tmincount: " + minCount); List<Integer> list = new ArrayList<Integer>(); for (String k : entityCount.keySet()) { list.add(entityCount.get(k)); } Collections.sort(list); // System.out.println("95 percentile: " + list.get((int) Math.floor(((list.size()-1)*0.95)))); int index = (int) Math.floor(((list.size()-1)*0.95)); if(index >= 0) { int midcount = list.get(index); for (String k : entityCount.keySet()) { int count = entityCount.get(k); System.out.println(k + "\t" + count); // supeCategories.add(entities.get(k)); if (count >= midcount) { supeCategories.add(entities.get(k)); } } } return supeCategories; } public void build(int depth) { Set<Node> cNodes = new HashSet<Node>(); Object[] ids = nodes.keySet().toArray(); for (Object id : ids) { String cid = (String) id; List<WikipediaEntity> e = getSuperCategory(cid, depth); if (e != null) { for(WikipediaEntity e1 : e) { System.out.println(nodes.get(id).getEntity().getText() + " has super category " + e1.getText()); Node n2 = null; if(nodes.containsKey(e1.getWikiminerID())) { n2 = nodes.get(e1.getWikiminerID()); } else { n2 = new Node(e1); } if (!nodes.containsKey(n2.getId())) { cNodes.add(n2); nodes.put(n2.getId(), n2); formEdge(n2, root, 1); } Node n1 = nodes.get(cid); formEdge(n1, n2, 1); // fixed weight for now } } } // link to root // for (Node n : cNodes) { // nodes.put(n.getId(), n); // formEdge(n, root, 1); // } System.out.println("done"); } // public void build(int depth) { // int currentDepth = 0; // Set<String> prevCategories = new HashSet<String>(); // // // add leaf node categories // prevCategories.addAll(nodes.keySet()); // // Set<String> currCategories; // // int iter = 0; // if(depth > 0) { // do { // currCategories = new HashSet<String>(); // // // for (String cid : prevCategories) { // Node cNode = nodes.get(cid); // List<WikipediaEntity> categories = null; // if(iter == 0) { // categories = Wikiminer.getCategories(cid); // } else { // categories = Wikiminer.getParentCategories(cid); // } // // // List<WikipediaEntity> tmpCategories = new // ArrayList<WikipediaEntity>(); // // for (WikipediaEntity c : categories) { // // String txt = c.getText(); // // WikipediaEntity e = Wikiminer.getHighestSenseEntity(txt); // // if(e != null) { // // tmpCategories.add(e); // // } // // } // // categories = tmpCategories; // // for (WikipediaEntity category : categories) { // Node n2 = null; // // if (cid.equalsIgnoreCase(category.getWikiminerID())) { // continue; // } // // if (!nodes.containsKey(category.getWikiminerID())) { // if(!superCategories.contains(category.getText())) { // currCategories.add(category.getWikiminerID()); // } else { // System.out.println("hit " + category.getText()); // } // n2 = new Node(category); // nodes.put(n2.getId(), n2); // } else { // n2 = nodes.get(category.getWikiminerID()); // } // // formEdge(cNode, n2, cNode.getWeight()); // } // } // iter++; // // prevCategories = currCategories; // currentDepth++; // System.out.println("depth:\t" + currentDepth); // } while (prevCategories.size() > 0 && currentDepth < depth); // } // // // link to roo // for (String cid : prevCategories) { // Node cNode = nodes.get(cid); // formEdge(cNode, root, 0); // } // } private void formEdge(Node n1, Node n2, double weight) { Edge e1 = new Edge(edgeCount++, n1.getId(), n2.getId()); Edge e2 = new Edge(edgeCount++, n2.getId(), n1.getId()); n1.addEdge(e1); n2.addEdge(e2); n2.addWeight(weight); // add decay // n2.addWeight(weight*1.0*DECAY); } public Collection<Node> getNodes() { return nodes.values(); } public JSONObject toJSON(String handle, List<Node> nodes) { JSONObject json = new JSONObject(); Set<String> seen = new HashSet<String>(); // Node root = new Node(new WikipediaEntity(handle, -1)); if (nodes != null) { for (Node n : nodes) { // JSONObject j = generateJSON(n, null, seen); // System.out.println(j.toString()); // json.put(n.getEntity().getText(), j); formEdge(n, root, 1); } } json = generateJSON(root, null, new HashSet<String>()); JSONObject jroot = new JSONObject(); jroot.put(handle, json); return jroot; } public void printWeights() { for (String id : nodes.keySet()) { Node n = nodes.get(id); // if(n.getEntity().getType() == 1) { System.out.println(n.getId() + "\t" + n.getEntity().getText() + "\t" + n.getEntity().getType() + "\t" + n.getWeight()); // } } } private JSONObject generateJSON(Node root, Node parent, Set<String> seen) { if (root == null) { return null; } else { seen.add(root.getId()); } JSONObject json = new JSONObject(); List<Edge> edges = root.getEdges(); // if(root != null && root.getId() != null && // root.getId().equals("38809")) { // System.out.print(""); // } if (edges.size() > 0) { for (Edge e : edges) { if (e.getNode2() != null) { JSONObject cJson = null; if (!seen.contains(e.getNode2())) { // && // !seen.contains(e.getNode1())) // { // seen.add(e.getNode2()); seen.add(e.getNode1()); if (!nodes.get(e.getNode2()).isLeaf()) { cJson = generateJSON(nodes.get(e.getNode2()), root, seen); } else { json.put(nodes.get(e.getNode2()).getEntity() .getText(), nodes.get(e.getNode2()) .getWeight()); } } if (cJson != null) { json.put(nodes.get(e.getNode2()).getEntity().getText(), cJson); } else { if (nodes.get(e.getNode2()) != null && nodes.get(e.getNode2()) != null && !seen.contains(e.getNode2())) { double w = nodes.get(e.getNode2()).getWeight(); if (nodes.get(e.getNode2()).getWeight() < 1) { w = 1; } json.put(nodes.get(e.getNode2()).getEntity() .getText(), w); } } } } // if (json.size() == 0) { // json.put(root.getEntity().getText(), root.getWeight()); // } } else { return null; } return json; } public static void main(String[] args) { ArrayList<WikipediaEntity> entities = new ArrayList<WikipediaEntity>(); entities.add(new WikipediaEntity("Rajiv Gandhi", "26129", 1)); entities.add(new WikipediaEntity("Apple", "856", 1)); entities.add(new WikipediaEntity("Sonia Gandhi", "169798", 1)); entities.add(new WikipediaEntity("Bill Gates", "3747", 1)); // WikipediaEntity e = new WikipediaEntity("Sonia Gandhi", "169798", 1); // e.incrCount(); // e.incrCount(); // e.incrCount(); // entities.add(e); List<String> tweets = new ArrayList<String>(); // tweets.add("Sonia Gandhi is a person."); tweets.add("Rajiv Gandhi is a Congress leader."); tweets.add("Sachin Tendulkar is awesome."); tweets.add("Amazon is an awesome company."); TopNNPHashTagsExtractor tne = new TopNNPHashTagsExtractor(); Counter<String> extracted_entities = tne.extract(tweets); Counter<String> finalEntityCounter = new Counter<String>(); finalEntityCounter.setCount("Sonia Gandhi" , 10); // HashMap<String, WikipediaEntity> allEntities = tne // .resolve(finalEntityCounter); // entities = new ArrayList<WikipediaEntity>(); // entities.addAll(allEntities.values()); Graph g = new Graph(entities); g.build(7); g.printWeights(); IRanker ranker = new WeightGraphRanker(g); // List<Node> topNodes = ranker.getTopRankedNodes(100); System.out.println(g.toJSON("testuser", null)); // System.out.println(Runner.nodesToJson("testuser", g, topNodes)); } }