package edu.emory.clir.clearnlp.lexicon.dbpedia;

import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import com.google.gson.Gson;

import edu.emory.clir.clearnlp.collection.tree.PrefixNode;
import edu.emory.clir.clearnlp.collection.tree.PrefixTree;
import edu.emory.clir.clearnlp.collection.triple.ObjectIntIntTriple;
import edu.emory.clir.clearnlp.component.utils.NLPUtils;
import edu.emory.clir.clearnlp.ner.NERInfoSet;
import edu.emory.clir.clearnlp.ner.NERTag;
import edu.emory.clir.clearnlp.tokenization.AbstractTokenizer;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.Joiner;
import edu.emory.clir.clearnlp.util.StringUtils;
import edu.emory.clir.clearnlp.util.constant.StringConst;
import edu.emory.clir.clearnlp.util.lang.TLanguage;

/**
 * Builds a token-level {@link PrefixTree} from DBPedia entries: every alias of an entry
 * whose types map onto one of the requested super types is tokenized, cleaned up and
 * inserted into the tree, with the resulting NER categories stored on the final node.
 */
public class PrefixTreeGenerator implements DBPediaXML {
    /** Maps each known type (and each super type itself) to the super type it is reported as. */
    private final Map<DBPediaType,DBPediaType> super_type_map;
    private final DBPediaTypeMap type_map;
    private final DBPediaInfoMap info_map;

    /**
     * @param typeMap the type hierarchy used to resolve super types.
     * @param infoMap the entries (types and aliases) to be put into the tree.
     * @param types   the super types of interest; entries with no type under any of
     *                these are skipped by {@link #getPrefixTree(AbstractTokenizer, boolean)}.
     */
    public PrefixTreeGenerator(DBPediaTypeMap typeMap, DBPediaInfoMap infoMap, Set<DBPediaType> types) {
        type_map = typeMap;
        info_map = infoMap;
        // Must run after type_map is assigned: getSuperTypeMap() reads it.
        super_type_map = getSuperTypeMap(types);
    }

    /**
     * Builds the lookup from a type to the super type that represents it.
     * Every super type maps to itself; every key of the type map is then mapped to the
     * first element of {@code superTypes} (in the set's iteration order) that the type
     * map reports as its super type. Types with no such super type get no entry.
     *
     * @param superTypes the super types of interest.
     * @return a new map from type to its representing super type.
     */
    public Map<DBPediaType,DBPediaType> getSuperTypeMap(Set<DBPediaType> superTypes) {
        Map<DBPediaType,DBPediaType> map = new HashMap<>();

        for (DBPediaType superType : superTypes)
            map.put(superType, superType);

        for (DBPediaType type : type_map.keySet()) {
            for (DBPediaType superType : superTypes) {
                if (type_map.isSuperType(type, superType)) {
                    map.put(type, superType);
                    break;
                }
            }
        }

        return map;
    }

    /**
     * Builds the prefix tree over all entries of the info map.
     * Entries whose types map to none of the super types are skipped.
     *
     * @param tokenizer the tokenizer used to split each alias into tokens.
     * @param lower     if {@code true}, tokens are lower-cased before insertion.
     * @return a new prefix tree keyed by token sequences.
     */
    public PrefixTree<String,NERInfoSet> getPrefixTree(AbstractTokenizer tokenizer, boolean lower) {
        PrefixTree<String,NERInfoSet> tree = new PrefixTree<>();

        for (Entry<String,DBPediaInfo> e : info_map.entrySet()) {
            DBPediaInfo info = e.getValue();
            NERInfoSet infoSet = getNERInfoSet(e.getKey(), info.getTypes());

            if (infoSet != null)
                addAliases(tokenizer, tree, info.getAliases(), infoSet, lower);
        }

        return tree;
    }

    /**
     * Collects the NER categories for one entry.
     *
     * @param title the entry key; currently unused.
     * @param types the types of the entry.
     * @return a new info set holding one category per distinct super type the given
     *         types map to, or {@code null} if none of them maps to a super type.
     */
    private NERInfoSet getNERInfoSet(String title, Set<DBPediaType> types) {
        Set<DBPediaType> superTypes = new HashSet<>();

        for (DBPediaType type : types) {
            DBPediaType superType = super_type_map.get(type);
            if (superType != null) superTypes.add(superType);
        }

        if (superTypes.isEmpty()) return null;

        NERInfoSet infoSet = new NERInfoSet();

        for (DBPediaType superType : superTypes)
            infoSet.addCategory(NERTag.fromDBPediaType(superType));

        return infoSet;
    }

    /**
     * Inserts every alias into the tree and attaches the entry's categories to its node.
     * If the node already carries a value, the categories are merged into it; otherwise
     * the node receives its own copy of {@code infoSet}.
     *
     * @param tokenizer the tokenizer used to split each alias.
     * @param tree      the tree to insert into.
     * @param aliases   the aliases of one entry.
     * @param infoSet   the categories of that entry; never stored in the tree directly.
     * @param lower     if {@code true}, tokens are lower-cased before insertion.
     */
    private void addAliases(AbstractTokenizer tokenizer, PrefixTree<String,NERInfoSet> tree, Set<String> aliases, NERInfoSet infoSet, boolean lower) {
        for (String alias : aliases) {
            String[] t = trimTokens(tokenizer.tokenize(alias), lower);
            if (t.length == 0) continue;

            PrefixNode<String,NERInfoSet> node = tree.add(t, 0, t.length, String::toString);

            if (node.hasValue())
                node.getValue().addCategories(infoSet.getCategorySet());
            else
                // Store a private copy: sharing one instance among all aliases of an entry
                // would let a later merge on one alias leak categories onto the others.
                node.setValue(copyCategories(infoSet));
        }
    }

    /** @return a new info set carrying the same categories as {@code source}. */
    private NERInfoSet copyCategories(NERInfoSet source) {
        NERInfoSet copy = new NERInfoSet();
        copy.addCategories(source.getCategorySet());
        return copy;
    }

    /**
     * Cleans up the tokens of one alias and converts them into tree keys.
     * The steps, in order: (1) drop trailing digit-only tokens; (2) remove the first
     * bracketed span, i.e. from the last {@link StringConst#LRB} seen up to and including
     * the first following {@link StringConst#RRB}; (3) if a single digit-only token is
     * all that remains, drop it too; (4) convert each token to its simplified form and,
     * if requested, to lower case.
     *
     * @param tokens the tokens of an alias; modified in place.
     * @param lower  if {@code true}, the returned tokens are lower-cased.
     * @return the cleaned tokens; empty if nothing is left.
     */
    private String[] trimTokens(List<String> tokens, boolean lower) {
        int i, bIdx = -1;

        for (i = tokens.size() - 1; i >= 0; i--) {
            if (StringUtils.containsDigitOnly(tokens.get(i))) tokens.remove(i);
            else break;
        }

        int size = tokens.size();

        for (i = 0; i < size; i++) {
            String token = tokens.get(i);

            if (token.equals(StringConst.LRB)) {
                bIdx = i;
            }
            else if (token.equals(StringConst.RRB) && bIdx >= 0) {
                tokens.subList(bIdx, i + 1).clear();
                break;  // only the first bracketed span is removed
            }
        }

        if (tokens.size() == 1 && StringUtils.containsDigitOnly(tokens.get(0)))
            tokens.clear();

        int len = tokens.size();
        String[] t = new String[len];

        for (i = 0; i < len; i++) {
            t[i] = StringUtils.toSimplifiedForm(tokens.get(i));
            if (lower) t[i] = StringUtils.toLowerCase(t[i]);
        }

        return t;
    }

    /**
     * Reads the type and info maps from xz-compressed JSON, builds the lower-cased
     * prefix tree with the English tokenizer, serializes it to an xz-compressed file
     * and prints the matches found in a short sample phrase.
     *
     * @param args {@code args[0]}: type map file, {@code args[1]}: info map file,
     *             {@code args[2]}: output prefix tree file.
     * @throws IllegalArgumentException if fewer than three arguments are given.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3)
            throw new IllegalArgumentException("Usage: PrefixTreeGenerator <typeMapFile> <infoMapFile> <prefixTreeFile>");

        final String typeMapFile    = args[0];  // dbpedia.owl.json.xz
        final String infoMapFile    = args[1];  // instances_en.json.xz
        final String prefixTreeFile = args[2];  // prefix_tree.xz

        Gson gson = new Gson();
        DBPediaTypeMap typeMap;
        DBPediaInfoMap infoMap;

        // Decode explicitly as UTF-8 instead of the platform default, and close the
        // readers even when parsing fails.
        try (Reader in = new InputStreamReader(IOUtils.createXZBufferedInputStream(typeMapFile), StandardCharsets.UTF_8)) {
            typeMap = gson.fromJson(in, DBPediaTypeMap.class);
        }

        try (Reader in = new InputStreamReader(IOUtils.createXZBufferedInputStream(infoMapFile), StandardCharsets.UTF_8)) {
            infoMap = gson.fromJson(in, DBPediaInfoMap.class);
        }

        AbstractTokenizer tokenizer = NLPUtils.getTokenizer(TLanguage.ENGLISH);
        boolean lower = true;

        PrefixTreeGenerator ptg = new PrefixTreeGenerator(typeMap, infoMap, NERTag.DBPediaTypeSet);
        PrefixTree<String,NERInfoSet> prefixTree = ptg.getPrefixTree(tokenizer, lower);

        try (ObjectOutputStream out = new ObjectOutputStream(IOUtils.createXZBufferedOutputStream(prefixTreeFile))) {
            out.writeObject(prefixTree);
        }

        String s = "John Emory Democratic Party London Bridge Emory University South Korea Rocky Mountains M16 New Years Eve The Catcher in the Rye Korean Ming Dynasty Euro";
        if (lower) s = StringUtils.toLowerCase(s);
        String[] array = s.split(" ");

        for (ObjectIntIntTriple<NERInfoSet> t : prefixTree.getAll(array, 0, String::toString, true, true))
            System.out.println(t.o.joinTags("_") + " " + Joiner.join(array, " ", t.i1, t.i2 + 1));

//      String[] array = "The Chicago Bulls are an American professional basketball team . They are based in Chicago , Illinois , playing in the Central Division of the Eastern Conference in the National Basketball Association (NBA) . The team was founded on January 26 , 1966 . The Bulls play their home games at the United Center . The Bulls saw their greatest success during the 1990s . They are known for having one of the NBA 's greatest dynasties , winning six NBA championships between 1991 and 1998 with two three-peats . All six championship teams were led by Hall of Famers Michael Jordan , Scottie Pippen and coach Phil Jackson . The Bulls are the only NBA franchise to win multiple championships and never lose an NBA Finals in their history.".split(" ");
//      ObjectInputStream in = new ObjectInputStream(IOUtils.createXZBufferedInputStream(prefixTreeFile));
//      long st, et;
//      @SuppressWarnings("unchecked")
//      PrefixTree<String,NERInfoSet> pre= (PrefixTree<String,NERInfoSet>)in.readObject(); in.close();
//      ObjectIntPair<NERInfoSet> p;
//      int i, len = array.length;
//      st = System.currentTimeMillis();
//      for (i=0; i<len; i++)
//      {
//          p = pre.getValue(array, i, false);
//          if (p != null) System.out.println(Joiner.join(array, " ", i, p.i+1)+" - "+p.o);
//      }
//      et = System.currentTimeMillis();
//      System.out.println(et-st);
    }
}