package nlp.com.knowledgebooks.nlp;

import nlp.com.knowledgebooks.nlp.util.ScoredList;
import nlp.com.knowledgebooks.nlp.util.Tokenizer;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;

/**
 * Wrapper for code to find both human and place names in input text.
 *
 * <p/>
 * Copyright 2002-2008 by Mark Watson. All rights reserved.
 * <p/>
 * <p/>
 * Copyright 1998-2012 by Mark Watson. All rights reserved.
 * <p/>
 * This software can be used under either of the following licenses:
 * <p/>
 * 1. LGPL v3<br/>
 * 2. Apache 2
 * <p/>
 *
 * <p>Recognition is purely dictionary based: words are looked up in four
 * static hash tables (last names, first names, place names and name
 * prefixes) that are deserialized once by the constructor and then shared by
 * every instance.
 */
public class ExtractNames {

  /** Longest human name, in words, that the matcher recognizes. */
  private static final int MAX_HUMAN_NAME_WORDS = 5;

  // Lookup tables shared by all instances. They stay null until the first
  // successful construction; the constructor uses lastNameHash as the
  // "already loaded" flag.
  static Hashtable lastNameHash = null;
  static Hashtable firstNameHash = null;
  static Hashtable placeNameHash = null; // original note: cache for database access
  static Hashtable prefixHash = null;

  /**
   * Facade method: get all place and human names from a list of words.
   *
   * <p>At each position the longest candidates are tried first, in this
   * order: 5 and 4 word human names, 3 word place then human names, 2 word
   * place then human names, and finally 1 word place names. Single words are
   * never reported as human names. Words consumed by a match are skipped.
   *
   * @param words the tokenized text; may be {@code null}
   * @return a two element array: index 0 holds the human names, index 1 the
   *         place names (both empty when {@code words} is {@code null})
   */
  public ScoredList[] getProperNames(List<String> words) {
    ScoredList humanNames = new ScoredList();
    ScoredList placeNames = new ScoredList();
    ScoredList[] ret = new ScoredList[2];
    ret[0] = humanNames;
    ret[1] = placeNames;
    if (words == null) {
      return ret;
    }
    int position = 0;
    while (position < words.size()) {
      position += collectNameAt(words, position, humanNames, placeNames);
    }
    return ret;
  }

  /**
   * Tokenizes {@code s} with {@link Tokenizer#wordsToList} and extracts the
   * names from the resulting words.
   *
   * @param s the text to scan
   * @return a two element array: index 0 holds the human names, index 1 the
   *         place names
   */
  public ScoredList[] getProperNames(String s) {
    List<String> words = Tokenizer.wordsToList(s);
    return getProperNames(words);
  }

  /**
   * Tests whether the {@code numWords} words starting at {@code startIndex},
   * joined by single spaces, form a known place name.
   *
   * @param words      the tokenized text
   * @param startIndex index of the first word of the candidate
   * @param numWords   number of words in the candidate
   * @return {@code true} if the candidate is in the place name table;
   *         {@code false} if it is not, or if it would run past the end of
   *         {@code words}
   */
  public boolean isPlaceName(List<String> words, int startIndex, int numWords) {
    if ((startIndex + numWords) > words.size()) {
      return false;
    }
    if (numWords == 1) {
      return isPlaceName(words.get(startIndex));
    }
    // Each word of the candidate must be used; repeating the first word would
    // look up e.g. "Los Los" instead of "Los Angeles".
    return isPlaceName(joinWords(words, startIndex, numWords));
  }

  /**
   * Tests whether {@code name} is a key of the place name table. A
   * diagnostic line is printed to standard output for every hit.
   *
   * @param name the candidate place name
   * @return {@code true} if the table has an entry for {@code name}
   */
  public boolean isPlaceName(String name) {
    Object entry = placeNameHash.get(name);
    if (entry != null) {
      System.out.println("* place name: " + name + ", placeNameHash.get(name): " + entry);
    }
    return entry != null;
  }

  /**
   * Tokenizes {@code s} and tests whether the resulting words form a human
   * name.
   *
   * @param s the candidate name
   * @return {@code true} if the tokens match one of the patterns accepted by
   *         {@link #isHumanName(List)}; {@code false} otherwise, or when the
   *         tokenizer returns {@code null}
   */
  public boolean isHumanName(String s) {
    List<String> tokens = Tokenizer.wordsToList(s);
    if (tokens == null) {
      return false;
    }
    return isHumanName(tokens);
  }

  /**
   * Tests whether the {@code numWords} words starting at {@code index} form
   * a human name.
   *
   * @param words    the tokenized text
   * @param index    index of the first word of the candidate
   * @param numWords number of words in the candidate; only 1 to 5 can match
   * @return {@code true} if the candidate matches a pattern accepted by
   *         {@link #isHumanName(List)}; {@code false} otherwise, including
   *         when the candidate would run past the end of {@code words}
   */
  public boolean isHumanName(List<String> words, int index, int numWords) {
    if ((index + numWords) > words.size()) {
      return false;
    }
    if (numWords < 1 || numWords > MAX_HUMAN_NAME_WORDS) {
      return false;
    }
    return isHumanName(words.subList(index, index + numWords));
  }

  /**
   * Tests whether the complete word list is a human name. The accepted
   * patterns, by length, are:
   * <ul>
   *   <li>1: last</li>
   *   <li>2: first last; prefix last</li>
   *   <li>3: first first last; prefix first last; prefix "." last</li>
   *   <li>4: first first first last; first X "." last; prefix first first
   *       last; prefix first X last</li>
   *   <li>5: first first X "." last; prefix first X "." last</li>
   * </ul>
   * where "first", "last" and "prefix" are words found in the corresponding
   * table and X is any single character word.
   *
   * @param words the candidate name, one word per element
   * @return {@code true} if {@code words} matches one of the patterns
   */
  public boolean isHumanName(List<String> words) {
    switch (words.size()) {
      case 1:
        return isLastName(words.get(0));
      case 2:
        return (isFirstName(words.get(0)) || isPrefix(words.get(0)))
            && isLastName(words.get(1));
      case 3:
        if (isFirstName(words.get(0)) && isFirstName(words.get(1)) && isLastName(words.get(2))) {
          return true;
        }
        if (isPrefix(words.get(0)) && isFirstName(words.get(1)) && isLastName(words.get(2))) {
          return true;
        }
        return isPrefix(words.get(0)) && words.get(1).equals(".") && isLastName(words.get(2));
      case 4:
        if (isFirstName(words.get(0)) && isFirstName(words.get(1))
            && isFirstName(words.get(2)) && isLastName(words.get(3))) {
          return true;
        }
        if (isFirstName(words.get(0)) && isInitial(words.get(1))
            && words.get(2).equals(".") && isLastName(words.get(3))) {
          return true;
        }
        if (isPrefix(words.get(0)) && isFirstName(words.get(1))
            && isFirstName(words.get(2)) && isLastName(words.get(3))) {
          return true;
        }
        return isPrefix(words.get(0)) && isFirstName(words.get(1))
            && isInitial(words.get(2)) && isLastName(words.get(3));
      case 5:
        return (isFirstName(words.get(0)) || isPrefix(words.get(0)))
            && isFirstName(words.get(1))
            && isInitial(words.get(2))
            && words.get(3).equals(".")
            && isLastName(words.get(4));
      default:
        return false;
    }
  }

  /**
   * Creates an extractor backed by the default data file
   * {@code test_data/propername.ser}.
   *
   * @throws IllegalStateException if the name tables cannot be loaded
   */
  public ExtractNames() {
    this("test_data/propername.ser");
  }

  /**
   * Creates an extractor, loading the shared name tables from
   * {@code dataPath} unless an earlier instance already loaded them.
   *
   * <p>{@code dataPath} is first resolved as a class path resource and then
   * as a file system path. After a successful load the table keys are dumped
   * to text files in the working directory and the table sizes are printed;
   * a failure while dumping is reported but does not prevent construction.
   *
   * @param dataPath resource name or file path of the serialized tables
   * @throws IllegalStateException if the tables cannot be read; the
   *         underlying exception is attached as the cause
   */
  public ExtractNames(String dataPath) {
    if (lastNameHash != null) {
      return; // static data already loaded
    }
    try {
      loadNameData(dataPath);
    } catch (Exception ee) {
      // Without the tables every lookup would fail later with an unhelpful
      // NullPointerException, so fail here with the real cause instead.
      throw new IllegalStateException(
          "ExtractNames: failed to load name data from '" + dataPath + "'", ee);
    }
    try {
      dumpNameTables();
    } catch (Exception ee) {
      // The dump is a best-effort debugging aid; the tables are usable anyway.
      ee.printStackTrace();
    }
    System.out.println(
        "# last names=" + lastNameHash.size() + ", # first names=" + firstNameHash.size());
  }

  /**
   * Command line test driver. With an argument, extracts the names from
   * {@code args[0]}; without one, runs a fixed set of sample lookups.
   *
   * @param args optional text to scan in {@code args[0]}
   */
  public static void main(String[] args) {
    ExtractNames extractNames = new ExtractNames();

    if (args.length > 0) {
      ScoredList[] ret = extractNames.getProperNames(args[0]);
      System.out.println("Human names: " + ret[0].getValuesAsString());
      System.out.println("Place names: " + ret[1].getValuesAsString());
      return;
    }

    String sampleText =
        "George Bush played golf. President George W. Bush went to London England, "
            + "Paris France and Mexico to see Mary Smith in Moscow. "
            + "President Bush will return home Monday.";

    // initialize everything, before printing any output - trying to see what
    // is taking so long!
    extractNames.isPlaceName("Paris");
    extractNames.isHumanName("President Bush");
    extractNames.isHumanName("President George Bush");
    extractNames.isHumanName("President George W. Bush");
    System.out.println("Initialization complete....");

    System.out.println("Paris: " + extractNames.isPlaceName("Paris"));
    System.out.println("Mexico: " + extractNames.isPlaceName("Mexico"));
    System.out.println("Fresno: " + extractNames.isPlaceName("Fresno"));
    System.out.println("Moscow: " + extractNames.isPlaceName("Moscow"));
    System.out.println("France: " + extractNames.isPlaceName("France"));
    System.out.println("Los Angeles: " + extractNames.isPlaceName("Los Angeles"));
    System.out.println("President Bush: " + extractNames.isHumanName("President Bush"));
    System.out.println(
        "President George Bush: " + extractNames.isHumanName("President George Bush"));
    System.out.println(
        "President George W. Bush: " + extractNames.isHumanName("President George W. Bush"));
    System.out.println("George W. Bush: " + extractNames.isHumanName("George W. Bush"));
    System.out.println(
        "Senator Barbara Boxer: " + extractNames.isHumanName("Senator Barbara Boxer"));
    System.out.println("King Smith: " + extractNames.isHumanName("King Smith"));
    ScoredList[] ret = extractNames.getProperNames(sampleText);
    System.out.println("Human names: " + ret[0].getValuesAsString());
    System.out.println("Place names: " + ret[1].getValuesAsString());

    System.out.println("\n\n\n");

    // for book example:
    ExtractNames names = new ExtractNames();
    System.out.println("Los Angeles: " + names.isPlaceName("Los Angeles"));
    System.out.println("President Bush: " + names.isHumanName("President Bush"));
    System.out.println("President George Bush: " + names.isHumanName("President George Bush"));
    System.out.println(
        "President George W. Bush: " + names.isHumanName("President George W. Bush"));
    ScoredList[] ret1 = names.getProperNames(sampleText);
    System.out.println("Human names: " + ret1[0].getValuesAsString());
    System.out.println("Place names: " + ret1[1].getValuesAsString());
  }

  /**
   * Records the first name found at {@code position}, if any, and reports
   * how many words the scan should advance by (always at least one).
   */
  private int collectNameAt(
      List<String> words, int position, ScoredList humanNames, ScoredList placeNames) {
    if (isHumanName(words, position, 5)) {
      humanNames.addValue(joinWords(words, position, 5));
      return 5;
    }
    if (isHumanName(words, position, 4)) {
      humanNames.addValue(joinWords(words, position, 4));
      return 4;
    }
    if (isPlaceName(words, position, 3)) {
      placeNames.addValue(joinWords(words, position, 3));
      return 3;
    }
    if (isHumanName(words, position, 3)) {
      humanNames.addValue(joinWords(words, position, 3));
      return 3;
    }
    if (isPlaceName(words, position, 2)) {
      placeNames.addValue(joinWords(words, position, 2));
      return 2;
    }
    if (isHumanName(words, position, 2)) {
      humanNames.addValue(joinWords(words, position, 2));
      return 2;
    }
    if (isPlaceName(words, position, 1)) {
      placeNames.addValue(words.get(position));
    }
    return 1;
  }

  /** Joins {@code count} words starting at {@code start} with single spaces. */
  private static String joinWords(List<String> words, int start, int count) {
    StringBuilder joined = new StringBuilder();
    for (int i = start; i < start + count; i++) {
      if (i > start) {
        joined.append(' ');
      }
      joined.append(words.get(i));
    }
    return joined.toString();
  }

  /** Returns whether {@code word} is a key of the first name table. */
  private static boolean isFirstName(String word) {
    return firstNameHash.get(word) != null;
  }

  /** Returns whether {@code word} is a key of the last name table. */
  private static boolean isLastName(String word) {
    return lastNameHash.get(word) != null;
  }

  /** Returns whether {@code word} is a key of the name prefix table. */
  private static boolean isPrefix(String word) {
    return prefixHash.get(word) != null;
  }

  /** Returns whether {@code word} is exactly one character long. */
  private static boolean isInitial(String word) {
    return word.length() == 1;
  }

  /**
   * Deserializes the four tables (last names, first names, place names,
   * prefixes, in that order) and publishes them only once all four were read,
   * so a failed load never leaves the static state half initialized.
   */
  private void loadNameData(String dataPath) throws IOException, ClassNotFoundException {
    InputStream ins = this.getClass().getClassLoader().getResourceAsStream(dataPath);
    if (ins == null) {
      // Not on the class path: fall back to the file system. This throws
      // FileNotFoundException rather than returning null.
      ins = new FileInputStream(dataPath);
    }
    try {
      ObjectInputStream p = new ObjectInputStream(ins);
      Hashtable lastNames = (Hashtable) p.readObject();
      Hashtable firstNames = (Hashtable) p.readObject();
      Hashtable placeNames = (Hashtable) p.readObject();
      Hashtable prefixes = (Hashtable) p.readObject();
      if (lastNames == null || firstNames == null || placeNames == null || prefixes == null) {
        throw new IOException("name data in '" + dataPath + "' is incomplete");
      }
      lastNameHash = lastNames;
      firstNameHash = firstNames;
      placeNameHash = placeNames;
      prefixHash = prefixes;
    } finally {
      ins.close();
    }
  }

  /** Writes the keys of all four tables to text files in the working directory. */
  private static void dumpNameTables() throws IOException {
    writeKeys("lastnames.txt", lastNameHash);
    // temp: write out hash tables:
    writeKeys("firstnames.txt", firstNameHash);
    writePlaceNames("placenames.txt");
    writeKeys("prefixnames.txt", prefixHash);
  }

  /** Writes every key of {@code table} to {@code fileName}, one per line. */
  private static void writeKeys(String fileName, Hashtable table) throws IOException {
    OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(fileName));
    try {
      Enumeration keys = table.keys();
      while (keys.hasMoreElements()) {
        out.write(keys.nextElement() + "\n");
      }
    } finally {
      out.close();
    }
  }

  /**
   * Writes one {@code key:value} line per place name. Each key is cut at the
   * first ';', then '(' and then ',' and trimmed; the value is looked up with
   * the shortened key, so it is written as "null" when the shortened key is
   * not itself in the table.
   */
  private static void writePlaceNames(String fileName) throws IOException {
    OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(fileName));
    try {
      Enumeration keys = placeNameHash.keys();
      while (keys.hasMoreElements()) {
        String key = "" + keys.nextElement();
        if (key.indexOf(';') != -1) {
          key = key.substring(0, key.indexOf(';'));
        }
        if (key.indexOf('(') != -1) {
          key = key.substring(0, key.indexOf('('));
        }
        if (key.indexOf(',') != -1) {
          key = key.substring(0, key.indexOf(','));
        }
        key = key.trim();
        out.write(key + ":" + placeNameHash.get(key) + "\n");
      }
    } finally {
      out.close();
    }
  }
}