package com.knowledgebooks.nlp;
import com.knowledgebooks.nlp.util.ScoredList;
import com.knowledgebooks.nlp.util.Tokenizer;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.util.*;
/**
 * Wrapper for code to find both human and place names in input text.
 *
 * <p>Copyright Mark Watson 2008-2010. All Rights Reserved.
 * License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
 */
public class ExtractNames {

    // Lookup tables shared by every instance. They are read once, in this order
    // (last names, first names, place names, prefixes), from the serialized data
    // file by the first constructor call. The raw Hashtable type and package
    // visibility are kept so existing code in this package keeps compiling.
    static Hashtable lastNameHash = null;
    static Hashtable firstNameHash = null;
    static Hashtable placeNameHash = null;
    static Hashtable prefixHash = null;

    /**
     * Creates an extractor using the default data file {@code data/propername.ser}.
     *
     * @throws IllegalStateException if the name data cannot be loaded
     */
    public ExtractNames() {
        this("data/propername.ser");
    }

    /**
     * Creates an extractor, loading the shared lookup tables on first use.
     * <p>
     * The data is looked up first as a class-loader resource and then as a plain
     * file. If the tables have already been loaded by an earlier instance this
     * constructor does nothing.
     *
     * @param dataPath resource name or file path of the serialized name data
     * @throws IllegalStateException if the data cannot be opened or deserialized
     */
    public ExtractNames(String dataPath) {
        if (lastNameHash != null) return; // static data already loaded
        InputStream ins = null;
        try {
            ins = this.getClass().getClassLoader().getResourceAsStream(dataPath);
            if (ins == null) {
                // Not on the class path: fall back to the file system. This throws
                // (never returns null) when the file is missing.
                ins = new FileInputStream(dataPath);
            }
            // NOTE(review): native Java deserialization; dataPath must point at trusted data.
            ObjectInputStream p = new ObjectInputStream(ins);
            Hashtable lastNames = (Hashtable) p.readObject();
            Hashtable firstNames = (Hashtable) p.readObject();
            Hashtable placeNames = (Hashtable) p.readObject();
            Hashtable prefixes = (Hashtable) p.readObject();
            // Publish only after all four tables were read, and set lastNameHash
            // last because it doubles as the "already loaded" flag above. A failed
            // load therefore never leaves the tables half-initialized.
            firstNameHash = firstNames;
            placeNameHash = placeNames;
            prefixHash = prefixes;
            lastNameHash = lastNames;
        } catch (Exception ee) {
            throw new IllegalStateException(
                    "com.knowledgebooks.nlp.ExtractNames: failed to load name data from '" + dataPath + "'", ee);
        } finally {
            if (ins != null) {
                try {
                    ins.close();
                } catch (java.io.IOException ignored) {
                    // nothing useful can be done if closing the data stream fails
                }
            }
        }
        System.out.println("# last names=" + lastNameHash.size() + ", # first names=" + firstNameHash.size());
    }

    /**
     * Facade method: get all place and human names from a list of word tokens.
     * <p>
     * At each position the longest pattern is tried first, in this order:
     * 5- and 4-word human names, 3-word place then human names, 2-word place
     * then human names, and finally single-word place names. Single-word human
     * names are not extracted. Words consumed by a match are skipped.
     *
     * @param words tokens to scan; {@code null} yields two empty lists
     * @return a two element array: index 0 holds human names, index 1 place names
     */
    public ScoredList[] getProperNames(List<String> words) {
        ScoredList placeNames = new ScoredList();
        ScoredList humanNames = new ScoredList();
        ScoredList[] ret = new ScoredList[2];
        ret[0] = humanNames;
        ret[1] = placeNames;
        if (words == null) return ret;
        int i = 0;
        while (i < words.size()) {
            int consumed = 1; // always advance by at least one token
            if (isHumanName(words, i, 5)) {
                consumed = 5;
                humanNames.addValue(joinWords(words, i, consumed));
            } else if (isHumanName(words, i, 4)) {
                consumed = 4;
                humanNames.addValue(joinWords(words, i, consumed));
            } else if (isPlaceName(words, i, 3)) {
                consumed = 3;
                placeNames.addValue(joinWords(words, i, consumed));
            } else if (isHumanName(words, i, 3)) {
                consumed = 3;
                humanNames.addValue(joinWords(words, i, consumed));
            } else if (isPlaceName(words, i, 2)) {
                consumed = 2;
                placeNames.addValue(joinWords(words, i, consumed));
            } else if (isHumanName(words, i, 2)) {
                consumed = 2;
                humanNames.addValue(joinWords(words, i, consumed));
            } else if (isPlaceName(words, i, 1)) {
                placeNames.addValue(words.get(i));
            }
            i += consumed;
        }
        return ret;
    }

    /**
     * Tokenizes the text and extracts human and place names from it.
     *
     * @param s text to scan
     * @return a two element array: index 0 holds human names, index 1 place names
     */
    public ScoredList[] getProperNames(String s) {
        List<String> words = Tokenizer.wordsToList(s);
        return getProperNames(words);
    }

    /**
     * Extracts names from the text and renders each one as {@code "value:score"}.
     *
     * @param s text to scan
     * @return a two element list: human names first, then place names
     */
    public List<List<String>> getProperNamesAsStrings(String s) {
        ScoredList[] sl = getProperNames(s);
        List<List<String>> ret = new ArrayList<List<String>>();
        ret.add(toScoredStrings(sl[0]));
        ret.add(toScoredStrings(sl[1]));
        return ret;
    }

    /**
     * Tests whether {@code numWords} consecutive tokens form a known place name.
     *
     * @param words      token list
     * @param startIndex index of the first token of the candidate name
     * @param numWords   number of tokens in the candidate name
     * @return true if the tokens, joined by single spaces, are in the place name
     *         table; false if the range does not fit inside {@code words}
     */
    public boolean isPlaceName(List<String> words, int startIndex, int numWords) {
        if (words == null || startIndex < 0 || numWords < 1) return false;
        if ((startIndex + numWords) > words.size()) return false;
        // Join the tokens startIndex .. startIndex+numWords-1 (not the first token
        // repeated) so that e.g. ["Los", "Angeles"] is looked up as "Los Angeles".
        return isPlaceName(joinWords(words, startIndex, numWords));
    }

    /**
     * Tests whether the string is a key of the place name table.
     *
     * @param name candidate place name, words separated by single spaces
     * @return true if the name is known; false otherwise (including {@code null})
     */
    public boolean isPlaceName(String name) {
        return contains(placeNameHash, name);
    }

    /**
     * Tokenizes the string and tests whether the tokens form a human name.
     *
     * @param s candidate name
     * @return true if the tokens match one of the human name patterns
     */
    public boolean isHumanName(String s) {
        List<String> ss = Tokenizer.wordsToList(s);
        if (ss == null) return false;
        return isHumanName(ss);
    }

    /**
     * Tests whether {@code numWords} consecutive tokens form a human name.
     *
     * @param words    token list
     * @param index    index of the first token of the candidate name
     * @param numWords number of tokens in the candidate name (1 to 5 are supported)
     * @return true if the tokens match a human name pattern; false if they do not,
     *         if the range does not fit inside {@code words}, or if
     *         {@code numWords} is outside 1..5
     */
    public boolean isHumanName(List<String> words, int index, int numWords) {
        if (words == null || index < 0) return false;
        if (numWords < 1 || numWords > 5) return false;
        if ((index + numWords) > words.size()) return false;
        return isHumanName(words.subList(index, index + numWords));
    }

    /**
     * Tests whether the complete token list matches a human name pattern.
     * <p>
     * Supported patterns, by token count (an "initial" is any one-character token):
     * <ul>
     * <li>1: last</li>
     * <li>2: first last; prefix last</li>
     * <li>3: first first last; prefix first last; prefix "." last</li>
     * <li>4: first first first last; first initial "." last;
     *        prefix first first last; prefix first initial last</li>
     * <li>5: first first initial "." last; prefix first initial "." last</li>
     * </ul>
     *
     * @param words tokens making up the whole candidate name
     * @return true if the tokens match one of the patterns above
     */
    public boolean isHumanName(List<String> words) {
        if (words == null) return false;
        int len = words.size();
        if (len < 1 || len > 5) return false;
        // Every pattern ends in a known last name.
        if (!isLastName(words.get(len - 1))) return false;
        String w0 = words.get(0);
        switch (len) {
            case 1:
                return true;
            case 2:
                return isFirstName(w0) || isPrefix(w0);
            case 3: {
                String w1 = words.get(1);
                if (isFirstName(w1)) return isFirstName(w0) || isPrefix(w0);
                return isPrefix(w0) && ".".equals(w1);
            }
            case 4: {
                String w1 = words.get(1);
                String w2 = words.get(2);
                if (isFirstName(w0) && isFirstName(w1) && isFirstName(w2)) return true;
                if (isFirstName(w0) && isInitial(w1) && ".".equals(w2)) return true;
                if (isPrefix(w0) && isFirstName(w1) && isFirstName(w2)) return true;
                return isPrefix(w0) && isFirstName(w1) && isInitial(w2);
            }
            default: { // len == 5
                return (isFirstName(w0) || isPrefix(w0))
                        && isFirstName(words.get(1))
                        && isInitial(words.get(2))
                        && ".".equals(words.get(3));
            }
        }
    }

    /**
     * Command line demo. With an argument, prints the names found in it;
     * without arguments, runs a fixed set of example lookups.
     *
     * @param args optional: a single text string to analyze
     */
    static public void main(String[] args) {
        ExtractNames extractNames = new ExtractNames();
        if (args.length > 0) {
            ScoredList[] ret = extractNames.getProperNames(args[0]);
            System.out.println("Human names: " + ret[0].getValuesAsString());
            System.out.println("Place names: " + ret[1].getValuesAsString());
            return;
        }
        String sampleText = "George Bush played golf. President George W. Bush went to London England, Paris France and Mexico to see Mary Smith in Moscow. President Bush will return home Monday.";

        // Exercise the lookups once before printing any output.
        extractNames.isPlaceName("Paris");
        extractNames.isHumanName("President Bush");
        extractNames.isHumanName("President George Bush");
        extractNames.isHumanName("President George W. Bush");
        System.out.println("Initialization complete....");

        System.out.println("Paris: " + extractNames.isPlaceName("Paris"));
        System.out.println("Mexico: " + extractNames.isPlaceName("Mexico"));
        System.out.println("Fresno: " + extractNames.isPlaceName("Fresno"));
        System.out.println("Moscow: " + extractNames.isPlaceName("Moscow"));
        System.out.println("France: " + extractNames.isPlaceName("France"));
        System.out.println("Los Angeles: " + extractNames.isPlaceName("Los Angeles"));
        System.out.println("President Bush: " + extractNames.isHumanName("President Bush"));
        System.out.println("President George Bush: " + extractNames.isHumanName("President George Bush"));
        System.out.println("President George W. Bush: " + extractNames.isHumanName("President George W. Bush"));
        System.out.println("George W. Bush: " + extractNames.isHumanName("George W. Bush"));
        System.out.println("Senator Barbara Boxer: " + extractNames.isHumanName("Senator Barbara Boxer"));
        System.out.println("King Smith: " + extractNames.isHumanName("King Smith"));
        ScoredList[] ret = extractNames.getProperNames(sampleText);
        System.out.println("Human names: " + ret[0].getValuesAsString());
        System.out.println("Place names: " + ret[1].getValuesAsString());
        System.out.println("\n\n\n");

        // for book example:
        ExtractNames names = new ExtractNames();
        System.out.println("Los Angeles: " + names.isPlaceName("Los Angeles"));
        System.out.println("President Bush: " + names.isHumanName("President Bush"));
        System.out.println("President George Bush: " + names.isHumanName("President George Bush"));
        System.out.println("President George W. Bush: " + names.isHumanName("President George W. Bush"));
        ScoredList[] ret1 = names.getProperNames(sampleText);
        System.out.println("Human names: " + ret1[0].getValuesAsString());
        System.out.println("Place names: " + ret1[1].getValuesAsString());
    }

    /** Joins {@code count} tokens starting at {@code start} with single spaces. */
    private static String joinWords(List<String> words, int start, int count) {
        StringBuilder sb = new StringBuilder();
        for (int k = start; k < start + count; k++) {
            if (k > start) sb.append(' ');
            sb.append(words.get(k));
        }
        return sb.toString();
    }

    /** Renders every entry of the list as "value:score". */
    private static List<String> toScoredStrings(ScoredList list) {
        List<String> out = new ArrayList<String>();
        for (int i = 0, size = list.size(); i < size; i++) {
            out.add(list.getValue(i) + ":" + list.getScore(i));
        }
        return out;
    }

    /** Null-safe key lookup; Hashtable itself throws on a null key. */
    private static boolean contains(Hashtable table, String key) {
        return table != null && key != null && table.get(key) != null;
    }

    /** True if the word is in the last name table. */
    private static boolean isLastName(String word) {
        return contains(lastNameHash, word);
    }

    /** True if the word is in the first name table. */
    private static boolean isFirstName(String word) {
        return contains(firstNameHash, word);
    }

    /** True if the word is in the name prefix table. */
    private static boolean isPrefix(String word) {
        return contains(prefixHash, word);
    }

    /** True if the word is a single character, as used for a middle initial. */
    private static boolean isInitial(String word) {
        return word != null && word.length() == 1;
    }
}