// Copyright 2003-2008. Mark Watson (markw@markwatson.com). All rights reserved. // This software is released under the LGPL (www.fsf.org) // For an alternative non-GPL license: contact the author // THIS SOFTWARE COMES WITH NO WARRANTY package nlp.com.knowledgebooks.nlp; import java.io.*; import java.util.*; /** * <p/> * Copyright 2002-2007 by Mark Watson. All rights reserved. * <p/> * <p/> * Copyright 1998-2012 by Mark Watson. All rights reserved. * <p/> * This software is can be used under either of the following licenses: * <p/> * 1. LGPL v3<br/> * 2. Apache 2 * <p/> */ public class FastTag { private static Hashtable<String, String[]> lexicon = new Hashtable<String, String[]>(); static { //System.out.println("Starting to load FastTag data..."); try { //System.out.println("Starting kbs.fasttag.FastTag static initialization..."); InputStream ins = FastTag.class.getClassLoader().getResourceAsStream("lexicon.txt"); if (ins == null) { ins = new FileInputStream("data/lexicon.txt"); } if (ins == null) { System.out.println("Failed to open 'lexicon.txt'"); System.exit(1); } else { Scanner scanner = new Scanner(ins); scanner.useDelimiter (System.getProperty("line.separator")); while (scanner.hasNext()) { parseLine(scanner.next()); } scanner.close(); } } catch (Exception e) { e.printStackTrace(); } } /** * */ public FastTag() { } /** * * @param word * @return true if the input word is in the lexicon, otherwise return false */ public boolean wordInLexicon(String word) { String[] ss = lexicon.get(word); if (ss != null) return true; // 1/22/2002 mod (from Lisp code): if not in hash, try lower case: if (ss == null) ss = lexicon.get(word.toLowerCase()); if (ss != null) return true; return false; } /** * * @param words list of strings to tag with parts of speech * @return list of strings for part of speech tokens */ public List<String> tag(List<String> words) { List<String> ret = new ArrayList<String>(words.size()); for (int i = 0, size = words.size(); i < size; i++) { String[] ss = (String[]) lexicon.get(words.get(i)); // 1/22/2002 mod (from Lisp code): if not in hash, try lower case: if (ss == null) ss = lexicon.get(words.get(i).toLowerCase()); if (ss == null && words.get(i).length() == 1) ret.add(words.get(i) + "^"); if (ss == null) ret.add("NN"); else ret.add(ss[0]); } /** * Apply transformational rules **/ for (int i = 0; i < words.size(); i++) { String word = ret.get(i); // rule 1: DT, {VBD | VBP} --> DT, NN if (i > 0 && ret.get(i - 1).equals("DT")) { if (word.equals("VBD") || word.equals("VBP") || word.equals("VB")) { ret.set(i, "NN"); } } // rule 2: convert a noun to a number (CD) if "." appears in the word if (word.startsWith("N")) { if (words.get(i).indexOf(".") > -1) { ret.set(i, "CD"); } try { Float.parseFloat(words.get(i)); ret.set(i, "CD"); } catch (Exception e) { // ignore: exception OK: this just means that the string could not parse as a number } } // rule 3: convert a noun to a past participle if words.get(i) ends with "ed" if (ret.get(i).startsWith("N") && words.get(i).endsWith("ed")) ret.set(i,"VBN"); // rule 4: convert any type to adverb if it ends in "ly"; if (words.get(i).endsWith("ly")) ret.set(i, "RB"); // rule 5: convert a common noun (NN or NNS) to a adjective if it ends with "al" if (ret.get(i).startsWith("NN") && word.endsWith("al")) ret.set(i, "JJ"); // rule 6: convert a noun to a verb if the preceeding work is "would" if (i > 0 && ret.get(i).startsWith("NN") && words.get(i - 1).equalsIgnoreCase("would")) ret.set(i, "VB"); // rule 7: if a word has been categorized as a common noun and it ends with "s", // then set its type to plural common noun (NNS) if (ret.get(i).equals("NN") && words.get(i).endsWith("s")) ret.set(i, "NNS"); // rule 8: convert a common noun to a present participle verb (i.e., a gerand) if (ret.get(i).startsWith("NN") && words.get(i).endsWith("ing")) ret.set(i, "VBG"); } return ret; } /** * Simple main test program * * @param args string to tokenize and tag */ public static void main(String[] args) { if (args.length == 0) { System.out.println("Usage: argument is a string like \"The ball rolled down the street.\"\n\nSample run:\n"); List<String> words = nlp.com.knowledgebooks.nlp.util.Tokenizer.wordsToList("The ball rolled down the street."); List<String> tags = (new FastTag()).tag(words); for (int i = 0; i < words.size(); i++) System.out.println(words.get(i) + "/" + tags.get(i)); } else { List<String> words = nlp.com.knowledgebooks.nlp.util.Tokenizer.wordsToList(args[0]); List<String> tags = (new FastTag()).tag(words); for (int i = 0; i < words.size(); i++) System.out.println(words.get(i) + "/" + tags.get(i)); } } private static void parseLine(String line) { int count = 0; for (int i=0, size=line.length(); i<size; i++) if (line.charAt(i)==' ') count++; if (count==0) return; String[] ss = new String[count]; Scanner lineScanner = new Scanner(line); lineScanner.useDelimiter(" "); String word = lineScanner.next(); count=0; while (lineScanner.hasNext()) { ss[count++] = lineScanner.next(); } lexicon.put(word, ss); } }