FastTag.java example

Explorer

Java-AI-Book-Code-master
- mr_temp
  - nlp
    - com
      - knowledgebooks
        mapreduce
        NameFinder.java
        nlp
        ExtractNames.java
        util
        ScoredList.java
        Tokenizer.java
- src
  - database
    - CreateSampleDatabases.java
    - DumpMetaData.java
  - geneticalgorithm
    - Genetic.java
    - TestGenetic.java
  - markov
    - Markov.java
  - neuralnetworks
  - nlp
    - com
      - knowledgebooks
        mapreduce
        NameFinder.java
        nlp
        ASpellWrapper.java
        AutoTagger.java
        ComparableDocument.java
        ExtractNames.java
        FastTag.java
        util
        NameValue.java
        NoiseWords.java
        RunExternal.java
        ScoredList.java
        Tokenizer.java
    - public_domain
      - Stemmer.java
  - opencalais
    - OpenCalaisClient.java
  - powerloom
  - search
  - semanticweb
  - spelling
    - jazzy
      - SpellingJazzyTester.java
    - norvig
      - SpellingSuggestions.java
    - norvigwordpairs
      - SpellingSuggestionsWordPairs.java
  - textsearch
  - weka
    - WekaStocks.java
  - wordnet
    - WordNetTest.java

// Copyright 2003-2008.  Mark Watson (markw@markwatson.com).  All rights reserved.
// This software is released under the LGPL (www.fsf.org)
// For an alternative non-GPL license: contact the author
// THIS SOFTWARE COMES WITH NO WARRANTY


package nlp.com.knowledgebooks.nlp;

import java.io.*;
import java.util.*;


/**
 * <p/>
 * Copyright 2002-2007 by Mark Watson. All rights reserved.
 * <p/>
 * <p/>
 * Copyright 1998-2012 by Mark Watson. All rights reserved.
 * <p/>
 * This software is can be used under either of the following licenses:
 * <p/>
 * 1. LGPL v3<br/>
 * 2. Apache 2
 * <p/>
 */
public class FastTag {

    private static Hashtable<String, String[]> lexicon = new Hashtable<String, String[]>();

    static {
        //System.out.println("Starting to load FastTag data...");
        try {
            //System.out.println("Starting kbs.fasttag.FastTag static initialization...");
            InputStream ins = FastTag.class.getClassLoader().getResourceAsStream("lexicon.txt");
            if (ins == null) {
                ins = new FileInputStream("data/lexicon.txt");
            }
            if (ins == null) {
                System.out.println("Failed to open 'lexicon.txt'");
                System.exit(1);
            } else {
                Scanner scanner =
                        new Scanner(ins);
                scanner.useDelimiter
                        (System.getProperty("line.separator"));
                while (scanner.hasNext()) {
                    parseLine(scanner.next());
                }
                scanner.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }


    /**
     * 
     */
    public FastTag() {
    }

    /**
     * 
     * @param word
     * @return true if the input word is in the lexicon, otherwise return false
     */
    public boolean wordInLexicon(String word) {
        String[] ss = lexicon.get(word);
        if (ss != null) return true;
        // 1/22/2002 mod (from Lisp code): if not in hash, try lower case:
        if (ss == null)
            ss = lexicon.get(word.toLowerCase());
        if (ss != null) return true;
        return false;
    }

    /**
     * 
     * @param words list of strings to tag with parts of speech
     * @return list of strings for part of speech tokens
     */
    public List<String> tag(List<String> words) {
        List<String> ret = new ArrayList<String>(words.size());
        for (int i = 0, size = words.size(); i < size; i++) {
            String[] ss = (String[]) lexicon.get(words.get(i));
            // 1/22/2002 mod (from Lisp code): if not in hash, try lower case:
            if (ss == null)
                ss = lexicon.get(words.get(i).toLowerCase());
            if (ss == null && words.get(i).length() == 1)
                ret.add(words.get(i) + "^");
            if (ss == null)
                ret.add("NN");
            else
                ret.add(ss[0]);
        }
        /**
         * Apply transformational rules
         **/
        for (int i = 0; i < words.size(); i++) {
            String word = ret.get(i);
            //  rule 1: DT, {VBD | VBP} --> DT, NN
            if (i > 0 && ret.get(i - 1).equals("DT")) {
                if (word.equals("VBD")
                        || word.equals("VBP")
                        || word.equals("VB")) {
                    ret.set(i, "NN");
                }
            }
            // rule 2: convert a noun to a number (CD) if "." appears in the word
            if (word.startsWith("N")) {
                if (words.get(i).indexOf(".") > -1) {
                    ret.set(i, "CD");
                }
                try {
                    Float.parseFloat(words.get(i));
                    ret.set(i, "CD");
                } catch (Exception e) {  // ignore: exception OK: this just means that the string could not parse as a number
                }
            }
            // rule 3: convert a noun to a past participle if words.get(i) ends with "ed"
            if (ret.get(i).startsWith("N") && words.get(i).endsWith("ed"))
                ret.set(i,"VBN");
            // rule 4: convert any type to adverb if it ends in "ly";
            if (words.get(i).endsWith("ly"))
                ret.set(i, "RB");
            // rule 5: convert a common noun (NN or NNS) to a adjective if it ends with "al"
            if (ret.get(i).startsWith("NN") && word.endsWith("al"))
                ret.set(i, "JJ");
            // rule 6: convert a noun to a verb if the preceeding work is "would"
            if (i > 0
                    && ret.get(i).startsWith("NN")
                    && words.get(i - 1).equalsIgnoreCase("would"))
                ret.set(i, "VB");
            // rule 7: if a word has been categorized as a common noun and it ends with "s",
            //         then set its type to plural common noun (NNS)
            if (ret.get(i).equals("NN") && words.get(i).endsWith("s"))
                ret.set(i, "NNS");
            // rule 8: convert a common noun to a present participle verb (i.e., a gerand)
            if (ret.get(i).startsWith("NN") && words.get(i).endsWith("ing"))
                ret.set(i, "VBG");
        }
        return ret;
    }

    /**
     * Simple main test program
     * 
     * @param args string to tokenize and tag
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            System.out.println("Usage: argument is a string like \"The ball rolled down the street.\"\n\nSample run:\n");
            List<String> words = nlp.com.knowledgebooks.nlp.util.Tokenizer.wordsToList("The ball rolled down the street.");
            List<String> tags = (new FastTag()).tag(words);
            for (int i = 0; i < words.size(); i++) System.out.println(words.get(i) + "/" + tags.get(i));
        } else {
            List<String> words = nlp.com.knowledgebooks.nlp.util.Tokenizer.wordsToList(args[0]);
            List<String> tags = (new FastTag()).tag(words);
            for (int i = 0; i < words.size(); i++) System.out.println(words.get(i) + "/" + tags.get(i));
        }
    }

    private static void parseLine(String line) {
        int count = 0;
        for (int i=0, size=line.length(); i<size; i++) if (line.charAt(i)==' ') count++;
        if (count==0) return;
        String[] ss = new String[count];
        Scanner lineScanner = new Scanner(line);
        lineScanner.useDelimiter(" ");
        String word = lineScanner.next();    count=0;
        while (lineScanner.hasNext()) {
            ss[count++] = lineScanner.next();
        }
        lexicon.put(word, ss);
    }

}