/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.summarisation.methods;
import java.io.File;
import java.io.FileInputStream;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.zip.GZIPInputStream;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.tokenizer.RegExTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.Streams;
/**
* @author Julius Penaranda
* @version $Id: PartOfSpeech.java 3583 2010-05-21 10:07:41Z mayer $
*/
public class PartOfSpeech {
public static final String NOUN = "noun";
public static final String ADJECTIVE = "adjective";
public static final String VERB = "verb";
private HmmDecoder decoder;
private final String modelpath = "./src/core/rsc/partOfSpeech/medpost.model.gz";
private TokenizerFactory TOKENIZER_FACTORY = new RegExTokenizerFactory("(-|'|\\d|\\p{L})+|\\S");
public PartOfSpeech() {
}
/**
* reads Model
*/
public void readModel() {
try {
File model = new File(modelpath);
System.out.println("Reading model from file=" + model.toString());
GZIPInputStream fileIn = new GZIPInputStream(new FileInputStream(model));
ObjectInputStream objIn = new ObjectInputStream(fileIn);
HiddenMarkovModel hmm = (HiddenMarkovModel) objIn.readObject();
Streams.closeInputStream(objIn);
decoder = new HmmDecoder(hmm);
} catch (Exception e) {
System.out.println("PartofSpeech_Feature: readModel(): " + e.getMessage());
}
}
/** returns type of tokens in the given line sentence, whether nouns, verbs... */
public ArrayList<String> getTokens(String line, String type) {
Tokenizer tokenizer;
String[] tokens;
String[] tags;
ArrayList<String> resultarray = new ArrayList<String>();
try {
char[] cs = line.toCharArray();
tokenizer = TOKENIZER_FACTORY.tokenizer(cs, 0, cs.length);
tokens = tokenizer.tokenize();
tags = decoder.firstBest(tokens);
for (int i = 0; i < tokens.length; ++i) {
// FIXME: refactor this!
// # of nouns
if (type == NOUN) {
if (tags[i].equals("NN") || tags[i].equals("NNP") || tags[i].equals("NNS")) {
// System.out.println("token: "+tokens[i]+" tag: "+tags[i]);
resultarray.add(tokens[i]);
}
}
// # of adjectives
if (type == ADJECTIVE) {
if (tags[i].equals("JJ") || tags[i].equals("JJR") || tags[i].equals("JJT")) {
resultarray.add(tokens[i]);
}
}
// # of verbs
if (type == VERB) {
if (tags[i].equals("VVB") || tags[i].equals("VVD") || tags[i].equals("VVG")
|| tags[i].equals("VVI") || tags[i].equals("VVN") || tags[i].equals("VVNJ")
|| tags[i].equals("VVGJ") || tags[i].equals("VVGN") || tags[i].equals("VVZ")) {
resultarray.add(tokens[i]);
}
}
// # of rel. pronouns
if (tags[i].equals("PNR")) {
// do nothing
}
// # of prepositions
if (tags[i].equals("II")) {
// do nothing
}
// # of adverbs
if (tags[i].equals("RR") || tags[i].equals("RRR") || tags[i].equals("RRT")) {
// do nothing
}
// # of articles
if (tokens[i].equals("the") || tokens[i].equals("The") || tokens[i].equals("THE")
|| tokens[i].equals("a") || tokens[i].equals("A") || tokens[i].equals("an")
|| tokens[i].equals("An") || tokens[i].equals("AN")) {
// do nothing
}
// # of pronouns
if (tags[i].equals("PN") || tags[i].equals("PND") || tags[i].equals("PNG")) {
// do nothing
}
// # of modals
if (tags[i].equals("VM") || tags[i].equals("VBB") || tags[i].equals("VBD") || tags[i].equals("VBG")
|| tags[i].equals("VBI") || tags[i].equals("VBN") || tags[i].equals("VBZ")
|| tags[i].equals("VDB") || tags[i].equals("VDD") || tags[i].equals("VDG")
|| tags[i].equals("VDI") || tags[i].equals("VDN") || tags[i].equals("VDZ")
|| tags[i].equals("VHB") || tags[i].equals("VHD") || tags[i].equals("VHG")
|| tags[i].equals("VHI") || tags[i].equals("VHN") || tags[i].equals("VHZ")) {
// do nothing
}
}
} catch (Exception e) {
e.printStackTrace();
System.out.println("Error occured in LingPipe!");
}
return resultarray;
}
}