package edu.uncc.cs.watsonsim.researchers; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import org.apache.commons.lang3.StringUtils; import opennlp.tools.cmdline.parser.ParserTool; import opennlp.tools.parser.Parse; import opennlp.tools.parser.Parser; import opennlp.tools.parser.ParserFactory; import opennlp.tools.parser.ParserModel; import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSTaggerME; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.tokenize.SimpleTokenizer; import opennlp.tools.util.InvalidFormatException; //@author Dhaval Patel /* * Scores based on matching POS structure of 'Question+Answer' and 'each sentence of passage'. * I haven't made it extending the PassageScorer. Just showed how it works with sample strings. * */ public class POSStructureScorer { public static String modelsPath = "PASTE-PATH-HERE"; // models directory private static File parserMFile; private static File sentDetectorMFile; private static File posMFile; public static SentenceModel sentenceModel; // sentence detection model public static ParserModel parserModel; // parsing model public static POSTaggerME tagger; public static void init() throws InvalidFormatException { File modelsDir = new File(modelsPath); parserMFile = new File(modelsDir, "en-parser-chunking.bin"); sentDetectorMFile = new File(modelsDir, "en-sent.bin"); posMFile = new File(modelsDir, "en-pos-maxent.bin"); InputStream sentModelIn = null; FileInputStream parserStream; try { // for finding sentences sentModelIn = new FileInputStream(sentDetectorMFile); sentenceModel = new SentenceModel(sentModelIn); // for finding POS FileInputStream posModelStream = new FileInputStream(posMFile); POSModel model = new POSModel(posModelStream); tagger = new POSTaggerME(model); // for parsing parserStream = new FileInputStream(parserMFile); parserModel = new ParserModel(parserStream); } catch (FileNotFoundException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static Parse[] parsePassageText(String p) throws InvalidFormatException{ //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel); Parser parser = ParserFactory.create( parserModel, 20, // beam size 0.95); // advance percentage String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); String sent= StringUtils.join(tks," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; } public static void main(String[] args) throws InvalidFormatException { init(); String sampleQuestion = "Jane Austen"; String sampleAnswer = "Jane Austen wrote Emma"; String samplePassage = "Jane Austen was very modest about her own genius.[7] She once famously described her work as "+ "the little bit (two Inches wide) of Ivory, on which I work with so fine a brush, " + "as produces little effect after much labor [7]. " + "Jane Austen wrote Emma."+ "When she was a girl she wrote stories. Her works were printed only after much revision. " + "Only four of her novels were printed while she was alive. They were Sense and Sensibility (1811), " + "Pride and Prejudice (1813), Mansfield Park (1814) and Emma (1816). " + "Two other novels, Northanger Abbey and Persuasion, were printed in 1817 with " + "a biographical notice by her brother, Henry Austen. Persuasion was written shortly before her death. " + "She also wrote two earlier works, Lady Susan, and an unfinished novel, The Watsons. " + "She had been working on a new novel, Sanditon, but she died before she could finish it."; String sampleQACombined = sampleAnswer + sampleQuestion; Parse[] sentences = parsePassageText(samplePassage); int[] scorerModelQA = POSScoreSentece(sampleQACombined); int[] scorerModelEachSentenceInPassage; double tempScore = 0; double finalScore = 0; for (int i = 0; i < sentences.length; i++) { scorerModelEachSentenceInPassage = POSScoreSentece(sentences[i].toString()); tempScore = AbsoluteScorerModelSubtractor(scorerModelQA,scorerModelEachSentenceInPassage); System.out.println("tempScore = "+tempScore); if(tempScore<= 0.1*sentences[i].toString().length()) { finalScore = finalScore + tempScore; } } System.out.println("Final Score is : " + finalScore); } private static double AbsoluteScorerModelSubtractor(int[] scorerModelQA, int[] scorerModelEachSentenceInPassage) { // TODO Auto-generated method stub return Math.abs(scorerModelQA[0]-scorerModelEachSentenceInPassage[0])+ Math.abs(scorerModelQA[1]-scorerModelEachSentenceInPassage[1])+ Math.abs(scorerModelQA[2]-scorerModelEachSentenceInPassage[2])+ Math.abs(scorerModelQA[3]-scorerModelEachSentenceInPassage[3])+ Math.abs(scorerModelQA[4]-scorerModelEachSentenceInPassage[4])+ Math.abs(scorerModelQA[5]-scorerModelEachSentenceInPassage[5]); } private static int[] POSScoreSentece(String sampleQACombined) { // TODO Auto-generated method stub int[] scorerModel = { 0, 0, 0, 0, 0, 0 }; String[] words = SimpleTokenizer.INSTANCE.tokenize(sampleQACombined); String[] result = tagger.tag(words); for (int i = 0; i < result.length; i++) { System.out.println(result[i]); } for (int i=0 ; i < words.length; i++) { if(result[i].equals("CD")){ scorerModel[0]++; }else if(result[i].equals("EX")){ scorerModel[1]++; }else if(result[i].equals("JJ") || result[i].equals("JJR") || result[i].equals("JJS")){ scorerModel[2]++; }else if(result[i].equals("NN") || result[i].equals("NNS") || result[i].equals("NNP") || result[i].equals("NNPS")){ scorerModel[3]++; }else if(result[i].equals("RB") || result[i].equals("RBR") || result[i].equals("RBS")){ scorerModel[4]++; }else if(result[i].equals("VB") || result[i].equals("VBD") || result[i].equals("VBG") || result[i].equals("VBN") || result[i].equals("VBP") || result[i].equals("VBZ")){ scorerModel[5]++; } } return scorerModel; } }