package edu.uncc.cs.watsonsim.scorers;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
//@author Varsha Devadas
/**
 * Experimental scorer built on OpenNLP: splits text into sentences, prints the
 * person names found by the NER model, parses every sentence, and scores a
 * candidate answer / question / passage triple by counting parse nodes that
 * carry the same label over the same covered text.
 *
 * <p>Models are loaded lazily from {@link #modelsPath} the first time a method
 * needs them. Instances keep mutable state and no synchronization is done here.
 */
public class NERScorer {
    /** Beam size handed to {@code ParserFactory.create}. */
    private static final int PARSER_BEAM_SIZE = 20;
    /** Advance percentage handed to {@code ParserFactory.create}. */
    private static final double PARSER_ADVANCE_PERCENTAGE = 0.95;

    /** Set to true only after every model has been loaded successfully. */
    private boolean modelsAreInitialized = false;
    /** Directory that holds the OpenNLP model binaries read by {@link #init()}. */
    public String modelsPath = "/home/varsha/opennlpModels"; //models directory
    private File parserMFile;
    private File sentDetectorMFile;
    private File posMFile;
    private File nerMFile;
    public SentenceModel sentenceModel; //sentence detection model
    public ParserModel parserModel; //parsing model
    public POSTaggerME tagger;
    public TokenNameFinderModel nerModel;
    // Sample data; only q is read by this class (in parserTest1).
    public String ca = "Jane Austen";
    public String q = "Jane Austen wrote Emma .";
    public String passage = "Jane Austen was very modest about her own genius.[7] She once famously described her work as " +
            "the little bit (two Inches wide) of Ivory, on which I work with so fine a brush, " +
            "as produces little effect after much labor [7]. " +
            "When she was a girl she wrote stories. Her works were printed only after much revision. " +
            "Only four of her novels were printed while she was alive. They were Sense and Sensibility (1811), " +
            "Pride and Prejudice (1813), Mansfield Park (1814) and Emma (1816). " +
            "Two other novels, Northanger Abbey and Persuasion, were printed in 1817 with " +
            "a biographical notice by her brother, Henry Austen. Persuasion was written shortly before her death. " +
            "She also wrote two earlier works, Lady Susan, and an unfinished novel, The Watsons. " +
            "She had been working on a new novel, Sanditon, but she died before she could finish it.";

    /**
     * Loads the sentence, POS, parser and person-NER models from {@link #modelsPath}.
     *
     * <p>Every model stream is closed before this method returns. The instance is
     * marked as initialized only when all four models were loaded.
     *
     * @throws InvalidFormatException if a model file is present but malformed
     * @throws IllegalStateException if a model file is missing or cannot be read
     */
    //TODO: allow partial initialization parserInit() and chunkerInit()
    public void init() throws InvalidFormatException {
        File modelsDir = new File(this.modelsPath);
        this.parserMFile = new File(modelsDir, "en-parser-chunking.bin");
        this.sentDetectorMFile = new File(modelsDir, "en-sent.bin");
        this.posMFile = new File(modelsDir, "en-pos-maxent.bin");
        this.nerMFile = new File(modelsDir, "en-ner-person.bin");

        try (InputStream sentModelIn = new FileInputStream(sentDetectorMFile);
             InputStream posModelIn = new FileInputStream(posMFile);
             InputStream parserModelIn = new FileInputStream(parserMFile);
             InputStream nerModelIn = new FileInputStream(nerMFile)) {
            //for finding sentences
            this.sentenceModel = new SentenceModel(sentModelIn);
            //for finding POS
            this.tagger = new POSTaggerME(new POSModel(posModelIn));
            //for parsing
            this.parserModel = new ParserModel(parserModelIn);
            //for finding person names
            this.nerModel = new TokenNameFinderModel(nerModelIn);
        } catch (FileNotFoundException e) {
            throw new IllegalStateException("OpenNLP model file not found under " + modelsDir, e);
        } catch (InvalidFormatException e) {
            // Declared by this method, so let callers see it unchanged.
            throw e;
        } catch (IOException e) {
            throw new IllegalStateException("Could not read OpenNLP models from " + modelsDir, e);
        }
        // Reached only when every model loaded; a failed load is retried on the next call.
        this.modelsAreInitialized = true;
    }

    /** Loads the models on first use; later calls are no-ops. */
    private void ensureInitialized() throws InvalidFormatException {
        if (!this.modelsAreInitialized) {
            init();
        }
    }

    /** Creates a parser over the loaded parser model with this class's fixed settings. */
    private Parser newParser() {
        return ParserFactory.create(this.parserModel, PARSER_BEAM_SIZE, PARSER_ADVANCE_PERCENTAGE);
    }

    /**
     * Parses a fixed sample text and {@link #q}, then prints both parses and the
     * grandchildren of the first parse's top node.
     *
     * @throws IOException if the models cannot be loaded
     */
    public void parserTest1() throws IOException {
        ensureInitialized();
        Parser parser = newParser();
        // NOTE(review): there is no space between "genius ." and q, so the two
        // sentences are fused at the boundary; confirm whether that is intended.
        Parse[] results = ParserTool.parseLine("Jane Austen was very modest about her own genius ." + this.q,
                parser, 1);
        Parse[] qResults = ParserTool.parseLine(this.q, parser, 1);
        Parse[] rChn = (results[0].getChildren())[0].getChildren();
        results[0].expandTopNode(results[0]);
        for (Parse result : results) {
            result.show();
        }
        for (Parse qResult : qResults) {
            qResult.show();
        }
        System.out.print("\n\n");
        for (Parse child : rChn) {
            child.show();
            System.out.print("\n");
        }
    }

    /**
     * Splits {@code testSents} into sentences and prints each one to stderr.
     *
     * @param testSents text to split
     * @throws InvalidFormatException if a model file is malformed
     */
    public void testSentDetector(String testSents) throws InvalidFormatException {
        // Lazy init, like the other entry points, instead of reloading every model per call.
        ensureInitialized();
        SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
        for (String sentence : sentenceDetector.sentDetect(testSents)) {
            System.err.println("sent: " + sentence);
        }
    }

    /**
     * Splits {@code p} into sentences, prints the person names found in each
     * sentence, and returns the top parse of each sentence.
     *
     * @param p passage text; {@code null} or empty yields an empty array
     * @return one parse per detected sentence, in sentence order
     * @throws InvalidFormatException if a model file is malformed
     */
    public Parse[] parsePassageText(String p) throws InvalidFormatException {
        ensureInitialized();
        if (p == null || p.isEmpty()) {
            return new Parse[0];
        }

        SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
        NameFinderME nameFinder = new NameFinderME(this.nerModel);
        Parser parser = newParser();
        //There are several tokenizers available. SimpleTokenizer works best
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;

        //find sentences, tokenize each, parse each, return top parse for each
        String[] sentences = sentenceDetector.sentDetect(p);
        Parse[] results = new Parse[sentences.length];
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i];
            Span[] tokenSpans = tokenizer.tokenizePos(sentence);
            String[] tokens = Span.spansToStrings(tokenSpans, sentence);

            // Name spans are token indices; map them back to character offsets
            // in the sentence to print the original surface text.
            for (Span name : nameFinder.find(tokens)) {
                int nameStart = tokenSpans[name.getStart()].getStart();
                int nameEnd = tokenSpans[name.getEnd() - 1].getEnd();
                System.out.println(sentence.substring(nameStart, nameEnd));
            }

            // The parser is given whitespace-separated tokens of this sentence.
            String sent = StringUtils.join(tokens, " ");
            System.out.println("Found sentence " + sent);
            Parse[] sentResults = ParserTool.parseLine(sent, parser, 1);
            results[i] = sentResults[0];
        }
        // Each call is treated as one document, so reset the name finder's adaptive data.
        nameFinder.clearAdaptiveData();
        return results;
    }

    /**
     * Tags a fixed sample sentence and prints {@code word/tag} pairs to stderr.
     *
     * @throws IllegalStateException if the models cannot be loaded
     */
    public void taggerTest() {
        try {
            ensureInitialized();
        } catch (InvalidFormatException e) {
            // This method declares no checked exception, so wrap and keep the cause.
            throw new IllegalStateException("Could not initialize the POS tagger", e);
        }
        String[] words = SimpleTokenizer.INSTANCE.tokenize(
                "The quick, red fox jumped over the lazy, brown dogs.");
        String[] result = tagger.tag(words);
        for (int i = 0; i < words.length; i++) {
            System.err.print(words[i] + "/" + result[i] + " ");
        }
        System.err.println();
    }

    /**
     * Returns the given nodes followed by all of their descendants.
     *
     * @param parseAr nodes to expand
     * @return {@code parseAr} itself when it is empty, otherwise a new array
     *         holding the nodes and then each node's descendants in turn
     */
    public Parse[] getAllChildren(Parse[] parseAr) {
        Parse[] collected = parseAr;
        for (Parse node : parseAr) {
            collected = ArrayUtils.addAll(collected, getAllChildren(node.getChildren()));
        }
        return collected;
    }

    /**
     * Returns {@code parse} followed by all of its descendants.
     *
     * @param parse root node
     * @return array whose first element is {@code parse}
     */
    public Parse[] getAllChildren(Parse parse) {
        Parse[] root = new Parse[] { parse };
        return ArrayUtils.addAll(root, getAllChildren(parse.getChildren()));
    }

    /**
     * Builds the lookup key for a node: its label and covered text, separated by
     * a NUL character so that different (label, text) pairs cannot collide.
     */
    // NOTE(review): this keys on getLabel(), while the verbose output prints
    // getType(); confirm which of the two is meant to define a match.
    private static String chunkKey(Parse chunk) {
        return chunk.getLabel() + '\u0000' + chunk.getCoveredText();
    }

    /**
     * Counts the nodes of {@code pa2} whose label and covered text both occur on
     * some node of {@code pa1}.
     *
     * @param pa1 reference nodes
     * @param pa2 nodes to look up; each matching node counts once
     * @param verbose when true, prints every matching node and the final count
     * @return number of matching nodes in {@code pa2}
     */
    public double compareParseChunks(Parse[] pa1, Parse[] pa2, boolean verbose) {
        HashMap<String, String> known = new HashMap<String, String>();
        for (Parse chunk : pa1) {
            known.put(chunkKey(chunk), "y");
        }

        double numMatches = 0;
        for (Parse chunk : pa2) {
            if (!known.containsKey(chunkKey(chunk))) {
                continue;
            }
            numMatches++;
            if (verbose) {
                System.out.println("\n");
                chunk.show();
                System.out.println("span: " + chunk.getSpan());
                System.out.println("type: " + chunk.getType());
            }
        }
        if (verbose) System.out.println("numMatches " + numMatches);
        return numMatches;
    }

    /**
     * Raw structural score: (question nodes matched in the passage) multiplied by
     * (candidate-answer nodes matched in the passage).
     *
     * @param ca candidate answer text
     * @param q question text
     * @param passage passage text
     * @param verbose forwarded to {@link #compareParseChunks}
     * @return product of the two match counts
     * @throws InvalidFormatException if a model file is malformed
     */
    public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException {
        Parse[] caParseCh = getAllChildren(this.parsePassageText(ca));
        Parse[] qParseCh = getAllChildren(this.parsePassageText(q));
        Parse[] pasParseCh = getAllChildren(this.parsePassageText(passage));
        double score1 = compareParseChunks(qParseCh, pasParseCh, verbose);
        double score2 = compareParseChunks(caParseCh, pasParseCh, verbose);
        return score1 * score2;
    }

    /**
     * {@link #scoreStructure} divided by the passage length in characters.
     *
     * @param ca candidate answer text
     * @param q question text
     * @param passage passage text
     * @param verbose forwarded to {@link #compareParseChunks}
     * @return normalized score, or 0 when the passage is {@code null} or empty
     * @throws InvalidFormatException if a model file is malformed
     */
    //TODO divide by passage length containing the matches, not the full passage length
    public double scoreStructureNorm(String ca, String q, String passage, boolean verbose) throws InvalidFormatException {
        double raw = scoreStructure(ca, q, passage, verbose);
        if (passage == null || passage.isEmpty()) {
            // Nothing to normalize by; avoids a 0/0 result.
            return 0;
        }
        return raw / passage.length();
    }

    /**
     * Demo driver: parses sample texts, prints the parses and their nodes, and
     * prints raw and normalized scores for two sample triples.
     */
    public static void main(String[] args) throws IOException {
        NERScorer pt = new NERScorer();
        Parse[] parses = pt.parsePassageText("this is a cat . this is a dog .");
        for (Parse parse : parses) {
            System.out.println("parses: ");
            parse.show();
        }
        String q = "red fox jumped over brown dogs .";
        String pas = "red fox jumped over brown dogs . The quick , red fox jumped over the lazy , brown dogs . ";
        //parses = pt.parsePassageText("The quick, red fox jumped over the lazy, brown dogs. The quick , red fox jumped over the lazy , brown dogs . ");
        parses = pt.parsePassageText(pas);
        // Sized from the detector's output rather than assuming exactly two sentences.
        Parse[][] parsecs = new Parse[parses.length][];
        for (int i = 0; i < parses.length; i++) {
            System.out.print("parse[" + i + "]: ");
            parses[i].show();
            System.out.println(parses[i].getText());
            Parse[] parsec = pt.getAllChildren(parses[i]);
            parsecs[i] = parsec;
            for (Parse node : parsec) {
                System.out.print("parses child: ");
                node.show();
            }
            System.out.println("number of children in the parses: " + parsec.length);
        }
        if (parsecs.length >= 2) {
            pt.compareParseChunks(parsecs[0], parsecs[1], true);
        }
        System.out.println();
        // Argument order is (candidate answer, question, passage).
        System.out.println("NormalizedScore: " + pt.scoreStructureNorm("red fox", q, pas, false));
        System.out.println("Raw Score: " + pt.scoreStructure("red fox", q, pas, false));
        System.out.println("\n\n");
        String ca = "Jane Austen";
        String qq = "Jane Austen wrote Emma";
        String passage = "Jane Austen was very modest about her own genius.[7] She once famously described her work as " +
                "the little bit (two Inches wide) of Ivory, on which I work with so fine a brush, " +
                "as produces little effect after much labor [7]. " +
                "Jane Austen wrote Emma." +
                "When she was a girl she wrote stories. Her works were printed only after much revision. " +
                "Only four of her novels were printed while she was alive. They were Sense and Sensibility (1811), " +
                "Pride and Prejudice (1813), Mansfield Park (1814) and Emma (1816). " +
                "Two other novels, Northanger Abbey and Persuasion, were printed in 1817 with " +
                "a biographical notice by her brother, Henry Austen. Persuasion was written shortly before her death. " +
                "She also wrote two earlier works, Lady Susan, and an unfinished novel, The Watsons. " +
                "She had been working on a new novel, Sanditon, but she died before she could finish it.";
        System.out.println();
        System.out.println("NormalizedScore: " + pt.scoreStructureNorm(ca, qq, passage, false));
        System.out.println("Raw Score: " + pt.scoreStructure(ca, qq, passage, false));
        //pt.taggerTest();
        //pt.testSentDetector(passage);
    }
}
//ts here