package edu.uncc.cs.watsonsim.scorers;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import edu.uncc.cs.watsonsim.Answer;
import edu.uncc.cs.watsonsim.Passage;
import edu.uncc.cs.watsonsim.Phrase;
import edu.uncc.cs.watsonsim.StringUtils;
import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
/** Create a score based on how many parse tree fragments (same label
 * over the same covered text) the question, the candidate answer,
 * and the supporting passage have in common.
 *
 * This scorer can be very slow.
 *
 * @author Wlodek
 * @author Sean Gallagher
 */
public class SentenceSimilarity extends PassageScorer {
public String modelsPath = "data/"; // models directory
private File parserMFile;
private File sentDetectorMFile;
private File posMFile;
public SentenceModel sentenceModel; // sentence detection model
public ParserModel parserModel; // parsing model
public POSTaggerME tagger; // part-of-speech tagger
// Built once in the constructor to prevent unnecessary reinstantiation
SentenceDetectorME sentenceDetector;
Parser parser;
// Initialize all models needed for processing a passage of text
// (multiple sentences)
// TODO: allow partial initialization via parserInit() and chunkerInit()
public SentenceSimilarity() {
File modelsDir = new File(this.modelsPath);
this.parserMFile = new File(modelsDir, "en-parser-chunking.bin");
this.sentDetectorMFile = new File(modelsDir, "en-sent.bin");
this.posMFile = new File(modelsDir, "en-pos-maxent.bin");
// Load all three models; try-with-resources closes each stream.
try (InputStream sentModelIn = new FileInputStream(sentDetectorMFile);
InputStream posModelIn = new FileInputStream(posMFile);
InputStream parserModelIn = new FileInputStream(parserMFile)) {
// for finding sentence boundaries
this.sentenceModel = new SentenceModel(sentModelIn);
// for finding parts of speech
this.tagger = new POSTaggerME(new POSModel(posModelIn));
// for parsing
this.parserModel = new ParserModel(parserModelIn);
} catch (IOException e) {
// A missing or unreadable model is unrecoverable: fail fast here
// rather than hit a NullPointerException just below.
throw new RuntimeException(
"Failed to load OpenNLP models from " + modelsDir.getAbsolutePath(), e);
}
sentenceDetector = new SentenceDetectorME(this.sentenceModel);
parser = ParserFactory.create(
this.parserModel,
20, // beam size
0.95); // advance percentage
}
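/* Note: a beam size of 20 and an advance percentage of 0.95 match
 * OpenNLP's built-in defaults, so ParserFactory.create(this.parserModel)
 * should behave identically; they are spelled out here for tuning. */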
/** Turn one tokenized sentence into its single top-ranked parse tree. */
public Parse parseSentence(List<String> tokens) {
// Progress indicator: one ';' per parsed sentence, since parsing is slow
System.out.print(";");
String sent = StringUtils.join(tokens, " ");
return ParserTool.parseLine(sent, parser, 1)[0];
}
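/* For example (illustrative only; the exact bracketing depends on the
 * model files under data/):
 *   parseSentence(Arrays.asList("John", "loves", "Mary"))
 * might yield a tree like
 *   (TOP (S (NP John) (VP loves (NP Mary))))
 */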
/** Turn a tokenized paragraph into a list of parses, one top-ranked
 * parse per sentence. */
public List<Parse> parseParagraph(List<List<String>> paragraph) {
List<Parse> results = new ArrayList<>(paragraph.size());
for (List<String> sentence : paragraph) {
results.add(parseSentence(sentence));
}
return results;
}
/** Tokenize a paragraph into sentences, then each sentence into words. */
public List<List<String>> tokenizeParagraph(String paragraph) {
List<List<String>> results = new ArrayList<>();
// Find sentence boundaries, then tokenize each sentence.
// Several tokenizers are available; SimpleTokenizer works best here.
for (String unsplit_sentence : sentenceDetector.sentDetect(paragraph)) {
results.add(Arrays.asList(
SimpleTokenizer.INSTANCE.tokenize(unsplit_sentence)
));
}
return results;
}
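/* For example, SimpleTokenizer splits on character classes, so
 * punctuation becomes its own token:
 *   tokenizeParagraph("It's here. Look!")
 * yields roughly
 *   [["It", "'", "s", "here", "."], ["Look", "!"]]
 * (illustrative; the sentence split depends on en-sent.bin) */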
/** Enumerate every node of a forest of parse trees, breadth-first:
 * the given roots plus all of their descendants. */
public List<Parse> getAllChildren(List<Parse> parses) {
List<Parse> doneChildren = new ArrayList<>(parses.size());
Deque<Parse> nextChildren = new ArrayDeque<>(100);
nextChildren.addAll(parses);
while (!nextChildren.isEmpty()) {
Parse child = nextChildren.remove();
doneChildren.add(child);
nextChildren.addAll(Arrays.asList(child.getChildren()));
}
return doneChildren;
}
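/* For example, given the single tree (TOP (S (NP John) (VP runs))),
 * this returns the TOP, S, NP, and VP nodes plus the token-level
 * leaves, in breadth-first order. (Illustrative; the exact leaf nodes
 * follow whatever OpenNLP's Parse.getChildren() reports.) */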
/** Convenience overload: enumerate every node of a single parse tree. */
private List<Parse> getAllChildren(Parse parse) {
List<Parse> p = new ArrayList<>(1);
p.add(parse);
return getAllChildren(p);
}
/** Compute the number of distinct matches between two parse forests,
 * where a match means the same label over the same covered text.
 * @param pa1 One parse forest
 * @param pa2 Another parse forest
 * @param verbose Whether to print progress to stdout (currently unused)
 * @return the count of distinct matching chunks
 */
public double compareParseChunks(List<Parse> pa1, List<Parse> pa2, boolean verbose) {
HashSet<String> bag1 = new HashSet<>();
HashSet<String> bag2 = new HashSet<>();
for (Parse p : pa1) {
bag1.add(p.getCoveredText()+"\n"+p.getLabel());
}
for (Parse p : pa2) {
bag2.add(p.getCoveredText()+"\n"+p.getLabel());
}
bag2.retainAll(bag1);
return bag2.size();
}
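/* Worked example: each chunk is keyed as coveredText + "\n" + label, so
 * if both forests contain a node labeled NP covering "the house", that
 * counts as one match. Duplicate chunks within a forest count only
 * once, because the bags are sets. */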
/**
 * Flatten a paragraph into a set of unique, lowercased tokens.
 * @param paragraph a list of tokenized sentences
 * @return the flattened set
 */
public HashSet<String> flatten(List<List<String>> paragraph) {
HashSet<String> results = new HashSet<>();
for (List<String> sentence : paragraph)
for (String word : sentence)
results.add(word.toLowerCase());
return results;
}
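/* For example, flattening [["The", "cat"], ["cat", "sat"]] yields the
 * set {"the", "cat", "sat"}. */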
/** Generate a normalized score:
 * question matches times answer matches, divided by passage length.
 */
// TODO: divide by the length of the passage span containing the matches,
// not the full passage length
public double scorePassage(Phrase q, Answer a, Passage p) {
boolean verbose = true;
// Tokenize the text, necessary for simple and NLP searches
List<List<String>> ca_sentences = tokenizeParagraph(a.text);
List<List<String>> q_sentences = tokenizeParagraph(q.text);
List<List<String>> passage_sentences = tokenizeParagraph(p.text);
// Run NLP on the question and candidate answer
List<Parse> ca_children = getAllChildren(parseParagraph(ca_sentences));
List<Parse> q_children = getAllChildren(parseParagraph(q_sentences));
List<Parse> p_children = new ArrayList<>();
// Speedup: Look for these tokens before running NLP
HashSet<String> target_tokens = flatten(ca_sentences);
//target_tokens.addAll(flatten(q_sentences));
// Stop-word filtering is essentially free here, since these tokens
// get scanned against every passage sentence anyway.
target_tokens.removeAll(Arrays.asList(
"i", "me", "you", "he", "she", "him", "they", "them",
"his", "her", "hers", "my", "mine", "your", "yours", "their", "theirs",
"of", "a", "the",
"and", "or", "not", "but",
"this", "that", "these", "those",
"on", "in", "from", "to", "over", "under", "with", "by", "for",
"without", "beside", "between",
"has", "have", "had", "will", "would", "gets", "get", "got",
"be", "am", "been", "was", "were", "being", "is",
".", ",", ":", ";", "[", "{", "}", "]", "(", ")", "<", ">",
"?", "/", "\\", "-", "_", "=", "+", "~", "`", "@", "#", "$",
"%", "^", "&", "*"
));
for (List<String> sentence : passage_sentences) {
// Does it have the right tokens?
for (String word : sentence) {
if (target_tokens.contains(word.toLowerCase())) {
// Found a shared content word, so this sentence is worth parsing.
p_children.addAll(getAllChildren(parseSentence(sentence)));
break;
}
}
}
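// The final score multiplies the question-passage and answer-passage
// match counts, normalized by passage length. For example, 4 question
// matches and 3 answer matches in a 200-character passage score
// 4 * 3 / 200 = 0.06.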
double q_score = compareParseChunks(
q_children,
p_children,
verbose);
double ca_score = compareParseChunks(
ca_children,
p_children,
verbose);
return q_score * ca_score / p.text.length();
}
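/* A minimal, self-contained sketch of how the pieces above fit
 * together, assuming the OpenNLP model files are present under data/.
 * Comparing a parse forest against itself just counts its distinct
 * chunks. Illustrative only; not part of the scoring pipeline. */
public static void main(String[] args) {
SentenceSimilarity ss = new SentenceSimilarity();
List<List<String>> sentences =
ss.tokenizeParagraph("John loves Mary. Mary loves John.");
List<Parse> chunks = ss.getAllChildren(ss.parseParagraph(sentences));
System.out.println(ss.compareParseChunks(chunks, chunks, true));
}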
}