/*
* Copyright (c) 2016, Peter Anderson <peter.anderson@anu.edu.au>
*
* This file is part of Semantic Propositional Image Caption Evaluation
* (SPICE).
*
* SPICE is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* SPICE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public
* License along with SPICE. If not, see <http://www.gnu.org/licenses/>.
*
*/
package edu.anu.spice;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import edu.cmu.meteor.aligner.SynonymDictionary;
import edu.cmu.meteor.util.Constants;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.scenegraph.SemanticGraphEnhancer;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.util.CoreMap;
/**
*
* SpiceParser is a modified version of the RuleBasedParser in
* scenegraph-1.0 by Sebastian Schuster. The following changes have been
* made for use in SPICE:
* - Plural objects are not duplicated in the scene graph, instead the
* numeric modifier is made to be an attribute of the object.
* - Objects in a conjunction each receive the sentence relations of the other.
* - All nouns are added as objects, even if no relation is found.
* - Compound nouns are treated as nouns with an attribute.
*
*/
public class SpiceParser {
/* A man is riding a horse. */
static final SemgrexPattern SUBJ_PRED_OBJ_TRIPLET_PATTERN = SemgrexPattern
.compile("{}=pred >nsubj {tag:/NNP?S?/}=subj >/(iobj|dobj|nmod:.*)/=objreln {tag:/NNP?S?/}=obj !> cop {}");
/* A woman is smiling. */
static final SemgrexPattern SUBJ_PRED_PAIR_PATTERN = SemgrexPattern
.compile("{}=pred >nsubj {tag:/NNP?S?/}=subj !>/(iobj|dobj|nmod:.*)/ {tag:/NNP?S?/} !>cop {}");
/* The man is a rider. */
static final SemgrexPattern COPULAR_PATTERN = SemgrexPattern.compile("{}=pred >nsubj {tag:/NNP?S?/}=subj >cop {}");
/* A smart woman. */
static final SemgrexPattern ADJ_MOD_PATTERN = SemgrexPattern.compile("{}=obj >/(amod)/ {}=adj");
/* The man is tall. */
static final SemgrexPattern ADJ_PRED_PATTERN = SemgrexPattern.compile("{tag:/J.*/}=adj >nsubj {}=obj");
/* A woman is in the house. */
static final SemgrexPattern PP_MOD_PATTERN = SemgrexPattern.compile("{tag:/NNP?S?/}=gov >/nmod:.*/=reln {}=mod");
/* His watch. */
static final SemgrexPattern POSS_PATTERN = SemgrexPattern
.compile("{tag:/NNP?S?/}=gov >/nmod:poss/=reln {tag:/NNP?S?/}=mod");
/* */
static final SemgrexPattern AGENT_PATTERN = SemgrexPattern
.compile("{tag:/V.*/}=pred >/nmod:agent/=reln {tag:/NNP?S?/}=subj >nsubjpass {tag:/NNP?S?/}=obj ");
/* A cat sitting in a chair. */
static final SemgrexPattern ACL_PATTERN = SemgrexPattern
.compile("{}=subj >acl ({tag:/V.*/}=pred >/(iobj|dobj|nmod:.*)/=objreln {tag:/NNP?S?/}=obj)");
// Sebastian Schuster - TODO: do something special with nmod:by
// Several people use laptop computers while sitting around on couches and
// chairs.
// A red desk chair has been rolled away from the desk.
// A red emergency truck has a strong silver guard on the front of its
// grill.
// They are preparing the animals for a show.
// Two engines are visible on the plane.
// TODO: passives without agent
// TODO: adverbial modifiers - > potentially (green + light green
// more spatial relations: in the center of, in the front of,
/* Any noun in a conjuction. */
static final SemgrexPattern NOUN_CONJ_PATTERN = SemgrexPattern
.compile("{tag:/NNP?S?/}=tail >/(conj:and|conj:or)/ {tag:/NNP?S?/}=head");
/* Compound noun */
static final SemgrexPattern COMPOUND_NOUN_PATTERN = SemgrexPattern
.compile("{tag:/NNP?S?/}=tail >/(compound)/ {tag:/NNP?S?/}=head");
/* Any noun, not compound */
static final SemgrexPattern NOUN_PATTERN = SemgrexPattern.compile("{tag:/NNP?S?/}=word");
/* From SemanticGraphEnhancer - Both subject and object or PP are plurals. */
static final SemgrexPattern PLURAL_SUBJECT_OBJECT_PATTERN = SemgrexPattern.compile("{}=pred >nsubj {tag:/NNP?S/}=subj [ >/(.obj)/ ({tag:/NNP?S/}=obj) | >/(nmod:((?!agent).)*$)/ ({tag:/NNP?S/}=obj >case {}) ] ");
/* From SemanticGraphEnhancer - Only subject is plural (either no object or PP exists, or they are singular). */
static final SemgrexPattern PLURAL_SUBJECT_PATTERN = SemgrexPattern.compile("{tag:/NNP?S/}=subj [ == {$} | <nsubj ({} !>/.obj/ {tag:/NNP?S/} !>/(nmod:((?!agent).)*$)/ ({tag:/NNP?S/} > case {}) )]");
/* From SemanticGraphEnhancer - Only object is plural (either no subject or it is singular). */
static final SemgrexPattern PLURAL_OTHER_PATTERN = SemgrexPattern.compile("{tag:/NNP?S/}=word !== {$} !<nsubj {} !</.obj|nmod.*/ ({} >nsubj {tag:/NNP?S/})");
protected class ProposedTuples {
public ArrayList<ArrayList<String>> tuples;
public ProposedTuples(){
this.tuples = new ArrayList<ArrayList<String>>();
}
public void addTuple(IndexedWord subj, IndexedWord obj, String predicate){
ArrayList<String> tuple = new ArrayList<String>();
tuple.add(subj.lemma().trim().toLowerCase());
tuple.add(obj.lemma().trim().toLowerCase());
tuple.add(predicate);
this.tuples.add(tuple);
}
public void addTuple(IndexedWord subj, IndexedWord pred) {
ArrayList<String> tuple = new ArrayList<String>();
tuple.add(subj.lemma().trim().toLowerCase());
tuple.add(pred.lemma().trim().toLowerCase());
this.tuples.add(tuple);
}
public void addTuple(IndexedWord word) {
ArrayList<String> tuple = new ArrayList<String>();
tuple.add(word.lemma().trim().toLowerCase());
this.tuples.add(tuple);
}
public void addTuple(IndexedWord head, String string, String predicate) {
ArrayList<String> tuple = new ArrayList<String>();
tuple.add(head.lemma().trim().toLowerCase());
tuple.add(string);
tuple.add(predicate);
this.tuples.add(tuple);
}
}
protected final SynonymDictionary synonyms;
protected Annotator annotator;
protected LmdbTupleDB db;
Boolean mergeSimilarNodes;
public SpiceParser(String dbPath, int numThreads, Boolean mergeSimilarNodes) {
this.mergeSimilarNodes = mergeSimilarNodes;
this.annotator = new Annotator(numThreads, 10000);
if (dbPath != null){
this.db = new LmdbTupleDB(dbPath);
} else {
this.db = null;
}
URL synDirURL = Constants.DEFAULT_SYN_DIR_URL;
try {
URL excFileURL = new URL(synDirURL.toString() + "/english.exceptions");
URL synFileURL = new URL(synDirURL.toString() + "/english.synsets");
URL relFileURL = new URL(synDirURL.toString() + "/english.relations");
this.synonyms = new SynonymDictionary(excFileURL, synFileURL, relFileURL);
} catch (IOException ex) {
throw new RuntimeException("Error: Synonym dictionary could not be loaded (" + synDirURL.toString() + ")");
}
}
protected Map<String, ArrayList<ArrayList<String>>> loadTuplesFromDB(List<String> input, boolean cache){
// Load any pre-processed captions from the database
Map<String, ArrayList<ArrayList<String>>> captionTuples = this.db.getTransaction(input);
ArrayList<String> unparsed = new ArrayList<String>();
for (String caption: input){
if (!captionTuples.containsKey(caption)){
unparsed.add(caption);
}
}
// Parse and save captions not in database
if (!unparsed.isEmpty()){
this.annotator.setInput(unparsed);
Map<String, ArrayList<ArrayList<String>>> newTuples = new HashMap<String, ArrayList<ArrayList<String>>>();
Iterator<String> caption = unparsed.iterator();
while (caption.hasNext()){
List<Annotation> anns = this.annotator.parseNextBatch();
assert (!anns.isEmpty());
Iterator<Annotation> ann = anns.iterator();
while (caption.hasNext() && ann.hasNext()) {
ProposedTuples tuples = this.parseAnnotation(ann.next());
newTuples.put(caption.next(), tuples.tuples);
}
if (cache){
this.db.putTransaction(newTuples);
}
captionTuples.putAll(newTuples);
newTuples.clear();
}
}
return captionTuples;
}
protected Map<String, ArrayList<ArrayList<String>>> generateTuples(List<String> input){
Map<String, ArrayList<ArrayList<String>>> captionTuples = new HashMap<String, ArrayList<ArrayList<String>>>();
this.annotator.setInput(input);
Iterator<String> caption = input.iterator();
while(caption.hasNext()){
List<Annotation> anns = this.annotator.parseNextBatch();
assert (!anns.isEmpty());
Iterator<Annotation> ann = anns.iterator();
while (caption.hasNext() && ann.hasNext()) {
ProposedTuples tuples = this.parseAnnotation(ann.next());
captionTuples.put(caption.next(), tuples.tuples);
}
}
return captionTuples;
}
protected Map<String, ArrayList<ArrayList<String>>> loadTuples(List<String> input){
if (this.db == null){
return this.generateTuples(input);
} else {
return this.loadTuplesFromDB(input, true);
}
}
public List<SceneGraph> parseCaptions(List<String> input, List<Integer> chunks) {
// Load any pre-processed captions from the database
Map<String, ArrayList<ArrayList<String>>> captionTuples = this.loadTuples(input);
// Build scene graphs from tuples (with merging etc)
List<SceneGraph> sgs = new ArrayList<SceneGraph>();
Iterator<String> it = input.iterator();
int captionCount;
for (Integer chunk: chunks){
captionCount = 0;
SceneGraph scene = new SceneGraph(synonyms, this.mergeSimilarNodes);
while(it.hasNext() && captionCount<chunk){
ArrayList<ArrayList<String>> tuples = captionTuples.get(it.next());
assert(tuples != null);
for (ArrayList<String> tuple: tuples){
scene.addTuple(tuple);
}
captionCount++;
}
sgs.add(scene);
}
return sgs;
}
public List<SceneGraph> parseCaptions(List<String> input) {
Map<String, ArrayList<ArrayList<String>>> captionTuples = this.loadTuples(input);
List<SceneGraph> sgs = new ArrayList<SceneGraph>();
for (String caption: input){
ArrayList<ArrayList<String>> tuples = captionTuples.get(caption);
assert(tuples != null);
SceneGraph scene = new SceneGraph(synonyms, this.mergeSimilarNodes);
for (ArrayList<String> tuple: tuples){
scene.addTuple(tuple);
}
sgs.add(scene);
}
return sgs;
}
/**
* Attaches particles to the main predicate.
*/
protected String getPredicate(SemanticGraph sg, IndexedWord mainPred) {
if (sg.hasChildWithReln(mainPred, UniversalEnglishGrammaticalRelations.PHRASAL_VERB_PARTICLE)) {
IndexedWord part = sg.getChildWithReln(mainPred,
UniversalEnglishGrammaticalRelations.PHRASAL_VERB_PARTICLE);
return String.format("%s %s", mainPred.lemma().equals("be") ? "" : mainPred.lemma(), part.value());
}
return mainPred.lemma();
}
/**
* Checks if a word has a numerical modifier, and if so adds it as an object
* with attribute
*/
protected void checkForNumericAttribute(ProposedTuples tuples, SemanticGraph sg, IndexedWord word) {
if (sg.hasChildWithReln(word, UniversalEnglishGrammaticalRelations.NUMERIC_MODIFIER)) {
IndexedWord nummod = sg.getChildWithReln(word, UniversalEnglishGrammaticalRelations.NUMERIC_MODIFIER);
/* Prevent things like "number 5" */
if (nummod.index() < word.index()) {
tuples.addTuple(word, nummod);
}
} else if (sg.hasChildWithReln(word, SemanticGraphEnhancer.QMOD_RELATION)) {
IndexedWord qmod = sg.getChildWithReln(word, SemanticGraphEnhancer.QMOD_RELATION);
tuples.addTuple(word, qmod);
}
}
protected ProposedTuples parseAnnotation(Annotation ann) {
ProposedTuples tuples = new ProposedTuples();
ArrayList<SemanticGraph> sgs = new ArrayList<SemanticGraph>();
for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
SemanticGraph sg = sentence
.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
sgs.add(sg);
}
for (SemanticGraph sg : sgs) {
// Everything from RuleBasedParser except resolvePlurals(sg);
SemanticGraphEnhancer.processQuanftificationModifiers(sg);
SemanticGraphEnhancer.collapseCompounds(sg);
SemanticGraphEnhancer.collapseParticles(sg);
SemanticGraphEnhancer.resolvePronouns(sg);
SemgrexMatcher matcher = SUBJ_PRED_OBJ_TRIPLET_PATTERN.matcher(sg);
while (matcher.find()) {
IndexedWord subj = matcher.getNode("subj");
IndexedWord obj = matcher.getNode("obj");
IndexedWord pred = matcher.getNode("pred");
String reln = matcher.getRelnString("objreln");
String predicate = getPredicate(sg, pred);
if (reln.startsWith("nmod:") && !reln.equals("nmod:poss") && !reln.equals("nmod:agent")) {
predicate += reln.replace("nmod:", " ").replace("_", " ");
}
tuples.addTuple(subj, obj, predicate);
}
matcher = ACL_PATTERN.matcher(sg);
while (matcher.find()) {
IndexedWord subj = matcher.getNode("subj");
IndexedWord obj = matcher.getNode("obj");
IndexedWord pred = matcher.getNode("pred");
String reln = matcher.getRelnString("objreln");
String predicate = getPredicate(sg, pred);
if (reln.startsWith("nmod:") && !reln.equals("nmod:poss") && !reln.equals("nmod:agent")) {
predicate += reln.replace("nmod:", " ").replace("_", " ");
}
tuples.addTuple(subj, obj, predicate);
}
SemgrexPattern[] subjPredPatterns = { SUBJ_PRED_PAIR_PATTERN, COPULAR_PATTERN };
for (SemgrexPattern p : subjPredPatterns) {
matcher = p.matcher(sg);
while (matcher.find()) {
IndexedWord subj = matcher.getNode("subj");
IndexedWord pred = matcher.getNode("pred");
if (sg.hasChildWithReln(pred, UniversalEnglishGrammaticalRelations.CASE_MARKER)) {
IndexedWord caseMarker = sg.getChildWithReln(pred,
UniversalEnglishGrammaticalRelations.CASE_MARKER);
String prep = caseMarker.value();
if (sg.hasChildWithReln(caseMarker,
UniversalEnglishGrammaticalRelations.MULTI_WORD_EXPRESSION)) {
for (IndexedWord additionalCaseMarker : sg.getChildrenWithReln(caseMarker,
UniversalEnglishGrammaticalRelations.MULTI_WORD_EXPRESSION)) {
prep = prep + " " + additionalCaseMarker.value();
}
}
tuples.addTuple(subj, pred, prep);
} else {
if (!pred.lemma().equals("be")) {
tuples.addTuple(subj, pred);
}
}
}
}
matcher = ADJ_MOD_PATTERN.matcher(sg);
while (matcher.find()) {
IndexedWord obj = matcher.getNode("obj");
IndexedWord adj = matcher.getNode("adj");
tuples.addTuple(obj, adj);
}
matcher = ADJ_PRED_PATTERN.matcher(sg);
while (matcher.find()) {
IndexedWord obj = matcher.getNode("obj");
IndexedWord adj = matcher.getNode("adj");
tuples.addTuple(obj, adj);
}
matcher = PP_MOD_PATTERN.matcher(sg);
while (matcher.find()) {
IndexedWord gov = matcher.getNode("gov");
IndexedWord mod = matcher.getNode("mod");
String reln = matcher.getRelnString("reln");
String predicate = reln.replace("nmod:", "").replace("_", " ");
if (predicate.equals("poss") || predicate.equals("agent")) {
continue;
}
tuples.addTuple(gov, mod, predicate);
}
matcher = POSS_PATTERN.matcher(sg);
while (matcher.find()) {
IndexedWord gov = matcher.getNode("gov");
IndexedWord mod = matcher.getNode("mod");
tuples.addTuple(mod, gov, "have");
}
matcher = AGENT_PATTERN.matcher(sg);
while (matcher.find()) {
IndexedWord subj = matcher.getNode("subj");
IndexedWord obj = matcher.getNode("obj");
IndexedWord pred = matcher.getNode("pred");
tuples.addTuple(subj, obj, getPredicate(sg, pred));
}
matcher = PLURAL_SUBJECT_OBJECT_PATTERN.matcher(sg);
while (matcher.findNextMatchingNode()) {
IndexedWord subj = matcher.getNode("subj");
IndexedWord obj = matcher.getNode("obj");
checkForNumericAttribute(tuples, sg, subj);
checkForNumericAttribute(tuples, sg, obj);
}
matcher = PLURAL_SUBJECT_PATTERN.matcher(sg);
while (matcher.findNextMatchingNode()) {
IndexedWord subj = matcher.getNode("subj");
checkForNumericAttribute(tuples, sg, subj);
}
matcher = PLURAL_OTHER_PATTERN.matcher(sg);
while (matcher.findNextMatchingNode()) {
IndexedWord word = matcher.getNode("word");
checkForNumericAttribute(tuples, sg, word);
}
matcher = COMPOUND_NOUN_PATTERN.matcher(sg);
Set<IndexedWord> compoundNouns = new HashSet<IndexedWord>();
while (matcher.find()) {
IndexedWord tail = matcher.getNode("tail");
IndexedWord head = matcher.getNode("head");
compoundNouns.add(tail);
compoundNouns.add(head);
tuples.addTuple(tail, head);
}
// Must happen last, since it will reuse existing parts of the scene
// graph
matcher = NOUN_CONJ_PATTERN.matcher(sg);
while (matcher.find()) {
IndexedWord tail = matcher.getNode("tail");
IndexedWord head = matcher.getNode("head");
int original_length = tuples.tuples.size();
for (int i=0; i<original_length; ++i){
ArrayList<String> prop = tuples.tuples.get(i);
if (prop.size() == 3 && prop.get(0).equals(head)){
tuples.addTuple(head, prop.get(1), prop.get(2));
}
if (prop.size() == 3 && prop.get(1).equals(tail)){
tuples.addTuple(tail, prop.get(1), prop.get(2));
}
}
}
matcher = NOUN_PATTERN.matcher(sg);
while (matcher.find()) {
IndexedWord word = matcher.getNode("word");
if (!compoundNouns.contains(word)) {
tuples.addTuple(word);
}
}
}
return tuples;
}
public static void main(String args[]) throws IOException {
SpiceParser parser = new SpiceParser(null, 1, true);
if (args.length < 1) {
System.err.println("Processing from stdin. Enter one sentence per line.");
System.err.print("> ");
Scanner scanner = new Scanner(System.in);
String line;
while ((line = scanner.nextLine()) != null) {
List<String> captions = new ArrayList<String>();
captions.add(line);
SceneGraph scene = parser.parseCaptions(captions).get(0);
System.err.println("------------------------");
System.err.println(scene.toReadableString());
System.err.println("------------------------");
System.err.print("> ");
}
scanner.close();
}
}
}