/*
* Copyright (c) 2015 University of Illinois Board of Trustees, All rights reserved.
* Developed at GSLIS/ the iSchool, by Dr. Jana Diesner, Amirhossein Aleyasen,
* Chieh-Li Chin, Shubhanshu Mishra, Kiumars Soltani, and Liang Tao.
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, see <http://www.gnu.org/licenses>.
*
*/
package context.core.tokenizer;
import context.core.util.JavaIO;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.util.CoreMap;
import java.io.File;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.openide.util.Exceptions;
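/**
 * Static helpers that run a Stanford CoreNLP pipeline (tokenize, ssplit, parse)
 * over raw text and convert the annotated sentences into keyed maps of
 * dependency or subject/predicate/object edges.
 *
 * @author Aale
 */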
public class SemanticAnnotation {

    /** Separator placed between the components of an edge key. */
    private static final String KEY_SEPARATOR = "/";

    /** Shared CoreNLP pipeline; built once, when this class is initialized. */
    private static final StanfordCoreNLP PIPELINE = buildPipeline();

    /**
     * Builds the CoreNLP pipeline with the {@code tokenize}, {@code ssplit}
     * and {@code parse} annotators.
     *
     * @return the configured pipeline
     */
    private static StanfordCoreNLP buildPipeline() {
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, parse");
        return new StanfordCoreNLP(props);
    }

    /**
     * Annotates {@code text} and, for every sentence, prints the sentence and
     * each structure returned by {@code SPOExtractor.extractSPOs} to standard
     * output.
     *
     * <p>NOTE(review): the returned map is created empty and nothing in this
     * method ever adds to it, so callers always receive an empty map. The
     * intended key/value layout is not visible here; confirm before relying
     * on the return value.
     *
     * @param text the raw text to annotate
     * @param docId document identifier forwarded to the SPO extractor
     * @return an insertion-ordered map that is currently always empty
     */
    public static Map<String, List<CustomToken>> tokenizeSPOStructure(String text, String docId) {
        Map<String, List<CustomToken>> sentSpoMap = new LinkedHashMap<>();
        int sentIndex = 0;
        for (CoreMap sentence : annotateSentences(text)) {
            System.out.println("sent-" + sentIndex + ": " + sentence);
            for (SPOStructure spo : SPOExtractor.extractSPOs(sentence, docId, sentIndex)) {
                System.out.println(spo);
            }
            System.out.println();
            sentIndex++;
        }
        return sentSpoMap;
    }

    /**
     * Annotates {@code text}, extracts the SPO structures of every sentence and
     * converts them into "SP" (subject to predicate) and "PO" (predicate to
     * object) edges. Progress messages, the resulting map and its size are
     * printed to standard output.
     *
     * @param text the raw text to annotate
     * @param docId document identifier stored on every edge and used in its key
     * @return insertion-ordered map from {@code word1/word2/docId/sentenceIndex}
     *         to the edge; a later edge with the same key replaces the earlier one
     */
    public static Map<String, CustomEdge> tokenizeSPO(String text, String docId) {
        System.out.println("starting tokenizeSPO...");
        Map<String, CustomEdge> customEdges = new LinkedHashMap<>();
        List<CoreMap> sentences = annotateSentences(text);
        System.out.println("core annotation done, start analyzing results...");
        int sentIndex = 0;
        for (CoreMap sentence : sentences) {
            List<SPOStructure> spoList = SPOExtractor.extractSPOs(sentence, docId, sentIndex);
            customEdges.putAll(generateEdges(spoList, docId, sentIndex));
            sentIndex++;
        }
        System.out.println(customEdges);
        System.out.println("customEdge#" + customEdges.size());
        return customEdges;
    }

    /**
     * Annotates {@code text} and converts every edge of each sentence's
     * collapsed, CC-processed dependency graph into a {@link CustomEdge} whose
     * type is the string form of the dependency relation.
     *
     * @param text the raw text to annotate
     * @param docId document identifier stored on every edge and used in its key
     * @return insertion-ordered map from {@code word1/word2/docId/sentenceIndex}
     *         to the edge; a later edge with the same key replaces the earlier one
     */
    public static Map<String, CustomEdge> tokenize(String text, String docId) {
        Map<String, CustomEdge> customEdges = new LinkedHashMap<>();
        int sentIndex = 0;
        for (CoreMap sentence : annotateSentences(text)) {
            SemanticGraph dependencies = sentence.get(
                    SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
            // The edge index restarts at 0 for every sentence.
            int index = 0;
            for (SemanticGraphEdge edge : dependencies.edgeListSorted()) {
                CustomEdge cedge = newEdge(docId, sentIndex, index,
                        removePOS(String.valueOf(edge.getSource())),
                        removePOS(String.valueOf(edge.getTarget())),
                        String.valueOf(edge.getRelation()));
                customEdges.put(edgeKey(cedge), cedge);
                index++;
            }
            sentIndex++;
        }
        return customEdges;
    }

    /**
     * Demo entry point: reads {@code data/deep-parsing/data.txt}, replaces its
     * newlines with spaces, runs {@link #tokenizeSPOStructure(String, String)}
     * on it and prints every returned entry as an "S-P-O" line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Assembled from path components so the file resolves on any platform,
        // not only where '\' is the separator.
        File input = new File(new File("data", "deep-parsing"), "data.txt");
        String text;
        try {
            text = JavaIO.readFile(input);
        } catch (IOException ex) {
            Exceptions.printStackTrace(ex);
            // Without the input there is nothing to analyze; continuing would
            // only dereference a null text.
            return;
        }
        System.out.println();
        text = text.replace('\n', ' ');
        final Map<String, List<CustomToken>> tokens = SemanticAnnotation.tokenizeSPOStructure(text, "1");
        for (Map.Entry<String, List<CustomToken>> entry : tokens.entrySet()) {
            System.out.print(entry.getKey() + "\nS-P-O: ");
            System.out.println(joinWords(entry.getValue()));
            System.out.println();
        }
    }

    /**
     * Alternative demo entry point: runs {@link #tokenize(String, String)} on a
     * fixed two-sentence sample and prints each key with its edge.
     *
     * @param args ignored
     */
    public static void main2(String[] args) {
        String text = "The cat eats a mouse. She goes to the university.";
        final Map<String, CustomEdge> tokens = SemanticAnnotation.tokenize(text, "1");
        for (Map.Entry<String, CustomEdge> entry : tokens.entrySet()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }
    }

    /**
     * Cuts {@code word} at its last {@code '-'}.
     *
     * <p>Presumably used to drop a trailing "-TAG" suffix from the string form
     * of a dependency-graph node; verify against the node string format of the
     * CoreNLP version in use.
     *
     * @param word the string to trim; must not be {@code null}
     * @return the part of {@code word} before its last {@code '-'}, or
     *         {@code word} itself when it contains no {@code '-'}
     */
    public static String removePOS(String word) {
        int lastDashIndex = word.lastIndexOf("-");
        return lastDashIndex == -1 ? word : word.substring(0, lastDashIndex);
    }

    /**
     * Turns the SPO structures of one sentence into edges: one "SP" edge per
     * subject (subject to predicate) and one "PO" edge per object (predicate to
     * object). Edge indexes count up from 0 across the whole list.
     *
     * @param spoList the SPO structures of a single sentence
     * @param docId document identifier stored on every edge
     * @param sentIndex index of the sentence the structures came from
     * @return insertion-ordered map from edge key to edge
     */
    private static Map<String, CustomEdge> generateEdges(List<SPOStructure> spoList, String docId, int sentIndex) {
        Map<String, CustomEdge> customEdges = new LinkedHashMap<>();
        int index = 0;
        for (SPOStructure spo : spoList) {
            for (CustomToken subj : spo.getSubjects()) {
                CustomEdge cedge = newEdge(docId, sentIndex, index, subj.word, spo.predicate.word, "SP");
                customEdges.put(edgeKey(cedge), cedge);
                index++;
            }
            for (CustomToken obj : spo.getObjects()) {
                CustomEdge cedge = newEdge(docId, sentIndex, index, spo.predicate.word, obj.word, "PO");
                customEdges.put(edgeKey(cedge), cedge);
                index++;
            }
        }
        return customEdges;
    }

    /** Runs the shared pipeline over {@code text} and returns its sentence annotations. */
    private static List<CoreMap> annotateSentences(String text) {
        Annotation document = new Annotation(text);
        PIPELINE.annotate(document);
        return document.get(CoreAnnotations.SentencesAnnotation.class);
    }

    /** Creates an edge and fills in all six of its attributes. */
    private static CustomEdge newEdge(String docId, int sentIndex, int index,
            String word1, String word2, String type) {
        CustomEdge cedge = new CustomEdge();
        cedge.setDocId(docId);
        cedge.setSentenceIndex(sentIndex);
        cedge.setIndex(index);
        cedge.setWord1(word1);
        cedge.setWord2(word2);
        cedge.setType(type);
        return cedge;
    }

    /** Builds the map key {@code word1/word2/docId/sentenceIndex} for an edge. */
    private static String edgeKey(CustomEdge cedge) {
        return cedge.getWord1() + KEY_SEPARATOR + cedge.getWord2()
                + KEY_SEPARATOR + cedge.getDocId()
                + KEY_SEPARATOR + cedge.getSentenceIndex();
    }

    /**
     * Joins the words of {@code tokens} with " - ", writing "N/A" for a null
     * token or a token without a word. An empty list yields an empty string.
     */
    private static String joinWords(List<CustomToken> tokens) {
        StringBuilder joined = new StringBuilder();
        for (CustomToken token : tokens) {
            if (joined.length() > 0) {
                joined.append(" - ");
            }
            if (token == null || token.getWord() == null) {
                joined.append("N/A");
            } else {
                joined.append(token.getWord());
            }
        }
        return joined.toString();
    }
}