package context.core.task.entitynetwork;
import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.task.entitydetection.MultiWordEntities;
import context.core.task.syntaxbased.SyntacticNetwork;
import context.core.task.syntaxbased.SyntaxBasedTaskInstance;
import context.core.util.CorpusAggregator;
import context.core.util.ForAggregation;
import context.core.util.JavaIO;
import context.core.util.MyPair;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import gnu.trove.iterator.TObjectIntIterator;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.gephi.graph.api.DirectedGraph;
import org.gephi.graph.api.Edge;
import org.gephi.graph.api.GraphController;
import org.gephi.graph.api.GraphModel;
import org.gephi.graph.api.Node;
import org.gephi.io.exporter.api.ExportController;
import org.gephi.project.api.ProjectController;
import org.gephi.project.api.Workspace;
import org.openide.util.Lookup;
/**
*
* @author Aale
*/
public class EntityNetworkBody {
private AbstractSequenceClassifier<?> classifier3;
private AbstractSequenceClassifier<?> classifier4;
private AbstractSequenceClassifier<?> classifier7;
private StanfordCoreNLP pipeline;
private List<String[]> NetworkEdges;
private HashSet<String[]> NodeHashSet;
private EntityNetworkTaskInstance instance;
private CorpusData input;
private List<TabularData> tabularOutput;
private List<List<String[]>> toAggregate;
private List<List<List<String[]>>> entityTags;
private List<String> EntitiesToKeepTrack;
private int unitOfAnalysis;
private long timeout;
private int distance;
private List<String[]> EntitiesWithOffset;
/**
*
* @param instance
*/
public EntityNetworkBody(EntityNetworkTaskInstance instance) {
// TODO Auto-generated method stub
this.instance = instance;
init();
}
private void init(){
this.EntitiesToKeepTrack = new ArrayList<String>();
EntitiesToKeepTrack.add("PERSON");
EntitiesToKeepTrack.add("ORGANIZATION");
EntitiesToKeepTrack.add("MONEY");
EntitiesToKeepTrack.add("LOCATION");
String[] properNouns = new String[2];
properNouns[0] = "NNP";
properNouns[1] = "NNPS";
String[] commonNouns = new String[2];
commonNouns[0] = "NN";
commonNouns[1] = "NNS";
this.pipeline = instance.getPipeline();
this.classifier3 = instance.get3Classifier();
this.classifier4 = instance.get4Classifier();
this.classifier7 = instance.get7Classifier();
this.unitOfAnalysis = instance.getUnitOfAnalysis();
this.distance = 7;
this.unitOfAnalysis = 2;
this.timeout = 120000;
this.input = (CorpusData) instance.getInput();
this.EntitiesWithOffset = new ArrayList<String[]>();
this.tabularOutput = instance.getTabularOutput();
NodeHashSet = new HashSet<String[]>();
NetworkEdges = new ArrayList<String[]>();
entityTags = new ArrayList<List<List<String[]>>>();
}
/**
*
* @return
*/
public boolean genNetwork(){
List<FileData> files = input.getFiles();
toAggregate = new ArrayList<List<String[]>>();
try{
for (FileData ff:files){
File file = ff.getFile();
String text = JavaIO.readFile(file);
text = text.replaceAll("\\p{Cc}", " ");
text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
//text = text.replaceAll("[^\\x00-\\x7F]", "");
Annotation document = new Annotation(text);
pipeline.annotate(document);
List<List<String[]>> DocPOSTags = new ArrayList<List<String[]>>();
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
int placeInDoc = 0;
for (CoreMap sentence : sentences) {
List<String[]> sentPOStags = new ArrayList<String[]>();
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
int placeInSent = 0;
for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
// this is the text of the token
String word = token.get(TextAnnotation.class);
// this is the POS tag of the token
String pos = token.get(PartOfSpeechAnnotation.class);
String[] entity = {word, pos, Integer.toString(placeInSent), Integer.toString(placeInDoc)};
placeInSent++;
placeInDoc++;
if (!word.matches("^[a-zA-Z0-9]*$")) {
continue;
}
if(!EntitiesToKeepTrack.contains(pos)){
continue;
}
String[] hashNode = new String[2];
hashNode[0] = word;
hashNode[1] = pos;
NodeHashSet.add(hashNode);
sentPOStags.add(entity);
}
DocPOSTags.add(sentPOStags);
}
entityTags.add(DocPOSTags);
}
if(unitOfAnalysis ==2){
for (List<List<String[]>> DocEntityTags: entityTags){
String[] word;
List<String[]> docAggregate = new ArrayList<String[]>();
for (int overIndx = 0; overIndx < DocEntityTags.size(); overIndx ++){
List<String[]> SentEntityTags = DocEntityTags.get(overIndx);
for (int indx = 0; indx < SentEntityTags.size(); indx ++){
word = SentEntityTags.get(indx);
List<String[]> TempSentEntitytTags = SentEntityTags;
int tempIndex = indx + 1;
int tempOverIndex = overIndx;
if (tempIndex >= SentEntityTags.size()){
TempSentEntitytTags = null;
Boolean breakCondition = false;
while(TempSentEntitytTags == null || TempSentEntitytTags.size() == 0){
if (tempOverIndex + 1 < DocEntityTags.size()){
tempIndex = 0;
tempOverIndex ++;
TempSentEntitytTags = DocEntityTags.get(tempOverIndex);
}
else{
breakCondition = true;
break;
}
}
if (breakCondition){
break;
}
}
String[] tempWord = null;
try{
tempWord = TempSentEntitytTags.get(tempIndex);
} catch(Exception tempE){
tempE.printStackTrace();
}
while (Integer.parseInt(tempWord[3]) - Integer.parseInt(word[3])<distance){
String[] tempEntityEdge = new String[5];
tempEntityEdge[0] = word[0];
tempEntityEdge[1] = word[1];
tempEntityEdge[2] = tempWord[0];
tempEntityEdge[3] = tempWord[1];
tempEntityEdge[4] = "1";
docAggregate.add(tempEntityEdge);
tempIndex ++;
if (tempIndex >= TempSentEntitytTags.size()){
TempSentEntitytTags = null;
boolean breakCondition = false;
while(TempSentEntitytTags == null || TempSentEntitytTags.size() == 0){
if (tempOverIndex + 1 < DocEntityTags.size()){
tempIndex = 0;
tempOverIndex ++;
TempSentEntitytTags = DocEntityTags.get(tempOverIndex);
}
else{
breakCondition = true;
break;
}
}
if (breakCondition){
break;
}
}
try{
tempWord = TempSentEntitytTags.get(tempIndex);
}catch(Exception tempE){
tempE.printStackTrace();
}
}
}
}
toAggregate.add(docAggregate);
}
NetworkEdges = new CorpusAggregator().CorpusAggregate(toAggregate);
}
} catch(Exception e){
e.printStackTrace();
return false;
}
return true;
}
/**
*
* @return
*/
public String[][] getNetworkEdges(){
String[][] NetworkEdgesArray = new String[NetworkEdges.size()][5];
NetworkEdgesArray = NetworkEdges.toArray(NetworkEdgesArray);
return NetworkEdgesArray;
}
/**
*
* @return
*/
public String[][] getNetworkNodes(){
String[][] NetworkNodes = new String[NodeHashSet.size()][2];
NetworkNodes = NodeHashSet.toArray(NetworkNodes);
return NetworkNodes;
}
/**
*
* @param filename
* @return
*/
public boolean extractGephiOutput(String filename){
String[][] nodes_str = this.getNetworkNodes();
String[][] edges_str = this.getNetworkEdges();
File new_file = new File(filename);
if(new_file.exists())
new_file.delete();
//Init a project - and therefore a workspace
ProjectController pc = Lookup.getDefault().lookup(ProjectController.class);
pc.newProject();
Workspace workspace = pc.getCurrentWorkspace();
//Get a graph model - it exists because we have a workspace
GraphModel graphModel = Lookup.getDefault().lookup(GraphController.class).getModel();
final DirectedGraph directedGraph = graphModel.getDirectedGraph();
TObjectIntHashMap<String> nodes = new TObjectIntHashMap<String>();
Vector<String> node_index = new Vector<String>();
//Create the nodes
for(String[] node_str : nodes_str){
if (nodes.containsKey(node_str[0])){
int index = nodes.get(node_str[0]);
node_index.set(index, node_index.get(index) + "," + node_str[1]);
}
else
{
node_index.add(node_str[1]);
nodes.put(node_str[0], node_index.size() - 1);
}
}
for(TObjectIntIterator<String> node_it = nodes.iterator(); node_it.hasNext();){
node_it.advance();
Node n0 = graphModel.factory().newNode(node_it.key());
n0.getAttributes().setValue("label", node_it.key());
n0.getAttributes().setValue("Type", node_index.get(node_it.value()));
directedGraph.addNode(n0);
}
TObjectIntHashMap<MyPair<String, String> > edges = new TObjectIntHashMap<MyPair<String,String>>();
for(String[] edge_str: edges_str){
MyPair<String, String> edge = new MyPair(edge_str[0], edge_str[2]);
int value = Integer.parseInt(edge_str[4]);
edges.adjustOrPutValue(edge, value, value);
}
TObjectIntIterator<MyPair<String, String> > edge_it;
for(edge_it = edges.iterator(); edge_it.hasNext(); ){
edge_it.advance();
Node s1 = directedGraph.getNode(edge_it.key().getFirst());
Node s2 = directedGraph.getNode(edge_it.key().getSecond());
if (s1 == null || s2 == null){
continue;
}
int weight = edge_it.value();
Edge e0 = graphModel.factory().newEdge(s1, s2, weight, true);
directedGraph.addEdge(e0);
}
//Export full graph
ExportController ec = Lookup.getDefault().lookup(ExportController.class);
try {
ec.exportFile(new_file);
} catch (IOException ex) {
System.out.println(ex.getMessage());
return false;
}
return true;
}
/**
*
* @param instance
* @param output_address
* @return
*/
public static boolean runUnit(SyntaxBasedTaskInstance instance, String output_address){
SyntacticNetwork SN = new SyntacticNetwork(instance);
SN.genNetwork();
if (SN.extractGephiOutput(output_address))
return false;
return true;
}
/**
*
* @return
*/
public boolean detectEntities() {
List<FileData> files = input.getFiles();
List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
try {
for (FileData ff : files) {
File file = ff.getFile();
String text;
try {
text = JavaIO.readFile(file);
text = text.replaceAll("\\p{Cc}", " ");
text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"]", " ");
List<ForAggregation> longEntities3 = new ArrayList<ForAggregation>();
List<ForAggregation> longEntities4 = new ArrayList<ForAggregation>();
List<ForAggregation> longEntities7 = new ArrayList<ForAggregation>();
MultiWordEntities MWE3 = MultiWordEntityRecognition(classifier3, text);
MultiWordEntities MWE4 = MultiWordEntityRecognition(classifier4, text);
MultiWordEntities MWE7 = MultiWordEntityRecognition(classifier7, text);
longEntities3.addAll(MWE3.forAgg);
//longEntities4.addAll(MWE4.forAgg);
longEntities7.addAll(MWE7.forAgg);
HashMap<String, Integer[]> Entities = new HashMap<String, Integer[]>();
for (int entityIndex = 0; entityIndex < longEntities3.size(); entityIndex++) {
Integer[] offsetArray = {MWE3.startInd.get(entityIndex)};
Entities.put(longEntities3.get(entityIndex).toAggregate[0], offsetArray);
EntitiesWithOffset.add(longEntities3.get(entityIndex).toAggregate);
}
// The following code is to incorporate another NER library, but it was causing
// problems for large individual documents, so for now "longEntities4.addAll(MWE4.forAgg);"
//has been commented out. In the future,
// a check on file size may be done to switch between using the library or not.
for (int entityIndex = 0; entityIndex < longEntities4.size(); entityIndex++) {
if (Entities.containsKey(longEntities4.get(entityIndex).toAggregate[0]) && Arrays.asList(Entities.get(longEntities4.get(entityIndex).toAggregate[0])).contains(MWE4.startInd.get(entityIndex))) {
continue;
} else if (Entities.containsKey(longEntities4.get(entityIndex).toAggregate[0])) {
Integer[] numOfOcc = Entities.get(longEntities4.get(entityIndex).toAggregate[0]);
Integer[] offsetArray4 = new Integer[numOfOcc.length + 1];
offsetArray4 = Arrays.copyOf(Entities.get(longEntities4.get(entityIndex).toAggregate[0]), offsetArray4.length);
offsetArray4[offsetArray4.length - 1] = MWE4.startInd.get(entityIndex);
Entities.put(longEntities4.get(entityIndex).toAggregate[0], offsetArray4);
EntitiesWithOffset.add(longEntities4.get(entityIndex).toAggregate);
} else {
Integer[] offsetArray = {MWE4.startInd.get(entityIndex)};
Entities.put(longEntities4.get(entityIndex).toAggregate[0], offsetArray);
EntitiesWithOffset.add(longEntities4.get(entityIndex).toAggregate);
}
}
for (int entityIndex = 0; entityIndex < longEntities7.size(); entityIndex++) {
if (Entities.containsKey(longEntities7.get(entityIndex).toAggregate[0]) && Arrays.asList(Entities.get(longEntities7.get(entityIndex).toAggregate[0])).contains(MWE7.startInd.get(entityIndex))) {
continue;
} else if (Entities.containsKey(longEntities7.get(entityIndex).toAggregate[0])) {
Integer[] numOfOcc = Entities.get(longEntities7.get(entityIndex).toAggregate[0]);
Integer[] offsetArray7 = new Integer[numOfOcc.length + 1];
offsetArray7 = Arrays.copyOf(Entities.get(longEntities7.get(entityIndex).toAggregate[0]), offsetArray7.length);
offsetArray7[offsetArray7.length - 1] = MWE7.startInd.get(entityIndex);
Entities.put(longEntities7.get(entityIndex).toAggregate[0], offsetArray7);
EntitiesWithOffset.add(longEntities7.get(entityIndex).toAggregate);
} else {
Integer[] offsetArray = {MWE7.startInd.get(entityIndex)};
Entities.put(longEntities7.get(entityIndex).toAggregate[0], offsetArray);
EntitiesWithOffset.add(longEntities7.get(entityIndex).toAggregate);
}
}
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
} catch (Exception e) {
e.printStackTrace();
return false;
}
return true;
}
private MultiWordEntities MultiWordEntityRecognition(AbstractSequenceClassifier<?> classifier, String inText) {
List<ForAggregation> NamedEntities = new ArrayList<ForAggregation>();
String htmlString = classifier.classifyToString(inText, "inlineXML", true);
Pattern tags = Pattern.compile("<.+?>.+?</.+?>");
Pattern tempTags = null;
Matcher matcher = tags.matcher(htmlString);
Matcher tempMatcher = null;
List<Integer> startIndicies = new ArrayList<Integer>();
HashMap<String, Integer> hashedNumOcc = new HashMap<String, Integer>();
while (matcher.find()) {
String name = (matcher.group().replaceAll("<.+?>", ""));
/*
if (name.split("\\s+").length<2){
continue;
}
*/
String[] NamedEntity_array = {name, matcher.group().replaceAll("<", "").replaceAll(">.+", "")};
if (hashedNumOcc.containsKey(name)) {
hashedNumOcc.put(name, hashedNumOcc.get(name) + 1);
} else {
hashedNumOcc.put(name, 1);
}
ForAggregation NamedEntity = new ForAggregation(NamedEntity_array);
startIndicies.add(findNthIndexOf(inText, name, hashedNumOcc.get(name)));
if (null != NamedEntity) {
NamedEntities.add(NamedEntity);
}
}
MultiWordEntities toReturn = new MultiWordEntities(NamedEntities, startIndicies);
return toReturn;
}
private int findNthIndexOf(String str, String needle, int occurence)
throws IndexOutOfBoundsException {
int index = -1;
Pattern p = Pattern.compile(needle, Pattern.MULTILINE);
Matcher m = p.matcher(str);
while (m.find()) {
if (--occurence == 0) {
index = m.start();
break;
}
}
if (index < 0) {
throw new IndexOutOfBoundsException();
}
return index;
}
}