package context.core.task.entitynetwork;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.task.entitydetection.MultiWordEntities;
import context.core.task.syntaxbased.SyntacticNetwork;
import context.core.task.syntaxbased.SyntaxBasedTaskInstance;
import context.core.util.CorpusAggregator;
import context.core.util.ForAggregation;
import context.core.util.JavaIO;
import context.core.util.MyPair;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import gnu.trove.iterator.TObjectIntIterator;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.gephi.graph.api.DirectedGraph;
import org.gephi.graph.api.Edge;
import org.gephi.graph.api.GraphController;
import org.gephi.graph.api.GraphModel;
import org.gephi.graph.api.Node;
import org.gephi.io.exporter.api.ExportController;
import org.gephi.project.api.ProjectController;
import org.gephi.project.api.Workspace;
import org.openide.util.Lookup;

/**
 * Builds a co-occurrence network over tokens detected in a corpus and exports
 * it as a Gephi graph. Tokens are tagged with Stanford CoreNLP and Stanford
 * NER; edges link tokens that appear within a fixed token distance of each
 * other.
 *
 * @author Aale
 */
public class EntityNetworkBody {

    private AbstractSequenceClassifier<?> classifier3; // Stanford 3-class NER model
    private AbstractSequenceClassifier<?> classifier4; // Stanford 4-class NER model
    private AbstractSequenceClassifier<?> classifier7; // Stanford 7-class NER model
    private StanfordCoreNLP pipeline;
    private List<String[]> NetworkEdges;
    private HashSet<String[]> NodeHashSet;
    private EntityNetworkTaskInstance instance;
    private CorpusData input;
    private List<TabularData> tabularOutput;
    private List<List<String[]>> toAggregate;
    private List<List<List<String[]>>> entityTags;
    private List<String> EntitiesToKeepTrack;
    private int unitOfAnalysis;
    private long timeout;
    private int distance; // maximum token distance for a co-occurrence edge
    private List<String[]> EntitiesWithOffset;

    /**
     * @param instance the task instance supplying the pipeline, classifiers,
     *                 and input corpus
     */
    public EntityNetworkBody(EntityNetworkTaskInstance instance) {
        this.instance = instance;
        init();
    }

    private void init() {
        this.EntitiesToKeepTrack = new ArrayList<String>();
        EntitiesToKeepTrack.add("PERSON");
        EntitiesToKeepTrack.add("ORGANIZATION");
        EntitiesToKeepTrack.add("MONEY");
        EntitiesToKeepTrack.add("LOCATION");

        // NOTE: these POS-tag arrays are currently unused; see the filter in
        // genNetwork(), which may have been meant to use them.
        String[] properNouns = {"NNP", "NNPS"};
        String[] commonNouns = {"NN", "NNS"};

        this.pipeline = instance.getPipeline();
        this.classifier3 = instance.get3Classifier();
        this.classifier4 = instance.get4Classifier();
        this.classifier7 = instance.get7Classifier();
        this.unitOfAnalysis = instance.getUnitOfAnalysis();
        this.distance = 7;
        this.unitOfAnalysis = 2; // hard-coded override of the instance setting
        this.timeout = 120000; // 120 s in milliseconds (currently unused)
        this.input = (CorpusData) instance.getInput();
        this.EntitiesWithOffset = new ArrayList<String[]>();
        this.tabularOutput = instance.getTabularOutput();
        NodeHashSet = new HashSet<String[]>();
        NetworkEdges = new ArrayList<String[]>();
        entityTags = new ArrayList<List<List<String[]>>>();
    }
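    /*
     * Usage sketch (hypothetical; assumes a fully configured
     * EntityNetworkTaskInstance named "taskInstance" whose CoreNLP pipeline,
     * NER classifiers, and input corpus have been set elsewhere, and an
     * output filename chosen for illustration):
     *
     *   EntityNetworkBody body = new EntityNetworkBody(taskInstance);
     *   if (body.genNetwork()) {
     *       body.extractGephiOutput("entity-network.gexf");
     *   }
     */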
    /**
     * Tags every file in the corpus, collects the kept tokens, and generates
     * co-occurrence edges between tokens that fall within {@code distance}
     * tokens of each other, crossing sentence boundaries where necessary.
     *
     * @return true on success, false if an exception was thrown
     */
    public boolean genNetwork() {
        List<FileData> files = input.getFiles();
        toAggregate = new ArrayList<List<String[]>>();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                String text = JavaIO.readFile(file);
                // Strip control characters, then anything outside a basic
                // alphanumeric/punctuation whitelist.
                text = text.replaceAll("\\p{Cc}", " ");
                text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
                //text = text.replaceAll("[^\\x00-\\x7F]", "");
                Annotation document = new Annotation(text);
                pipeline.annotate(document);
                List<List<String[]>> DocPOSTags = new ArrayList<List<String[]>>();
                List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                int placeInDoc = 0;
                for (CoreMap sentence : sentences) {
                    List<String[]> sentPOStags = new ArrayList<String[]>();
                    // Traverse the tokens of the current sentence; a CoreLabel
                    // is a CoreMap with additional token-specific methods.
                    int placeInSent = 0;
                    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                        String word = token.get(TextAnnotation.class);
                        String pos = token.get(PartOfSpeechAnnotation.class);
                        String[] entity = {word, pos, Integer.toString(placeInSent),
                            Integer.toString(placeInDoc)};
                        placeInSent++;
                        placeInDoc++;
                        // Skip tokens containing non-alphanumeric characters.
                        if (!word.matches("^[a-zA-Z0-9]*$")) {
                            continue;
                        }
                        // NOTE: EntitiesToKeepTrack holds NER labels (PERSON,
                        // ORGANIZATION, ...) while pos is a Penn Treebank POS
                        // tag, so this filter admits nothing as written; the
                        // unused properNouns/commonNouns arrays in init()
                        // suggest a POS-based filter may have been intended.
                        if (!EntitiesToKeepTrack.contains(pos)) {
                            continue;
                        }
                        String[] hashNode = {word, pos};
                        // NOTE: String[] uses identity equality, so this set
                        // does not actually deduplicate; duplicate names are
                        // merged later in extractGephiOutput().
                        NodeHashSet.add(hashNode);
                        sentPOStags.add(entity);
                    }
                    DocPOSTags.add(sentPOStags);
                }
                entityTags.add(DocPOSTags);
            }
            if (unitOfAnalysis == 2) {
                for (List<List<String[]>> DocEntityTags : entityTags) {
                    String[] word;
                    List<String[]> docAggregate = new ArrayList<String[]>();
                    for (int overIndx = 0; overIndx < DocEntityTags.size(); overIndx++) {
                        List<String[]> SentEntityTags = DocEntityTags.get(overIndx);
                        for (int indx = 0; indx < SentEntityTags.size(); indx++) {
                            word = SentEntityTags.get(indx);
                            List<String[]> tempSentEntityTags = SentEntityTags;
                            int tempIndex = indx + 1;
                            int tempOverIndex = overIndx;
                            if (tempIndex >= SentEntityTags.size()) {
                                // The window starts past the end of this
                                // sentence; advance to the next non-empty one.
                                tempSentEntityTags = null;
                                boolean breakCondition = false;
                                while (tempSentEntityTags == null || tempSentEntityTags.isEmpty()) {
                                    if (tempOverIndex + 1 < DocEntityTags.size()) {
                                        tempIndex = 0;
                                        tempOverIndex++;
                                        tempSentEntityTags = DocEntityTags.get(tempOverIndex);
                                    } else {
                                        breakCondition = true;
                                        break;
                                    }
                                }
                                if (breakCondition) {
                                    break;
                                }
                            }
                            String[] tempWord = null;
                            try {
                                tempWord = tempSentEntityTags.get(tempIndex);
                            } catch (Exception tempE) {
                                tempE.printStackTrace();
                            }
                            // Emit an edge for every following token within
                            // `distance` tokens of the current one.
                            while (Integer.parseInt(tempWord[3]) - Integer.parseInt(word[3]) < distance) {
                                String[] tempEntityEdge = {word[0], word[1],
                                    tempWord[0], tempWord[1], "1"};
                                docAggregate.add(tempEntityEdge);
                                tempIndex++;
                                if (tempIndex >= tempSentEntityTags.size()) {
                                    tempSentEntityTags = null;
                                    boolean breakCondition = false;
                                    while (tempSentEntityTags == null || tempSentEntityTags.isEmpty()) {
                                        if (tempOverIndex + 1 < DocEntityTags.size()) {
                                            tempIndex = 0;
                                            tempOverIndex++;
                                            tempSentEntityTags = DocEntityTags.get(tempOverIndex);
                                        } else {
                                            breakCondition = true;
                                            break;
                                        }
                                    }
                                    if (breakCondition) {
                                        break;
                                    }
                                }
                                try {
                                    tempWord = tempSentEntityTags.get(tempIndex);
                                } catch (Exception tempE) {
                                    tempE.printStackTrace();
                                }
                            }
                        }
                    }
                    toAggregate.add(docAggregate);
                }
                // Aggregate the per-document edge lists across the corpus.
                NetworkEdges = new CorpusAggregator().CorpusAggregate(toAggregate);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }
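    /*
     * Illustrative walk-through of the window logic above (the tokens are
     * hypothetical): with distance = 7 and two kept tokens
     *   {"Smith", "PERSON", 0, 3} and {"Acme", "ORGANIZATION", 2, 6}
     * (word, tag, place-in-sentence, place-in-document), the inner loop emits
     * the row {"Smith", "PERSON", "Acme", "ORGANIZATION", "1"} because
     * 6 - 3 < 7. CorpusAggregate (from context.core.util) then combines
     * identical rows across documents, presumably by summing the weight
     * column.
     */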
    /**
     * @return the aggregated edge list as rows of
     *         {sourceWord, sourceTag, targetWord, targetTag, weight}
     */
    public String[][] getNetworkEdges() {
        String[][] NetworkEdgesArray = new String[NetworkEdges.size()][5];
        NetworkEdgesArray = NetworkEdges.toArray(NetworkEdgesArray);
        return NetworkEdgesArray;
    }

    /**
     * @return the collected nodes as rows of {word, tag}
     */
    public String[][] getNetworkNodes() {
        String[][] NetworkNodes = new String[NodeHashSet.size()][2];
        NetworkNodes = NodeHashSet.toArray(NetworkNodes);
        return NetworkNodes;
    }

    /**
     * Builds a directed Gephi graph from the collected nodes and edges and
     * exports it to the given file.
     *
     * @param filename path of the output graph file
     * @return true on success, false if the export failed
     */
    public boolean extractGephiOutput(String filename) {
        String[][] nodes_str = this.getNetworkNodes();
        String[][] edges_str = this.getNetworkEdges();
        File new_file = new File(filename);
        if (new_file.exists()) {
            new_file.delete();
        }
        // Init a project - and therefore a workspace.
        ProjectController pc = Lookup.getDefault().lookup(ProjectController.class);
        pc.newProject();
        Workspace workspace = pc.getCurrentWorkspace();
        // Get a graph model - it exists because we have a workspace.
        GraphModel graphModel = Lookup.getDefault().lookup(GraphController.class).getModel();
        final DirectedGraph directedGraph = graphModel.getDirectedGraph();
        TObjectIntHashMap<String> nodes = new TObjectIntHashMap<String>();
        Vector<String> node_index = new Vector<String>();
        // Create the nodes, merging the type labels of duplicate names.
        for (String[] node_str : nodes_str) {
            if (nodes.containsKey(node_str[0])) {
                int index = nodes.get(node_str[0]);
                node_index.set(index, node_index.get(index) + "," + node_str[1]);
            } else {
                node_index.add(node_str[1]);
                nodes.put(node_str[0], node_index.size() - 1);
            }
        }
        for (TObjectIntIterator<String> node_it = nodes.iterator(); node_it.hasNext();) {
            node_it.advance();
            Node n0 = graphModel.factory().newNode(node_it.key());
            n0.getAttributes().setValue("label", node_it.key());
            n0.getAttributes().setValue("Type", node_index.get(node_it.value()));
            directedGraph.addNode(n0);
        }
        // Accumulate edge weights per (source, target) pair.
        TObjectIntHashMap<MyPair<String, String>> edges = new TObjectIntHashMap<MyPair<String, String>>();
        for (String[] edge_str : edges_str) {
            MyPair<String, String> edge = new MyPair<String, String>(edge_str[0], edge_str[2]);
            int value = Integer.parseInt(edge_str[4]);
            edges.adjustOrPutValue(edge, value, value);
        }
        TObjectIntIterator<MyPair<String, String>> edge_it;
        for (edge_it = edges.iterator(); edge_it.hasNext();) {
            edge_it.advance();
            Node s1 = directedGraph.getNode(edge_it.key().getFirst());
            Node s2 = directedGraph.getNode(edge_it.key().getSecond());
            if (s1 == null || s2 == null) {
                continue;
            }
            int weight = edge_it.value();
            Edge e0 = graphModel.factory().newEdge(s1, s2, weight, true);
            directedGraph.addEdge(e0);
        }
        // Export the full graph; the exporter picks the format from the
        // file extension.
        ExportController ec = Lookup.getDefault().lookup(ExportController.class);
        try {
            ec.exportFile(new_file);
        } catch (IOException ex) {
            System.out.println(ex.getMessage());
            return false;
        }
        return true;
    }
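    /*
     * Sketch of the Trove accumulation used in extractGephiOutput (the
     * numbers are illustrative): adjustOrPutValue(key, adjust, put) stores
     * `put` when the key is absent and adds `adjust` when it is present, so
     *   edges.adjustOrPutValue(pair, 1, 1);  // absent  -> stored as 1
     *   edges.adjustOrPutValue(pair, 1, 1);  // present -> adjusted to 2
     * collapses parallel edge rows between the same word pair into a single
     * weighted Gephi edge. This relies on MyPair implementing value-based
     * equals()/hashCode().
     */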
    /**
     * Runs a syntactic-network unit: builds the network and writes the Gephi
     * output to the given address.
     *
     * @param instance the syntax-based task instance
     * @param output_address path of the output graph file
     * @return true on success
     */
    public static boolean runUnit(SyntaxBasedTaskInstance instance, String output_address) {
        SyntacticNetwork SN = new SyntacticNetwork(instance);
        SN.genNetwork();
        // FIX: the original inverted this result (returning false on a
        // successful export); pass the export result through directly.
        return SN.extractGephiOutput(output_address);
    }

    /**
     * Runs the 3-, 4-, and 7-class NER classifiers over every file in the
     * corpus and records each entity with its character offsets, skipping
     * (name, offset) pairs that an earlier classifier already produced.
     *
     * @return true on success, false if an exception was thrown
     */
    public boolean detectEntities() {
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                String text;
                try {
                    text = JavaIO.readFile(file);
                    text = text.replaceAll("\\p{Cc}", " ");
                    text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"]", " ");
                    List<ForAggregation> longEntities3 = new ArrayList<ForAggregation>();
                    List<ForAggregation> longEntities4 = new ArrayList<ForAggregation>();
                    List<ForAggregation> longEntities7 = new ArrayList<ForAggregation>();
                    MultiWordEntities MWE3 = MultiWordEntityRecognition(classifier3, text);
                    MultiWordEntities MWE4 = MultiWordEntityRecognition(classifier4, text);
                    MultiWordEntities MWE7 = MultiWordEntityRecognition(classifier7, text);
                    longEntities3.addAll(MWE3.forAgg);
                    // The next line was meant to incorporate the 4-class
                    // model's results as well, but it was causing problems on
                    // large individual documents, so it is commented out for
                    // now. In the future, a check on file size may decide
                    // whether to enable it.
                    //longEntities4.addAll(MWE4.forAgg);
                    longEntities7.addAll(MWE7.forAgg);
                    HashMap<String, Integer[]> Entities = new HashMap<String, Integer[]>();
                    for (int entityIndex = 0; entityIndex < longEntities3.size(); entityIndex++) {
                        Integer[] offsetArray = {MWE3.startInd.get(entityIndex)};
                        Entities.put(longEntities3.get(entityIndex).toAggregate[0], offsetArray);
                        EntitiesWithOffset.add(longEntities3.get(entityIndex).toAggregate);
                    }
                    for (int entityIndex = 0; entityIndex < longEntities4.size(); entityIndex++) {
                        if (Entities.containsKey(longEntities4.get(entityIndex).toAggregate[0])
                                && Arrays.asList(Entities.get(longEntities4.get(entityIndex).toAggregate[0]))
                                        .contains(MWE4.startInd.get(entityIndex))) {
                            // Same entity at the same offset: already recorded.
                            continue;
                        } else if (Entities.containsKey(longEntities4.get(entityIndex).toAggregate[0])) {
                            // Known entity, new offset: extend its offset array.
                            Integer[] numOfOcc = Entities.get(longEntities4.get(entityIndex).toAggregate[0]);
                            Integer[] offsetArray4 = Arrays.copyOf(numOfOcc, numOfOcc.length + 1);
                            offsetArray4[offsetArray4.length - 1] = MWE4.startInd.get(entityIndex);
                            Entities.put(longEntities4.get(entityIndex).toAggregate[0], offsetArray4);
                            EntitiesWithOffset.add(longEntities4.get(entityIndex).toAggregate);
                        } else {
                            // New entity: start a fresh offset array.
                            Integer[] offsetArray = {MWE4.startInd.get(entityIndex)};
                            Entities.put(longEntities4.get(entityIndex).toAggregate[0], offsetArray);
                            EntitiesWithOffset.add(longEntities4.get(entityIndex).toAggregate);
                        }
                    }
                    for (int entityIndex = 0; entityIndex < longEntities7.size(); entityIndex++) {
                        if (Entities.containsKey(longEntities7.get(entityIndex).toAggregate[0])
                                && Arrays.asList(Entities.get(longEntities7.get(entityIndex).toAggregate[0]))
                                        .contains(MWE7.startInd.get(entityIndex))) {
                            continue;
                        } else if (Entities.containsKey(longEntities7.get(entityIndex).toAggregate[0])) {
                            Integer[] numOfOcc = Entities.get(longEntities7.get(entityIndex).toAggregate[0]);
                            Integer[] offsetArray7 = Arrays.copyOf(numOfOcc, numOfOcc.length + 1);
                            offsetArray7[offsetArray7.length - 1] = MWE7.startInd.get(entityIndex);
                            Entities.put(longEntities7.get(entityIndex).toAggregate[0], offsetArray7);
                            EntitiesWithOffset.add(longEntities7.get(entityIndex).toAggregate);
                        } else {
                            Integer[] offsetArray = {MWE7.startInd.get(entityIndex)};
                            Entities.put(longEntities7.get(entityIndex).toAggregate[0], offsetArray);
                            EntitiesWithOffset.add(longEntities7.get(entityIndex).toAggregate);
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                    return false;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }
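    /*
     * For reference: classifyToString(text, "inlineXML", true) returns the
     * input with each recognized entity wrapped in tags named after its
     * label, e.g. (illustrative output):
     *   "<PERSON>John Smith</PERSON> works at <ORGANIZATION>Acme</ORGANIZATION> ."
     * The method below extracts each tagged span, strips the markup, and maps
     * the n-th sighting of a name back to its character offset in the
     * original text.
     */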
    private MultiWordEntities MultiWordEntityRecognition(AbstractSequenceClassifier<?> classifier, String inText) {
        List<ForAggregation> NamedEntities = new ArrayList<ForAggregation>();
        String htmlString = classifier.classifyToString(inText, "inlineXML", true);
        Pattern tags = Pattern.compile("<.+?>.+?</.+?>");
        Matcher matcher = tags.matcher(htmlString);
        List<Integer> startIndices = new ArrayList<Integer>();
        // Tracks how many times each entity string has been seen so far, so
        // the n-th sighting maps to the n-th occurrence in the raw text.
        HashMap<String, Integer> hashedNumOcc = new HashMap<String, Integer>();
        while (matcher.find()) {
            // The entity text, with the surrounding XML tags stripped.
            String name = matcher.group().replaceAll("<.+?>", "");
            /* Previously filtered out single-word entities:
            if (name.split("\\s+").length < 2) {
                continue;
            }
            */
            // {entity text, entity label (the tag name)}
            String[] NamedEntity_array = {name, matcher.group().replaceAll("<", "").replaceAll(">.+", "")};
            if (hashedNumOcc.containsKey(name)) {
                hashedNumOcc.put(name, hashedNumOcc.get(name) + 1);
            } else {
                hashedNumOcc.put(name, 1);
            }
            ForAggregation NamedEntity = new ForAggregation(NamedEntity_array);
            startIndices.add(findNthIndexOf(inText, name, hashedNumOcc.get(name)));
            NamedEntities.add(NamedEntity);
        }
        return new MultiWordEntities(NamedEntities, startIndices);
    }

    /**
     * Returns the character offset of the n-th occurrence of {@code needle}
     * in {@code str}; occurrences are counted from 1.
     *
     * @throws IndexOutOfBoundsException if there are fewer than
     *         {@code occurrence} matches
     */
    private int findNthIndexOf(String str, String needle, int occurrence) throws IndexOutOfBoundsException {
        int index = -1;
        // FIX: quote the needle so entity text containing regex
        // metacharacters is matched literally.
        Pattern p = Pattern.compile(Pattern.quote(needle), Pattern.MULTILINE);
        Matcher m = p.matcher(str);
        while (m.find()) {
            if (--occurrence == 0) {
                index = m.start();
                break;
            }
        }
        if (index < 0) {
            throw new IndexOutOfBoundsException();
        }
        return index;
    }
}
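/*
 * findNthIndexOf example (illustrative): findNthIndexOf("a b a", "a", 2)
 * returns 4. Occurrences are counted from 1, and a missing n-th match raises
 * IndexOutOfBoundsException rather than returning -1.
 */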