package context.core.task.syntaxbased;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;

import org.gephi.graph.api.DirectedGraph;
import org.gephi.graph.api.Edge;
import org.gephi.graph.api.GraphController;
import org.gephi.graph.api.GraphModel;
import org.gephi.graph.api.Node;
import org.gephi.io.exporter.api.ExportController;
import org.gephi.project.api.ProjectController;
import org.gephi.project.api.Workspace;
import org.openide.util.Lookup;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.task.pos.POSTagger;
import context.core.util.CorpusAggregator;
import context.core.util.JavaIO;
import context.core.util.MyPair;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.CoreMap;

import gnu.trove.iterator.TObjectIntIterator;
import gnu.trove.map.hash.TObjectIntHashMap;

/**
 * Builds word networks from a corpus, either from part-of-speech proximity
 * ({@link #genPOSNetwork()}) or from syntactic dependencies
 * ({@link #genNetwork()}), and exports the result as a Gephi graph.
 *
 * @author Aale
 */
public class SyntacticNetwork {

    private StanfordCoreNLP pipeline;
    private StanfordCoreNLP POSpipeline;
    private List<String[]> NetworkEdges;
    private HashSet<String[]> NodeHashSet;
    private SyntaxBasedTaskInstance instance;
    private CorpusData input;
    private List<TabularData> tabularOutput;
    private List<List<String[]>> toAggregate;
    private List<List<List<String[]>>> POStags;
    private List<String> POStoKeepTrack;
    private HashMap<String, List<String>> graphComponents;
    private HashMap<List<String>, List<List<String>>> POSedgeMap;
    private List<String> dependencyEdges;
    private int unitOfAnalysis;
    private long timeout;
    private int distance;

    /**
     * @param instance the task instance supplying the corpus, pipelines and parameters
     */
    public SyntacticNetwork(SyntaxBasedTaskInstance instance) {
        this.instance = instance;
        init();
    }

    private void init() {
        POSedgeMap = instance.getEdgeMap();
        this.dependencyEdges = instance.getDependencyEdges();
        this.unitOfAnalysis = instance.getUnitOfAnalysis();
        this.timeout = instance.getTimeout();
        this.distance = instance.getDistance();
        this.input = (CorpusData) instance.getInput();
        this.pipeline = instance.getPipeline();
        this.POSpipeline = instance.getPipelinePOS();
        this.tabularOutput = instance.getTabularOutput();

        // Note: String[] uses identity-based equals/hashCode, so this set does
        // not actually deduplicate (word, POS) pairs; duplicates are merged
        // later in extractGephiOutput().
        NodeHashSet = new HashSet<String[]>();
        NetworkEdges = new ArrayList<String[]>();
        POStags = new ArrayList<List<List<String[]>>>();
        graphComponents = new HashMap<String, List<String>>();

        // Penn Treebank tag groups.
        List<String> properNouns = new ArrayList<String>();
        properNouns.add("NNP");
        properNouns.add("NNPS");
        List<String> commonNouns = new ArrayList<String>();
        commonNouns.add("NN");
        commonNouns.add("NNS");
        List<String> verbs = new ArrayList<String>();
        verbs.add("VB");
        verbs.add("VBD");
        verbs.add("VBZ");
        verbs.add("VBG");
        verbs.add("VBN");
        verbs.add("VBP");
        List<String> conjunctions = new ArrayList<String>();
        conjunctions.add("CC");
        List<String> numbers = new ArrayList<String>();
        numbers.add("CD");
        List<String> foreign = new ArrayList<String>();
        foreign.add("FW");
        List<String> modals = new ArrayList<String>();
        modals.add("MD");
        List<String> pronouns = new ArrayList<String>();
        pronouns.add("PRP");
        pronouns.add("PRP$");
        pronouns.add("WP");
        pronouns.add("WP$");
        List<String> adjectives = new ArrayList<String>();
        adjectives.add("JJ");
        adjectives.add("JJR");
        adjectives.add("JJS");
        List<String> symbols = new ArrayList<String>();
        symbols.add("SYM");
        List<String> interjections = new ArrayList<String>();
        interjections.add("UH");
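        // graphComponents maps each individual tag to the group it belongs
        // to, e.g. "NNS" -> [NN, NNS] and "NNPS" -> [NNP, NNPS]. POSedgeMap
        // is keyed on these groups, so edge rules can be stated per word
        // class rather than per tag.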
        graphComponents.put("NN", commonNouns);
        graphComponents.put("NNS", commonNouns);
        graphComponents.put("NNP", properNouns);
        graphComponents.put("NNPS", properNouns);
        graphComponents.put("CC", conjunctions);
        graphComponents.put("CD", numbers);
        graphComponents.put("FW", foreign);
        graphComponents.put("MD", modals);
        graphComponents.put("PRP", pronouns);
        graphComponents.put("PRP$", pronouns);
        graphComponents.put("WP", pronouns);
        graphComponents.put("WP$", pronouns);
        graphComponents.put("VB", verbs);
        graphComponents.put("VBD", verbs);
        graphComponents.put("VBG", verbs);
        graphComponents.put("VBN", verbs);
        graphComponents.put("VBZ", verbs);
        graphComponents.put("VBP", verbs);
        graphComponents.put("JJ", adjectives);
        graphComponents.put("JJR", adjectives);
        graphComponents.put("JJS", adjectives);
        graphComponents.put("SYM", symbols);
        graphComponents.put("UH", interjections);

        // Fixed parameters for the demo: these deliberately override the
        // values read from the task instance above.
        this.distance = 7;
        this.unitOfAnalysis = 2;
        this.timeout = 120000;
        this.POStoKeepTrack = new ArrayList<String>();
        POStoKeepTrack.add("NN");
        POStoKeepTrack.add("NNS");
        POStoKeepTrack.add("NNP");
        POStoKeepTrack.add("NNPS");
        POSedgeMap = new HashMap<List<String>, List<List<String>>>();
        List<List<String>> commonNounEdges = new ArrayList<List<String>>();
        commonNounEdges.add(properNouns);
        List<List<String>> properNounEdges = new ArrayList<List<String>>();
        properNounEdges.add(commonNouns);
        POSedgeMap.put(commonNouns, commonNounEdges);
        POSedgeMap.put(properNouns, properNounEdges);
        dependencyEdges = new ArrayList<String>();
        dependencyEdges.add("nsubj");
        dependencyEdges.add("dobj");
        // End fixed demo parameters.
    }
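    /**
     * Builds a POS proximity network. For every tracked token, an edge is
     * added to each later tracked token that lies fewer than {@code distance}
     * tokens away and whose POS-group pairing is allowed by
     * {@code POSedgeMap}. For example, with the demo parameters (distance 7,
     * common nouns linked with proper nouns), "The market John follows
     * crashed" yields the edge market -> John. A {@code unitOfAnalysis} of 1
     * keeps the window inside a sentence; 2 lets it run across sentence
     * boundaries within a document.
     *
     * @return true on success
     */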
    public boolean genPOSNetwork() {
        List<FileData> files = input.getFiles();
        toAggregate = new ArrayList<List<String[]>>();
        try {
            // Pass 1: tag every file and keep, per sentence, the tracked
            // tokens as {word, tag, indexInSentence, indexInDocument}.
            for (FileData ff : files) {
                File file = ff.getFile();
                String text = JavaIO.readFile(file);
                text = text.replaceAll("\\p{Cc}", " ");
                text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
                Annotation document = new Annotation(text);
                POSpipeline.annotate(document);
                List<List<String[]>> DocPOSTags = new ArrayList<List<String[]>>();
                List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                int placeInDoc = 0;
                for (CoreMap sentence : sentences) {
                    List<String[]> sentPOStags = new ArrayList<String[]>();
                    int placeInSent = 0;
                    // A CoreLabel is a CoreMap with additional token-specific methods.
                    final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                    final List<TaggedWord> taggedWords = POSTagger.tag(sent, "en");
                    for (TaggedWord token : taggedWords) {
                        String word = token.word();
                        String pos = token.tag();
                        String[] entity = {word, pos, Integer.toString(placeInSent),
                            Integer.toString(placeInDoc)};
                        placeInSent++;
                        placeInDoc++;
                        if (!word.matches("^[a-zA-Z0-9]*$")) {
                            continue;
                        }
                        if (!POStoKeepTrack.contains(pos)) {
                            continue;
                        }
                        String[] hashNode = {word, pos};
                        NodeHashSet.add(hashNode);
                        sentPOStags.add(entity);
                    }
                    DocPOSTags.add(sentPOStags);
                }
                POStags.add(DocPOSTags);
            }

            // Pass 2: slide the proximity window over the tracked tokens.
            if (unitOfAnalysis == 1) {
                // Sentence-level: the window never crosses a sentence boundary.
                for (List<List<String[]>> DocPOSTags : POStags) {
                    String[] word;
                    List<String[]> docAggregate = new ArrayList<String[]>();
                    for (int overIndx = 0; overIndx < DocPOSTags.size(); overIndx++) {
                        List<String[]> SentPOSTags = DocPOSTags.get(overIndx);
                        for (int indx = 0; indx < SentPOSTags.size(); indx++) {
                            word = SentPOSTags.get(indx);
                            int tempIndex = indx + 1;
                            if (tempIndex >= SentPOSTags.size()) {
                                break;
                            }
                            String[] tempWord = SentPOSTags.get(tempIndex);
                            while (Integer.parseInt(tempWord[3]) - Integer.parseInt(word[3]) < distance) {
                                if (POSedgeMap.get(graphComponents.get(tempWord[1]))
                                        .contains(graphComponents.get(word[1]))) {
                                    String[] tempPOSEdge = {word[0], word[1], tempWord[0], tempWord[1], "1"};
                                    docAggregate.add(tempPOSEdge);
                                }
                                tempIndex++;
                                if (tempIndex >= SentPOSTags.size()) {
                                    break;
                                }
                                tempWord = SentPOSTags.get(tempIndex);
                            }
                        }
                    }
                    toAggregate.add(docAggregate);
                }
                NetworkEdges = new CorpusAggregator().CorpusAggregate(toAggregate);
            }
            if (unitOfAnalysis == 2) {
                // Document-level: when a sentence is exhausted, the window
                // continues into the next non-empty sentence.
                for (List<List<String[]>> DocPOSTags : POStags) {
                    String[] word;
                    List<String[]> docAggregate = new ArrayList<String[]>();
                    for (int overIndx = 0; overIndx < DocPOSTags.size(); overIndx++) {
                        List<String[]> SentPOSTags = DocPOSTags.get(overIndx);
                        for (int indx = 0; indx < SentPOSTags.size(); indx++) {
                            word = SentPOSTags.get(indx);
                            List<String[]> TempSentPOSTags = SentPOSTags;
                            int tempIndex = indx + 1;
                            int tempOverIndex = overIndx;
                            if (tempIndex >= SentPOSTags.size()) {
                                TempSentPOSTags = null;
                                boolean breakCondition = false;
                                while (TempSentPOSTags == null || TempSentPOSTags.size() == 0) {
                                    if (tempOverIndex + 1 < DocPOSTags.size()) {
                                        tempIndex = 0;
                                        tempOverIndex++;
                                        TempSentPOSTags = DocPOSTags.get(tempOverIndex);
                                    } else {
                                        breakCondition = true;
                                        break;
                                    }
                                }
                                if (breakCondition) {
                                    break;
                                }
                            }
                            String[] tempWord = TempSentPOSTags.get(tempIndex);
                            while (Integer.parseInt(tempWord[3]) - Integer.parseInt(word[3]) < distance) {
                                if (POSedgeMap.get(graphComponents.get(tempWord[1]))
                                        .contains(graphComponents.get(word[1]))) {
                                    String[] tempPOSEdge = {word[0], word[1], tempWord[0], tempWord[1], "1"};
                                    docAggregate.add(tempPOSEdge);
                                }
                                tempIndex++;
                                if (tempIndex >= TempSentPOSTags.size()) {
                                    TempSentPOSTags = null;
                                    boolean breakCondition = false;
                                    while (TempSentPOSTags == null || TempSentPOSTags.size() == 0) {
                                        if (tempOverIndex + 1 < DocPOSTags.size()) {
                                            tempIndex = 0;
                                            tempOverIndex++;
                                            TempSentPOSTags = DocPOSTags.get(tempOverIndex);
                                        } else {
                                            breakCondition = true;
                                            break;
                                        }
                                    }
                                    if (breakCondition) {
                                        break;
                                    }
                                }
                                tempWord = TempSentPOSTags.get(tempIndex);
                            }
                        }
                    }
                    toAggregate.add(docAggregate);
                }
                NetworkEdges = new CorpusAggregator().CorpusAggregate(toAggregate);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }
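    /**
     * Builds the dependency network. Parsing can stall on pathological
     * sentences, so each file is handled on a worker thread that is
     * interrupted once {@code timeout} milliseconds elapse; genFileSyntax
     * checks Thread.interrupted() between sentences and bails out
     * cooperatively.
     *
     * @return true on success
     */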
    public boolean genNetwork() {
        toAggregate = new ArrayList<List<String[]>>();
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                final File finalFile = ff.getFile();
                System.out.println(finalFile.getName());
                Thread myThread = new Thread(new Runnable() {
                    @Override
                    public void run() {
                        genFileSyntax(finalFile);
                    }
                });
                myThread.start();
                long endTimeMillis = System.currentTimeMillis() + timeout;
                while (myThread.isAlive()) {
                    if (System.currentTimeMillis() > endTimeMillis) {
                        myThread.interrupt();
                        break;
                    }
                    try {
                        Thread.sleep(500);
                    } catch (InterruptedException t) {
                        // Ignore and keep polling.
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        NetworkEdges = new CorpusAggregator().CorpusAggregate(toAggregate);
        return true;
    }

    private boolean genFileSyntax(File file) {
        try {
            String text = JavaIO.readFile(file);
            text = text.replaceAll("\\p{Cc}", " ");
            text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
            // Create an empty Annotation with the given text and run all
            // annotators on it.
            Annotation document = new Annotation(text);
            pipeline.annotate(document);
            // A CoreMap is essentially a Map that uses class objects as keys
            // and has values with custom types.
            List<CoreMap> sentences = document.get(SentencesAnnotation.class);
            // The language pack and factory are sentence-independent, so
            // build them once per file.
            TreebankLanguagePack tlp = new PennTreebankLanguagePack();
            GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
            for (CoreMap sentence : sentences) {
                if (Thread.interrupted()) {
                    return false;
                }
                Tree tree = sentence.get(TreeAnnotation.class);
                if (tree == null) {
                    continue;
                }
                GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
                Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
                List<String[]> tempDependencies = new ArrayList<String[]>();
                for (TypedDependency td : tdl) {
                    // Dependencies print as "reln(governor-idx, dependent-idx)";
                    // the relation name and the two bare words are recovered
                    // by string splitting.
                    String typedDep = td.toString();
                    String reln = typedDep.trim().split("\\(")[0];
                    if (reln.equals("nsubj") && dependencyEdges.contains("nsubj")) {
                        // nsubj edges run dependent -> governor (subject -> verb).
                        String[] tempDependency = new String[5];
                        tempDependency[0] = typedDep.trim().split("\\(")[1].split(",")[1].trim().split("-")[0];
                        tempDependency[1] = "";
                        tempDependency[2] = typedDep.trim().split("\\(")[1].split(",")[0].trim().split("-")[0];
                        tempDependency[3] = "";
                        tempDependency[4] = Integer.toString(1);
                        tempDependencies.add(tempDependency);
                    }
                    if (reln.equals("nsubjpass") && dependencyEdges.contains("nsubjpass")) {
                        // nsubjpass, dobj and iobj edges run governor -> dependent.
                        String[] tempDependency = new String[5];
                        tempDependency[0] = typedDep.trim().split("\\(")[1].split(",")[0].trim().split("-")[0];
                        tempDependency[1] = "";
                        tempDependency[2] = typedDep.trim().split("\\(")[1].split(",")[1].trim().split("-")[0];
                        tempDependency[3] = "";
                        tempDependency[4] = Integer.toString(1);
                        tempDependencies.add(tempDependency);
                    }
                    if (reln.equals("dobj") && dependencyEdges.contains("dobj")) {
                        String[] tempDependency = new String[5];
                        tempDependency[0] = typedDep.trim().split("\\(")[1].split(",")[0].trim().split("-")[0];
                        tempDependency[1] = "";
                        tempDependency[2] = typedDep.trim().split("\\(")[1].split(",")[1].trim().split("-")[0];
                        tempDependency[3] = "";
                        tempDependency[4] = Integer.toString(1);
                        tempDependencies.add(tempDependency);
                    }
                    if (reln.equals("iobj") && dependencyEdges.contains("iobj")) {
                        String[] tempDependency = new String[5];
                        tempDependency[0] = typedDep.trim().split("\\(")[1].split(",")[0].trim().split("-")[0];
                        tempDependency[1] = "";
                        tempDependency[2] = typedDep.trim().split("\\(")[1].split(",")[1].trim().split("-")[0];
                        tempDependency[3] = "";
                        tempDependency[4] = Integer.toString(1);
                        tempDependencies.add(tempDependency);
                    }
                }
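                // Example: for "John gave Mary a book" the collapsed
                // dependencies include nsubj(gave-2, John-1),
                // iobj(gave-2, Mary-3) and dobj(gave-2, book-5); with the
                // demo edge list (nsubj, dobj) this yields John -> gave and
                // gave -> book. The loop below fills in the POS slots of each
                // edge by matching endpoint words against the tagged
                // sentence, so matching is purely by surface form.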
                final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                final List<TaggedWord> taggedWords = POSTagger.tag(sent, "en");
                for (TaggedWord token : taggedWords) {
                    String word = token.word();
                    String pos = token.tag();
                    for (String[] tempDep : tempDependencies) {
                        if (tempDep[0].equals(word)) {
                            tempDep[1] = pos;
                            String[] hashNode = {word, pos};
                            NodeHashSet.add(hashNode);
                        }
                        if (tempDep[2].equals(word)) {
                            tempDep[3] = pos;
                            String[] hashNode = {word, pos};
                            NodeHashSet.add(hashNode);
                        }
                    }
                }
                toAggregate.add(tempDependencies);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * @return the aggregated edges as {source, sourcePOS, target, targetPOS, weight} rows
     */
    public String[][] getNetworkEdges() {
        String[][] NetworkEdgesArray = new String[NetworkEdges.size()][5];
        NetworkEdgesArray = NetworkEdges.toArray(NetworkEdgesArray);
        return NetworkEdgesArray;
    }

    /**
     * @return the collected nodes as {word, POS} rows
     */
    public String[][] getNetworkNodes() {
        String[][] NetworkNodes = new String[NodeHashSet.size()][2];
        NetworkNodes = NodeHashSet.toArray(NetworkNodes);
        return NetworkNodes;
    }

    /**
     * Writes the network to {@code filename} as a Gephi graph.
     *
     * @param filename the output file path
     * @return true on success
     */
    public boolean extractGephiOutput(String filename) {
        String[][] nodes_str = this.getNetworkNodes();
        String[][] edges_str = this.getNetworkEdges();
        File new_file = new File(filename);
        if (new_file.exists()) {
            new_file.delete();
        }
        // Initialize a project, and therefore a workspace.
        ProjectController pc = Lookup.getDefault().lookup(ProjectController.class);
        pc.newProject();
        Workspace workspace = pc.getCurrentWorkspace();
        // Get a graph model; it exists because we have a workspace.
        GraphModel graphModel = Lookup.getDefault().lookup(GraphController.class).getModel();
        final DirectedGraph directedGraph = graphModel.getDirectedGraph();
        TObjectIntHashMap<String> nodes = new TObjectIntHashMap<String>();
        Vector<String> node_index = new Vector<String>();
        // Merge duplicate node rows: one graph node per word, with every
        // observed POS tag concatenated into its "Type" attribute.
        for (String[] node_str : nodes_str) {
            if (nodes.containsKey(node_str[0])) {
                int index = nodes.get(node_str[0]);
                node_index.set(index, node_index.get(index) + "," + node_str[1]);
            } else {
                node_index.add(node_str[1]);
                nodes.put(node_str[0], node_index.size() - 1);
            }
        }
        for (TObjectIntIterator<String> node_it = nodes.iterator(); node_it.hasNext();) {
            node_it.advance();
            Node n0 = graphModel.factory().newNode(node_it.key());
            n0.getAttributes().setValue("label", node_it.key());
            n0.getAttributes().setValue("Type", node_index.get(node_it.value()));
            directedGraph.addNode(n0);
        }
        // Sum the weights of parallel edges before adding them to the graph.
        TObjectIntHashMap<MyPair<String, String>> edges = new TObjectIntHashMap<MyPair<String, String>>();
        for (String[] edge_str : edges_str) {
            MyPair<String, String> edge = new MyPair<String, String>(edge_str[0], edge_str[2]);
            int value = Integer.parseInt(edge_str[4]);
            edges.adjustOrPutValue(edge, value, value);
        }
        TObjectIntIterator<MyPair<String, String>> edge_it;
        for (edge_it = edges.iterator(); edge_it.hasNext();) {
            edge_it.advance();
            Node s1 = directedGraph.getNode(edge_it.key().getFirst());
            Node s2 = directedGraph.getNode(edge_it.key().getSecond());
            if (s1 == null || s2 == null) {
                continue;
            }
            int weight = edge_it.value();
            Edge e0 = graphModel.factory().newEdge(s1, s2, weight, true);
            directedGraph.addEdge(e0);
        }
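        // Export the full graph. ExportController infers the output format
        // from the file extension, so the extension of "filename" (e.g.
        // .gexf or .graphml) determines which exporter is used.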
        ExportController ec = Lookup.getDefault().lookup(ExportController.class);
        try {
            ec.exportFile(new_file);
        } catch (IOException ex) {
            System.out.println(ex.getMessage());
            return false;
        }
        return true;
    }

    /**
     * Convenience entry point: builds the dependency network for the given
     * instance and writes it to {@code output_address}.
     *
     * @param instance the configured task instance
     * @param output_address the output file path
     * @return true on success
     */
    public static boolean runUnit(SyntaxBasedTaskInstance instance, String output_address) {
        SyntacticNetwork SN = new SyntacticNetwork(instance);
        if (!SN.genNetwork()) {
            return false;
        }
        return SN.extractGephiOutput(output_address);
    }
}
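// Typical usage (sketch; the SyntaxBasedTaskInstance wiring comes from the
// surrounding task framework, and "network.gexf" is a hypothetical path):
//   boolean ok = SyntacticNetwork.runUnit(instance, "network.gexf");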