package context.core.task.codebook;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.util.CodebookUtils;
import context.core.util.JavaIO;
import gnu.trove.TIntObjectHashMap;
import gnu.trove.TIntObjectIterator;
import gnu.trove.iterator.TObjectIntIterator;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Vector;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.gephi.graph.api.Edge;
import org.gephi.graph.api.GraphController;
import org.gephi.graph.api.GraphModel;
import org.gephi.graph.api.Node;
import org.gephi.graph.api.UndirectedGraph;
import org.gephi.io.exporter.api.ExportController;
import org.gephi.project.api.ProjectController;
import org.openide.util.Lookup;

/**
 * Builds a co-occurrence network of codebook entries found in the text output
 * of a codebook application task and writes it as CSV and/or GEXF.
 *
 * <p>Two codebook entries are linked when they occur within the configured
 * distance of each other inside the same text segment (sentence, paragraph or
 * whole text, depending on the task's separator setting). The weight of an
 * edge is the number of such co-occurrences.
 *
 * @author Kiumars Soltani
 */
public class NetworkGeneration {

    /** Separator setting: the text is split on '.' and newlines ("sentence" mode). */
    private static final int SEPARATOR_SENTENCE = 1;
    /** Separator setting: the text is split on newlines ("paragraph" mode). */
    private static final int SEPARATOR_PARAGRAPH = 2;
    /** Separator setting: the whole file content is one segment ("text" mode). */
    private static final int SEPARATOR_TEXT = 3;

    private final Codebook cb;
    private final CodebookApplicationTaskInstance instance;
    /** Codebook entries; the index of an entry in this vector is its node id. */
    private final Vector<Pair<String, String>> cbInfo;

    /**
     * Creates a generator for the given codebook and task configuration.
     *
     * @param cb the codebook whose entries become the network nodes
     * @param instance the task instance supplying the settings (distance,
     *        separator, output formats) and the text output to analyse
     */
    public NetworkGeneration(Codebook cb, CodebookApplicationTaskInstance instance) {
        this.cb = cb;
        this.instance = instance;
        this.cbInfo = this.cb.getCbInfo();
    }

    /**
     * Builds the lookup from lower-cased codebook term to its index in
     * {@link #cbInfo}.
     *
     * <p>When {@code instance.getIsNormal()} is 0 the left element of each
     * pair is used as the term, otherwise the right element. If several
     * entries share a term, the last one wins.
     *
     * @return map from lower-cased term to entry index
     */
    private TObjectIntHashMap<String> fillCodebookSet() {
        final TObjectIntHashMap<String> codebookSet = new TObjectIntHashMap<String>();
        final boolean useLeft = instance.getIsNormal() == 0;
        for (int i = 0; i < this.cbInfo.size(); i++) {
            final Pair<String, String> entry = this.cbInfo.get(i);
            final String term = useLeft ? entry.getLeft() : entry.getRight();
            codebookSet.put(term.toLowerCase(), i);
        }
        return codebookSet;
    }

    /**
     * Adds the co-occurrences found in one token sequence to the network.
     *
     * <p>For every token that is a codebook term, the following tokens are
     * scanned while the running distance is {@code <= instance.getDistance()}
     * (so up to {@code distance + 1} tokens are examined). Every scanned
     * token, including nulls and separator tokens ("." or tokens starting
     * with a back-tick), consumes one unit of distance. Each following
     * codebook term yields one edge occurrence.
     *
     * <p>Edges are undirected: the key is always stored as
     * {@code (smaller id, larger id)} so that a-b and b-a are counted
     * together. Self-loops (same entry twice within the window) are kept.
     *
     * @param words tokens of one segment; null elements are tolerated
     * @param nodes receives every codebook entry that takes part in an edge,
     *        keyed by entry index
     * @param edges receives/accumulates the edge weights
     * @param setCodebook lower-cased term to entry index, see
     *        {@link #fillCodebookSet()}
     */
    private void makeNetwork(Vector<String> words, TIntObjectHashMap<Pair<String, String>> nodes, TObjectIntHashMap<Pair<Integer, Integer>> edges, TObjectIntHashMap<String> setCodebook) {
        final int windowSize = instance.getDistance();
        final int wordCount = words.size();

        // TODO - Add feature for skipping the seperator.
        for (int i = 0; i < wordCount; i++) {
            String word = words.get(i);
            if (word == null) {
                continue;
            }
            word = word.toLowerCase();
            if (!setCodebook.contains(word)) {
                // Only codebook terms can be the source of an edge.
                continue;
            }

            int dist = 0;
            for (int targetIndex = i + 1; targetIndex < wordCount && dist <= windowSize; targetIndex++) {
                String target = words.get(targetIndex);
                // The null check must precede toLowerCase(); a null token
                // only consumes distance.
                if (target == null) {
                    dist++;
                    continue;
                }
                target = target.toLowerCase();

                if (target.equals(".")) {
                    dist++;
                    System.out.println("Seperator found: (" + dist + ")" + target + ", " + targetIndex);
                    continue;
                }
                if (target.startsWith("`")) {
                    System.out.println("Seperator found: (" + dist + ")" + target + ", " + targetIndex);
                    dist++;
                    continue;
                }

                dist++;
                if (!setCodebook.contains(target)) {
                    continue;
                }

                final int id1 = setCodebook.get(word);
                final int id2 = setCodebook.get(target);
                System.out.println("Edge: " + word + ", " + target);

                final Pair<String, String> cb1 = this.cbInfo.get(id1);
                final Pair<String, String> cb2 = this.cbInfo.get(id2);
                nodes.put(id1, new ImmutablePair<String, String>(cb1.getLeft(), cb1.getRight()));
                nodes.put(id2, new ImmutablePair<String, String>(cb2.getLeft(), cb2.getRight()));

                // Normalise the key so an undirected edge has exactly one entry.
                final int low = Math.min(id1, id2);
                final int high = Math.max(id1, id2);
                edges.adjustOrPutValue(new ImmutablePair<Integer, Integer>(low, high), 1, 1);
            }
        }
        System.out.println("Finished Generating Network now off to printing files. ");
    }

    /**
     * Writes the network in every output format enabled on the task instance.
     *
     * @param nodes codebook entries taking part in the network, keyed by
     *        entry index
     * @param edges edge weights keyed by the pair of entry indices
     * @param csvFilepath target file used when CSV output is enabled
     * @param gexfFilepath target file used when GEXF output is enabled
     */
    public void writeOutput(TIntObjectHashMap<Pair<String, String>> nodes, TObjectIntHashMap<Pair<Integer, Integer>> edges, String csvFilepath, String gexfFilepath) {
        if (instance.isNetOutputCSV()) {
            this.writeCsv(edges, csvFilepath);
        }
        if (instance.isNetOutputGEXF()) {
            this.writeGexf(nodes, edges, gexfFilepath);
        }
    }

    /**
     * Exports the network through the Gephi toolkit to {@code filepath}.
     *
     * <p>Each node uses the left element of its codebook pair as id and
     * label; when {@code instance.getNetOutputType()} is 1 the right element
     * is additionally stored in the "Type" attribute.
     *
     * @param nodes codebook entries taking part in the network
     * @param edges edge weights keyed by the pair of entry indices; both
     *        indices must be present in {@code nodes}
     * @param filepath file to export to
     * @return {@code true} on success, {@code false} if the export failed
     *         with an {@link IOException}
     */
    private boolean writeGexf(TIntObjectHashMap<Pair<String, String>> nodes, TObjectIntHashMap<Pair<Integer, Integer>> edges, String filepath) {
        // Init a project - and therefore a workspace.
        ProjectController pc = Lookup.getDefault().lookup(ProjectController.class);
        pc.newProject();

        // Get a graph model - it exists because we have a workspace.
        GraphModel graphModel = Lookup.getDefault().lookup(GraphController.class).getModel();
        final UndirectedGraph undirectedGraph = graphModel.getUndirectedGraph();

        // One graph node per codebook entry that occurs in the network.
        final boolean withType = instance.getNetOutputType() == 1;
        for (TIntObjectIterator<Pair<String, String>> nodeIt = nodes.iterator(); nodeIt.hasNext();) {
            nodeIt.advance();
            final Pair<String, String> entry = nodeIt.value();
            Node graphNode = graphModel.factory().newNode(entry.getLeft());
            graphNode.getAttributes().setValue("label", entry.getLeft());
            if (withType) {
                graphNode.getAttributes().setValue("Type", entry.getRight());
            }
            undirectedGraph.addNode(graphNode);
        }

        // One undirected, weighted graph edge per network edge.
        for (TObjectIntIterator<Pair<Integer, Integer>> edgeIt = edges.iterator(); edgeIt.hasNext();) {
            edgeIt.advance();
            Node source = undirectedGraph.getNode(nodes.get(edgeIt.key().getLeft()).getLeft());
            Node target = undirectedGraph.getNode(nodes.get(edgeIt.key().getRight()).getLeft());
            float weight = edgeIt.value();
            Edge graphEdge = graphModel.factory().newEdge(source, target, weight, false);
            undirectedGraph.addEdge(graphEdge);
        }

        // Export full graph.
        ExportController ec = Lookup.getDefault().lookup(ExportController.class);
        try {
            ec.exportFile(new File(filepath));
        } catch (IOException ex) {
            // Report the failure instead of dropping it silently.
            ex.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Writes the edge list as CSV to {@code filepath}, replacing any
     * existing file.
     *
     * <p>With {@code instance.getNetOutputType() == 0} the columns are
     * {@code source,target,weight}; otherwise the right element of each
     * codebook pair is added as {@code source_type}/{@code target_type}.
     * Values are written verbatim (no CSV quoting is applied).
     *
     * @param edges edge weights keyed by the pair of entry indices
     * @param filepath file to write
     */
    private void writeCsv(TObjectIntHashMap<Pair<Integer, Integer>> edges, String filepath) {
        final boolean plain = instance.getNetOutputType() == 0;
        final StringBuilder csv = new StringBuilder();
        csv.append(plain ? "source,target,weight\n" : "source,source_type,target,target_type,weight\n");

        for (TObjectIntIterator<Pair<Integer, Integer>> edgeIt = edges.iterator(); edgeIt.hasNext();) {
            edgeIt.advance();
            final Pair<String, String> node1 = this.cbInfo.get(edgeIt.key().getLeft());
            final Pair<String, String> node2 = this.cbInfo.get(edgeIt.key().getRight());
            csv.append(node1.getLeft()).append(',');
            if (!plain) {
                csv.append(node1.getRight()).append(',');
            }
            csv.append(node2.getLeft()).append(',');
            if (!plain) {
                csv.append(node2.getRight()).append(',');
            }
            csv.append(edgeIt.value()).append('\n');
        }

        // 2016.03 Delete an existing file before writing the new one.
        File toDelete = new File(filepath);
        if (toDelete.exists()) {
            toDelete.delete();
        }
        FileData.writeDataIntoFile(csv.toString(), filepath);
    }

    /**
     * Generates the network(s) for all files of the task's text output.
     *
     * <p>Each non-empty file is split into segments according to
     * {@code instance.getSeparator()} (1 = sentences, 2 = paragraphs,
     * 3 = whole text; any other value leaves the file unprocessed) and every
     * segment is tokenized and passed to the network builder, so
     * co-occurrence windows never span two segments.
     *
     * <ul>
     * <li>Corpus mode ({@code instance.isNetInputCorpus()}): one network is
     * accumulated over all files and written as {@code CorpusNetwork.csv} /
     * {@code CorpusNetwork.gexf} into the parent of the directory that
     * contains the first file.</li>
     * <li>Per-file mode: one network per file, accumulated over all of the
     * file's segments and written once as {@code <basename>-Network.csv} /
     * {@code .gexf} next to the file.</li>
     * </ul>
     *
     * @return {@code true} on success, {@code false} if reading a file
     *         failed with an {@link IOException}
     */
    public boolean applyNetwork() {
        final TObjectIntHashMap<String> setCodebook = this.fillCodebookSet();
        final TIntObjectHashMap<Pair<String, String>> nodes = new TIntObjectHashMap<Pair<String, String>>();
        final TObjectIntHashMap<Pair<Integer, Integer>> edges = new TObjectIntHashMap<Pair<Integer, Integer>>();

        final CorpusData output = (CorpusData) this.instance.getTextOutput();
        final List<FileData> files = output.getFiles();
        final boolean corpusLevel = instance.isNetInputCorpus();

        // The "../" ensures that the corpus-level network files are not
        // written to the same folder as the codebook-applied corpus.
        // (Acknowledged hack by Shubhanshu Mishra.)
        final String corpusPath = FilenameUtils.getFullPath(files.get(0).getFile().getAbsolutePath()) + "../";

        try {
            for (FileData f : files) {
                final String content = JavaIO.readFile(f.getFile());
                if (content.isEmpty()) {
                    continue;
                }

                String[] segments;
                String tokenRegex;
                String segmentLabel;
                switch (instance.getSeparator()) {
                    case SEPARATOR_SENTENCE:
                        segments = content.split("[.\n]+");
                        tokenRegex = "[ .,\n]+";
                        segmentLabel = "processing sentence";
                        break;
                    case SEPARATOR_PARAGRAPH:
                        System.out.println("processing paragraphs");
                        segments = content.split("[\n]+");
                        tokenRegex = "[ .,]+";
                        segmentLabel = "processing paragraph ";
                        break;
                    case SEPARATOR_TEXT:
                        System.out.println("processing text" );
                        segments = new String[] {content};
                        tokenRegex = "[ .,\n]+";
                        segmentLabel = "processing content";
                        break;
                    default:
                        // Unknown separator setting: nothing to do for this file.
                        continue;
                }

                if (!corpusLevel) {
                    // Start a fresh network for this file. Clearing once per
                    // file (not once per segment) keeps all segments in it.
                    nodes.clear();
                    edges.clear();
                }

                for (String segment : segments) {
                    System.out.println(segmentLabel + segment);
                    makeNetwork(tokenize(segment, tokenRegex), nodes, edges, setCodebook);
                }

                if (!corpusLevel) {
                    final String dir = FilenameUtils.getFullPath(f.getFile().getAbsolutePath());
                    final String filepath = dir + FilenameUtils.getBaseName(f.getFile().getName()) + "-Network";
                    System.out.println("filepath (without extension)=" + filepath);
                    this.writeOutput(nodes, edges, filepath + ".csv", filepath + ".gexf");
                }
            }

            if (corpusLevel) {
                this.writeOutput(nodes, edges, corpusPath + "CorpusNetwork.csv", corpusPath + "CorpusNetwork.gexf");
            }
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /** Splits {@code segment} on {@code tokenRegex} and returns the tokens in order. */
    private Vector<String> tokenize(String segment, String tokenRegex) {
        final Vector<String> words = new Vector<String>();
        for (String token : segment.split(tokenRegex)) {
            words.add(token);
        }
        return words;
    }

    /**
     * Replaces, for every codebook term, the occurrences of its
     * space-separated lower-case form in {@code content} by the lower-cased
     * term itself (i.e. the form with underscores). Progress is printed to
     * standard output.
     *
     * <p>Not called from within this class at present.
     *
     * @param content text to rewrite
     * @param setCodebook codebook terms (map keys) to look for
     * @return the rewritten text
     */
    private String codebookificationContent(String content, TObjectIntHashMap<String> setCodebook) {
        for (TObjectIntIterator<String> termIt = setCodebook.iterator(); termIt.hasNext();) {
            termIt.advance();
            System.out.println(termIt.key() + " " + termIt.value());
            System.out.println("replaceUnderscore=" + replaceUnderscores(termIt.key()));
            System.out.println("repalced by=" + termIt.key().toLowerCase());
            content = content.replace(replaceUnderscores(termIt.key()), termIt.key().toLowerCase());
            System.out.println("updated content = " + content);
        }
        return content;
    }

    /** Returns {@code str} lower-cased with every underscore replaced by a space. */
    private CharSequence replaceUnderscores(String str) {
        return str.replace('_', ' ').toLowerCase();
    }
}