package context.core.task.syntaxbased;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;

import org.gephi.graph.api.DirectedGraph;
import org.gephi.graph.api.Edge;
import org.gephi.graph.api.GraphController;
import org.gephi.graph.api.GraphModel;
import org.gephi.graph.api.Node;
import org.gephi.io.exporter.api.ExportController;
import org.gephi.project.api.ProjectController;
import org.gephi.project.api.Workspace;
import org.openide.util.Lookup;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.task.pos.POSTagger;
import context.core.util.CorpusAggregator;
import context.core.util.JavaIO;
import context.core.util.MyPair;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.CoreMap;

import gnu.trove.iterator.TObjectIntIterator;
import gnu.trove.map.hash.TObjectIntHashMap;

/**
 * Builds word networks from a corpus, either from part-of-speech proximity
 * ({@link #genPOSNetwork()}) or from syntactic dependencies
 * ({@link #genNetwork()}), and exports the result as a Gephi graph.
 *
 * @author Aale
 */
public class SyntacticNetwork {

    private StanfordCoreNLP pipeline;
    private StanfordCoreNLP POSpipeline;
    private List<String[]> NetworkEdges;
    private HashSet<String[]> NodeHashSet;
    private SyntaxBasedTaskInstance instance;
    private CorpusData input;
    private List<TabularData> tabularOutput;
    private List<List<String[]>> toAggregate;
    private List<List<List<String[]>>> POStags;
    private List<String> POStoKeepTrack;
    private HashMap<String, List<String>> graphComponents;
    private HashMap<List<String>, List<List<String>>> POSedgeMap;
    private List<String> dependencyEdges;
    private int unitOfAnalysis;
    private long timeout;
    private int distance;

    /**
     * @param instance the task instance supplying the corpus, pipelines and parameters
     */
    public SyntacticNetwork(SyntaxBasedTaskInstance instance) {
        this.instance = instance;
        init();
    }

    private void init() {
        POSedgeMap = instance.getEdgeMap();
        this.dependencyEdges = instance.getDependencyEdges();
        this.unitOfAnalysis = instance.getUnitOfAnalysis();
        this.timeout = instance.getTimeout();
        this.distance = instance.getDistance();
        this.input = (CorpusData) instance.getInput();
        this.pipeline = instance.getPipeline();
        this.POSpipeline = instance.getPipelinePOS();
        this.tabularOutput = instance.getTabularOutput();

        // Note: String[] uses identity-based equals/hashCode, so this set does
        // not actually deduplicate (word, POS) pairs; duplicates are merged
        // later in extractGephiOutput().
        NodeHashSet = new HashSet<String[]>();
        NetworkEdges = new ArrayList<String[]>();
        POStags = new ArrayList<List<List<String[]>>>();
        graphComponents = new HashMap<String, List<String>>();

        // Penn Treebank tag groups.
        List<String> properNouns = new ArrayList<String>();
        properNouns.add("NNP");
        properNouns.add("NNPS");
        List<String> commonNouns = new ArrayList<String>();
        commonNouns.add("NN");
        commonNouns.add("NNS");
        List<String> verbs = new ArrayList<String>();
        verbs.add("VB");
        verbs.add("VBD");
        verbs.add("VBZ");
        verbs.add("VBG");
        verbs.add("VBN");
        verbs.add("VBP");
        List<String> conjunctions = new ArrayList<String>();
        conjunctions.add("CC");
        List<String> numbers = new ArrayList<String>();
        numbers.add("CD");
        List<String> foreign = new ArrayList<String>();
        foreign.add("FW");
        List<String> modals = new ArrayList<String>();
        modals.add("MD");
        List<String> pronouns = new ArrayList<String>();
        pronouns.add("PRP");
        pronouns.add("PRP$");
        pronouns.add("WP");
        pronouns.add("WP$");
        List<String> adjectives = new ArrayList<String>();
        adjectives.add("JJ");
        adjectives.add("JJR");
        adjectives.add("JJS");
        List<String> symbols = new ArrayList<String>();
        symbols.add("SYM");
        List<String> interjections = new ArrayList<String>();
        interjections.add("UH");
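        // graphComponents maps each individual tag to the group it belongs
        // to, e.g. "NNS" -> [NN, NNS] and "NNPS" -> [NNP, NNPS]. POSedgeMap
        // is keyed on these groups, so edge rules can be stated per word
        // class rather than per tag.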
        graphComponents.put("NN", commonNouns);
        graphComponents.put("NNS", commonNouns);
        graphComponents.put("NNP", properNouns);
        graphComponents.put("NNPS", properNouns);
        graphComponents.put("CC", conjunctions);
        graphComponents.put("CD", numbers);
        graphComponents.put("FW", foreign);
        graphComponents.put("MD", modals);
        graphComponents.put("PRP", pronouns);
        graphComponents.put("PRP$", pronouns);
        graphComponents.put("WP", pronouns);
        graphComponents.put("WP$", pronouns);
        graphComponents.put("VB", verbs);
        graphComponents.put("VBD", verbs);
        graphComponents.put("VBG", verbs);
        graphComponents.put("VBN", verbs);
        graphComponents.put("VBZ", verbs);
        graphComponents.put("VBP", verbs);
        graphComponents.put("JJ", adjectives);
        graphComponents.put("JJR", adjectives);
        graphComponents.put("JJS", adjectives);
        graphComponents.put("SYM", symbols);
        graphComponents.put("UH", interjections);

        // Fixed parameters for the demo: these deliberately override the
        // values read from the task instance above.
        this.distance = 7;
        this.unitOfAnalysis = 2;
        this.timeout = 120000;
        this.POStoKeepTrack = new ArrayList<String>();
        POStoKeepTrack.add("NN");
        POStoKeepTrack.add("NNS");
        POStoKeepTrack.add("NNP");
        POStoKeepTrack.add("NNPS");
        POSedgeMap = new HashMap<List<String>, List<List<String>>>();
        List<List<String>> commonNounEdges = new ArrayList<List<String>>();
        commonNounEdges.add(properNouns);
        List<List<String>> properNounEdges = new ArrayList<List<String>>();
        properNounEdges.add(commonNouns);
        POSedgeMap.put(commonNouns, commonNounEdges);
        POSedgeMap.put(properNouns, properNounEdges);
        dependencyEdges = new ArrayList<String>();
        dependencyEdges.add("nsubj");
        dependencyEdges.add("dobj");
        // End fixed demo parameters.
    }
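    /**
     * Builds a POS proximity network. For every tracked token, an edge is
     * added to each later tracked token that lies fewer than {@code distance}
     * tokens away and whose POS-group pairing is allowed by
     * {@code POSedgeMap}. For example, with the demo parameters (distance 7,
     * common nouns linked with proper nouns), "The market John follows
     * crashed" yields the edge market -> John. A {@code unitOfAnalysis} of 1
     * keeps the window inside a sentence; 2 lets it run across sentence
     * boundaries within a document.
     *
     * @return true on success
     */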
    public boolean genPOSNetwork() {
        List<FileData> files = input.getFiles();
        toAggregate = new ArrayList<List<String[]>>();
        try {
            // Pass 1: tag every file and keep, per sentence, the tracked
            // tokens as {word, tag, indexInSentence, indexInDocument}.
            for (FileData ff : files) {
                File file = ff.getFile();
                String text = JavaIO.readFile(file);
                text = text.replaceAll("\\p{Cc}", " ");
                text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
                Annotation document = new Annotation(text);
                POSpipeline.annotate(document);
                List<List<String[]>> DocPOSTags = new ArrayList<List<String[]>>();
                List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                int placeInDoc = 0;
                for (CoreMap sentence : sentences) {
                    List<String[]> sentPOStags = new ArrayList<String[]>();
                    int placeInSent = 0;
                    // A CoreLabel is a CoreMap with additional token-specific methods.
                    final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                    final List<TaggedWord> taggedWords = POSTagger.tag(sent, "en");
                    for (TaggedWord token : taggedWords) {
                        String word = token.word();
                        String pos = token.tag();
                        String[] entity = {word, pos, Integer.toString(placeInSent),
                            Integer.toString(placeInDoc)};
                        placeInSent++;
                        placeInDoc++;
                        if (!word.matches("^[a-zA-Z0-9]*$")) {
                            continue;
                        }
                        if (!POStoKeepTrack.contains(pos)) {
                            continue;
                        }
                        String[] hashNode = {word, pos};
                        NodeHashSet.add(hashNode);
                        sentPOStags.add(entity);
                    }
                    DocPOSTags.add(sentPOStags);
                }
                POStags.add(DocPOSTags);
            }

            // Pass 2: slide the proximity window over the tracked tokens.
            if (unitOfAnalysis == 1) {
                // Sentence-level: the window never crosses a sentence boundary.
                for (List<List<String[]>> DocPOSTags : POStags) {
                    String[] word;
                    List<String[]> docAggregate = new ArrayList<String[]>();
                    for (int overIndx = 0; overIndx < DocPOSTags.size(); overIndx++) {
                        List<String[]> SentPOSTags = DocPOSTags.get(overIndx);
                        for (int indx = 0; indx < SentPOSTags.size(); indx++) {
                            word = SentPOSTags.get(indx);
                            int tempIndex = indx + 1;
                            if (tempIndex >= SentPOSTags.size()) {
                                break;
                            }
                            String[] tempWord = SentPOSTags.get(tempIndex);
                            while (Integer.parseInt(tempWord[3]) - Integer.parseInt(word[3]) < distance) {
                                if (POSedgeMap.get(graphComponents.get(tempWord[1]))
                                        .contains(graphComponents.get(word[1]))) {
                                    String[] tempPOSEdge = {word[0], word[1], tempWord[0], tempWord[1], "1"};
                                    docAggregate.add(tempPOSEdge);
                                }
                                tempIndex++;
                                if (tempIndex >= SentPOSTags.size()) {
                                    break;
                                }
                                tempWord = SentPOSTags.get(tempIndex);
                            }
                        }
                    }
                    toAggregate.add(docAggregate);
                }
                NetworkEdges = new CorpusAggregator().CorpusAggregate(toAggregate);
            }
            if (unitOfAnalysis == 2) {
                // Document-level: when a sentence is exhausted, the window
                // continues into the next non-empty sentence.
                for (List<List<String[]>> DocPOSTags : POStags) {
                    String[] word;
                    List<String[]> docAggregate = new ArrayList<String[]>();
                    for (int overIndx = 0; overIndx < DocPOSTags.size(); overIndx++) {
                        List<String[]> SentPOSTags = DocPOSTags.get(overIndx);
                        for (int indx = 0; indx < SentPOSTags.size(); indx++) {
                            word = SentPOSTags.get(indx);
                            List<String[]> TempSentPOSTags = SentPOSTags;
                            int tempIndex = indx + 1;
                            int tempOverIndex = overIndx;
                            if (tempIndex >= SentPOSTags.size()) {
                                TempSentPOSTags = null;
                                boolean breakCondition = false;
                                while (TempSentPOSTags == null || TempSentPOSTags.size() == 0) {
                                    if (tempOverIndex + 1 < DocPOSTags.size()) {
                                        tempIndex = 0;
                                        tempOverIndex++;
                                        TempSentPOSTags = DocPOSTags.get(tempOverIndex);
                                    } else {
                                        breakCondition = true;
                                        break;
                                    }
                                }
                                if (breakCondition) {
                                    break;
                                }
                            }
                            String[] tempWord = TempSentPOSTags.get(tempIndex);
                            while (Integer.parseInt(tempWord[3]) - Integer.parseInt(word[3]) < distance) {
                                if (POSedgeMap.get(graphComponents.get(tempWord[1]))
                                        .contains(graphComponents.get(word[1]))) {
                                    String[] tempPOSEdge = {word[0], word[1], tempWord[0], tempWord[1], "1"};
                                    docAggregate.add(tempPOSEdge);
                                }
                                tempIndex++;
                                if (tempIndex >= TempSentPOSTags.size()) {
                                    TempSentPOSTags = null;
                                    boolean breakCondition = false;
                                    while (TempSentPOSTags == null || TempSentPOSTags.size() == 0) {
                                        if (tempOverIndex + 1 < DocPOSTags.size()) {
                                            tempIndex = 0;
                                            tempOverIndex++;
                                            TempSentPOSTags = DocPOSTags.get(tempOverIndex);
                                        } else {
                                            breakCondition = true;
                                            break;
                                        }
                                    }
                                    if (breakCondition) {
                                        break;
                                    }
                                }
                                tempWord = TempSentPOSTags.get(tempIndex);
                            }
                        }
                    }
                    toAggregate.add(docAggregate);
                }
                NetworkEdges = new CorpusAggregator().CorpusAggregate(toAggregate);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }
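    /**
     * Builds the dependency network. Parsing can stall on pathological
     * sentences, so each file is handled on a worker thread that is
     * interrupted once {@code timeout} milliseconds elapse; genFileSyntax
     * checks Thread.interrupted() between sentences and bails out
     * cooperatively.
     *
     * @return true on success
     */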
    public boolean genNetwork() {
        toAggregate = new ArrayList<List<String[]>>();
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                final File finalFile = ff.getFile();
                System.out.println(finalFile.getName());
                Thread myThread = new Thread(new Runnable() {
                    @Override
                    public void run() {
                        genFileSyntax(finalFile);
                    }
                });
                myThread.start();
                long endTimeMillis = System.currentTimeMillis() + timeout;
                while (myThread.isAlive()) {
                    if (System.currentTimeMillis() > endTimeMillis) {
                        myThread.interrupt();
                        break;
                    }
                    try {
                        Thread.sleep(500);
                    } catch (InterruptedException t) {
                        // Ignore and keep polling.
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        NetworkEdges = new CorpusAggregator().CorpusAggregate(toAggregate);
        return true;
    }

    private boolean genFileSyntax(File file) {
        try {
            String text = JavaIO.readFile(file);
            text = text.replaceAll("\\p{Cc}", " ");
            text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
            // Create an empty Annotation with the given text and run all
            // annotators on it.
            Annotation document = new Annotation(text);
            pipeline.annotate(document);
            // A CoreMap is essentially a Map that uses class objects as keys
            // and has values with custom types.
            List<CoreMap> sentences = document.get(SentencesAnnotation.class);
            // The language pack and factory are sentence-independent, so
            // build them once per file.
            TreebankLanguagePack tlp = new PennTreebankLanguagePack();
            GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
            for (CoreMap sentence : sentences) {
                if (Thread.interrupted()) {
                    return false;
                }
                Tree tree = sentence.get(TreeAnnotation.class);
                if (tree == null) {
                    continue;
                }
                GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
                Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
                List<String[]> tempDependencies = new ArrayList<String[]>();
                for (TypedDependency td : tdl) {
                    // Dependencies print as "reln(governor-idx, dependent-idx)";
                    // the relation name and the two bare words are recovered
                    // by string splitting.
                    String typedDep = td.toString();
                    String reln = typedDep.trim().split("\\(")[0];
                    if (reln.equals("nsubj") && dependencyEdges.contains("nsubj")) {
                        // nsubj edges run dependent -> governor (subject -> verb).
                        String[] tempDependency = new String[5];
                        tempDependency[0] = typedDep.trim().split("\\(")[1].split(",")[1].trim().split("-")[0];
                        tempDependency[1] = "";
                        tempDependency[2] = typedDep.trim().split("\\(")[1].split(",")[0].trim().split("-")[0];
                        tempDependency[3] = "";
                        tempDependency[4] = Integer.toString(1);
                        tempDependencies.add(tempDependency);
                    }
                    if (reln.equals("nsubjpass") && dependencyEdges.contains("nsubjpass")) {
                        // nsubjpass, dobj and iobj edges run governor -> dependent.
                        String[] tempDependency = new String[5];
                        tempDependency[0] = typedDep.trim().split("\\(")[1].split(",")[0].trim().split("-")[0];
                        tempDependency[1] = "";
                        tempDependency[2] = typedDep.trim().split("\\(")[1].split(",")[1].trim().split("-")[0];
                        tempDependency[3] = "";
                        tempDependency[4] = Integer.toString(1);
                        tempDependencies.add(tempDependency);
                    }
                    if (reln.equals("dobj") && dependencyEdges.contains("dobj")) {
                        String[] tempDependency = new String[5];
                        tempDependency[0] = typedDep.trim().split("\\(")[1].split(",")[0].trim().split("-")[0];
                        tempDependency[1] = "";
                        tempDependency[2] = typedDep.trim().split("\\(")[1].split(",")[1].trim().split("-")[0];
                        tempDependency[3] = "";
                        tempDependency[4] = Integer.toString(1);
                        tempDependencies.add(tempDependency);
                    }
                    if (reln.equals("iobj") && dependencyEdges.contains("iobj")) {
                        String[] tempDependency = new String[5];
                        tempDependency[0] = typedDep.trim().split("\\(")[1].split(",")[0].trim().split("-")[0];
                        tempDependency[1] = "";
                        tempDependency[2] = typedDep.trim().split("\\(")[1].split(",")[1].trim().split("-")[0];
                        tempDependency[3] = "";
                        tempDependency[4] = Integer.toString(1);
                        tempDependencies.add(tempDependency);
                    }
                }
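                // Example: for "John gave Mary a book" the collapsed
                // dependencies include nsubj(gave-2, John-1),
                // iobj(gave-2, Mary-3) and dobj(gave-2, book-5); with the
                // demo edge list (nsubj, dobj) this yields John -> gave and
                // gave -> book. The loop below fills in the POS slots of each
                // edge by matching endpoint words against the tagged
                // sentence, so matching is purely by surface form.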
                final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                final List<TaggedWord> taggedWords = POSTagger.tag(sent, "en");
                for (TaggedWord token : taggedWords) {
                    String word = token.word();
                    String pos = token.tag();
                    for (String[] tempDep : tempDependencies) {
                        if (tempDep[0].equals(word)) {
                            tempDep[1] = pos;
                            String[] hashNode = {word, pos};
                            NodeHashSet.add(hashNode);
                        }
                        if (tempDep[2].equals(word)) {
                            tempDep[3] = pos;
                            String[] hashNode = {word, pos};
                            NodeHashSet.add(hashNode);
                        }
                    }
                }
                toAggregate.add(tempDependencies);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * @return the aggregated edges as {source, sourcePOS, target, targetPOS, weight} rows
     */
    public String[][] getNetworkEdges() {
        String[][] NetworkEdgesArray = new String[NetworkEdges.size()][5];
        NetworkEdgesArray = NetworkEdges.toArray(NetworkEdgesArray);
        return NetworkEdgesArray;
    }

    /**
     * @return the collected nodes as {word, POS} rows
     */
    public String[][] getNetworkNodes() {
        String[][] NetworkNodes = new String[NodeHashSet.size()][2];
        NetworkNodes = NodeHashSet.toArray(NetworkNodes);
        return NetworkNodes;
    }

    /**
     * Writes the network to {@code filename} as a Gephi graph.
     *
     * @param filename the output file path
     * @return true on success
     */
    public boolean extractGephiOutput(String filename) {
        String[][] nodes_str = this.getNetworkNodes();
        String[][] edges_str = this.getNetworkEdges();
        File new_file = new File(filename);
        if (new_file.exists()) {
            new_file.delete();
        }
        // Initialize a project, and therefore a workspace.
        ProjectController pc = Lookup.getDefault().lookup(ProjectController.class);
        pc.newProject();
        Workspace workspace = pc.getCurrentWorkspace();
        // Get a graph model; it exists because we have a workspace.
        GraphModel graphModel = Lookup.getDefault().lookup(GraphController.class).getModel();
        final DirectedGraph directedGraph = graphModel.getDirectedGraph();
        TObjectIntHashMap<String> nodes = new TObjectIntHashMap<String>();
        Vector<String> node_index = new Vector<String>();
        // Merge duplicate node rows: one graph node per word, with every
        // observed POS tag concatenated into its "Type" attribute.
        for (String[] node_str : nodes_str) {
            if (nodes.containsKey(node_str[0])) {
                int index = nodes.get(node_str[0]);
                node_index.set(index, node_index.get(index) + "," + node_str[1]);
            } else {
                node_index.add(node_str[1]);
                nodes.put(node_str[0], node_index.size() - 1);
            }
        }
        for (TObjectIntIterator<String> node_it = nodes.iterator(); node_it.hasNext();) {
            node_it.advance();
            Node n0 = graphModel.factory().newNode(node_it.key());
            n0.getAttributes().setValue("label", node_it.key());
            n0.getAttributes().setValue("Type", node_index.get(node_it.value()));
            directedGraph.addNode(n0);
        }
        // Sum the weights of parallel edges before adding them to the graph.
        TObjectIntHashMap<MyPair<String, String>> edges = new TObjectIntHashMap<MyPair<String, String>>();
        for (String[] edge_str : edges_str) {
            MyPair<String, String> edge = new MyPair<String, String>(edge_str[0], edge_str[2]);
            int value = Integer.parseInt(edge_str[4]);
            edges.adjustOrPutValue(edge, value, value);
        }
        TObjectIntIterator<MyPair<String, String>> edge_it;
        for (edge_it = edges.iterator(); edge_it.hasNext();) {
            edge_it.advance();
            Node s1 = directedGraph.getNode(edge_it.key().getFirst());
            Node s2 = directedGraph.getNode(edge_it.key().getSecond());
            if (s1 == null || s2 == null) {
                continue;
            }
            int weight = edge_it.value();
            Edge e0 = graphModel.factory().newEdge(s1, s2, weight, true);
            directedGraph.addEdge(e0);
        }
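        // Export the full graph. ExportController infers the output format
        // from the file extension, so the extension of "filename" (e.g.
        // .gexf or .graphml) determines which exporter is used.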
        ExportController ec = Lookup.getDefault().lookup(ExportController.class);
        try {
            ec.exportFile(new_file);
        } catch (IOException ex) {
            System.out.println(ex.getMessage());
            return false;
        }
        return true;
    }

    /**
     * Convenience entry point: builds the dependency network for the given
     * instance and writes it to {@code output_address}.
     *
     * @param instance the configured task instance
     * @param output_address the output file path
     * @return true on success
     */
    public static boolean runUnit(SyntaxBasedTaskInstance instance, String output_address) {
        SyntacticNetwork SN = new SyntacticNetwork(instance);
        if (!SN.genNetwork()) {
            return false;
        }
        return SN.extractGephiOutput(output_address);
    }
}
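// Typical usage (sketch; the SyntaxBasedTaskInstance wiring comes from the
// surrounding task framework, and "network.gexf" is a hypothetical path):
//   boolean ok = SyntacticNetwork.runUnit(instance, "network.gexf");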