package context.core.task.entropy;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.task.pos.POSTagger;
import context.core.util.CorpusAggregator;
import context.core.util.ForAggregationNoCase;
import context.core.util.JavaIO;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

/**
 * Computes a TF-IDF-weighted word distribution over a corpus and, from it,
 * a Shannon entropy score for each input file.
 *
 * @author Aale
 * @author Ming Jiang
 */
public class Entropybody {

    /** Number of kept (alphanumeric) tokens per file, keyed by file name. */
    Map<String, Integer> file_length = new HashMap<String, Integer>();

    private EntropyTaskInstance instance;
    private CorpusData input;
    private List<TabularData> tabularOutput;
    private StanfordCoreNLP pipeline;

    /** One entry per term: {term, total count, tf-idf weight}. */
    private List<String[]> corpusStatsWithTFIDF;

    /** Per-file results: {token count, entropy, normalized entropy}, keyed by file name. */
    Map<String, String[]> Files_entropy;

    /**
     * @param instance the task instance providing the corpus input and the CoreNLP pipeline
     */
    public Entropybody(EntropyTaskInstance instance) {
        this.instance = instance;
        init();
    }

    private void init() {
        this.input = (CorpusData) instance.getInput();
        this.tabularOutput = instance.getTabularOutput();
        this.pipeline = instance.getPipeline();
    }

    /**
     * Tokenizes and POS-tags every file in the corpus, aggregates term counts,
     * computes a TF-IDF weight per term, and then derives per-file entropy.
     *
     * @return true on success, false if an unexpected exception was thrown
     */
    public boolean RunEntropyComputation() {
        List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
        int numTerms = 0;
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                int doc_length = 0;
                String text;
                List<String[]> corpusStatTags = new ArrayList<String[]>();
                try {
                    text = JavaIO.readFile(file);
                    Annotation document = new Annotation(text);
                    pipeline.annotate(document);
                    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                    for (CoreMap sentence : sentences) {
                        // a CoreLabel is a CoreMap with additional token-specific methods
                        final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                        final List<TaggedWord> taggedWords = POSTagger.tag(sent, "en");
                        for (TaggedWord token : taggedWords) {
                            String word = token.word().toLowerCase();
                            String pos = token.tag();
                            // keep only purely alphanumeric tokens
                            if (!word.matches("[a-zA-Z0-9]*")) {
                                continue;
                            }
                            String[] entity = {word, pos, file.getName(), Integer.toString(1)};
                            corpusStatTags.add(entity);
                            doc_length++;
                            numTerms++;
                        }
                    }
                    toAggregate.add(corpusStatTags);
                } catch (IOException e) {
                    e.printStackTrace();
                }
                file_length.put(file.getName(), doc_length);
            }

            // Collapse the per-token records into per-file, per-term counts, then into
            // corpus-wide {document frequency, total count} pairs keyed by term.
            List<String[]> corpusStatsByFile = new CorpusAggregator().CorpusAggregateNoCase(toAggregate);
            HashMap<ForAggregationNoCase, Integer[]> corpusStats =
                    new HashMap<ForAggregationNoCase, Integer[]>();
            for (String[] corpusStatByFile : corpusStatsByFile) {
                String[] identifierForRatio = {corpusStatByFile[0]};
                ForAggregationNoCase identifierForAggregate = new ForAggregationNoCase(identifierForRatio);
                Integer[] existing = corpusStats.get(identifierForAggregate);
                if (existing != null) {
                    // term already seen in another file: bump document frequency, add the count
                    Integer[] numDocsAndTotalCount = {existing[0] + 1,
                            existing[1] + Integer.parseInt(corpusStatByFile[3])};
                    corpusStats.put(identifierForAggregate, numDocsAndTotalCount);
                } else {
                    Integer[] numDocsAndTotalCount = {1, Integer.parseInt(corpusStatByFile[3])};
                    corpusStats.put(identifierForAggregate, numDocsAndTotalCount);
                }
            }

            // Compute a TF-IDF weight per term; Float.MIN_VALUE guards against division by zero.
            List<String[]> corpusStatsWithTFIDFList = new ArrayList<String[]>();
            for (ForAggregationNoCase next : corpusStats.keySet()) {
                String[] currentCorpusStatWithTFIDF = new String[3];
                currentCorpusStatWithTFIDF[0] = next.toAggregate[0];
                currentCorpusStatWithTFIDF[1] = Integer.toString(corpusStats.get(next)[1]); // total count
                float tf = Float.parseFloat(currentCorpusStatWithTFIDF[1]) / (Float.MIN_VALUE + numTerms);
                float idf = (float) Math.log10(((float) files.size())
                        / (Float.MIN_VALUE + ((float) corpusStats.get(next)[0])));
                currentCorpusStatWithTFIDF[2] = Float.toString(tf * idf); // weight
                corpusStatsWithTFIDFList.add(currentCorpusStatWithTFIDF);
            }
            this.corpusStatsWithTFIDF = corpusStatsWithTFIDFList;
            this.Files_entropy = this.Entropy();
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * @param x    value to take the logarithm of
     * @param base logarithm base
     * @return log of x in the given base
     */
    public Double log(Double x, int base) {
        return Math.log(x) / Math.log(base);
    }

    /**
     * Treats the corpus-wide TF-IDF weights as an (unnormalized) word distribution and
     * computes, for each file, H(X) = -sum(p(w) * log2 p(w)) over its tokens, plus a
     * length-normalized variant H(X) / log10(tokenCount).
     *
     * @return per-file {token count, entropy, normalized entropy}, keyed by file name
     */
    public Map<String, String[]> Entropy() {
        double total_weight = 0.0;
        Map<String, Double> corpus_weight = new HashMap<String, Double>();
        Map<String, String[]> files_entropy = new HashMap<String, String[]>();
        for (String[] stat : corpusStatsWithTFIDF) {
            corpus_weight.put(stat[0], Double.parseDouble(stat[2]));
            total_weight += Double.parseDouble(stat[2]);
        }
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                String text;
                double H_X = 0.0;
                try {
                    text = JavaIO.readFile(file);
                    Annotation document = new Annotation(text);
                    pipeline.annotate(document);
                    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                    for (CoreMap sentence : sentences) {
                        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                            String word = token.get(TextAnnotation.class).toLowerCase();
                            if (!word.matches("[a-zA-Z0-9]*")) {
                                continue;
                            }
                            Double weight = corpus_weight.get(word);
                            if (weight == null) {
                                continue; // token never made it into the corpus statistics
                            }
                            double prob_word = weight / total_weight;
                            if (prob_word > 0) {
                                H_X += -1.0 * prob_word * this.log(prob_word, 2);
                            }
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
                // Normalize by log10 of the file length so long and short files are comparable.
                int count_file_terms = file_length.get(file.getName());
                double smoothed_H_X = H_X / Math.log10(count_file_terms);
                String[] entropy_entity = new String[3];
                entropy_entity[0] = String.valueOf(count_file_terms);
                entropy_entity[1] = String.valueOf(H_X);
                entropy_entity[2] = String.valueOf(smoothed_H_X);
                files_entropy.put(file.getName(), entropy_entity);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return files_entropy;
    }

    /**
     * Writes the per-file entropy results as a CSV file.
     *
     * @param filepath path of the CSV file to write
     */
    public void writeOutput(String filepath) {
        this.writeCsv(Files_entropy, filepath);
    }

    private void writeCsv(Map<String, String[]> files_entropy, String filepath) {
        StringBuilder sb = new StringBuilder();
        sb.append("File_Name,File_length,Entropy,Normalized_Entropy").append("\n");
        for (String fkey : files_entropy.keySet()) {
            String[] temp = files_entropy.get(fkey);
            sb.append(fkey).append(",").append(temp[0]).append(",")
              .append(temp[1]).append(",").append(temp[2]).append("\n");
        }
        // 2016.03: delete any existing output file before writing the new one
        File toDelete = new File(filepath);
        if (toDelete.exists()) {
            toDelete.delete();
        }
        FileData.writeDataIntoFile(sb.toString(), filepath);
    }
}
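/*
 * Hypothetical usage sketch, not taken from the project: how the EntropyTaskInstance
 * is built and where the CSV is written are assumptions; only Entropybody,
 * RunEntropyComputation() and writeOutput() above are part of the actual class.
 *
 *   EntropyTaskInstance instance = ...; // configured elsewhere with a CorpusData
 *                                       // input and a StanfordCoreNLP pipeline
 *   Entropybody entropy = new Entropybody(instance);
 *   if (entropy.RunEntropyComputation()) {
 *       entropy.writeOutput("entropy_output.csv"); // one row per corpus file
 *   }
 */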