package context.core.task.entropy;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.task.pos.POSTagger;
import context.core.util.CorpusAggregator;
import context.core.util.ForAggregationNoCase;
import context.core.util.JavaIO;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

/**
 * Computes a TF-IDF-weighted word distribution over a corpus and, from it,
 * a Shannon entropy score for each input file.
 *
 * @author Aale
 * @author Ming Jiang
 */
public class Entropybody {

    /** Number of kept (alphanumeric) tokens per file, keyed by file name. */
    Map<String, Integer> file_length = new HashMap<String, Integer>();

    private EntropyTaskInstance instance;
    private CorpusData input;
    private List<TabularData> tabularOutput;
    private StanfordCoreNLP pipeline;

    /** One entry per term: {term, total count, tf-idf weight}. */
    private List<String[]> corpusStatsWithTFIDF;

    /** Per-file results: {token count, entropy, normalized entropy}, keyed by file name. */
    Map<String, String[]> Files_entropy;

    /**
     * @param instance the task instance providing the corpus input and the CoreNLP pipeline
     */
    public Entropybody(EntropyTaskInstance instance) {
        this.instance = instance;
        init();
    }

    private void init() {
        this.input = (CorpusData) instance.getInput();
        this.tabularOutput = instance.getTabularOutput();
        this.pipeline = instance.getPipeline();
    }

    /**
     * Tokenizes and POS-tags every file in the corpus, aggregates term counts,
     * computes a TF-IDF weight per term, and then derives per-file entropy.
     *
     * @return true on success, false if an unexpected exception was thrown
     */
    public boolean RunEntropyComputation() {
        List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
        int numTerms = 0;
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                int doc_length = 0;
                String text;
                List<String[]> corpusStatTags = new ArrayList<String[]>();
                try {
                    text = JavaIO.readFile(file);
                    Annotation document = new Annotation(text);
                    pipeline.annotate(document);
                    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                    for (CoreMap sentence : sentences) {
                        // a CoreLabel is a CoreMap with additional token-specific methods
                        final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                        final List<TaggedWord> taggedWords = POSTagger.tag(sent, "en");
                        for (TaggedWord token : taggedWords) {
                            String word = token.word().toLowerCase();
                            String pos = token.tag();
                            // keep only purely alphanumeric tokens
                            if (!word.matches("[a-zA-Z0-9]*")) {
                                continue;
                            }
                            String[] entity = {word, pos, file.getName(), Integer.toString(1)};
                            corpusStatTags.add(entity);
                            doc_length++;
                            numTerms++;
                        }
                    }
                    toAggregate.add(corpusStatTags);
                } catch (IOException e) {
                    e.printStackTrace();
                }
                file_length.put(file.getName(), doc_length);
            }

            // Collapse the per-token records into per-file, per-term counts, then into
            // corpus-wide {document frequency, total count} pairs keyed by term.
            List<String[]> corpusStatsByFile = new CorpusAggregator().CorpusAggregateNoCase(toAggregate);
            HashMap<ForAggregationNoCase, Integer[]> corpusStats =
                    new HashMap<ForAggregationNoCase, Integer[]>();
            for (String[] corpusStatByFile : corpusStatsByFile) {
                String[] identifierForRatio = {corpusStatByFile[0]};
                ForAggregationNoCase identifierForAggregate = new ForAggregationNoCase(identifierForRatio);
                Integer[] existing = corpusStats.get(identifierForAggregate);
                if (existing != null) {
                    // term already seen in another file: bump document frequency, add the count
                    Integer[] numDocsAndTotalCount = {existing[0] + 1,
                            existing[1] + Integer.parseInt(corpusStatByFile[3])};
                    corpusStats.put(identifierForAggregate, numDocsAndTotalCount);
                } else {
                    Integer[] numDocsAndTotalCount = {1, Integer.parseInt(corpusStatByFile[3])};
                    corpusStats.put(identifierForAggregate, numDocsAndTotalCount);
                }
            }

            // Compute a TF-IDF weight per term; Float.MIN_VALUE guards against division by zero.
            List<String[]> corpusStatsWithTFIDFList = new ArrayList<String[]>();
            for (ForAggregationNoCase next : corpusStats.keySet()) {
                String[] currentCorpusStatWithTFIDF = new String[3];
                currentCorpusStatWithTFIDF[0] = next.toAggregate[0];
                currentCorpusStatWithTFIDF[1] = Integer.toString(corpusStats.get(next)[1]); // total count
                float tf = Float.parseFloat(currentCorpusStatWithTFIDF[1]) / (Float.MIN_VALUE + numTerms);
                float idf = (float) Math.log10(((float) files.size())
                        / (Float.MIN_VALUE + ((float) corpusStats.get(next)[0])));
                currentCorpusStatWithTFIDF[2] = Float.toString(tf * idf); // weight
                corpusStatsWithTFIDFList.add(currentCorpusStatWithTFIDF);
            }
            this.corpusStatsWithTFIDF = corpusStatsWithTFIDFList;
            this.Files_entropy = this.Entropy();
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * @param x    value to take the logarithm of
     * @param base logarithm base
     * @return log of x in the given base
     */
    public Double log(Double x, int base) {
        return Math.log(x) / Math.log(base);
    }

    /**
     * Treats the corpus-wide TF-IDF weights as an (unnormalized) word distribution and
     * computes, for each file, H(X) = -sum(p(w) * log2 p(w)) over its tokens, plus a
     * length-normalized variant H(X) / log10(tokenCount).
     *
     * @return per-file {token count, entropy, normalized entropy}, keyed by file name
     */
    public Map<String, String[]> Entropy() {
        double total_weight = 0.0;
        Map<String, Double> corpus_weight = new HashMap<String, Double>();
        Map<String, String[]> files_entropy = new HashMap<String, String[]>();
        for (String[] stat : corpusStatsWithTFIDF) {
            corpus_weight.put(stat[0], Double.parseDouble(stat[2]));
            total_weight += Double.parseDouble(stat[2]);
        }
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                String text;
                double H_X = 0.0;
                try {
                    text = JavaIO.readFile(file);
                    Annotation document = new Annotation(text);
                    pipeline.annotate(document);
                    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                    for (CoreMap sentence : sentences) {
                        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                            String word = token.get(TextAnnotation.class).toLowerCase();
                            if (!word.matches("[a-zA-Z0-9]*")) {
                                continue;
                            }
                            Double weight = corpus_weight.get(word);
                            if (weight == null) {
                                continue; // token never made it into the corpus statistics
                            }
                            double prob_word = weight / total_weight;
                            if (prob_word > 0) {
                                H_X += -1.0 * prob_word * this.log(prob_word, 2);
                            }
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
                // Normalize by log10 of the file length so long and short files are comparable.
                int count_file_terms = file_length.get(file.getName());
                double smoothed_H_X = H_X / Math.log10(count_file_terms);
                String[] entropy_entity = new String[3];
                entropy_entity[0] = String.valueOf(count_file_terms);
                entropy_entity[1] = String.valueOf(H_X);
                entropy_entity[2] = String.valueOf(smoothed_H_X);
                files_entropy.put(file.getName(), entropy_entity);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return files_entropy;
    }

    /**
     * Writes the per-file entropy results as a CSV file.
     *
     * @param filepath path of the CSV file to write
     */
    public void writeOutput(String filepath) {
        this.writeCsv(Files_entropy, filepath);
    }

    private void writeCsv(Map<String, String[]> files_entropy, String filepath) {
        StringBuilder sb = new StringBuilder();
        sb.append("File_Name,File_length,Entropy,Normalized_Entropy").append("\n");
        for (String fkey : files_entropy.keySet()) {
            String[] temp = files_entropy.get(fkey);
            sb.append(fkey).append(",").append(temp[0]).append(",")
              .append(temp[1]).append(",").append(temp[2]).append("\n");
        }
        // 2016.03: delete any existing output file before writing the new one
        File toDelete = new File(filepath);
        if (toDelete.exists()) {
            toDelete.delete();
        }
        FileData.writeDataIntoFile(sb.toString(), filepath);
    }
}
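/*
 * Hypothetical usage sketch, not taken from the project: how the EntropyTaskInstance
 * is built and where the CSV is written are assumptions; only Entropybody,
 * RunEntropyComputation() and writeOutput() above are part of the actual class.
 *
 *   EntropyTaskInstance instance = ...; // configured elsewhere with a CorpusData
 *                                       // input and a StanfordCoreNLP pipeline
 *   Entropybody entropy = new Entropybody(instance);
 *   if (entropy.RunEntropyComputation()) {
 *       entropy.writeOutput("entropy_output.csv"); // one row per corpus file
 *   }
 */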