CorpusStatisticsBody.java example

Explorer
context-master
- src
  - context
package context.core.task.corpusstat;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.task.pos.POSTagger;
import context.core.util.CorpusAggregator;
import context.core.util.ForAggregationNoCase;
import context.core.util.JavaIO;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.CoreMap;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

/**
 *
 * @author Aale
 */
public class CorpusStatisticsBody {

    /**
     * @param args
     */
    private CorpusStatTaskInstance instance;
    private CorpusData input;
    private List<TabularData> tabularOutput;

    private StanfordCoreNLP pipeline;
    private List<String[]> corpusStatsWithTFIDF;
    private File stopFile;

    /**
     *
     * @param instance
     */
    public CorpusStatisticsBody(CorpusStatTaskInstance instance) {
        // TODO Auto-generated method stub

        this.instance = instance;
        init();

    }

    private void init() {

        this.input = (CorpusData) instance.getInput();
        this.tabularOutput = instance.getTabularOutput();
        this.pipeline = instance.getPipeline();
    }

    /**
     *
     * @param includePOS
     * @return
     */
    public boolean RunCorpusStatistics(boolean includePOS) {
        if (includePOS) {
            return RunCorpusStatisticsWithPOS();
        } else {
            return RunCorpusStatisticsWithoutPOS();
        }
    }

    /**
     *
     * @return
     */
    public boolean RunCorpusStatisticsWithPOS() {
        System.out.println("CorpusStat with POS");

        List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
        int numTerms = 0;

        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {

                File file = ff.getFile();
                String text;
                List<String[]> CorpusStatTags = new ArrayList<String[]>();
                try {
                    text = JavaIO.readFile(file);
                    Annotation document = new Annotation(text);
                    pipeline.annotate(document);

                    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

                    for (CoreMap sentence : sentences) {
                        // traversing the words in the current sentence
                        // a CoreLabel is a CoreMap with additional token-specific methods
                        final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                        final List<TaggedWord> taggedWords = POSTagger.tag(sent, instance.getLanguage());
                        for (TaggedWord token : taggedWords) {
                            // this is the text of the token
                            String word = token.word();
                            // this is the POS tag of the token
                            String pos = token.tag();
                            boolean val = pos.contains("NN");
                            if (!val) {
                                if (word == "The") {
                                    System.out.println("Que?");
                                }
                                word = word.toLowerCase();
                            }
                            String[] entity = {word, pos, file.getName(), Integer.toString(1)};
                            if (!word.matches("[a-zA-Z0-9_@#]*|:\\)|:-\\)|:\\(|:-\\(|:\\/|:-\\/|:\\\\|:-\\\\|:p|:-p|;\\)|;-\\)|:>|:->")) {
                                    continue;
                            }
                            CorpusStatTags.add(entity);
                            numTerms++;
                        }
                    }
                        for (String retval: text.split(" ")){
    						//System.out.println("processing " + retval);
    						if (retval.matches(":\\)|:-\\)|:\\(|:-\\(|:\\/|:-\\/|:\\\\|:-\\\\|:p|:-p|;\\)|;-\\)|:>|:->")) {
    						String[] entity = {retval, "A", file.getName(), Integer.toString(1)};
    						System.out.println("pass:" + numTerms);
                            CorpusStatTags.add(entity);
                            numTerms++;
                            //System.out.println("Printing words after corpusstattags:"+retval);
                            }
    						else
    						{
    						System.out.println("regex dint match " +retval);
    						}
    						}
                    toAggregate.add(CorpusStatTags);
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                    return false;
                }
            }
            List<String[]> CorpusStatsByFile = new CorpusAggregator().CorpusAggregateNoCase(toAggregate);
            HashMap<ForAggregationNoCase, Integer[]> CorpusStats = new HashMap<ForAggregationNoCase, Integer[]>();
            for (String[] CorpusStatByFile : CorpusStatsByFile) {
                String[] identifierForRatio = {CorpusStatByFile[0], CorpusStatByFile[1]};
                ForAggregationNoCase identifierForAggregate = new ForAggregationNoCase(identifierForRatio);
                if (!(null == CorpusStats.get(identifierForAggregate))) {
                    Integer currNumDocs = ((Integer[]) CorpusStats.get(identifierForAggregate))[0];
                    Integer[] NumDocsAndTotalCount = {currNumDocs + 1, ((Integer[]) CorpusStats.get(identifierForAggregate))[1] + Integer.parseInt(CorpusStatByFile[3])};
                    CorpusStats.put(identifierForAggregate, NumDocsAndTotalCount.clone());
                } else {
                    Integer[] NumDocsAndTotalCount = {1, Integer.parseInt(CorpusStatByFile[3])};
                    CorpusStats.put(identifierForAggregate, NumDocsAndTotalCount);
                }
            }
            List<String[]> corpusStatsWithTFIDFList = new ArrayList<String[]>();
            Iterator<ForAggregationNoCase> it = CorpusStats.keySet().iterator();
            while (it.hasNext()) {
                String[] currentCorpusStatWithTFIDF = new String[5];
                ForAggregationNoCase next = it.next();
                currentCorpusStatWithTFIDF[0] = next.toAggregate[0];
                currentCorpusStatWithTFIDF[4] = next.toAggregate[1];
                currentCorpusStatWithTFIDF[1] = Integer.toString(CorpusStats.get(next)[1]);
                currentCorpusStatWithTFIDF[3] = Float.toString(((float) CorpusStats.get(next)[0]) / ((float) files.size()));

                float tf = Float.parseFloat(currentCorpusStatWithTFIDF[1]) / (Float.MIN_VALUE + numTerms);
                float idf = (float) Math.log10(((float) files.size()) / (Float.MIN_VALUE + ((float) CorpusStats.get(next)[0])));
                currentCorpusStatWithTFIDF[2] = Float.toString((tf * idf));

                corpusStatsWithTFIDFList.add(currentCorpusStatWithTFIDF);
            }

            this.corpusStatsWithTFIDF = corpusStatsWithTFIDFList;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }

        return true;

    }

    /**
     *
     * @return
     */
    public boolean RunCorpusStatisticsWithoutPOS() {
        System.out.println("CorpusStat without POS");

        List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
        int numTerms = 0;

        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {

                File file = ff.getFile();
                String text;
                List<String[]> CorpusStatTags = new ArrayList<String[]>();
                try {
                    text = JavaIO.readFile(file);
                    List<String> words = getTokens(text);
                    for (String word : words) {

                        String[] entity = {word, "A", file.getName(), Integer.toString(1)};

                        if (!word.matches("[a-zA-Z0-9_@#]*|:\\)|:\\(|:\\/|:\\\\|:p|;\\)|;-\\)")) {

                            continue;
                        }
                        System.out.println("pass:" + numTerms);
                        CorpusStatTags.add(entity);
                        numTerms++;
                    }
					for (String retval: text.split(" ")){
						//System.out.println("processing " + retval);
						if (retval.matches(":\\)|:\\(|:\\/|:\\\\|:p|;\\)|;-\\)")) {
						String[] entity = {retval, "A", file.getName(), Integer.toString(1)};
						System.out.println("pass:" + numTerms);
                        CorpusStatTags.add(entity);
                        numTerms++;
                        //System.out.println("Printing words after corpusstattags:"+retval);
                        }
						else
						{
						System.out.println("regex dint match " +retval);
						}
						}
                    toAggregate.add(CorpusStatTags);
                    //System.out.println("CorpusStatTags" + CorpusStatTags.size());
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                    return false;
                }
            }
            System.out.println("toAggregate" + toAggregate.size());
            List<String[]> CorpusStatsByFile = new CorpusAggregator().CorpusAggregateNoCase(toAggregate);
            HashMap<ForAggregationNoCase, Integer[]> CorpusStats = new HashMap<ForAggregationNoCase, Integer[]>();
            for (String[] CorpusStatByFile : CorpusStatsByFile) {
                String[] identifierForRatio = {CorpusStatByFile[0], CorpusStatByFile[1]};
                ForAggregationNoCase identifierForAggregate = new ForAggregationNoCase(identifierForRatio);
                if (!(null == CorpusStats.get(identifierForAggregate))) {
                    Integer currNumDocs = ((Integer[]) CorpusStats.get(identifierForAggregate))[0];
                    Integer[] NumDocsAndTotalCount = {currNumDocs + 1, ((Integer[]) CorpusStats.get(identifierForAggregate))[1] + Integer.parseInt(CorpusStatByFile[3])};
                    CorpusStats.put(identifierForAggregate, NumDocsAndTotalCount.clone());
                } else {
                    Integer[] NumDocsAndTotalCount = {1, Integer.parseInt(CorpusStatByFile[3])};
                    CorpusStats.put(identifierForAggregate, NumDocsAndTotalCount);
                }
            }
            List<String[]> corpusStatsWithTFIDFList = new ArrayList<String[]>();
            Iterator<ForAggregationNoCase> it = CorpusStats.keySet().iterator();
            while (it.hasNext()) {
                String[] currentCorpusStatWithTFIDF = new String[5];
                ForAggregationNoCase next = it.next();
                currentCorpusStatWithTFIDF[0] = next.toAggregate[0];
                currentCorpusStatWithTFIDF[4] = next.toAggregate[1];
                currentCorpusStatWithTFIDF[1] = Integer.toString(CorpusStats.get(next)[1]);
                currentCorpusStatWithTFIDF[3] = Float.toString(((float) CorpusStats.get(next)[0]) / ((float) files.size()));

                float tf = Float.parseFloat(currentCorpusStatWithTFIDF[1]) / (Float.MIN_VALUE + numTerms);
                float idf = (float) Math.log10(((float) files.size()) / (Float.MIN_VALUE + ((float) CorpusStats.get(next)[0])));
                currentCorpusStatWithTFIDF[2] = Float.toString((tf * idf));

                corpusStatsWithTFIDFList.add(currentCorpusStatWithTFIDF);
            }

            this.corpusStatsWithTFIDF = corpusStatsWithTFIDFList;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }

        return true;

    }

    /**
     *
     * @param filepath
     * @param includePOS
     */
    public void writeOutput(String filepath, boolean includePOS) {
        //Write CSV
        this.writeCsv(corpusStatsWithTFIDF, filepath, includePOS);
    }

    private void writeCsv(List<String[]> CorpusStatsWithTFIDF, String filepath, boolean includePOS) {

        StringBuffer sb = new StringBuffer();
        if (includePOS) {
            sb.append("Term, Frequency, TF*IDF, Ratio of texts occurring in, Part of speech" + "\n");
        } else {
            sb.append("Term, Frequency, TF*IDF, Ratio of texts occurring in" + "\n");

        }

        String toWrite = "";
        for (int i1 = 0; i1 < CorpusStatsWithTFIDF.size(); i1++) {
            if (includePOS) {
                toWrite = CorpusStatsWithTFIDF.get(i1)[0] + "," + CorpusStatsWithTFIDF.get(i1)[1] + "," + CorpusStatsWithTFIDF.get(i1)[2] + "," + CorpusStatsWithTFIDF.get(i1)[3] + "," + CorpusStatsWithTFIDF.get(i1)[4] + "\n";
            } else {
                toWrite = CorpusStatsWithTFIDF.get(i1)[0] + "," + CorpusStatsWithTFIDF.get(i1)[1] + "," + CorpusStatsWithTFIDF.get(i1)[2] + "," + CorpusStatsWithTFIDF.get(i1)[3] + "\n";
            }
            sb.append(toWrite);
        }
        System.out.println("in writecsv before writeDataIntoFile");
        
        // 2016.03 Add this code to delete existing file
        File toDelete = new File(filepath);
        	if (toDelete.exists()) {
        		toDelete.delete(); 
        	}
        //
        	
        FileData.writeDataIntoFile(sb.toString(), filepath);

    }

    private List<String> getTokens(String text) {
        List<String> tokens = new ArrayList();
        PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer(new StringReader(text),
                new CoreLabelTokenFactory(), "");
        for (CoreLabel label; ptbt.hasNext();) {
            label = ptbt.next();
            tokens.add(label.originalText());
        }
        System.out.println("Tokens#:" + tokens.size());
        return tokens;
    }

}