POSBody.java example

Explorer
context-master
- src
  - context
/*
 
 * Copyright (c) 2015 University of Illinois Board of Trustees, All rights reserved.   
 * Developed at GSLIS/ the iSchool, by Dr. Jana Diesner, Amirhossein Aleyasen,    
 * Chieh-Li Chin, Shubhanshu Mishra, Kiumars Soltani, and Liang Tao.     
 *   
 * This program is free software; you can redistribute it and/or modify it under   
 * the terms of the GNU General Public License as published by the Free Software   
 * Foundation; either version 2 of the License, or any later version.   
 *    
 * This program is distributed in the hope that it will be useful, but WITHOUT   
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or    
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for   
 * more details.   
 *    
 * You should have received a copy of the GNU General Public License along with   
 * this program; if not, see <http://www.gnu.org/licenses>.   
 *
 
 
 */
package context.core.task.pos;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.util.CorpusAggregator;
import context.core.util.JavaIO;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

/**
 *
 * @author Aale
 */
public class POSBody {

    /**
     * @param args
     */
    private POSTaskInstance instance;
    private CorpusData input;
    private List<TabularData> tabularOutput;

    /**
     *
     */
    protected StanfordCoreNLP pipeline;
    private List<String[]> POStagsWithCount;

    /**
     *
     * @param instance
     */
    public POSBody(POSTaskInstance instance) {
		// TODO Auto-generated method stub

        // File[] Files, StanfordCoreNLP pipeline, String outputDir
        this.instance = instance;
        init();

    }

    private void init() {
        this.input = (CorpusData) instance.getInput();
        this.pipeline = instance.getPipeline();
        this.tabularOutput = instance.getTabularOutput();
    }

    /**
     *
     * @return
     */
    public boolean tagPOS() {
        List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();

        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {

                File file = ff.getFile();
                String text;
                List<String[]> POStags = new ArrayList<String[]>();
                try {
                    text = JavaIO.readFile(file);
                    if (instance.getLanguage().equals("en")) {
                        text = text.replaceAll("\\p{Cc}", " ");
                        text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
                    }
                    Annotation document = new Annotation(text);
                    pipeline.annotate(document);

                    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

                    for (CoreMap sentence : sentences) {
                        // traversing the words in the current sentence
                        // a CoreLabel is a CoreMap with additional token-specific methods
                        final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);

                        final List<TaggedWord> taggedWords = POSTagger.tag(sent, instance.getLanguage());

                        for (TaggedWord token : taggedWords) {
                            // this is the text of the token
                            String word = token.word();
                            // this is the POS tag of the token
                            String pos = token.tag();
                            String[] entity = {word, pos, Integer.toString(1)};
                            if (instance.getLanguage().equals("en")) {
                                if (!word.matches("^[a-zA-Z0-9]*$")) {
                                    continue;
                                }
                            }
                            POStags.add(entity);
                        }
                    }
                    toAggregate.add(POStags);
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                    return false;
                }
            }
            POStagsWithCount = new CorpusAggregator().CorpusAggregate(toAggregate);
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     *
     * @param filepath
     */
    public void writeOutput(String filepath) {
        //Write CSV
        this.writeCsv(POStagsWithCount, filepath);
    }

    private void writeCsv(List<String[]> taggedPOS, String filepath) {

        System.out.println("POS Size=" + taggedPOS.size());
        StringBuffer sb = new StringBuffer();

        sb.append("Word,POS,Frequency\n");

        String toWrite = "";
        for (int i1 = 0; i1 < taggedPOS.size(); i1++) {
//            System.out.println(corpusStatsWithTFIDF.get(i1).length);

            toWrite = taggedPOS.get(i1)[0] + "," + taggedPOS.get(i1)[1] + "," + taggedPOS.get(i1)[2] + "\n";
            sb.append(toWrite);
        }
//        System.out.println("size of string to write=" + sb.toString().length());
        
        // 2016.03 Add this code to delete existing file
        File toDelete = new File(filepath);
        	if (toDelete.exists()) {
        		toDelete.delete(); 
        	}
        //
        
        FileData.writeDataIntoFile(sb.toString(), filepath);
    }

}