/*
* Copyright (c) 2015 University of Illinois Board of Trustees, All rights reserved.
* Developed at GSLIS/ the iSchool, by Dr. Jana Diesner, Amirhossein Aleyasen,
* Chieh-Li Chin, Shubhanshu Mishra, Kiumars Soltani, and Liang Tao.
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, see <http://www.gnu.org/licenses>.
*
*/
package context.core.task.pos;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.util.CorpusAggregator;
import context.core.util.JavaIO;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
/**
*
* @author Aale
*/
public class POSBody {
/**
* @param args
*/
private POSTaskInstance instance;
private CorpusData input;
private List<TabularData> tabularOutput;
/**
*
*/
protected StanfordCoreNLP pipeline;
private List<String[]> POStagsWithCount;
/**
*
* @param instance
*/
public POSBody(POSTaskInstance instance) {
// TODO Auto-generated method stub
// File[] Files, StanfordCoreNLP pipeline, String outputDir
this.instance = instance;
init();
}
private void init() {
this.input = (CorpusData) instance.getInput();
this.pipeline = instance.getPipeline();
this.tabularOutput = instance.getTabularOutput();
}
/**
*
* @return
*/
public boolean tagPOS() {
List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
List<FileData> files = input.getFiles();
try {
for (FileData ff : files) {
File file = ff.getFile();
String text;
List<String[]> POStags = new ArrayList<String[]>();
try {
text = JavaIO.readFile(file);
if (instance.getLanguage().equals("en")) {
text = text.replaceAll("\\p{Cc}", " ");
text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
}
Annotation document = new Annotation(text);
pipeline.annotate(document);
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for (CoreMap sentence : sentences) {
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
final List<TaggedWord> taggedWords = POSTagger.tag(sent, instance.getLanguage());
for (TaggedWord token : taggedWords) {
// this is the text of the token
String word = token.word();
// this is the POS tag of the token
String pos = token.tag();
String[] entity = {word, pos, Integer.toString(1)};
if (instance.getLanguage().equals("en")) {
if (!word.matches("^[a-zA-Z0-9]*$")) {
continue;
}
}
POStags.add(entity);
}
}
toAggregate.add(POStags);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
return false;
}
}
POStagsWithCount = new CorpusAggregator().CorpusAggregate(toAggregate);
} catch (Exception e) {
e.printStackTrace();
return false;
}
return true;
}
/**
*
* @param filepath
*/
public void writeOutput(String filepath) {
//Write CSV
this.writeCsv(POStagsWithCount, filepath);
}
private void writeCsv(List<String[]> taggedPOS, String filepath) {
System.out.println("POS Size=" + taggedPOS.size());
StringBuffer sb = new StringBuffer();
sb.append("Word,POS,Frequency\n");
String toWrite = "";
for (int i1 = 0; i1 < taggedPOS.size(); i1++) {
// System.out.println(corpusStatsWithTFIDF.get(i1).length);
toWrite = taggedPOS.get(i1)[0] + "," + taggedPOS.get(i1)[1] + "," + taggedPOS.get(i1)[2] + "\n";
sb.append(toWrite);
}
// System.out.println("size of string to write=" + sb.toString().length());
// 2016.03 Add this code to delete existing file
File toDelete = new File(filepath);
if (toDelete.exists()) {
toDelete.delete();
}
//
FileData.writeDataIntoFile(sb.toString(), filepath);
}
}