package context.core.task.corpusstat;
import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.task.pos.POSTagger;
import context.core.util.CorpusAggregator;
import context.core.util.ForAggregationNoCase;
import context.core.util.JavaIO;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.util.CoreMap;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
/**
 * Computes per-term corpus statistics (total frequency, TF*IDF, ratio of
 * documents the term occurs in, and optionally part of speech) over the files
 * of a {@link CorpusData} corpus, and writes the result as CSV.
 *
 * @author Aale
 */
public class CorpusStatisticsBody {

    /** Tokens kept in the POS run: word-like strings (letters/digits/_ /@/#) or known emoticons. */
    private static final String POS_WORD_PATTERN =
            "[a-zA-Z0-9_@#]*|:\\)|:-\\)|:\\(|:-\\(|:\\/|:-\\/|:\\\\|:-\\\\|:p|:-p|;\\)|;-\\)|:>|:->";
    /** Emoticons recovered from a raw whitespace split in the POS run. */
    private static final String POS_EMOTICON_PATTERN =
            ":\\)|:-\\)|:\\(|:-\\(|:\\/|:-\\/|:\\\\|:-\\\\|:p|:-p|;\\)|;-\\)|:>|:->";
    /** Tokens kept in the no-POS run (smaller emoticon set than the POS run). */
    private static final String PLAIN_WORD_PATTERN =
            "[a-zA-Z0-9_@#]*|:\\)|:\\(|:\\/|:\\\\|:p|;\\)|;-\\)";
    /** Emoticons recovered from a raw whitespace split in the no-POS run. */
    private static final String PLAIN_EMOTICON_PATTERN =
            ":\\)|:\\(|:\\/|:\\\\|:p|;\\)|;-\\)";

    /** Task instance supplying the input corpus, pipeline and output holders. */
    private CorpusStatTaskInstance instance;
    /** Corpus to analyse; taken from {@code instance} in {@link #init()}. */
    private CorpusData input;
    /** Tabular output holder from the task instance (kept for parity; not populated here). */
    private List<TabularData> tabularOutput;
    /** Shared Stanford CoreNLP pipeline used for sentence splitting / tokenization. */
    private StanfordCoreNLP pipeline;
    /** Result rows: {term, frequency, tf*idf, docRatio, pos}. */
    private List<String[]> corpusStatsWithTFIDF;

    /**
     * Creates a statistics body bound to the given task instance.
     *
     * @param instance task instance providing input corpus, pipeline and outputs
     */
    public CorpusStatisticsBody(CorpusStatTaskInstance instance) {
        this.instance = instance;
        init();
    }

    /** Pulls the input corpus, tabular output list and NLP pipeline from the instance. */
    private void init() {
        this.input = (CorpusData) instance.getInput();
        this.tabularOutput = instance.getTabularOutput();
        this.pipeline = instance.getPipeline();
    }

    /**
     * Runs the corpus statistics computation.
     *
     * @param includePOS whether to POS-tag tokens and keep per-POS statistics
     * @return true on success, false if any file failed to process
     */
    public boolean RunCorpusStatistics(boolean includePOS) {
        if (includePOS) {
            return RunCorpusStatisticsWithPOS();
        } else {
            return RunCorpusStatisticsWithoutPOS();
        }
    }

    /**
     * Computes statistics with POS tagging. Each kept token is recorded as
     * {word, posTag, fileName, "1"}; words whose tag does not contain "NN"
     * (i.e. non-nouns) are lower-cased first. Emoticons, which the NLP
     * tokenizer breaks apart, are recovered by rescanning a raw whitespace
     * split of the text and are tagged "A".
     *
     * @return true on success, false if reading or annotation failed
     */
    public boolean RunCorpusStatisticsWithPOS() {
        System.out.println("CorpusStat with POS");
        List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
        int numTerms = 0;
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                List<String[]> corpusStatTags = new ArrayList<String[]>();
                try {
                    String text = JavaIO.readFile(file);
                    Annotation document = new Annotation(text);
                    pipeline.annotate(document);
                    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
                        // A CoreLabel is a CoreMap with additional token-specific methods.
                        final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                        final List<TaggedWord> taggedWords = POSTagger.tag(sent, instance.getLanguage());
                        for (TaggedWord token : taggedWords) {
                            String word = token.word();
                            String pos = token.tag();
                            // Only nouns (tags containing "NN") keep their original case.
                            if (!pos.contains("NN")) {
                                // BUGFIX: was `word == "The"` — a reference comparison that is
                                // effectively never true for tokenizer output; use equals().
                                if ("The".equals(word)) {
                                    System.out.println("Que?");
                                }
                                word = word.toLowerCase();
                            }
                            // Skip tokens that are neither word-like nor a known emoticon.
                            if (!word.matches(POS_WORD_PATTERN)) {
                                continue;
                            }
                            corpusStatTags.add(new String[]{word, pos, file.getName(), Integer.toString(1)});
                            numTerms++;
                        }
                    }
                    // Tokenization mangles emoticons, so rescan the raw text for them.
                    numTerms = collectEmoticons(text, file.getName(), POS_EMOTICON_PATTERN,
                            corpusStatTags, numTerms);
                    toAggregate.add(corpusStatTags);
                } catch (IOException e) {
                    e.printStackTrace();
                    return false;
                }
            }
            this.corpusStatsWithTFIDF = buildTfIdfRows(toAggregate, numTerms, files.size());
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Computes statistics without POS tagging. Every kept token is recorded as
     * {word, "A", fileName, "1"}. Emoticons are recovered from a raw
     * whitespace split of the text, as in the POS variant, but with a smaller
     * emoticon set.
     *
     * @return true on success, false if reading a file failed
     */
    public boolean RunCorpusStatisticsWithoutPOS() {
        System.out.println("CorpusStat without POS");
        List<List<String[]>> toAggregate = new ArrayList<List<String[]>>();
        int numTerms = 0;
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                List<String[]> corpusStatTags = new ArrayList<String[]>();
                try {
                    String text = JavaIO.readFile(file);
                    for (String word : getTokens(text)) {
                        // Skip tokens that are neither word-like nor a known emoticon.
                        if (!word.matches(PLAIN_WORD_PATTERN)) {
                            continue;
                        }
                        System.out.println("pass:" + numTerms);
                        corpusStatTags.add(new String[]{word, "A", file.getName(), Integer.toString(1)});
                        numTerms++;
                    }
                    // Tokenization mangles emoticons, so rescan the raw text for them.
                    numTerms = collectEmoticons(text, file.getName(), PLAIN_EMOTICON_PATTERN,
                            corpusStatTags, numTerms);
                    toAggregate.add(corpusStatTags);
                } catch (IOException e) {
                    e.printStackTrace();
                    return false;
                }
            }
            System.out.println("toAggregate" + toAggregate.size());
            this.corpusStatsWithTFIDF = buildTfIdfRows(toAggregate, numTerms, files.size());
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Rescans the raw text (split on single spaces) for emoticon tokens that
     * the NLP tokenizer breaks apart, appending any matches to {@code sink}
     * tagged "A".
     *
     * @param text          raw file contents
     * @param fileName      name of the source file, stored in each row
     * @param emoticonRegex alternation of emoticon patterns to recognise
     * @param sink          per-file row list to append to
     * @param numTerms      running total term count
     * @return the updated running term count
     */
    private int collectEmoticons(String text, String fileName, String emoticonRegex,
                                 List<String[]> sink, int numTerms) {
        for (String token : text.split(" ")) {
            if (token.matches(emoticonRegex)) {
                System.out.println("pass:" + numTerms);
                sink.add(new String[]{token, "A", fileName, Integer.toString(1)});
                numTerms++;
            } else {
                System.out.println("regex dint match " + token);
            }
        }
        return numTerms;
    }

    /**
     * Aggregates per-file counts across the corpus and derives the final
     * {term, frequency, tf*idf, docRatio, pos} rows. Shared by both run
     * variants (the logic was previously duplicated in each).
     *
     * @param toAggregate per-file lists of {term, pos, fileName, count} rows
     * @param numTerms    total number of terms seen across the corpus
     * @param numFiles    number of documents in the corpus
     * @return one row per distinct (term, pos) pair
     */
    private List<String[]> buildTfIdfRows(List<List<String[]>> toAggregate, int numTerms, int numFiles) {
        // Collapse duplicate rows within each file, case-insensitively.
        List<String[]> statsByFile = new CorpusAggregator().CorpusAggregateNoCase(toAggregate);
        // key -> {number of documents containing the term, total occurrence count}
        HashMap<ForAggregationNoCase, Integer[]> stats = new HashMap<ForAggregationNoCase, Integer[]>();
        for (String[] row : statsByFile) {
            ForAggregationNoCase key = new ForAggregationNoCase(new String[]{row[0], row[1]});
            Integer[] current = stats.get(key);
            int count = Integer.parseInt(row[3]);
            if (current != null) {
                stats.put(key, new Integer[]{current[0] + 1, current[1] + count});
            } else {
                stats.put(key, new Integer[]{1, count});
            }
        }
        List<String[]> rows = new ArrayList<String[]>();
        for (ForAggregationNoCase key : stats.keySet()) {
            Integer[] docsAndCount = stats.get(key);
            String[] row = new String[5];
            row[0] = key.toAggregate[0]; // term
            row[4] = key.toAggregate[1]; // POS tag ("A" when POS is disabled)
            row[1] = Integer.toString(docsAndCount[1]); // total frequency
            // Ratio of documents the term occurs in.
            row[3] = Float.toString(((float) docsAndCount[0]) / ((float) numFiles));
            // Float.MIN_VALUE in the denominators guards against division by zero.
            float tf = Float.parseFloat(row[1]) / (Float.MIN_VALUE + numTerms);
            float idf = (float) Math.log10(((float) numFiles) / (Float.MIN_VALUE + ((float) docsAndCount[0])));
            row[2] = Float.toString(tf * idf);
            rows.add(row);
        }
        return rows;
    }

    /**
     * Writes the computed statistics as CSV to {@code filepath}.
     *
     * @param filepath   destination path (any existing file is replaced)
     * @param includePOS whether to include the part-of-speech column
     */
    public void writeOutput(String filepath, boolean includePOS) {
        this.writeCsv(corpusStatsWithTFIDF, filepath, includePOS);
    }

    /**
     * Serializes the statistics rows to CSV. With POS enabled, all five
     * columns are written; otherwise the POS column (index 4) is omitted.
     */
    private void writeCsv(List<String[]> rows, String filepath, boolean includePOS) {
        // StringBuilder instead of StringBuffer: no cross-thread sharing here.
        StringBuilder sb = new StringBuilder();
        if (includePOS) {
            sb.append("Term, Frequency, TF*IDF, Ratio of texts occurring in, Part of speech" + "\n");
        } else {
            sb.append("Term, Frequency, TF*IDF, Ratio of texts occurring in" + "\n");
        }
        int columns = includePOS ? 5 : 4;
        for (String[] row : rows) {
            for (int c = 0; c < columns; c++) {
                sb.append(row[c]);
                sb.append(c == columns - 1 ? "\n" : ",");
            }
        }
        System.out.println("in writecsv before writeDataIntoFile");
        // 2016.03: delete any stale output so writeDataIntoFile starts clean.
        File toDelete = new File(filepath);
        if (toDelete.exists()) {
            toDelete.delete();
        }
        FileData.writeDataIntoFile(sb.toString(), filepath);
    }

    /**
     * Tokenizes the text with the PTB tokenizer and returns each token's
     * original surface text.
     */
    private List<String> getTokens(String text) {
        // Parameterized types instead of the original raw ArrayList/PTBTokenizer.
        List<String> tokens = new ArrayList<String>();
        PTBTokenizer<CoreLabel> ptbt =
                new PTBTokenizer<CoreLabel>(new StringReader(text), new CoreLabelTokenFactory(), "");
        while (ptbt.hasNext()) {
            tokens.add(ptbt.next().originalText());
        }
        System.out.println("Tokens#:" + tokens.size());
        return tokens;
    }
}