package context.core.task.stemming;
import context.core.entity.CorpusData;
import context.core.entity.FileData;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.List;
import org.apache.commons.io.FilenameUtils;
/**
*
* @author Aale
*/
public class StemmingBody {

    /** Result string exposed via {@link #getResult()}; never assigned in this class. */
    String Result;
    /** Task instance supplying the input corpus, output corpus and NLP pipeline. */
    private final StemmingTaskInstance instance;
    private CorpusData input;
    private CorpusData output;
    private StanfordCoreNLP pipeline;

    /**
     * Creates a stemming body bound to the given task instance and wires up
     * its input corpus, output corpus and CoreNLP pipeline.
     *
     * @param instance the task instance providing input, output and pipeline
     */
    public StemmingBody(StemmingTaskInstance instance) {
        this.instance = instance;
        init();
    }

    /** Pulls the input corpus, output corpus and pipeline out of the task instance. */
    private void init() {
        this.input = (CorpusData) instance.getInput();
        this.output = (CorpusData) instance.getTextOutput();
        this.pipeline = instance.getPipeline();
    }

    /**
     * Lemmatizes the text of every file in the input corpus and writes each
     * result into the output corpus under the name {@code <base>-ST.<ext>}.
     *
     * @return {@code true} if every file was processed and written,
     *         {@code false} if any file failed (the exception is printed)
     */
    public boolean StemText() {
        System.out.println("Begin of StemText");
        List<FileData> files = input.getFiles();
        System.out.println("file list size=" + files.size());
        try {
            for (FileData file : files) {
                System.out.println("processing " + file.getPath());
                // Replace control characters and anything outside a basic
                // ASCII text alphabet with spaces before annotation.
                String text = file.readFileIntoString()
                        .replaceAll("\\p{Cc}", " ")
                        .replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");
                // Run all configured annotators on the cleaned text.
                Annotation document = new Annotation(text);
                pipeline.annotate(document);
                List<CoreMap> sentences = document.get(SentencesAnnotation.class);
                // StringBuilder avoids O(n^2) string concatenation in the loop.
                StringBuilder lemmatized = new StringBuilder();
                for (CoreMap sentence : sentences) {
                    final List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
                    // NOTE(review): LemmaTagger appears to store the lemma in the
                    // TaggedWord's tag slot (per the original comments) — confirm.
                    for (TaggedWord token : LemmaTagger.lemmatize(tokens, "en")) {
                        lemmatized.append(token.tag()).append(' ');
                    }
                }
                String baseName = FilenameUtils.getBaseName(file.getFile().getName());
                String extension = FilenameUtils.getExtension(file.getFile().getName());
                final String name = baseName + "-ST." + extension;
                int index = output.addFile(name);
                System.out.println("write file " + name);
                output.writeFile(index, lemmatized.toString());
            }
        } catch (Exception e) {
            // Boundary catch: report the failure and signal it to the caller.
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * @return the stored result string; {@code null} unless {@code Result}
     *         was set externally (it is never assigned in this class)
     */
    public String getResult() {
        return Result;
    }
}