StemmingBody.java example

Explorer
context-master
- src
  - context
package context.core.task.stemming;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.List;
import org.apache.commons.io.FilenameUtils;

/**
 * Lemmatizes every file of an input corpus and writes the result to an output corpus.
 *
 * <p>The input corpus, the output corpus and the CoreNLP pipeline are all obtained from the
 * {@link StemmingTaskInstance} passed to the constructor.
 *
 * @author Aale
 */
public class StemmingBody {

    /**
     * Value exposed through {@link #getResult()}. Nothing in this class assigns it, so it stays
     * {@code null} unless other code in the package sets it.
     */
    String Result;

    private final StemmingTaskInstance instance;
    private final CorpusData input;
    private final CorpusData output;
    private final StanfordCoreNLP pipeline;

    /**
     * Creates a stemming body bound to the given task instance.
     *
     * @param instance task instance supplying the input corpus ({@code getInput()}), the output
     *                 corpus ({@code getTextOutput()}) and the annotation pipeline
     *                 ({@code getPipeline()})
     */
    public StemmingBody(StemmingTaskInstance instance) {
        this.instance = instance;
        this.input = (CorpusData) instance.getInput();
        this.output = (CorpusData) instance.getTextOutput();
        this.pipeline = instance.getPipeline();
    }

    /**
     * Lemmatizes each input file and writes the lemmatized text to the output corpus.
     *
     * <p>For every file the text is read, sanitized, annotated with the pipeline, and each
     * sentence's tokens are passed to {@code LemmaTagger.lemmatize(tokens, "en")}. The tags of
     * the returned words are joined with single spaces (a trailing space is kept) and written to
     * an output file named {@code <base>-ST.<extension>}.
     *
     * <p>Processing stops at the first exception raised while handling a file; output files
     * written before that point are left in place. Fetching the file list happens before the
     * guarded section, so a failure there propagates to the caller instead of returning
     * {@code false}.
     *
     * @return {@code true} if every file was processed, {@code false} if an exception occurred
     *         while processing a file
     */
    public boolean StemText() {
        System.out.println("Begin of StemText");
        List<FileData> files = input.getFiles();
        System.out.println("file list size=" + files.size());
        try {
            for (FileData file : files) {
                System.out.println("processing " + file.getPath());
                String text = file.readFileIntoString();

                // Blank out control characters, then everything outside the small ASCII
                // whitelist (letters, digits, space and basic punctuation).
                text = text.replaceAll("\\p{Cc}", " ");
                text = text.replaceAll("[^A-Za-z0-9 :;!\\?\\.,\'\"-]", " ");

                // Create an Annotation holding just the text and run the pipeline's annotators.
                Annotation document = new Annotation(text);
                pipeline.annotate(document);

                // A CoreMap is a map keyed by annotation class; here each one is a sentence.
                List<CoreMap> sentences = document.get(SentencesAnnotation.class);

                // StringBuilder avoids re-copying the whole text for every appended token.
                StringBuilder lemmatizedText = new StringBuilder();
                for (CoreMap sentence : sentences) {
                    final List<CoreLabel> sent = sentence.get(TokensAnnotation.class);
                    final List<TaggedWord> taggedWords = LemmaTagger.lemmatize(sent, "en");
                    for (TaggedWord token : taggedWords) {
                        // The tag slot of each TaggedWord is used as the lemma
                        // (presumably filled in by LemmaTagger — confirm against that class).
                        lemmatizedText.append(token.tag()).append(' ');
                    }
                }

                // NOTE(review): for a file name without an extension this yields a name ending
                // in "-ST." (trailing dot) — confirm whether that is intended.
                String fileName = file.getFile().getName();
                final String name = FilenameUtils.getBaseName(fileName)
                        + "-ST." + FilenameUtils.getExtension(fileName);

                int index = output.addFile(name);
                System.out.println("write file " + name);
                output.writeFile(index, lemmatizedText.toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Returns the {@code Result} field.
     *
     * @return the current value of {@code Result}; {@code null} if it was never assigned
     */
    public String getResult() {
        return Result;
    }

}