/** ========================================================================
* handytrowel: src/main/java/nlp/TextAnalyzer.java
* Normalize/pre-process text, output post-processed text and n-grams.
* ========================================================================
* Copyright (c) 2014, Asim Ihsan, All rights reserved.
* <http://www.asimihsan.com>
* https://github.com/asimihsan/handytrowel/blob/master/LICENSE
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* ========================================================================
*/
package com.asimihsan.handytrowel.nlp;
import com.google.common.base.Joiner;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.Pair;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.regex.Pattern;

import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.englishStemmer;
/**
* Take a body of text, perform common pre-processing and normalization
* tasks, and return the post-processed body and n-gram statistics.
*
* @author Asim Ihsan
*/
public class TextAnalyzer {

    /**
     * Body of raw text that you wish to analyze.
     *
     * Required parameter via builder.
     */
    private final String body;

    /**
     * Tokens produced by a call to {@link #analyze()} and retrieved by a
     * call to {@link #getTokens()}. Append-only during analysis, then read
     * sequentially, so ArrayList is the right default (was LinkedList).
     */
    private final List<String> tokens = new ArrayList<>();

    /**
     * Matches runs of punctuation, optionally flanked by a single letter.
     * Note that this also matches full stops, so we lose sentence
     * information.
     *
     * Sometimes Stanford CoreNLP's tokenizer spits out "'s" and "n't" on
     * its own, so we ignore single letters before/after punctuation too.
     *
     * Stanford CoreNLP renders brackets as "-lrb-"/"-rrb-"; the second
     * alternative chucks those too.
     *
     * Static: the pattern is a constant, so compile it once per class
     * rather than once per TextAnalyzer instance.
     */
    private static final Pattern PUNCTUATION =
        Pattern.compile("(?:[a-z]?[\\p{Punct}]+[a-z]?|-[lr].b-)");

    /**
     * Matches runs of digits so they can all be collapsed to the literal
     * token "NUMBER" (the previous javadoc said "$NUMBER", which did not
     * match what the code actually emits).
     */
    private static final Pattern NUMBER = Pattern.compile("[0-9]+");

    /**
     * Builder for {@link TextAnalyzer}; {@code body} is required.
     */
    public static class TextAnalyzerBuilder {
        private String body;

        /**
         * Sets the raw text to analyze.
         *
         * @param body raw text body
         * @return this builder, for chaining
         */
        public TextAnalyzerBuilder body(String body) {
            this.body = body;
            return this;
        }

        /**
         * @return a TextAnalyzer configured with this builder's state
         */
        public TextAnalyzer build() {
            return new TextAnalyzer(this);
        }
    }

    private TextAnalyzer(TextAnalyzerBuilder builder) {
        this.body = builder.body;
    }

    /**
     * Returns the tokens produced by {@link #analyze()}; empty until
     * analyze() has been called.
     *
     * NOTE(review): this exposes the internal mutable list — callers can
     * modify it. Kept as-is for backward compatibility; consider
     * Collections.unmodifiableList if no caller relies on mutation.
     *
     * @return the token list backing this analyzer
     */
    public List<String> getTokens() {
        return tokens;
    }

    /**
     * Tokenizes {@link #body} with Stanford CoreNLP, drops stopwords and
     * punctuation tokens, collapses digit runs to "NUMBER", and applies
     * Porter2 (Snowball) stemming to each surviving token. Results are
     * accumulated in {@link #tokens}.
     *
     * Lemmatization is deliberately avoided: it is very slow and requires
     * the very large CoreNLP models jar on the classpath. Stemming
     * aggressiveness order, for reference: Porter -> Snowball (Porter2)
     * -> Lancaster.
     *
     * Other ideas: remove the top 10k most common English words.
     *
     * @return this analyzer, for chaining; on failure to load the
     *         stopword list, returns with {@link #tokens} left empty
     */
    public TextAnalyzer analyze() {
        Properties props = new Properties();
        // setProperty consistently (the original mixed put/setProperty).
        props.setProperty("annotators", "tokenize, ssplit, stopword");
        props.setProperty("customAnnotatorClass.stopword",
            "com.asimihsan.handytrowel.nlp.StopwordAnnotator");

        List<String> stopWords;
        try {
            stopWords = WordReader
                .wordReaderWithResourcePath("/nlp/top1000words.txt")
                .getWords();
        } catch (IOException e) {
            // Best-effort: without the stopword list the pipeline cannot be
            // configured, so bail out leaving tokens empty.
            // TODO(review): report via a real logger rather than stderr.
            e.printStackTrace();
            return this;
        }
        props.setProperty(StopwordAnnotator.STOPWORDS_LIST,
            Joiner.on(",").join(stopWords));

        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation document = new Annotation(body);
        pipeline.annotate(document);

        SnowballStemmer stemmer = new englishStemmer();
        for (CoreLabel token : document.get(CoreAnnotations.TokensAnnotation.class)) {
            Pair<Boolean, Boolean> stopword = token.get(StopwordAnnotator.class);
            if (stopword.first()) {
                continue;
            }
            // Locale.ROOT gives locale-independent lowercasing (avoids
            // e.g. the Turkish dotless-i surprise under a Turkish locale).
            String word = token.word().toLowerCase(Locale.ROOT);
            //!!AI TODO this sucks, should make another annotator and make it optional etc.
            //also we're matching full stops! so we lose sentence information.
            if (PUNCTUATION.matcher(word).matches()) {
                continue;
            }
            //!AI TODO again this would be its own annotator and optional
            word = NUMBER.matcher(word).replaceAll("NUMBER");
            stemmer.setCurrent(word);
            stemmer.stem();
            tokens.add(stemmer.getCurrent());
        }
        return this;
    }

    // Removed: a large commented-out variant of analyze() that used the
    // "pos, lemma" annotators. Lemmatization needs the massive CoreNLP
    // models jar (stanford-corenlp-3.3.1-models.jar) and is not worth the
    // space; see git history for the exact snippet.
}