/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package org.wikipedia.miner.util.text;
import java.io.File;
import java.io.IOException;
import java.text.Normalizer;
import java.util.regex.Pattern;
import org.wikipedia.miner.db.WEnvironment;
import org.wikipedia.miner.util.WikipediaConfiguration;
/**
 * Text processor that removes accents, applies case folding and handles
 * plurals using PlingStemmer from http://mpii.de/yago-naga/javatools
 *
 * <p>The steps are applied in a fixed order: Unicode NFD normalization,
 * removal of combining diacritical marks, case folding through
 * {@link CaseFolder}, and finally stemming through {@link PlingStemmer}.
 *
 * @author angel
 */
public class CaseAccentSimpleTextProcessor extends TextProcessor {

    /**
     * Matches runs of characters from the Combining Diacritical Marks Unicode
     * block. Compiled once and shared by all instances, since a compiled
     * {@link Pattern} is immutable and safe to reuse.
     */
    private static final Pattern DIACRITICAL_MARKS =
            Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    private final CaseFolder caseFolder = new CaseFolder();
    private final PlingStemmer stemmer = new PlingStemmer();

    /**
     * Strips accents from the given text, then passes it through the case
     * folder and the stemmer.
     *
     * @param string the text to process
     * @return the text after accent removal, case folding and stemming
     * @throws NullPointerException if {@code string} is {@code null}
     *         (raised by {@link Normalizer#normalize})
     */
    @Override
    public String processText(String string) {
        // NFD decomposes accented characters into a base character followed by
        // combining marks, so the marks can be removed on their own.
        String decomposed = Normalizer.normalize(string, Normalizer.Form.NFD);
        String withoutAccents = DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");

        // Case folding runs before stemming, matching the original ordering.
        String folded = caseFolder.processText(withoutAccents);
        return stemmer.stem(folded);
    }
}