package chipmunk.segmenter;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import marmot.util.FileUtils;
/**
 * Reduces words to a stem by segmenting them with a {@link Segmenter} and
 * concatenating only those segments whose tag is selected by the configured
 * {@link Mode}.
 */
public class Stemmer {

    /**
     * Selects which segments are kept when building a stem. In
     * {@link #STEMMING} mode segments whose tag starts with {@code "DERI"}
     * are kept in addition to those kept in {@link #ROOT_DETECTION} mode.
     */
    public enum Mode {
        ROOT_DETECTION, STEMMING
    }

    private final Segmenter segmenter_;
    private final Mode mode_;

    /**
     * Creates a stemmer.
     *
     * @param segmenter the segmenter used to split every word into tagged segments
     * @param mode      decides whether {@code "DERI"}-tagged segments are kept
     */
    public Stemmer(Segmenter segmenter, Mode mode) {
        segmenter_ = segmenter;
        mode_ = mode;
    }

    /**
     * Segments {@code word} and concatenates, in order, every segment whose
     * tag is accepted by {@link #keepSegment(String)}.
     *
     * @param word the word to stem
     * @return the concatenation of the kept segments, or the literal string
     *         {@code "EMPTY"} when no segment was kept
     */
    String stem(Word word) {
        SegmentationReading reading = segmenter_.segment(word);
        StringBuilder sb = new StringBuilder();

        // Segments and tags are walked in lockstep: the i-th tag is taken to
        // describe the i-th segment.
        Iterator<String> segments = reading.getSegments().iterator();
        Iterator<String> tags = reading.getTags().iterator();
        while (segments.hasNext()) {
            String segment = segments.next();
            String tag = tags.next();
            if (keepSegment(tag)) {
                sb.append(segment);
            }
        }

        if (sb.length() == 0) {
            // Placeholder so that callers never receive an empty stem.
            return "EMPTY";
        }
        return sb.toString();
    }

    /**
     * Tells whether a segment carrying {@code tag} contributes to the stem.
     * Tags starting with {@code "ROOT"}, {@code TagSet.NUMBER} or
     * {@code TagSet.SPECIAL} are always kept; tags starting with
     * {@code "DERI"} are kept only in {@link Mode#STEMMING} mode.
     */
    private boolean keepSegment(String tag) {
        return tag.startsWith("ROOT")
                || (tag.startsWith("DERI") && mode_ == Mode.STEMMING)
                || tag.startsWith(TagSet.NUMBER)
                || tag.startsWith(TagSet.SPECIAL);
    }

    /**
     * Stems every word supplied by {@code words} and writes one line per word
     * to {@code outfile}, in the form {@code word<TAB>stem<NEWLINE>}.
     *
     * @param outfile path of the file to write
     * @param words   source of the words to stem
     * @throws IOException if opening, writing or closing the file fails
     */
    public void stemToFile(String outfile, SegmentationDataReader words)
            throws IOException {
        Writer writer = FileUtils.openFileWriter(outfile);
        try {
            for (Word word : words) {
                String stem = stem(word);
                writer.write(word.getWord());
                writer.write('\t');
                writer.write(stem);
                writer.write('\n');
            }
        } finally {
            // Close even when stemming or writing fails, so the file handle
            // is never leaked.
            writer.close();
        }
    }
}