package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.*;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SegmenterCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
/**
 * This class adds Chinese word-segmentation information to an Annotation.
 * If the document has already been split into sentences (a List of sentences
 * under the SentencesAnnotation.class key, each sentence carrying a
 * TextAnnotation.class key), each sentence is segmented separately; otherwise
 * the whole document is segmented as a single piece of text. Prior to
 * segmentation, this Annotator records per-character information under a
 * CharactersAnnotation.class key; after segmentation, it stores the resulting
 * List of CoreLabel tokens under a TokensAnnotation.class key.
 *
 * @author Pi-Chuan Chang
 */
public class ChineseSegmenterAnnotator implements Annotator {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(ChineseSegmenterAnnotator.class);
private AbstractSequenceClassifier<?> segmenter;
private final boolean VERBOSE;
private static final String DEFAULT_SEG_LOC =
"/u/nlp/data/gale/segtool/stanford-seg/classifiers-2010/05202008-ctb6.processed-chris6.lex.gz";
private static final String DEFAULT_SER_DICTIONARY =
"/u/nlp/data/gale/segtool/stanford-seg/classifiers/dict-chris6.ser.gz";
private static final String DEFAULT_SIGHAN_CORPORA_DICT =
"/u/nlp/data/gale/segtool/stanford-seg/releasedata";
public ChineseSegmenterAnnotator() {
this(DEFAULT_SEG_LOC, false);
}
public ChineseSegmenterAnnotator(boolean verbose) {
this(DEFAULT_SEG_LOC, verbose);
}
public ChineseSegmenterAnnotator(String segLoc, boolean verbose) {
this(segLoc, verbose, DEFAULT_SER_DICTIONARY, DEFAULT_SIGHAN_CORPORA_DICT);
}
public ChineseSegmenterAnnotator(String segLoc, boolean verbose, String serDictionary, String sighanCorporaDict) {
VERBOSE = verbose;
Properties props = new Properties();
props.setProperty("serDictionary", serDictionary);
props.setProperty("sighanCorporaDict", sighanCorporaDict);
loadModel(segLoc, props);
}
public ChineseSegmenterAnnotator(String name, Properties props) {
String model = null;
// Keep only the properties that apply to this annotator
Properties modelProps = new Properties();
String desiredKey = name + '.';
for (String key : props.stringPropertyNames()) {
if (key.startsWith(desiredKey)) {
// skip past name and the subsequent "."
String modelKey = key.substring(desiredKey.length());
if (modelKey.equals("model")) {
model = props.getProperty(key);
} else {
modelProps.setProperty(modelKey, props.getProperty(key));
}
}
}
this.VERBOSE = PropertiesUtils.getBool(props, name + ".verbose", false);
if (model == null) {
throw new RuntimeException("Expected a property " + name + ".model");
}
loadModel(model, modelProps);
}
@SuppressWarnings("unused")
private void loadModel(String segLoc) {
// don't write very much, because the CRFClassifier already reports loading
if (VERBOSE) {
log.info("Loading segmentation model ... ");
}
segmenter = CRFClassifier.getClassifierNoExceptions(segLoc);
}
private void loadModel(String segLoc, Properties props) {
// don't write very much, because the CRFClassifier already reports loading
if (VERBOSE) {
log.info("Loading Segmentation Model ... ");
}
try {
segmenter = CRFClassifier.getClassifier(segLoc, props);
} catch (RuntimeException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
@Override
public void annotate(Annotation annotation) {
if (VERBOSE) {
log.info("Adding Segmentation annotation ... ");
}
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
if (sentences != null) {
for (CoreMap sentence : sentences) {
doOneSentence(sentence);
}
} else {
doOneSentence(annotation);
}
}
private void doOneSentence(CoreMap annotation) {
splitCharacters(annotation);
runSegmentation(annotation);
}
private static void splitCharacters(CoreMap annotation) {
String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
boolean seg = true;
List<CoreLabel> words = new ArrayList<>();
for (int i = 0; i < origText.length(); i++) {
CoreLabel wi = new CoreLabel();
char[] ca = {origText.charAt(i)};
String wordString = new String(ca);
// if this word is a whitespace or a control character, set 'seg' to true for next word, and break
if ((Character.isSpaceChar(origText.charAt(i)) || Character.isISOControl(origText.charAt(i))) &&
! (origText.charAt(i) == '\n' || origText.charAt(i) == '\r')) {
seg = true;
} else if (Character.isISOControl(origText.charAt(i))) {
// skip it but don't set seg
seg = false;
} else {
// if this word is a word, put it as a feature label and set seg to false for next word
wi.set(CoreAnnotations.ChineseCharAnnotation.class, wordString);
if (seg) {
wi.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
} else {
wi.set(CoreAnnotations.ChineseSegAnnotation.class, "0");
}
wi.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, i);
wi.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, (i + 1));
words.add(wi);
seg = false;
}
}
annotation.set(SegmenterCoreAnnotations.CharactersAnnotation.class, words);
}
private void runSegmentation(CoreMap annotation) {
//0 2
// A BC D E
// 1 10 1 1
// 0 12 3 4
// 0, 0+1 ,
String text = annotation.get(CoreAnnotations.TextAnnotation.class);
List<CoreLabel> sentChars = annotation.get(SegmenterCoreAnnotations.CharactersAnnotation.class);
List<CoreLabel> tokens = new ArrayList<>();
annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
text = text.replaceAll("[\n\r]", "");
List<String> words = segmenter.segmentString(text);
if (VERBOSE) {
log.info(text);
log.info("--->");
log.info(words);
}
int pos = 0;
for (String w : words) {
CoreLabel fl = sentChars.get(pos);
fl.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
if (w.isEmpty()) {
continue;
}
CoreLabel token = new CoreLabel();
token.setWord(w);
token.setValue(w);
token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
pos += w.length();
fl = sentChars.get(pos - 1);
token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, fl.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
if (VERBOSE) {
log.info("Adding token " + token.toShorterString());
}
tokens.add(token);
}
}
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
return Collections.emptySet();
}
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
return new HashSet<>(Arrays.asList(
CoreAnnotations.TextAnnotation.class,
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.CharacterOffsetBeginAnnotation.class,
CoreAnnotations.CharacterOffsetEndAnnotation.class,
CoreAnnotations.BeforeAnnotation.class,
CoreAnnotations.AfterAnnotation.class,
CoreAnnotations.TokenBeginAnnotation.class,
CoreAnnotations.TokenEndAnnotation.class,
CoreAnnotations.PositionAnnotation.class,
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.ValueAnnotation.class
));
}
}