package edu.stanford.nlp.pipeline;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.ChineseCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.Timing;
/**
* This class adds Chinese word segmentation information to an
* Annotation.
* It assumes the original text is stored under
* CoreAnnotations.TextAnnotation, splits it into per-character
* CoreLabels stored under ChineseCoreAnnotations.CharactersAnnotation
* (with segment-start markers in CoreAnnotations.ChineseSegAnnotation),
* and adds the segmented words under CoreAnnotations.TokensAnnotation.
*
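* A minimal usage sketch (the "segment" property prefix and the model and
* dictionary paths below are illustrative placeholders, not shipped resource
* locations; substitute the paths to your own trained model and dictionaries):
* <pre>{@code
*   Properties props = new Properties();
*   props.setProperty("segment.model", "/path/to/segmenter-model.ser.gz");
*   props.setProperty("segment.serDictionary", "/path/to/dict-chris6.ser.gz");
*   props.setProperty("segment.sighanCorporaDict", "/path/to/sighan/data");
*   ChineseSegmenterAnnotator segmenter = new ChineseSegmenterAnnotator("segment", props);
*   Annotation annotation = new Annotation("这是一个测试。");
*   segmenter.annotate(annotation);
*   List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
* }</pre>
*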
* @author Pi-Chuan Chang
*/
public class ChineseSegmenterAnnotator implements Annotator {
private AbstractSequenceClassifier<?> segmenter = null;
private Timing timer = new Timing();
private static long millisecondsAnnotating = 0;
private boolean VERBOSE = false;
private static final String DEFAULT_SEG_LOC =
"/u/nlp/data/gale/segtool/stanford-seg/classifiers-2010/05202008-ctb6.processed-chris6.lex.gz";
private static final String DEFAULT_SER_DICTIONARY =
"/u/nlp/data/gale/segtool/stanford-seg/classifiers/dict-chris6.ser.gz";
private static final String DEFAULT_SIGHAN_CORPORA_DICT =
"/u/nlp/data/gale/segtool/stanford-seg/releasedata";
public ChineseSegmenterAnnotator() {
this(DEFAULT_SEG_LOC, false);
}
public ChineseSegmenterAnnotator(boolean verbose) {
this(DEFAULT_SEG_LOC, verbose);
}
public ChineseSegmenterAnnotator(String segLoc, boolean verbose) {
this(segLoc, verbose, DEFAULT_SER_DICTIONARY, DEFAULT_SIGHAN_CORPORA_DICT);
}
public ChineseSegmenterAnnotator(String segLoc, boolean verbose, String serDictionary, String sighanCorporaDict) {
VERBOSE = verbose;
Properties props = new Properties();
props.setProperty("serDictionary", serDictionary);
props.setProperty("sighanCorporaDict", sighanCorporaDict);
loadModel(segLoc, props);
}
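/**
 * Creates the annotator from a Properties object. Only properties prefixed
 * with the annotator name are used (the prefix is stripped before they are
 * passed to the segmenter); the name + ".model" property is required, and
 * name + ".verbose" controls logging.
 */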
public ChineseSegmenterAnnotator(String name, Properties props) {
String model = null;
// Keep only the properties that apply to this annotator
Properties modelProps = new Properties();
for (String key : props.stringPropertyNames()) {
if (key.startsWith(name + ".")) {
// skip past name and the subsequent "."
String modelKey = key.substring(name.length() + 1);
if (modelKey.equals("model")) {
model = props.getProperty(key);
} else {
modelProps.setProperty(modelKey, props.getProperty(key));
}
}
}
this.VERBOSE = PropertiesUtils.getBool(props, name + ".verbose", true);
if (model == null) {
throw new RuntimeException("Expected a property " + name + ".model");
}
loadModel(model, modelProps);
}
private void loadModel(String segLoc) {
if (VERBOSE) {
timer.start();
System.err.print("Loading Segmentation Model ["+segLoc+"]...");
}
segmenter = CRFClassifier.getClassifierNoExceptions(segLoc);
if (VERBOSE) {
timer.stop("done.");
}
}
private void loadModel(String segLoc, Properties props) {
if (VERBOSE) {
timer.start();
System.err.print("Loading Segmentation Model ["+segLoc+"]...");
}
try {
segmenter = CRFClassifier.getClassifier(segLoc, props);
} catch (RuntimeException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException(e);
}
if (VERBOSE) {
timer.stop("done.");
}
}
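/**
 * Adds segmentation annotations to the Annotation. If sentence splitting has
 * already been run, each sentence is segmented separately; otherwise the
 * whole document text is segmented at once.
 */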
@Override
public void annotate(Annotation annotation) {
if (VERBOSE) {
timer.start();
System.err.print("Adding Segmentation annotation...");
}
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
if (sentences != null) {
for (CoreMap sentence : sentences) {
doOneSentence(sentence);
}
} else {
doOneSentence(annotation);
}
if (VERBOSE) {
millisecondsAnnotating += timer.stop("done.");
}
}
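/**
 * Segments one sentence (or the whole document when sentences are absent):
 * splits the text into per-character CoreLabels and then runs the segmenter.
 */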
public void doOneSentence(CoreMap annotation) {
splitCharacters(annotation);
runSegmentation(annotation);
}
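/**
 * Splits the text into one CoreLabel per character, skipping whitespace and
 * control characters, and records each character's offsets and whether it
 * must start a new segment. The resulting list is stored under
 * ChineseCoreAnnotations.CharactersAnnotation.
 */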
public void splitCharacters(CoreMap annotation) {
String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
boolean seg = true;
List<CoreLabel> words = new ArrayList<CoreLabel>();
for (int i = 0; i < origText.length(); i++) {
CoreLabel wi = new CoreLabel();
String wordString = String.valueOf(origText.charAt(i));
// if this character is whitespace or a control character, skip it and mark that the next character starts a new segment
if (Character.isWhitespace(origText.charAt(i)) || Character.isISOControl(origText.charAt(i))) {
seg = true;
continue;
} else {
// otherwise, record the character and whether it begins a new segment ("1" = segment start, "0" = continuation)
wi.set(CoreAnnotations.ChineseCharAnnotation.class, wordString);
if (seg) {
wi.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
} else {
wi.set(CoreAnnotations.ChineseSegAnnotation.class, "0");
}
wi.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, i);
wi.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, (i + 1));
words.add(wi);
seg = false;
}
}
annotation.set(ChineseCoreAnnotations.CharactersAnnotation.class, words);
if (VERBOSE) {
System.err.println("output: " + words);
}
}
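/**
 * Runs the CRF segmenter over the sentence text and converts the resulting
 * words into CoreLabel tokens with character offsets, stored under
 * CoreAnnotations.TokensAnnotation.
 */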
public void runSegmentation(CoreMap annotation) {
// Map each segmented word back onto the per-character CoreLabels produced by
// splitCharacters() so that character offsets can be recovered for each token.
String text = annotation.get(CoreAnnotations.TextAnnotation.class);
List<CoreLabel> sentChars = annotation.get(ChineseCoreAnnotations.CharactersAnnotation.class);
List<CoreLabel> tokens = new ArrayList<CoreLabel>();
annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
List<String> words = segmenter.segmentString(text);
if (VERBOSE) {
System.err.println(text);
System.err.println("--->");
System.err.println(words);
}
int pos = 0;
for (String w : words) {
// skip any empty strings the segmenter may produce
if (w.length() == 0) {
continue;
}
// the character at position pos starts a new word
CoreLabel fl = sentChars.get(pos);
fl.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
CoreLabel token = new CoreLabel();
token.setWord(w);
// begin offset comes from the first character of the word
token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
pos += w.length();
// end offset comes from the last character of the word
fl = sentChars.get(pos - 1);
token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, fl.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
tokens.add(token);
}
}
@Override
public Set<Requirement> requires() {
return Collections.emptySet();
}
@Override
public Set<Requirement> requirementsSatisfied() {
return Collections.singleton(TOKENIZE_REQUIREMENT);
}
}