package edu.stanford.nlp.pipeline; import java.util.*; import edu.stanford.nlp.international.arabic.process.ArabicSegmenter; import edu.stanford.nlp.ling.SegmenterCoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.PropertiesUtils; import edu.stanford.nlp.util.logging.Redwood; /** * This class will add segmentation information to an Annotation. * It assumes that the original document is a List of sentences under the * SentencesAnnotation.class key, and that each sentence has a * TextAnnotation.class key. This Annotator adds corresponding * information under a CharactersAnnotation.class key prior to segmentation, * and a TokensAnnotation.class key with value of a List of CoreLabel * after segmentation. * * Based on the ChineseSegmenterAnnotator by Pi-Chuan Chang. * * @author Will Monroe */ public class ArabicSegmenterAnnotator implements Annotator { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ArabicSegmenterAnnotator.class); private ArabicSegmenter segmenter; private final boolean VERBOSE; private static final String DEFAULT_SEG_LOC = "/u/nlp/data/arabic-segmenter/arabic-segmenter-atb+bn+arztrain.ser.gz"; public ArabicSegmenterAnnotator() { this(DEFAULT_SEG_LOC, false); } public ArabicSegmenterAnnotator(boolean verbose) { this(DEFAULT_SEG_LOC, verbose); } public ArabicSegmenterAnnotator(String segLoc, boolean verbose) { VERBOSE = verbose; Properties props = new Properties(); loadModel(segLoc, props); } public ArabicSegmenterAnnotator(String name, Properties props) { String model = null; // Keep only the properties that apply to this annotator Properties modelProps = new Properties(); String desiredKey = name + '.'; for (String key : props.stringPropertyNames()) { if (key.startsWith(desiredKey)) { // skip past name and the subsequent "." String modelKey = key.substring(desiredKey.length()); if (modelKey.equals("model")) { model = props.getProperty(key); } else { modelProps.setProperty(modelKey, props.getProperty(key)); } } } this.VERBOSE = PropertiesUtils.getBool(props, name + ".verbose", false); if (model == null) { throw new RuntimeException("Expected a property " + name + ".model"); } loadModel(model, modelProps); } @SuppressWarnings("unused") private void loadModel(String segLoc) { // don't write very much, because the CRFClassifier already reports loading if (VERBOSE) { log.info("Loading segmentation model ... "); } Properties modelProps = new Properties(); modelProps.setProperty("model", segLoc); segmenter = ArabicSegmenter.getSegmenter(modelProps); } private void loadModel(String segLoc, Properties props) { // don't write very much, because the CRFClassifier already reports loading if (VERBOSE) { log.info("Loading Segmentation Model ... "); } Properties modelProps = new Properties(); modelProps.setProperty("model", segLoc); modelProps.putAll(props); try { segmenter = ArabicSegmenter.getSegmenter(modelProps); } catch (RuntimeException e) { throw e; } catch (Exception e) { throw new RuntimeException(e); } } @Override public void annotate(Annotation annotation) { if (VERBOSE) { log.info("Adding Segmentation annotation ... "); } List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); if (sentences != null) { for (CoreMap sentence : sentences) { doOneSentence(sentence); } } else { doOneSentence(annotation); } } private void doOneSentence(CoreMap annotation) { String text = annotation.get(CoreAnnotations.TextAnnotation.class); List<CoreLabel> tokens = segmenter.segmentStringToTokenList(text); annotation.set(CoreAnnotations.TokensAnnotation.class, tokens); } @Override public Set<Class<? extends CoreAnnotation>> requires() { return Collections.emptySet(); } @Override public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() { return new HashSet<>(Arrays.asList( CoreAnnotations.TextAnnotation.class, CoreAnnotations.TokensAnnotation.class, CoreAnnotations.CharacterOffsetBeginAnnotation.class, CoreAnnotations.CharacterOffsetEndAnnotation.class, CoreAnnotations.BeforeAnnotation.class, CoreAnnotations.AfterAnnotation.class, CoreAnnotations.TokenBeginAnnotation.class, CoreAnnotations.TokenEndAnnotation.class, CoreAnnotations.PositionAnnotation.class, CoreAnnotations.IndexAnnotation.class, CoreAnnotations.OriginalTextAnnotation.class, CoreAnnotations.ValueAnnotation.class )); } }