package edu.stanford.nlp.pipeline;

import junit.framework.TestCase;

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;

/**
 * Integration test for {@link ArabicSegmenterAnnotator}: runs a CoreNLP pipeline
 * with only the custom "segment" annotator and checks the segmented tokens and
 * their character offsets for a short Arabic query.
 */
public class ArabicSegmenterAnnotatorITest extends TestCase {

  StanfordCoreNLP pipeline = null;

  @Override
  public void setUp() throws Exception {
    if (pipeline != null) {
      return;
    }
    // Build a pipeline whose only annotator is the Arabic segmenter,
    // registered as a custom annotator under the name "segment".
    Properties props = new Properties();
    props.setProperty("annotators", "segment");
    props.setProperty("customAnnotatorClass.segment", "edu.stanford.nlp.pipeline.ArabicSegmenterAnnotator");
    props.setProperty("segment.model", "/u/nlp/data/arabic-segmenter/arabic-segmenter-atb+bn+arztrain.ser.gz");
    pipeline = new StanfordCoreNLP(props);
  }

  public void testPipeline() {
    String query = "وما هي كلمتُك المفضلة للدراسة؟";
    // Expected segmentation, with clitics (و, ك, ل) split off as separate tokens,
    // and the character begin/end offsets of each token in the original query.
    String[] expectedWords = {"و", "ما", "هي", "كلمة", "ك", "المفضلة", "ل", "الدراسة", "?"};
    int[] expectedStartPositions = {0, 1, 4, 7, 12, 14, 22, 23, 29};
    int[] expectedEndPositions = {1, 3, 6, 11, 13, 21, 23, 29, 30};

    Annotation annotation = new Annotation(query);
    pipeline.annotate(annotation);

    List<CoreLabel> tokens = annotation.get(TokensAnnotation.class);
    assertEquals(expectedWords.length, tokens.size());
    for (int i = 0; i < expectedWords.length; ++i) {
      assertEquals(expectedWords[i], tokens.get(i).word());
      assertEquals(expectedStartPositions[i], tokens.get(i).beginPosition());
      assertEquals(expectedEndPositions[i], tokens.get(i).endPosition());
    }
  }
}