package edu.stanford.nlp.international.arabic.process;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import edu.stanford.nlp.international.arabic.pipeline.DefaultLexicalMapper;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.treebank.Mapper;

/**
 * Compares the output of the JFlex-based ArabicTokenizer to that of the
 * DefaultLexicalMapper, which is used in the parser and elsewhere.
 *
 * @author Spence Green
 */
public class ArabicTokenizerTester {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ArabicTokenizerTester.class);

  /**
   * args[0] := tokenizer options
   * args[1] := file to tokenize
   *
   * @param args Command-line arguments (tokenizer options, input filename)
   */
  public static void main(String[] args) {
    if (args.length != 2) {
      System.out.printf("Usage: java %s OPTS filename%n", ArabicTokenizerTester.class.getName());
      System.exit(-1);
    }
    String tokOptions = args[0];
    File path = new File(args[1]);
    log.info("Reading from: " + path.getPath());

    // try-with-resources ensures the reader is closed even if an exception is thrown
    try (BufferedReader br = new BufferedReader(
        new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
      TokenizerFactory<CoreLabel> tf = ArabicTokenizer.factory();
      tf.setOptions(tokOptions);

      // The mapper strips segmentation and morpheme boundary markers from raw tokens
      Mapper lexMapper = new DefaultLexicalMapper();
      lexMapper.setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");

      int lineId = 0;
      for (String line; (line = br.readLine()) != null; lineId++) {
        line = line.trim();

        // Tokenize with the tokenizer
        List<CoreLabel> tokenizedLine = tf.getTokenizer(new StringReader(line)).tokenize();
        System.out.println(SentenceUtils.listToString(tokenizedLine));

        // Tokenize with the mapper: split on whitespace, then map each token
        StringBuilder sb = new StringBuilder();
        String[] toks = line.split("\\s+");
        for (String tok : toks) {
          String mappedTok = lexMapper.map(null, tok);
          sb.append(mappedTok).append(" ");
        }
        List<String> mappedToks = Arrays.asList(sb.toString().trim().split("\\s+"));

        // Evaluate the output
        if (mappedToks.size() != tokenizedLine.size()) {
          System.err.printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n",
              line,
              SentenceUtils.listToString(tokenizedLine),
              SentenceUtils.listToString(mappedToks));
        } else {
          // Same length: compare the two token sequences position by position
          boolean printLines = false;
          for (int i = 0; i < mappedToks.size(); ++i) {
            String mappedTok = mappedToks.get(i);
            String tokenizedTok = tokenizedLine.get(i).word();
            if ( ! mappedTok.equals(tokenizedTok)) {
              System.err.printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
              printLines = true;
            }
          }
          if (printLines) {
            System.err.printf("orig: %s%ntok: %s%nmap: %s%n%n",
                line,
                SentenceUtils.listToString(tokenizedLine),
                SentenceUtils.listToString(mappedToks));
          }
        }
      }
      System.err.printf("Read %d lines.%n", lineId);

    } catch (IOException e) {
      // Covers FileNotFoundException and any read error
      e.printStackTrace();
    }
  }
}
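
/*
 * Example invocation (illustrative sketch: the option string and filename
 * below are assumptions, not values taken from this class; see the
 * ArabicTokenizer javadoc for the options it actually accepts):
 *
 *   java edu.stanford.nlp.international.arabic.process.ArabicTokenizerTester \
 *       "removeProMarker,removeSegMarker" arabic-sample-utf8.txt
 *
 * Tokenized lines are written to stdout; any length or per-token mismatches
 * between the tokenizer and the mapper are reported on stderr.
 */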