package edu.stanford.nlp.international.arabic.process;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import edu.stanford.nlp.international.arabic.pipeline.DefaultLexicalMapper;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.treebank.Mapper;

/**
 * Compares the output of the JFlex-based ArabicTokenizer to that of the
 * DefaultLexicalMapper, which is used in the parser and elsewhere.
 *
 * @author Spence Green
 */
public class ArabicTokenizerTester {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ArabicTokenizerTester.class);

  /**
   * args[0] := tokenizer options
   * args[1] := file to tokenize
   *
   * @param args Command-line arguments (tokenizer options, input filename)
   */
  public static void main(String[] args) {
    if (args.length != 2) {
      System.out.printf("Usage: java %s OPTS filename%n", ArabicTokenizerTester.class.getName());
      System.exit(-1);
    }
    String tokOptions = args[0];
    File path = new File(args[1]);
    log.info("Reading from: " + path.getPath());

    // try-with-resources ensures the reader is closed even if an exception is thrown
    try (BufferedReader br = new BufferedReader(
        new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
      TokenizerFactory<CoreLabel> tf = ArabicTokenizer.factory();
      tf.setOptions(tokOptions);

      // The mapper strips segmentation and morpheme boundary markers from raw tokens
      Mapper lexMapper = new DefaultLexicalMapper();
      lexMapper.setup(null, "StripSegMarkersInUTF8", "StripMorphMarkersInUTF8");

      int lineId = 0;
      for (String line; (line = br.readLine()) != null; lineId++) {
        line = line.trim();

        // Tokenize with the tokenizer
        List<CoreLabel> tokenizedLine = tf.getTokenizer(new StringReader(line)).tokenize();
        System.out.println(SentenceUtils.listToString(tokenizedLine));

        // Tokenize with the mapper: split on whitespace, then map each token
        StringBuilder sb = new StringBuilder();
        String[] toks = line.split("\\s+");
        for (String tok : toks) {
          String mappedTok = lexMapper.map(null, tok);
          sb.append(mappedTok).append(" ");
        }
        List<String> mappedToks = Arrays.asList(sb.toString().trim().split("\\s+"));

        // Evaluate the output
        if (mappedToks.size() != tokenizedLine.size()) {
          System.err.printf("Line length mismatch:%norig: %s%ntok: %s%nmap: %s%n%n",
              line,
              SentenceUtils.listToString(tokenizedLine),
              SentenceUtils.listToString(mappedToks));
        } else {
          // Same length: compare the two token sequences position by position
          boolean printLines = false;
          for (int i = 0; i < mappedToks.size(); ++i) {
            String mappedTok = mappedToks.get(i);
            String tokenizedTok = tokenizedLine.get(i).word();
            if ( ! mappedTok.equals(tokenizedTok)) {
              System.err.printf("Token mismatch:%nmap: %s%ntok: %s%n", mappedTok, tokenizedTok);
              printLines = true;
            }
          }
          if (printLines) {
            System.err.printf("orig: %s%ntok: %s%nmap: %s%n%n",
                line,
                SentenceUtils.listToString(tokenizedLine),
                SentenceUtils.listToString(mappedToks));
          }
        }
      }
      System.err.printf("Read %d lines.%n", lineId);

    } catch (IOException e) {
      // Covers FileNotFoundException and any read error
      e.printStackTrace();
    }
  }
}
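
/*
 * Example invocation (illustrative sketch: the option string and filename
 * below are assumptions, not values taken from this class; see the
 * ArabicTokenizer javadoc for the options it actually accepts):
 *
 *   java edu.stanford.nlp.international.arabic.process.ArabicTokenizerTester \
 *       "removeProMarker,removeSegMarker" arabic-sample-utf8.txt
 *
 * Tokenized lines are written to stdout; any length or per-token mismatches
 * between the tokenizer and the mapper are reported on stderr.
 */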