TokenizerByWord.java example

Explorer
docx4j-master
- src
/*
 * This file is part of the DiffX library.
 *
 * For licensing information please see the file license.txt included in the release.
 * A copy of this licence can also be found at
 *   http://www.opensource.org/licenses/artistic-license-2.0.php
 */
package com.topologi.diffx.load.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.topologi.diffx.config.TextGranularity;
import com.topologi.diffx.config.WhiteSpaceProcessing;
import com.topologi.diffx.event.TextEvent;
import com.topologi.diffx.event.impl.IgnorableSpaceEvent;
import com.topologi.diffx.event.impl.SpaceEvent;
import com.topologi.diffx.event.impl.WordEvent;

/**
 * The tokeniser for characters events.
 * 
 * <p>This class is not synchronized.
 * 
 * @author Christophe Lauret
 * @version 11 May 2010
 */
public final class TokenizerByWord implements TextTokenizer {

  /**
   * Map characters to events in order to recycle events as they are created.
   */
  private final Map<String, TextEvent> recycling = new HashMap<String, TextEvent>();

  /**
   * Define the whitespace processing.
   */
  private final WhiteSpaceProcessing whitespace;

  /**
   * Creates a new tokenizer.
   * 
   * @param whitespace the whitespace processing for this tokenizer.
   * 
   * @throws NullPointerException if the white space processing is not specified.
   */
  public TokenizerByWord(WhiteSpaceProcessing whitespace) {
    if (whitespace == null) throw new NullPointerException("the white space processing must be specified.");
    this.whitespace = whitespace;
  }

  /**
   * {@inheritDoc}
   */
  public List<TextEvent> tokenize(CharSequence seq) {
    if (seq == null) return null;
    if (seq.length() == 0) return Collections.emptyList();
    List<TextEvent> events = new ArrayList<TextEvent>(seq.length());

    Pattern p = Pattern.compile("\\s+");
    Matcher m = p.matcher(seq);
    int index = 0;

    // Add segments before each match found
    while (m.find()) {
      if (index != m.start()) {
        String word = seq.subSequence(index, m.start()).toString();
        events.add(getWordEvent(word));
      }
      // We don't even need to record a white space if they are ignored!
      if (this.whitespace != WhiteSpaceProcessing.IGNORE) {
        String space = seq.subSequence(m.start(), m.end()).toString();
        events.add(getSpaceEvent(space));
      }
      index = m.end();
    }

    // Add remaining word if any
    if (index != seq.length()) {
      String word = seq.subSequence(index, seq.length()).toString();
      events.add(getWordEvent(word));
    }

    return events;
  }

  /**
   * Always <code>TextGranularity.WORD</code>.
   * 
   * {@inheritDoc}
   */
  public TextGranularity granurality() {
    return TextGranularity.WORD;
  }

  // Private helpers ------------------------------------------------------------------------------

  /**
   * Returns the word event corresponding to the specified characters.
   * 
   * @param word the characters of the word
   * @return the corresponding word event
   */
  private TextEvent getWordEvent(String word) {
    TextEvent e = this.recycling.get(word);
    if (e == null) {
      e = new WordEvent(word);
      this.recycling.put(word, e);
    }
    return e;
  }

  /**
   * Returns the space event corresponding to the specified characters.
   * 
   * @param space the characters of the space
   * @return the corresponding space event
   */
  private TextEvent getSpaceEvent(String space) {
    // preserve the actual white space used
    TextEvent e = this.recycling.get(space);
    if (e == null) {
      if (this.whitespace == WhiteSpaceProcessing.PRESERVE) {
        e = new IgnorableSpaceEvent(space);
      } else {
        e = SpaceEvent.getInstance(space);
      }
      this.recycling.put(space, e);
    }
    return e;
  }

}