package edu.stanford.nlp.wordseg;

import edu.stanford.nlp.fsm.DFSA;
import edu.stanford.nlp.fsm.DFSAState;
import edu.stanford.nlp.fsm.DFSATransition;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.process.WordSegmenter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.*;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Lexicon-based segmenter. Uses dynamic programming to find a word
 * segmentation that satisfies the following two preferences:
 * (1) minimize the number of out-of-vocabulary (OOV) words;
 * (2) if there are multiple segmentations with the same number
 * of OOV words, select the one that minimizes the number of segments.
 * Note that {@link edu.stanford.nlp.parser.lexparser.MaxMatchSegmenter}
 * contains a greedy version of this algorithm.
 *
 * Note that the output segmentation may need postprocessing for the
 * segmentation of non-Chinese characters (e.g., punctuation, foreign names).
 *
 * @author Michel Galley
 */
public class MaxMatchSegmenter implements WordSegmenter {

  private static final boolean DEBUG = false;

  private static Redwood.RedwoodChannels logger = Redwood.channels(MaxMatchSegmenter.class);

  private final Set<String> words = Generics.newHashSet();
  private int len = -1;
  private int edgesNb = 0;
  private static final int maxLength = 10;
  private List<DFSAState<Word, Integer>> states;
  private DFSA<Word, Integer> lattice = null;

  public enum MatchHeuristic { MINWORDS, MAXWORDS, MAXLEN }

  private static final Pattern chineseStartChars = Pattern.compile("^[\u4E00-\u9FFF]");
  private static final Pattern chineseEndChars = Pattern.compile("[\u4E00-\u9FFF]$");
  private static final Pattern chineseChars = Pattern.compile("[\u4E00-\u9FFF]");
  private static final Pattern excludeChars = Pattern.compile("[0-9\uff10-\uff19" +
      "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4E5D\u5341" +
      "\u96F6\u3007\u767E\u5343\u4E07\u4ebf\u5169\u25cb\u25ef\u3021-\u3029\u3038-\u303A" +
      "-#$%&'*+/@_\uff0d\uff03\uff04\uff05\uff06\uff07\uff0a\uff0b\uff0f\uff20\uff3f]");

  @Override
  public void initializeTraining(double numTrees) {}

  @Override
  public void train(Collection<Tree> trees) {
    for (Tree tree : trees) {
      train(tree);
    }
  }

  @Override
  public void train(Tree tree) {
    train(tree.taggedYield());
  }

  @Override
  public void train(List<TaggedWord> sentence) {
    for (TaggedWord word : sentence) {
      if (word.word().length() <= maxLength) {
        addStringToLexicon(word.word());
      }
    }
  }

  @Override
  public void finishTraining() {}

  @Override
  public void loadSegmenter(String filename) {
    addLexicon(filename);
  }

  public List<HasWord> segment(String s) {
    buildSegmentationLattice(s);
    ArrayList<Word> sent = maxMatchSegmentation();
    printlnErr("raw output: " + SentenceUtils.listToString(sent));
    ArrayList<Word> postProcessedSent = postProcessSentence(sent);
    printlnErr("processed output: " + SentenceUtils.listToString(postProcessedSent));
    ChineseStringUtils.CTPPostProcessor postProcessor = new ChineseStringUtils.CTPPostProcessor();
    String postSentString = postProcessor.postProcessingAnswer(postProcessedSent.toString(), false);
    printlnErr("Sighan2005 output: " + postSentString);
    String[] postSentArray = postSentString.split("\\s+");
    ArrayList<Word> postSent = new ArrayList<>();
    for (String w : postSentArray) {
      postSent.add(new Word(w));
    }
    return new ArrayList<>(postSent);
  }
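  /*
   * Illustrative usage sketch (not part of the original class; the lexicon file
   * name "dict.txt" is a placeholder). Given a one-word-per-line UTF-8 lexicon,
   * the segmenter can be driven programmatically along these lines:
   *
   *   MaxMatchSegmenter seg = new MaxMatchSegmenter();
   *   seg.loadSegmenter("dict.txt");                  // read the lexicon
   *   List<HasWord> segmented = seg.segment("...");   // lattice + DP + post-processing
   *   System.out.println(SentenceUtils.listToString(segmented));
   */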
  /**
   * Adds a word to the lexicon, unless it contains an excluded character
   * (digits, numerals, or certain symbols).
   */
  private void addStringToLexicon(String str) {
    if (str.equals("")) {
      logger.warn("WARNING: blank line in lexicon");
    } else if (str.contains(" ")) {
      logger.warn("WARNING: word with space in lexicon");
    } else {
      if (excludeChar(str)) {
        printlnErr("skipping word: " + str);
        return;
      }
      // printlnErr("adding word: " + str);
      words.add(str);
    }
  }

  /**
   * Reads the lexicon from a one-column text file.
   */
  private void addLexicon(String filename) {
    try (BufferedReader lexiconReader =
             new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
      String lexiconLine;
      while ((lexiconLine = lexiconReader.readLine()) != null) {
        addStringToLexicon(lexiconLine);
      }
    } catch (FileNotFoundException e) {
      logger.error("Lexicon not found: " + filename);
      System.exit(-1);
    } catch (IOException e) {
      logger.error("IO error while reading: " + filename, e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Builds a lattice of all possible segmentations using only words
   * present in the lexicon. This method must be run prior to
   * running maxMatchSegmentation.
   */
  private void buildSegmentationLattice(String s) {
    edgesNb = 0;
    len = s.length();
    // Initialize the word lattice:
    states = new ArrayList<>();
    lattice = new DFSA<>("wordLattice");
    for (int i = 0; i <= s.length(); ++i)
      states.add(new DFSAState<>(i, lattice));
    // Set the start and accepting states:
    lattice.setInitialState(states.get(0));
    states.get(len).setAccepting(true);
    // Find all instances of lexicon words in the input string:
    for (int start = 0; start < len; ++start) {
      for (int end = len; end > start; --end) {
        String str = s.substring(start, end);
        assert(str.length() > 0);
        boolean isOneChar = (start + 1 == end);
        boolean isInDict = words.contains(str);
        if (isInDict || isOneChar) {
          // In-lexicon words cost 1; single-character OOV fallbacks cost 100.
          double cost = isInDict ? 1 : 100;
          DFSATransition<Word, Integer> trans =
              new DFSATransition<>(null, states.get(start), states.get(end), new Word(str), null, cost);
          // logger.info("start=" + start + " end=" + end + " word=" + str);
          states.get(start).addTransition(trans);
          ++edgesNb;
        }
      }
    }
  }

  /**
   * Returns the lexicon-based segmentation that minimizes the number of words.
   *
   * @return Segmented sentence.
   */
  public ArrayList<Word> maxMatchSegmentation() {
    return segmentWords(MatchHeuristic.MINWORDS);
  }
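  /*
   * Toy illustration (assumed lexicon, not from the original source): with the
   * lexicon {"AB", "BC", "C"} and input "ABC", buildSegmentationLattice creates
   * states 0..3 at the character boundaries (state 3 accepting) and adds the
   * in-lexicon edges AB (0->2), BC (1->3) and C (2->3) with cost 1 each, plus
   * the single-character OOV edges A (0->1) and B (1->2) with cost 100 each.
   */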
  /**
   * Returns the lexicon-based segmentation following heuristic h.
   * Note that buildSegmentationLattice must be run first.
   * Two heuristics are currently available -- MINWORDS and MAXWORDS --
   * which respectively minimize and maximize the number of segments
   * (where each segment is a lexicon word, if possible).
   *
   * @param h Heuristic to use for segmentation.
   * @return Segmented sentence.
   * @throws UnsupportedOperationException If the lattice has not been built.
   * @see #buildSegmentationLattice
   */
  public ArrayList<Word> segmentWords(MatchHeuristic h) throws UnsupportedOperationException {
    if (lattice == null || len < 0)
      throw new UnsupportedOperationException("buildSegmentationLattice must be run first");
    List<Word> segmentedWords = new ArrayList<>();
    // Initialize the dynamic program:
    double[] costs = new double[len + 1];
    List<DFSATransition<Word, Integer>> bptrs = new ArrayList<>();
    for (int i = 0; i < len + 1; ++i) {
      bptrs.add(null);
    }
    costs[0] = 0.0;
    for (int i = 1; i <= len; ++i)
      costs[i] = Double.MAX_VALUE;
    // DP:
    for (int start = 0; start < len; ++start) {
      DFSAState<Word, Integer> fromState = states.get(start);
      Collection<DFSATransition<Word, Integer>> trs = fromState.transitions();
      for (DFSATransition<Word, Integer> tr : trs) {
        DFSAState<Word, Integer> toState = tr.getTarget();
        double lcost = tr.score();
        int end = toState.stateID();
        // logger.debug("start=" + start + " end=" + end + " word=" + tr.getInput());
        if (h == MatchHeuristic.MINWORDS) {
          // Minimize the number of words: relax the edge if it yields a cheaper path.
          if (costs[start] + lcost < costs[end]) {
            costs[end] = costs[start] + lcost;
            bptrs.set(end, tr);
            // logger.debug("start=" + start + " end=" + end + " word=" + tr.getInput());
          }
        } else if (h == MatchHeuristic.MAXWORDS) {
          // Maximize the number of words: edge scores are negated, so the
          // min-cost path uses as many (and as costly) edges as possible.
          if (costs[start] - lcost < costs[end]) {
            costs[end] = costs[start] - lcost;
            bptrs.set(end, tr);
          }
        } else {
          throw new UnsupportedOperationException("unimplemented heuristic");
        }
      }
    }
    // Extract the min-cost path by following backpointers from the final state:
    int i = len;
    while (i > 0) {
      DFSATransition<Word, Integer> tr = bptrs.get(i);
      DFSAState<Word, Integer> fromState = tr.getSource();
      Word word = tr.getInput();
      if (!word.word().equals(" "))
        segmentedWords.add(0, word);
      i = fromState.stateID();
    }
    if (DEBUG) {
      // Print the lattice density (in [1, +inf)): if it equals 1, there is
      // only one segmentation using words of the lexicon.
      double density = edgesNb * 1.0 / segmentedWords.size();
      logger.debug("latticeDensity: " + density + " cost: " + costs[len]);
    }
    return new ArrayList<>(segmentedWords);
  }
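  /*
   * Continuing the toy example above (assumed lexicon {"AB", "BC", "C"}):
   * under MINWORDS the cheapest path from state 0 to state 3 for "ABC" is
   * AB|C with total cost 2, beating A|BC (cost 101) and A|B|C (cost 201), so
   * segmentWords(MatchHeuristic.MINWORDS) returns [AB, C]; the backpointer
   * loop at the end of segmentWords reconstructs exactly that path.
   */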
  /**
   * Returns a lexicon-based segmentation. At each position x in the input string,
   * it attempts to find the largest value y such that [x,y] is in the lexicon.
   * Then it tries to match more input from position y+1. This greedy algorithm
   * (taken from edu.stanford.nlp.parser.lexparser.MaxMatchSegmenter) has no
   * theoretical guarantee, and it would be wise to use segmentWords instead.
   *
   * @param s Input (unsegmented) string.
   * @return Segmented sentence.
   */
  public ArrayList<Word> greedilySegmentWords(String s) {
    List<Word> segmentedWords = new ArrayList<>();
    int length = s.length();
    int start = 0;
    while (start < length) {
      int end = Math.min(length, start + maxLength);
      while (end > start + 1) {
        String nextWord = s.substring(start, end);
        if (words.contains(nextWord)) {
          segmentedWords.add(new Word(nextWord));
          break;
        }
        end--;
      }
      if (end == start + 1) {
        // The character does not start any word in the dictionary:
        // emit it as a single-character word.
        segmentedWords.add(new Word(new String(new char[]{s.charAt(start)})));
        start++;
      } else {
        start = end;
      }
    }
    return new ArrayList<>(segmentedWords);
  }

  public static void main(String[] args) {
    Properties props = StringUtils.argsToProperties(args);
    // logger.debug(props.toString());
    SeqClassifierFlags flags = new SeqClassifierFlags(props);

    MaxMatchSegmenter seg = new MaxMatchSegmenter();
    String lexiconFile = props.getProperty("lexicon");
    if (lexiconFile != null) {
      seg.addLexicon(lexiconFile);
    } else {
      logger.error("Error: no lexicon file!");
      System.exit(1);
    }

    Sighan2005DocumentReaderAndWriter sighanRW = new Sighan2005DocumentReaderAndWriter();
    sighanRW.init(flags);

    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    PrintWriter stdoutW = new PrintWriter(System.out);
    int lineNb = 0;
    for ( ; ; ) {
      ++lineNb;
      logger.info("line: " + lineNb);
      try {
        String line = br.readLine();
        if (line == null) break;
        String outputLine = null;
        if (props.getProperty("greedy") != null) {
          ArrayList<Word> sentence = seg.greedilySegmentWords(line);
          outputLine = SentenceUtils.listToString(sentence);
        } else if (props.getProperty("maxwords") != null) {
          seg.buildSegmentationLattice(line);
          outputLine = SentenceUtils.listToString(seg.segmentWords(MatchHeuristic.MAXWORDS));
        } else {
          seg.buildSegmentationLattice(line);
          outputLine = SentenceUtils.listToString(seg.maxMatchSegmentation());
        }
        StringReader strR = new StringReader(outputLine);
        Iterator<List<CoreLabel>> itr = sighanRW.getIterator(strR);
        while (itr.hasNext()) {
          sighanRW.printAnswers(itr.next(), stdoutW);
        }
        // System.out.println(outputLine);
      } catch (IOException e) {
        break;
      }
    }
    stdoutW.flush();
  }

  private static void printlnErr(String s) {
    EncodingPrintWriter.err.println(s, "UTF-8");
  }

  private static ArrayList<Word> postProcessSentence(ArrayList<Word> sent) {
    ArrayList<Word> newSent = new ArrayList<>();
    for (Word word : sent) {
      if (newSent.size() > 0) {
        String prevWord = newSent.get(newSent.size() - 1).toString();
        String curWord = word.toString();
        String prevChar = prevWord.substring(prevWord.length() - 1);
        String curChar = curWord.substring(0, 1);
        if (!isChinese(prevChar) && !isChinese(curChar)) {
          // Merge adjacent non-Chinese tokens (digits, Latin letters, punctuation):
          Word mergedWord = new Word(prevWord + curWord);
          newSent.set(newSent.size() - 1, mergedWord);
          // printlnErr("merged: " + mergedWord + " from: " + prevWord + " and: " + curWord);
          continue;
        }
      }
      newSent.add(word);
    }
    return new ArrayList<>(newSent);
  }

  private static boolean startsWithChinese(String str) {
    return chineseStartChars.matcher(str).find();
  }

  private static boolean endsWithChinese(String str) {
    return chineseEndChars.matcher(str).find();
  }

  private static boolean isChinese(String str) {
    return chineseChars.matcher(str).matches();
  }

  /** Returns true if str contains any character in the excluded set. */
  private static boolean excludeChar(String str) {
    return excludeChars.matcher(str).find();
  }

  private static final long serialVersionUID = 8263734344886904724L;

}
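/*
 * Command-line sketch (file names are placeholders; classpath setup is assumed):
 * main() loads a one-word-per-line UTF-8 lexicon given by the -lexicon property,
 * reads unsegmented lines from stdin, and writes Sighan2005-formatted output to
 * stdout, e.g.
 *
 *   java edu.stanford.nlp.wordseg.MaxMatchSegmenter -lexicon dict.txt < input.txt > output.txt
 *
 * Passing -greedy switches to greedilySegmentWords, and -maxwords selects the
 * MAXWORDS heuristic; otherwise the MINWORDS dynamic program is used.
 */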