UnknownDictionaryBuilder.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja.util;


import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.analysis.ja.dict.CharacterDefinition;

/**
 * Builds an {@link UnknownDictionaryWriter} from the {@code unk.def} and {@code char.def}
 * files found in a dictionary directory.
 */
public class UnknownDictionaryBuilder {
  /** Entry that is always put into the dictionary before any entry read from {@code unk.def}. */
  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
  
  /** Name of the charset used to decode the definition files. */
  private final String encoding;
  
  /**
   * Creates a builder.
   *
   * @param encoding name of the charset used to decode the definition files
   */
  public UnknownDictionaryBuilder(String encoding) {
    this.encoding = encoding;
  }
  
  /**
   * Reads {@code unk.def} and {@code char.def} from the given directory.
   *
   * @param dirname directory containing both definition files
   * @return a writer populated from both files
   * @throws IOException if either file cannot be opened, read or decoded
   */
  public UnknownDictionaryWriter build(String dirname) throws IOException {
    UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def");  //Should be only one file
    readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary);
    return unkDictionary;
  }
  
  /**
   * Reads an unknown-word dictionary file using the encoding given to the constructor.
   *
   * @param filename path of the dictionary file
   * @return a writer holding the NGRAM entry followed by the file's entries
   * @throws IOException if the file cannot be opened, read or decoded
   */
  public UnknownDictionaryWriter readDictionaryFile(String filename)
      throws IOException {
    return readDictionaryFile(filename, encoding);
  }
  
  /**
   * Reads an unknown-word dictionary file. The NGRAM entry is put first; the entries from the
   * file follow, ordered by the character class id of their first field.
   *
   * @param filename path of the dictionary file
   * @param encoding name of the charset used to decode the file; malformed or unmappable
   *                 input is reported as an error rather than replaced
   * @return the populated writer
   * @throws IOException if the file cannot be opened, read or decoded
   */
  public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding)
      throws IOException {
    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
    
    List<String[]> lines = new ArrayList<>();
    // try-with-resources so the file handle is released even when decoding or parsing fails
    try (FileInputStream inputStream = new FileInputStream(filename);
         InputStreamReader streamReader = new InputStreamReader(inputStream, newStrictDecoder(encoding));
         LineNumberReader lineReader = new LineNumberReader(streamReader)) {
      
      dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));
      
      String line = null;
      while ((line = lineReader.readLine()) != null) {
        // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
        // even though the unknown dictionary returns hardcoded null here.
        final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry
        lines.add(parsed);
      }
    }
    
    // Collections.sort is stable, so entries of the same character class keep their file order.
    Collections.sort(lines, new Comparator<String[]>() {
      @Override
      public int compare(String[] left, String[] right) {
        int leftId = CharacterDefinition.lookupCharacterClass(left[0]);
        int rightId = CharacterDefinition.lookupCharacterClass(right[0]);
        // Integer.compare instead of subtraction: same ordering, no overflow risk
        return Integer.compare(leftId, rightId);
      }
    });
    
    for (String[] entry : lines) {
      dictionary.put(entry);
    }
    
    return dictionary;
  }
  
  /**
   * Reads a character definition file and records its contents in the given writer.
   * Text from {@code #} to the end of a line is dropped and blank lines are skipped.
   * A line starting with {@code 0x} maps a code point, or an inclusive {@code from..to}
   * range of code points, to the rest of the line; any other line is read as
   * {@code name invoke group length}.
   *
   * @param filename   path of the character definition file
   * @param dictionary writer that receives the category mappings and invoke definitions
   * @throws IOException if the file cannot be opened or read
   */
  public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException {
    // try-with-resources so the file handle is released even when parsing fails
    try (FileInputStream inputStream = new FileInputStream(filename);
         InputStreamReader streamReader = new InputStreamReader(inputStream, encoding);
         LineNumberReader lineReader = new LineNumberReader(streamReader)) {
      
      String line = null;
      
      while ((line = lineReader.readLine()) != null) {
        // Strip ALL leading whitespace; stripping a single character left indented or
        // whitespace-only lines with a leading space that broke the parsing below.
        line = line.replaceAll("^\\s+", "");
        line = line.replaceAll("\\s*#.*", "");
        line = line.replaceAll("\\s+", " ");
        
        // Skip empty line or comment line
        if(line.length() == 0) {
          continue;
        }
        
        if(line.startsWith("0x")) {  // Category mapping
          String[] values = line.split(" ", 2);  // Split only first space
          
          if(!values[0].contains("..")) {
            int cp = Integer.decode(values[0]).intValue();
            dictionary.putCharacterCategory(cp, values[1]);
          } else {
            String[] codePoints = values[0].split("\\.\\.");
            int cpFrom = Integer.decode(codePoints[0]).intValue();
            int cpTo = Integer.decode(codePoints[1]).intValue();
            
            for(int i = cpFrom; i <= cpTo; i++){
              dictionary.putCharacterCategory(i, values[1]);
            }
          }
        } else {  // Invoke definition
          String[] values = line.split(" "); // Consecutive space is merged above
          String characterClassName = values[0];
          int invoke = Integer.parseInt(values[1]);
          int group = Integer.parseInt(values[2]);
          int length = Integer.parseInt(values[3]);
          dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
        }
      }
    }
  }
  
  /** Returns a decoder for the named charset that reports malformed and unmappable input. */
  private static CharsetDecoder newStrictDecoder(String encoding) {
    Charset cs = Charset.forName(encoding);
    return cs.newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
  }
}