/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.ja.util; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import org.apache.lucene.analysis.ja.dict.CharacterDefinition; public class UnknownDictionaryBuilder { private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*"; private String encoding = "euc-jp"; public UnknownDictionaryBuilder(String encoding) { this.encoding = encoding; } public UnknownDictionaryWriter build(String dirname) throws IOException { UnknownDictionaryWriter unkDictionary = readDictionaryFile(dirname + File.separator + "unk.def"); //Should be only one file readCharacterDefinition(dirname + File.separator + "char.def", unkDictionary); return unkDictionary; } public UnknownDictionaryWriter readDictionaryFile(String filename) throws IOException { return readDictionaryFile(filename, encoding); } public UnknownDictionaryWriter readDictionaryFile(String filename, String encoding) throws IOException { UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024); FileInputStream inputStream = new FileInputStream(filename); Charset cs = Charset.forName(encoding); CharsetDecoder decoder = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); InputStreamReader streamReader = new InputStreamReader(inputStream, decoder); LineNumberReader lineReader = new LineNumberReader(streamReader); dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY)); List<String[]> lines = new ArrayList<>(); String line = null; while ((line = lineReader.readLine()) != null) { // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation, // even though the unknown dictionary returns hardcoded null here. final String[] parsed = CSVUtil.parse(line + ",*,*"); // Probably we don't need to validate entry lines.add(parsed); } Collections.sort(lines, new Comparator<String[]>() { public int compare(String[] left, String[] right) { int leftId = CharacterDefinition.lookupCharacterClass(left[0]); int rightId = CharacterDefinition.lookupCharacterClass(right[0]); return leftId - rightId; } }); for (String[] entry : lines) { dictionary.put(entry); } return dictionary; } public void readCharacterDefinition(String filename, UnknownDictionaryWriter dictionary) throws IOException { FileInputStream inputStream = new FileInputStream(filename); InputStreamReader streamReader = new InputStreamReader(inputStream, encoding); LineNumberReader lineReader = new LineNumberReader(streamReader); String line = null; while ((line = lineReader.readLine()) != null) { line = line.replaceAll("^\\s", ""); line = line.replaceAll("\\s*#.*", ""); line = line.replaceAll("\\s+", " "); // Skip empty line or comment line if(line.length() == 0) { continue; } if(line.startsWith("0x")) { // Category mapping String[] values = line.split(" ", 2); // Split only first space if(!values[0].contains("..")) { int cp = Integer.decode(values[0]).intValue(); dictionary.putCharacterCategory(cp, values[1]); } else { String[] codePoints = values[0].split("\\.\\."); int cpFrom = Integer.decode(codePoints[0]).intValue(); int cpTo = Integer.decode(codePoints[1]).intValue(); for(int i = cpFrom; i <= cpTo; i++){ dictionary.putCharacterCategory(i, values[1]); } } } else { // Invoke definition String[] values = line.split(" "); // Consecutive space is merged above String characterClassName = values[0]; int invoke = Integer.parseInt(values[1]); int group = Integer.parseInt(values[2]); int length = Integer.parseInt(values[3]); dictionary.putInvokeDefinition(characterClassName, invoke, group, length); } } } }