/* * Copyright 2013 Websquared, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fastcatsearch.ir.dic; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import org.fastcatsearch.ir.common.IRException; import org.fastcatsearch.ir.io.CharVector; import org.fastcatsearch.ir.io.DirBufferedReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class HashSetDictionaryCompiler { private static Logger logger = LoggerFactory.getLogger(HashSetDictionaryCompiler.class); // private static int BUCKET_SIZE = 16 * 1024; boolean splitByWhitespace; // 공백기준으로 단어들을 분리할지 여부. public static void main(String[] args) throws IRException { HashSetDictionaryCompiler c = new HashSetDictionaryCompiler(); String splitByWhitespaceValue = System.getProperty("splitByWhitespace"); if (splitByWhitespaceValue != null && splitByWhitespaceValue.equalsIgnoreCase("true")) { c.splitByWhitespace = true; logger.info("Use splitByWhitespace"); } if (args.length == 3) { c.compile(new File(args[0]), args[1], new File(args[2])); } else if (args.length == 4) { c.compile(new File[] { new File(args[0]), new File(args[1]) }, args[2], new File(args[3])); } } public void compile(File[] inputList, String charset, File output) throws IRException { try { logger.info("Dictionary compile2 start! {}", charset); DirBufferedReader br = new DirBufferedReader(inputList, charset); int bucketSize = getEstimatedBucketSize(inputList, charset); compile0(br, output, bucketSize); br.close(); } catch (FileNotFoundException e) { logger.error("FileNotFoundException", e); throw new IRException(e); } catch (IOException e) { logger.error("IOException", e); throw new IRException(e); } } public void compile(File input, String charset, File output) throws IRException { try { logger.info("Dictionary compile start! {}", charset); int bucketSize = getEstimatedBucketSize(input, charset); DirBufferedReader br = new DirBufferedReader(input, charset); compile0(br, output, bucketSize); br.close(); } catch (FileNotFoundException e) { logger.error("FileNotFoundException", e); throw new IRException(e); } catch (IOException e) { logger.error("IOException", e); throw new IRException(e); } } private int getEstimatedBucketSize(File[] inputList, String charset) throws IOException { int bucketSize = 0; for (int i = 0; i < inputList.length; i++) { bucketSize += getEstimatedBucketSize(inputList[i], charset); } return bucketSize; } private int getEstimatedBucketSize(File input, String charset) throws IOException { int multipleNumber = 256; int count = (int) Math.ceil(getLineCount(input, charset) * 1.5);// 1.5배의 여유공간. return (count + multipleNumber - 1) / multipleNumber * multipleNumber; } private int getLineCount(File input, String charset) throws IOException { DirBufferedReader br = new DirBufferedReader(input, charset); try { int count = 0; while (br.readLine() != null) { count++; } return count; } finally { if (br != null) { br.close(); } } } private void compile0(DirBufferedReader br, File output, int bucketSize) throws IOException, IRException { HashSetDictionary dic = new HashSetDictionary(bucketSize); String line = null; long st = System.currentTimeMillis(); int cnt = 0; while ((line = br.readLine()) != null) { if (line.startsWith("//")) continue; line = line.trim(); if (line.length() > 0) { // logger.debug("--"+line); int p = line.indexOf('/'); String a = null; if (p > 0) { a = line.substring(0, p); } else { a = line; } if (splitByWhitespace && a.contains(" ")) { // 공백이 있다면 떼어내어 여러 키워드를 입력한다. String[] keywordList = a.split(" "); for (int i = 0; i < keywordList.length; i++) { if(keywordList[i].length() <= 1){ continue; }else if(keywordList[i].length() == 2){ //영문 또는 숫자이면 추가하지 않는다. if(Character.isDigit(keywordList[i].charAt(0)) && Character.isDigit(keywordList[i].charAt(1))){ continue; } if(isAlphabetic(keywordList[i].charAt(0)) && isAlphabetic(keywordList[i].charAt(1))){ continue; } } CharVector term = new CharVector(keywordList[i]); dic.put(term); cnt++; } } else { // logger.debug("* " + a); if(a.length() <= 1){ continue; } CharVector term = new CharVector(a); dic.put(term); cnt++; } } } dic.save(output); logger.info("Dictionary compile done.. total {} words. putTerm={} {}ms", new Object[]{dic.count(), cnt, System.currentTimeMillis() - st}); } // // java 1.7 has method 'isAlphabetic' but 1.6 doesn't // so we add it for compiler compatibility // public boolean isAlphabetic(char c) { if((c>='A' && c<='Z') || (c>='a' && c<='z')) { return true; } return false; } }