/* * Copyright 2013 Websquared, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fastcatsearch.ir.dic; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import org.fastcatsearch.ir.common.IRException; import org.fastcatsearch.ir.io.CharVector; import org.fastcatsearch.ir.io.DirBufferedReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @deprecated * */ public class HashMapDictionaryCompiler { private static Logger logger = LoggerFactory.getLogger(HashMapDictionaryCompiler.class); private static int BUCKET_SIZE = 16 * 1024; public static void main(String[] args) throws IRException { HashMapDictionaryCompiler c = new HashMapDictionaryCompiler(); c.compile(new File(args[0]), args[1], new File(args[2])); } public void compile(File input, String charset, File output) throws IRException{ try { logger.info("Dictionary compile start!"); HashMapDictionary dic = new HashMapDictionary(BUCKET_SIZE); DirBufferedReader br = new DirBufferedReader(input, charset); String line = null; long st = System.currentTimeMillis(); int cnt = 0; int[] startPos = new int[128]; while((line = br.readLine())!= null){ if(line.length() > 0){ // logger.debug("--"+line); String key = null; CharVector[] termList = null; int p = line.indexOf('\t'); if(p > 0){ key = line.substring(0, p); // logger.debug("key = "+key); String value = line.substring(p + 1); String[] tmp = value.split(","); if(tmp.length == 1 && tmp[0].length() == 0){ termList = new CharVector[]{new CharVector(key)}; // logger.debug("val = "+termList[0]); }else{ termList = new CharVector[tmp.length]; for (int i = 0; i < tmp.length; i++) { termList[i] = new CharVector(tmp[i]); } } }else{ logger.warn("Cannot find '\\\\t' character. ignore read = "+ line); continue; } CharVector term = new CharVector(key); // logger.debug("put "+term+" : "+termList); dic.put(term, termList); cnt++; } } dic.save(output); logger.info("Dictionary compile done.. total {} word. {}ms", cnt, System.currentTimeMillis() - st); } catch (FileNotFoundException e) { logger.error("FileNotFoundException",e); throw new IRException(e); } catch (IOException e) { logger.error("IOException",e); throw new IRException(e); } } }