/* This file is part of the Joshua Machine Translation System.
 *
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.decoder.ff.lm.buildin_lm;

import joshua.corpus.vocab.SymbolTable;
import joshua.decoder.ff.lm.AbstractLM;
import joshua.decoder.ff.lm.LanguageModelFF;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.Support;
import joshua.util.io.LineReader;
import joshua.util.Regex;

import java.io.IOException;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;

// TODO: This class has a *huge* amount of redundant code. Eliminate it.

/**
 * This class implements:
 * (1) reading an ARPA LM file into a trie data structure,
 * (2) looking up the LM probability of a given n-gram, and
 * (3) computing equivalent left/right LM states.
 *
 * @author Zhifei Li, <zhifei.work@gmail.com>
 * @version $LastChangedDate:2008-07-28 18:44:45 -0400 (Mon, 28 Jul 2008) $
 */
public class LMGrammarJAVA extends AbstractLM {

    // BUG: Why are the IDs not static?
    static final String BACKOFF_WGHT_SYM = "<bow>";
    int BACKOFF_WGHT_SYM_ID; // used by LMModel

    static final String LM_HAVE_PREFIX_SYM = "<havelzfprefix>"; // indicates that an LM trie node has children
    int LM_HAVE_PREFIX_SYM_ID;

    static final String UNK_SYM = "<unk>"; // unknown LM word
    int UNK_SYM_ID;

    /** Used for logging how long things take. */
    private long start_loading_time;

    /* A backoff node is a hashtable; it may contain:
     * (1) probabilities for next words (keyed by word id)
     * (2) pointers to next-layer backoff nodes (hashtables), keyed by: sym_id + highestID
     * (3) the backoff weight for this node
     * (4) a suffix/prefix flag indicating that some n-gram starts with this suffix
     */
    private LMHash root = null;
    private int g_n_bow_nodes    = 0;
    private int g_n_suffix_nodes = 0;

    private static final float  MIN_LOG_P   = -9999.0f;      // any real n-gram log-prob must be larger than this number
    private static final double SUFFIX_ONLY = MIN_LOG_P * 3; // sentinel: the entry is only a suffix flag, with no probability
    private double NON_EXIST_WEIGHT = 0; // the history has not appeared at all

    private int num_rule_read = 0;
    boolean g_is_add_prefix_infor = false;
    boolean g_is_add_suffix_infor = false;

    // request caches: query signature -> result
    // (prob/backoff results are Doubles; equivalent-state results are word-id arrays)
    HashMap<String,Double> request_cache_prob        = new HashMap<String,Double>();
    HashMap<String,Double> request_cache_backoff     = new HashMap<String,Double>();
    HashMap<String,int[]>  request_cache_left_equiv  = new HashMap<String,int[]>();
    HashMap<String,int[]>  request_cache_right_equiv = new HashMap<String,int[]>();
    int cache_size_limit = 250000;

    private static final Logger logger =
        Logger.getLogger(LMGrammarJAVA.class.getName());
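
    /* Illustrative sketch: how a single ARPA trigram entry "p  a b c  w"
     * (log-prob p, words a b c, backoff weight w) ends up in this trie,
     * writing "offset" for symbolTable.getHighestID() and using the word ids
     * directly.  add_rule stores, roughly:
     *
     *     root.get(b + offset).get(a + offset).put(c, p);   // prob under the reversed context: b, then a
     *     root.get(c + offset).get(b + offset).get(a + offset)
     *         .put(BACKOFF_WGHT_SYM_ID, w);                 // bow at the node for the full reversed n-gram
     *
     * (LMHash.get returns Object, so real code would need casts; this is only
     * a sketch.)  Child-pointer keys are offset by getHighestID() so they can
     * never collide with the plain word-id probability keys in the same node.
     */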
    public LMGrammarJAVA(SymbolTable psymbol, int order, String lm_file,
        boolean is_add_suffix_infor, boolean is_add_prefix_infor
    ) throws IOException {
        super(psymbol, order);
        logger.info("use java lm");

        this.BACKOFF_WGHT_SYM_ID   = psymbol.addTerminal(BACKOFF_WGHT_SYM);
        this.LM_HAVE_PREFIX_SYM_ID = psymbol.addTerminal(LM_HAVE_PREFIX_SYM);
        this.UNK_SYM_ID            = psymbol.addTerminal(UNK_SYM);

        g_is_add_prefix_infor = is_add_prefix_infor;
        g_is_add_suffix_infor = is_add_suffix_infor;

        read_lm_grammar_from_file(lm_file); // TODO: what about sentence-specific LMs?
        //Symbol.add_global_symbols(true);

        /*//debug
        LMHash[] t_arrays = new LMHash[10000000];
        System.out.println("##### mem used (kb): " + Support.getMemoryUse());
        System.out.println("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time) / 1000);
        for (int i = 0; i < 10000000; i++) {
            LMHash t_h = new LMHash(5);
            double j = 0.1f;
            t_h.put(i, j);
            //System.out.println("ele is " + t_h.get(i));
            t_arrays[i] = t_h;
            if (i % 1000000 == 0) {
                System.out.println(i + " ##### mem used (kb): " + Support.getMemoryUse());
                System.out.println("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time) / 1000);
            }
        }
        System.exit(0);
        //end*/

        /*//debug
        double[] bow = new double[1];
        int[] backoff_history = new int[1];
        backoff_history[0] = Symbol.UNTRANS_SYM_ID;
        boolean finalized_backoff = check_backoff_weight(backoff_history, bow, 0); // backoff weight is already added outside this function?
        //System.out.println("bow_weigth id: " + Symbol.BACKOFF_WGHT_SYM_ID);
        System.out.println("is final: " + finalized_backoff);
        System.out.println("bow: " + bow[0]);
        System.exit(0);*/
    }


    // signature of a list of words: the word ids joined by spaces
    private String get_signature(int[] words) {
        StringBuffer s = new StringBuffer(words.length);
        for (int i = 0; i < words.length; i++) {
            s.append(' ').append(words[i]);
        }
        return s.toString();
    }


    /* Note: the mismatch between SRILM and this Java implementation is: when an
     * unknown word is used as context, the Java code replaces it with "<unk>"
     * but SRILM does not. Therefore the LM cost computed by SRILM may be smaller
     * than the one computed here; this happens only when the LM file has "<unk>"
     * in a backoff state. */
    protected double ngramLogProbability_helper(int[] ngram, int order) {
        Double res;

        //cache
        //String sig = get_signature(ngram);
        //res = (Double)request_cache_prob.get(sig);
        //if (res != null) return res;

        int[] ngram_wrds = replace_with_unk(ngram); // TODO
        if (ngram_wrds[ngram_wrds.length-1] == UNK_SYM_ID) { // TODO: wrong implementation in Hiero
            res = -JoshuaConfiguration.lm_ceiling_cost;
        } else {
            //TODO: untranslated words
            if (null == root) {
                throw new RuntimeException("root is null");
            }
            int last_word_id = ngram_wrds[ngram_wrds.length-1];
            LMHash pos = root;
            Double prob = get_valid_prob(pos, last_word_id);
            double bow_sum = 0;
            // reverse search, starting from the second-to-last word
            for (int i = ngram_wrds.length - 2; i >= 0; i--) {
                LMHash next_layer = (LMHash) pos.get(ngram_wrds[i] + this.symbolTable.getHighestID());
                if (null != next_layer) { // have a context/bow node
                    pos = next_layer;
                    Double prob2 = get_valid_prob(pos, last_word_id);
                    if (null != prob2) { // reset; if we back off, we will back off at least to here
                        prob    = prob2;
                        bow_sum = 0;
                    } else {
                        Double bow = (Double) pos.get(BACKOFF_WGHT_SYM_ID);
                        if (null != bow) {
                            bow_sum += bow;
                        }
                    }
                } else { // do not have a context/bow node
                    break;
                }
            }
            res = prob + bow_sum;
        }
        //cache
        //if (request_cache_prob.size() > cache_size_limit)
        //    request_cache_prob.clear();
        //request_cache_prob.put(sig, res);

        return res;
    }

    private Double get_valid_prob(LMHash pos, int wrd) {
        Double res = (Double) pos.get(wrd);
        if (! g_is_add_suffix_infor) {
            return res;
        }
        if (null != res) {
            if (res == SUFFIX_ONLY) { // suffix flag only: no real probability
                return null;
            } else if (res > MIN_LOG_P) { // logP without the suffix flag
                return res;
            } else { // logP with the suffix flag
                return res - MIN_LOG_P;
            }
        }
        return null;
    }
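
    /* Worked example of the lookup above: querying logP(c | a b).  The helper
     * first reads the unigram estimate prob = logP(c) at the root, then walks
     * the context in reverse:
     *
     *     pos = root.get(b + offset):  if logP(c | b) exists,   prob = logP(c | b),   bow_sum = 0;
     *                                  otherwise                bow_sum += bow(b)
     *     pos = pos.get(a + offset):   if logP(c | a b) exists, prob = logP(c | a b), bow_sum = 0;
     *                                  otherwise                bow_sum += bow(a b)
     *
     * The returned prob + bow_sum is the standard ARPA backoff recursion in
     * log space; e.g., if both longer n-grams are absent, the result is
     * logP(c) + bow(b) + bow(a b).
     */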
    // ##################### begin right equivalent state #############

    /* Idea: going from right to left, if a span does not have a backoff weight,
     * then every n-gram containing this span will back off, and we can safely
     * remove the span from the state.
     * The absence of a backoff weight for a low-order n-gram implies the absence
     * of the higher-order n-gram.
     * The absence of a backoff weight for a low-order n-gram implies the absence
     * of backoff weights for higher-order n-grams ????????????????
     *
     * e.g., if we do not have a bow node for A, then we can say there are no bow
     * nodes for:
     * (1) *A:  implied by the trie structure
     * (2) A*:  if we had a BOW node for A* (with a bow weight) then, due to the
     *          representation of the ARPA format, we would have a probability
     *          for A*, which implies a BOW node for A
     * (3) *A*
     */

    // The returned array must have the same length as original_state.
    // The only change to original_state: non-null state words may be replaced with the null-state word.
    // O(n^2)
    public int[] rightEquivalentState(int[] original_state_in, int order) {
        if (! JoshuaConfiguration.use_right_equivalent_state
        || original_state_in.length != ngramOrder - 1) {
            return original_state_in;
        }
        int[] res;

        //cache
        String sig = get_signature(original_state_in);
        res = (int[]) request_cache_right_equiv.get(sig);
        if (null != res) {
            //System.out.println("right cache hit");
            return res;
        }

        // We do not put this statement at the beginning, to match the SRILM
        // condition (SRILM does not have replace_with_unk).
        int[] original_state = replace_with_unk(original_state_in);
        res = new int[original_state.length];
        for (int i = 1; i <= original_state.length; i++) { // forward search
            int[] cur_wrds = Support.sub_int_array(original_state, i-1, original_state.length);
            if (! have_prefix(cur_wrds)) {
                res[i-1] = LanguageModelFF.NULL_RIGHT_LM_STATE_SYM_ID;
            } else {
                for (int j = i; j <= original_state.length; j++) {
                    res[j-1] = original_state[j-1];
                }
                break;
            }
        }

        //cache
        if (request_cache_right_equiv.size() > cache_size_limit) {
            request_cache_right_equiv.clear();
        }
        request_cache_right_equiv.put(sig, res);

        //System.out.println("right org state: " + Symbol.get_string(original_state) + "; equiv state: " + Symbol.get_string(res));
        return res;
    }

    // O(n)
    private boolean have_prefix(int[] words) {
        LMHash pos = root;
        int i = words.length - 1;
        for ( ; i >= 0; i--) { // reverse search
            LMHash next_layer = (LMHash) pos.get(words[i] + this.symbolTable.getHighestID());
            if (null != next_layer) {
                pos = next_layer;
            } else {
                break;
            }
        }
        return (i == -1 && pos.containsKey(LM_HAVE_PREFIX_SYM_ID));
    }

    // ##################### end right equivalent state #############
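
    /* Example: with an LM of order 3, the right state has two words, say [x y].
     * rightEquivalentState scans the suffixes left to right: if no n-gram in
     * the model starts with "x y", then x can contribute nothing to future
     * queries and is dropped (res[0] = NULL_RIGHT_LM_STATE_SYM_ID); the first
     * suffix that *is* a prefix of some n-gram is kept verbatim.  Two
     * hypotheses [x y] and [z y] that both collapse to [NULL y] can then be
     * recombined by the decoder.
     */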
    //############################ begin left equivalent state ##############################

    /* Several observations:
     * (1) In general there may be more than one <bo> or <null>, and they can be in any position.
     * (2) In general, whenever there is a <bo> or <null> in a given n-gram, the n-gram will
     *     definitely back off, since it has the same or more context.
     */

    // Returns: (1) the equivalent state vector; via cost[]: (2) the finalized cost and (3) the estimated cost.
    // O(n^2)
    public int[] leftEquivalentState(int[] original_state_wrds_in, int order, double[] cost) {
        if (! JoshuaConfiguration.use_left_equivalent_state) {
            return original_state_wrds_in;
        }

        // We do not put this statement at the beginning, to match the SRILM
        // condition (SRILM does not have replace_with_unk).
        int[] original_state_wrds = replace_with_unk(original_state_wrds_in);

        //## deal with the case of an overlapping state
        if (original_state_wrds.length < ngramOrder - 1) {
            for (int i = 0; i < original_state_wrds.length; i++) {
                int[] currentWords = Support.sub_int_array(original_state_wrds, 0, i+1);
                // add estimated cost
                cost[1] += -ngramLogProbability(currentWords, currentWords.length);
            }
            return original_state_wrds;
        }

        //## non-overlapping state
        int[]  res_equi_state = new int[original_state_wrds.length];
        double res_final_cost = 0.0; // finalized cost
        double res_est_cost   = 0.0; // estimated cost

        BACKWARD_SEARCH:
        for (int i = original_state_wrds.length; i > 0; i--) {
            int[] cur_wrds = Support.sub_int_array(original_state_wrds, 0, i);
            if (! have_suffix(cur_wrds)) {
                int last_wrd = cur_wrds[i-1];
                if (last_wrd == UNK_SYM_ID) {
                    res_equi_state[i-1] = last_wrd;
                    // add estimated cost
                    res_est_cost += -ngramLogProbability(cur_wrds, cur_wrds.length);
                } else {
                    if (last_wrd != LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID) {
                        res_final_cost += -ngramLogProbability(cur_wrds, cur_wrds.length);
                    }
                    res_equi_state[i-1] = LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID;

                    /*//TODO: for simplicity, we may just need BACKOFF_LEFT_LM_STATE_SYM_ID??
                    int[] backoff_history = Support.sub_int_array(cur_wrds, 0, cur_wrds.length-1); // ignore the last word
                    double[] bow = new double[1];
                    boolean finalized_backoff = check_backoff_weight(backoff_history, bow, 0); // backoff weight is already added outside this function?
                    if (finalized_backoff == true) {
                        res_equi_state[i-1] = Symbol.NULL_LEFT_LM_STATE_SYM_ID; // no state, no bow, no est_cost
                    } else {
                        res_equi_state[i-1] = Symbol.BACKOFF_LEFT_LM_STATE_SYM_ID;
                    }*/
                }
            } else { // we do have a suffix
                for (int j = i; j > 0; j--) {
                    res_equi_state[j-1] = original_state_wrds[j-1];
                    cur_wrds = Support.sub_int_array(original_state_wrds, 0, j);
                    // estimated cost
                    res_est_cost += -ngramLogProbability(cur_wrds, cur_wrds.length);
                }
                break BACKWARD_SEARCH;
            }
        }

        cost[0] = res_final_cost;
        cost[1] = res_est_cost;
        return res_equi_state;
    }

    private boolean have_suffix(int[] words) {
        LMHash pos = root;
        // reverse search, starting from the second-to-last word
        for (int i = words.length-2; i >= 0; i--) {
            LMHash next_layer = (LMHash) pos.get(words[i] + this.symbolTable.getHighestID());
            if (null != next_layer) {
                pos = next_layer;
            } else {
                return false;
            }
        }
        Double prob = (Double) pos.get(words[words.length-1]);
        return (null != prob && prob <= MIN_LOG_P); // at or below MIN_LOG_P means the suffix flag is set
    }
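
    /* Example: for the left state [x y], leftEquivalentState scans prefixes
     * from the longest down.  If no n-gram in the model ends with "x y", then
     * any future left context must back off to logP(y | x), so that cost can
     * be charged as finalized (cost[0]) and y replaced by
     * BACKOFF_LEFT_LM_STATE_SYM_ID; if some n-gram does end with "x y", the
     * score is only an estimate (cost[1]) and the words stay in the state.
     */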
    protected double logProbabilityOfBackoffState_helper(int[] ngram_wrds, int order, int n_additional_bow) {
        int[] backoff_wrds = Support.sub_int_array(ngram_wrds, 0, ngram_wrds.length - 1);
        double[] sum_bow = new double[1];
        check_backoff_weight(backoff_wrds, sum_bow, n_additional_bow);
        return sum_bow[0];
    }


    // Accumulate (into sum_bow) the backoff weights that apply to backoff_words.
    // Returns true if the backoff weight is finalized, i.e., the full history has
    // no backoff weight of its own, so no longer context can change it.
    private boolean check_backoff_weight(int[] backoff_words, double[] sum_bow, int num_backoff) {
        if (backoff_words.length <= 0) return false;

        double sum = 0;
        LMHash pos = root;

        // the start index at which backoff should be applied
        int start_use_i = num_backoff - 1;

        Double bow = null;
        int i = backoff_words.length - 1;
        for ( ; i >= 0; i--) {
            LMHash next_layer = (LMHash) pos.get(backoff_words[i] + this.symbolTable.getHighestID());
            if (null != next_layer) {
                bow = (Double) next_layer.get(BACKOFF_WGHT_SYM_ID);
                if (null != bow && i <= start_use_i) {
                    sum += bow;
                }
                pos = next_layer;
            } else {
                break;
            }
        }
        sum_bow[0] = sum;

        // if the highest-order node has a backoff weight, we cannot finalize
        return (i != -1 || null == bow);
    }

    // ######################################## end left equiv state ###########################################


    // ######################################## general helper functions ###########################################

    protected final int[] replace_with_unk(int[] in) {
        int[] res = new int[in.length];
        for (int i = 0; i < in.length; i++) {
            res[i] = replace_with_unk(in[i]);
        }
        return res;
    }

    protected int replace_with_unk(int in) {
        if (root.containsKey(in)
        || in == LanguageModelFF.NULL_RIGHT_LM_STATE_SYM_ID
        || in == LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID) {
            return in;
        } else {
            return UNK_SYM_ID;
        }
    }


    // ######################################## read the LM grammar with the Java implementation ###########################################

    /* A backoff node is a hashtable; it may contain:
     * (1) probabilities for next words: the key is the (positive) word id
     * (2) pointers to next-layer backoff nodes (hashtables): the key is the word id
     *     plus getHighestID(), so it is disjoint from the word-id keys
     * (3) the backoff weight for this node
     * (4) a suffix flag indicating that some n-gram starts with this suffix
     */

    // read the grammar locally, using the Java implementation
    private void read_lm_grammar_from_file(String grammar_file) throws IOException {
        start_loading_time = System.currentTimeMillis();
        root = new LMHash();
        root.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT);

        if (logger.isLoggable(Level.INFO))
            logger.info("Reading grammar from file " + grammar_file);

        boolean start = false;
        int     order = 0;

        Regex blankLine  = new Regex("^\\s*$");
        Regex ngramsLine = new Regex("^\\\\\\d-grams:\\s*$");

        LineReader grammarReader = new LineReader(grammar_file);
        try {
            for (String line : grammarReader) {
                line = line.trim();
                if (blankLine.matches(line)) {
                    continue;
                }
                if (ngramsLine.matches(line)) { // \1-grams:
                    start = true;
                    order = Integer.parseInt(line.substring(1, 2)); // TODO: assumes a single-digit order
                    if (order > ngramOrder) {
                        break;
                    }
                    if (logger.isLoggable(Level.INFO))
                        logger.info("begin to read ngrams with order " + order);
                    continue; // skip this line
                }

                if (start) {
                    add_rule(line, order, g_is_add_suffix_infor, g_is_add_prefix_infor);
                }
            }
        } finally {
            grammarReader.close();
        }

        if (logger.isLoggable(Level.FINE)) {
            logger.fine("# of bow nodes: " + g_n_bow_nodes + " ; # of suffix nodes: " + g_n_suffix_nodes);
            logger.fine("add LMHash " + g_n_bow_nodes);
            logger.fine("##### mem used (kb): " + Support.getMemoryUse());
            logger.fine("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time) / 1000);
        }
    }
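
    /* For reference, a sketch of the expected input, assuming a standard
     * ARPA-format LM file:
     *
     *     \data\
     *     ngram 1=4
     *     ngram 2=2
     *
     *     \1-grams:
     *     -1.00  <unk>
     *     -0.52  a      -0.30
     *     ...
     *     \2-grams:
     *     -0.25  a b
     *
     *     \end\
     *
     * Only the lines after a "\N-grams:" header are consumed; each one is
     * handed to add_rule as "logprob word_1 ... word_N [backoff]".
     */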
    // format: prob \t ngram \t backoff-weight
    private void add_rule(String line, int order, boolean is_add_suffix_infor, boolean is_add_prefix_infor) {
        num_rule_read++;
        if (num_rule_read % 1000000 == 0) {
            if (logger.isLoggable(Level.FINE))
                logger.fine("read rules " + num_rule_read);
            //System.out.println("##### mem used (kb): " + Support.getMemoryUse());
            if (logger.isLoggable(Level.FINE))
                logger.fine("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time) / 1000);
        }
        String[] wrds = Regex.spaces.split(line.trim());

        if (wrds.length < order + 1 || wrds.length > order + 2) { // TODO: error
            //logger.severe("wrong line: " + line);
            return;
        }
        int last_word_id = this.symbolTable.addTerminal(wrds[order]);

        //##### identify the BOW position, insert backoff nodes if necessary, and add suffix information
        LMHash pos = root;
        // reverse search, starting from the second-to-last word
        for (int i = order - 1; i > 0; i--) {
            if (is_add_suffix_infor) {
                Double t_prob = (Double) pos.get(last_word_id);
                if (null != t_prob) {
                    if (t_prob > MIN_LOG_P) { // has a prob, but no suffix flag yet
                        double tem = t_prob + MIN_LOG_P;
                        pos.put(last_word_id, tem); // overwrite
                    }
                } else {
                    pos.put(last_word_id, SUFFIX_ONLY);
                }
            }
            int cur_sym_id = this.symbolTable.addTerminal(wrds[i]);
            //System.out.println(this.symbolTable.getHighestID());
            LMHash next_layer = (LMHash) pos.get(cur_sym_id + this.symbolTable.getHighestID());
            if (null != next_layer) {
                pos = next_layer;
            } else {
                LMHash new_tnode = new LMHash(); // create a new bow node
                pos.put(cur_sym_id + this.symbolTable.getHighestID(), new_tnode);
                pos = new_tnode;

                g_n_bow_nodes++;
                if (g_n_bow_nodes % 1000000 == 0) {
                    if (logger.isLoggable(Level.FINE))
                        logger.fine("add LMHash " + g_n_bow_nodes);
                    //System.out.println("##### mem used (kb): " + Support.getMemoryUse());
                    if (logger.isLoggable(Level.FINE))
                        logger.fine("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time) / 1000);
                }
            }
            if (! pos.containsKey(BACKOFF_WGHT_SYM_ID)) { // mark it as a backoff node, to distinguish it from a pure prefix node
                pos.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT);
            }
        }

        //##### add the probability
        if (is_add_suffix_infor && pos.containsKey(last_word_id)) {
            double tem = Double.parseDouble(wrds[0]) + MIN_LOG_P;
            pos.put(last_word_id, tem); // add the probability and the suffix flag
        } else {
            pos.put(last_word_id, Double.parseDouble(wrds[0])); // add the probability
        }

        //##### add prefix information; a prefix node is just like a BOW node
        if (is_add_prefix_infor) {
            pos.put(LM_HAVE_PREFIX_SYM_ID, 1);
            // for the prefixes [1, order-1]
            for (int i = 1; i < order-1; i++) { // ignore the last prefix
                pos = root; // reset pos
                for (int j = i; j >= 1; j--) { // reverse search over [1, i]
                    int cur_sym_id = this.symbolTable.addTerminal(wrds[j]);
                    LMHash next_layer = (LMHash) pos.get(cur_sym_id + this.symbolTable.getHighestID());
                    if (null != next_layer) {
                        pos = next_layer;
                    } else {
                        LMHash new_tnode = new LMHash(); // create a new prefix node
                        pos.put(cur_sym_id + this.symbolTable.getHighestID(), new_tnode);
                        pos = new_tnode;

                        g_n_bow_nodes++;
                        if (g_n_bow_nodes % 1000000 == 0) {
                            if (logger.isLoggable(Level.FINE))
                                logger.fine("add LMHash " + g_n_bow_nodes);
                            //System.out.println("##### mem used (kb): " + Support.getMemoryUse());
                            if (logger.isLoggable(Level.FINE))
                                logger.fine("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time) / 1000);
                        }
                    }
                }
                pos.put(LM_HAVE_PREFIX_SYM_ID, 1); // only the last node should have this flag
            }
        }
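
        /* Note on the suffix-flag encoding used above (an explanatory sketch):
         * one Double per word id packs both a log-prob and the "some longer
         * n-gram ends with this word" flag, using MIN_LOG_P = -9999 as an offset:
         *
         *     stored value v       meaning
         *     v >  MIN_LOG_P       plain log-prob, no suffix flag
         *     v == SUFFIX_ONLY     suffix flag only, no prob (SUFFIX_ONLY = 3 * MIN_LOG_P)
         *     otherwise            log-prob + MIN_LOG_P, i.e., a prob with the suffix flag
         *
         * get_valid_prob decodes this by subtracting MIN_LOG_P back out.
         */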
        //##### add the bow
        if (wrds.length == order + 2) { // have a bow weight to add
            pos = root;
            // reverse search, starting from the last word
            for (int i = order; i >= 1; i--) {
                int cur_sym_id = this.symbolTable.addTerminal(wrds[i]);
                LMHash next_layer = (LMHash) pos.get(cur_sym_id + this.symbolTable.getHighestID());
                if (null != next_layer) {
                    pos = next_layer;
                } else {
                    LMHash new_tnode = new LMHash(); // create a new bow node
                    pos.put(cur_sym_id + this.symbolTable.getHighestID(), new_tnode);
                    pos = new_tnode;

                    g_n_bow_nodes++;
                    if (g_n_bow_nodes % 1000000 == 0) {
                        if (logger.isLoggable(Level.FINE))
                            logger.fine("add LMHash " + g_n_bow_nodes);
                        //System.out.println("##### mem used (kb): " + Support.getMemoryUse());
                        if (logger.isLoggable(Level.FINE))
                            logger.fine("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time) / 1000);
                    }
                }

                // add the bow weight here
                if (i == 1) { // force-override the backoff weight
                    double backoff_weight = Double.parseDouble(wrds[order+1]);
                    pos.put(BACKOFF_WGHT_SYM_ID, backoff_weight);
                } else {
                    if (! pos.containsKey(BACKOFF_WGHT_SYM_ID)) { // mark it as a backoff node, to distinguish it from a pure prefix node
                        pos.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT);
                    }
                }
            }
        }
    }


    /* ###################### not used
    private boolean have_suffix_old(int[] words) {
        LMHash pos = root;
        int i = words.length-1;
        for ( ; i >= 0; i--) { // reverse search
            LMHash next_layer = (LMHash) pos.get(words[i] + p_symbol.getLMEndID());
            if (next_layer != null) {
                pos = next_layer;
            } else {
                break;
            }
        }
        if (i == -1 && pos.containsKey(Symbol.LM_HAVE_SUFFIX_SYM_ID))
            return true;
        else
            return false;
    }
    */


    /* Measured size per node:
     * in theory:   64 bytes  (init size is 5)
     * in practice: 86 bytes  (init size is 5)
     * in practice: 132 bytes (init size is 10)
     * in practice: 211 bytes (init size is 20)
     *
     * Important note: if we used tbl.put(key, new Integer(1)) instead of
     * tbl.put(key, (new Integer(1)).intValue()), then we would waste 16 bytes
     * per element for the boxed object, and the GC could not collect it,
     * because the hashtable still references it.
     */
    private static class LMHash // 4 bytes
    {
        // Note: keys must be positive integers, and values must not be null.
        /* If keys could be both positive and negative, there would be many
         * collisions, and get() could take very long: imagine putting all the
         * numbers in [1,20000] into the hashtable, then calling get() on the
         * numbers in [-20000,-1]. */
        // TODO: should we round the array size to a prime number?
        static double load_factor = 0.6;
        static int default_init_size = 5;

        int size = 0; // 8 bytes?
        int[]    key_array; // the pointer itself is 4 bytes?; each new array adds ~10 bytes of header, plus the ints themselves
        Object[] val_array; // the pointer itself is 4 bytes?; each new array adds ~10 bytes of header, plus the objects themselves

        public LMHash(int init_size) {
            key_array = new int[init_size];
            val_array = new Object[init_size];
        }

        public LMHash() {
            key_array = new int[default_init_size];
            val_array = new Object[default_init_size];
        }

        // return the in-range position for the key (keys must be positive)
        private int hash_pos(int key, int length) {
            //return Math.abs(key % length);
            return key % length;
        }

        public Object get(int key) {
            Object res = null;
            int pos = hash_pos(key, key_array.length);
            while (key_array[pos] != 0) { // search until an empty cell
                if (key_array[pos] == key) {
                    return val_array[pos]; // found
                }
                pos++; // linear probing
                pos = hash_pos(pos, key_array.length);
            }
            return res;
        }

        public boolean containsKey(int key) {
            return (null != get(key));
        }

        public int size() {
            return size;
        }

        public void put(int key, Object value) {
            if (null == value) {
                throw new IllegalArgumentException("LMHash, value is null");
            }
            int pos = hash_pos(key, key_array.length);
            while (key_array[pos] != 0) { // search until an empty cell
                if (key_array[pos] == key) {
                    val_array[pos] = value; // found: overwrite
                    return;
                }
                pos++; // linear probing
                pos = hash_pos(pos, key_array.length);
            }
            // if we get here, the key is not in the table yet; insert it
            //data_array[pos] = new LMItem(key, value);
            key_array[pos] = key;
            val_array[pos] = value;
            size++;
            if (size >= key_array.length * load_factor) {
                expand_tbl();
            }
        }
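
        /* Usage sketch: LMHash is a minimal open-addressing map from positive
         * int keys to non-null Objects, using linear probing, where key 0
         * marks an empty slot:
         *
         *     LMHash h = new LMHash();
         *     h.put(42, 0.5);                  // value is autoboxed to a Double
         *     Double p = (Double) h.get(42);   // 0.5
         *     h.get(7);                        // null: probed to an empty slot
         *
         * Keys must never be 0 (reserved for "empty") or negative (hash_pos
         * does not take an absolute value); see the notes above.
         */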
        private void expand_tbl() {
            int new_size = key_array.length * 2 + 1; // TODO
            int[]    new_key_array = new int[new_size];
            Object[] new_val_array = new Object[new_size];

            for (int i = 0; i < key_array.length; i++) {
                if (key_array[i] != 0) { // re-add the element
                    int pos = hash_pos(key_array[i], new_key_array.length);
                    // find the first empty position; note that it is not possible that we need to overwrite
                    while (new_key_array[pos] != 0) {
                        pos++; // linear probing
                        pos = hash_pos(pos, new_key_array.length);
                    }
                    new_key_array[pos] = key_array[i];
                    new_val_array[pos] = val_array[i];
                }
            }
            key_array = new_key_array;
            val_array = new_val_array;
        }
    }
}