package ivory.sqe.querygenerator;

import ivory.core.tokenize.Tokenizer;
import ivory.sqe.retrieval.Constants;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Logger;

import tl.lin.data.map.HMapStFW;
import tl.lin.data.map.HMapStIW;

public class TranslationFactory {
  private static final Logger LOG = Logger.getLogger(TranslationFactory.class);

  /**
   * Builds a {@link Translation} from the n-best translation list of a query.
   * The input is the original query followed by n translation hypotheses, all
   * separated by "||||"; within each hypothesis, the model score comes first,
   * followed by the translation rules, separated by ";;;".
   */
  public static Translation readTranslationsFromNBest(String queryRepresentation, float alpha,
      Set<String> unknownWords, Tokenizer queryLangTokenizer, Tokenizer docLangTokenizer,
      Configuration conf) {
    String[] arr = queryRepresentation.trim().split("\\|\\|\\|\\|");
    String origQuery = arr[0];
    Map<String, String> stemmed2Stemmed =
        Utils.getStemMapping(origQuery, queryLangTokenizer, docLangTokenizer);

    int n = arr.length - 1;
    if (n <= 0) {
      throw new RuntimeException("Bad query format!: " + queryRepresentation);
    }

    // Apply a discount on the log-probabilities (shift by the score of the best
    // hypothesis) to avoid floating-point errors when exponentiating.
    String[] line = arr[1].trim().split(";;;");
    float discount = -Float.parseFloat(line[0]);

    // Convert each hypothesis score into an unnormalized probability, scaled by alpha.
    float[] transProbs = new float[n];
    float sumOfProbs = 0;
    for (int k = 0; k < n; k++) {
      line = arr[k + 1].trim().split(";;;");
      transProbs[k] = (float) Math.pow(Math.E, alpha * (Float.parseFloat(line[0]) + discount));
      sumOfProbs += transProbs[k];
    }

    boolean isPhrase = conf.getInt(Constants.MaxWindow, 0) > 0;
    int one2many = conf.getInt(Constants.One2Many, 2);

    // src token --> (trg token --> prob(trg|src))
    Map<String, HMapStFW> token2tokenDist = new HashMap<String, HMapStFW>();
    // target phrase --> prob
    HMapStFW phraseDist = new HMapStFW();
    HMapStIW srcTokenCnt = new HMapStIW();
    Set<String> bagOfTargetTokens = new HashSet<String>();

    for (int k = 0; k < n; k++) {
      // Normalize the kth hypothesis probability over the n-best list.
      transProbs[k] = transProbs[k] / sumOfProbs;
      line = arr[k + 1].trim().split(";;;");
      // The first element of line is the score of the kth best translation;
      // the remaining elements are its translation rules.
      for (int i = 1; i < line.length; i++) {
        try {
          Utils.processRule(one2many, isPhrase, transProbs[k], line[i], bagOfTargetTokens,
              token2tokenDist, phraseDist, srcTokenCnt, queryLangTokenizer, docLangTokenizer,
              stemmed2Stemmed, unknownWords);
        } catch (Exception e) {
          e.printStackTrace();
          LOG.info("Error while processing rule: " + line[i]);
        }
      }
    }

    // Normalize the token-level distributions and filter out low-probability entries.
    Utils.normalize(token2tokenDist, conf.getFloat(Constants.LexicalProbThreshold, 0),
        conf.getFloat(Constants.CumulativeProbThreshold, 1f), 30);
    Utils.filter(phraseDist, conf.getFloat(Constants.LexicalProbThreshold, 0));

    return new TranslationFromNBest(n, origQuery, stemmed2Stemmed, bagOfTargetTokens,
        token2tokenDist, phraseDist, srcTokenCnt);
  }
}
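
/*
 * Hypothetical usage sketch (not part of the original source): it shows how a caller
 * might invoke readTranslationsFromNBest, assuming a Hadoop Configuration populated
 * with the Constants.* keys read above and tokenizers for the query and document
 * languages. The tokenizer construction and the n-best string are placeholders, not
 * the project's actual API or data format beyond what the parsing code above implies.
 *
 *   Configuration conf = new Configuration();
 *   conf.setInt(Constants.One2Many, 2);
 *   conf.setInt(Constants.MaxWindow, 0);                     // 0 = token-based, >0 = phrase-based
 *   conf.setFloat(Constants.LexicalProbThreshold, 0.01f);
 *   conf.setFloat(Constants.CumulativeProbThreshold, 0.95f);
 *
 *   Tokenizer queryLangTokenizer = ...;   // query-language tokenizer
 *   Tokenizer docLangTokenizer = ...;     // document-language tokenizer
 *   Set<String> unknownWords = new HashSet<String>();
 *
 *   // nbestLine: "<orig query> |||| <score>;;;<rule>;;;<rule> |||| <score>;;;<rule> ..."
 *   Translation translation = TranslationFactory.readTranslationsFromNBest(
 *       nbestLine, 1.0f, unknownWords, queryLangTokenizer, docLangTokenizer, conf);
 */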