package ivory.sqe.querygenerator;

import ivory.core.ConfigurationException;
import ivory.core.RetrievalEnvironment;
import ivory.core.tokenize.BigramChineseTokenizer;
import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import ivory.sqe.retrieval.Constants;
import ivory.sqe.retrieval.StructuredQuery;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import tl.lin.data.map.HMapStFW;
import tl.lin.data.map.MapKF;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonPrimitive;

/**
 * In addition to the probabilistic structured queries (PSQ) implemented in
 * {@link CLWordQueryGenerator}, phrase translations are learned from a provided
 * synchronous context-free grammar (SCFG), as in Hiero (Chiang, 2007), and added to
 * the query representation.
 *
 * @author ferhanture
 */
public class SCFGQueryGenerator implements QueryGenerator {
  private static final Logger LOG = Logger.getLogger(SCFGQueryGenerator.class);

  private Tokenizer defaultTokenizer, docLangTokenizer, queryLangTokenizerWithStemming,
      queryLangTokenizer, bigramTokenizer;
  private Map<String, Map<String, HMapStFW>> query2probMap;
  private int length, numTransPerToken;
  private boolean isDocStemmed, isStemming, bigramSegment = false;
  private RetrievalEnvironment env;
  private String queryLang, docLang;
  // private List<String> originalQueries;
  private float lexProbThreshold, cumProbThreshold;

  public SCFGQueryGenerator() throws IOException {
    super();
  }

  @Override
  public void init(FileSystem fs, Configuration conf) throws IOException {
    if (conf.getBoolean(Constants.Quiet, false)) {
      LOG.setLevel(Level.OFF);
    }
    LOG.info(conf.get(Constants.DocLanguage));
    LOG.info(conf.get(Constants.DocTokenizerData));
    LOG.info(conf.get(Constants.MinWindow));
    LOG.info(conf.get(Constants.MaxWindow));
    LOG.info(conf.get(Constants.GrammarWeight));
    LOG.info("Stemmed stopword list file in query-language: "
        + conf.get(Constants.StemmedStopwordListQ));
    LOG.info("Stemmed stopword list file in doc-language: "
        + conf.get(Constants.StemmedStopwordListD));

    numTransPerToken = conf.getInt(Constants.NumTransPerToken, Integer.MAX_VALUE);
    queryLang = conf.get(Constants.QueryLanguage);
    docLang = conf.get(Constants.DocLanguage);
    lexProbThreshold = conf.getFloat(Constants.LexicalProbThreshold, 0f);
    cumProbThreshold = conf.getFloat(Constants.CumulativeProbThreshold, 1f);
    // Initialize the per-query translation-table cache; without this,
    // getBestTranslation() would fail with a NullPointerException.
    query2probMap = new HashMap<String, Map<String, HMapStFW>>();

    String bigram = conf.get(Constants.BigramSegment);
    bigramSegment = (bigram != null && bigram.equals("on"));
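    // When enabled, translation alternatives are re-segmented into character bigrams
    // (see BigramChineseTokenizer below), an option intended for Chinese collections.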
    if (bigramSegment) {
      bigramTokenizer = new BigramChineseTokenizer();
    }
    LOG.info("Bigram segmentation = " + bigramSegment);

    // Initialize environment to access the index.
    try {
      env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);
      env.initialize(true);
    } catch (ConfigurationException e) {
      e.printStackTrace();
    }

    isDocStemmed = conf.getBoolean(Constants.IsDocStemmed, false);
    isStemming = conf.getBoolean(Constants.IsStemming, false);

    queryLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, queryLang,
        conf.get(Constants.QueryTokenizerData), false, null, null, null);
    queryLangTokenizerWithStemming = TokenizerFactory.createTokenizer(fs, conf, queryLang,
        conf.get(Constants.QueryTokenizerData), true, null,
        conf.get(Constants.StemmedStopwordListQ), null);
    if (isStemming) {
      defaultTokenizer = queryLangTokenizerWithStemming;
    } else {
      defaultTokenizer = queryLangTokenizer;
    }
    docLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, docLang,
        conf.get(Constants.DocTokenizerData), true, null,
        conf.get(Constants.StemmedStopwordListD), null);
  }

  @Override
  public StructuredQuery parseQuery(String query, FileSystem fs, Configuration conf) {
    JsonObject queryJson = new JsonObject();

    String origQuery = query.trim().split("\\|\\|\\|\\|")[0].trim();
    String grammarFile = conf.get(Constants.GrammarPath);
    Map<String, HMapStFW> probMap = processGrammar(fs, conf, grammarFile);
    // Cache the per-query translation table so that getBestTranslation() can find it.
    query2probMap.put(origQuery, probMap);
    Map<String, String> stemmed2Stemmed = Utils.getStemMapping(origQuery, defaultTokenizer,
        docLangTokenizer);

    JsonArray tokenTranslations = new JsonArray();
    String[] tokens = queryLangTokenizerWithStemming.processContent(origQuery);
    length = tokens.length;
    for (String token : tokens) {
      if (queryLangTokenizerWithStemming.isStopWord(token)) {
        continue;
      }
      // If we do bigram segmentation on translation alternatives, we have to have a
      // weight structure, even if k = 1 (unless the token has fewer than 3 characters,
      // in which case bigram segmentation will only return the token itself).
      if (numTransPerToken == 1 && !bigramSegment) {
        String trans = getBestTranslation(origQuery, token);
        if (trans != null) {
          tokenTranslations.add(new JsonPrimitive(trans));
        }
      } else {
        JsonObject tokenTrans = new JsonObject();
        JsonArray weights = Utils.createJsonArrayFromProbabilities(
            getTranslations(origQuery, token, probMap, stemmed2Stemmed));
        if (weights != null) {
          tokenTrans.add("#weight", weights);
          tokenTranslations.add(tokenTrans);
        }
      }
    }

    queryJson.add("#combine", tokenTranslations);
    return new StructuredQuery(queryJson, length);
  }

  public Map<String, HMapStFW> processGrammar(FileSystem fs, Configuration conf,
      String grammarFile) {
    Map<String, HMapStFW> probMap = Utils.generateTranslationTable(fs, conf, grammarFile,
        queryLangTokenizerWithStemming, docLangTokenizer);
    if (probMap == null) {
      LOG.info("No probabilities extracted from " + grammarFile);
    } else {
      // Remove low-probability entries, cut off after the cumulative probability
      // threshold, and normalize; 30 caps the number of alternatives kept per token.
      Utils.normalize(probMap, lexProbThreshold, cumProbThreshold, 30);
    }
    return probMap;
  }

  private String getBestTranslation(String query, String token) {
    Map<String, HMapStFW> tokenMap = query2probMap.get(query);
    HMapStFW probDist = (tokenMap == null) ? null : tokenMap.get(token);
    if (probDist == null) {
      return token;
    }
    float maxProb = 0f;
    String maxProbTrans = null;
    for (MapKF.Entry<String> entry : probDist.entrySet()) {
      if (entry.getValue() > maxProb) {
        maxProb = entry.getValue();
        maxProbTrans = entry.getKey();
      }
    }
    return maxProbTrans;
  }

  protected HMapStFW getTranslations(String query, String token,
      Map<String, HMapStFW> probMap, Map<String, String> stemmed2Stemmed) {
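    // Look up the translation distribution extracted from the grammar for this token;
    // a missing entry means the grammar contained no rule covering it.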
    HMapStFW probDist = (probMap == null) ? null : probMap.get(token);
    if (probMap == null) {
      LOG.info("Prob map not found for " + query);
    }
    if (probDist == null) {
      // Borrow the OOV-word heuristic from MT: if no translation is found,
      // include the token itself as its own translation.
      probDist = new HMapStFW();
      String targetStem = (stemmed2Stemmed == null) ? null : stemmed2Stemmed.get(token);
      String target = (targetStem == null) ? token : targetStem;
      probDist.put(target, 1f);
    }
    return probDist;
  }
}
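
// A minimal usage sketch (illustrative only, not part of the original class). It assumes
// a Hadoop Configuration already populated with the Constants.* keys read in init() above
// (Constants.IndexPath, Constants.GrammarPath, tokenizer data paths, language codes, etc.);
// the paths shown are hypothetical.
//
//   Configuration conf = new Configuration();
//   conf.set(Constants.IndexPath, "/hypothetical/index");
//   conf.set(Constants.GrammarPath, "/hypothetical/grammar.scfg");
//   FileSystem fs = FileSystem.get(conf);
//
//   SCFGQueryGenerator generator = new SCFGQueryGenerator();
//   generator.init(fs, conf);
//   // parseQuery() treats the text before an optional "||||" separator as the query.
//   StructuredQuery structured = generator.parseQuery("query text ||||", fs, conf);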