package ivory.sqe.querygenerator;

import ivory.core.ConfigurationException;
import ivory.core.RetrievalEnvironment;
import ivory.core.tokenize.BigramChineseTokenizer;
import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import ivory.sqe.retrieval.Constants;
import ivory.sqe.retrieval.StructuredQuery;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import tl.lin.data.map.HMapStFW;
import tl.lin.data.map.MapKF;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonPrimitive;

/**
 * In addition to the probabilistic structured queries (PSQ) implemented in
 * {@link CLWordQueryGenerator}, phrase translations are learned from a provided
 * synchronous context-free grammar (SCFG), as in Hiero (Chiang, 2007), and added to
 * the query representation.
 *
 * @author ferhanture
 */
public class SCFGQueryGenerator implements QueryGenerator {
  private static final Logger LOG = Logger.getLogger(SCFGQueryGenerator.class);

  private Tokenizer defaultTokenizer, docLangTokenizer, queryLangTokenizerWithStemming,
      queryLangTokenizer, bigramTokenizer;
  private Map<String, Map<String, HMapStFW>> query2probMap;
  private int length, numTransPerToken;
  private boolean isDocStemmed, isStemming, bigramSegment = false;
  private RetrievalEnvironment env;
  private String queryLang, docLang;
  // private List<String> originalQueries;
  private float lexProbThreshold, cumProbThreshold;

  public SCFGQueryGenerator() throws IOException {
    super();
  }

  @Override
  public void init(FileSystem fs, Configuration conf) throws IOException {
    if (conf.getBoolean(Constants.Quiet, false)) {
      LOG.setLevel(Level.OFF);
    }
    LOG.info(conf.get(Constants.DocLanguage));
    LOG.info(conf.get(Constants.DocTokenizerData));
    LOG.info(conf.get(Constants.MinWindow));
    LOG.info(conf.get(Constants.MaxWindow));
    LOG.info(conf.get(Constants.GrammarWeight));
    LOG.info("Stemmed stopword list file in query-language: "
        + conf.get(Constants.StemmedStopwordListQ));
    LOG.info("Stemmed stopword list file in doc-language: "
        + conf.get(Constants.StemmedStopwordListD));

    numTransPerToken = conf.getInt(Constants.NumTransPerToken, Integer.MAX_VALUE);
    queryLang = conf.get(Constants.QueryLanguage);
    docLang = conf.get(Constants.DocLanguage);
    lexProbThreshold = conf.getFloat(Constants.LexicalProbThreshold, 0f);
    cumProbThreshold = conf.getFloat(Constants.CumulativeProbThreshold, 1f);
    // Initialize the per-query translation-table cache; without this,
    // getBestTranslation() would fail with a NullPointerException.
    query2probMap = new HashMap<String, Map<String, HMapStFW>>();

    String bigram = conf.get(Constants.BigramSegment);
    bigramSegment = (bigram != null && bigram.equals("on"));
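    // When enabled, translation alternatives are re-segmented into character bigrams
    // (see BigramChineseTokenizer below), an option intended for Chinese collections.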
    if (bigramSegment) {
      bigramTokenizer = new BigramChineseTokenizer();
    }
    LOG.info("Bigram segmentation = " + bigramSegment);

    // Initialize environment to access the index.
    try {
      env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);
      env.initialize(true);
    } catch (ConfigurationException e) {
      e.printStackTrace();
    }

    isDocStemmed = conf.getBoolean(Constants.IsDocStemmed, false);
    isStemming = conf.getBoolean(Constants.IsStemming, false);

    queryLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, queryLang,
        conf.get(Constants.QueryTokenizerData), false, null, null, null);
    queryLangTokenizerWithStemming = TokenizerFactory.createTokenizer(fs, conf, queryLang,
        conf.get(Constants.QueryTokenizerData), true, null,
        conf.get(Constants.StemmedStopwordListQ), null);
    if (isStemming) {
      defaultTokenizer = queryLangTokenizerWithStemming;
    } else {
      defaultTokenizer = queryLangTokenizer;
    }
    docLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, docLang,
        conf.get(Constants.DocTokenizerData), true, null,
        conf.get(Constants.StemmedStopwordListD), null);
  }

  @Override
  public StructuredQuery parseQuery(String query, FileSystem fs, Configuration conf) {
    JsonObject queryJson = new JsonObject();

    String origQuery = query.trim().split("\\|\\|\\|\\|")[0].trim();
    String grammarFile = conf.get(Constants.GrammarPath);
    Map<String, HMapStFW> probMap = processGrammar(fs, conf, grammarFile);
    // Cache the per-query translation table so that getBestTranslation() can find it.
    query2probMap.put(origQuery, probMap);
    Map<String, String> stemmed2Stemmed = Utils.getStemMapping(origQuery, defaultTokenizer,
        docLangTokenizer);

    JsonArray tokenTranslations = new JsonArray();
    String[] tokens = queryLangTokenizerWithStemming.processContent(origQuery);
    length = tokens.length;
    for (String token : tokens) {
      if (queryLangTokenizerWithStemming.isStopWord(token)) {
        continue;
      }
      // If we do bigram segmentation on translation alternatives, we have to have a
      // weight structure, even if k = 1 (unless the token has fewer than 3 characters,
      // in which case bigram segmentation will only return the token itself).
      if (numTransPerToken == 1 && !bigramSegment) {
        String trans = getBestTranslation(origQuery, token);
        if (trans != null) {
          tokenTranslations.add(new JsonPrimitive(trans));
        }
      } else {
        JsonObject tokenTrans = new JsonObject();
        JsonArray weights = Utils.createJsonArrayFromProbabilities(
            getTranslations(origQuery, token, probMap, stemmed2Stemmed));
        if (weights != null) {
          tokenTrans.add("#weight", weights);
          tokenTranslations.add(tokenTrans);
        }
      }
    }

    queryJson.add("#combine", tokenTranslations);
    return new StructuredQuery(queryJson, length);
  }

  public Map<String, HMapStFW> processGrammar(FileSystem fs, Configuration conf,
      String grammarFile) {
    Map<String, HMapStFW> probMap = Utils.generateTranslationTable(fs, conf, grammarFile,
        queryLangTokenizerWithStemming, docLangTokenizer);
    if (probMap == null) {
      LOG.info("No probabilities extracted from " + grammarFile);
    } else {
      // Remove low-probability entries, cut off after the cumulative probability
      // threshold, and normalize; 30 caps the number of alternatives kept per token.
      Utils.normalize(probMap, lexProbThreshold, cumProbThreshold, 30);
    }
    return probMap;
  }

  private String getBestTranslation(String query, String token) {
    Map<String, HMapStFW> tokenMap = query2probMap.get(query);
    HMapStFW probDist = (tokenMap == null) ? null : tokenMap.get(token);
    if (probDist == null) {
      return token;
    }
    float maxProb = 0f;
    String maxProbTrans = null;
    for (MapKF.Entry<String> entry : probDist.entrySet()) {
      if (entry.getValue() > maxProb) {
        maxProb = entry.getValue();
        maxProbTrans = entry.getKey();
      }
    }
    return maxProbTrans;
  }

  protected HMapStFW getTranslations(String query, String token,
      Map<String, HMapStFW> probMap, Map<String, String> stemmed2Stemmed) {
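    // Look up the translation distribution extracted from the grammar for this token;
    // a missing entry means the grammar contained no rule covering it.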
    HMapStFW probDist = (probMap == null) ? null : probMap.get(token);
    if (probMap == null) {
      LOG.info("Prob map not found for " + query);
    }
    if (probDist == null) {
      // Borrow the OOV-word heuristic from MT: if no translation is found,
      // include the token itself as its own translation.
      probDist = new HMapStFW();
      String targetStem = (stemmed2Stemmed == null) ? null : stemmed2Stemmed.get(token);
      String target = (targetStem == null) ? token : targetStem;
      probDist.put(target, 1f);
    }
    return probDist;
  }
}
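
// A minimal usage sketch (illustrative only, not part of the original class). It assumes
// a Hadoop Configuration already populated with the Constants.* keys read in init() above
// (Constants.IndexPath, Constants.GrammarPath, tokenizer data paths, language codes, etc.);
// the paths shown are hypothetical.
//
//   Configuration conf = new Configuration();
//   conf.set(Constants.IndexPath, "/hypothetical/index");
//   conf.set(Constants.GrammarPath, "/hypothetical/grammar.scfg");
//   FileSystem fs = FileSystem.get(conf);
//
//   SCFGQueryGenerator generator = new SCFGQueryGenerator();
//   generator.init(fs, conf);
//   // parseQuery() treats the text before an optional "||||" separator as the query.
//   StructuredQuery structured = generator.parseQuery("query text ||||", fs, conf);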