PreprocessHelper.java example

Explorer
Ivory-master
- src
  - java
package ivory.lsh.bitext;

import ivory.core.RetrievalEnvironment;
import ivory.core.data.dictionary.DefaultFrequencySortedDictionary;
import ivory.core.data.stat.DfTableArray;
import ivory.core.tokenize.Tokenizer;
import ivory.core.tokenize.TokenizerFactory;
import ivory.core.util.CLIRUtils;
import ivory.pwsim.score.Bm25;
import ivory.pwsim.score.ScoringModel;

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

import opennlp.model.MaxentModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import tl.lin.data.array.ArrayListOfIntsWritable;
import tl.lin.data.array.ArrayListWritable;
import tl.lin.data.map.HMapIFW;
import tl.lin.data.map.HMapStFW;
import tl.lin.data.map.HMapStIW;
import tl.lin.data.map.MapKI;

import com.google.common.collect.Maps;

import edu.umd.hooka.Vocab;
import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;

public class PreprocessHelper {
  private String eLang, fLang, eDir;
  private int MinVectorTerms, MinSentenceLength;
  private SentenceDetectorME fModel, eModel;
  private Tokenizer fTok, eTok;
  private VocabularyWritable eVocabSrc, eVocabTrg, fVocabTrg, fVocabSrc;
  private TTable_monolithic_IFAs f2e_Probs;
  private TTable_monolithic_IFAs e2f_Probs;
  private ScoringModel fScoreFn, eScoreFn;
  private MaxentModel classifier;
  private DfTableArray dfTable;
  private DefaultFrequencySortedDictionary dict;
  private final Logger sLogger = Logger.getLogger(PreprocessHelper.class);
  private static final HMapStIW lang2AvgSentLen = new HMapStIW();
  static {
    // took average # of tokens per sentence in Wikipedia data
    lang2AvgSentLen.put("en",21);       
    lang2AvgSentLen.put("de",16);     
    lang2AvgSentLen.put("zh",27);       
    lang2AvgSentLen.put("fr",18);     
    lang2AvgSentLen.put("tr",12);    
    lang2AvgSentLen.put("ar",22);  
    lang2AvgSentLen.put("es",19);     
    // set to same as fr for now (no data yet)
    lang2AvgSentLen.put("cs",18);       
  };

  /**
   * Implemented for HDFS cluster mode, files read from local cache
   */
  public PreprocessHelper(int minVectorTerms, int minSentenceLength, JobConf conf) throws Exception {
    super();
    sLogger.setLevel(Level.INFO);
    fLang = conf.get("fLang");
    eLang = conf.get("eLang");
    eDir = conf.get("eDir");

    MinVectorTerms = minVectorTerms;
    MinSentenceLength = minSentenceLength;
    loadModels(conf);
  }

  /**
   * Implemented for non-cluster mode, files read directly from local FS
   */
  public PreprocessHelper(int minVectorTerms, int minSentenceLength, Configuration conf) throws Exception {
    super();
    //    sLogger.setLevel(Level.DEBUG);
    fLang = conf.get("fLang");
    eLang = conf.get("eLang");
    eDir = conf.get("eDir");

    MinVectorTerms = minVectorTerms;
    MinSentenceLength = minSentenceLength;
    loadFModels(conf);
    loadEModels(conf);
  }

  public void loadModels(JobConf job) throws Exception{
    loadFModels(job);
    loadEModels(job);
  }

  @SuppressWarnings("deprecation")
  private void loadFModels(JobConf conf) throws Exception {
    sLogger.info("Loading models for " + fLang + " ...");

    FileSystem fs = FileSystem.get(conf);
    FileSystem localFs = FileSystem.getLocal(conf);
    Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);

    String sentDetectorFile = getSentDetectorFile(fLang); //localFiles[6].toString();
    String eVocabSrcFile = getSrcVocab(eLang, fLang);           //localFiles[3].toString();
    String eVocabTrgFile = getTrgVocab(fLang, eLang);           //localFiles[4].toString();
    String fVocabSrcFile = getSrcVocab(fLang, eLang);           //localFiles[7].toString();
    String fVocabTrgFile = getTrgVocab(eLang, fLang);           //localFiles[8].toString();
    String f2e_ttableFile = getTTable(fLang, eLang);          //localFiles[9].toString();
    String e2f_ttableFile = getTTable(eLang, fLang);          //localFiles[10].toString();
    String modelFileName = getClassifierFile();           //localFiles[12].toString();

    Map<String, Path> pathMapping = Maps.newHashMap();
    for (Path p : localFiles) {
      sLogger.info("In DistributedCache: " + p);
      if (p.toString().contains(sentDetectorFile)) {
        pathMapping.put(sentDetectorFile, p);
        sLogger.info("--> sentdetector");
      } else if (p.toString().contains(eVocabSrcFile)) {
        pathMapping.put(eVocabSrcFile, p);
        sLogger.info("--> eVocabSrcFile");
      } else if (p.toString().contains(eVocabTrgFile)) {
        pathMapping.put(eVocabTrgFile, p);
        sLogger.info("--> eVocabTrgFile");
      } else if (p.toString().contains(fVocabSrcFile)) {
        pathMapping.put(fVocabSrcFile, p);
        sLogger.info("--> fVocabSrcFile");
      } else if (p.toString().contains(fVocabTrgFile)) {
        pathMapping.put(fVocabTrgFile, p);
        sLogger.info("--> fVocabTrgFile");
      } else if (p.toString().contains(f2e_ttableFile)) {
        pathMapping.put(f2e_ttableFile, p);
        sLogger.info("--> f2e_ttableFile");
      } else if (p.toString().contains(e2f_ttableFile)) {
        pathMapping.put(e2f_ttableFile, p);
        sLogger.info("--> e2f_ttableFile");
      } else if (p.toString().contains(modelFileName)) {
        pathMapping.put(modelFileName, p);
        sLogger.info("--> classifier model");
      }
    }

    InputStream modelIn = localFs.open(pathMapping.get(sentDetectorFile));
    SentenceModel model = new SentenceModel(modelIn);
    fModel = new SentenceDetectorME(model);
    sLogger.info("Sentence model created successfully from " + pathMapping.get(sentDetectorFile));

    eVocabSrc = (VocabularyWritable) HadoopAlign.loadVocab(pathMapping.get(eVocabSrcFile), localFs);
    eVocabTrg = (VocabularyWritable) HadoopAlign.loadVocab(pathMapping.get(eVocabTrgFile), localFs);
    fVocabSrc = (VocabularyWritable) HadoopAlign.loadVocab(pathMapping.get(fVocabSrcFile), localFs);
    fVocabTrg = (VocabularyWritable) HadoopAlign.loadVocab(pathMapping.get(fVocabTrgFile), localFs);         
    f2e_Probs = new TTable_monolithic_IFAs(localFs, pathMapping.get(f2e_ttableFile), true);
    e2f_Probs = new TTable_monolithic_IFAs(localFs, pathMapping.get(e2f_ttableFile), true);

    // tokenizer file not read from cache, since it might be a directory (e.g. Chinese segmenter)
    String tokenizerFile = conf.get("fTokenizer");
    fTok = TokenizerFactory.createTokenizer(fs, fLang, tokenizerFile, true, conf.get("fStopword"), conf.get("fStemmedStopword"), null);
    sLogger.info("Tokenizer and vocabs created successfully from " + fLang + " " + tokenizerFile + "," + conf.get("fStopword") + "," + conf.get("fStemmedStopword"));

    // average sentence length = just a heuristic derived from sample text
    fScoreFn = (ScoringModel) new Bm25();
    fScoreFn.setAvgDocLength(lang2AvgSentLen.get(fLang));         

    // we use df table of English side, so we should read collection doc count from English dir
    RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, fs);
    fScoreFn.setDocCount(eEnv.readCollectionDocumentCount());   

    if (pathMapping.containsKey(modelFileName)) {
      classifier = new MoreGenericModelReader(pathMapping.get(modelFileName), localFs).constructModel();
      sLogger.info("Bitext classifier created successfully from " + pathMapping.get(modelFileName));
    }
  }

  private void loadEModels(JobConf conf) throws Exception {
    sLogger.info("Loading models for " + eLang + " ...");

    String sentDetectorFile = getSentDetectorFile(eLang); //localFiles[1].toString();

    Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
    Map<String, Path> pathMapping = Maps.newHashMap();
    for (Path p : localFiles) {
      sLogger.info("In DistributedCache: " + p);
      if (p.toString().contains(sentDetectorFile)) {
        pathMapping.put(sentDetectorFile, p);
        sLogger.info("--> sentdetector");
      } 
    }

    FileSystem localFs = FileSystem.getLocal(conf);
    InputStream modelIn = localFs.open(pathMapping.get(sentDetectorFile));
    SentenceModel model = new SentenceModel(modelIn);
    eModel = new SentenceDetectorME(model);
    sLogger.info("Sentence model created successfully.");

    FileSystem fs = FileSystem.get(conf);   
    RetrievalEnvironment env = new RetrievalEnvironment(eDir, fs);
    sLogger.info("Environment created successfully at " + eDir);

    String tokenizerFile = conf.get("eTokenizer");
    eTok = TokenizerFactory.createTokenizer(fs, eLang, tokenizerFile, true, conf.get("eStopword"), conf.get("eStemmedStopword"), null);
    sLogger.info("Tokenizer and vocabs created successfully from " + eLang + " " + tokenizerFile + "," + conf.get("eStopword") + "," + conf.get("eStemmedStopword"));

    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), fs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), fs);
  }

  public HMapStFW createFDocVector(String sentence) {
    return createFDocVector(sentence, new HMapStIW());
  }

  public HMapStFW createFDocVector(String sentence, HMapStIW term2Tf) {
    String[] terms = fTok.processContent(sentence);

    for(String term : terms){
      term2Tf.increment(term);
    }

    //translated tf values
    HMapIFW transTermTf = new HMapIFW();
    for(MapKI.Entry<String> entry : term2Tf.entrySet()){
      String fTerm = entry.getKey();
      int tf = entry.getValue();
      // transTermTf won't be updated if fTerm not in vocab
      transTermTf = CLIRUtils.updateTFsByTerm(fTerm, tf, transTermTf, eVocabSrc, eVocabTrg, fVocabSrc, fVocabTrg, e2f_Probs, f2e_Probs, eTok, sLogger);
    }

    HMapStFW weightedVector = CLIRUtils.createTermDocVector(terms.length, transTermTf, eVocabTrg, fScoreFn, dict, dfTable, true, sLogger);

    // don't count numbers for the min #terms constraint since Wikipedia has "sentences" full of numbers that doesn't make any sense
    int numNonNumbers = 0;
    for(String term : weightedVector.keySet()){      
      if (!term.matches("\\d+")) {
        numNonNumbers++;
      }
    }
    if(numNonNumbers < MinVectorTerms){
      return null;
    }else {
      return weightedVector;
    }
  }

  public HMapStFW createEDocVector(String sentence) {
    return createEDocVector(sentence, new HMapStIW());
  }

  public HMapStFW createEDocVector(String sentence, HMapStIW term2Tf) {
    HMapStFW weightedVector = new HMapStFW();
    String[] terms = eTok.processContent(sentence);

    for(String term : terms){
      term2Tf.increment(term);     
    }

    weightedVector = CLIRUtils.createTermDocVector(terms.length, term2Tf, eScoreFn, dict, dfTable, true, sLogger);
    // don't count numbers for the min #terms constraint since Wikipedia has "sentences" full of numbers that doesn't make any sense
    int numNonNumbers = 0;
    for(String term : weightedVector.keySet()){      
      if (!term.matches("\\d+")) {
        numNonNumbers++;
      }
    }
    if(numNonNumbers < MinVectorTerms){
      return null;
    }else {
      return weightedVector;
    }
  }

  public ArrayListWritable<Text> getESentences(String text, ArrayListWritable<HMapStFW> vectors, ArrayListOfIntsWritable sentLengths) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException {
    ArrayListWritable<Text> sentences = new ArrayListWritable<Text>();
    String[] lines = text.split("\n");

    for(String line : lines){
      if(!line.matches("\\s+") && !line.isEmpty()){
        String[] sents = eModel.sentDetect(line);

        for(String sent : sents){
          if(sent.contains("date:")||sent.contains("jpg")||sent.contains("png")||sent.contains("gif")||sent.contains("fontsize:")||sent.contains("category:")){
            continue;
          }
          int length = eTok.getNumberTokens(sent);
          if(length >= MinSentenceLength){
            HMapStFW vector = createEDocVector(sent.toString());
            if(vector != null){
              vectors.add(vector);
              sentences.add(new Text(sent));
              if (sentLengths != null) sentLengths.add(length);
            }
          }
        }
      }
    }
    return sentences;
  }

  public ArrayListWritable<Text> getFSentences(String text, ArrayListWritable<HMapStFW> vectors, ArrayListOfIntsWritable sentLengths) throws ClassNotFoundException, InstantiationException, IllegalAccessException, IOException {
    //        sLogger.setLevel(Level.DEBUG);
    sLogger.debug("text length="+text.length());

    ArrayListWritable<Text> sentences = new ArrayListWritable<Text>();
    String[] lines = text.split("\n");
    sLogger.debug("num lines="+lines.length);

    for(String line : lines){
      // convert '。' to standard period character '.' for sentence detector to work
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < line.length(); i++) {
        char c = line.charAt(i);
        String unicode = String.format("%04x", (int) c);
        if (unicode.equals("3002")) {
          sb.append(". ");
        }else {
          sb.append(c);
        }
      }
      line = sb.toString();
      sLogger.debug("line="+line);
      if (!line.matches("\\s+") && !line.isEmpty()) {
        String[] sents = fModel.sentDetect(line);

        for (String sent : sents) {
          sLogger.debug("sent="+sent);
          // discard some of the non-text content in Wikipedia
          if (sent.contains("datei:") || sent.contains("jpg") || sent.contains("png") || sent.contains("fontsize:") || sent.contains("kategorie:")) {
            continue;
          }
          int length = fTok.getNumberTokens(sent);
          if (length >= MinSentenceLength) {
            HMapStFW vector = createFDocVector(sent);
            if (vector != null) {
              vectors.add(vector);
              sentences.add(new Text(sent));
              sLogger.debug("added="+vector);

              if (sentLengths != null) sentLengths.add(length);
            }
          }
        }
      }
    }

    sLogger.setLevel(Level.INFO);

    return sentences;
  }

  private String getSentDetectorFile(String lang) {
    return lang+"-sent.bin";
  }

  private String getClassifierFile() {
    return "classifier-";
  }

  private String getTTable(String srcLang, String trgLang) {
    return "ttable." + srcLang + "-" + trgLang;
  }

  private String getTrgVocab(String srcLang, String trgLang) {
    return "vocab." + srcLang + "-" + trgLang + "." + trgLang;
  }

  private String getSrcVocab(String srcLang, String trgLang) {
    return "vocab." + srcLang + "-" + trgLang + "." + srcLang;
  }

  public MaxentModel getClassifier() {
    return classifier;
  }

  public Tokenizer getETokenizer() {
    return eTok;
  }

  public Tokenizer getFTokenizer() {
    return fTok;
  }

  public TTable_monolithic_IFAs getE2F() {
    return e2f_Probs;
  }

  public TTable_monolithic_IFAs getF2E() {
    return f2e_Probs;
  }

  public Vocab getFSrc() {
    return fVocabSrc;
  }

  public Vocab getETrg() {
    return eVocabTrg;
  }

  public Vocab getESrc() {
    return eVocabSrc;
  }

  public Vocab getFTrg() {
    return fVocabTrg;
  }

  public SentenceDetectorME getFSentenceModel() {
    return fModel;
  }

  public SentenceDetectorME getESentenceModel() {
    return eModel;
  }

  /**
   * Load from local FS instead of HDFS
   */
  private void loadFModels(Configuration conf) throws Exception {
    sLogger.info("Loading models for " + fLang + " ...");
    //    FileSystem fs = FileSystem.get(conf);
    FileSystem localFs = FileSystem.getLocal(conf);

    InputStream modelIn = localFs.open(new Path(conf.get("eSentDetectorFile")));
    SentenceModel model = new SentenceModel(modelIn);
    fModel = new SentenceDetectorME(model);
    sLogger.info("Sentence model created successfully.");

    eVocabSrc = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("eVocabSrcFile")), localFs);
    eVocabTrg = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("eVocabTrgFile")), localFs);
    fVocabSrc = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("fVocabSrcFile")), localFs);
    fVocabTrg = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get("fVocabTrgFile")), localFs);         
    f2e_Probs = new TTable_monolithic_IFAs(localFs, new Path(conf.get("f2e_ttableFile")), true);
    e2f_Probs = new TTable_monolithic_IFAs(localFs, new Path(conf.get("e2f_ttableFile")), true);

    // tokenizer file not read from cache, since it might be a directory (e.g. Chinese segmenter)
    String tokenizerFile = conf.get("fTokenizer");
    fTok = TokenizerFactory.createTokenizer(localFs, fLang, tokenizerFile, true, conf.get("fStopword"), null, null);
    sLogger.info("Tokenizer and vocabs created successfully.");

    // average sentence length = just a heuristic derived from sample text
    fScoreFn = (ScoringModel) new Bm25();
    fScoreFn.setAvgDocLength(lang2AvgSentLen.get(fLang));         

    // we use df table of English side, so we should read collection doc count from English dir
    RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, localFs);
    fScoreFn.setDocCount(eEnv.readCollectionDocumentCount());   

    classifier = new MoreGenericModelReader(new Path(conf.get("modelFileName")), localFs).constructModel();
  }

  private void loadEModels(Configuration conf) throws Exception {
    sLogger.info("Loading models for " + eLang + " ...");

    FileSystem localFs = FileSystem.getLocal(conf);
    InputStream modelIn = localFs.open(new Path(conf.get("fSentDetectorFile")));
    SentenceModel model = new SentenceModel(modelIn);
    eModel = new SentenceDetectorME(model);
    sLogger.info("Sentence model created successfully.");

    //    FileSystem fs = FileSystem.get(conf);   
    RetrievalEnvironment env = new RetrievalEnvironment(eDir, localFs);
    sLogger.info("Environment created successfully.");

    String tokenizerFile = conf.get("eTokenizer");
    eTok = TokenizerFactory.createTokenizer(localFs, eLang, tokenizerFile, true, conf.get("eStopword"), null, null);
    sLogger.info("Tokenizer and vocabs created successfully.");

    eScoreFn = (ScoringModel) new Bm25();
    eScoreFn.setAvgDocLength(lang2AvgSentLen.get(eLang));        //average sentence length = heuristic based on De-En data
    eScoreFn.setDocCount(env.readCollectionDocumentCount());

    dict = new DefaultFrequencySortedDictionary(new Path(env.getIndexTermsData()), new Path(env.getIndexTermIdsData()), new Path(env.getIndexTermIdMappingData()), localFs);
    dfTable = new DfTableArray(new Path(env.getDfByTermData()), localFs);
  }

  public float getFOOVRate(String fSent) {
    return fTok.getOOVRate(fSent, fVocabSrc);
  }

  public float getEOOVRate(String eSent) {
    return eTok.getOOVRate(eSent, eVocabSrc);
  }
}