package ivory.lsh.bitext;

import ivory.core.RetrievalEnvironment;
import ivory.core.util.CLIRUtils;
import ivory.lsh.data.WikiDocInfo;

import java.io.IOException;
import java.net.URI;
import java.util.Iterator;

import opennlp.model.RealValueFileEventStream;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import tl.lin.data.array.ArrayListOfIntsWritable;
import tl.lin.data.array.ArrayListWritable;
import tl.lin.data.map.HMapIV;
import tl.lin.data.map.HMapStFW;
import tl.lin.data.pair.PairOfInts;

import edu.umd.cloud9.collection.Indexable;
import edu.umd.cloud9.collection.wikipedia.WikipediaPage;

/**
 * @author ferhanture
 */
@SuppressWarnings("deprecation")
public class FindParallelSentencePairsOld extends Configured implements Tool {
  private static final Logger sLogger = Logger.getLogger(FindParallelSentencePairsOld.class);

  private static final int MinVectorTerms = 3, MinSentenceLength = 5, E = -1, F = 1;

  enum Docs {
    pairsE, pairsF, pairs, pairsIncompleteF, pairsIncompleteE
  }

  enum Sentences {
    pairsE, pairsF, pairsProcessed, pairsCandidate, pairsFilteredByVectorSize,
    pairsFilteredBySentRatio, parallel
  }

  // AssertTrue: pairsCandidate = sum(pairsProcessed, pairsFilteredBySentRatio)
  // SanityCheck: pairsCandidate / Docs.pairsF = number of sentence pairs per doc pair

  public FindParallelSentencePairsOld() {
  }

  private static int printUsage() {
    sLogger.info("usage: [cl-pwsim-output-path] [output-path] [e-path] [f-path] [e-dir] [f-dir] "
        + "[vocab-dir] [e-lang] [f-lang] [classifier] [threshold] [classifier parallel-label id]");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  /**
   * Candidate generation.
   *
   * Map: (docno, wikiPage) --> (<fDocno, eDocno>, <lang id, docno, vectors, sentences>)
   *
   * The input is the union of the source and target collections.
   *   sentences = sentences extracted from wikiPage
   *   vectors = each sentence's text converted into a tf-idf vector
   *   similar_pairs = pairs in the pwsim output that involve docno
   *   foreach similar_pair:
   *     emit(similar_pair, <lang id, docno, vectors, sentences>)
   *
   * @author ferhanture
   */
  private static class MyMapper extends MapReduceBase implements
      Mapper<Writable, Indexable, PairOfInts, WikiDocInfo> {

    private HMapIV<ArrayListOfIntsWritable> pwsimMapping; // mapping for pwsim pairs
    private PairOfInts keyOut;
    private JobConf mJob;
    private WikiDocInfo valOut;
    private PreprocessHelper helper; // for modularity, helper provides methods to preprocess data

    public void configure(JobConf job) {
      sLogger.setLevel(Level.INFO);
      mJob = job;
      pwsimMapping = new HMapIV<ArrayListOfIntsWritable>();
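      // PreprocessHelper (constructed below) presumably loads the per-language resources shipped
      // via the DistributedCache in run(): sentence detectors, tokenizers, vocabularies, ttables,
      // and df data. If construction fails, the exception is only printed and helper stays null,
      // so map() would fail later with an NPE.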
      try {
        helper = new PreprocessHelper(MinVectorTerms, MinSentenceLength, job);
      } catch (Exception e) {
        e.printStackTrace();
      }
      keyOut = new PairOfInts();
      valOut = new WikiDocInfo();
    }

    /**
     * If the lang id points to the foreign language, load the pwsim algorithm's output as the
     * mapping {foreign docno N --> list of English docnos associated with N}; otherwise the
     * mapping is {English docno N --> list of foreign docnos associated with N}.
     *
     * The lang id is the same for every map() call of a given mapper, since the input sequence
     * files are uniform in terms of language (i.e., a mapper receives either all foreign or all
     * English documents).
     *
     * @param pwsimMapping
     *          mapping from source (target) docno to list of target (source) docnos associated with it
     * @param lang
     *          language identifier
     * @param job
     *          job configuration object
     * @param reporter
     *          reporter object for counters
     */
    private static void loadPairs(HMapIV<ArrayListOfIntsWritable> pwsimMapping, String lang,
        JobConf job, Reporter reporter) {
      try {
        Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
        // the pwsim pairs file is the 15th entry (index 14) in the cache list set up in run()
        SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(job),
            localFiles[14], job);
        PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance();
        IntWritable value = (IntWritable) reader.getValueClass().newInstance();
        while (reader.next(key, value)) {
          // pwsim adds 1000000000 to foreign docnos to distinguish them; strip that offset here
          int fDocno = key.getRightElement();
          fDocno -= 1000000000;
          int eDocno = key.getLeftElement();
          if (lang.equals("en")) {
            if (!pwsimMapping.containsKey(eDocno)) {
              pwsimMapping.put(eDocno, new ArrayListOfIntsWritable());
            }
            pwsimMapping.get(eDocno).add(fDocno);
          } else {
            if (!pwsimMapping.containsKey(fDocno)) {
              pwsimMapping.put(fDocno, new ArrayListOfIntsWritable());
            }
            pwsimMapping.get(fDocno).add(eDocno);
          }
          key = (PairOfInts) reader.getKeyClass().newInstance();
          value = (IntWritable) reader.getValueClass().newInstance();
        }
        reader.close();
        sLogger.info(pwsimMapping.size() + " pairs loaded.");
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }

    public void map(Writable docnoKey, Indexable page,
        OutputCollector<PairOfInts, WikiDocInfo> output, Reporter reporter) throws IOException {
      int docno = ((IntWritable) docnoKey).get();
      WikipediaPage p = (WikipediaPage) page;
      String lang = p.getLanguage();
      ArrayListOfIntsWritable similarDocnos;

      // we only load the mapping once, during the first map() call of a mapper.
      // this works b/c all input kv pairs of a given mapper have the same lang id (reason explained above)
      if (pwsimMapping.isEmpty()) {
        loadPairs(pwsimMapping, lang, mJob, reporter);
        sLogger.debug(pwsimMapping.size());
      }

      // if there are no similar docs for docno, return
      if (pwsimMapping.containsKey(docno)) {
        similarDocnos = pwsimMapping.get(docno);
      } else {
        return;
      }

      ArrayListWritable<Text> sentences;
      ArrayListWritable<HMapStFW> vectors = new ArrayListWritable<HMapStFW>();
      ArrayListOfIntsWritable sentLengths = new ArrayListOfIntsWritable();
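      // sentences, vectors, and sentLengths are filled in parallel by the helper: the raw text of
      // each kept sentence, its tf-idf vector, and its length. Only sentences and vectors are
      // forwarded to the reducer below; sentLengths is not used further in this mapper.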
      try {
        if (lang.equals("en")) {
          // identify sentences in the document, filtering out ones below the MinSentenceLength
          // threshold; convert each sentence into a tf-idf vector, using the collection's DF map
          // and a heuristic for avg. doc length; filter out sentences whose vector has fewer than
          // MinVectorTerms terms
          sentences = helper.getESentences(p.getContent(), vectors, sentLengths);
        } else {
          sentences = helper.getFSentences(p.getContent(), vectors, sentLengths);
        }
        if (sentences.size() != vectors.size()) {
          throw new RuntimeException("Sentences.size != Vectors.size");
        }
      } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
      }

      for (int similarDocno : similarDocnos) {
        if (lang.equals("en")) {
          keyOut.set(similarDocno, docno);
          valOut.set(E, vectors, sentences);
          reporter.incrCounter(Docs.pairsE, 1);
          reporter.incrCounter(Sentences.pairsE, vectors.size());
        } else {
          keyOut.set(docno, similarDocno);
          valOut.set(F, vectors, sentences);
          reporter.incrCounter(Docs.pairsF, 1);
          reporter.incrCounter(Sentences.pairsF, vectors.size());
        }
        output.collect(keyOut, valOut);
      }
    }
  }

  /**
   * Bilingual sentence pair detection.
   *
   * Reduce: (<fDocno, eDocno>, [<E, eDocno, eVectors, eSentences>, <F, fDocno, fVectors, fSentences>])
   *         --> (fSentence, eSentence)
   *
   * @author ferhanture
   */
  private static class MyReducer extends MapReduceBase implements
      Reducer<PairOfInts, WikiDocInfo, Text, Text> {

    private int fDocno, eDocno;
    private int classifierPositiveId;
    private ArrayListWritable<HMapStFW> fVectors, eVectors;
    private ArrayListWritable<Text> fSentences, eSentences;
    private PreprocessHelper helper; // for modularity, helper provides methods to preprocess data
    private float classifierThreshold;
    private Text emptyValue = new Text();

    public void configure(JobConf job) {
      sLogger.setLevel(Level.INFO);
      try {
        helper = new PreprocessHelper(MinVectorTerms, MinSentenceLength, job);
      } catch (Exception e) {
        e.printStackTrace();
      }
      classifierPositiveId = job.getInt("ClassifierId", -1);
      if (classifierPositiveId != 0 && classifierPositiveId != 1) {
        throw new RuntimeException("Id of parallel label in MaxEnt classifier not specified properly: "
            + classifierPositiveId);
      }
      classifierThreshold = job.getFloat("ClassifierThreshold", 2);
      if (classifierThreshold > 1f) {
        throw new RuntimeException("Classifier confidence threshold > 1, provide value in [0,1]: "
            + classifierThreshold);
      }
      eVectors = new ArrayListWritable<HMapStFW>();
      fVectors = new ArrayListWritable<HMapStFW>();
      eSentences = new ArrayListWritable<Text>();
      fSentences = new ArrayListWritable<Text>();
    }

    public void reduce(PairOfInts docnoPair, Iterator<WikiDocInfo> wikiTexts,
        OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
      eVectors.clear();
      fVectors.clear();
      eSentences.clear();
      fSentences.clear();

      fDocno = docnoPair.getLeftElement();
      eDocno = docnoPair.getRightElement();

      // parse each WikiDocInfo object into sentences and vectors, based on its language id
      WikiDocInfo page;
      int eCnt = 0, fCnt = 0;
      while (wikiTexts.hasNext() && (eCnt < 1 || fCnt < 1)) {
        page = wikiTexts.next();
        if (page.getLangID() == F && fVectors.isEmpty()) {
          fCnt++;
          fVectors = page.getVectors();
          fSentences = page.getSentences();
          reporter.incrCounter(Sentences.pairsF, fVectors.size());
        } else if (page.getLangID() == E && eVectors.isEmpty()) {
          eCnt++;
          eVectors = page.getVectors();
          eSentences = page.getSentences();
          reporter.incrCounter(Sentences.pairsE, eVectors.size());
        }
      }
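      // at this point we expect to have seen exactly one English (E) and one foreign (F)
      // WikiDocInfo for this docno pair; eCnt/fCnt record what actually arrived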
      /*
       * @TODO look into the exact cause of this...
       *
       * If the input collection has differences from the pwsim output, we may not find the actual
       * wiki page corresponding to a similar pair of docnos.
       */
      if (eCnt < 1 || fCnt < 1) {
        sLogger.debug("Read " + eCnt + "," + fCnt + " pages: =" + eDocno + "," + fDocno);
        if (fVectors.isEmpty()) {
          reporter.incrCounter(Docs.pairsIncompleteF, 1);
        } else {
          reporter.incrCounter(Docs.pairsIncompleteE, 1);
        }
        return;
      }
      reporter.incrCounter(Docs.pairs, 1);

      // if either document has no vectors, no need to continue
      if (fVectors.size() == 0 || eVectors.size() == 0) {
        return;
      }

      // counters for debug purposes only
      reporter.incrCounter(Sentences.pairsCandidate, fVectors.size() * eVectors.size());
      int numProcessed = 0;
      long time = 0;

      // classify each e-f sentence pair in the candidate set
      for (int f = 0; f < fVectors.size(); f++) {
        HMapStFW fVector = fVectors.get(f);
        int fSentLength = fSentences.get(f).getLength();
        for (int e = 0; e < eVectors.size(); e++) {
          HMapStFW eVector = eVectors.get(e);
          int eSentLength = eSentences.get(e).getLength();

          // skip pairs whose sentence lengths differ by more than a factor of two
          if (eSentLength > 2 * fSentLength || fSentLength > 2 * eSentLength) {
            reporter.incrCounter(Sentences.pairsFilteredBySentRatio, 1);
            continue;
          }
          reporter.incrCounter(Sentences.pairsProcessed, 1);
          numProcessed++;

          // compute features
          long start = System.currentTimeMillis();
          String[] instance = CLIRUtils.computeFeaturesF1(eVector, fVector, eSentLength, fSentLength);
          time += (System.currentTimeMillis() - start);

          if (instance == null) {
            throw new RuntimeException("SHOULD NOT HAPPEN!");
          }

          // classify with the MaxEnt model; emit the pair if it is labeled parallel
          float[] values = RealValueFileEventStream.parseContexts(instance);
          double[] probs = helper.getClassifier().eval(instance, values);

          // check whether the confidence is above the specified threshold
          double confidence = probs[classifierPositiveId];
          if (confidence > classifierThreshold) {
            reporter.incrCounter(Sentences.parallel, 1);
            output.collect(new Text(fSentences.get(f) + "<GERMAN2ENGLISH>" + eSentences.get(e)), emptyValue);
          }
        }
      }
      // sLogger.info("Finished processing " + numProcessed + " out of "
      //     + fVectors.size() * eVectors.size() + ", avg process time=" + time / (1f * numProcessed)
      //     + " avg map time=" + (System.currentTimeMillis() - mapStartTime) / (1f * numProcessed));
    }
  }
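  // Note: accepted sentence pairs are joined with the hard-coded "<GERMAN2ENGLISH>" delimiter
  // above, regardless of the e-lang/f-lang arguments passed to run(); downstream consumers
  // presumably split the output lines on that marker.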
  /**
   * Runs this tool.
   */
  public int run(String[] args) throws Exception {
    if (args.length != 12) {
      printUsage();
      return -1;
    }
    JobConf conf = new JobConf(getConf(), FindParallelSentencePairsOld.class);

    // read command-line arguments
    String pwsimPairsPath = args[0];
    String outputPath = args[1];
    String eCollectionPath = args[2];
    String fCollectionPath = args[3];
    String eDir = args[4];
    String fDir = args[5];
    RetrievalEnvironment eEnv = new RetrievalEnvironment(eDir, FileSystem.get(conf));
    String vocabDir = args[6];
    String eLang = args[7];
    String fLang = args[8];
    String classifierFile = args[9];
    float classifierThreshold = Float.parseFloat(args[10]);
    int classifierId = Integer.parseInt(args[11]);

    conf.setJobName("FindParallelSentences_" + fLang + "-" + eLang + "_F1=" + classifierThreshold
        + "[" + classifierId + "]");

    String eSentDetect = vocabDir + "/" + eLang + "-sent.bin";
    String eTokenizer = vocabDir + "/" + eLang + "-token.bin";
    String eVocabSrc = vocabDir + "/vocab." + eLang + "-" + fLang + "." + eLang;
    String eVocabTrg = vocabDir + "/vocab." + fLang + "-" + eLang + "." + eLang;

    String fSentDetect = vocabDir + "/" + fLang + "-sent.bin";
    String fTokenizer = vocabDir + "/" + fLang + "-token.bin";
    String fVocabSrc = vocabDir + "/vocab." + fLang + "-" + eLang + "." + fLang;
    String fVocabTrg = vocabDir + "/vocab." + eLang + "-" + fLang + "." + fLang;

    String f2e_ttableFile = vocabDir + "/ttable." + fLang + "-" + eLang;
    String e2f_ttableFile = vocabDir + "/ttable." + eLang + "-" + fLang;

    int numReducers = 50;

    conf.set("eDir", eDir);
    conf.set("fDir", fDir);
    conf.set("eLang", eLang);
    conf.set("fLang", fLang);
    conf.setInt("NumReducers", numReducers);
    conf.setFloat("ClassifierThreshold", classifierThreshold);
    conf.setInt("ClassifierId", classifierId);

    sLogger.info("caching files...");

    // e-files
    sLogger.info("caching files...0,1,2,3,4");
    DistributedCache.addCacheFile(new URI(eEnv.getDfByTermData()), conf);
    DistributedCache.addCacheFile(new URI(eSentDetect), conf);
    DistributedCache.addCacheFile(new URI(eTokenizer), conf);
    DistributedCache.addCacheFile(new URI(eVocabSrc), conf);
    DistributedCache.addCacheFile(new URI(eVocabTrg), conf);

    // f-files
    sLogger.info("caching files...5,6,7,8,9");
    DistributedCache.addCacheFile(new URI(fDir + "/transDf.dat"), conf);
    DistributedCache.addCacheFile(new URI(fSentDetect), conf);
    DistributedCache.addCacheFile(new URI(fTokenizer), conf);
    DistributedCache.addCacheFile(new URI(fVocabSrc), conf);
    DistributedCache.addCacheFile(new URI(fVocabTrg), conf);

    // cross-lang files
    sLogger.info("caching files...10,11,12,13,14");
    DistributedCache.addCacheFile(new URI(f2e_ttableFile), conf);
    DistributedCache.addCacheFile(new URI(e2f_ttableFile), conf);
    DistributedCache.addCacheFile(new URI(eEnv.getIndexTermsData()), conf);
    DistributedCache.addCacheFile(new URI(classifierFile), conf);
    DistributedCache.addCacheFile(new URI(pwsimPairsPath), conf);
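    // IMPORTANT: MyMapper.loadPairs() reads the pwsim pairs file as localFiles[14], i.e. the
    // 15th cache entry above, and PreprocessHelper presumably locates the other cached files by
    // position as well, so the ordering of the addCacheFile() calls is significant.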
    FileInputFormat.addInputPaths(conf, eCollectionPath);
    FileInputFormat.addInputPaths(conf, fCollectionPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInt("mapred.task.timeout", 60000000);
    conf.set("mapred.child.java.opts", "-Xmx2000m");
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setNumMapTasks(100);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 2000000000);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(WikiDocInfo.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    JobClient.runJob(conf);
    return 0;
  }

  /**
   * Dispatches command-line arguments to the tool via the <code>ToolRunner</code>.
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new FindParallelSentencePairsOld(), args);
    System.exit(res);
  }
}
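// Example invocation (hypothetical paths and values, for illustration only; the twelve arguments
// follow the order given in printUsage()):
//
//   hadoop jar ivory.jar ivory.lsh.bitext.FindParallelSentencePairsOld \
//     /pwsim/similardocs /bitext/de-en /wiki/en.seq /wiki/de.seq \
//     /index/en /index/de /vocab en de /vocab/classifier.de-en 0.98 1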