package ivory.lsh.bitext; import ivory.core.util.CLIRUtils; import ivory.lsh.data.WikiSentenceInfo; import ivory.lsh.driver.PwsimEnvironment; import java.io.IOException; import java.util.Iterator; import opennlp.model.RealValueFileEventStream; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.log4j.Logger; import tl.lin.data.array.ArrayListOfIntsWritable; import tl.lin.data.array.ArrayListWritable; import tl.lin.data.map.HMapIV; import tl.lin.data.map.HMapStFW; import tl.lin.data.pair.PairOfInts; /** Step 1 of the bitext extraction algorithm. * @author ferhanture * */ @SuppressWarnings("deprecation") public class FindParallelSentencePairs extends Configured implements Tool { private static final Logger sLogger = Logger.getLogger(FindParallelSentencePairs.class); enum Docs{ pairsE, pairsF, pairs, pairsIncompleteF, pairsIncompleteE, dbg } enum Sentences{ E, F, pairsE, pairsF, pairsProcessed, pairsCandidate, pairsFilteredByVectorSize, pairsFilteredBySentRatio, parallel } //AssertTrue //pairsCandidate=sum(pairsProcessed, pairsFilteredBySentRatio) //SanityCheck //pairsCandidate/Docs.pairsF = number of sentence pairs per doc pair public FindParallelSentencePairs() { } private static void printUsage() { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp( "FindParallelSentencePairs", options ); System.exit(-1); } /** * Candidate generation * * Map: (docno, wikiPage) --> (<fDocno, fSentID, eDocno, eSentID>, <lang id,vector,sentence>) * input is union of source and target collections * sentences = extract sentences in wikiPage * vectors = convert sentence text into td-idf vector * similar_pairs = from pwsim output, find if there's any pair corresponding to docno * foreach similar_pair * emit(similar_pair, <lang id,docno,vectors,sentences>) * * @author ferhanture */ private static class MyMapper extends MapReduceBase implements Mapper<PairOfInts, WikiSentenceInfo, PairOfInts, WikiSentenceInfo> { private HMapIV<ArrayListOfIntsWritable> pwsimMapping; // mapping for pwsim pairs private PairOfInts keyOut; private JobConf mJob; private ArrayListOfIntsWritable similarDocnos; public void configure(JobConf job) { // sLogger.setLevel(Level.DEBUG); mJob = job; pwsimMapping = new HMapIV<ArrayListOfIntsWritable>(); keyOut = new PairOfInts(); } private static String getFilename(String s) { return s.substring(s.lastIndexOf("/") + 1); } /** * if lang id points to foreign language, then load pwsim algorithm's output as mapping: {foreign docno N --> list<english docnos> associated with N} * otherwise, mapping is like: {english docno N --> list<foreign docnos> associated with N} * * lang id is the same for every Map call of a given mapper, since input sequence files will be uniform in terms of language * (i.e., a mapper will receive either all foreign or all english documents) * * @param pwsimMapping * mapping from source (target) docno to list of target (source) docnos associated with it * @param lang * language identifier * @param job * job configuration object * @param reporter * reporter object for counters */ private static void loadPairs(HMapIV<ArrayListOfIntsWritable> pwsimMapping, int langID, JobConf job, Reporter reporter){ try { Path[] localFiles = DistributedCache.getLocalCacheFiles(job); String pwsimFile = job.get("PwsimPairs"); for (Path localFile : localFiles) { if (localFile.toString().contains(getFilename(pwsimFile))) { SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(job), localFile, job); PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance(); IntWritable value = (IntWritable) reader.getValueClass().newInstance(); int cnt = 0; while (reader.next(key, value)) { int fDocno = key.getRightElement(); // fDocno -= 1000000000; int eDocno = key.getLeftElement(); if(langID == CLIRUtils.E){ if(!pwsimMapping.containsKey(eDocno)){ pwsimMapping.put(eDocno, new ArrayListOfIntsWritable()); } pwsimMapping.get(eDocno).add(fDocno); // we add 1000000000 to foreign docnos to distinguish them during pwsim algo }else{ if(!pwsimMapping.containsKey(fDocno)){ pwsimMapping.put(fDocno, new ArrayListOfIntsWritable()); } pwsimMapping.get(fDocno).add(eDocno); // we add 1000000000 to foreign docnos to distinguish them during pwsim algo } cnt++; key = (PairOfInts) reader.getKeyClass().newInstance(); value = (IntWritable) reader.getValueClass().newInstance(); } reader.close(); sLogger.info(pwsimMapping.size() + "," + cnt + " pairs loaded from " + localFile); } } } catch (Exception e) { throw new RuntimeException(e); } } public void map(PairOfInts sentenceId, WikiSentenceInfo sentenceInfo, OutputCollector<PairOfInts, WikiSentenceInfo> output, Reporter reporter) throws IOException { int docno = sentenceId.getLeftElement(); int langID = sentenceInfo.getLangID(); // we only load the mapping once, during the first map() call of a mapper. // this works b/c all input kv pairs of a given mapper will have same lang id (reason explained above) if (pwsimMapping.isEmpty()) { loadPairs(pwsimMapping, langID, mJob, reporter); sLogger.info("Mapping loaded: " + pwsimMapping.size()); } if (langID == CLIRUtils.F) { docno += 1000000000; } // if no similar docs for docno, return if (pwsimMapping.containsKey(docno)) { similarDocnos = pwsimMapping.get(docno); }else{ return; } if (langID == CLIRUtils.E) { reporter.incrCounter(Sentences.E, 1); reporter.incrCounter(Sentences.pairsE, similarDocnos.size()); }else { reporter.incrCounter(Sentences.F, 1); reporter.incrCounter(Sentences.pairsF, similarDocnos.size()); } for (int similarDocno : similarDocnos) { if (langID == CLIRUtils.E) { keyOut.set(similarDocno, docno); }else { keyOut.set(docno, similarDocno); } output.collect(keyOut, sentenceInfo); } } } /** * Bilingual sentence pair detection with simple classifier * * Reduce: (<fDocno, eDocno>, [ <E,eDocno,eVectors,eSentences>, <F,fDocno,fVectors,fSentences>]) --> (fSentence, eSentence) * * @author ferhanture * */ private static class MyReducer extends MapReduceBase implements Reducer<PairOfInts, WikiSentenceInfo, Text, Text>{ private int fDocno, eDocno; private int classifierPositiveId; private ArrayListWritable<HMapStFW> fVectors, eVectors; private ArrayListWritable<Text> fSentences, eSentences; private PreprocessHelper helper; // for modularity, helper provides methods to preprocess data private float classifierThreshold; private Text emptyValue = new Text(); public void configure(JobConf job) { // sLogger.setLevel(Level.DEBUG); try { helper = new PreprocessHelper(CLIRUtils.MinVectorTerms, CLIRUtils.MinSentenceLength, job); } catch (Exception e) { e.printStackTrace(); } classifierPositiveId = job.getInt("ClassifierId", -1); if(classifierPositiveId != 0 && classifierPositiveId != 1){ throw new RuntimeException("Id of parallel label in MaxEnt classifier not specified properly: "+classifierPositiveId); } classifierThreshold = job.getFloat("ClassifierThreshold", 2); if (classifierThreshold > 1f) { throw new RuntimeException("Classifier confidence threshold > 1, provide value in [0,1]: "+classifierThreshold); } eVectors = new ArrayListWritable<HMapStFW>(); fVectors = new ArrayListWritable<HMapStFW>(); eSentences = new ArrayListWritable<Text>(); fSentences = new ArrayListWritable<Text>(); } public void reduce(PairOfInts docnoPair, Iterator<WikiSentenceInfo> wikiSentences, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { eVectors.clear(); fVectors.clear(); eSentences.clear(); fSentences.clear(); fDocno = docnoPair.getLeftElement(); eDocno = docnoPair.getRightElement(); // parse WikiDocInfo object into sentences and vectors, based on the language id WikiSentenceInfo sentenceInfo; int eCnt = 0, fCnt = 0; while (wikiSentences.hasNext()) { sentenceInfo = wikiSentences.next(); if(sentenceInfo.getLangID() == CLIRUtils.F){ fCnt++; fVectors.add(sentenceInfo.getVector()); fSentences.add(sentenceInfo.getSentence()); reporter.incrCounter(Sentences.F, 1); }else if(sentenceInfo.getLangID() == CLIRUtils.E){ eCnt++; eVectors.add(sentenceInfo.getVector()); eSentences.add(sentenceInfo.getSentence()); reporter.incrCounter(Sentences.E, 1); }else { throw new RuntimeException("Unknown language ID -- should not happen!"); } } /** * some sentences in docs are removed in previous step (i.e., Docs2Sentences) due to length etc. * if all of the sentences in a document are removed, then it will not show up here * therefore the pair will be "incomplete". we simply ignore these pairs for bitext extraction. */ if((eCnt == 0 || fCnt == 0)){ sLogger.debug("Read "+eCnt+","+fCnt+" sentences: ="+eDocno+","+fDocno); if(eCnt == 0){ reporter.incrCounter(Docs.pairsIncompleteE, 1); }else{ reporter.incrCounter(Docs.pairsIncompleteF, 1); } return; } // counters for debug purposes only reporter.incrCounter(Docs.pairs, 1); reporter.incrCounter(Sentences.pairsCandidate, fVectors.size() * eVectors.size()); int numProcessed = 0; long time = 0; sLogger.debug(fSentences.size()+","+eSentences.size()); // classify each e-f sentence pair in the candidate set for (int f = 0; f < fVectors.size(); f++) { HMapStFW fVector = fVectors.get(f); int fSentLength = fSentences.get(f).getLength(); for (int e = 0; e < eVectors.size(); e++) { HMapStFW eVector = eVectors.get(e); int eSentLength = eSentences.get(e).getLength(); if (eSentLength > 2 * fSentLength || fSentLength > 2 * eSentLength) { // sLogger.debug("length filter"); reporter.incrCounter(Sentences.pairsFilteredBySentRatio, 1); continue; } reporter.incrCounter(Sentences.pairsProcessed, 1); numProcessed++; // compute features long start = System.currentTimeMillis(); String[] instance = CLIRUtils.computeFeaturesF1(eVector, fVector, eSentLength, fSentLength); time += (System.currentTimeMillis()-start); // classify w/ maxent model // emit if labeled parallel if (instance == null) { throw new RuntimeException("SHOULD NOT HAPPEN!"); } // apply MaxEnt classifier to instance float[] values = RealValueFileEventStream.parseContexts(instance); double[] probs = helper.getClassifier().eval(instance, values); // check if confidence above specified threshold double confidence = probs[classifierPositiveId]; if (confidence > classifierThreshold) { reporter.incrCounter(Sentences.parallel, 1); output.collect(new Text(fSentences.get(f) + CLIRUtils.BitextSeparator + eSentences.get(e)), emptyValue); } } } } } /** * Runs this tool. */ public int run(String[] args) throws Exception { JobConf conf = new JobConf(getConf(), FindParallelSentencePairs.class); // Read commandline argument conf = setupConf(conf, args); if (conf == null) { printUsage(); return -1; } conf.setInt("mapred.task.timeout", 60000000); conf.set("mapreduce.map.memory.mb", "4096"); conf.set("mapreduce.map.java.opts", "-Xmx4096m"); conf.set("mapreduce.reduce.memory.mb", "8192"); conf.set("mapreduce.reduce.java.opts", "-Xmx8192m"); conf.setBoolean("mapred.map.tasks.speculative.execution", false); conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setNumMapTasks(100); conf.setNumReduceTasks(100); conf.setInt("mapred.min.split.size", 2000000000); conf.setFloat("mapred.reduce.slowstart.completed.maps", 0.9f); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(PairOfInts.class); conf.setMapOutputValueClass(WikiSentenceInfo.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); long startTime = System.currentTimeMillis(); JobClient.runJob(conf); sLogger.info("Job finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; } private static final String FCOLLECTION_OPTION = "f_collection"; private static final String ECOLLECTION_OPTION = "e_collection"; private static final String FLANG_OPTION = "f_lang"; private static final String ELANG_OPTION = "e_lang"; private static final String FINDEX_OPTION = "f_index"; private static final String EINDEX_OPTION = "e_index"; private static final String BITEXTNAME_OPTION = "name"; private static final String SENTENCES_OPTION = "sentences"; private static final String BITEXT_OPTION = "bitext"; private static final String DATADIR_OPTION = "data"; private static final String PWSIM_OPTION = "pwsim_output"; private static final String CLASSIFIERID_OPTION = "classifier_id"; private static final String CLASSIFIERTHRESHOLD_OPTION = "threshold"; private static final String LIBJARS_OPTION = "libjars"; private static Options options; @SuppressWarnings("static-access") protected JobConf setupConf(JobConf conf, String[] args) throws Exception { options = new Options(); options.addOption(OptionBuilder.withDescription("source-side raw collection path").withArgName("path").hasArg().isRequired().create(FCOLLECTION_OPTION)); options.addOption(OptionBuilder.withDescription("target-side raw collection path").withArgName("path").hasArg().isRequired().create(ECOLLECTION_OPTION)); options.addOption(OptionBuilder.withDescription("two-letter code for f-language").withArgName("en|de|tr|cs|zh|ar|es").hasArg().isRequired().create(FLANG_OPTION)); options.addOption(OptionBuilder.withDescription("two-letter code for e-language").withArgName("en|de|tr|cs|zh|ar|es").hasArg().isRequired().create(ELANG_OPTION)); options.addOption(OptionBuilder.withDescription("source-side index path").withArgName("path").hasArg().isRequired().create(FINDEX_OPTION)); options.addOption(OptionBuilder.withDescription("target-side index path").withArgName("path").hasArg().isRequired().create(EINDEX_OPTION)); options.addOption(OptionBuilder.withDescription("name of bitext").withArgName("string").hasArg().create(BITEXTNAME_OPTION)); options.addOption(OptionBuilder.withDescription("path to data files on HDFS").withArgName("path").hasArg().isRequired().create(DATADIR_OPTION)); options.addOption(OptionBuilder.withDescription("path to output of pwsim algorithm").withArgName("path").hasArg().isRequired().create(PWSIM_OPTION)); options.addOption(OptionBuilder.withDescription("classifier id to retrieve P('PARALLEL'|instance)").withArgName("0 or 1").hasArg().isRequired().create(CLASSIFIERID_OPTION)); options.addOption(OptionBuilder.withDescription("target vocabulary (e-side) of P(e|f)").withArgName("0-1").hasArg().isRequired().create(CLASSIFIERTHRESHOLD_OPTION)); options.addOption(OptionBuilder.withDescription("path to collection sentences").withArgName("path").hasArg().isRequired().create(SENTENCES_OPTION)); options.addOption(OptionBuilder.withDescription("path to output bitext").withArgName("path").hasArg().isRequired().create(BITEXT_OPTION)); options.addOption(OptionBuilder.withDescription("Hadoop option to load external jars").withArgName("jar packages").hasArg().create(LIBJARS_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return null; } String pwsimPairsPath = cmdline.getOptionValue(PWSIM_OPTION); String eDir = cmdline.getOptionValue(EINDEX_OPTION); String fDir = cmdline.getOptionValue(FINDEX_OPTION); String dataDir = cmdline.getOptionValue(DATADIR_OPTION); String eLang = cmdline.getOptionValue(ELANG_OPTION); String fLang = cmdline.getOptionValue(FLANG_OPTION); String bitextName = cmdline.hasOption(BITEXTNAME_OPTION) ? cmdline.getOptionValue(BITEXTNAME_OPTION) : ""; float classifierThreshold = Float.parseFloat(cmdline.getOptionValue(CLASSIFIERTHRESHOLD_OPTION)); int classifierId = Integer.parseInt(cmdline.getOptionValue(CLASSIFIERID_OPTION)); String sentsPath = cmdline.getOptionValue(SENTENCES_OPTION); String outputPath = cmdline.getOptionValue(BITEXT_OPTION); if (!FileSystem.get(conf).exists(new Path(sentsPath))) { System.err.println("Input sentences does not exist at: " + sentsPath + ". Exiting..."); return null; } FileInputFormat.addInputPaths(conf, sentsPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setJobName(this.getClass().getCanonicalName() + "_" + fLang +"-" + eLang +"_F1="+classifierThreshold+"["+classifierId+"]"); try { conf = PwsimEnvironment.setBitextPaths(conf, dataDir, eLang, fLang, bitextName, eDir, fDir, classifierThreshold, classifierId, pwsimPairsPath, "simple"); } catch (Exception e) { e.printStackTrace(); System.err.println("Error configuring paths: " + e.getMessage()); return null; } sLogger.info("Running job " + conf.getJobName()); sLogger.info("Pwsim output path: " + pwsimPairsPath); sLogger.info("Sentences path: " + sentsPath); sLogger.info("Output path: " + outputPath); return conf; } /** * Dispatches command-line arguments to the tool via the * <code>ToolRunner</code>. */ public static void main(String[] args) throws Exception { int res = ToolRunner.run(new FindParallelSentencePairs(), args); System.exit(res); } }