package com.skp.experiment.cf.als.hadoop;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.iterator.FileLineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.skp.experiment.common.DistinctColumnValuesJob;
import com.skp.experiment.common.HadoopClusterUtil;
import com.skp.experiment.common.OptionParseUtil;
import com.skp.experiment.common.join.ImprovedRepartitionJoinAndFilterJob;
import com.skp.experiment.common.join.JoinOptionUtils;
import com.skp.experiment.common.mapreduce.IdentityMapper;
import com.skp.experiment.common.parameter.DefaultOptionCreator;
import com.skp.experiment.integeration.common.SequentialIdGeneratorJob;

/**
 * Converts the values of the requested columns into integer indexes.
 *
 * @author doyoung
 */
public class Conv2IndexJob extends AbstractJob {

  private static final Logger log = LoggerFactory.getLogger(Conv2IndexJob.class);
  private static final String DELIMETER = ",";

  private boolean oldIndexExist = false;

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new Conv2IndexJob(), args);
  }

  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("columnIndexs", "cidxs", "column indexes to convert into integer indexes");
    addOption("mapOnlyColumnIndexs", "mcidxs",
        "column indexes whose index tables are small enough to load into memory (substituted with a map-only join)", null);
    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }
    //oldIndexExist = checkIndexExist();

    Path distinctPath = getTempPath("distinct");
    String columnIndexs = getOption("columnIndexs");
    List<Integer> cidxs = OptionParseUtil.decode(columnIndexs, JoinOptionUtils.DELIMETER);
    List<Integer> mapOnlyCidxs = new ArrayList<Integer>();
    if (getOption("mapOnlyColumnIndexs") != null) {
      mapOnlyCidxs = OptionParseUtil.decode(getOption("mapOnlyColumnIndexs"), JoinOptionUtils.DELIMETER);
    }

    // step 1. build an integer index over the distinct values of each requested column
    Map<Integer, Long> totalIndexSizes = buildIndex(getInputPath(), columnIndexs, cidxs, getOutputPath());
    log.info("Writing out: {}", pathToIndexSize("new").toString());
    writeIndexSizesIntoHdfs(pathToIndexSize("new"), totalIndexSizes);
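    // The index size file has one line per indexed column in the form
    // "<columnIndex>,<dimension>", where dimension = max id + 1. This is the same
    // format buildIndex() reads back from pathToIndexSize("old") when an old index is reused.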
    // step 2. substitute the column values with their integer indexes
    Path tgtPath = getInputPath();
    for (Integer cidx : cidxs) {
      log.info("Subs: {}:{}", cidx, tgtPath.toString());
      ImprovedRepartitionJoinAndFilterJob joinJob = new ImprovedRepartitionJoinAndFilterJob();
      joinJob.setConf(getConf());
      String[] jobArgs;
      if (mapOnlyCidxs.contains(cidx)) {
        jobArgs = new String[] {
            "-i", tgtPath.toString(),
            "-o", new Path(distinctPath, cidx + "_append").toString(),
            "-sidx", String.valueOf(cidx),
            "-tgt", pathToIndexOutput(cidx, true).toString() + ":" + cidx + ":1:0:sub",
            "--mapOnly", "true"
        };
      } else {
        jobArgs = new String[] {
            "-i", tgtPath.toString(),
            "-o", new Path(distinctPath, cidx + "_append").toString(),
            "-sidx", String.valueOf(cidx),
            "-tgt", pathToIndexOutput(cidx, true).toString() + ":" + cidx + ":1:0:sub"
        };
      }
      joinJob.run(jobArgs);
      tgtPath = new Path(distinctPath, cidx + "_append");
    }
    HadoopClusterUtil.renamePath(getConf(), tgtPath, getOutputPath());
    return 0;
  }

  private boolean checkIndexExist() throws IOException {
    boolean exist = false;
    Path oldIndex = pathToIndex("new");
    Path oldIndexSize = pathToIndexSize("new");
    FileSystem fs = FileSystem.get(getConf());
    if (fs.exists(oldIndex) && fs.exists(oldIndexSize)) {
      log.info("Move {} --> {}", pathToIndex("new").toString(), pathToIndex("old").toString());
      fs.rename(pathToIndex("new"), pathToIndex("old"));
      log.info("Move {} --> {}", pathToIndexSize("new").toString(), pathToIndexSize("old").toString());
      fs.rename(pathToIndexSize("new"), pathToIndexSize("old"));
      exist = true;
    }
    return exist;
  }

  /*
   * case 1: no old index exists
   *   1. build the distinct values into "output/0", "output/1", ...
   *   2. build the index files into "output_index_new/0", "output_index_new/1", ...
   *   3. store each index's dimension (max_id + 1) into indexSizes
   * case 2: an old index exists
   *   1. build the distinct values into "output_all/0", "output_all/1", ...
   *   2. subtract the old index from the current distinct values and store the
   *      difference into "output_all/0_minus", "output_all/1_minus", ...
   *   3. read the old index's size per column
   *   4. generate sequential ids for "output_all/0_minus", "output_all/1_minus", ...,
   *      starting from the old index's dimension, and store them into
   *      "output_all/0_minus_index", "output_all/1_minus_index", ...
   *   5. merge "output_all/0_minus_index" with "old_index_path/0"
   */
  private Map<Integer, Long> buildIndex(Path input, String columnIndexs, List<Integer> cidxs, Path output) throws Exception {
    Map<Integer, Long> totalIndexSizes = new HashMap<Integer, Long>();
    // step 1. get the distinct values of each column in columnIndexs
    // (written to output/0, output/1, ...)
    Path distinctValueParentPath = oldIndexExist ? getTempPath("distinct_all") : getTempPath("distinct");
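    // DistinctColumnValuesJob writes the distinct values of column c under
    // <distinctValueParentPath>/<c>, the layout that pathToDistinctValues() resolves below.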
    ToolRunner.run(new DistinctColumnValuesJob(), new String[] {
        "-i", getInputPath().toString(),
        "-o", distinctValueParentPath.toString(),
        "--columnIndexs", columnIndexs
    });

    for (Integer cidx : cidxs) {
      if (!oldIndexExist) {
        // assign a fresh sequential id to every distinct value of this column
        ToolRunner.run(new SequentialIdGeneratorJob(), new String[] {
            "-i", pathToDistinctValues(distinctValueParentPath, cidx).toString(),
            "-o", pathToIndexOutput(cidx, true).toString(),
            "--tempDir", getTempPath("idgenerate" + cidx).toString(),
            "--cleanUp", "false"
        });
        totalIndexSizes.put(cidx, SequentialIdGeneratorJob.totalIdCount);
      } else {
        // keep only the newcomers, i.e. values that are not yet in the old index
        ToolRunner.run(new ImprovedRepartitionJoinAndFilterJob(), new String[] {
            "-i", pathToDistinctValues(distinctValueParentPath, cidx).toString(),
            "-o", pathToDistinctValues(distinctValueParentPath, cidx).toString() + "_filtered",
            "-sidx", "0",
            "-tgt", pathToIndex("old") + "/" + cidx + ":0:1:0:filter"
        });
        // read the old index sizes so the new ids can start where the old index ends
        FileLineIterator iter = new FileLineIterator(FileSystem.get(getConf()).open(pathToIndexSize("old")));
        while (iter.hasNext()) {
          String[] tokens = iter.next().split(DELIMETER);
          totalIndexSizes.put(Integer.parseInt(tokens[0]), Long.parseLong(tokens[1]));
        }
        ToolRunner.run(new SequentialIdGeneratorJob(), new String[] {
            "-i", pathToDistinctValues(distinctValueParentPath, cidx).toString() + "_filtered",
            "-o", pathToDistinctValues(distinctValueParentPath, cidx).toString() + "_filtered_index",
            "--tempDir", getTempPath("idgenerate2" + cidx).toString(),
            "--cleanUp", "false",
            "--startIndex", String.valueOf(totalIndexSizes.get(cidx))
        });
        totalIndexSizes.put(cidx, totalIndexSizes.get(cidx) + SequentialIdGeneratorJob.totalIdCount);
        mergeTwoPath(new Path(pathToIndex("old"), String.valueOf(cidx)),
            new Path(pathToDistinctValues(distinctValueParentPath, cidx).toString() + "_filtered_index"),
            pathToIndexOutput(cidx, true));
      }
    }
    return totalIndexSizes;
  }

  private void writeIndexSizesIntoHdfs(Path output, Map<Integer, Long> indexSizes) throws IOException {
    StringBuffer sb = new StringBuffer();
    for (Entry<Integer, Long> idxSize : indexSizes.entrySet()) {
      sb.append(idxSize.getKey() + DELIMETER + idxSize.getValue() + DefaultOptionCreator.NEWLINE);
    }
    HadoopClusterUtil.writeToHdfs(getConf(), output, sb.toString(), false);
  }

  // hadoop 0.20.2 doesn't support append mode, so merge the two inputs with a map-only identity job
  private void mergeTwoPath(Path inputA, Path inputB, Path output)
      throws IOException, InterruptedException, ClassNotFoundException {
    Job mergeJob = prepareJob(inputA, output, TextInputFormat.class, IdentityMapper.class,
        NullWritable.class, Text.class, TextOutputFormat.class);
    FileInputFormat.addInputPath(mergeJob, inputB);
    mergeJob.getConfiguration().setBoolean(IdentityMapper.VALUE_ONLY_OUT, true);
    mergeJob.waitForCompletion(true);
  }

  private Path pathToIndexSize(String oldOrNew) {
    return new Path(getOutputPath().toString() + "_index_" + oldOrNew + "_size");
  }

  private Path pathToIndex(String oldOrNew) {
    return new Path(getOutputPath().toString() + "_index_" + oldOrNew);
  }

  private Path pathToDistinctValues(Path basePath, int columnIndex) {
    return new Path(basePath, String.valueOf(columnIndex));
  }
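  // Per-column index files are laid out as <output>_index_new/<columnIndex> for the freshly
  // built index and <output>_index_old/<columnIndex> for a previous run's index (see
  // checkIndexExist()); the corresponding "_size" files above hold each index's dimension.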
  private Path pathToIndexOutput(int columnIndex, boolean isNew) {
    return new Path(getOutputPath().toString() + (isNew ? "_index_new" : "_index_old"), String.valueOf(columnIndex));
  }
}