package com.skp.experiment.common.join; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Mapper.Context; import com.skp.experiment.common.OptionParseUtil; public class JoinUtils { public static Map<String, String> fetchTextFiles(Context ctx, Path input, String delimeter, List<Integer> keyIdxs, List<Integer> valueIdxs) throws IOException { Configuration conf = ctx.getConfiguration(); Map<String, String> caches = new HashMap<String, String>(); // read target file. FileSystem fs = FileSystem.get(conf); FileStatus[] files = fs.globStatus(new Path(input.toString() + "/^[^_]*")); for (FileStatus file : files) { FSDataInputStream in = fs.open(file.getPath()); BufferedReader reader = new BufferedReader(new InputStreamReader(in)); String line = null; while ((line = reader.readLine()) != null) { String[] tokens = line.split(delimeter); // record target key, value String key = OptionParseUtil.encode(tokens, keyIdxs, delimeter); String value = OptionParseUtil.encode(tokens, valueIdxs, delimeter); caches.put(key, value); if (caches.size() % 1000000 == 0) { ctx.setStatus("fetched " + caches.size()); } } } return caches; } public static Map<String, String> fetchTextFile(Context ctx, Path input, String delimeter, List<Integer> keyIdxs, List<Integer> valueIdxs) throws IOException { Configuration conf = ctx.getConfiguration(); Map<String, String> caches = new HashMap<String, String>(); // read target file. FileSystem fs = FileSystem.get(conf); FSDataInputStream in = fs.open(input); BufferedReader reader = new BufferedReader(new InputStreamReader(in)); String line = null; while ((line = reader.readLine()) != null) { String[] tokens = line.split(delimeter); // record target key, value String key = OptionParseUtil.encode(tokens, keyIdxs, delimeter); String value = OptionParseUtil.encode(tokens, valueIdxs, delimeter); caches.put(key, value); if (caches.size() % 1000000 == 0) { ctx.setStatus("fetched " + caches.size()); } } return caches; } }