/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
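/**
 * Hadoop mapper that tags each input record with an identifier for the input
 * it was read from (resolved once per task in {@code configure}) and emits it
 * as a {@link JoinValue} under the key returned by {@code mapKey}.
 *
 * @author rana
 *
 */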
public class JoinMapper implements Mapper<WritableComparable,Writable,WritableComparable,JoinValue> {

  private static final Log LOG = LogFactory.getLog(JoinMapper.class);

  /** Job configuration key under which the JSON-encoded path-to-tag mapping is stored. */
  public static final String PATH_TO_TAG_MAPPING = "path_to_tag";

  /**
   * Qualified input directory to tag mapping read from the job configuration.
   * Stays {@code null} when the job carries no (or an empty) mapping.
   */
  Map<Path,String> _mappings = null;

  /** Tag attached to every value emitted by this mapper; populated in {@link #configure(JobConf)}. */
  TextBytes _tagType = new TextBytes();

  /**
   * Derives a tag name from the directory that contains the given file.
   *
   * If the immediate parent directory's name starts with a digit, the name of
   * the grandparent directory is returned instead; otherwise the parent
   * directory's name is returned.
   *
   * @param path path of an input file
   * @return the name of the parent directory, or of the grandparent directory
   *         when the parent's name starts with a digit and a grandparent exists
   * @throws IllegalArgumentException if the path has no parent directory
   */
  static String getParentDirFromPath(Path path) {
    Path parent = path.getParent();
    if (parent == null) {
      throw new IllegalArgumentException("Path has no parent directory: " + path);
    }
    String parentName = parent.getName();
    // Guard against an empty name (e.g. the root directory) before looking at
    // the first character, and against a missing grandparent.
    if (parentName.length() != 0 && Character.isDigit(parentName.charAt(0))) {
      Path grandParent = parent.getParent();
      if (grandParent != null) {
        return grandParent.getName();
      }
    }
    return parentName;
  }

  /**
   * Resolves the tag for this task's input file (read from the
   * {@code map.input.file} job property).
   *
   * When a path-to-tag mapping is configured and contains the qualified parent
   * directory of the input file, the mapped tag is used. Otherwise the tag is
   * derived from the directory name via {@link #getParentDirFromPath(Path)}.
   *
   * @param job the job configuration
   * @throws RuntimeException if the file system cannot be obtained or the
   *         configured mappings cannot be read
   */
  @Override
  public void configure(JobConf job) {
    FileSystem fs;
    try {
      fs = FileSystem.get(job);
      readTagMappings(job);
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      throw new RuntimeException(e);
    }

    Path inputSplit = new Path(job.get("map.input.file"));
    // Directory-derived tag: the default, and the fallback for unmapped inputs.
    String tag = getParentDirFromPath(inputSplit);

    if (_mappings != null) {
      Path inputDir = inputSplit.getParent();
      String tagId = _mappings.get(inputDir.makeQualified(fs));
      LOG.info("Mapped Path:" + inputDir + " to:" + tagId);
      if (tagId != null) {
        tag = tagId;
      } else {
        // Never hand a null tag to TextBytes; use the directory name instead.
        LOG.warn("No tag mapping found for Path:" + inputDir + " using:" + tag);
      }
    }
    _tagType.set(tag);
  }

  /** No resources to release. */
  @Override
  public void close() throws IOException {
  }

  /**
   * Serializes the given path-to-tag mapping as a JSON array of
   * {@code {"key": <path>, "value": <tag>}} objects and stores it in the job
   * configuration under {@link #PATH_TO_TAG_MAPPING}.
   *
   * @param mappings input path to tag mapping
   * @param conf job configuration to store the mapping in
   * @throws IOException declared for callers; not thrown by the visible code
   */
  public static void setPathToTagMapping(Map<Path,String> mappings,JobConf conf)throws IOException {
    JsonArray jsonArray = new JsonArray();
    for (Map.Entry<Path,String> entry : mappings.entrySet()) {
      JsonObject jsonEntry = new JsonObject();
      jsonEntry.addProperty("key", entry.getKey().toString());
      jsonEntry.addProperty("value", entry.getValue());
      jsonArray.add(jsonEntry);
    }
    conf.set(PATH_TO_TAG_MAPPING, jsonArray.toString());
  }

  /**
   * Reads the JSON mapping stored under {@link #PATH_TO_TAG_MAPPING} (if any)
   * into {@link #_mappings}, qualifying every path against the job's file
   * system. {@code _mappings} is left untouched when the property is absent or
   * holds an empty array.
   *
   * @param jobConf the job configuration
   * @throws IOException if the file system cannot be obtained, or if the stored
   *         mapping cannot be parsed (the original failure is attached as the cause)
   */
  void readTagMappings(JobConf jobConf)throws IOException {
    FileSystem fs = FileSystem.get(jobConf);
    try {
      String mappings = jobConf.get(PATH_TO_TAG_MAPPING);
      LOG.info("Got Mappings:" + mappings);
      if (mappings == null) {
        return;
      }
      JsonArray jsonArray = new JsonParser().parse(mappings).getAsJsonArray();
      if (jsonArray == null || jsonArray.size() == 0) {
        return;
      }
      // Build into a local map so a failure mid-parse never publishes a
      // partially populated mapping.
      Map<Path,String> parsed = new HashMap<Path,String>();
      for (int i=0;i<jsonArray.size();++i) {
        JsonObject jsonObject = jsonArray.get(i).getAsJsonObject();
        String key = jsonObject.get("key").getAsString();
        String value = jsonObject.get("value").getAsString();
        parsed.put(new Path(key).makeQualified(fs), value);
        LOG.info("Got Key/Value Mapping:"+ key + " " + value);
      }
      _mappings = parsed;
    }
    catch (Exception e) {
      LOG.error(CCStringUtils.stringifyException(e));
      // Keep the root cause so callers can see why parsing failed.
      throw new IOException("Failed to Parse Path to Tag Mappings!", e);
    }
  }

  /**
   * Hook for choosing the output key of a record. This implementation returns
   * the input key unchanged.
   *
   * @param key the input key
   * @param value the tagged value about to be emitted
   * @return the key to emit the value under
   * @throws IOException declared for overriding implementations
   */
  public WritableComparable mapKey(WritableComparable key,JoinValue value)throws IOException {
    return key;
  }

  /**
   * Builds a {@link JoinValue} from this mapper's tag and the input value, and
   * emits it under the key returned by
   * {@link #mapKey(WritableComparable, JoinValue)}.
   */
  @Override
  public void map(WritableComparable key, Writable value,
      OutputCollector<WritableComparable, JoinValue> output, Reporter reporter)
      throws IOException {
    JoinValue joinValue = JoinValue.getJoinValue(_tagType, value);
    WritableComparable outputKey = mapKey(key,joinValue);
    output.collect(outputKey, joinValue);
  }
}