package edu.isi.karma.mapreduce.driver;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that scans the raw bytes of each input value for XML elements with a
 * configured tag name and emits every element found as a separate record.
 *
 * <p>Each emitted value is the configured prologue, followed by the element
 * text (start tag through end tag, inclusive), followed by the configured
 * epilogue. The output key is the input key.
 *
 * <p>Configuration properties read in {@link #setup}:
 * <ul>
 *   <li>{@code karma.extraction.xml.tag} - element name to extract</li>
 *   <li>{@code karma.extraction.xml.prologue} - text prepended to each element (default empty)</li>
 *   <li>{@code karma.extraction.xml.epilogue} - text appended to each element (default empty)</li>
 * </ul>
 *
 * <p>Tag bytes and element text are encoded/decoded as UTF-8, which is also
 * the encoding used by Hadoop {@code Text}.
 */
public class XMLElementExtractorMapper extends Mapper<Text, BytesWritable, Text, Text> {

    private final Text reusableKey = new Text();
    private final Text reusableValue = new Text();

    /** Encoded {@code "<tag"}; the character following it is validated separately. */
    private byte[] startTagBytes;
    /** Encoded {@code "</tag>"}. */
    private byte[] endTagBytes;
    private String prologue = "";
    private String epilogue = "";

    /**
     * Reads the tag name, prologue and epilogue from the job configuration and
     * pre-encodes the start/end tag patterns so they are not re-encoded for
     * every search.
     *
     * @param context the mapper context supplying the job configuration
     */
    @Override
    public void setup(Context context) {
        Configuration config = context.getConfiguration();
        // NOTE(review): if this property is unset, tag is null and the search
        // patterns become the literal strings "<null" and "</null>".
        String tag = config.get("karma.extraction.xml.tag");
        prologue = config.get("karma.extraction.xml.prologue", "");
        epilogue = config.get("karma.extraction.xml.epilogue", "");
        startTagBytes = ("<" + tag).getBytes(StandardCharsets.UTF_8);
        endTagBytes = ("</" + tag + ">").getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Emits one output record per matching element found in {@code value}.
     *
     * <p>Scanning stops at the first start tag that has no matching end tag
     * after it (for example a truncated document); nothing is emitted for that
     * unterminated element.
     *
     * @param key     input key, written unchanged as the output key
     * @param value   raw document bytes to scan
     * @param context the mapper context used to write output records
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        reusableKey.set(key.toString());

        // The backing array may be larger than the payload; only the first
        // getLength() bytes are valid data.
        byte[] data = value.getBytes();
        int length = value.getLength();

        int searchFrom = 0;
        int elementStart;
        while ((elementStart = findValidStartTag(data, length, startTagBytes, searchFrom)) != -1) {
            int endTagStart = indexOf(data, length, endTagBytes, elementStart);
            if (endTagStart == -1) {
                // Unterminated element: there is nothing complete left to emit.
                break;
            }
            int elementEnd = endTagStart + endTagBytes.length;
            String element =
                    new String(data, elementStart, elementEnd - elementStart, StandardCharsets.UTF_8);
            reusableValue.set(prologue + element + epilogue);
            context.write(reusableKey, reusableValue);
            // Resume after the end tag that was just consumed.
            searchFrom = elementEnd;
        }
    }

    /**
     * Finds the first occurrence of {@code startTag} at or after {@code start}
     * that is immediately followed by a space or {@code '>'}, so that a tag
     * such as {@code <item} does not match {@code <items>}.
     *
     * @param bytes    the bytes to search
     * @param startTag the start-tag prefix, e.g. {@code "<item"}
     * @param start    index at which to begin searching
     * @return index of the matching start tag, or {@code -1} if there is none
     */
    public int makeSureTagIsValid(byte[] bytes, String startTag, int start) {
        return findValidStartTag(
                bytes, bytes.length, startTag.getBytes(StandardCharsets.UTF_8), start);
    }

    /**
     * Finds the first occurrence of {@code startTag} (encoded as UTF-8) in
     * {@code input} at or after {@code start}.
     *
     * @param input    the bytes to search
     * @param startTag the text to look for
     * @param start    index at which to begin searching
     * @return index of the first byte of the match, or {@code -1} if not found
     */
    public int rawIndexOf(byte[] input, String startTag, int start) {
        return indexOf(input, input.length, startTag.getBytes(StandardCharsets.UTF_8), start);
    }

    /**
     * Locates a start tag followed by a space or {@code '>'} within the first
     * {@code limit} bytes. {@code limit} must not exceed {@code bytes.length}.
     */
    private static int findValidStartTag(byte[] bytes, int limit, byte[] tagBytes, int start) {
        int position = indexOf(bytes, limit, tagBytes, start);
        while (position != -1) {
            int following = position + tagBytes.length;
            if (following >= limit) {
                // The tag prefix ends exactly at the end of the data, so there is no
                // delimiter to validate and no room for a later occurrence.
                return -1;
            }
            byte next = bytes[following];
            if (next == ' ' || next == '>') {
                return position;
            }
            position = indexOf(bytes, limit, tagBytes, position + 1);
        }
        return -1;
    }

    /**
     * Returns the index of the first occurrence of {@code pattern} that lies
     * entirely within the first {@code limit} bytes of {@code input} and
     * starts at or after {@code start}, or {@code -1} if there is none.
     * {@code limit} must not exceed {@code input.length}.
     */
    private static int indexOf(byte[] input, int limit, byte[] pattern, int start) {
        if (pattern.length == 0) {
            return -1;
        }
        int lastCandidate = limit - pattern.length;
        for (int candidate = Math.max(start, 0); candidate <= lastCandidate; candidate++) {
            int matched = 0;
            while (matched < pattern.length && input[candidate + matched] == pattern[matched]) {
                matched++;
            }
            if (matched == pattern.length) {
                return candidate;
            }
        }
        return -1;
    }
}