package hadoop.wikievidence.ldadataconstruction;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.google.gson.Gson;

/**
 * Mapper that reads one JSON record per input line, deserializes it into an
 * {@link Output}, and emits a (entity URL, content) pair for downstream LDA
 * data construction. The content string keeps the format of 1000 words before
 * and after the entity mention.
 */
public class WikipediaLDADataGeneratorMapper extends Mapper<LongWritable, Text, Text, Text> {

    /** Gson is thread-safe and reusable; one instance per mapper task. */
    private final Gson gson;

    // Reused output writables to avoid allocating two new objects per record
    // (standard Hadoop idiom; Context.write copies the bytes out).
    private final Text outKey = new Text();
    private final Text outValue = new Text();

    public WikipediaLDADataGeneratorMapper() {
        super();
        this.gson = new Gson();
    }

    /**
     * Parses one JSON line into an {@link Output} and emits (entity URL, content).
     * Blank lines (for which {@code Gson.fromJson} returns {@code null}) and
     * records missing the url or content field are skipped rather than crashing
     * the task with a NullPointerException.
     *
     * @param key     byte offset of the line in the input split (unused)
     * @param value   one JSON-serialized {@link Output} record
     * @param context Hadoop task context used to emit the output pair
     * @throws IOException          if the underlying record writer fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        Output o = gson.fromJson(line, Output.class);
        // fromJson returns null for empty/blank input; guard against NPE.
        if (o == null || o.getUrl() == null || o.getContent() == null) {
            return;
        }
        // Previous experiments also emitted o.getMention() + "---" + o.getContent();
        // the current format is the bare content string.
        outKey.set(o.getUrl());
        outValue.set(o.getContent());
        context.write(outKey, outValue);
    }

    /**
     * JSON schema of one input line. Declared {@code static} so Gson can
     * instantiate it without a reference to the enclosing mapper — Gson cannot
     * reliably deserialize non-static inner classes, whose implicit constructor
     * requires an enclosing instance.
     */
    public static class Output {
        private String entity;  // entity URL (JSON field name is "entity")
        private String content; // text window around the mention
        private String mention; // surface form of the entity mention

        /** Returns the entity URL, stored in the {@code entity} JSON field. */
        public String getUrl() {
            return entity;
        }

        public void setUrl(String url) {
            this.entity = url;
        }

        public String getContent() {
            return content;
        }

        public void setContent(String content) {
            this.content = content;
        }

        public String getMention() {
            return mention;
        }

        public void setMention(String mention) {
            this.mention = mention;
        }
    }
}