package edu.isi.dig.elasticsearch.mapreduce.driver; import java.io.ByteArrayInputStream; import java.io.IOException; import net.sf.json.JSONArray; import net.sf.json.JSONObject; import net.sf.json.JSONSerializer; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Mapper; import org.apache.tika.exception.TikaException; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class ESMapper extends Mapper<Writable,Text,Text,Text>{ private Text reusableKey = new Text(); @Override public void map(Writable key,Text value, Context context) throws IOException, InterruptedException { JSONObject jObj = extractTika(value.toString()); if (jObj.containsKey("_id")) { reusableKey.set(jObj.getString("_id")); context.write(reusableKey, new Text(jObj.toString())); } else { context.write(null, new Text(jObj.toString())); } } private JSONObject extractTika(String contents){ JSONObject jObj = (JSONObject)JSONSerializer.toJSON(contents); if(jObj.containsKey("_source")) { JSONObject jObjSource = jObj.getJSONObject("_source"); if(jObjSource.containsKey("raw_content")) { String rawHtml = jObjSource.getString("raw_content"); ByteArrayInputStream bIs = new ByteArrayInputStream(rawHtml.getBytes()); Metadata metadata = new Metadata(); AutoDetectParser adp = new AutoDetectParser(); ContentHandler handler = new BodyContentHandler(10*1024*1024); try { adp.parse(bIs, handler, metadata); String[] metadataNames = metadata.names(); JSONObject jObjMetadata = new JSONObject(); for(String metadataName:metadataNames) { String[] values = metadata.getValues(metadataName); JSONArray jArray = new JSONArray(); for(String mValue: values) { jArray.add(mValue); } jObjMetadata.accumulate(metadataName, jArray); } //remove empty lines from the text String rawTextAdjusted = handler.toString().replaceAll("(?m)^[ \t]*\r?\n", ""); //detect language LanguageIdentifier li = new LanguageIdentifier(rawTextAdjusted); jObjSource.accumulate("tikametadata", jObjMetadata); jObjSource.accumulate("raw_text", rawTextAdjusted); jObjSource.accumulate("rawtextdetectedlanguage", li.getLanguage()); } catch (SAXException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (TikaException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } return jObj; } }