package com.caseystella.util.pig.udf; import com.caseystella.util.common.enrich.ExtractContent; import org.apache.pig.EvalFunc; import org.apache.pig.data.*; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.xml.sax.SAXException; import java.io.IOException; import java.util.Map; /** * Created by cstella on 9/3/14. */ public class ContentExtractor extends EvalFunc<Tuple> { TupleFactory tupleFactory; BagFactory bagFactory; public ContentExtractor() { tupleFactory = TupleFactory.getInstance(); bagFactory = DefaultBagFactory.getInstance(); } @Override public Tuple exec(Tuple objects) throws IOException { Map.Entry<String, Metadata> content ; try { content = ExtractContent.INSTANCE.extractTextWithMetadata((String)objects.get(0) ,((DataByteArray)objects.get(1)).get() ); } catch (SAXException e) { throw new RuntimeException("Unable to parse.", e); } catch (TikaException e) { throw new RuntimeException("Unable to parse.", e); } DataBag metadataBag = bagFactory.newDefaultBag(); Tuple ret = tupleFactory.newTuple(2); ret.set(0, content.getKey()); for(String key : content.getValue().names()) { String value = content.getValue().get(key); Tuple t = tupleFactory.newTuple(2); t.set(0, key); t.set(1, value); metadataBag.add(t); } ret.set(1, metadataBag); return ret; } public Schema outputSchema(Schema inputSchema) { Schema ret = new Schema(); try { ret.add(new Schema.FieldSchema("content", DataType.CHARARRAY)); Schema bagSchema = new Schema(); bagSchema.add(new Schema.FieldSchema("key", DataType.CHARARRAY)); bagSchema.add(new Schema.FieldSchema("value", DataType.CHARARRAY)); ret.add(new Schema.FieldSchema("metadata", bagSchema, DataType.BAG)); return new Schema(new Schema.FieldSchema(getSchemaName("extracted", inputSchema), ret, DataType.TUPLE)); } catch (FrontendException e) { throw new RuntimeException("Unable to create metadata bag schema.", e); } } }