package eu.dnetlib.iis.wf.metadataextraction; import java.io.InputStream; import java.nio.ByteBuffer; import java.util.Collections; import java.util.HashMap; import java.util.Map; import org.apache.avro.file.DataFileWriter; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import eu.dnetlib.iis.common.java.PortBindings; import eu.dnetlib.iis.common.java.Process; import eu.dnetlib.iis.common.java.io.DataStore; import eu.dnetlib.iis.common.java.io.FileSystemPath; import eu.dnetlib.iis.common.java.porttype.AvroPortType; import eu.dnetlib.iis.common.java.porttype.PortType; import eu.dnetlib.iis.importer.schemas.DocumentContent; /** * Generates a directory with a few example avro files containing PDFs. * * @author Dominika Tkaczyk * */ public class ExamplePdfBasedDocumentContentGenerator implements Process { private static final String PORT_OUT_DOC_CONTENT = "doc_content"; private static final String PARAM_PDF_SOURCE_DIR = "pdfs_resource_dir"; private final Map<String, PortType> outputPorts = new HashMap<String, PortType>(); // ------------------------- CONSTRUCTORS -------------------------------- public ExamplePdfBasedDocumentContentGenerator() { outputPorts.put(PORT_OUT_DOC_CONTENT, new AvroPortType(DocumentContent.SCHEMA$)); } // ------------------------- LOGIC --------------------------------------- @Override public Map<String, PortType> getInputPorts() { return Collections.emptyMap(); } @Override public Map<String, PortType> getOutputPorts() { return outputPorts; } @Override public void run(PortBindings portBindings, Configuration conf, Map<String, String> parameters) throws Exception { FileSystem fs = FileSystem.get(conf); FileSystemPath fsPath = new FileSystemPath(fs, portBindings.getOutput().get(PORT_OUT_DOC_CONTENT)); int id = 0; try (DataFileWriter<DocumentContent> writer = DataStore.create(fsPath, DocumentContent.SCHEMA$)) { for (InputStream is : StandardPDFExamples.getFilesFromResources(parameters.get(PARAM_PDF_SOURCE_DIR))) { DocumentContent.Builder docContentBuilder = DocumentContent.newBuilder(); docContentBuilder.setId("id" + (id++)); try { docContentBuilder.setPdf(ByteBuffer.wrap(IOUtils.toByteArray(is))); } finally { is.close(); } writer.append(docContentBuilder.build()); } } } }