/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.mapper.attachments;
import org.apache.commons.cli.CommandLine;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.cli.CliTool;
import org.elasticsearch.common.cli.CliToolConfig;
import org.elasticsearch.common.cli.Terminal;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.core.MapperTestUtils;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import static org.elasticsearch.common.cli.CliToolConfig.Builder.cmd;
import static org.elasticsearch.common.cli.CliToolConfig.Builder.option;
import static org.elasticsearch.common.io.Streams.copy;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.mapper.attachments.AttachmentUnitTestCase.getIndicesModuleWithRegisteredAttachmentMapper;
import static org.elasticsearch.test.StreamsUtils.copyToStringFromClasspath;
/**
* This class provides a simple main class which can be used to test what is extracted from a given binary file.
* You can run it using
* -u file://URL/TO/YOUR/DOC
* --size set extracted size (default to mapper attachment size)
* BASE64 encoded binary
*
* Example:
* StandaloneRunner BASE64Text
* StandaloneRunner -u /tmp/mydoc.pdf
* StandaloneRunner -u /tmp/mydoc.pdf --size 1000000
*/
public class StandaloneRunner extends CliTool {
private static final CliToolConfig CONFIG = CliToolConfig.config("tika", StandaloneRunner.class)
.cmds(TikaRunner.CMD)
.build();
static {
System.setProperty("es.path.home", "/tmp");
}
static class TikaRunner extends Command {
private static final String NAME = "tika";
private final String url;
private final Integer size;
private final String base64text;
private final DocumentMapper docMapper;
private static final CliToolConfig.Cmd CMD = cmd(NAME, TikaRunner.class)
.options(option("u", "url").required(false).hasArg(false))
.options(option("t", "size").required(false).hasArg(false))
.build();
protected TikaRunner(Terminal terminal, String url, Integer size, String base64text) throws IOException {
super(terminal);
this.size = size;
this.url = url;
this.base64text = base64text;
DocumentMapperParser mapperParser = MapperTestUtils.newMapperService(PathUtils.get("."), Settings.EMPTY, getIndicesModuleWithRegisteredAttachmentMapper()).documentMapperParser(); // use CWD b/c it won't be used
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/attachment/test/standalone/standalone-mapping.json");
docMapper = mapperParser.parse("person", new CompressedXContent(mapping));
}
@Override
public ExitStatus execute(Settings settings, Environment env) throws Exception {
XContentBuilder builder = jsonBuilder().startObject().field("file").startObject();
if (base64text != null) {
// If base64 is provided
builder.field("_content", base64text);
} else {
// A file is provided
byte[] bytes = copyToBytes(PathUtils.get(url));
builder.field("_content", bytes);
}
if (size >= 0) {
builder.field("_indexed_chars", size);
}
BytesReference json = builder.endObject().endObject().bytes();
ParseContext.Document doc = docMapper.parse("person", "person", "1", json).rootDoc();
terminal.println("## Extracted text");
terminal.println("--------------------- BEGIN -----------------------");
terminal.println("%s", doc.get("file.content"));
terminal.println("---------------------- END ------------------------");
terminal.println("## Metadata");
printMetadataContent(doc, AttachmentMapper.FieldNames.AUTHOR);
printMetadataContent(doc, AttachmentMapper.FieldNames.CONTENT_LENGTH);
printMetadataContent(doc, AttachmentMapper.FieldNames.CONTENT_TYPE);
printMetadataContent(doc, AttachmentMapper.FieldNames.DATE);
printMetadataContent(doc, AttachmentMapper.FieldNames.KEYWORDS);
printMetadataContent(doc, AttachmentMapper.FieldNames.LANGUAGE);
printMetadataContent(doc, AttachmentMapper.FieldNames.NAME);
printMetadataContent(doc, AttachmentMapper.FieldNames.TITLE);
return ExitStatus.OK;
}
private void printMetadataContent(ParseContext.Document doc, String field) {
terminal.println("- %s: %s", field, doc.get(docMapper.mappers().getMapper("file." + field).fieldType().names().indexName()));
}
public static byte[] copyToBytes(Path path) throws IOException {
try (InputStream is = Files.newInputStream(path)) {
if (is == null) {
throw new FileNotFoundException("Resource [" + path + "] not found in classpath");
}
try (BytesStreamOutput out = new BytesStreamOutput()) {
copy(is, out);
return out.bytes().toBytes();
}
}
}
public static Command parse(Terminal terminal, CommandLine cli) throws IOException {
String url = cli.getOptionValue("u");
String base64text = null;
String sSize = cli.getOptionValue("size");
Integer size = sSize != null ? Integer.parseInt(sSize) : -1;
if (url == null && cli.getArgs().length == 0) {
return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided (type -h for help)");
}
if (url == null) {
if (cli.getArgs().length == 0) {
return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided (type -h for help)");
}
base64text = cli.getArgs()[0];
} else {
if (cli.getArgs().length == 1) {
return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided. Not both. (type -h for help)");
}
}
return new TikaRunner(terminal, url, size, base64text);
}
}
public StandaloneRunner() {
super(CONFIG);
}
public static void main(String[] args) {
StandaloneRunner pluginManager = new StandaloneRunner();
pluginManager.execute(args);
}
@Override
protected Command parse(String cmdName, CommandLine cli) throws Exception {
switch (cmdName.toLowerCase(Locale.ROOT)) {
case TikaRunner.NAME: return TikaRunner.parse(terminal, cli);
default:
assert false : "can't get here as cmd name is validated before this method is called";
return exitCmd(ExitStatus.CODE_ERROR);
}
}
}