package org.apache.tika.parser.mock;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static java.nio.charset.StandardCharsets.UTF_8;
import javax.xml.parsers.DocumentBuilder;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * This class enables mocking of parser behavior for use in testing
 * wrappers and drivers of parsers.
 * <p>
 * The input is an XML document whose root element's child elements are
 * "actions" that are executed in document order. The recognized action
 * names are {@code metadata}, {@code write}, {@code throw}, {@code hang},
 * {@code oom}, {@code print_out}, {@code print_err}, {@code embedded} and
 * {@code throwIllegalChars}; any other element name results in an
 * {@link IllegalArgumentException}.
 * <p>
 * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation
 * of all the options for this MockParser.
 * <p>
 * Tests for this class are in tika-parsers.
 * <p>
 * See also {@link org.apache.tika.parser.DummyParser} for another option.
 */
public class MockParser extends AbstractParser {

    private static final long serialVersionUID = 1L;

    /**
     * Returns the single media type handled by this parser,
     * {@code application/mock+xml}, in a fresh mutable set.
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        Set<MediaType> types = new HashSet<MediaType>();
        types.add(MediaType.application("mock+xml"));
        return types;
    }

    /**
     * Reads the mock XML from {@code stream} and executes each child element
     * of the root as an action, wrapped in a single XHTML document.
     *
     * @throws IOException   if the mock XML cannot be read; a {@link SAXException}
     *                       raised while reading the input is rethrown wrapped in an
     *                       {@link IOExceptionWithCause}, or if an action throws it
     * @throws SAXException  if writing to the handler fails or an action throws it
     * @throws TikaException if an action throws it
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
                      Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        Document doc;
        try {
            DocumentBuilder docBuilder = context.getDocumentBuilder();
            doc = docBuilder.parse(stream);
        } catch (SAXException e) {
            //to distinguish between SAX on read vs SAX while writing
            throw new IOExceptionWithCause(e);
        }
        Node root = doc.getDocumentElement();
        NodeList actions = root.getChildNodes();
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        for (int i = 0; i < actions.getLength(); i++) {
            executeAction(actions.item(i), metadata, context, xhtml);
        }
        xhtml.endDocument();
    }

    /**
     * Dispatches one child node of the root to the handler for its element name.
     * Non-element nodes (text, comments, ...) are silently skipped.
     *
     * @throws IllegalArgumentException if the element name is not a known action
     */
    private void executeAction(Node action, Metadata metadata, ParseContext context,
                               XHTMLContentHandler xhtml) throws SAXException,
            IOException, TikaException {
        if (action.getNodeType() != Node.ELEMENT_NODE) {
            return;
        }

        String name = action.getNodeName();
        if ("metadata".equals(name)) {
            metadata(action, metadata);
        } else if ("write".equals(name)) {
            write(action, xhtml);
        } else if ("throw".equals(name)) {
            throwIt(action);
        } else if ("hang".equals(name)) {
            hang(action);
        } else if ("oom".equals(name)) {
            kabOOM();
        } else if ("print_out".equals(name) || "print_err".equals(name)) {
            print(action, name);
        } else if ("embedded".equals(name)) {
            handleEmbedded(action, xhtml, context);
        } else if ("throwIllegalChars".equals(name)) {
            throwIllegalChars();
        } else {
            throw new IllegalArgumentException("Didn't recognize mock action: " + name);
        }
    }

    /**
     * Always throws an {@link IOException} whose message contains the control
     * characters U+0000 through U+0003.
     */
    private void throwIllegalChars() throws IOException {
        throw new IOException("Can't say \u0000 in xml or \u0001 or \u0002 or \u0003");
    }

    /**
     * Hands the text content of the {@code embedded} element, encoded as UTF-8,
     * to the embedded document extractor. The optional {@code filename} and
     * {@code content-type} attributes are copied into the embedded document's
     * metadata; the content type is only set when it is non-empty.
     */
    private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context)
            throws TikaException, SAXException, IOException {
        String fileName = "";
        String contentType = "";
        NamedNodeMap attrs = action.getAttributes();
        if (attrs != null) {
            Node fileNameNode = attrs.getNamedItem("filename");
            if (fileNameNode != null) {
                fileName = fileNameNode.getNodeValue();
            }
            Node contentTypeNode = attrs.getNamedItem("content-type");
            if (contentTypeNode != null) {
                contentType = contentTypeNode.getNodeValue();
            }
        }

        String embeddedText = action.getTextContent();
        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(context);

        Metadata m = new Metadata();
        m.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
        if (!"".equals(contentType)) {
            m.set(Metadata.CONTENT_TYPE, contentType);
        }

        InputStream is = new ByteArrayInputStream(embeddedText.getBytes(UTF_8));
        extractor.parseEmbedded(is, new EmbeddedContentHandler(handler), m, true);
    }

    /**
     * Returns the {@link EmbeddedDocumentExtractor} registered in the context, or a
     * new {@link ParsingEmbeddedDocumentExtractor} if none is registered.
     * <p>
     * Side effect: when falling back and the context has no {@link Parser}
     * registered, a new {@code MockParser} is set on the context first.
     */
    protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor(ParseContext context) {
        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class);
        if (extractor != null) {
            return extractor;
        }
        if (context.get(Parser.class) == null) {
            context.set(Parser.class, new MockParser());
        }
        return new ParsingEmbeddedDocumentExtractor(context);
    }

    /**
     * Prints the element's text content to {@code System.out} (for
     * {@code print_out}) or {@code System.err} (for {@code print_err}).
     *
     * @throws IllegalArgumentException if {@code name} is neither of the two
     */
    private void print(Node action, String name) {
        String content = action.getTextContent();
        if ("print_out".equals(name)) {
            System.out.println(content);
        } else if ("print_err".equals(name)) {
            System.err.println(content);
        } else {
            throw new IllegalArgumentException("must be print_out or print_err");
        }
    }

    /**
     * Blocks the calling thread as configured by the {@code hang} element.
     * <ul>
     *   <li>{@code millis} (required): how long to hang, as a long</li>
     *   <li>{@code interruptible} (optional, default true): {@code "true"} to stop
     *       when the thread is interrupted; any other value means false</li>
     *   <li>{@code heavy} (optional, default false): {@code "true"} to spin in a
     *       computation loop instead of sleeping</li>
     *   <li>{@code pulse_millis} (required when heavy): how often the heavy loop
     *       checks for timeout/interruption, as a long</li>
     * </ul>
     *
     * @throws RuntimeException if a required attribute is missing or not a long
     */
    private void hang(Node action) {
        NamedNodeMap attrs = action.getAttributes();

        boolean interruptible = true;
        Node iNode = attrs.getNamedItem("interruptible");
        if (iNode != null) {
            interruptible = "true".equals(iNode.getNodeValue());
        }

        boolean heavy = false;
        Node hNode = attrs.getNamedItem("heavy");
        if (hNode != null) {
            heavy = "true".equals(hNode.getNodeValue());
        }

        Node mNode = attrs.getNamedItem("millis");
        if (mNode == null) {
            throw new RuntimeException("Must specify \"millis\" attribute for hang.");
        }
        long millis = parseLongAttribute(mNode, "millis");

        if (heavy) {
            Node pNode = attrs.getNamedItem("pulse_millis");
            if (pNode == null) {
                throw new RuntimeException(
                        "Must specify attribute \"pulse_millis\" if the hang is \"heavy\"");
            }
            // Read the value from the pulse_millis attribute itself; the earlier
            // code mistakenly re-read the "millis" attribute here.
            long pulseMillis = parseLongAttribute(pNode, "pulse_millis");
            hangHeavy(millis, pulseMillis, interruptible);
        } else {
            sleep(millis, interruptible);
        }
    }

    /**
     * Parses the value of an attribute node as a long.
     *
     * @param attr     the attribute node whose value is parsed
     * @param attrName the attribute's name, used only in the error message
     * @throws RuntimeException if the value is not a valid long; the
     *                          {@link NumberFormatException} is attached as the cause
     */
    private static long parseLongAttribute(Node attr, String attrName) {
        try {
            return Long.parseLong(attr.getNodeValue());
        } catch (NumberFormatException e) {
            throw new RuntimeException(
                    "Value for \"" + attrName + "\" attribute must be a long.", e);
        }
    }

    /**
     * Throws an instance of the class named by the element's {@code class}
     * attribute, using the element's text content as the message.
     * A missing {@code class} attribute results in a NullPointerException.
     */
    private void throwIt(Node action) throws IOException,
            SAXException, TikaException {
        NamedNodeMap attrs = action.getAttributes();
        String className = attrs.getNamedItem("class").getNodeValue();
        String msg = action.getTextContent();
        throwIt(className, msg);
    }

    /**
     * Adds or sets a metadata value. The key is the required {@code name}
     * attribute and the value is the element's text content. The value is
     * {@code set} only when the {@code action} attribute equals {@code "set"};
     * in every other case (including no {@code action} attribute) it is added.
     */
    private void metadata(Node action, Metadata metadata) {
        NamedNodeMap attrs = action.getAttributes();
        //throws npe unless there is a name
        String name = attrs.getNamedItem("name").getNodeValue();
        String value = action.getTextContent();
        Node actionType = attrs.getNamedItem("action");
        if (actionType != null && "set".equals(actionType.getNodeValue())) {
            metadata.set(name, value);
        } else {
            metadata.add(name, value);
        }
    }

    /**
     * Writes the element's text content to the XHTML handler, wrapped in the
     * element named by the optional {@code element} attribute (default {@code p}).
     */
    private void write(Node action, XHTMLContentHandler xhtml) throws SAXException {
        NamedNodeMap attrs = action.getAttributes();
        Node eNode = attrs.getNamedItem("element");
        String elementType = "p";
        if (eNode != null) {
            elementType = eNode.getTextContent();
        }
        String text = action.getTextContent();
        xhtml.startElement(elementType);
        xhtml.characters(text);
        xhtml.endElement(elementType);
    }

    /**
     * Reflectively instantiates {@code className} and throws the result.
     * The no-arg constructor is used when {@code msg} is null or empty,
     * otherwise the single-{@code String} constructor.
     * <p>
     * {@link SAXException}, {@link IOException}, {@link TikaException},
     * {@link Error} and {@link RuntimeException} instances are thrown as-is;
     * any other throwable is wrapped in a {@link RuntimeException}.
     *
     * @throws RuntimeException if the class cannot be loaded or instantiated,
     *                          or is not a {@link Throwable}
     */
    private void throwIt(String className, String msg) throws IOException,
            SAXException, TikaException {
        Throwable t;
        try {
            Class<?> clazz = Class.forName(className);
            if (msg == null || msg.equals("")) {
                // Class#newInstance is deprecated; go through the no-arg constructor.
                t = (Throwable) clazz.getDeclaredConstructor().newInstance();
            } else {
                Constructor<?> con = clazz.getConstructor(String.class);
                t = (Throwable) con.newInstance(msg);
            }
        } catch (Exception e) {
            throw new RuntimeException("couldn't create throwable class:" + className, e);
        }

        if (t instanceof SAXException) {
            throw (SAXException) t;
        } else if (t instanceof IOException) {
            throw (IOException) t;
        } else if (t instanceof TikaException) {
            throw (TikaException) t;
        } else if (t instanceof Error) {
            throw (Error) t;
        } else if (t instanceof RuntimeException) {
            throw (RuntimeException) t;
        } else {
            //wrap the throwable in a RuntimeException
            throw new RuntimeException(t);
        }
    }

    /**
     * Allocates and retains int arrays in an endless loop; the loop has no
     * exit condition, so it is intended to end in an OutOfMemoryError.
     */
    private void kabOOM() {
        List<int[]> ints = new ArrayList<int[]>();
        while (true) {
            int[] intArr = new int[32000];
            ints.add(intArr);
        }
    }

    /**
     * Spins in a computation loop until {@code maxMillis} have elapsed or,
     * if {@code interruptible}, until the thread's interrupt flag is set.
     * <p>
     * Note that {@code pulseCheckMillis} is compared against an iteration
     * counter, not against elapsed time: the timeout/interrupt checks run once
     * the counter exceeds that value, and the counter is then reset.
     */
    private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) {
        //do some heavy computation and occasionally check for
        //whether time has exceeded maxMillis (see TIKA-1132 for inspiration)
        //or whether the thread was interrupted
        long start = System.currentTimeMillis();
        int lastChecked = 0;
        while (true) {
            for (int i = 1; i < Integer.MAX_VALUE; i++) {
                for (int j = 1; j < Integer.MAX_VALUE; j++) {
                    // the result is deliberately unused; this is just busy work
                    double div = (double) i / (double) j;
                    lastChecked++;
                    if (lastChecked > pulseCheckMillis) {
                        lastChecked = 0;
                        if (interruptible && Thread.currentThread().isInterrupted()) {
                            return;
                        }
                        long elapsed = System.currentTimeMillis() - start;
                        if (elapsed > maxMillis) {
                            return;
                        }
                    }
                }
            }
        }
    }

    /**
     * Sleeps for {@code maxMillis}. If the sleep is interrupted and
     * {@code isInterruptible} is true, returns immediately; otherwise the
     * interruption is ignored and sleeping resumes for the remaining time.
     */
    private void sleep(long maxMillis, boolean isInterruptible) {
        long start = System.currentTimeMillis();
        long millisRemaining = maxMillis;
        while (true) {
            try {
                Thread.sleep(millisRemaining);
            } catch (InterruptedException e) {
                if (isInterruptible) {
                    return;
                }
                // not interruptible: deliberately ignore and keep sleeping
            }
            long elapsed = System.currentTimeMillis() - start;
            millisRemaining = maxMillis - elapsed;
            if (millisRemaining <= 0) {
                break;
            }
        }
    }
}