package org.apache.tika.parser.jdbc;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.tika.TikaTest;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Database;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
public class SQLite3ParserTest extends TikaTest {
private final static String TEST_FILE_NAME = "testSqlite3b.db";
private final static String TEST_FILE1 = "/test-documents/" + TEST_FILE_NAME;
@Test
public void testBasic() throws Exception {
    Parser parser = new AutoDetectParser();
    //exercise three flavors of input: the raw resource stream,
    //an in-memory byte-array stream, and a TikaInputStream over a real file
    ByteArrayOutputStream buffered = new ByteArrayOutputStream();
    IOUtils.copy(getResourceAsStream(TEST_FILE1), buffered);
    List<InputStream> streams = new ArrayList<InputStream>();
    streams.add(getResourceAsStream(TEST_FILE1));
    streams.add(new ByteArrayInputStream(buffered.toByteArray()));
    streams.add(TikaInputStream.get(getResourceAsFile(TEST_FILE1)));
    int executed = 0;
    for (InputStream in : streams) {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        //1) getXML closes the stream
        //2) getXML runs recursively on the contents, so the embedded docs should show up
        XMLResult result = getXML(in, parser, metadata);
        in.close();
        String xml = result.xml;
        //first table name
        assertContains("<table name=\"my_table1\"><thead><tr>\t<th>PK</th>", xml);
        //non-ascii
        assertContains("<td>普林斯顿大学</td>", xml);
        //boolean
        assertContains("<td>true</td>\t<td>2015-01-02</td>", xml);
        //date test
        assertContains("2015-01-04", xml);
        //timestamp test
        assertContains("2015-01-03 15:17:03", xml);
        //first embedded doc's image tag
        assertContains("alt=\"image1.png\"", xml);
        //second embedded doc's image tag
        assertContains("alt=\"A description...\"", xml);
        //second table name
        assertContains("<table name=\"my_table2\"><thead><tr>\t<th>INT_COL2</th>", xml);
        //both table names must be recorded in the container metadata
        String[] tableNames = result.metadata.getValues(Database.TABLE_NAME);
        assertEquals(2, tableNames.length);
        assertEquals("my_table1", tableNames[0]);
        assertEquals("my_table2", tableNames[1]);
        executed++;
    }
    //guard against the loop silently skipping a stream flavor
    assertEquals(3, executed);
}
//make sure that table cells and rows are properly marked to
//yield \t and \n at the appropriate places
@Test
public void testSpacesInBodyContentHandler() throws Exception {
    Parser parser = new AutoDetectParser();
    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
    //unlimited write count so the whole document is captured
    ContentHandler handler = new BodyContentHandler(-1);
    try (InputStream in = getResourceAsStream(TEST_FILE1)) {
        parser.parse(in, handler, metadata, context);
    }
    String text = handler.toString();
    //cells within a row are tab-separated
    assertContains("0\t2.3\t2.4\tlorem", text);
    //rows end with a newline
    assertContains("tempor\n", text);
}
//test what happens if the user does not want embedded docs handled
@Test
public void testNotAddingEmbeddedParserToParseContext() throws Exception {
    Parser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
    //an EmptyParser in the context means embedded documents are not parsed
    ParseContext context = new ParseContext();
    context.set(Parser.class, new EmptyParser());
    ContentHandler handler = new ToXMLContentHandler();
    try (InputStream in = getResourceAsStream(TEST_FILE1)) {
        parser.parse(in, handler, metadata, context);
    }
    String xml = handler.toString();
    //the structural wrappers for embedded documents are still emitted...
    assertContains("<table name=\"my_table1\"><thead><tr>", xml);
    assertContains("<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>", xml);
    //...but none of the embedded content itself
    assertNotContained("dog", xml);
    assertNotContained("alt=\"image1.png\"", xml);
    //second embedded doc's image tag
    assertNotContained("alt=\"A description...\"", xml);
}
@Test
public void testRecursiveParserWrapper() throws Exception {
    //parse container + embedded docs, collecting one Metadata per document
    Parser p = new AutoDetectParser();
    RecursiveParserWrapper wrapper =
            new RecursiveParserWrapper(p, new BasicContentHandlerFactory(
                    BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
    Metadata metadata = new Metadata();
    try (InputStream is = getResourceAsStream(TEST_FILE1)) {
        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext());
    }
    List<Metadata> metadataList = wrapper.getMetadata();
    //the container plus four embedded documents
    assertEquals(5, metadataList.size());
    //make sure the \t are inserted in a body handler
    String table = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    assertContains("0\t2.3\t2.4\tlorem", table);
    assertContains("普林斯顿大学", table);
    //make sure the \n is inserted (same container content as above;
    //the old code re-fetched it into a second variable for no reason)
    assertContains("do eiusmod tempor\n", table);
    assertContains("The quick brown fox", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
    assertContains("The quick brown fox", metadataList.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
    //confirm .doc was added to blob
    assertEquals("/BYTES_COL_0.doc/image1.png", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
}
@Test
public void testParserContainerExtractor() throws Exception {
    //Four embedded documents are extracted (see assertEquals below):
    //2x word files, one doc and one docx
    //2x png files, the same image embedded in each of the doc and docx
    //NOTE(review): an earlier comment also promised 2x UTF-8 csv table
    //dumps (6 total), but the assertions only ever see 4 attachments
    ParserContainerExtractor ex = new ParserContainerExtractor();
    ByteCopyingHandler byteCopier = new ByteCopyingHandler();
    try (TikaInputStream is = TikaInputStream.get(getResourceAsStream(TEST_FILE1))) {
        ex.extract(is, ex, byteCopier);
    }
    assertEquals(4, byteCopier.bytes.size());
    //strings[0] is deliberately left null: the first attachment is the
    //binary OLE2 .doc, which is compared byte-wise below instead
    String[] strings = new String[4];
    for (int i = 1; i < byteCopier.bytes.size(); i++) {
        byte[] byteArr = byteCopier.bytes.get(i);
        //the first 1000 bytes are plenty to find the magic markers
        strings[i] = new String(byteArr, 0, Math.min(byteArr.length, 1000), UTF_8);
    }
    //OLE2 magic number followed by two zero bytes
    byte[] oleBytes = new byte[]{
            (byte) -48,
            (byte) -49,
            (byte) 17,
            (byte) -32,
            (byte) -95,
            (byte) -79,
            (byte) 26,
            (byte) -31,
            (byte) 0,
            (byte) 0,
    };
    //test OLE
    for (int i = 0; i < oleBytes.length; i++) {
        assertEquals(oleBytes[i], byteCopier.bytes.get(0)[i]);
    }
    assertContains("PNG", strings[1]);
    assertContains("PK", strings[2]);
    assertContains("PNG", strings[3]);
}
//This confirms that reading the stream twice is not
//quadrupling the number of attachments.
@Test
public void testInputStreamReset() throws Exception {
    //There should be 8 recorded byte arrays: each of the 4 embedded
    //documents (doc, docx, and the png inside each) is copied twice
    //by InputStreamResettingHandler
    ParserContainerExtractor ex = new ParserContainerExtractor();
    InputStreamResettingHandler byteCopier = new InputStreamResettingHandler();
    try (InputStream is = getResourceAsStream(TEST_FILE1)) {
        try (TikaInputStream tis = TikaInputStream.get(is)) {
            ex.extract(tis, ex, byteCopier);
            is.reset();
        }
    }
    assertEquals(8, byteCopier.bytes.size());
}
@Test
public void testNulls() throws Exception {
    XMLResult result = getXML(TEST_FILE_NAME);
    //collapse all whitespace so the whole row matches as one token
    String collapsed = result.xml.replaceAll("\\s+", "");
    //everything except for the first key column should be empty
    assertContains("<tr><td>2</td><td/><td/><td/><td/><td/><td/><td/><td/><td/></tr>", collapsed);
}
public static class InputStreamResettingHandler implements EmbeddedResourceHandler {
    //two entries are appended per embedded resource: the same content
    //copied twice via mark/reset
    public List<byte[]> bytes = new ArrayList<byte[]>();

    @Override
    public void handle(String filename, MediaType mediaType,
                       InputStream stream) {
        //ensure mark/reset is available; wrap in a TikaInputStream if not
        InputStream markable = stream.markSupported()
                ? stream
                : TikaInputStream.get(stream);
        markable.mark(1000000);
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try {
            //first full copy
            IOUtils.copy(markable, buffer);
            bytes.add(buffer.toByteArray());
            markable.reset();
            //second copy of the identical content
            buffer.reset();
            IOUtils.copy(markable, buffer);
            bytes.add(buffer.toByteArray());
            markable.reset();
        } catch (IOException e) {
            //deliberately swallowed: a failed copy simply records fewer
            //entries, which the caller's size assertion will catch
        }
    }
}
//code used for creating the test file
/*
private Connection getConnection(String dbFileName) throws Exception {
File testDirectory = new File(this.getClass().getResource("/test-documents").toURI());
System.out.println("Writing to: " + testDirectory.getAbsolutePath());
File testDB = new File(testDirectory, dbFileName);
Connection c = null;
try {
Class.forName("org.sqlite.JDBC");
c = DriverManager.getConnection("jdbc:sqlite:" + testDB.getAbsolutePath());
} catch ( Exception e ) {
System.err.println( e.getClass().getName() + ": " + e.getMessage() );
System.exit(0);
}
return c;
}
@Test
public void testCreateDB() throws Exception {
Connection c = getConnection("testSqlite3d.db");
Statement st = c.createStatement();
String sql = "DROP TABLE if exists my_table1";
st.execute(sql);
sql = "CREATE TABLE my_table1 (" +
"PK INT PRIMARY KEY, "+
"INT_COL INTEGER, "+
"FLOAT_COL FLOAT, " +
"DOUBLE_COL DOUBLE, " +
"CHAR_COL CHAR(30), "+
"VARCHAR_COL VARCHAR(30), "+
"BOOLEAN_COL BOOLEAN,"+
"DATE_COL DATE,"+
"TIME_STAMP_COL TIMESTAMP,"+
"CLOB_COL CLOB, "+
"BYTES_COL BYTES" +
")";
st.execute(sql);
sql = "insert into my_table1 (PK, INT_COL, FLOAT_COL, DOUBLE_COL, CHAR_COL, " +
"VARCHAR_COL, BOOLEAN_COL, DATE_COL, TIME_STAMP_COL, CLOB_COL, BYTES_COL) " +
"values (?,?,?,?,?,?,?,?,?,?,?)";
SimpleDateFormat f = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
java.util.Date d = f.parse("2015-01-03 15:17:03");
System.out.println(d.getTime());
long d1Long = 1420229823000L;// 2015-01-02 15:17:03
long d2Long = 1420316223000L;// 2015-01-03 15:17:03
PreparedStatement ps = c.prepareStatement(sql);
ps.setInt(1, 0);
ps.setInt(2, 10);
ps.setFloat(3, 2.3f);
ps.setDouble(4, 2.4d);
ps.setString(5, "lorem");
ps.setString(6, "普林斯顿大学");
ps.setBoolean(7, true);
ps.setString(8, "2015-01-02");
ps.setString(9, "2015-01-03 15:17:03");
// ps.setClob(10, new StringReader(sql));
ps.setBytes(10, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.doc")));//contains "quick brown fox"
ps.executeUpdate();
ps.clearParameters();
ps.setInt(1, 1);
ps.setInt(2, 20);
ps.setFloat(3, 4.6f);
ps.setDouble(4, 4.8d);
ps.setString(5, "dolor");
ps.setString(6, "sit");
ps.setBoolean(7, false);
ps.setString(8, "2015-01-04");
ps.setString(9, "2015-01-03 15:17:03");
//ps.setClob(9, new StringReader("consectetur adipiscing elit"));
ps.setBytes(10, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.docx")));//contains "The end!"
ps.executeUpdate();
//now add a fully null row
ps.clearParameters();
ps.setInt(1, 2);
ps.setNull(2, Types.INTEGER);
ps.setNull(3, Types.FLOAT);
ps.setNull(4, Types.DOUBLE);
ps.setNull(5, Types.CHAR);
ps.setNull(6, Types.VARCHAR);
ps.setNull(7, Types.BOOLEAN);
ps.setNull(8, Types.DATE);
ps.setNull(9, Types.TIMESTAMP);
ps.setNull(10, Types.BLOB);
ps.executeUpdate();
//build table2
sql = "DROP TABLE if exists my_table2";
st.execute(sql);
sql = "CREATE TABLE my_table2 (" +
"INT_COL2 INT PRIMARY KEY, "+
"VARCHAR_COL2 VARCHAR(64))";
st.execute(sql);
sql = "INSERT INTO my_table2 values(0,'sed, do eiusmod tempor')";
st.execute(sql);
sql = "INSERT INTO my_table2 values(1,'incididunt \nut labore')";
st.execute(sql);
c.close();
}
private byte[] getByteArray(InputStream is) throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
byte[] buff = new byte[1024];
for (int bytesRead; (bytesRead = is.read(buff)) != -1;) {
bos.write(buff, 0, bytesRead);
}
return bos.toByteArray();
}
*/
}