package org.apache.tika.parser.jdbc;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.tika.TikaTest;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Database;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;

/**
 * Tests for the SQLite3 parser against the bundled testSqlite3b.db fixture,
 * which contains two tables (my_table1, my_table2) and embedded doc/docx/png
 * attachments stored in a BLOB column.
 */
public class SQLite3ParserTest extends TikaTest {

    private final static String TEST_FILE_NAME = "testSqlite3b.db";
    private final static String TEST_FILE1 = "/test-documents/" + TEST_FILE_NAME;

    @Test
    public void testBasic() throws Exception {
        Parser p = new AutoDetectParser();
        //test different types of input streams
        //actual inputstream, memory buffered bytearray and literal file
        InputStream[] streams = new InputStream[3];
        streams[0] = getResourceAsStream(TEST_FILE1);
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        //close the temporary resource stream after copying instead of leaking it
        try (InputStream tmp = getResourceAsStream(TEST_FILE1)) {
            IOUtils.copy(tmp, bos);
        }
        streams[1] = new ByteArrayInputStream(bos.toByteArray());
        streams[2] = TikaInputStream.get(getResourceAsFile(TEST_FILE1));

        int tests = 0;
        for (InputStream stream : streams) {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
            //1) getXML closes the stream
            //2) getXML runs recursively on the contents, so the embedded docs should show up
            XMLResult result = getXML(stream, p, metadata);
            stream.close();
            String x = result.xml;
            //first table name
            assertContains("<table name=\"my_table1\"><thead><tr>\t<th>PK</th>", x);
            //non-ascii
            assertContains("<td>普林斯顿大学</td>", x);
            //boolean
            assertContains("<td>true</td>\t<td>2015-01-02</td>", x);
            //date test
            assertContains("2015-01-04", x);
            //timestamp test
            assertContains("2015-01-03 15:17:03", x);
            //first embedded doc's image tag
            assertContains("alt=\"image1.png\"", x);
            //second embedded doc's image tag
            assertContains("alt=\"A description...\"", x);
            //second table name
            assertContains("<table name=\"my_table2\"><thead><tr>\t<th>INT_COL2</th>", x);

            Metadata post = result.metadata;
            String[] tableNames = post.getValues(Database.TABLE_NAME);
            assertEquals(2, tableNames.length);
            assertEquals("my_table1", tableNames[0]);
            assertEquals("my_table2", tableNames[1]);
            tests++;
        }
        //make sure all three stream flavors were actually exercised
        assertEquals(3, tests);
    }

    //make sure that table cells and rows are properly marked to
    //yield \t and \n at the appropriate places
    @Test
    public void testSpacesInBodyContentHandler() throws Exception {
        Parser p = new AutoDetectParser();
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        ContentHandler handler = new BodyContentHandler(-1);
        ParseContext ctx = new ParseContext();
        ctx.set(Parser.class, p);
        try (InputStream stream = getResourceAsStream(TEST_FILE1)) {
            p.parse(stream, handler, metadata, ctx);
        }
        String s = handler.toString();
        assertContains("0\t2.3\t2.4\tlorem", s);
        assertContains("tempor\n", s);
    }

    //test what happens if the user does not want embedded docs handled
    @Test
    public void testNotAddingEmbeddedParserToParseContext() throws Exception {
        Parser p = new AutoDetectParser();
        ContentHandler handler = new ToXMLContentHandler();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();
        //EmptyParser suppresses recursive handling of embedded documents
        parseContext.set(Parser.class, new EmptyParser());
        try (InputStream is = getResourceAsStream(TEST_FILE1)) {
            metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
            p.parse(is, handler, metadata, parseContext);
        }
        String xml = handler.toString();
        //just includes headers for embedded documents
        assertContains("<table name=\"my_table1\"><thead><tr>", xml);
        assertContains("<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>", xml);
        //but no other content
        assertNotContained("dog", xml);
        assertNotContained("alt=\"image1.png\"", xml);
        //second embedded doc's image tag
        assertNotContained("alt=\"A description...\"", xml);
    }

    @Test
    public void testRecursiveParserWrapper() throws Exception {
        Parser p = new AutoDetectParser();
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
                new BasicContentHandlerFactory(
                        BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
        Metadata metadata = new Metadata();
        try (InputStream is = getResourceAsStream(TEST_FILE1)) {
            metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
            wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext());
        }
        List<Metadata> metadataList = wrapper.getMetadata();
        assertEquals(5, metadataList.size());
        //make sure the \t are inserted in a body handler
        String table = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
        assertContains("0\t2.3\t2.4\tlorem", table);
        assertContains("普林斯顿大学", table);
        //make sure the \n is inserted
        assertContains("do eiusmod tempor\n", table);

        assertContains("The quick brown fox",
                metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertContains("The quick brown fox",
                metadataList.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));

        //confirm .doc was added to blob
        assertEquals("/BYTES_COL_0.doc/image1.png",
                metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
    }

    @Test
    public void testParserContainerExtractor() throws Exception {
        //The handler should receive 4 embedded resources:
        //2x word files, one doc and one docx
        //2x png files, the same image embedded in each of the doc and docx
        //(the CSV table representations are emitted as XHTML, not as attachments)
        ParserContainerExtractor ex = new ParserContainerExtractor();
        ByteCopyingHandler byteCopier = new ByteCopyingHandler();
        Metadata metadata = new Metadata();
        try (TikaInputStream is = TikaInputStream.get(getResourceAsStream(TEST_FILE1))) {
            metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
            ex.extract(is, ex, byteCopier);
        }
        assertEquals(4, byteCopier.bytes.size());
        //strings[0] stays null on purpose: bytes.get(0) is raw OLE2 and is
        //checked byte-by-byte below rather than as text
        String[] strings = new String[4];
        for (int i = 1; i < byteCopier.bytes.size(); i++) {
            byte[] byteArr = byteCopier.bytes.get(i);
            String s = new String(byteArr, 0, Math.min(byteArr.length, 1000), UTF_8);
            strings[i] = s;
        }
        //OLE2 magic number: D0 CF 11 E0 A1 B1 1A E1
        byte[] oleBytes = new byte[]{
                (byte) -48, (byte) -49, (byte) 17, (byte) -32,
                (byte) -95, (byte) -79, (byte) 26, (byte) -31,
                (byte) 0, (byte) 0,
        };
        //test OLE
        for (int i = 0; i < 10; i++) {
            assertEquals(oleBytes[i], byteCopier.bytes.get(0)[i]);
        }
        assertContains("PNG", strings[1]);
        assertContains("PK", strings[2]);
        assertContains("PNG", strings[3]);
    }

    //This confirms that reading the stream twice is not
    //quadrupling the number of attachments.
    @Test
    public void testInputStreamReset() throws Exception {
        //There should be 8 embedded documents:
        //4x word files, two docs and two docxs
        //4x png files, the same image embedded in each of the doc and docx
        ParserContainerExtractor ex = new ParserContainerExtractor();
        InputStreamResettingHandler byteCopier = new InputStreamResettingHandler();
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        try (InputStream is = getResourceAsStream(TEST_FILE1)) {
            try (TikaInputStream tis = TikaInputStream.get(is)) {
                ex.extract(tis, ex, byteCopier);
                is.reset();
            }
        }
        assertEquals(8, byteCopier.bytes.size());
    }

    @Test
    public void testNulls() throws Exception {
        String xml = getXML(TEST_FILE_NAME).xml.replaceAll("\\s+", "");
        //everything except for the first key column should be empty
        assertContains("<tr><td>2</td><td/><td/><td/><td/><td/><td/><td/><td/><td/></tr>", xml);
    }

    /**
     * Embedded-resource handler that copies each stream twice (with a
     * mark/reset in between) to prove the stream survives being re-read;
     * every successful copy appends one byte[] to {@link #bytes}.
     */
    public static class InputStreamResettingHandler implements EmbeddedResourceHandler {

        public List<byte[]> bytes = new ArrayList<byte[]>();

        @Override
        public void handle(String filename, MediaType mediaType, InputStream stream) {
            ByteArrayOutputStream os = new ByteArrayOutputStream();
            if (!stream.markSupported()) {
                stream = TikaInputStream.get(stream);
            }
            stream.mark(1000000);
            try {
                IOUtils.copy(stream, os);
                bytes.add(os.toByteArray());
                stream.reset();
                //now try again
                os.reset();
                IOUtils.copy(stream, os);
                bytes.add(os.toByteArray());
                stream.reset();
            } catch (IOException e) {
                //deliberately swallowed: a failed copy simply means fewer
                //entries in "bytes", which the caller's count assertion detects
            }
        }
    }

    //code used for creating the test file
/*
    private Connection getConnection(String dbFileName) throws Exception {
        File testDirectory = new File(this.getClass().getResource("/test-documents").toURI());
        System.out.println("Writing to: " + testDirectory.getAbsolutePath());
        File testDB = new File(testDirectory, dbFileName);
        Connection c = null;
        try {
            Class.forName("org.sqlite.JDBC");
            c = DriverManager.getConnection("jdbc:sqlite:" + testDB.getAbsolutePath());
        } catch (Exception e) {
            System.err.println(e.getClass().getName() + ": " + e.getMessage());
            System.exit(0);
        }
        return c;
    }

    @Test
    public void testCreateDB() throws Exception {
        Connection c = getConnection("testSqlite3d.db");
        Statement st = c.createStatement();
        String sql = "DROP TABLE if exists my_table1";
        st.execute(sql);
        sql = "CREATE TABLE my_table1 (" +
                "PK INT PRIMARY KEY, " +
                "INT_COL INTEGER, " +
                "FLOAT_COL FLOAT, " +
                "DOUBLE_COL DOUBLE, " +
                "CHAR_COL CHAR(30), " +
                "VARCHAR_COL VARCHAR(30), " +
                "BOOLEAN_COL BOOLEAN," +
                "DATE_COL DATE," +
                "TIME_STAMP_COL TIMESTAMP," +
                "CLOB_COL CLOB, " +
                "BYTES_COL BYTES" +
                ")";
        st.execute(sql);
        sql = "insert into my_table1 (PK, INT_COL, FLOAT_COL, DOUBLE_COL, CHAR_COL, " +
                "VARCHAR_COL, BOOLEAN_COL, DATE_COL, TIME_STAMP_COL, CLOB_COL, BYTES_COL) " +
                "values (?,?,?,?,?,?,?,?,?,?,?)";
        SimpleDateFormat f = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        java.util.Date d = f.parse("2015-01-03 15:17:03");
        System.out.println(d.getTime());
        long d1Long = 1420229823000L;// 2015-01-02 15:17:03
        long d2Long = 1420316223000L;// 2015-01-03 15:17:03
        PreparedStatement ps = c.prepareStatement(sql);
        ps.setInt(1, 0);
        ps.setInt(2, 10);
        ps.setFloat(3, 2.3f);
        ps.setDouble(4, 2.4d);
        ps.setString(5, "lorem");
        ps.setString(6, "普林斯顿大学");
        ps.setBoolean(7, true);
        ps.setString(8, "2015-01-02");
        ps.setString(9, "2015-01-03 15:17:03");
        // ps.setClob(10, new StringReader(sql));
        ps.setBytes(10, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.doc")));//contains "quick brown fox"
        ps.executeUpdate();
        ps.clearParameters();
        ps.setInt(1, 1);
        ps.setInt(2, 20);
        ps.setFloat(3, 4.6f);
        ps.setDouble(4, 4.8d);
        ps.setString(5, "dolor");
        ps.setString(6, "sit");
        ps.setBoolean(7, false);
        ps.setString(8, "2015-01-04");
        ps.setString(9, "2015-01-03 15:17:03");
        //ps.setClob(9, new StringReader("consectetur adipiscing elit"));
        ps.setBytes(10, getByteArray(this.getClass().getResourceAsStream("/test-documents/testWORD_1img.docx")));//contains "The end!"
        ps.executeUpdate();

        //now add a fully null row
        ps.clearParameters();
        ps.setInt(1, 2);
        ps.setNull(2, Types.INTEGER);
        ps.setNull(3, Types.FLOAT);
        ps.setNull(4, Types.DOUBLE);
        ps.setNull(5, Types.CHAR);
        ps.setNull(6, Types.VARCHAR);
        ps.setNull(7, Types.BOOLEAN);
        ps.setNull(8, Types.DATE);
        ps.setNull(9, Types.TIMESTAMP);
        ps.setNull(10, Types.BLOB);
        ps.executeUpdate();

        //build table2
        sql = "DROP TABLE if exists my_table2";
        st.execute(sql);
        sql = "CREATE TABLE my_table2 (" +
                "INT_COL2 INT PRIMARY KEY, " +
                "VARCHAR_COL2 VARCHAR(64))";
        st.execute(sql);
        sql = "INSERT INTO my_table2 values(0,'sed, do eiusmod tempor')";
        st.execute(sql);
        sql = "INSERT INTO my_table2 values(1,'incididunt \nut labore')";
        st.execute(sql);
        c.close();
    }

    private byte[] getByteArray(InputStream is) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        byte[] buff = new byte[1024];
        for (int bytesRead; (bytesRead = is.read(buff)) != -1; ) {
            bos.write(buff, 0, bytesRead);
        }
        return bos.toByteArray();
    }
*/
}