// DBFParserTest.java example
//
// Explorer
// tika-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.dbf;

import org.apache.commons.io.IOUtils;
import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;

import static junit.framework.TestCase.assertEquals;
import static org.junit.Assert.fail;

/**
 * Tests for {@link DBFParser}: content extraction, encoding handling,
 * behaviour on truncated input, and mime reporting for each
 * {@link DBFReader.Version}.
 */
public class DBFParserTest extends TikaTest {

    /**
     * Parses {@code testDBF.dbf} and checks the reported content type,
     * modified date and encoding, the header row, and a sample of cell values.
     */
    @Test
    public void testBasic() throws Exception {
        XMLResult r = getXML("testDBF.dbf");
        assertEquals(DBFReader.Version.FOXBASE_PLUS.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("2016-05-24T00:00:00Z", r.metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("UTF-8", r.metadata.get(Metadata.CONTENT_ENCODING));

        //collapse tabs and line breaks to single spaces so that multi-element
        //assertions don't depend on how the XML is laid out
        String xml = r.xml.replaceAll("[\\t\\r\\n]", " ");
        //header
        assertContains("<thead> <th>TEXT_FIELD</th> <th>NUMERIC_FI</th> <th>DATE_FIELD</th></thead>",
                xml);
        //look for contents
        assertContains("普林斯顿大学", xml);
        assertContains("\u0627\u0645\u0639\u0629", xml);
        assertContains("05/26/2016", xml);
        assertContains("<td>4.0</td>", xml);
        //make sure there is no problem around row 10
        //where we're buffering
        assertContains("<td>8.0</td>", xml);
        assertContains("<td>9.0</td>", xml);
        assertContains("<td>10.0</td>", xml);
        assertContains("<td>11.0</td>", xml);
        assertContains("<td>licour</td>", xml);
    }

    /**
     * Parses {@code testDBF_gb18030.dbf} and checks the content type and that
     * the expected Chinese text appears in the output.
     */
    @Test
    public void testGB18030Encoded() throws Exception {
        XMLResult r = getXML("testDBF_gb18030.dbf");
        assertEquals(DBFReader.Version.FOXBASE_PLUS.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE));
        assertContains("虽然该", r.xml);
    }

    /**
     * Parses {@code testDBF.dbf} truncated at every length from 1 to 203 bytes.
     * <ul>
     *   <li>lengths 1-128: the parser must throw an {@link IOException} or
     *       {@link TikaException} (truncation in the header);</li>
     *   <li>lengths 129-203: the parser must not throw by default;</li>
     *   <li>lengths 129-203 with {@code DBFReader.STRICT} set: the parser must throw.</li>
     * </ul>
     * The truncated stream is built outside each {@code try} so that a failure
     * to read the test fixture is never mistaken for a parser exception.
     */
    @Test
    public void testTruncated() throws Exception {
        Parser p = new DBFParser();
        //should throw exception for truncation in header
        for (int i = 1; i < 129; i++) {
            InputStream truncated = truncate("testDBF.dbf", i);
            boolean threwExpected = false;
            try {
                getXML(truncated, p, new Metadata());
            } catch (IOException | TikaException expected) {
                //ok -- expected
                threwExpected = true;
            } catch (Throwable t) {
                //keep the cause so the report shows what was actually thrown
                throw new AssertionError("Should only throw IOExceptions or TikaExceptions: " + i, t);
            }
            //asserted outside the try so the catch-all above cannot intercept
            //this failure and replace its message
            if (!threwExpected) {
                fail("Should have thrown exception for truncation in header: " + i);
            }
        }
        //by default, don't throw an exception for truncation while reading the body
        for (int i = 129; i < 204; i++) {
            InputStream truncated = truncate("testDBF.dbf", i);
            try {
                getXML(truncated, p, new Metadata());
            } catch (IOException | TikaException e) {
                //attach the cause instead of printing it: nothing placed after
                //a fail(...) call would ever run
                throw new AssertionError(
                        "Shouldn't have thrown exception for truncation while reading cells: " + i, e);
            }
        }
        try {
            DBFReader.STRICT = true;
            //if strict is true throw exception for truncation in body
            for (int i = 129; i < 204; i++) {
                InputStream truncated = truncate("testDBF.dbf", i);
                try {
                    getXML(truncated, p, new Metadata());
                    fail("Should have thrown exception for truncation while reading cells: " + i);
                } catch (IOException | TikaException expected) {
                    //ok -- expected when strict
                }
            }
        } finally {
            //reset for other tests
            DBFReader.STRICT = false;
        }
    }

    /**
     * Parses {@code testDBF.dbf} truncated at 781 bytes with the
     * {@link AutoDetectParser} and checks that the final, partial row does not
     * pick up content left over from the previous row.
     */
    @Test
    public void testSpecificTruncated() throws Exception {
        XMLResult r = getXML(truncate("testDBF.dbf", 781), new AutoDetectParser(), new Metadata());
        String xml = r.xml.replaceAll("[\\t\\r\\n]", " ");

        //if you don't keep track of bytes read, you could get content from prev row
        assertNotContained("holt red hath in every", xml);
        assertNotContained("<td>holt</td> <td>18.0</td>", xml);
        //check that the last row ends with holt but is correctly formatted
        assertContains("<td>holt</td> <td /> <td /></tr>", xml);
    }

    /**
     * Overwrites the first byte of {@code testDBF.dbf} with the id of each
     * {@link DBFReader.Version} in turn and checks that the reported content
     * type matches that version's full mime string.
     */
    @Test
    public void testVariants() throws Exception {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (InputStream is = getResourceAsStream("/test-documents/testDBF.dbf")) {
            IOUtils.copy(is, bos);
        }
        byte[] bytes = bos.toByteArray();

        for (DBFReader.Version version : DBFReader.Version.values()) {
            //this cast happens to work because of the range of possible values
            bytes[0] = (byte) version.getId();
            XMLResult r = getXML(TikaInputStream.get(bytes), new AutoDetectParser(), new Metadata());
            assertEquals(version.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE));
        }
    }

/*
commented out until we get permission to add the test file
    @Test
    public void testEncodingInHeaderAndDateTime() throws Exception {
        XMLResult r = getXML("prem2007_2.dbf");
        String xml = r.xml.replaceAll("[\\r\\n\\t]", " ");
        assertEquals("application/x-dbf; dbf_version=Visual_FoxPro", r.metadata.get(Metadata.CONTENT_TYPE));
        assertContains("<th>莉こ晤鎢</th>", xml);//header
        assertContains("<td>齠褕</td>", xml);//content
        assertContains("<td>2010-04-20T00:00:00Z</td>", xml);
    }
    */

    /**
     * Returns an in-memory stream over the first {@code length} bytes of a
     * file under {@code /test-documents/}.
     *
     * @param testFileName name of the resource, relative to {@code /test-documents/}
     * @param length number of leading bytes to keep
     * @return a stream over exactly {@code length} bytes
     * @throws IOException if the resource cannot be read; presumably also when
     *         it is shorter than {@code length} (per {@code IOUtils.readFully}) -- TODO confirm
     */
    InputStream truncate(String testFileName, int length) throws IOException {
        byte[] bytes = new byte[length];
        try (InputStream is = getResourceAsStream("/test-documents/" + testFileName)) {
            IOUtils.readFully(is, bytes);
        }
        return new ByteArrayInputStream(bytes);
    }
}