/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.dbf;
import org.apache.commons.io.IOUtils;
import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import static junit.framework.TestCase.assertEquals;
import static org.junit.Assert.fail;
public class DBFParserTest extends TikaTest {
@Test
public void testBasic() throws Exception {
XMLResult r = getXML("testDBF.dbf");
assertEquals(DBFReader.Version.FOXBASE_PLUS.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE));
assertEquals("2016-05-24T00:00:00Z", r.metadata.get(TikaCoreProperties.MODIFIED));
assertEquals("UTF-8", r.metadata.get(Metadata.CONTENT_ENCODING));
String xml = r.xml.replaceAll("[\\t\\r\\n]", " ");
//header
assertContains("<thead> <th>TEXT_FIELD</th> <th>NUMERIC_FI</th> <th>DATE_FIELD</th></thead>",
xml);
//look for contents
assertContains("普林斯顿大学", xml);
assertContains("\u0627\u0645\u0639\u0629", xml);
assertContains("05/26/2016", xml);
assertContains("<td>4.0</td>", xml);
//make sure there is no problem around row 10
//where we're buffering
assertContains("<td>8.0</td>", xml);
assertContains("<td>9.0</td>", xml);
assertContains("<td>10.0</td>", xml);
assertContains("<td>11.0</td>", xml);
assertContains("<td>licour</td>", xml);
}
@Test
public void testGB18030Encoded() throws Exception {
XMLResult r = getXML("testDBF_gb18030.dbf");
assertEquals(DBFReader.Version.FOXBASE_PLUS.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE));
assertContains("虽然该", r.xml);
}
@Test
public void testTruncated() throws Exception {
Parser p = new DBFParser();
//should throw exception for truncation in header
for (int i = 1; i < 129; i++) {
try {
XMLResult r = getXML(truncate("testDBF.dbf", i), p, new Metadata());
fail("Should have thrown exception for truncation in header: " + i);
} catch (IOException | TikaException e) {
//ok -- expected
} catch (Throwable e) {
fail("Should only throw IOExceptions or TikaExceptions");
}
}
//default don't throw exception for truncation while reading body
for (int i = 129; i < 204; i++) {
try {
XMLResult r = getXML(truncate("testDBF.dbf", i), p, new Metadata());
} catch (IOException | TikaException e) {
fail("Shouldn't have thrown exception for truncation while reading cells: " + i);
e.printStackTrace();
}
}
try {
DBFReader.STRICT = true;
//if strict is true throw exception for truncation in body
for (int i = 129; i < 204; i++) {
try {
XMLResult r = getXML(truncate("testDBF.dbf", i), p, new Metadata());
fail("Should have thrown exception for truncation while reading cells: " + i);
} catch (IOException | TikaException e) {
}
}
} finally {
//reset for other tests
DBFReader.STRICT = false;
}
}
@Test
public void testSpecificTruncated() throws Exception {
XMLResult r = getXML(truncate("testDBF.dbf", 781), new AutoDetectParser(), new Metadata());
String xml = r.xml.replaceAll("[\\t\\r\\n]", " ");
//if you don't keep track of bytes read, you could get content from prev row
assertNotContained("holt red hath in every", xml);
assertNotContained("<td>holt</td> <td>18.0</td>", xml);
//check that the last row ends with holt but is correctly formatted
assertContains("<td>holt</td> <td /> <td /></tr>", xml);
}
@Test
public void testVariants() throws Exception {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
try (InputStream is = getResourceAsStream("/test-documents/testDBF.dbf")) {
IOUtils.copy(is, bos);
}
byte[] bytes = bos.toByteArray();
for (DBFReader.Version version : DBFReader.Version.values()) {
//this cast happens to work because of the range of possible values
bytes[0] = (byte) version.getId();
XMLResult r = getXML(TikaInputStream.get(bytes), new AutoDetectParser(), new Metadata());
assertEquals(version.getFullMimeString(), r.metadata.get(Metadata.CONTENT_TYPE));
}
}
/*
commented out until we get permission to add the test file
@Test
public void testEncodingInHeaderAndDateTime() throws Exception {
XMLResult r = getXML("prem2007_2.dbf");
String xml = r.xml.replaceAll("[\\r\\n\\t]", " ");
assertEquals("application/x-dbf; dbf_version=Visual_FoxPro", r.metadata.get(Metadata.CONTENT_TYPE));
assertContains("<th>莉こ晤鎢</th>", xml);//header
assertContains("<td>齠褕</td>", xml);//content
assertContains("<td>2010-04-20T00:00:00Z</td>", xml);
}
*/
InputStream truncate(String testFileName, int length) throws IOException {
byte[] bytes = new byte[length];
try (InputStream is = getResourceAsStream("/test-documents/" + testFileName)) {
IOUtils.readFully(is, bytes);
}
return new ByteArrayInputStream(bytes);
}
}