package edu.oregonstate.cartography.geometryimport; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Vector; /** * An importer for DBF data base files. * @author Bernhard Jenny, Institute of Cartography, ETH Zurich. */ public class DBFImporter { private void printInfo(String str) { // uncomment the following line for debugging //System.out.println(str); } private final ArrayList fields = new ArrayList(); private String charsetName; /** * Default value for numbers that cannot be read. */ private final Double DEFAULT_NUMBER = new Double(0); /** Creates a new instance of DBFImporter */ public DBFImporter() { } public Table read(String filePath) throws IOException { LittleEndianInputStream is = null; if (filePath == null) return null; try { // read all data into a data buffer. is = new LittleEndianInputStream(new BufferedInputStream(new FileInputStream(filePath))); int fileCode = unsignedByteToInt(is.readByte()); int year = unsignedByteToInt(is.readByte()); int month = unsignedByteToInt(is.readByte()); int day = unsignedByteToInt(is.readByte()); long nbrRecords = is.readInt();// & 0xffffffffL;// unsigned int int headerSize = is.readShort(); int recordSize = is.readShort(); is.readShort(); // overread reserved value is.readByte(); // transaction byte int encrypted = is.readUnsignedByte(); // encription byte is.skipBytes(13); int codepage = is.readUnsignedByte(); is.skipBytes(2); // map the codepage to a string. Based on: // http://www.clicketyclick.dk/databases/xbase/format/dbf.html#DBF_STRUCT switch (codepage) { case 0x01: // DOS USA code page 437 this.charsetName = "IBM437"; break; case 0x02: // DOS Multilingual code page 850 this.charsetName = "IBM850"; break; case 0x03: // Windows ANSI code page 1252 this.charsetName = "windows-1252"; break; case 0x04: // Standard Macintosh this.charsetName = "MacRoman"; break; // ESRI shape files use code 0x57 to indicate that // data is written in ANSI (whatever that means). // http://www.esricanada.com/english/support/faqs/arcview/avfaq21.asp case 0x57: this.charsetName = "windows-1252"; break; case 0x64: // EE MS-DOS code page 852 this.charsetName = "IBM852"; break; case 0x65: // Nordic MS-DOS code page 865 this.charsetName = "IBM865"; break; case 0x66: // Russian MS-DOS code page 866 this.charsetName = "IBM866"; break; case 0x67: // Icelandic MS-DOS this.charsetName = "IBM861"; break; /* case 0x68: // Kamenicky (Czech) MS-DOS // ? break; case 0x69: // Mazovia (Polish) MS-DOS // ? break; */ case 0x6A: // Greek MS-DOS (437G) [?] this.charsetName = "x-IBM737"; break; case 0x6B: // Turkish MS-DOS this.charsetName = "IBM857"; break; case 0x96: // Russian Macintosh this.charsetName = "x-MacCyrillic"; break; case 0x97: // Eastern European Macintosh this.charsetName = "x-MacCentralEurope"; break; case 0x98: // Greek Macintosh this.charsetName = "x-MacGreek"; break; case 0xC8: // Windows EE (=Eastern Europe?) code page 1250 this.charsetName = "windows-1250"; break; case 0xC9: // Russian Windows this.charsetName = "windows-1251"; break; case 0xCA: // Turkish Windows this.charsetName = "windows-1254"; break; case 0xCB: // Greek Windows this.charsetName = "windows-1253"; break; default: this.charsetName = "IBM437"; } this.printInfo("File Code: " + fileCode); this.printInfo("Year: " + year); this.printInfo("Month: " + month); this.printInfo("Day: " + day); this.printInfo("Nbr Records: " + nbrRecords); this.printInfo("Header Size: " + headerSize); this.printInfo("Record Size: " + recordSize); this.printInfo("Ecrypted: " + encrypted); this.printInfo("Codepage: " + codepage); if (encrypted != 0) throw new IOException("Encrypted DBF not supported."); int nFields = (headerSize - 32) / 32; this.readFieldDescriptors(is, nFields); // create an new table Table table = this.initTable("ShapeAttributes"); // read the records and fill the table for (int i = 0; i < nbrRecords; i++) { this.printInfo("Reading Record " + i); this.readRecord(is, recordSize, table); } return table; } finally { if (is != null) is.close(); } } private void readFieldDescriptors(LittleEndianInputStream is, int nFields) throws IOException { // read description of each field byte[] asciiFieldName = new byte[11]; for (int i = 0; i < nFields; i++) { DBFField field = new DBFField(); is.read(asciiFieldName); field.name = bytesToString( asciiFieldName, asciiFieldName.length, this.charsetName); field.type = is.readUnsignedByte(); field.address = is.readInt(); field.length = is.readUnsignedByte(); field.decimalCount = is.readUnsignedByte(); is.readShort();// overread reserved value field.workAreaID = is.readUnsignedByte(); field.multiUserDBase = is.readShort(); field.setFields = is.readUnsignedByte(); is.skipBytes(7); // overread 7 reserved bytes field.fieldInMDXIndex = is.readUnsignedByte(); this.fields.add(field); this.printInfo("\n" + field.toString()); } // overread the Header Record Terminator, which should be 0x0D byte terminator = is.readByte(); if (terminator != 0x0D) { throw new IOException("DBF file is corrupt."); } } private void readRecord(LittleEndianInputStream is, int recordSize, Table table) throws IOException { int deletedFlag = is.readUnsignedByte(); byte[] data = new byte[recordSize]; Vector rowData = new Vector(); java.util.Iterator iterator = this.fields.iterator(); while (iterator.hasNext()) { DBFField field = (DBFField)iterator.next(); is.read(data, 0, field.length); switch (field.type) { case 'C': // character string String string = bytesToString(data, field.length, charsetName); rowData.add(string.trim()); break; case 'F': // floating number try { rowData.add(new Double(new String(data, 0, field.length))); } catch (NumberFormatException exc) { rowData.add(this.DEFAULT_NUMBER); } break; case 'N': // number try { rowData.add(new Double(new String(data, 0, field.length))); } catch (NumberFormatException exc) { rowData.add(this.DEFAULT_NUMBER); } break; case '8': case 'O': // little endian 8 byte double. Not tested !!! ??? long byte1 = data[0]; long byte2 = data[1]; long byte3 = data[2]; long byte4 = data[3]; long byte5 = data[4]; long byte6 = data[5]; long byte7 = data[6]; long byte8 = data[7]; long l = (byte8 << 56) + (byte7 << 48) + (byte6 << 40) + (byte5 << 32) + (byte4 << 24) + (byte3 << 16) + (byte2 << 8) + byte1; rowData.add(new Double(Double.longBitsToDouble(l))); break; case '4': case 'I': // little endian 4 byte integer. Not tested !!! ??? int i = (data[3] << 24) + (data[2] << 16) + (data[1] << 8) + data[0]; rowData.add(new Double(i)); break; case '2': // little endian 2 byte integer. Not tested !!! ??? rowData.add(new Double((data[1] << 8) + data[0])); break; /* case 'D': // date System.out.println ("Date objects in DBF files not tested."); // !!! ??? // Date in format YYYYMMDD. if (field.length != 8) throw new Exception("Date object has non-standard length in DBF file."); int year = Integer.parseInt(new String(data, 0, 4)); int month = Integer.parseInt(new String(data, 4, 2)); int day = Integer.parseInt(new String(data, 6, 2)); Date date = new Date(year, month, day); rowData.add(date.toString()); break; case 'L': // logical // ? Not initialised (default) // Y,y Yes // N,n No // F,f False // T,t True final byte b = data[0]; if (b == 'Y' || b == 'y' || b == 'T' || b == 't') rowData.add("True"); else if (b == 'N' || b == 'n' || b == 'F' || b == 'f') rowData.add("False"); else rowData.add("?"); break; */ default: // add the raw bytes as String rowData.add(new String(data, 0, field.length));; } } table.addRow(rowData); } private Table initTable(String name) { Table table = new Table(this.charsetName); table.setName(name); java.util.Iterator iterator = this.fields.iterator(); while (iterator.hasNext()) { DBFField field = (DBFField)iterator.next(); table.addColumn(field.name); } return table; } /** * Converts a chunk of bytes into a String. Stops when 0x0 is found. * Uses a specified character set for the conversion. If the bytes cannot * be converted with the specified character set, the default character set * is used. * @param bytes The raw bytes containing the string, one byte per character. * @param maxLength * @param charsetName * @maxLength Don't convert more bytes than maxLength. * @charsetName The name of the encoding of the character set. * @return A new String. */ public static String bytesToString(byte[] bytes, int maxLength, String charsetName) { // find the number of valid characters int nbrValidChars = 0; maxLength = Math.min(maxLength, bytes.length); for (int i = 0; i < maxLength; i++) { if (bytes[i] == 0x0) { break; } nbrValidChars++; } try { // try encoding with the passed character set. return new String(bytes, 0, nbrValidChars, charsetName); } catch (UnsupportedEncodingException exc) { // The string could not be encoded with the passed character set. // The character set possibly does not exist on this machine. // Use the default character set instead. return new String(bytes, 0, nbrValidChars); } } private class DBFField { public String name; public int type; public int address; public int length; public int decimalCount; public int workAreaID; public int multiUserDBase; public int setFields; public int fieldInMDXIndex; @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("Name: ").append(name); sb.append(";\tType: ").append(type); switch (type) { case 'C': sb.append(" chars"); break; case 'D': sb.append(" date"); break; case 'F': sb.append(" float"); break; case 'N': sb.append(" number"); break; case 'L': sb.append(" logical"); break; case 'M': sb.append(" memo"); break; case 'V': sb.append(" variable"); break; case 'P': sb.append(" picture"); break; case 'B': sb.append(" binary"); break; case 'G': sb.append(" general"); break; case '2': sb.append(" 2 byte int"); break; case '4': case 'I': sb.append(" 4 byte int"); break; case '8': case 'O': sb.append(" double"); break; default: sb.append(" unknown field"); } sb.append(";\tAddress: ").append(address); sb.append(";\tLength: ").append(length); sb.append(";\tDecimal Count: ").append(decimalCount); sb.append(";\tWork Area ID: ").append(workAreaID); sb.append(";\tMulti User dBase: ").append(multiUserDBase); sb.append(";\tSet Fields: ").append(setFields); sb.append(";\tField .mdx Index: ").append(fieldInMDXIndex); return sb.toString(); } } /** Numbers are signed in Java. This converts an unsigned byte to an int * @param b. * @return */ public static int unsignedByteToInt(byte b) { return (int) b & 0xFF; } }