/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual
 *  contributors.
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.util.ms;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.util.logging.Logger;

import org.apache.poi.hdf.extractor.WordDocument;

import junit.framework.TestCase;

/**
 * Compares the text that {@link Doc#getText(File)} extracts from each
 * <code>.doc</code> file in the test directory against an expected
 * <code>.txt</code> file (UTF-16BE). When the expected file is missing it is
 * generated with POI's {@link WordDocument}.
 */
public class DocTest extends TestCase {

    private final Logger logger = Logger.getLogger(this.getClass().getName());

    /** Directory holding the <code>.doc</code> inputs and expected outputs. */
    final private static File TEST_DIR;
    static {
        // handle case when unit test is run in either 'commons' or in
        // enclosing project
        File f = new File("src/test/java/org/archive/util/ms");
        TEST_DIR = f.exists() ? f
                : new File("commons/src/test/java/org/archive/util/ms");
    }

    /**
     * Runs the comparison for every <code>.doc</code> file in
     * {@link #TEST_DIR} and fails if any mismatch was counted.
     *
     * <p>Details of each mismatch are written to this class's logger at
     * FINE level.
     *
     * @throws IOException if the test directory cannot be listed, a file
     *         cannot be read or written, or at least one mismatch was found
     */
    public void testAgainstPOI() throws IOException {
        // listFiles() returns null (not an empty array) when the directory
        // is missing or unreadable; fail with a message instead of an NPE.
        File[] candidates = TEST_DIR.listFiles();
        if (candidates == null) {
            throw new IOException("Cannot list test directory: "
                    + TEST_DIR.getAbsolutePath());
        }

        int errors = 0;
        for (File f : candidates) {
            if (!f.isFile() || !f.getName().endsWith(".doc")) {
                continue;
            }
            long start = System.currentTimeMillis();
            try {
                errors += runDoc(f);
            } finally {
                long duration = System.currentTimeMillis() - start;
                logger.fine("Duration in milliseconds: " + duration);
            }
        }
        if (errors > 0) {
            throw new IOException(errors + " errors, see stdout.");
        }
    }

    /**
     * Checks one document against its expected text, generating the
     * expected file first if it does not exist yet.
     *
     * @param doc the <code>.doc</code> file to check
     * @return the number of mismatches counted
     * @throws IOException on any read or write failure
     */
    private int runDoc(File doc) throws IOException {
        logger.fine("===== Now processing " + doc.getName());
        String name = doc.getName();
        int p = name.lastIndexOf('.');
        String expectedName = name.substring(0, p) + ".txt";
        File expectedFile = new File(TEST_DIR, expectedName);
        if (!expectedFile.exists()) {
            createExpectedOutput(doc, expectedFile);
        }
        return runFiles(doc, expectedFile);
    }

    /**
     * Writes the text POI extracts from <code>doc</code> to
     * <code>output</code>, encoded as UTF-16BE.
     *
     * @param doc the Word document to read
     * @param output the expected-text file to create
     * @throws IOException if reading, extracting or writing fails
     */
    private void createExpectedOutput(File doc, File output)
    throws IOException {
        FileInputStream finp = new FileInputStream(doc);
        try {
            FileOutputStream fout = new FileOutputStream(output);
            boolean complete = false;
            try {
                Writer writer = new OutputStreamWriter(fout, "UTF-16BE");
                WordDocument wd = new WordDocument(finp);
                wd.writeAllText(writer);
                // OutputStreamWriter buffers encoded bytes; without this
                // flush, closing fout directly would drop the buffered text.
                writer.flush();
                complete = true;
            } finally {
                close(fout);
                if (!complete) {
                    // Don't leave a partial file behind: later runs would
                    // treat it as the expected output.
                    output.delete();
                }
            }
        } finally {
            close(finp);
        }
    }

    /**
     * Closes a resource, printing (rather than propagating) any failure.
     *
     * @param c the resource to close; <code>null</code> is ignored
     */
    private static void close(Closeable c) {
        if (c == null) {
            return;
        }
        try {
            c.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Compares the text extracted from <code>doc</code> with the UTF-16BE
     * contents of <code>expected</code>.
     *
     * @param doc the Word document to extract text from
     * @param expected the file holding the expected text
     * @return the number of mismatches counted
     * @throws IOException on any read failure
     */
    private int runFiles(File doc, File expected) throws IOException {
        FileInputStream expectedIn = new FileInputStream(expected);
        try {
            Reader expectedReader = new InputStreamReader(expectedIn,
                    "UTF-16BE");
            // Opened inside the try so expectedIn is released even when
            // Doc.getText fails.
            Reader docReader = Doc.getText(doc);
            try {
                return runReaders(docReader, expectedReader);
            } finally {
                close(docReader);
            }
        } finally {
            // Closing the underlying stream releases the file handle held
            // by expectedReader.
            close(expectedIn);
        }
    }

    /**
     * Reads both readers in lock step, one character at a time, until
     * either reaches end of stream. Each expected character is passed
     * through {@link #correctPOI(int)} before comparison.
     *
     * <p>When only one reader has ended, the difference is counted once as
     * a length mismatch and once more as a character mismatch (EOF versus
     * a character).
     *
     * @param doc reader over the text produced by Doc
     * @param expected reader over the expected text
     * @return the number of mismatches counted
     * @throws IOException if either reader fails
     */
    private int runReaders(Reader doc, Reader expected) throws IOException {
        int count = 0;
        int errors = 0;
        while (true) {
            int ch = doc.read();
            int expectedCh = correctPOI(expected.read());
            boolean finished = (ch < 0) || (expectedCh < 0);
            if (finished && ((ch >= 0) || (expectedCh >= 0))) {
                errors++;
                logger.fine("File lengths differ.");
            }
            if (ch != expectedCh) {
                errors += 1;
                report(count, expectedCh, ch);
            }
            if (finished) {
                return errors;
            }
            count++;
        }
    }

    /**
     * Logs a single character mismatch at FINE level.
     *
     * @param count zero-based position of the mismatch
     * @param expected the expected character, or a negative value for EOF
     * @param actual the character actually read, or a negative value for EOF
     */
    private void report(int count, int expected, int actual) {
        StringBuilder msg = new StringBuilder("#").append(count);
        msg.append(": Expected ");
        msg.append(expected).append(" (").append(toChar(expected));
        msg.append(") but got ").append(actual).append(" (");
        msg.append(toChar(actual)).append(").");
        logger.fine(msg.toString());
    }

    /**
     * Renders a value returned by <code>Reader.read()</code> for display.
     *
     * @param ch the value to render
     * @return <code>"EOF"</code> for negative values, otherwise the
     *         character as a one-character string
     */
    private static String toChar(int ch) {
        if (ch < 0) {
            return "EOF";
        } else {
            return Character.toString((char)ch);
        }
    }

    /**
     * Corrects POI's Cp1252 output.  There's a bug somewhere in POI that
     * makes it produce incorrect characters.  Not sure where and don't have
     * time to track it down.  But I have visually checked the input
     * documents to verify that Doc is producing the right character, and
     * that POI is not.
     *
     * @param ch  the POI-produced character to check
     * @return    the corrected character
     */
    private static int correctPOI(int ch) {
        switch (ch) {
            case 8734:
                // POI produced the infinity sign when it should have
                // produced the degrees sign.
                return 176;
            case 214:
                // POI produced an umlaut O instead of an ellipsis mark.
                return 8230;
            case 237:
                // POI produced an acute i instead of a fancy single quote
                return 8217;
            case 236:
                // POI produced a reverse acute i instead of fancy double quote
                return 8220;
            case 238:
                // POI produced a caret i instead of fancy double quote
                return 8221;
            default:
                return ch;
        }
    }

}