/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.wordperfect; import java.io.IOException; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; /** * Extracts WordPerfect Document Area text from a WordPerfect document. * @author Pascal Essiembre */ abstract class WPDocumentAreaExtractor { boolean startedP = false; public void extract(WPInputStream in, XHTMLContentHandler xhtml) throws IOException, SAXException { int chunk = 4096; StringBuilder out = new StringBuilder(chunk); int c; while ((c = in.read()) != -1) { extract(c, in, out, xhtml); if (out.length() >= chunk) { xhtml.characters(out.toString()); out.setLength(0); } } endParagraph(out, xhtml); } protected abstract void extract( int c, WPInputStream in, StringBuilder out, XHTMLContentHandler xhtml) throws IOException, SAXException; protected void lazilyStartParagraph(XHTMLContentHandler xhtml) throws SAXException { if (! startedP) { xhtml.startElement("p"); } startedP = true; } /** * This assumes that the <p> was started before you get here. * And the user is required to close the <p> at the end of the document. * * These are currently handled by {@link #extract(WPInputStream, XHTMLContentHandler)}. * * @param xhtml * @throws SAXException */ protected void endParagraph(StringBuilder buffer, XHTMLContentHandler xhtml) throws SAXException { lazilyStartParagraph(xhtml); xhtml.characters(buffer.toString()); buffer.setLength(0); xhtml.endElement("p"); startedP = false; } // Skips until the given character is encountered. protected int skipUntilChar(WPInputStream in, int targetChar) throws IOException { int count = 0; int c; while ((c = in.read()) != -1) { count++; if (c == targetChar) { return count; } } return count; } }