/******************************************************************************* * Copyright (c) 2009, Adobe Systems Incorporated * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * · Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * · Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * · Neither the name of Adobe Systems Incorporated nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ package com.adobe.dp.fb2; import com.adobe.dp.css.*; import com.adobe.dp.epub.util.Base64; import com.adobe.dp.xml.util.SMapAttributesAdapter; import com.adobe.dp.xml.util.SMapImpl; import com.sun.org.apache.xml.internal.utils.XMLChar; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import java.io.*; import java.util.Stack; import java.util.Vector; import java.util.zip.ZipInputStream; public class FB2DocumentParser { private static final String xlinkNS = "http://www.w3.org/1999/xlink"; FB2Document doc; Vector stylesheets = new Vector(); Vector genres = new Vector(); class FB2CSSURLFactory implements CSSURLFactory { public CSSURL createCSSURL(String url) { return new FB2CSSURL(doc, url); } } public FB2DocumentParser(FB2Document doc) { this.doc = doc; } public void parse(InputStream in) throws IOException, FB2FormatException { if (!in.markSupported()) in = new BufferedInputStream(in); // sniff in.mark(4); byte[] sniff = new byte[4]; in.read(sniff); in.reset(); if (sniff[0] == 'P' && sniff[1] == 'K' && sniff[2] == 3 && sniff[3] == 4) { try { // zipped file ZipInputStream zip = new ZipInputStream(in); zip.getNextEntry(); in = new BufferedInputStream(zip); in.mark(4); in.read(sniff); in.reset(); } catch (Exception e) { throw new FB2FormatException("Zip file structure error"); } } String encoding = null; if (sniff[0] == (byte) 0xef && sniff[1] == (byte) 0xbb && sniff[2] == (byte) 0xbf) { // UTF-8 marker. Not all XML parsers correctly ignore that in.skip(3); encoding = "UTF-8"; } if (sniff[0] == (byte) 0xfe && sniff[1] == (byte) 0xff) { // UTF-16be marker. Not all XML parsers correctly ignore that in.skip(2); encoding = "UTF-16BE"; } if (sniff[0] == (byte) 0x00 && sniff[1] == (byte) 0x00 && sniff[2] == (byte) 0xfe && sniff[3] == (byte) 0xff) { // UTF-32be marker. Not all XML parsers correctly ignore that in.skip(4); encoding = "UTF-32BE"; } if (sniff[0] == (byte) 0xff && sniff[1] == (byte) 0xfe) { if (sniff[2] == (byte) 0x00 && sniff[3] == (byte) 0x00) { // UTF-32le marker. Not all XML parsers correctly ignore that in.skip(4); encoding = "UTF-32LE"; } else { // UTF-16le marker. Not all XML parsers correctly ignore that in.skip(2); encoding = "UTF-16LE"; } } if (encoding == null) { // no BOM found - read encoding from prolog in.mark(128); sniff = new byte[128]; in.read(sniff); in.reset(); String head = new String(sniff); int encodingBeginIndex = head.indexOf("encoding=\""); if (encodingBeginIndex > 0) { encodingBeginIndex += "encoding=\"".length(); int encodingEndIndex = head.indexOf('"', encodingBeginIndex + 1); encoding = head.substring(encodingBeginIndex, encodingEndIndex); } } SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setNamespaceAware(true); try { SAXParser parser = factory.newSAXParser(); XMLReader reader = parser.getXMLReader(); XMLHandler handler = new XMLHandler(); reader.setContentHandler(handler); reader.setEntityResolver(handler); InputSource source = new InputSource(new UtfFilterInputStream(in, encoding)); reader.parse(source); int count = handler.bodyElements.size(); if (count == 0) throw new FB2FormatException("No body sections found"); doc.bodySections = new FB2Section[count]; handler.bodyElements.copyInto(doc.bodySections); doc.stylesheets = new CSSStylesheet[stylesheets.size()]; stylesheets.copyInto(doc.stylesheets); } catch (ParserConfigurationException e) { e.printStackTrace(); throw new RuntimeException(e.toString()); } catch (SAXException e) { throw new FB2FormatException("XML Syntax error: " + e.getMessage()); } } FB2Element createElement(String ns, String localName, Attributes attr) { String styleStr = attr.getValue("style"); InlineRule style = null; if (styleStr != null) { CSSParser parser = new CSSParser(); parser.setCSSURLFactory(new FB2CSSURLFactory()); style = parser.readInlineStyle(styleStr); } if (ns.equals(FB2Document.fb2NS)) { if (localName.equals("section") || localName.equals("epigraph") || localName.equals("cite") || localName.equals("poem") || localName.equals("stanza")) return new FB2Section(localName); if (localName.equals("p")) return new FB2Paragraph(style); if (localName.equals("v")) return new FB2Line(style); if (localName.equals("date")) return new FB2Date(); if (localName.equals("text-author")) return new FB2TextAuthor(style); if (localName.equals("title")) return new FB2Title(); if (localName.equals("annotation")) return new FB2Section("annotation"); if (localName.equals("subtitle")) return new FB2Subtitle(style); if (localName.equals("image") || localName.equals("a")) { String link = attr.getValue(xlinkNS, "href"); if (link != null && link.startsWith("#")) link = link.substring(1); else link = null; if (localName.equals("a")) return new FB2Hyperlink(link); else { String alt = attr.getValue("alt"); String title = attr.getValue("title"); return new FB2Image(link, alt, title); } } if (localName.equals("empty-line")) return new FB2EmptyLine(); if (localName.equals("style")) return new FB2StyledText(attr.getValue("name")); if (localName.equals("strong") || localName.equals("emphasis") || localName.equals("sub") || localName.equals("sup") || localName.equals("strikethrough") || localName.equals("code")) return new FB2Text(localName); if (localName.equals("th") || localName.equals("td")) { String val = attr.getValue("colspan"); int colSpan = 1; if (val != null) { try { colSpan = Integer.parseInt(val); } catch (Exception e) { e.printStackTrace(); } } val = attr.getValue("rowspan"); int rowSpan = 1; if (val != null) { try { rowSpan = Integer.parseInt(val); } catch (Exception e) { e.printStackTrace(); } } String align = attr.getValue("align"); return new FB2TableCell(localName, style, align, colSpan, rowSpan); } if (localName.equals("table") || localName.equals("tr")) return new FB2OtherElement(localName, style); } return new FB2UnknownElement(ns, localName); } class Context { Context(FB2Element e) { curr = e; } FB2Element curr; Vector children = new Vector(); } private static class UtfFilterInputStream extends FilterReader { public int read() throws IOException { int read; do read = super.read(); while (read > -1 && !XMLChar.isValid(read)); return read; } public int read(char cbuf[], int off, int len) throws IOException { char buff[] = new char[len]; int read = super.read(buff, 0, len); if (read > -1) { int validChars = 0; for (int i = 0; i < read; i++) if (XMLChar.isValid(buff[i])) cbuf[off + validChars++] = buff[i]; return validChars; } else { return read; } } private UtfFilterInputStream(InputStream in, String encoding) throws UnsupportedEncodingException { super(encoding == null ? new InputStreamReader(in, "UTF-8") : new InputStreamReader(in, encoding)); } } class XMLHandler extends DefaultHandler { Vector bodyElements = new Vector(); Vector emails = new Vector(); Vector homePages = new Vector(); Vector srcUrls = new Vector(); Vector sequences = new Vector(); Vector translators = new Vector(); FB2TitleInfo currTitleInfo; FB2AuthorInfo currAuthorInfo; FB2DocumentInfo currDocumentInfo; FB2PublishInfo currPublishInfo; FB2DateInfo currDateInfo; FB2Binary currBinary; FB2GenreInfo currGenreInfo; Vector authors = new Vector(); Stack contexts = new Stack(); StringBuffer acc = new StringBuffer(); String coverpageImage; private Context getContext() { return (Context) contexts.peek(); } public InputSource resolveEntity(String publicId, String systemId) throws SAXException { throw new SAXException("External entities not allowed"); } private void emptyAcc() { if (acc.length() > 0) acc.replace(0, acc.length(), ""); } private String flushAcc() { String res = acc.toString(); emptyAcc(); return res; } private boolean inContent() { return !contexts.isEmpty(); } public void characters(char[] ch, int start, int length) throws SAXException { if (inContent()) { String text = new String(ch, start, length); if (getContext().curr.acceptsText()) getContext().children.add(text); } else { acc.append(ch, start, length); } } private void addSequence(Attributes attributes) { try { String name = attributes.getValue("name"); String number = attributes.getValue("number"); if (name != null && number != null) { FB2SequenceInfo sequence = new FB2SequenceInfo(); sequence.setName(name); sequence.setNumber(Integer.parseInt(number)); sequences.add(sequence); } } catch (Exception e) { e.printStackTrace(); } } private void addAuthor(Vector vector) { currAuthorInfo = new FB2AuthorInfo(); vector.add(currAuthorInfo); emails.clear(); homePages.clear(); } public void endElement(String uri, String localName, String qName) throws SAXException { if (inContent()) { Context cx = getContext(); cx.curr.children = new Object[cx.children.size()]; cx.children.copyInto(cx.curr.children); contexts.pop(); if (cx.curr instanceof FB2Title && getContext().curr instanceof FB2Section) { ((FB2Section) getContext().curr).title = (FB2Title) cx.curr; } } else if (currDateInfo != null) { if (localName.equals("date")) { currDateInfo.setHumanReadable(flushAcc()); currDateInfo = null; } } else if (currAuthorInfo != null) { if (localName.equals("author") || localName.equals("translator")) { if (homePages.size() > 0) { String[] arr = new String[homePages.size()]; homePages.copyInto(arr); currAuthorInfo.setHomePages(arr); homePages.clear(); } if (emails.size() > 0) { String[] arr = new String[emails.size()]; emails.copyInto(arr); currAuthorInfo.setEmails(arr); emails.clear(); } currAuthorInfo = null; } else if (localName.equals("first-name")) { currAuthorInfo.setFirstName(flushAcc()); } else if (localName.equals("last-name")) { currAuthorInfo.setLastName(flushAcc()); } else if (localName.equals("middle-name")) { currAuthorInfo.setMiddleName(flushAcc()); } else if (localName.equals("nickname")) { currAuthorInfo.setNickname(flushAcc()); } else if (localName.equals("home-page")) { homePages.add(flushAcc()); } else if (localName.equals("email")) { emails.add(flushAcc()); } } else if (currTitleInfo != null) { if (localName.equals("title-info") || localName.equals("src-title-info")) { if (genres.size() > 0) { FB2GenreInfo[] arr = new FB2GenreInfo[genres.size()]; genres.copyInto(arr); currTitleInfo.setGenres(arr); genres.clear(); } if (authors.size() > 0) { FB2AuthorInfo[] arr = new FB2AuthorInfo[authors.size()]; authors.copyInto(arr); currTitleInfo.setAuthors(arr); authors.clear(); } if (translators.size() > 0) { FB2AuthorInfo[] arr = new FB2AuthorInfo[translators.size()]; translators.copyInto(arr); currTitleInfo.setTranslators(arr); translators.clear(); } if (sequences.size() > 0) { FB2SequenceInfo[] arr = new FB2SequenceInfo[sequences.size()]; sequences.copyInto(arr); currTitleInfo.setSequences(arr); sequences.clear(); } currTitleInfo = null; } else if (localName.equals("book-title")) { currTitleInfo.setBookTitle(flushAcc()); } else if (localName.equals("keywords")) { currTitleInfo.setKeywords(flushAcc()); } else if (localName.equals("lang")) { currTitleInfo.setLanguage(flushAcc()); } else if (localName.equals("src-lang")) { currTitleInfo.setSrcLanguage(flushAcc()); } else if (localName.equals("coverpage")) { currTitleInfo.setCoverpageImage(coverpageImage); coverpageImage = null; } } else if (currDocumentInfo != null) { if (localName.equals("document-info")) { if (authors.size() > 0) { FB2AuthorInfo[] arr = new FB2AuthorInfo[authors.size()]; authors.copyInto(arr); currDocumentInfo.setAuthors(arr); authors.clear(); } if (srcUrls.size() > 0) { String[] arr = new String[srcUrls.size()]; srcUrls.copyInto(arr); currDocumentInfo.setSrcUrls(arr); srcUrls.clear(); } currDocumentInfo = null; } else if (localName.equals("program-used")) { currDocumentInfo.setProgramUsed(flushAcc()); } else if (localName.equals("src-url")) { srcUrls.add(flushAcc()); } else if (localName.equals("src-ocr")) { currDocumentInfo.setSrcOcr(flushAcc()); } else if (localName.equals("version")) { currDocumentInfo.setVersion(flushAcc()); } else if (localName.equals("id")) { currDocumentInfo.setId(flushAcc()); } } else if (currPublishInfo != null) { if (localName.equals("publish-info")) { if (sequences.size() > 0) { FB2SequenceInfo[] arr = new FB2SequenceInfo[sequences.size()]; sequences.copyInto(arr); currPublishInfo.setSequences(arr); sequences.clear(); } currPublishInfo = null; } else if (localName.equals("publisher")) { currPublishInfo.setPublisher(flushAcc()); } else if (localName.equals("city")) { currPublishInfo.setCity(flushAcc()); } else if (localName.equals("year")) { currPublishInfo.setYear(flushAcc()); } else if (localName.equals("book-name")) { currPublishInfo.setBookName(flushAcc()); } else if (localName.equals("isbn")) { currPublishInfo.setISBN(flushAcc()); } } else if (currBinary != null) { currBinary.setData(Base64.decode(flushAcc())); currBinary = null; } else if (localName.equals("stylesheet")) { CSSParser parser = new CSSParser(); parser.setCSSURLFactory(new FB2CSSURLFactory()); try { CSSStylesheet stylesheet = parser.readStylesheet(new StringReader(flushAcc())); if (stylesheet != null) stylesheets.add(stylesheet); } catch (Exception e) { // unexpected e.printStackTrace(); } } } public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { if (inContent()) { FB2Element element = createElement(uri, localName, attributes); if (attributes.getLength() > 0) { element.attrs = new SMapImpl(new SMapAttributesAdapter(attributes)); String id = attributes.getValue("id"); if (id != null) { element.id = id; doc.idMap.put(id, element); } } getContext().children.add(element); contexts.push(new Context(element)); } else if (currAuthorInfo != null) { emptyAcc(); } else if (currDateInfo != null) { // ignore } else if (currTitleInfo != null) { if (localName.equals("author")) { addAuthor(authors); } else if (localName.equals("translator")) { addAuthor(translators); } else if (localName.equals("genre")) { currGenreInfo = new FB2GenreInfo(); String match = attributes.getValue("match"); if (match != null) { try { currGenreInfo.setMatch(Integer.parseInt(match)); } catch (Exception e) { e.printStackTrace(); } } } else if (localName.equals("sequence")) { addSequence(attributes); } else if (localName.equals("date")) { currDateInfo = new FB2DateInfo(); String date = attributes.getValue("value"); currDateInfo.setMachineReadable(date); currTitleInfo.setDate(currDateInfo); } else if (localName.equals("annotation")) { FB2Section annot = new FB2Section("annotation"); contexts.push(new Context(annot)); currTitleInfo.setAnnotation(annot); } else if (localName.equals("coverpage")) { coverpageImage = null; } else if (localName.equals("image")) { String ref = attributes.getValue(xlinkNS, "href"); if (ref.startsWith("#")) { coverpageImage = ref.substring(1); } } emptyAcc(); } else if (currDocumentInfo != null) { if (localName.equals("autor")) { addAuthor(authors); } else if (localName.equals("date")) { currDateInfo = new FB2DateInfo(); String date = attributes.getValue("value"); currDateInfo.setMachineReadable(date); currDocumentInfo.setDate(currDateInfo); } else if (localName.equals("history")) { FB2Section annot = new FB2Section("history"); contexts.push(new Context(annot)); currDocumentInfo.setHistory(annot); } emptyAcc(); } else if (currPublishInfo != null) { if (localName.equals("sequence")) { addSequence(attributes); } emptyAcc(); } else if (currBinary != null) { // ignore } else if (localName.equals("body")) { FB2Section body = new FB2Section("body"); body.name = attributes.getValue("name"); bodyElements.add(body); contexts.push(new Context(body)); } else if (localName.equals("document-info")) { currDocumentInfo = new FB2DocumentInfo(); authors.clear(); srcUrls.clear(); doc.documentInfo = currDocumentInfo; } else if (localName.equals("title-info")) { currTitleInfo = new FB2TitleInfo(); authors.clear(); sequences.clear(); translators.clear(); genres.clear(); doc.titleInfo = currTitleInfo; } else if (localName.equals("src-title-info")) { currTitleInfo = new FB2TitleInfo(); authors.clear(); doc.srcTitleInfo = currTitleInfo; } else if (localName.equals("publish-info")) { currPublishInfo = new FB2PublishInfo(); doc.publishInfo = currPublishInfo; } else if (localName.equals("binary")) { currBinary = new FB2Binary(); String id = attributes.getValue("id"); String contentType = attributes.getValue("content-type"); doc.binaryResources.put(id, currBinary); currBinary.setMediaType(contentType); emptyAcc(); } else if (localName.equals("stylesheet")) { emptyAcc(); } } } }