/* * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.elacin.pdfextract.xml; import org.apache.log4j.Logger; import org.elacin.pdfextract.tree.*; import org.elacin.pdfextract.tree.Role; import org.jetbrains.annotations.NotNull; import org.tei_c.*; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; import javax.xml.bind.Marshaller; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.lang.String; import java.util.ArrayList; import java.util.List; /** * Created by IntelliJ IDEA. User: elacin Date: 14.01.11 Time: 17.02 To change this template use * File | Settings | File Templates. */ public class TEIOutput implements XMLWriter { // ------------------------------ FIELDS ------------------------------ private static final Logger log = Logger.getLogger(TEIOutput.class); // ------------------------ INTERFACE METHODS ------------------------ // --------------------- Interface XMLWriter --------------------- public void writeTree(@NotNull DocumentNode root, File destination) { long t0 = System.currentTimeMillis(); final TEI tei = new TEI(); addHeader(root, tei); final Text text = new Text(); addFront(root, text); addBody(root, text); addBack(root, text); tei.setText(text); try { JAXBContext jaxbContext = JAXBContext.newInstance("org.tei_c"); Marshaller marshaller = jaxbContext.createMarshaller(); marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true); marshaller.marshal(tei, new FileOutputStream(destination)); } catch (JAXBException e) { log.warn("LOG01140:", e); return; } catch (FileNotFoundException e) { log.warn("LOG01120:", e); return; } long time = System.currentTimeMillis() - t0; if (log.isInfoEnabled()) { log.info("LOG01510:" + TEIOutput.class + " took " + time + "ms"); } } // -------------------------- OTHER METHODS -------------------------- private void addAbstract(final DocumentNode root, @NotNull Front front) { if (root.getAbstractParagraph() == null) { return; } final P p = new P(); final Div div = new Div().withType("abs"); div.withMeetingsAndBylinesAndDatelines(new Head().withContent("Abstract"), p); for (LineNode lineNode : root.getAbstractParagraph().getChildren()) { addLineToContent(p.getContent(), lineNode); } front.withSetsAndProloguesAndEpilogues(div); } private void addBack(DocumentNode root, @NotNull Text text) { final Back back = new Back(); /* references goes here */ text.withIndicesAndSpenAndSpanGrps(back); } Div currentDiv; Div1 currentDiv1; Div2 currentDiv2; int divLevel; List<Object> currentContent; private void addBody(@NotNull DocumentNode root, @NotNull Text text) { final Body body = new Body(); List<ParagraphNode> prfs = new ArrayList<ParagraphNode>(); for (PageNode pageNode : root.getChildren()) { prfs.addAll(pageNode.getChildren()); } divLevel = 0; currentDiv = new Div(); body.withIndicesAndSpenAndSpanGrps(currentDiv); currentContent = currentDiv.getMeetingsAndBylinesAndDatelines(); boolean createNewP = false; P currentP = new P(); currentContent.add(currentP); for (ParagraphNode prf : prfs) { boolean isHead = false; if (prf.hasRole(Role.DIV1)) { divLevel = 1; currentDiv1 = new Div1(); body.withIndicesAndSpenAndSpanGrps(currentDiv1); currentContent = currentDiv1.getMeetingsAndBylinesAndDatelines(); isHead = true; createNewP = true; } else if (prf.hasRole(Role.DIV2)) { divLevel = 2; currentDiv2 = new Div2(); currentDiv1.getMeetingsAndBylinesAndDatelines().add(currentDiv2); currentContent = currentDiv2.getMeetingsAndBylinesAndDatelines(); isHead = true; createNewP = true; } if (prf.hasRole(Role.FOOTNOTE)) { LineNode firstLine = prf.getChildren().get(0); WordNode firstWord = firstLine.getChildren().get(0); Note note = new Note(); firstLine.removeChild(firstWord); addParagraphToContent(note.getContent(), prf); note.withPlaces("below").withNS(firstWord.getText()); // currentContent.add(note); body.withIndicesAndSpenAndSpanGrps(note); continue; } if (isHead) { LineNode firstLine = prf.getChildren().get(0); String divName = "sec" + firstLine.getChildren().get(0).getText(); Head head = new Head().withId(divName); firstLine.removeChild(firstLine.getChildren().get(0)); addParagraphToContent(head.getContent(), prf); currentContent.add(head); } else { if (createNewP) { currentP = new P(); currentContent.add(currentP); createNewP = false; } for (LineNode line : prf.getChildren()) { boolean indented = line.isIndented(); if (!currentP.getContent().isEmpty() && indented) { currentP = new P(); currentContent.add(currentP); } addLineToContent(currentP.getContent(), line); } } } for (PageNode pageNode : root.getChildren()) { for (GraphicsNode graphicsNode : pageNode.getGraphics()) { if (graphicsNode.getText().isEmpty()) { continue; } Graphic g = new Graphic(); P p = new P(); for (ParagraphNode paragraphNode : graphicsNode.getChildren()) { addParagraphToContent(p.getContent(), paragraphNode); } // body.withIndicesAndSpenAndSpanGrps(new Figure().withHeadsAndPSAndAbs(p)); } } text.withIndicesAndSpenAndSpanGrps(body); } private void addLineToContent(final List<Object> contentList, final LineNode line) { String content = line.getText(); if (!contentList.isEmpty()) { String former = (String) contentList.get(contentList.size() - 1); if (former.endsWith("-")) { String combined = former.substring(0, former.length() - 1) + content; contentList.remove(contentList.size() - 1); contentList.add(combined); return; } } contentList.add(content); } private void addParagraphToContent(final List<Object> content, final ParagraphNode prf) { for (LineNode line : prf.getChildren()) { addLineToContent(content, line); } } private void addFront(DocumentNode root, @NotNull Text text) { final Front front = new Front(); addAbstract(root, front); text.withIndicesAndSpenAndSpanGrps(front); } private void addHeader(DocumentNode root, @NotNull TEI tei) { ParagraphNode title1 = root.getTitle(); if (title1 == null) { return; } final TeiHeader header = new TeiHeader(); final FileDesc fileDesc = new FileDesc(); /* title, author and editor */ final TitleStmt titleStmt = new TitleStmt(); final Title title = new Title(); for (LineNode lineNode : title1.getChildren()) { title.withContent(lineNode.getText()); } titleStmt.withTitles(title); fileDesc.setTitleStmt(titleStmt); header.setFileDesc(fileDesc); tei.setTeiHeader(header); } }