XmlDataMap.java example

/******************************************************************************
 * Copyright (c) 2010 Basis Technology Corp.
 * 
 * Basis Technology Corp. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.basistech.readability;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

/**
 * Maintains map between PC-DATA offsets and Text nodes in an XML document. Provides some structure for the
 * process of pulling the data out.
 */
public abstract class XmlDataMap {
    protected char forceSentenceChar = '\u2029'; // paragraph

    /**
     * Classify elements in the tree.
     */
    public enum ElementAction {
        /**
         * Ignore text under here.
         */
        Banned,
        /**
         * Not currently used.
         */
        Alt,
        /**
         * Insert whitespace.
         */
        Whitespace,
        /**
         * Treat as sentence boundary.
         */
        Sentence,
        /**
         * Remember where this was.
         */
        Mark;
    }

    public static class Mark {
        private String tag;
        private int offset;

        /**
         * * @return Returns the tag.
         */
        public String getTag() {
            return tag;
        }

        /**
         * @param tag The tag to set.
         */
        public void setTag(String tag) {
            this.tag = tag;
        }

        /**
         * * @return Returns the offset.
         */
        public int getOffset() {
            return offset;
        }

        /**
         * @param offset The offset to set.
         */
        public void setOffset(int offset) {
            this.offset = offset;
        }
    }

    protected StringBuffer pcDataBuffer;
    private List<Mark> marks;
    // CHECKSTYLE:OFF
    private LinkedList<OffsetRange> offsetRanges;
    // CHECKSTYLE:ON
    private int pcDataOffset;
    private boolean justAppendedSpace;
    private boolean justAppendedPeriod;

    private ListIterator<OffsetRange> optimizedListPointer;

    private OffsetRange optimizedRangeListElement;

    protected XmlDataMap() {
        offsetRanges = new LinkedList<OffsetRange>();
        pcDataOffset = 0;
        pcDataBuffer = new StringBuffer();
        justAppendedSpace = false;
        justAppendedPeriod = false;
        marks = new ArrayList<Mark>();
    }

    public OffsetRange findOffsetRangeForOffset(int offset) {
        if (optimizedRangeListElement.offsetInRange(offset)) {
            return optimizedRangeListElement;
        } else if (offset > optimizedRangeListElement.getStart()) {
            while (optimizedRangeListElement.getStart() < offset && optimizedListPointer.hasNext()) {
                optimizedRangeListElement = optimizedListPointer.next();
                if (optimizedRangeListElement.offsetInRange(offset)) {
                    return optimizedRangeListElement;
                }
            }
            throw new RuntimeException("Offset " + offset + " beyond last range");
        } else {
            // we don't expect to exercise this case.
            // has to be smaller, no?
            while (offset < optimizedRangeListElement.getStart() && optimizedListPointer.hasPrevious()) {
                optimizedRangeListElement = optimizedListPointer.previous();
                if (optimizedRangeListElement.offsetInRange(offset)) {
                    return optimizedRangeListElement;
                }
            }
            throw new RuntimeException("Offset " + offset + " before the first offset");
        }
    }

    /**
     * Retrieve the offset ranges for the text nodes of the original tree.
     *
     * @return the ranges.
     */
    public List<OffsetRange> getOffsetRanges() {
        return offsetRanges;
    }

    public String getPcData() {
        return pcDataBuffer.toString();
    }

    /**
     * Retrieve the accumulated pc data.
     *
     * @return
     */
    public StringBuffer getPcDataBuffer() {
        return pcDataBuffer;
    }

    /**
     * If we need to split a range for annotation, we want to keep the map of offset ranges usable. Note that
     * the caller has to revalidate or maintain any indices it has grabbed for ranges after the one we are
     * splitting. Note that this does not insert the new text node into the parent contents, the caller does
     * that.
     *
     * @param range
     * @param splitPoint
     * @return
     */
    public TextNode splitText(int rangePoint, int splitPoint) {
        assert splitPoint > 0;
        OffsetRange range = offsetRanges.get(rangePoint);
        assert splitPoint < range.getText().text().length();
        TextNode newText = new TextNode(range.getText().text().substring(splitPoint), null);
        range.getText().text(range.getText().text().substring(0, splitPoint));
        OffsetRange newRange = new OffsetRange(range.getStart() + splitPoint, range.getEnd(), newText);
        offsetRanges.add(rangePoint + 1, newRange);
        range.setEnd(splitPoint + range.getStart());
        assert range.getText().text().length() == range.getEnd() - range.getStart();
        return newText;
    }

    /**
     * A subclass may process metadata into the pc-data stream by calling this directly.
     *
     * @param textObject
     * @param text
     */
    protected void append(TextNode textObject, String text) {
        // if an entire Text element is whitespace, chances are that it's <div>NL noise. We don't need it.
        boolean spaceText = text.matches("[\\s]*");
        if (spaceText && justAppendedSpace) {
            return;
        }
        if (spaceText) {
            justAppendedSpace = true;
        }

        OffsetRange offsetRange = new OffsetRange(pcDataOffset, pcDataOffset + text.length(), textObject);
        pcDataBuffer.append(text);
        pcDataOffset += text.length();
        justAppendedSpace = Character.isWhitespace(text.charAt(text.length() - 1));
        justAppendedPeriod = eosPunctuation(lastNonWhitepaceCharacter(text));
        offsetRanges.add(offsetRange);
    }

    protected char lastNonWhitepaceCharacter(String text) {
        for (int index = text.length() - 1; index >= 0; index--) {
            char c = text.charAt(index);
            if (!Character.isWhitespace(c)) {
                return c;
            }
        }
        return '\ufeff'; // it won't count as punctuation
    }

    //SK: allow quotes to be considered as EOS punctuation, so that we don't 
    //  add extra punctuation to sentences ending with quotes. This isn't 
    //  entirely unicode-friendly, and we may want to fix that someday.
    private static boolean eosPunctuation(char c) {
        String s = "!?.\u2029\"\u0027\u2018\u2019\u201c\u201d"; 
        return s.indexOf(c) != -1;
    }

    protected void appendPeriod() {
        int startPcDataOffset = pcDataOffset;
        if (!justAppendedSpace && !justAppendedPeriod) {
            String appendMe = " . " + System.getProperty("line.separator");
            pcDataBuffer.append(appendMe);
            pcDataOffset += appendMe.length();
            justAppendedPeriod = true;
            justAppendedSpace = true;
        } else if (!justAppendedPeriod) {
            String appendMe = ". " + System.getProperty("line.separator");
            pcDataBuffer.append(appendMe);
            pcDataOffset += appendMe.length();
            justAppendedPeriod = true;
            justAppendedSpace = true;
        } else if (!justAppendedSpace) {
            String appendMe = " " + System.getProperty("line.separator");
            pcDataBuffer.append(appendMe);
            pcDataOffset += appendMe.length();
            justAppendedPeriod = true;
            justAppendedSpace = true;
        }
        // we make a range so that the code can tell the difference between 'spurious, added, period'
        // and 'bug that failed to make an offset range.'
        OffsetRange offsetRange = new OffsetRange(startPcDataOffset, pcDataOffset, null);
        offsetRanges.add(offsetRange);

    }

    protected void appendSpace() {
        if (!justAppendedSpace && !justAppendedPeriod) {
            justAppendedSpace = true;
            pcDataBuffer.append(' ');
            pcDataOffset++;
        }
    }

    protected abstract ElementAction classifyElement(Element element);

    public void process(Element rootElement) {
        recurse(rootElement);
        optimizedListPointer = offsetRanges.listIterator();
        optimizedRangeListElement = offsetRanges.getFirst();
    }

    private void recurse(Element element) {
        ElementAction action = classifyElement(element);
        if (action == ElementAction.Whitespace || action == ElementAction.Sentence) {
            appendSpace();
        }
        for (Node childNode : element.childNodes()) {
            // n.b., cdata not possible if we are coming from TagSoup. If we also handle
            // real xhtml by directly parsing it, then we have another story on our hands.
            // though we could use canonical XML to get rid of them.
            if (childNode instanceof TextNode && action != ElementAction.Banned) {
                TextNode textContent = (TextNode)childNode;
                String textString = textContent.text();
                append(textContent, textString);
            } else if (childNode instanceof Element) {
                recurse((Element)childNode);
            }
        }
        if (action == ElementAction.Whitespace) {
            appendSpace();
        } else if (action == ElementAction.Sentence) {
            appendPeriod();
        } else if (action == ElementAction.Mark) {
            Mark mark = new Mark();
            mark.setOffset(pcDataOffset);
            mark.setTag(element.tagName());
        }
    }

    /**
     * * @return Returns the marks.
     */
    public List<Mark> getMarks() {
        return marks;
    }

    public char getForceSentenceChar() {
        return forceSentenceChar;
    }

    public void setForceSentenceChar(char forceSentenceChar) {
        this.forceSentenceChar = forceSentenceChar;
    }
}