// LineLengthLimitWriter.java example
//
// Explorer
// OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2014 Alex Buloichik
               2016 Didier Briel
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.filters2.text;

import java.io.IOException;
import java.io.Writer;

import org.omegat.tokenizer.ITokenizer;
import org.omegat.util.StringUtil;
import org.omegat.util.Token;

/**
 * Output filter that limits the length of written lines.
 * <p>
 * Characters are buffered until a source line is complete, i.e. until an EOL
 * sequence of one or two CR/LF characters has been seen or the writer is
 * closed. A completed line is then written to the wrapped writer; if it has
 * more than {@code maxLineLength} code points it is split into several output
 * lines at positions chosen by {@link #getBreakPos(Token[])}.
 * <p>
 * All lengths are measured in Unicode code points, not in {@code char}s.
 *
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Didier Briel
 */
public class LineLengthLimitWriter extends Writer {
    /**
     * A line is never broken directly after one of these characters
     * (includes U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK and
     * U+201E DOUBLE LOW-9 QUOTATION MARK).
     */
    private static final String NO_BREAK_AFTER = ":\\([{<\u00ab\u201e";

    /**
     * A line is never broken directly before one of these characters
     * (includes U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK and
     * U+201C LEFT DOUBLE QUOTATION MARK).
     */
    private static final String NO_BREAK_BEFORE = "{:)]}>\u00bb\u201c,.";

    /** Wrapped writer that receives the re-wrapped text. */
    final Writer out;
    /** Length (in code points) from which a break position is looked for. */
    final int lineLength;
    /** Lines with at most this many code points are written unbroken. */
    final int maxLineLength;
    /** Tokenizer used to find candidate break positions inside a line. */
    final ITokenizer tokenizer;
    /** Current, not yet written source line, including its trailing EOL chars. */
    final StringBuilder str = new StringBuilder();
    /** Number of EOL chars currently collected at the end of {@link #str}. */
    int breakChars;
    /** EOL chars of the line being written (0 = absent); eol1 precedes eol2. */
    char eol1, eol2;
    /** Set once {@link #close()} has run, to make repeated calls harmless. */
    private boolean closed;

    /**
     * @param out
     *            writer that receives the output
     * @param lineLength
     *            length in code points from which a line break is looked for
     * @param maxLineLength
     *            lines up to this length in code points are not broken
     * @param tokenizer
     *            tokenizer used to split a line into tokens
     */
    public LineLengthLimitWriter(Writer out, int lineLength, int maxLineLength, ITokenizer tokenizer) {
        this.out = out;
        this.lineLength = lineLength;
        this.maxLineLength = maxLineLength;
        this.tokenizer = tokenizer;
    }

    /**
     * Appends the given chars to the current line and writes out every line
     * that gets completed by them.
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException {
        final int end = off + len;
        int pos = off;
        while (pos < end) {
            // Limit decoding to the requested range: the array may hold stale
            // data after 'end', which must never be combined with a trailing
            // high surrogate. A surrogate pair split across two calls is
            // re-joined inside 'str' because both halves are appended as-is.
            final int cp = Character.codePointAt(cbuf, pos, end);
            pos += Character.charCount(cp);

            if (breakChars > 0 && cp == str.codePointBefore(str.length())) {
                // the same eol char - flush
                outLine();
            }
            if (cp == '\n' || cp == '\r') {
                str.appendCodePoint(cp);
                breakChars++;
                if (breakChars > 1) {
                    // 2 eol chars - flush
                    outLine();
                }
            } else {
                if (breakChars > 0) {
                    // was eol char - flush
                    outLine();
                }
                str.appendCodePoint(cp);
            }
        }
    }

    /**
     * Writes the buffered line, broken into pieces where required, followed by
     * its EOL, and resets the line state. Does nothing if the buffer is empty.
     */
    void outLine() throws IOException {
        if (str.length() == 0) {
            return;
        }
        int cp = str.codePointBefore(str.length());
        if (cp == '\n' || cp == '\r') {
            // get latest eol char
            eol2 = (char) cp;
            str.setLength(str.length() - 1);
        }
        if (str.length() > 0) {
            // get pre-latest eol char
            cp = str.codePointBefore(str.length());
            if (cp == '\n' || cp == '\r') {
                eol1 = (char) cp;
                str.setLength(str.length() - 1);
            }
        }
        if (str.length() == 0) {
            // was empty line
            writeSourceEol();
        } else {
            Token[] tokens = tokenizer.tokenizeVerbatim(str.toString());
            while (str.length() > 0) {
                int p = getBreakPos(tokens);
                if (p <= 0) {
                    // A non-positive break position would never shrink the
                    // buffer; write the remainder as one line instead of
                    // looping forever.
                    p = str.length();
                }
                breakAt(p, tokens);
            }
        }
        breakChars = 0;
        eol1 = 0;
        eol2 = 0;
    }

    /**
     * Chooses where to cut the text currently held in the buffer.
     * <p>
     * Strategies are tried in this order: keep the whole text if it fits (or
     * if only trailing whitespace exceeds the limit); break at a token that
     * follows whitespace; break at a whitespace token boundary; break at any
     * token boundary accepted by {@link #isPossibleBreakBefore(int)}; break at
     * the last acceptable token start before {@code lineLength}; otherwise
     * keep the whole text.
     *
     * @param tokens
     *            tokens of the buffered text with offsets relative to the
     *            current buffer start; entries already written are
     *            {@code null}
     * @return char index in the buffer before which the text should be cut
     */
    int getBreakPos(Token[] tokens) {
        if (str.codePointCount(0, str.length()) <= maxLineLength) {
            // line no longer than max length - use full line
            return str.length();
        }
        // check if spaces only more than max length
        int latestNonSpacesTokenPos = 0;
        for (int i = tokens.length - 1; i >= 0; i--) {
            Token t = tokens[i];
            if (t == null) {
                // less than begin
                continue;
            }
            if (isSpaces(t)) {
                continue;
            }
            // non-spaces token
            latestNonSpacesTokenPos = t.getOffset() + t.getLength();
            break;
        }
        if (str.codePointCount(0, latestNonSpacesTokenPos) <= maxLineLength) {
            return str.length();
        }
        // try to break on the space ends
        int spacesStart = -1;
        for (Token t : tokens) {
            if (t == null) {
                // less than begin
                continue;
            }
            if (str.codePointCount(0, t.getOffset()) >= lineLength) {
                // spaces can be after max length
                if (spacesStart >= 0 && str.codePointCount(0, spacesStart) < maxLineLength) {
                    return t.getOffset();
                }
            }
            if (isSpaces(t)) {
                if (spacesStart < 0) {
                    spacesStart = t.getOffset();
                }
            } else {
                spacesStart = -1;
            }
        }
        // try to break on the space boundaries
        for (Token t : tokens) {
            if (t == null) {
                // less than begin
                continue;
            }
            int cps = str.codePointCount(0, t.getOffset());
            if (cps >= lineLength && cps < maxLineLength) {
                if (isSpaces(t)) {
                    return t.getOffset();
                }
            }
            cps = str.codePointCount(0, t.getOffset() + t.getLength());
            if (cps >= lineLength && cps < maxLineLength) {
                if (isSpaces(t)) {
                    return t.getOffset() + t.getLength();
                }
            }
        }
        // impossible to break on space boundaries - break at any token, except brackets
        for (Token t : tokens) {
            if (t == null) {
                // less than begin
                continue;
            }
            int cps = str.codePointCount(0, t.getOffset());
            if (cps >= lineLength && cps < maxLineLength) {
                if (isPossibleBreakBefore(t.getOffset())) {
                    return t.getOffset();
                }
            }
            cps = str.codePointCount(0, t.getOffset() + t.getLength());
            if (cps >= lineLength && cps < maxLineLength) {
                if (isPossibleBreakBefore(t.getOffset() + t.getLength())) {
                    return t.getOffset() + t.getLength();
                }
            }
        }
        // use latest token before line length
        for (int i = 0; i < tokens.length; i++) {
            Token t = tokens[i];
            if (t == null) {
                // less than begin
                continue;
            }
            if (str.codePointCount(0, t.getOffset()) >= lineLength) {
                if (i == 0) {
                    return t.getOffset() + t.getLength();
                }
                // walk backwards to the nearest earlier token start that is an
                // acceptable break position
                int j = i - 1;
                while (j >= 0) {
                    Token tp = tokens[j--];
                    if (tp != null && tp.getOffset() > 0) {
                        if (isPossibleBreakBefore(tp.getOffset())) {
                            return tp.getOffset();
                        }
                    }
                }
                return t.getOffset();
            }
        }
        // use full line
        return str.length();
    }

    /**
     * Tells whether the buffer text covered by the token consists of
     * whitespace only (as defined by {@link Character#isWhitespace(int)}).
     */
    boolean isSpaces(Token token) {
        for (int cp, i = 0; i < token.getLength(); i += Character.charCount(cp)) {
            cp = str.codePointAt(token.getOffset() + i);
            if (!Character.isWhitespace(cp)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Write part of line to specified position, and change token offsets.
     *
     * @param pos
     *            char index in the buffer before which the text is cut
     * @param tokens
     *            tokens of the buffered text; updated in place so that offsets
     *            stay relative to the new buffer start, with tokens starting
     *            before {@code pos} replaced by {@code null}
     */
    void breakAt(int pos, Token[] tokens) throws IOException {
        // Strip and discard whitespace from end of line
        out.write(StringUtil.rstrip(str.substring(0, pos)));
        str.delete(0, pos);
        if (str.length() > 0) {
            writeBreakEol();
        } else {
            writeSourceEol();
        }
        for (int i = 0; i < tokens.length; i++) {
            Token t = tokens[i];
            if (t == null || t.getOffset() < pos) {
                tokens[i] = null;
            } else {
                tokens[i] = new Token(null, t.getOffset() - pos, t.getLength());
            }
        }
    }

    /**
     * Write EOL at an inserted line break. An EOL is always written here, even
     * if the source line has no EOL chars at all; in that case the platform
     * line separator is used.
     */
    void writeBreakEol() throws IOException {
        if (eol1 == 0 && eol2 == 0) {
            // there is no known EOL, use platform-dependent
            out.write(System.lineSeparator());
        } else {
            writeSourceEol();
        }
    }

    /**
     * Write the EOL found at the end of the source line. Writes nothing when
     * the source line had no EOL (e.g. last line of a file without final EOL).
     */
    void writeSourceEol() throws IOException {
        if (eol1 != 0) {
            out.write(eol1);
        }
        if (eol2 != 0) {
            out.write(eol2);
        }
    }

    /**
     * Checks whether the buffered text may be broken directly before the given
     * char index. A break is rejected if the preceding character is one of
     * the no-break-after characters or the following character is one of the
     * no-break-before characters. A side of the position that lies outside
     * the buffer imposes no restriction.
     *
     * @param pos
     *            char index in the buffer
     * @return {@code true} if a break before {@code pos} is acceptable
     */
    boolean isPossibleBreakBefore(int pos) {
        final int length = str.length();
        // check previous char. Can't split after specified chars.
        if (pos > 0 && pos <= length) {
            int before = str.codePointBefore(pos);
            if (NO_BREAK_AFTER.indexOf(before) >= 0) {
                return false;
            }
        }
        // check next char. Can't split before specified chars.
        if (pos >= 0 && pos < length) {
            int after = str.codePointAt(pos);
            if (NO_BREAK_BEFORE.indexOf(after) >= 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Flushes the wrapped writer. The line currently being collected stays
     * buffered, because it can only be wrapped once it is complete; it is
     * written when its EOL arrives or on {@link #close()}. Does nothing after
     * the writer has been closed.
     */
    @Override
    public void flush() throws IOException {
        if (!closed) {
            out.flush();
        }
    }

    /**
     * Writes the pending line, then flushes and closes the wrapped writer.
     * The wrapped writer is closed even if writing the pending line fails.
     * Calling this method again has no effect.
     */
    @Override
    public void close() throws IOException {
        if (closed) {
            return;
        }
        closed = true;
        try (Writer target = out) {
            outLine();
            target.flush();
        }
    }
}