/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2014 Alex Buloichik
2016 Didier Briel
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.filters2.text;
import java.io.IOException;
import java.io.Writer;
import org.omegat.tokenizer.ITokenizer;
import org.omegat.util.StringUtil;
import org.omegat.util.Token;
/**
 * Output filter that limits the length of written lines.
 * <p>
 * Characters are buffered until a complete source line (including its
 * end-of-line characters) has been collected. The line is then split into
 * pieces using the tokens supplied by the tokenizer and each piece is written
 * to the wrapped writer. All lengths are measured in Unicode code points.
 *
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Didier Briel
 */
public class LineLengthLimitWriter extends Writer {
    /** Characters after which a line must not be broken (opening brackets, quotes, ...). */
    private static final String NO_BREAK_AFTER = ":\\([{<\u00ab\u201e";
    /** Characters before which a line must not be broken (closing brackets, quotes, punctuation). */
    private static final String NO_BREAK_BEFORE = "{:)]}>\u00bb\u201c,.";

    /** Wrapped writer that receives the re-wrapped text. */
    final Writer out;
    /** Length (in code points) from which a break position is looked for. */
    final int lineLength;
    /** Lines not longer than this (in code points) are written unbroken. */
    final int maxLineLength;
    /** Tokenizer used to find candidate break positions. */
    final ITokenizer tokenizer;
    /** Buffer holding the current, not yet written, source line. */
    final StringBuilder str = new StringBuilder();
    /** Number of EOL characters currently at the end of {@link #str}. */
    int breakChars;
    /** EOL characters of the line being written; 0 means "not present". */
    char eol1, eol2;

    /**
     * Creates a new writer.
     *
     * @param out
     *            writer that receives the output
     * @param lineLength
     *            length from which a break position is looked for
     * @param maxLineLength
     *            lines up to this length are not broken
     * @param tokenizer
     *            tokenizer used to split a line into tokens
     */
    public LineLengthLimitWriter(Writer out, int lineLength, int maxLineLength, ITokenizer tokenizer) {
        this.out = out;
        this.lineLength = lineLength;
        this.maxLineLength = maxLineLength;
        this.tokenizer = tokenizer;
    }

    /**
     * Buffers the given characters and writes out every source line that
     * becomes complete. A line is complete after "\r\n", "\n\r", or after a
     * single EOL character that is followed by the same EOL character or by a
     * non-EOL character.
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException {
        final int end = off + len;
        for (int cp, i = off; i < end; i += Character.charCount(cp)) {
            // Bound the lookup by 'end' so that a high surrogate at the end of
            // the region is never combined with a char outside of the region.
            cp = Character.codePointAt(cbuf, i, end);
            if (breakChars > 0 && cp == str.codePointBefore(str.length())) {
                // the same eol char - flush
                outLine();
            }
            if (cp == '\n' || cp == '\r') {
                str.appendCodePoint(cp);
                breakChars++;
                if (breakChars > 1) {
                    // 2 eol chars - flush
                    outLine();
                }
            } else {
                if (breakChars > 0) {
                    // was eol char - flush
                    outLine();
                }
                str.appendCodePoint(cp);
            }
        }
    }

    /**
     * Writes the buffered source line, broken into pieces where required, and
     * resets the buffer state. Does nothing if the buffer is empty.
     */
    void outLine() throws IOException {
        if (str.length() == 0) {
            return;
        }
        int cp = str.codePointBefore(str.length());
        if (cp == '\n' || cp == '\r') {
            // get latest eol char
            eol2 = (char) cp;
            str.setLength(str.length() - 1);
        }
        if (str.length() > 0) {
            // get pre-latest eol char
            cp = str.codePointBefore(str.length());
            if (cp == '\n' || cp == '\r') {
                eol1 = (char) cp;
                str.setLength(str.length() - 1);
            }
        }
        if (str.length() == 0) {
            // was empty line
            writeSourceEol();
        } else {
            Token[] tokens = tokenizer.tokenizeVerbatim(str.toString());
            while (str.length() > 0) {
                int p = getBreakPos(tokens);
                if (p <= 0) {
                    // A break at position 0 would not consume anything and the
                    // loop would never end; write the remainder as one piece.
                    p = str.length();
                }
                breakAt(p, tokens);
            }
        }
        breakChars = 0;
        eol1 = 0;
        eol2 = 0;
    }

    /**
     * Finds the position (char index in the buffer) at which the buffered line
     * should be broken.
     *
     * @param tokens
     *            tokens of the buffered line; entries that were already
     *            written are {@code null}
     * @return break position, or the buffer length if the rest of the line
     *         should be written as a whole
     */
    int getBreakPos(Token[] tokens) {
        if (str.codePointCount(0, str.length()) <= maxLineLength) {
            // line no longer than max length - use full line
            return str.length();
        }

        // check if spaces only more than max length
        int latestNonSpacesTokenPos = 0;
        for (int i = tokens.length - 1; i >= 0; i--) {
            Token t = tokens[i];
            if (t == null) {
                // less than begin
                continue;
            }
            if (isSpaces(t)) {
                continue;
            }
            // non-spaces token
            latestNonSpacesTokenPos = t.getOffset() + t.getLength();
            break;
        }
        if (str.codePointCount(0, latestNonSpacesTokenPos) <= maxLineLength) {
            // only trailing whitespace exceeds the limit
            return str.length();
        }

        // try to break on the space ends
        int spacesStart = -1;
        for (Token t : tokens) {
            if (t == null) {
                // less than begin
                continue;
            }
            if (str.codePointCount(0, t.getOffset()) >= lineLength) {
                // spaces can be after max length
                if (spacesStart >= 0 && str.codePointCount(0, spacesStart) < maxLineLength) {
                    return t.getOffset();
                }
            }
            if (isSpaces(t)) {
                if (spacesStart < 0) {
                    spacesStart = t.getOffset();
                }
            } else {
                spacesStart = -1;
            }
        }

        // try to break on the space boundaries
        for (Token t : tokens) {
            if (t == null) {
                // less than begin
                continue;
            }
            int cps = str.codePointCount(0, t.getOffset());
            if (cps >= lineLength && cps < maxLineLength) {
                if (isSpaces(t)) {
                    return t.getOffset();
                }
            }
            cps = str.codePointCount(0, t.getOffset() + t.getLength());
            if (cps >= lineLength && cps < maxLineLength) {
                if (isSpaces(t)) {
                    return t.getOffset() + t.getLength();
                }
            }
        }

        // impossible to break on space boundaries - break at any token, except brackets
        for (Token t : tokens) {
            if (t == null) {
                // less than begin
                continue;
            }
            int cps = str.codePointCount(0, t.getOffset());
            if (cps >= lineLength && cps < maxLineLength) {
                if (isPossibleBreakBefore(t.getOffset())) {
                    return t.getOffset();
                }
            }
            cps = str.codePointCount(0, t.getOffset() + t.getLength());
            if (cps >= lineLength && cps < maxLineLength) {
                if (isPossibleBreakBefore(t.getOffset() + t.getLength())) {
                    return t.getOffset() + t.getLength();
                }
            }
        }

        // use latest token before line length
        for (int i = 0; i < tokens.length; i++) {
            Token t = tokens[i];
            if (t == null) {
                // less than begin
                continue;
            }
            if (str.codePointCount(0, t.getOffset()) >= lineLength) {
                if (i == 0) {
                    return t.getOffset() + t.getLength();
                }
                // walk backwards to the nearest earlier token start that allows a break
                int j = i - 1;
                while (j >= 0) {
                    Token tp = tokens[j--];
                    if (tp != null && tp.getOffset() > 0) {
                        if (isPossibleBreakBefore(tp.getOffset())) {
                            return tp.getOffset();
                        }
                    }
                }
                return t.getOffset();
            }
        }

        // use full line
        return str.length();
    }

    /**
     * Checks whether the token consists of whitespace only.
     *
     * @param token
     *            token whose offset and length refer to the buffered line
     * @return {@code true} if every code point covered by the token is
     *         whitespace (also for a zero-length token)
     */
    boolean isSpaces(Token token) {
        for (int cp, i = 0; i < token.getLength(); i += Character.charCount(cp)) {
            cp = str.codePointAt(token.getOffset() + i);
            if (!Character.isWhitespace(cp)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Write part of line to specified position, and change token offsets.
     *
     * @param pos
     *            number of chars to take from the start of the buffer
     * @param tokens
     *            tokens of the buffered line; tokens starting before
     *            {@code pos} are replaced with {@code null}, the others are
     *            shifted by {@code pos}
     */
    void breakAt(int pos, Token[] tokens) throws IOException {
        // Strip and discard whitespace from end of line
        out.write(StringUtil.rstrip(str.substring(0, pos)));
        str.delete(0, pos);
        if (str.length() > 0) {
            writeBreakEol();
        } else {
            writeSourceEol();
        }
        for (int i = 0; i < tokens.length; i++) {
            Token t = tokens[i];
            if (t == null || t.getOffset() < pos) {
                tokens[i] = null;
            } else {
                tokens[i] = new Token(null, t.getOffset() - pos, t.getLength());
            }
        }
    }

    /**
     * Write EOL at the line break. An EOL must be written here even if the
     * source line has no EOL characters at all; in that case the platform line
     * separator is used.
     */
    void writeBreakEol() throws IOException {
        if (eol1 == 0 && eol2 == 0) {
            // there is no known EOL, use platform-dependent
            out.write(System.lineSeparator());
        } else {
            if (eol1 != 0) {
                out.write(eol1);
            }
            if (eol2 != 0) {
                out.write(eol2);
            }
        }
    }

    /**
     * Write EOL at the end of the source line. Nothing is written if the
     * source line had no EOL (e.g. the last line of a file).
     */
    void writeSourceEol() throws IOException {
        if (eol1 != 0) {
            out.write(eol1);
        }
        if (eol2 != 0) {
            out.write(eol2);
        }
    }

    /**
     * Checks whether the buffered line may be broken right before the given
     * position.
     *
     * @param pos
     *            char index in the buffer
     * @return {@code false} if the previous character forbids a break after
     *         it or the next character forbids a break before it;
     *         {@code true} otherwise (also when there is no previous or next
     *         character)
     */
    boolean isPossibleBreakBefore(int pos) {
        // check previous char. Can't split after specified chars.
        if (pos > 0 && pos <= str.length()) {
            // U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
            // U+201E DOUBLE LOW-9 QUOTATION MARK
            if (NO_BREAK_AFTER.indexOf(str.codePointBefore(pos)) >= 0) {
                return false;
            }
        }
        // check next char. Can't split before specified chars.
        if (pos >= 0 && pos < str.length()) {
            // U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
            // U+201C LEFT DOUBLE QUOTATION MARK
            if (NO_BREAK_BEFORE.indexOf(str.codePointAt(pos)) >= 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Does nothing: the buffered line is written only once it is complete or
     * when the writer is closed.
     */
    @Override
    public void flush() throws IOException {
    }

    /**
     * Writes the remaining buffered text, then flushes and closes the wrapped
     * writer. The wrapped writer is closed even if writing the remaining text
     * fails.
     */
    @Override
    public void close() throws IOException {
        try (Writer target = out) {
            outLine();
            target.flush();
        }
    }
}