/* Copyright (2012) Schibsted ASA
 * This file is part of Possom.
 *
 *   Possom is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU Lesser General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   Possom is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU Lesser General Public License for more details.
 *
 *   You should have received a copy of the GNU Lesser General Public License
 *   along with Possom.  If not, see <http://www.gnu.org/licenses/>.
 */
package no.sesat.search.result;

import java.util.ArrayDeque;
import java.util.Deque;

/**
 * Truncates strings that may contain xml/html markup while keeping the markup
 * balanced.
 *
 * Only text characters (outside of markup) and the content of CDATA sections
 * count towards the requested length. Tags, comments, processing instructions
 * and <code>&lt;!...&gt;</code> declarations are copied through without being
 * counted. This is a small hand written scanner, not an xml parser.
 */
public class StringChopper {

    /** Scanner states; MARKUP covers {@code <!...>} declarations such as DOCTYPE. */
    private enum State { NONE, TAG, STARTTAG, ENDTAG, CDATA, COMMENT, DECLARATION, MARKUP }

    /** Marker appended to a result that was truncated. */
    private static final String ELLIPSIS = "...";

    private static final String CDATA_OPEN = "![CDATA[";
    private static final String CDATA_CLOSE = "]]>";
    private static final String COMMENT_OPEN = "!--";
    private static final String COMMENT_CLOSE = "-->";
    private static final String SELF_CLOSE = "/>";
    private static final String DECLARATION_CLOSE = "?>";

    /**
     * Truncate input to the given length at the closest space or xml tag. Any
     * xml tags will be closed/balanced.
     *
     * Equivalent to {@code chop(input, length, false)}.
     *
     * @param input
     *            The string that should be truncated, may be null.
     * @param length
     *            max number of text characters to keep.
     * @return The truncated string, or null if input was null
     */
    public static String chop(final String input, final int length) {
        return chop(input, length, false);
    }

    /**
     * Truncate input to the given length, or back to the closest space/tag,
     * depending on chop. Any xml tags left open by the cut will be
     * closed/balanced, and an open CDATA section is terminated.
     *
     * When characters remain after the cut position, "..." is appended (before
     * the closing tags), so a truncated result can be three characters longer
     * than length. Note that this also happens when only markup follows the
     * cut position.
     *
     * A length that is zero or negative is never reached by the character
     * counter, so in that case the text is not truncated at all.
     *
     * @param input
     *            The string that should be truncated, may be null.
     * @param length
     *            max number of text characters to keep (if chopped the string
     *            will be '...' longer than max.)
     * @param chop
     *            true to cut exactly at length, even in the middle of a word;
     *            false to back up to the previous space or markup boundary.
     * @return The truncated string, or null if input was null
     */
    public static String chop(final String input, final int length, final boolean chop) {

        if (input == null) {
            return null;
        }

        // indexes (into s) of the first name character of every element that is still open
        final Deque<Integer> stack = new ArrayDeque<Integer>();
        final char[] s = input.toCharArray();
        // res always mirrors s position by position, so an index into s is also valid for res
        final StringBuilder res = new StringBuilder(s.length);
        State state = State.NONE;
        int count = 0;
        int i = 0;

        main:
        for (; i < s.length; i++) {
            final char c = s[i];
            switch (state) {
                case NONE:
                    if (c == '<') {
                        state = State.TAG;
                    } else {
                        count++;
                        if (count == length) {
                            res.append(c);
                            break main;
                        }
                    }
                    break;

                case TAG:
                    if (c == '/') {
                        state = State.ENDTAG;
                    } else if (c == '!') {
                        if (matchesAt(s, i, CDATA_OPEN)) {
                            state = State.CDATA;
                            res.append(CDATA_OPEN);
                            i += CDATA_OPEN.length() - 1;
                            continue;
                        }
                        if (matchesAt(s, i, COMMENT_OPEN)) {
                            state = State.COMMENT;
                            res.append(COMMENT_OPEN);
                            i += COMMENT_OPEN.length() - 1;
                            continue;
                        }
                        // e.g. <!DOCTYPE ...>: not an element, so it must not end up on the stack
                        state = State.MARKUP;
                    } else if (c == '?') {
                        state = State.DECLARATION;
                    } else {
                        stack.push(i);
                        state = State.STARTTAG;
                    }
                    break;

                case STARTTAG:
                    if (c == '/') {
                        if (matchesAt(s, i, SELF_CLOSE)) {
                            // self closing element, nothing to balance later
                            state = State.NONE;
                            res.append(SELF_CLOSE);
                            i += SELF_CLOSE.length() - 1;
                            if (!stack.isEmpty()) {
                                stack.pop();
                            }
                            continue;
                        }
                    } else if (c == '>') {
                        state = State.NONE;
                    }
                    break;

                case ENDTAG:
                    if (c == '>') {
                        state = State.NONE;
                        if (!stack.isEmpty()) {
                            stack.pop();
                        }
                    }
                    break;

                case CDATA:
                    if (c == ']' && matchesAt(s, i, CDATA_CLOSE)) {
                        state = State.NONE;
                        res.append(CDATA_CLOSE);
                        i += CDATA_CLOSE.length() - 1;
                        continue;
                    }
                    // everything else, including a lone ']', is visible text
                    count++;
                    if (count == length) {
                        res.append(c);
                        break main;
                    }
                    break;

                case COMMENT:
                    if (c == '-' && matchesAt(s, i, COMMENT_CLOSE)) {
                        state = State.NONE;
                        res.append(COMMENT_CLOSE);
                        i += COMMENT_CLOSE.length() - 1;
                        continue;
                    }
                    break;

                case DECLARATION:
                    if (c == '?' && matchesAt(s, i, DECLARATION_CLOSE)) {
                        state = State.NONE;
                        res.append(DECLARATION_CLOSE);
                        i += DECLARATION_CLOSE.length() - 1;
                        continue;
                    }
                    break;

                case MARKUP:
                    if (c == '>') {
                        state = State.NONE;
                    }
                    break;
            }
            res.append(c);
        }

        // the input ended in the middle of a tag: remove the partial tag
        if (state == State.TAG || state == State.STARTTAG
                || state == State.ENDTAG || state == State.MARKUP) {

            res.setLength(res.lastIndexOf("<"));
            if (state == State.STARTTAG) {
                // the partial start tag was pushed when it was opened
                stack.pop();
            }
        }

        // something was cut off: optionally back up to a word/markup boundary, then add the dots
        if (i < s.length - 1) {
            if (!chop) {
                final char boundary = (state == State.CDATA) ? '[' : '>';
                // never walk back over more characters than were counted
                for (int k = i; k > 0 && count > 0; k--, count--) {
                    if (s[k] == ' ' || s[k] == boundary) {
                        res.setLength(k + 1);
                        break;
                    }
                }
            }
            res.append(ELLIPSIS);
        }

        // close CDATA if we are in one
        if (state == State.CDATA) {
            res.append(CDATA_CLOSE);
        }

        // close all other open tags, innermost first
        while (!stack.isEmpty()) {
            res.append("</");
            // the element name ends at '>' or at any whitespace (space, tab, newline...)
            for (int j = stack.pop();
                    j < s.length && s[j] != '>' && !Character.isWhitespace(s[j]);
                    j++) {

                res.append(s[j]);
            }
            res.append('>');
        }

        return res.toString();
    }

    /**
     * Checks if token occurs in s starting at offset. Upper case letters in
     * token also match their lower case form, all other characters must match
     * exactly.
     */
    private static boolean matchesAt(final char[] s, final int offset, final String token) {

        if (offset + token.length() > s.length) {
            return false;
        }
        for (int k = 0; k < token.length(); k++) {
            final char expected = token.charAt(k);
            final char actual = s[offset + k];
            if (actual != expected && actual != Character.toLowerCase(expected)) {
                return false;
            }
        }
        return true;
    }
}