package org.nines; import java.io.File; import org.apache.commons.lang.StringEscapeUtils; public class TextUtils { /** * Normalize whitespace; collapse into one space/tab and one linefeed * @param srcText * @return */ public static String normalizeWhitespace(final String srcText) { String[] lines = srcText.replaceAll("\n+", "\n").split("\n"); StringBuffer out = new StringBuffer(); for ( int i =0; i<lines.length; i++) { String line = lines[i]; line = line.replaceAll("\t", " "); line = line.replaceAll(" +", " "); line = line.trim(); if ( line.length() > 0) { out.append( line ).append("\n"); } } return out.toString().trim(); } /** * Remove unknown UTF-8 characters (0xFFFD) and log warnings for each * @param value * @return */ public static String stripUnknownUTF8(final String value, ErrorReport errorReport, final File file) { return stripUnknownUTF8(value, errorReport, file, null); } public static String stripUnknownUTF8(final String value, ErrorReport errorReport, final String url) { return stripUnknownUTF8(value, errorReport, null, url); } public static String stripUnknownUTF8(final String value, ErrorReport errorReport, final File file, final String url) { String fileName = ""; if (file != null) { fileName = file.toString(); } // Look for unknown character and warn int curPos= 0; while ( true ) { int pos = value.indexOf("\ufffd", curPos); if (pos == -1) { break; } curPos = pos+1; String snip = value.substring(Math.max(0, pos-25), Math.min(value.length(), pos+25)); errorReport.addError(new IndexerError(fileName, url, "Removed invalid UTF-8 character at position " + pos + " of field text" + "\n Snippet: ["+snip+"]")); } return value.replaceAll("\ufffd", ""); } /** * Unescape all sequences. Check for invalid remaining sequences and strip them out. * @param srcText The unclean text. * @return Cleaned text! */ public static String stripEscapeSequences(final String srcText, ErrorReport errorReport, final String uri) { return stripEscapeSequences(srcText, errorReport, null, uri); } public static String stripEscapeSequences(final String srcText, ErrorReport errorReport, final File file) { return stripEscapeSequences(srcText, errorReport, file, null); } public static String stripEscapeSequences(final String srcText, ErrorReport errorReport, final File file, final String uri) { String fileName = ""; if (file != null) { fileName = file.toString(); } String cleaned = StringEscapeUtils.unescapeXml(srcText); int startPos = 0; while (true) { int pos = cleaned.indexOf("&#", startPos); if (pos == -1) { break; } else { // look for a trainling ; to end the sequence int pos2 = cleaned.indexOf(";", pos); if (pos2 > -1) { // this is likely an escape sequence if (pos2 <= pos + 6) { // dump the bad sequence String bad = cleaned.substring(pos, pos2 + 1); cleaned = cleaned.replaceAll(bad, ""); errorReport.addError(new IndexerError(fileName, uri, "Removed potentially invalid escape sequece [" + bad + "]")); startPos = pos; } else { // no close ; found. Just skip over the &# startPos = pos + 2; } } else { // NO ; found - skip over the &# startPos = pos + 2; } } } return cleaned; } }