package org.xwiki.contrib.mailarchive.utils.internal;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import javax.inject.Inject;
import javax.inject.Named;
import javax.inject.Singleton;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xwiki.component.annotation.Component;
import org.xwiki.contrib.mailarchive.utils.ITextUtils;
import org.xwiki.rendering.parser.StreamParser;
import org.xwiki.rendering.renderer.PrintRendererFactory;
import org.xwiki.rendering.renderer.printer.DefaultWikiPrinter;
import org.xwiki.rendering.renderer.printer.WikiPrinter;

/**
 * Text helpers used by the mail archive: subject comparison, string truncation, hexadecimal conversion, HTML to plain
 * text conversion and decompression of hex-encoded gzip content.
 * 
 * @author jbousque
 */
@Component
@Singleton
public class TextUtils implements ITextUtils
{
    /** Upper-case hexadecimal digits, indexed by their numeric value. */
    private static final String HEX_DIGITS = "0123456789ABCDEF";

    /** Matches a subject starting with a single "Re:" or "Fw:" marker (any case); group 2 is the remainder. */
    private static final Pattern REPLY_FORWARD_PREFIX = Pattern.compile("^([Rr][Ee]:|[Ff][Ww]:)(.*)$");

    /** Maximum averaged Levenshtein distance under which two subjects are considered similar. */
    private static final double SIMILARITY_THRESHOLD = 0.25;

    /** Logger used as long as no other logger has been provided through {@link #setLogger(Logger)}. */
    private static final Logger DEFAULT_LOGGER = LoggerFactory.getLogger(TextUtils.class);

    /** Logger shared by all instances; never null. */
    private static Logger logger = DEFAULT_LOGGER;

    /**
     * The component used to parse XHTML obtained after cleaning, when transformations are not executed.
     */
    @Inject
    @Named("xhtml/1.0")
    private StreamParser htmlStreamParser;

    /** Factory of the plain text renderer that receives the events of the parsed HTML. */
    @Inject
    @Named("plain/1.0")
    private PrintRendererFactory printRendererFactory;

    public TextUtils()
    {
    }

    /**
     * @return the logger currently used by this class
     */
    public Logger getLogger()
    {
        return logger;
    }

    /**
     * Replaces the logger shared by all instances of this class.
     * 
     * @param loggr the logger to use, or null to restore the default logger of this class
     */
    public static void setLogger(final Logger loggr)
    {
        logger = loggr != null ? loggr : DEFAULT_LOGGER;
    }

    /**
     * Returns the Levenshtein distance between two strings, averaged by the length of the largest string provided, in
     * order to return a value n so that 0 <= n <= 1.
     * 
     * @param s first string, must not be null
     * @param t second string, must not be null
     * @return the averaged distance, 0 when both strings are empty
     * @throws IllegalArgumentException if one of the strings is null
     */
    @Override
    public double getAveragedLevenshteinDistance(final String s, final String t)
    {
        // Evaluated first on purpose: rejects null input with an IllegalArgumentException.
        int distance = StringUtils.getLevenshteinDistance(s, t);
        int longest = Math.max(s.length(), t.length());
        if (longest == 0) {
            // Two empty strings are identical; dividing 0 by 0 would give NaN.
            return 0.0;
        }
        return (double) distance / (double) longest;
    }

    /**
     * Compare 2 strings for similarity. A leading "Re:" or "Fw:" marker is removed from each subject first, then the
     * subjects are considered similar enough if:<br/>
     * - they are equal, or<br/>
     * - they have an averaged Levenshtein distance <= 25%, or<br/>
     * - one of them begins with the other.<br/>
     * If exactly one of the stripped subjects is empty they are considered different.
     * 
     * @param s1 first subject, may be null
     * @param s2 second subject, may be null
     * @return true if the subjects are similar; two null subjects are similar, a null and a non-null one are not
     */
    @Override
    public boolean similarSubjects(final String s1, final String s2)
    {
        logger.debug("similarSubjects : comparing [{}] and [{}]", s1, s2);
        if (s1 == null || s2 == null) {
            boolean bothNull = s1 == null && s2 == null;
            logger.debug("similarSubjects : at least one subject is null, similar={}", Boolean.valueOf(bothNull));
            return bothNull;
        }

        String s1Replaced = REPLY_FORWARD_PREFIX.matcher(s1).replaceAll("$2");
        String s2Replaced = REPLY_FORWARD_PREFIX.matcher(s2).replaceAll("$2");
        logger.debug("similarSubjects : comparing [{}] and [{}]", s1Replaced, s2Replaced);

        if (s1Replaced.equals(s2Replaced)) {
            logger.debug("similarSubjects : subjects are equal");
            return true;
        }
        if (s1Replaced.length() == 0 || s2Replaced.length() == 0) {
            logger.debug("similarSubjects : one subject is empty, we consider them different");
            return false;
        }

        double d = getAveragedLevenshteinDistance(s1Replaced, s2Replaced);
        logger.debug("similarSubjects : Levenshtein distance d={}", Double.valueOf(d));
        if (d <= SIMILARITY_THRESHOLD) {
            logger.debug("similarSubjects : subjects are considered similar because d <= 0.25");
            return true;
        }

        if (s1Replaced.startsWith(s2Replaced) || s2Replaced.startsWith(s1Replaced)) {
            logger.debug("similarSubjects : subjects are considered similar because one start with the other");
            return true;
        }
        return false;
    }

    /**
     * Truncates a string so that it holds at most "maxChars" characters and its UTF-8 encoding takes at most
     * "maxBytes" bytes. The longest prefix satisfying both limits is returned, and a surrogate pair is never cut in
     * half.
     * 
     * @param s the string to truncate, may be null
     * @param maxChars maximum number of characters (UTF-16 code units) to keep
     * @param maxBytes maximum number of bytes of the UTF-8 encoded result
     * @return the truncated string, or an empty string if s is null or empty or if a limit is not positive
     */
    @Override
    public String truncateStringForBytes(final String s, final int maxChars, final int maxBytes)
    {
        if (StringUtils.isEmpty(s) || maxChars <= 0 || maxBytes <= 0) {
            return "";
        }

        int charLimit = Math.min(s.length(), maxChars);
        int end = 0;
        int byteCount = 0;
        // Single pass over the code points: stop as soon as the next one would exceed either limit.
        while (end < charLimit) {
            int codePoint = s.codePointAt(end);
            int width = Character.charCount(codePoint);
            if (end + width > charLimit) {
                break;
            }
            int encodedLength = utf8Length(codePoint);
            if (byteCount + encodedLength > maxBytes) {
                break;
            }
            byteCount += encodedLength;
            end += width;
        }

        logger.debug("String truncated to {} characters, resulting in {} bytes array", Integer.valueOf(end),
            Integer.valueOf(byteCount));
        return s.substring(0, end);
    }

    /**
     * Number of bytes taken by a code point once encoded in UTF-8.
     * 
     * @param codePoint the code point to measure
     * @return 1 to 4; an unpaired surrogate counts for 1 because String.getBytes replaces it with a single '?'
     */
    private static int utf8Length(final int codePoint)
    {
        if (codePoint < 0x80) {
            return 1;
        }
        if (codePoint < 0x800) {
            return 2;
        }
        if (codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE) {
            return 1;
        }
        if (codePoint < 0x10000) {
            return 3;
        }
        return 4;
    }

    /**
     * Truncates a value meant to be stored as a short string.
     * 
     * @param s the string to truncate, may be null
     * @return s if it holds at most SHORT_STRINGS_MAX_LENGTH characters, otherwise its first
     *         SHORT_STRINGS_MAX_LENGTH - 1 characters; null if s is null
     */
    @Override
    public String truncateForString(final String s)
    {
        if (s == null) {
            return null;
        }
        // NOTE(review): an over-long value is cut one character below the limit - kept as is, confirm it is intended.
        if (s.length() > SHORT_STRINGS_MAX_LENGTH) {
            return s.substring(0, SHORT_STRINGS_MAX_LENGTH - 1);
        }
        return s;
    }

    /**
     * Truncates a value meant to be stored as a large string.
     * 
     * @param s the string to truncate, may be null
     * @return s if it holds at most LONG_STRINGS_MAX_LENGTH characters, otherwise its first LONG_STRINGS_MAX_LENGTH - 1
     *         characters; null if s is null
     */
    @Override
    public String truncateForLargeString(final String s)
    {
        if (s == null) {
            return null;
        }
        // NOTE(review): an over-long value is cut one character below the limit - kept as is, confirm it is intended.
        if (s.length() > LONG_STRINGS_MAX_LENGTH) {
            return s.substring(0, LONG_STRINGS_MAX_LENGTH - 1);
        }
        return s;
    }

    // FIXME: find equivalent methods in xwiki utilities libraries
    /**
     * Converts an upper-case hexadecimal digit to its numeric value.
     * 
     * @param c the character to convert
     * @return a value from 0 to 15, or -1 if c is not one of 0-9 or A-F (lower-case letters are not accepted)
     */
    @Override
    public byte charToByte(final char c)
    {
        return (byte) HEX_DIGITS.indexOf(c);
    }

    // FIXME: find equivalent methods in xwiki utilities libraries
    /**
     * BD : Used to transfer hex string into byte array. Two hex characters combine into one byte, so the length of
     * the hex string must be even. Letters are accepted in both cases.
     * 
     * @param hexStr the hexadecimal string to decode
     * @return the decoded bytes, or null if hexStr is null, empty, of odd length, or contains a character that is not
     *         a hexadecimal digit
     */
    @Override
    public byte[] hex2byte(final String hexStr)
    {
        if (hexStr == null || hexStr.isEmpty() || hexStr.length() % 2 != 0) {
            return null;
        }

        byte[] resultByte = new byte[hexStr.length() / 2];
        for (int i = 0; i < resultByte.length; i++) {
            int pos = i * 2;
            int high = charToByte(Character.toUpperCase(hexStr.charAt(pos)));
            int low = charToByte(Character.toUpperCase(hexStr.charAt(pos + 1)));
            if (high < 0 || low < 0) {
                // Not a hexadecimal digit: combining the -1 sentinel would silently produce a garbage byte.
                return null;
            }
            resultByte[i] = (byte) ((high << 4) | low);
        }
        return resultByte;
    }

    // FIXME: find equivalent methods in xwiki utilities libraries
    /**
     * BD : Used to transfer byte array into hex string.
     * 
     * @param b the bytes to encode
     * @return the upper-case hexadecimal representation, two characters per byte; null if b is null
     */
    @Override
    public String byte2hex(final byte[] b)
    {
        if (b == null) {
            return null;
        }

        StringBuilder hexStr = new StringBuilder(b.length * 2);
        for (byte aB : b) {
            hexStr.append(HEX_DIGITS.charAt((aB >> 4) & 0x0F));
            hexStr.append(HEX_DIGITS.charAt(aB & 0x0F));
        }
        return hexStr.toString();
    }

    /**
     * Converts HTML content to plain text by parsing it with the injected HTML parser and rendering it with the
     * injected plain text renderer.
     * 
     * @param htmlcontent the HTML to convert, may be null
     * @return the plain text, or null if htmlcontent is null or if the conversion failed
     */
    @Override
    public String htmlToPlainText(final String htmlcontent)
    {
        if (htmlcontent == null) {
            return null;
        }

        try {
            WikiPrinter printer = new DefaultWikiPrinter();
            htmlStreamParser.parse(new StringReader(htmlcontent), printRendererFactory.createRenderer(printer));
            return printer.toString();
        } catch (Throwable t) {
            // Deliberately broad: the conversion is best effort and must not abort the caller on malformed content.
            logger.warn("Conversion from HTML to plain text thrown exception", t);
            return null;
        }
    }

    /**
     * Decompresses a gzip content provided as a hexadecimal string, and decodes it as UTF-8 text. Every line of the
     * result is terminated by a single "\n", whatever the original line terminators were.
     * 
     * @param zippedString the hexadecimal representation of the gzip compressed content
     * @return the decompressed text, or an empty string if zippedString is null or empty
     * @throws IOException if zippedString is not a valid hexadecimal string or not valid gzip content
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    @Override
    public String unzipString(final String zippedString) throws IOException, UnsupportedEncodingException
    {
        if (StringUtils.isEmpty(zippedString)) {
            return "";
        }

        byte[] compressed = hex2byte(zippedString);
        if (compressed == null) {
            throw new IOException("Cannot unzip string: input is not a valid hexadecimal string");
        }

        InputStream is = new ByteArrayInputStream(compressed);
        GZIPInputStream zis = new GZIPInputStream(is);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(zis, "UTF-8"));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append("\n");
            }
            return sb.toString();
        } finally {
            zis.close();
        }
    }
}