package org.nines;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.universalchardet.UniversalDetector;

/**
 * Cleaner for raw text files. It strips markup, fixes escape sequences
 * and strips bad UTF-8 characters. Errors are added to the supplied
 * {@link ErrorReport}; progress is written to the log. The cleaned file
 * is written to the "fulltext" counterpart of the configured source
 * directory (see {@link #toFullTextFile(File)}).
 *
 * @author loufoster
 */
final class RawTextCleaner {

    private final ErrorReport errorReport;
    private final RDFIndexerConfig config;
    private final Logger log;
    private final UniversalDetector detector;

    /** Encoding detected (or defaulted) for the file most recently passed to fixEncoding. */
    private String fileEncoding;

    // Running statistics accumulated over every call to clean().
    private long totalOrigChars = 0;
    private long totalFilesChanged = 0;
    private long totalCleanedChars = 0;

    /**
     * @param config      indexer configuration (source dir, archive name, default encoding,
     *                    optional custom cleaner class)
     * @param errorReport sink for all errors encountered while cleaning
     */
    public RawTextCleaner(RDFIndexerConfig config, ErrorReport errorReport) {
        this.errorReport = errorReport;
        this.config = config;
        this.log = Logger.getLogger(RawTextCleaner.class.getName());
        this.detector = new UniversalDetector(null);
    }

    /**
     * Clean the specified file and write the results to the fulltext folder.
     * Errors will be added to the <code>errorReport</code>; this method does
     * not throw for I/O failures.
     *
     * @param rawTextFile the raw text file to clean
     */
    public void clean(final File rawTextFile) {

        this.log.info("Clean raw text from file " + rawTextFile);

        // get the filename for the cleaned fulltext file
        File cleanTextFile = toFullTextFile(rawTextFile);

        // ensure that the file is UTF-8 encoded...
        final File srcFile;
        try {
            srcFile = fixEncoding(rawTextFile, cleanTextFile);
        } catch (IOException e) {
            this.errorReport.addError(
                new IndexerError(rawTextFile.toString(), "",
                    "Unable to convert raw text file encoding to UTF-8: " + e.toString()));
            return;
        }

        // ...now read the file
        String content = null;
        InputStreamReader is = null;
        try {
            is = new InputStreamReader(new FileInputStream(srcFile), "UTF-8");
            content = IOUtils.toString(is);
        } catch (Exception e) {
            this.errorReport.addError(
                new IndexerError(rawTextFile.toString(), "",
                    "Unable to read raw text file: " + e.toString()));
            return;
        } finally {
            IOUtils.closeQuietly(is);
        }

        // stats!
        long startChars = content.length();
        this.totalOrigChars += startChars;

        content = TextUtils.stripUnknownUTF8(content, this.errorReport, rawTextFile);

        // Optionally run an archive-specific cleaner, looked up by simple class
        // name in the org.nines.cleaner package. A failure here is reported but
        // does not abort the rest of the cleaning pipeline.
        if (this.config.customCleanClass != null && this.config.customCleanClass.length() > 0) {
            try {
                String className = this.config.customCleanClass;
                Class<?> newClass = Class.forName("org.nines.cleaner." + className);
                ICustomCleaner cleaner = (ICustomCleaner) newClass.newInstance();
                content = cleaner.clean(this.config.archiveName, content);
            } catch (Exception e) {
                errorReport.addError(new IndexerError(rawTextFile.toString(), "",
                    "Unable to run custom cleaner " + this.config.customCleanClass + ": " + e.toString()));
            }
        }

        content = cleanText(content);
        content = TextUtils.stripEscapeSequences(content, errorReport, rawTextFile);
        content = TextUtils.normalizeWhitespace(content);

        long endChars = content.length();
        this.totalCleanedChars += endChars;
        if (endChars != startChars) {
            this.totalFilesChanged++;
        }
        this.log.info(" => Original length: " + startChars
            + ", Cleaned length: " + endChars
            + ", Delta:" + (startChars - endChars));

        // Make sure that the directory structure exists
        if (!ensureParentDirectory(cleanTextFile)) {
            this.errorReport.addError(
                new IndexerError(cleanTextFile.toString(), "",
                    "Unable to create full text directory tree"));
            return;
        }

        // dump the content
        Writer outWriter = null;
        try {
            outWriter = new OutputStreamWriter(new FileOutputStream(cleanTextFile), "UTF-8");
            outWriter.write(content);
            // Flush explicitly: closeQuietly() below would silently swallow a
            // failure to push the buffered bytes to disk.
            outWriter.flush();
        } catch (IOException e) {
            this.errorReport.addError(
                new IndexerError(cleanTextFile.toString(), "",
                    "Unable to write cleaned text file: " + e.toString()));
        } finally {
            IOUtils.closeQuietly(outWriter);
        }
    }

    /**
     * Create the parent directory of <code>file</code> if it does not exist yet.
     *
     * @param file file whose parent directory is required
     * @return true if the parent exists (or was created, or there is no parent);
     *         false if it could not be created
     */
    private boolean ensureParentDirectory(File file) {
        File parent = file.getParentFile();
        return parent == null || parent.exists() || parent.mkdirs();
    }

    /**
     * Make sure the text is available as UTF-8. If the detected encoding is
     * already UTF-8 the raw file itself is returned; otherwise the content is
     * transcoded and written to <code>cleanTextFile</code>, which is returned.
     *
     * @param rawTextFile   the original file
     * @param cleanTextFile destination used when a conversion is necessary
     * @return the file that should be read as UTF-8
     * @throws IOException if the file cannot be read, the encoding is unusable,
     *                     or the converted file cannot be written
     */
    private File fixEncoding(File rawTextFile, File cleanTextFile) throws IOException {

        // Always attempt to detect the encoding of the file.
        // If this is unsuccessful, encoding will be set to
        // the default specified in the config
        this.fileEncoding = detectEncoding(rawTextFile);
        if (this.fileEncoding == null) {
            throw new IOException("No encoding detected and no default encoding configured");
        }

        if (this.fileEncoding.equalsIgnoreCase("UTF-8")) {
            return rawTextFile;
        }

        // it is not utf-8, attempt to convert it!
        this.log.info(" * Converting " + rawTextFile.toString() + " from " + this.fileEncoding + " to UTF-8");

        // read from original encoding into 16-bit unicode
        String nonUtf8Txt;
        FileInputStream in = null;
        try {
            in = new FileInputStream(rawTextFile);
            nonUtf8Txt = IOUtils.toString(in, this.fileEncoding);
        } catch (IllegalArgumentException e) {
            // The detectors can report charset names this JVM does not know;
            // surface that as an I/O problem so the caller reports it.
            throw new IOException("Unsupported encoding: " + this.fileEncoding, e);
        } finally {
            IOUtils.closeQuietly(in);
        }

        // Set up an encoder to translate the data. If bad chars are
        // encountered they are replaced (CodingErrorAction.REPLACE)
        // instead of aborting the conversion.
        CharsetEncoder utf8en = Charset.forName("UTF-8").newEncoder();
        utf8en.onMalformedInput(CodingErrorAction.REPLACE);
        utf8en.onUnmappableCharacter(CodingErrorAction.REPLACE);

        // encode the 16-bit unicode to UTF-8
        ByteBuffer utf8Buffer = utf8en.encode(CharBuffer.wrap(nonUtf8Txt));

        // The destination directory may not exist yet on a first run.
        if (!ensureParentDirectory(cleanTextFile)) {
            throw new IOException("Unable to create full text directory tree for " + cleanTextFile);
        }

        // Write only the valid region of the buffer: the backing array may be
        // larger than the encoded content, and writing all of it would append
        // garbage (NUL) bytes to the file.
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(cleanTextFile);
            fos.write(utf8Buffer.array(),
                utf8Buffer.arrayOffset() + utf8Buffer.position(),
                utf8Buffer.remaining());
        } finally {
            IOUtils.closeQuietly(fos);
        }
        return cleanTextFile;
    }

    /**
     * Detect the character encoding of a file. Tries the UniversalDetector
     * first, then the nsDetector-based fallback, and finally falls back to
     * the default encoding from the configuration.
     *
     * @param testFile file to inspect
     * @return the detected (or default) encoding name
     * @throws IOException if the file cannot be read
     */
    private String detectEncoding(File testFile) throws IOException {

        // feed chunks of data to the detector until it is done
        this.detector.reset();
        byte[] buf = new byte[4096];
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(testFile);
            int nread;
            while ((nread = fis.read(buf)) > 0 && !this.detector.isDone()) {
                this.detector.handleData(buf, 0, nread);
            }
            this.detector.dataEnd();
        } finally {
            IOUtils.closeQuietly(fis);
        }

        // see what it thinks....
        String encoding = detector.getDetectedCharset();
        if (encoding == null) {

            // try an alternate detector
            encoding = alternateEncodeDetect(testFile);
            if (encoding == null) {
                encoding = this.config.defaultEncoding;
                this.log.info(" * Unable to detect encoding, default to: " + encoding);
            }
        }
        return encoding;
    }

    /**
     * Fallback encoding detection using nsDetector.
     *
     * @param testFile file to inspect
     * @return the charset reported to the listener, or null if none was reported
     * @throws IOException if the file cannot be read
     */
    private String alternateEncodeDetect(File testFile) throws IOException {
        nsDetector det = new nsDetector();
        DetectListener listener = new DetectListener();
        det.Init(listener);

        BufferedInputStream imp = null;
        try {
            imp = new BufferedInputStream(new FileInputStream(testFile));
            byte[] buf = new byte[1024];
            int len;
            boolean done = false;
            boolean isAscii = true;

            // Once the detector reports it is done, nothing further would be
            // fed to it, so stop reading early.
            while (!done && (len = imp.read(buf, 0, buf.length)) != -1) {
                if (isAscii) {
                    isAscii = det.isAscii(buf, len);
                }
                if (!isAscii) {
                    done = det.DoIt(buf, len, false);
                }
            }
            det.DataEnd();
        } finally {
            IOUtils.closeQuietly(imp);
        }
        return listener.getEncoding();
    }

    /**
     * Map a raw text file to the location of its cleaned counterpart: the
     * configured source directory with "rawtext" replaced by "fulltext",
     * followed by the safe archive name and the original file name.
     */
    private File toFullTextFile(File rawTextFile) {
        String cleanedFile = this.config.sourceDir.toString().replace("rawtext", "fulltext")
            + "/" + RDFIndexerConfig.safeArchive(this.config.archiveName);
        return new File(cleanedFile + "/" + rawTextFile.getName());
    }

    /** @return number of files whose length changed as a result of cleaning */
    public long getTotalFilesChanged() {
        return this.totalFilesChanged;
    }

    /** @return total number of characters read, before cleaning */
    public long getOriginalLength() {
        return this.totalOrigChars;
    }

    /** @return total number of characters remaining after cleaning */
    public long getCleanedLength() {
        return this.totalCleanedChars;
    }

    /**
     * Strip html-ish markup from text.
     *
     * @param fullText text to clean
     * @return the text with head/script sections and all tags removed
     */
    private String cleanText(String fullText) {

        // remove everything between <head>...</head>
        fullText = removeTag(fullText, "head");

        // remove everything between <script>..</script>
        fullText = removeTag(fullText, "script");

        // remove everything between <...>
        fullText = removeBracketed(fullText, "<", ">");

        // Get rid of non-unix line endings
        fullText = fullText.replaceAll("\r", "");

        // Clean up the file a little bit
        // NOTE(review): as written, the next two calls replace a single space
        // with a single space, i.e. they do nothing. The literals were
        // presumably different originally (tab? double space?) — confirm
        // against version control before changing them.
        fullText = fullText.replaceAll(" ", " ");
        fullText = fullText.replaceAll(" ", " ");
        fullText = fullText.replaceAll(" \n", "\n");
        fullText = fullText.replaceAll("\n ", "\n");

        return fullText;
    }

    /**
     * Remove every span that starts with <code>left</code> and ends with the
     * next <code>right</code>. Each removed span is replaced by a newline,
     * except for the inline tags i, b and em (opening or closing), which are
     * removed without leaving anything behind. An opening delimiter without a
     * matching closing delimiter ends the processing; the remainder is kept.
     *
     * @param fullText text to process
     * @param left     opening delimiter
     * @param right    closing delimiter
     * @return the text with all bracketed spans removed
     */
    private String removeBracketed(String fullText, String left, String right) {
        // Single pass with a builder rather than rebuilding the whole string
        // (and rescanning it from the beginning) for every span removed.
        StringBuilder result = new StringBuilder(fullText.length());
        int copyFrom = 0;
        int start = fullText.indexOf(left);
        while (start != -1) {
            int end = fullText.indexOf(right, start);
            if (end == -1) {
                break;
            }
            result.append(fullText, copyFrom, start);

            String tag = fullText.substring(start + 1, end);
            boolean inline = tag.equals("i") || tag.equals("/i")
                || tag.equals("b") || tag.equals("/b")
                || tag.equals("em") || tag.equals("/em");
            if (!inline) {
                result.append("\n");
            }

            copyFrom = end + right.length();
            start = fullText.indexOf(left, copyFrom);
        }
        result.append(fullText, copyFrom, fullText.length());
        return result.toString();
    }

    /**
     * Remove everything from an opening <code>&lt;tag</code> through the
     * matching <code>&lt;/tag&gt;</code>.
     */
    private String removeTag(String fullText, String tag) {
        return removeBracketed(fullText, "<" + tag, "</" + tag + ">");
    }

    /** Remembers the last charset reported by the nsDetector. */
    private static class DetectListener implements nsICharsetDetectionObserver {

        private String encoding;

        public String getEncoding() {
            return this.encoding;
        }

        @Override
        public void Notify(String charset) {
            this.encoding = charset;
        }
    }
}