package org.nines;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.universalchardet.UniversalDetector;
/**
 * Cleaner for raw text files. It converts the file to UTF-8 when needed,
 * strips unknown UTF-8 characters, removes markup, fixes escape sequences
 * and normalizes whitespace. Errors are added to the {@code ErrorReport};
 * progress is written to the log. The cleaned file is written to the
 * fulltext area that mirrors the rawtext source directory.
 *
 * @author loufoster
 */
final class RawTextCleaner {

    private final ErrorReport errorReport;
    private final RDFIndexerConfig config;
    private final Logger log;
    /** Encoding detected for the most recently processed file. */
    private String fileEncoding;
    /** Running total of characters read (before cleaning) across all files. */
    private long totalOrigChars = 0;
    /** Number of files whose length changed as a result of cleaning. */
    private long totalFilesChanged = 0;
    /** Running total of characters remaining after cleaning across all files. */
    private long totalCleanedChars = 0;
    private final UniversalDetector detector;

    /**
     * Create a cleaner.
     *
     * @param config indexer configuration (archive name, source dir, default encoding, custom cleaner)
     * @param errorReport sink for errors encountered while cleaning
     */
    public RawTextCleaner( RDFIndexerConfig config, ErrorReport errorReport ) {
        this.errorReport = errorReport;
        this.config = config;
        this.log = Logger.getLogger(RawTextCleaner.class.getName());
        this.detector = new UniversalDetector(null);
    }

    /**
     * Clean the specified file and write the results to the fulltext folder.
     * Errors will be added to the <code>errorReport</code>; this method does
     * not throw on I/O failures.
     *
     * @param rawTextFile the raw text file to clean
     */
    public void clean( final File rawTextFile ) {
        this.log.info("Clean raw text from file "+rawTextFile);

        // get the filename for the cleaned fulltext file
        File cleanTextFile = toFullTextFile(rawTextFile);

        // ensure that the file is UTF-8 encoded...
        File srcFile = rawTextFile;
        try {
            srcFile = fixEncoding(rawTextFile, cleanTextFile);
        } catch (IOException e) {
            this.errorReport.addError(
                new IndexerError(rawTextFile.toString(), "", "Unable to convert raw text file encoding to UTF-8: " + e.toString()));
            return;
        }

        // ...now read the file
        String content = null;
        InputStreamReader is = null;
        try {
            is = new InputStreamReader(new FileInputStream(srcFile), "UTF-8");
            content = IOUtils.toString(is);
        } catch ( Exception e ) {
            this.errorReport.addError(
                new IndexerError(rawTextFile.toString(), "", "Unable to read raw text file: " + e.toString()));
            return;
        } finally {
            IOUtils.closeQuietly(is);
        }

        // stats!
        long startChars = content.length();
        this.totalOrigChars += startChars;

        content = TextUtils.stripUnknownUTF8(content, this.errorReport, rawTextFile);

        // optional archive-specific cleaner, looked up by simple class name
        if ( this.config.customCleanClass != null && this.config.customCleanClass.length() > 0) {
            try {
                String className = this.config.customCleanClass;
                Class<?> newClass = Class.forName("org.nines.cleaner."+className);
                ICustomCleaner cleaner = (ICustomCleaner)newClass.newInstance();
                content = cleaner.clean(this.config.archiveName, content);
            } catch (Exception e) {
                // a failing custom cleaner is reported but does not abort the standard cleaning
                errorReport.addError(new IndexerError(rawTextFile.toString(), "", "Unable to run custom cleaner "
                    + this.config.customCleanClass +": " + e.toString()));
            }
        }

        content = cleanText( content );
        content = TextUtils.stripEscapeSequences(content, errorReport, rawTextFile);
        content = TextUtils.normalizeWhitespace(content);

        long endChars = content.length();
        this.totalCleanedChars += endChars;
        if ( endChars != startChars ) {
            this.totalFilesChanged++;
        }
        this.log.info(" => Original length: "+startChars+", Cleaned length: "+endChars+", Delta:"+(startChars - endChars) );

        // Make sure that the directory structure exists
        if ( cleanTextFile.getParentFile().exists() == false) {
            if ( cleanTextFile.getParentFile().mkdirs() == false ) {
                this.errorReport.addError(
                    new IndexerError(cleanTextFile.toString(), "", "Unable to create full text directory tree"));
                return;
            }
        }

        // dump the content
        Writer outWriter = null;
        try {
            outWriter = new OutputStreamWriter(new FileOutputStream(cleanTextFile), "UTF-8");
            outWriter.write( content );
            // flush explicitly: the writer buffers, and closeQuietly() below
            // would silently swallow a failure that only shows up on close
            outWriter.flush();
        } catch (IOException e) {
            this.errorReport.addError(
                new IndexerError(cleanTextFile.toString(), "", "Unable to write cleaned text file: " + e.toString()));
        } finally {
            IOUtils.closeQuietly(outWriter);
        }
    }

    /**
     * Detect the encoding of the raw file and, when it is not UTF-8, write a
     * UTF-8 transcoded copy to <code>cleanTextFile</code>.
     *
     * @param rawTextFile the original file
     * @param cleanTextFile destination used for the transcoded copy
     * @return the file to read as UTF-8: <code>rawTextFile</code> when it is
     *         already UTF-8, otherwise <code>cleanTextFile</code>
     * @throws IOException if the file cannot be read or written, the encoding
     *         is unknown/unsupported, or the destination directory cannot be created
     */
    private File fixEncoding(File rawTextFile, File cleanTextFile) throws IOException {

        // Always attempt to detect the encoding of the file.
        // If this is unsuccessful, encoding will be set to
        // the default specified in the config
        this.fileEncoding = detectEncoding(rawTextFile);

        if ("UTF-8".equalsIgnoreCase(this.fileEncoding)) {
            return rawTextFile;
        }

        // Report a missing or unsupported charset as an IOException so the
        // caller records it in the error report rather than failing with an
        // unchecked exception.
        if (!isSupportedCharset(this.fileEncoding)) {
            throw new IOException("Unsupported or unknown encoding: " + this.fileEncoding);
        }

        // it is not utf-8, attempt to convert it!
        this.log.info(" * Converting " + rawTextFile.toString() + " from " + this.fileEncoding + " to UTF-8");

        // read from original encoding into 16-bit unicode
        String nonUtf8Txt;
        FileInputStream rawIn = new FileInputStream(rawTextFile);
        try {
            nonUtf8Txt = IOUtils.toString(rawIn, this.fileEncoding);
        } finally {
            IOUtils.closeQuietly(rawIn);
        }

        // setup encoder to translate the data. If bad chars are
        // encountered they are replaced with the encoder's replacement bytes
        CharsetEncoder utf8en = Charset.forName("UTF-8").newEncoder();
        utf8en.onMalformedInput(CodingErrorAction.REPLACE);
        utf8en.onUnmappableCharacter(CodingErrorAction.REPLACE);

        // encode the 16-bit unicode to UTF-8
        ByteBuffer utf8Buffer = utf8en.encode(CharBuffer.wrap(nonUtf8Txt));

        // the converted copy is written before clean() creates the fulltext
        // directory, so make sure the destination directory exists here
        File parentDir = cleanTextFile.getParentFile();
        if (parentDir != null && !parentDir.exists() && !parentDir.mkdirs()) {
            throw new IOException("Unable to create full text directory tree: " + parentDir);
        }

        FileOutputStream fos = new FileOutputStream(cleanTextFile);
        try {
            // write only the encoded region; the backing array may be larger
            // than the buffer's limit and would otherwise add garbage bytes
            fos.write(utf8Buffer.array(),
                utf8Buffer.arrayOffset() + utf8Buffer.position(),
                utf8Buffer.remaining());
            fos.close();
        } finally {
            IOUtils.closeQuietly(fos);
        }
        return cleanTextFile;
    }

    /**
     * Check whether a charset name can be used by this JVM.
     *
     * @param name charset name, may be null
     * @return true only if the name is non-null, legal and supported
     */
    private static boolean isSupportedCharset(String name) {
        try {
            return name != null && Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            // thrown for syntactically illegal charset names
            return false;
        }
    }

    /**
     * Detect the encoding of a file. The primary detector is tried first,
     * then the alternate detector, and finally the configured default.
     *
     * @param testFile file to examine
     * @return detected encoding name, or the configured default encoding
     * @throws IOException if the file cannot be read
     */
    private String detectEncoding(File testFile) throws IOException {

        // feed chunks of data to the detector until it is done
        this.detector.reset();
        byte[] buf = new byte[4096];
        FileInputStream fis = new FileInputStream(testFile);
        try {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !this.detector.isDone()) {
                this.detector.handleData(buf, 0, nread);
            }
        } finally {
            IOUtils.closeQuietly(fis);
        }
        this.detector.dataEnd();

        // see what it thinks....
        String encoding = this.detector.getDetectedCharset();
        if (encoding == null) {
            // try an alternate detector
            encoding = alternateEncodeDetect(testFile);
            if ( encoding == null ){
                encoding = this.config.defaultEncoding;
                this.log.info(" * Unable to detect encoding, default to: "+encoding);
            }
        }
        return encoding;
    }

    /**
     * Second-chance encoding detection using the nsDetector.
     *
     * @param testFile file to examine
     * @return the charset reported to the listener, or null if none was reported
     * @throws IOException if the file cannot be read
     */
    private String alternateEncodeDetect(File testFile) throws IOException {
        nsDetector det = new nsDetector();
        DetectListener listener = new DetectListener();
        det.Init( listener );

        BufferedInputStream imp = new BufferedInputStream(new FileInputStream(testFile));
        try {
            byte[] buf = new byte[1024];
            int len;
            boolean done = false;
            boolean isAscii = true;
            // stop reading once the detector reports it is done; later
            // chunks were ignored by the detector anyway
            while (!done && (len = imp.read(buf, 0, buf.length)) != -1) {
                if (isAscii) {
                    isAscii = det.isAscii(buf, len);
                }
                if (!isAscii) {
                    done = det.DoIt(buf, len, false);
                }
            }
            det.DataEnd();
        } finally {
            IOUtils.closeQuietly(imp);
        }
        return listener.getEncoding();
    }

    /**
     * Map a raw text file to its destination under the fulltext tree:
     * the source dir with "rawtext" replaced by "fulltext", then the
     * safe archive name, then the original file name.
     *
     * @param rawTextFile the raw text file
     * @return the destination file for the cleaned text
     */
    private File toFullTextFile(File rawTextFile) {
        String cleanedFile = this.config.sourceDir.toString().replace("rawtext", "fulltext")
            + "/" + RDFIndexerConfig.safeArchive(this.config.archiveName);
        return new File(cleanedFile +"/" + rawTextFile.getName());
    }

    /** @return number of files whose length changed during cleaning */
    public long getTotalFilesChanged() {
        return this.totalFilesChanged;
    }

    /** @return total number of characters read before cleaning */
    public long getOriginalLength() {
        return this.totalOrigChars;
    }

    /** @return total number of characters remaining after cleaning */
    public long getCleanedLength() {
        return this.totalCleanedChars;
    }

    /**
     * Strip html-ish markup from text.
     *
     * @param fullText the text to clean
     * @return the text with head/script sections and all bracketed tags removed,
     *         carriage returns dropped and spaces around newlines trimmed
     */
    private String cleanText( String fullText ) {
        // remove everything between <head>...</head>
        fullText = removeTag(fullText, "head");

        // remove everything between <script>..</script>
        fullText = removeTag(fullText, "script");

        // remove everything between <...>
        fullText = removeBracketed(fullText, "<", ">");

        // Get rid of non-unix line endings
        fullText = fullText.replaceAll("\r", "");

        // Clean up the file a little bit
        // NOTE(review): the next two replacements read identically here;
        // presumably they once targeted different whitespace sequences
        // (e.g. a non-breaking or doubled space) - confirm against history.
        fullText = fullText.replaceAll(" ", " ");
        fullText = fullText.replaceAll(" ", " ");
        fullText = fullText.replaceAll(" \n", "\n");
        fullText = fullText.replaceAll("\n ", "\n");
        return fullText;
    }

    /**
     * Remove every region that starts with <code>left</code> and ends with the
     * next <code>right</code>. Each removed region is replaced by a newline,
     * except the inline tags i, b and em (and their closing forms), which are
     * removed without a replacement. An opening delimiter with no matching
     * closing delimiter ends processing; the remaining text is kept as is.
     * <p>
     * Implemented as a single forward pass. This matches restarting the search
     * from the beginning after every removal as long as a removal cannot
     * create a new <code>left</code> match across the join, which holds for
     * the delimiters used by this class.
     *
     * @param fullText text to process
     * @param left opening delimiter
     * @param right closing delimiter
     * @return text with all delimited regions removed
     */
    private String removeBracketed(String fullText, String left, String right) {
        StringBuilder cleaned = new StringBuilder(fullText.length());
        int copyFrom = 0;
        int start = fullText.indexOf(left);
        while (start != -1) {
            int end = fullText.indexOf(right, start);
            if (end == -1) {
                // unterminated region: keep the rest untouched
                break;
            }
            cleaned.append(fullText, copyFrom, start);
            String tag = fullText.substring(start+1, end);
            if (!isInlineTag(tag)) {
                cleaned.append("\n");
            }
            copyFrom = end + right.length();
            start = fullText.indexOf(left, copyFrom);
        }
        cleaned.append(fullText, copyFrom, fullText.length());
        return cleaned.toString();
    }

    /**
     * @param tag tag content without the surrounding brackets
     * @return true for inline formatting tags that are removed without inserting a newline
     */
    private static boolean isInlineTag(String tag) {
        return tag.equals("i") || tag.equals("/i") ||
            tag.equals("b") || tag.equals("/b") ||
            tag.equals("em") || tag.equals("/em");
    }

    /**
     * Remove everything from the opening of <code>tag</code> through its closing tag.
     *
     * @param fullText text to process
     * @param tag tag name without brackets
     * @return text with the tagged sections removed
     */
    private String removeTag(String fullText, String tag) {
        return removeBracketed(fullText, "<" + tag, "</" + tag + ">");
    }

    /** Records the charset name reported by the nsDetector. */
    private static class DetectListener implements nsICharsetDetectionObserver {
        private String encoding;

        /** @return the last reported charset, or null if none was reported */
        public String getEncoding() {
            return this.encoding;
        }

        public void Notify(String charset) {
            this.encoding = charset;
        }
    }
}