package org.open2jam.parsers.utils;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

/**
 * Wrapper around Mozilla's chardet algorithm.
 *
 * <p>Instances cannot be created from outside; use one of the static
 * {@code analyze} methods. Each call builds its own detector, feeds it the
 * data and returns the charset name that was reported, or {@code "US-ASCII"}
 * when the detector reported nothing.
 *
 * @author fox
 */
public class CharsetDetector implements nsICharsetDetectionObserver {

    /** Size of the chunks read from a stream and handed to the detector. */
    private static final int CHUNK_SIZE = 1024;

    /** Charset name to return; overwritten by {@link #Notify(String)}. */
    private String charset;

    /** Underlying detector, created in {@link #start()}. */
    private nsDetector det;

    /** Set once {@code nsDetector.DoIt} has returned true. */
    private boolean done = false;

    /** True as long as every chunk fed so far passed {@code nsDetector.isAscii}. */
    private boolean isAscii = true;

    private CharsetDetector() {}

    /**
     * Prepares this instance for a detection run: resets the state flags,
     * sets the default answer to {@code "US-ASCII"} and registers this object
     * as the observer of a fresh detector.
     */
    void start() {
        charset = "US-ASCII";
        done = false;
        isAscii = true;
        det = new nsDetector(nsPSMDetector.ALL);
        det.Init(this);
    }

    /**
     * Feeds the whole array to the detector.
     *
     * @param b data to analyze; every byte of the array is used
     * @return true if done
     */
    boolean feed(byte[] b) {
        return feed(b, b.length);
    }

    /**
     * Feeds the first {@code len} bytes of {@code b} to the detector.
     *
     * @param b   buffer holding the data
     * @param len number of valid bytes at the start of {@code b}
     * @return true if done
     */
    private boolean feed(byte[] b, int len) {
        // Once a chunk fails the ASCII check the flag stays false and the
        // ASCII check is skipped for all later chunks.
        if (isAscii) {
            isAscii = det.isAscii(b, len);
        }
        // Only non-ASCII data is handed to the detector proper, and only
        // until it reports that it is done.
        if (!isAscii && !done) {
            done = det.DoIt(b, len, false);
        }
        return done;
    }

    /**
     * Signals the end of the data to the detector and returns the answer.
     *
     * @return the charset name passed to {@link #Notify(String)}, or
     *         {@code "US-ASCII"} if it was never called
     */
    String result() {
        det.DataEnd();
        return charset;
    }

    /**
     * Observer callback; stores the charset name it is given.
     *
     * @param string charset name reported by the detector
     */
    @Override
    public void Notify(String string) {
        charset = string;
    }

    /**
     * Detects the charset of the data in a stream.
     *
     * <p>The stream is read in chunks until it ends or the detector is done.
     * The stream is not closed by this method.
     *
     * @param in stream to read from
     * @return the detected charset name, {@code "US-ASCII"} by default
     * @throws java.io.IOException if reading from the stream fails
     */
    public static String analyze(InputStream in) throws java.io.IOException {
        CharsetDetector c = new CharsetDetector();
        c.start();
        BufferedInputStream imp = new BufferedInputStream(in);
        byte[] buf = new byte[CHUNK_SIZE];
        int len;
        while ((len = imp.read(buf, 0, buf.length)) != -1) {
            // Feed only the bytes actually read; the rest of the buffer still
            // holds data from a previous iteration.
            if (c.feed(buf, len)) {
                break;
            }
        }
        return c.result();
    }

    /**
     * Detects the charset of a file's contents.
     *
     * @param f file to read
     * @return the detected charset name, {@code "US-ASCII"} by default
     * @throws java.io.IOException if the file cannot be opened or read
     */
    public static String analyze(File f) throws java.io.IOException {
        InputStream in = new FileInputStream(f);
        try {
            return analyze(in);
        } finally {
            // The stream is opened here, so it must be closed here.
            in.close();
        }
    }

    /**
     * Detects the charset of an in-memory byte array, fed in one piece.
     *
     * @param buf data to analyze
     * @return the detected charset name, {@code "US-ASCII"} by default
     */
    public static String analyze(byte[] buf) {
        CharsetDetector c = new CharsetDetector();
        c.start();
        c.feed(buf);
        return c.result();
    }
}