package uk.bl.wa.tika.detect;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import org.apache.log4j.Logger;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.HashMap;
/**
 * Tika {@link Detector} that guesses the programming/markup language of a
 * text snippet by running highlight.js (loaded from the classpath resource
 * {@code /highlight-rhino.js}) inside a JSR-223 script engine, and maps the
 * winning highlight.js language name to a MIME type.
 *
 * <p>Loading this class also registers every mapped MIME type (except
 * {@code application/octet-stream}) as a sub-type of {@code text/plain} in
 * Tika's default {@link MediaTypeRegistry}.
 *
 * <p>Not thread-safe: each identification passes data through global
 * variables ({@code snippet}, {@code hlid}, {@code language},
 * {@code relevance}) of the single shared script engine.
 */
public class HighlightJSDetector implements Detector {

    private static final Logger log = Logger.getLogger(HighlightJSDetector.class.getName());

    /** Serialization version. */
    private static final long serialVersionUID = 7717077504684536253L;

    /**
     * The number of bytes from the beginning of the document stream
     * that are handed to highlight.js for identification.
     */
    private static final int DEFAULT_NUMBER_OF_BYTES_TO_TEST = 10*1024;

    /** Upper bound on the bytes read (and the mark read-limit) per detection. */
    private final int bytesToTest;

    // Script engines are not serializable, so they are rebuilt in readObject().
    private transient ScriptEngineManager manager;

    /** Engine with highlight.js loaded, or null if initialisation failed. */
    private transient ScriptEngine engine;

    /** highlight.js language name -> MIME type string. */
    private static final HashMap<String,String> mimeMap;

    static {
        mimeMap = new HashMap<String,String>();
        mimeMap.put("unknown", "application/octet-stream");
        mimeMap.put("1c", "application/x-1c");
        mimeMap.put("actionscript", "application/x-actionscript");
        mimeMap.put("apache", "application/x-apache");
        mimeMap.put("avrasm", "application/x-avrasm");
        mimeMap.put("axapta", "application/x-axapta");
        mimeMap.put("bash", "application/x-bash");
        mimeMap.put("coffeescript", "application/x-coffeescript");
        mimeMap.put("cpp", "application/x-cpp");
        mimeMap.put("cs", "application/x-cs");
        mimeMap.put("css", "text/css");
        mimeMap.put("d", "application/x-d");
        mimeMap.put("delphi", "application/x-delphi");
        mimeMap.put("diff", "application/x-diff");
        mimeMap.put("xml", "application/xml");
        mimeMap.put("dos", "application/x-dos");
        mimeMap.put("erlang-repl", "application/x-erlang-repl");
        mimeMap.put("erlang", "application/x-erlang");
        mimeMap.put("glsl", "application/x-glsl");
        mimeMap.put("go", "application/x-go");
        mimeMap.put("haskell", "application/x-haskell");
        mimeMap.put("http", "message/http");// n.b. message/http;msgtype=response or request
        mimeMap.put("ini", "application/x-ini");
        mimeMap.put("java", "application/x-java");
        mimeMap.put("javascript", "application/javascript");
        mimeMap.put("json", "application/x-json");
        mimeMap.put("lisp", "application/x-lisp");
        mimeMap.put("lua", "application/x-lua");
        mimeMap.put("markdown", "application/x-markdown");
        mimeMap.put("matlab", "application/x-matlab");
        mimeMap.put("mel", "application/x-mel");
        mimeMap.put("objectivec", "application/x-objectivec");
        mimeMap.put("perl", "application/x-perl");
        mimeMap.put("php", "application/x-php");
        mimeMap.put("profile", "application/x-profile");
        mimeMap.put("python", "application/x-python");
        mimeMap.put("r", "application/x-r");
        mimeMap.put("rib", "application/x-rib");
        mimeMap.put("rsl", "application/x-rsl");
        mimeMap.put("ruby", "application/x-ruby");
        mimeMap.put("rust", "application/x-rust");
        mimeMap.put("scala", "application/x-scala");
        mimeMap.put("smalltalk", "application/x-smalltalk");
        mimeMap.put("sql", "application/x-sql");
        mimeMap.put("tex", "application/x-tex");
        mimeMap.put("vala", "application/x-vala");
        mimeMap.put("vbscript", "application/x-vbscript");
        mimeMap.put("vhdl", "application/x-vhdl");
        // Also add some overrides so that these should be used in the case of text/plain:
        MediaTypeRegistry reg = MediaTypeRegistry.getDefaultRegistry();
        for( String type : mimeMap.values()) {
            if( ! "application/octet-stream".equals(type) ) {
                reg.addSuperType( MediaType.parse(type), MediaType.parse("text/plain") );
            }
        }
    }

    /**
     * Creates a detector that inspects the first
     * {@link #DEFAULT_NUMBER_OF_BYTES_TO_TEST} bytes of each stream.
     * If the script engine or highlight.js cannot be loaded the failure is
     * logged and {@link #detect(InputStream, Metadata)} will always answer
     * {@code application/octet-stream}.
     */
    public HighlightJSDetector() {
        this.bytesToTest = DEFAULT_NUMBER_OF_BYTES_TO_TEST;
        this.initQuietly();
    }

    /** Runs {@link #init()} and logs (rather than propagates) any failure. */
    private void initQuietly() {
        try {
            this.init();
        } catch (ScriptException e) {
            log.error("Could not load highlight.js into the script engine; detection is disabled.", e);
        } catch (IOException e) {
            log.error("Could not read the highlight.js script; detection is disabled.", e);
        }
    }

    /**
     * Creates the script engine and evaluates the bundled highlight.js.
     * The {@code engine} field is only assigned once the script has been
     * evaluated successfully, so a non-null engine is always usable.
     *
     * @throws ScriptException if no "js" engine is registered or the script fails to evaluate
     * @throws IOException if the script resource is missing or cannot be read
     */
    private void init() throws ScriptException, IOException {
        manager = new ScriptEngineManager();
        // getEngineByName returns null (it does not throw) when no engine matches.
        ScriptEngine candidate = manager.getEngineByName("js");
        if (candidate == null) {
            throw new ScriptException("No JSR-223 script engine is registered under the name 'js'");
        }
        InputStream script = HighlightJSDetector.class.getResourceAsStream("/highlight-rhino.js");
        if (script == null) {
            throw new IOException("Classpath resource /highlight-rhino.js was not found");
        }
        Reader reader = new InputStreamReader(script, "UTF-8");
        try {
            candidate.eval(reader);
        } finally {
            reader.close();
        }
        engine = candidate;
    }

    /**
     * Restores the non-transient state and rebuilds the script engine,
     * which cannot itself be serialized.
     */
    private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.defaultReadObject();
        this.initQuietly();
    }

    /**
     * Identifies the content of the stream using highlight.js.
     * The stream is marked before reading and reset afterwards.
     * On success the detected type is also stored in the metadata under
     * {@code X-HLJS-Content-Type}.
     *
     * @param input the document stream, or null
     * @param metadata document metadata; {@code Content-Encoding}, when it
     *        names a supported charset, is used to decode the snippet
     *        (UTF-8 otherwise)
     * @return the mapped media type, or {@code application/octet-stream}
     *         when the input is null, the engine is unavailable, the
     *         language is not mapped, or identification fails
     * @throws IOException if the stream cannot be reset
     */
    @Override
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }
        if (engine == null) {
            // Initialisation failed (already logged); nothing can be identified.
            return MediaType.OCTET_STREAM;
        }

        Charset charset = resolveCharset(metadata);

        input.mark(bytesToTest);
        try {
            // Read raw bytes (never more than the mark limit) and decode afterwards.
            // Reading via an InputStreamReader could buffer past the limit and
            // invalidate the mark.
            String snippet = this.readSnippet(input, charset);

            // Attempt to identify:
            HljsResult result = this.identify(snippet);

            // Map to MIME type:
            String mime = mimeMap.get(result.getLanguage());
            if (mime == null) {
                return MediaType.OCTET_STREAM;
            }
            MediaType mt = MediaType.parse(mime);
            if (metadata != null) {
                metadata.set("X-HLJS-Content-Type", mt.toString());
            }
            return mt;
        } catch (Exception e) {
            // Detection is best-effort: any read or script failure means "unknown".
            log.warn("highlight.js identification failed; reporting application/octet-stream.", e);
        } finally {
            input.reset();
        }
        // If something went wrong, assume nothing:
        return MediaType.OCTET_STREAM;
    }

    /**
     * Picks the charset for decoding the snippet: the metadata's
     * {@code Content-Encoding} if it names a supported charset, else UTF-8.
     */
    private static Charset resolveCharset(Metadata metadata) {
        String declared = (metadata == null) ? null : metadata.get(Metadata.CONTENT_ENCODING);
        if (declared != null) {
            try {
                if (Charset.isSupported(declared)) {
                    return Charset.forName(declared);
                }
            } catch (IllegalArgumentException e) {
                // IllegalCharsetNameException: the value is not even a legal charset name.
                log.debug("Ignoring illegal charset name in Content-Encoding: " + declared);
            }
        }
        return Charset.forName("UTF-8");
    }

    /**
     * Reads up to {@code bytesToTest} bytes from the stream and decodes them.
     * A multi-byte character cut off at the limit decodes to a replacement
     * character.
     */
    private String readSnippet(InputStream input, Charset charset) throws IOException {
        byte[] buffer = new byte[bytesToTest];
        int filled = 0;
        while (filled < buffer.length) {
            int read = input.read(buffer, filled, buffer.length - filled);
            if (read <= 0) {
                // End of stream (or a stream that makes no progress).
                break;
            }
            filled += read;
        }
        return new String(buffer, 0, filled, charset);
    }

    /**
     * A highlight.js identification outcome: the language name and its
     * relevance score.
     */
    public class HljsResult {
        private final String language;
        private final int relevance;

        /**
         * @param language the highlight.js language name (may be null)
         * @param relevance the relevance score reported for that language
         */
        public HljsResult(String language, int relevance) {
            this.language = language;
            this.relevance = relevance;
        }

        /**
         * @return the language
         */
        public String getLanguage() {
            return language;
        }

        /**
         * @return the relevance
         */
        public int getRelevance() {
            return relevance;
        }

        /**
         * @return the result formatted as {@code language[relevance]}
         */
        @Override
        public String toString() {
            return language+"["+relevance+"]";
        }
    }

    /**
     * Extracts the {@code language} and {@code relevance} properties of the
     * JavaScript expression {@code var} from the engine.
     *
     * @param var a JavaScript expression evaluating to a highlight.js result
     * @return the extracted result, or {@code unknown[0]} if the expression
     *         cannot be evaluated or its properties have unexpected types
     */
    private HljsResult getHljsResult(String var) throws ScriptException {
        try {
            engine.eval("language = "+var+".language;");
            Object language = engine.get("language");
            engine.eval("relevance = "+var+".relevance;");
            Object relevance = engine.get("relevance");
            // Engines differ in how they surface JS numbers (Double, Integer, Long...),
            // so go through Number rather than casting to one concrete type.
            if (!(relevance instanceof Number)) {
                return new HljsResult("unknown", 0);
            }
            if (language != null && !(language instanceof CharSequence)) {
                return new HljsResult("unknown", 0);
            }
            String name = (language == null) ? null : language.toString();
            // Don't bother setting up the keyword count or highlighted text as this is not of interest here.
            return new HljsResult(name, ((Number) relevance).intValue());
        } catch( Exception e ) {
            // Best-effort: e.g. there may be no second-best match to dereference.
            return new HljsResult("unknown", 0);
        }
    }

    /**
     * Runs {@code hljs.highlightAuto} on the snippet and returns the best match.
     * The best and second-best matches are logged at INFO level.
     *
     * @see http://softwaremaniacs.org/wiki/doku.php/highlight.js:api
     *
     * @param snippet the text to identify
     * @return the best match reported by highlight.js
     * @throws ScriptException if the engine is unavailable or the script call fails
     */
    public HljsResult identify(String snippet) throws ScriptException {
        if (engine == null) {
            throw new ScriptException("The highlight.js script engine is not available");
        }
        engine.put("snippet", snippet);
        engine.eval("hlid = hljs.highlightAuto(snippet)");
        // Extract the best match:
        HljsResult best = this.getHljsResult("hlid");
        // Extract the second-best match:
        HljsResult second_best = this.getHljsResult("hlid.second_best");
        log.info("Identified: "+best+", "+second_best);
        return best;
    }

    /**
     * Smoke test: runs detection on a tiny XML snippet and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            HighlightJSDetector hljs = new HighlightJSDetector();
            System.out.println("GOT: "+hljs.detect(new ByteArrayInputStream("<xml></xml>".getBytes("UTF-8")), new Metadata()) );
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}