import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.LinkedList; import java.util.Map; import java.util.Random; import org.luaj.vm2.LuaFunction; import org.luaj.vm2.LuaTable; import org.luaj.vm2.LuaValue; import org.luaj.vm2.Varargs; import org.luaj.vm2.lib.OneArgFunction; import org.luaj.vm2.lib.VarArgFunction; //import opennlp.tools.namefind.NameFinderME; //import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSTagger; import opennlp.tools.postag.POSTaggerME; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.tokenize.DetokenizationDictionary; import opennlp.tools.tokenize.DetokenizationDictionary.Operation; import opennlp.tools.tokenize.Detokenizer; import opennlp.tools.tokenize.DictionaryDetokenizer; import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.Span; import pl.shockah.StringTools; import pl.shockah.shocky.Module; import pl.shockah.shocky.cmds.Command; import pl.shockah.shocky.cmds.CommandCallback; import pl.shockah.shocky.cmds.Parameters; import pl.shockah.shocky.interfaces.ILua; public class ModuleNLP extends Module implements ILua { private static final DetokenizationDictionary detokenizerDict; private TokenizerModel tokenizerModel; private POSModel posModel; private SentenceModel sentenceModel; //private TokenNameFinderModel nameModel; private Command cmdPOS; private Command cmdPOSReplace; static { String tokens[] = new String[]{".", "!", "?", ",", "$", "(", ")", "[", "]", "<", ">", "\"", "'", ":"}; Operation operations[] = new Operation[]{ Operation.MOVE_LEFT, Operation.MOVE_LEFT, Operation.MOVE_LEFT, Operation.MOVE_LEFT, Operation.MOVE_RIGHT, Operation.MOVE_RIGHT, Operation.MOVE_LEFT, Operation.MOVE_RIGHT, Operation.MOVE_LEFT, Operation.MOVE_RIGHT, Operation.MOVE_LEFT, Operation.RIGHT_LEFT_MATCHING, Operation.RIGHT_LEFT_MATCHING, Operation.MOVE_BOTH }; detokenizerDict = new DetokenizationDictionary(tokens, operations); } @Override public String name() { return "nlp"; } @Override public void onEnable(File dir) { InputStream is = null; try { is = new FileInputStream(new File("data", "en-token.bin")); tokenizerModel = new TokenizerModel(is); is.close(); is = new FileInputStream(new File("data", "en-pos-maxent.bin")); posModel = new POSModel(is); is.close(); is = new FileInputStream(new File("data", "en-sent.bin")); sentenceModel = new SentenceModel(is); /*is.close(); is = new FileInputStream(new File("data", "en-ner-person.bin")); nameModel = new TokenNameFinderModel(is);*/ } catch (IOException e) { e.printStackTrace(); } finally { if (is != null) try {is.close();} catch (IOException e) {} } Command.addCommands(this, cmdPOS = new CmdPOS(), cmdPOSReplace = new CmdPOSReplace()); } @Override public void onDisable() { Command.removeCommands(cmdPOS,cmdPOSReplace); tokenizerModel = null; posModel = null; sentenceModel = null; //nameModel = null; } public Span[] tokenize(String str) { Tokenizer tokenizer = new TokenizerME(tokenizerModel); return tokenizer.tokenizePos(str); } public class TokenizeFunction extends OneArgFunction { @Override public LuaValue call(LuaValue arg) { String s = arg.checkjstring(); return convert(Span.spansToStrings(tokenize(s), s)); } } public String detokenize(String[] tokens) { Detokenizer detokenizer = new DictionaryDetokenizer(detokenizerDict); return detokenizer.detokenize(tokens, ""); } public class DetokenizeFunction extends OneArgFunction { @Override public LuaValue call(LuaValue arg) { return valueOf(detokenize(convert(arg))); } } public String[] getPOSTags(String[] tokens) { POSTagger tagger = new POSTaggerME(posModel); return tagger.tag(tokens); } public class POSFunction extends OneArgFunction { @Override public LuaValue call(LuaValue arg) { return convert(getPOSTags(convert(arg))); } } public String[] getSentences(String str) { SentenceDetectorME detect = new SentenceDetectorME(sentenceModel); return detect.sentDetect(str); } public class SentenceFunction extends OneArgFunction { @Override public LuaValue call(LuaValue arg) { return convert(getSentences(arg.checkjstring())); } } private static String[] convert(LuaValue v) { LuaTable t = v.checktable(); LinkedList<String> list = new LinkedList<String>(); for ( int i=0; !(v = t.rawget(++i)).isnil(); ) list.add(v.checkjstring()); return list.toArray(new String[0]); } private static LuaValue convert(String[] s) { LuaValue[] v = new LuaValue[s.length]; for (int i = 0; i < s.length; ++i) v[i] = LuaValue.valueOf(s[i]); return LuaValue.tableOf(null,v); } @Override public void setupLua(LuaTable env) { LuaTable t = new LuaTable(); t.rawset("tok", new TokenizeFunction()); t.rawset("detok", new DetokenizeFunction()); t.rawset("pos", new POSFunction()); t.rawset("rpos", new POSReplaceFunction()); t.rawset("sent", new SentenceFunction()); env.set("nlp", t); } /*public class NameReplaceFunction extends OneArgFunction { @Override public LuaValue call(LuaValue arg) { return convert(getSentences(arg.checkjstring())); } } public String[] nameReplace(String sentence) { NameFinderME nameFinder = new NameFinderME(nameModel); String[] tokens = tokenize(sentence); Span[] spans = nameFinder.find(tokens); for (int i = 0; i < spans.length; ++i) { String name = StringTools.implode(tokens,spans[i].getStart()-1,spans[i].getEnd()-1," "); System.out.println(name); } return null; }*/ public class POSReplaceFunction extends VarArgFunction { @Override public Varargs invoke(Varargs args) { if (args.narg()==3) return asTable(args.checkjstring(1), args.checknumber(2).tofloat(), args.checktable(3)); else return asFunction(args.checkjstring(1), args.checkclosure(2)); } public LuaValue asTable(String sentence, float chance, LuaTable table) { String[][] replacements = new String[table.getn().toint()][]; LuaValue v; for ( int i=0; !(v = table.rawget(++i)).isnil(); ) { LuaTable t = v.checktable(); replacements[i-1]=new String[]{t.get("type").checkjstring(),t.get("word").checkjstring()}; } return valueOf(posReplace(sentence, chance, replacements)); } public LuaValue asFunction(String sentence, LuaFunction func) { return valueOf(posReplace(sentence, func)); } } public String posReplace(String sentence, float chance, String[][] replacements) { Map<String,LinkedList<String>> wordMap = new HashMap<String,LinkedList<String>>(); try { for (int i = replacements.length-1; i >= 0; --i) { String pos = replacements[i][0]; String word = replacements[i][1]; LinkedList<String> list = wordMap.get(pos); if (list == null) { list = new LinkedList<String>(); wordMap.put(pos, list); } list.add(word); } } catch(Exception e) { return null; } Span[] spans = tokenize(sentence); String[] tokens = Span.spansToStrings(spans, sentence); String[] tags = getPOSTags(tokens); boolean shouldRun = false; for (int i = tags.length-1; !shouldRun && i >= 0; --i) if (wordMap.containsKey(tags[i])) shouldRun = true; if (!shouldRun) return sentence; Random rnd = new Random(); StringBuilder sb = null; int replaces = 0; int runs = 0; while (replaces == 0 && runs < 100) { int start = 0; sb = new StringBuilder(sentence.length()); for (int i = 0; i < tokens.length && i < tags.length; ++i) { String tag = tags[i]; String token = tokens[i]; if (token.contentEquals("'") || token.contentEquals(">")) continue; if (wordMap.containsKey(tag) && rnd.nextFloat() < chance) { LinkedList<String> wordList = wordMap.get(tag); String word= (wordList.size()==1) ? wordList.element() : wordList.get(rnd.nextInt(wordList.size())); Span span = spans[i]; if (token.charAt(0)=='<' && token.charAt(token.length()-1)=='>') span = new Span(span.getStart()+1,span.getEnd()-1); else if (token.charAt(0)=='\'' && token.charAt(token.length()-1)!='\'' && (i+1 < tokens.length) && tokens[i+1].contentEquals("'")) span = new Span(span.getStart()+1,span.getEnd()); else if (token.charAt(0)=='<' && token.charAt(token.length()-1)!='>' && (i+1 < tokens.length) && tokens[i+1].contentEquals(">")) span = new Span(span.getStart()+1,span.getEnd()); sb.append(sentence.substring(start, span.getStart())); boolean upper = true; for (int o = 0; upper && o < token.length(); ++o) if (Character.isLowerCase(token.charAt(o))) upper = false; sb.append(upper ? word.toUpperCase() : word); start=span.getEnd(); ++replaces; } } sb.append(sentence.substring(start)); ++runs; } return sb.toString(); } public String posReplace(String sentence, LuaFunction function) { Span[] spans = tokenize(sentence); String[] tokens = Span.spansToStrings(spans, sentence); String[] tags = getPOSTags(tokens); StringBuilder sb = null; int replaces = 0; int runs = 0; while (replaces == 0 && runs < 100) { int start = 0; sb = new StringBuilder(sentence.length()); for (int i = 0; i < tokens.length && i < tags.length; ++i) { String tag = tags[i]; String token = tokens[i]; if (token.contentEquals("'") || token.contentEquals(">")) continue; Varargs result = function.invoke(LuaValue.varargsOf(LuaValue.valueOf(tag),LuaValue.valueOf(token))); if (!result.toboolean(1)) continue; String word = result.checkjstring(1); Span span = spans[i]; if (token.charAt(0)=='<' && token.charAt(token.length()-1)=='>') span = new Span(span.getStart()+1,span.getEnd()-1); else if (token.charAt(0)=='\'' && token.charAt(token.length()-1)!='\'' && (i+1 < tokens.length) && tokens[i+1].contentEquals("'")) span = new Span(span.getStart()+1,span.getEnd()); else if (token.charAt(0)=='<' && token.charAt(token.length()-1)!='>' && (i+1 < tokens.length) && tokens[i+1].contentEquals(">")) span = new Span(span.getStart()+1,span.getEnd()); sb.append(sentence.substring(start, span.getStart())); sb.append(word); start=span.getEnd(); ++replaces; } sb.append(sentence.substring(start)); ++runs; } return sb.toString(); } public class CmdPOS extends Command { public String command() {return "pos";} public String help(Parameters params) { return "pos {sentence} - adds the parts of speech after each word/character."; } public void doCommand(Parameters params, CommandCallback callback) { if (params.tokenCount==0) { callback.type = EType.Notice; callback.append(help(params)); return; } String[] tokens = Span.spansToStrings(tokenize(params.input), params.input); String[] tags = getPOSTags(tokens); StringBuilder sb = new StringBuilder(); for (int i = 0; i < tokens.length && i < tags.length; ++i) { if (i > 0) sb.append(' '); sb.append(tokens[i]).append('/').append(tags[i]); } callback.append(StringTools.limitLength(sb)); } } public class CmdPOSReplace extends Command { public String command() {return "rpos";} public String help(Parameters params) { return "rpos {chance} {total} {{pos} {replacement}} {sentence} - replaces each part of speech with word provided."; } public void doCommand(Parameters params, CommandCallback callback) { if (params.tokenCount<3) { callback.type = EType.Notice; callback.append(help(params)); return; } String[][] replacements; float chance; String sentence; try { chance = Float.parseFloat(params.nextParam()); int total = Integer.parseInt(params.nextParam()); if (params.tokenCount<3+(total<<1)) { callback.type = EType.Notice; callback.append(help(params)); return; } replacements = new String[total][]; for (int i = 0; i < total; ++i) { String pos = params.nextParam(); String word = params.nextParam(); replacements[i] = new String[] {pos, word}; } sentence = params.getParams(0); } catch(Exception e) { callback.type = EType.Notice; callback.append(help(params)); return; } String s = posReplace(sentence, chance, replacements); if (s == null) { callback.type = EType.Notice; callback.append(help(params)); return; } callback.append(StringTools.limitLength(s)); } } }