package rainbownlp.parser;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.util.Span;

import rainbownlp.core.Artifact;
import rainbownlp.util.ConfigurationUtil;
import rainbownlp.util.HibernateUtil;

/**
 * Reads all the training sentences, parses them, and stores the Penn tree,
 * dependencies, and POS tags in the database.
 */
public class ParseHandler {

    public ArrayList<WordTag> sentenceWords = new ArrayList<WordTag>();

    POSModel posModel;
    POSTaggerME tagger;
    ChunkerME chunkerME;

    public static StanfordParser s_parser = new StanfordParser();

    public ParseHandler() throws IOException {
        // Load the POS tagger model, closing the stream once the model is built.
        InputStream posIn = ConfigurationUtil.getResourceStream("en-pos-maxent.bin");
        try {
            posModel = new POSModel(posIn);
        } finally {
            posIn.close();
        }
        tagger = new POSTaggerME(posModel);

        // Load the shallow-parsing (chunker) model the same way.
        InputStream chunkerIn = ConfigurationUtil.getResourceStream("en-chunker.bin");
        try {
            ChunkerModel cModel = new ChunkerModel(chunkerIn);
            chunkerME = new ChunkerME(cModel);
        } finally {
            chunkerIn.close();
        }
    }

    public static void main(String[] args) throws Exception {
        // Fetch all sentence artifacts and shallow-parse them.
        List<Artifact> sentences = Artifact.listByType(Artifact.Type.Sentence, true);
        ParseHandler ph = new ParseHandler();
        for (Artifact sentence : sentences) {
            ph.sentenceChunker(sentence.getContent());
            // TODO: re-enable the Stanford pass (calculatePOS(sentence)) and
            // the NormalizedSentence dependency/Penn-tree storage when needed.
            break; // currently stops after the first sentence
        }
    }
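    // Illustrative helper, not part of the original pipeline: reconstructs the
    // phrase text a chunk Span covers, given the token array it was produced
    // from. Relies on OpenNLP's convention that Span.getEnd() is exclusive
    // (the same convention sentenceChunker() uses below).
    public static String spanToPhrase(Span span, String[] tokens) {
        StringBuilder phrase = new StringBuilder();
        for (int i = span.getStart(); i < span.getEnd(); i++) {
            if (phrase.length() > 0) {
                phrase.append(' ');
            }
            phrase.append(tokens[i]);
        }
        return phrase.toString();
    }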
    /**
     * Parses a sentence with the Stanford parser, stores its POS tags,
     * dependencies, and Penn tree, and propagates the per-word POS tags
     * to the word artifacts of the sentence.
     */
    public void calculatePOS(Artifact sentence) throws Exception {
        if (s_parser == null) {
            s_parser = new StanfordParser();
        }
        s_parser.parse(sentence.getContent());

        String pos_tagged_sentence = s_parser.getTagged();
        String dependencies = s_parser.getDependencies();
        String penn_tree = s_parser.getPenn();

        sentence.setPOS(pos_tagged_sentence);
        sentence.setStanDependency(dependencies);
        sentence.setStanPennTree(penn_tree);
        HibernateUtil.save(sentence);

        ArrayList<WordTag> w_tags = analyzePOSTaggedSentence(pos_tagged_sentence);
        for (int i = 0; i < w_tags.size(); i++) {
            WordTag wt = w_tags.get(i);
            // Match the i-th tagged word against the i-th word artifact.
            Artifact word_in_sent = Artifact.findInstance(sentence, i);
            if (word_in_sent.getContent().matches("\\w+")
                    && !word_in_sent.getContent().equals(wt.content)) {
                throw new Exception("Related artifact is not found");
            }
            word_in_sent.setPOS(wt.POS);
            HibernateUtil.save(word_in_sent);
            HibernateUtil.clearLoaderSession();
        }
    }

    /**
     * Parses the given content with the Stanford parser and returns the
     * POS-tagged sentence.
     */
    public String calculatePOS(String content) throws Exception {
        if (s_parser == null) {
            s_parser = new StanfordParser();
        }
        s_parser.parse(content);
        return s_parser.getTagged();
    }

    /**
     * Splits a "word/TAG word/TAG ..." sentence into WordTag objects.
     */
    public ArrayList<WordTag> analyzePOSTaggedSentence(String pTaggedSentence) throws Exception {
        // A token is everything up to the last '/', followed by the POS tag.
        Pattern p = Pattern.compile("(.*)/([^/]+)");
        String[] tokens = pTaggedSentence.split(" ");
        ArrayList<WordTag> word_tags = new ArrayList<WordTag>();
        int count = 0;
        for (String token : tokens) {
            Matcher m = p.matcher(token);
            if (!m.matches()) {
                throw new Exception("The POS tag doesn't match the pattern: " + token);
            }
            WordTag wt = new WordTag();
            // Un-escape slashes that the tagger escaped inside the word itself.
            wt.content = m.group(1).replaceAll("\\\\/", "/");
            wt.POS = m.group(2);
            wt.offset = count;
            word_tags.add(wt);
            count++;
        }
        return word_tags;
    }

    /**
     * POS-tags and shallow-parses a sentence, printing each chunk span with
     * its type and boundary tokens, and returns the chunk spans.
     */
    public Span[] sentenceChunker(String sentence_content) throws Exception {
        // The input is expected to be pre-tokenized, one space between tokens.
        String[] tokens = sentence_content.split(" ");
        String[] tags = tagger.tag(tokens);
        Span[] spans = chunkerME.chunkAsSpans(tokens, tags);
        for (Span s : spans) {
            // Span ends are exclusive, so the last covered token is at getEnd() - 1.
            System.out.println(s.toString() + " " + s.getType() + " "
                    + tokens[s.getStart()] + " " + tokens[s.getEnd() - 1]);
        }
        return spans;
    }

    /** A word, its POS tag, and its offset within the sentence. */
    static class WordTag {
        public String content;
        public String POS;
        public int offset;
    }
}
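// Minimal usage sketch, kept out of the class above: chunks one hypothetical,
// pre-tokenized sentence and prints each chunk with its reconstructed phrase.
// Assumes the "en-pos-maxent.bin" and "en-chunker.bin" models resolve through
// ConfigurationUtil exactly as in the ParseHandler constructor.
class ParseHandlerDemo {
    public static void main(String[] args) throws Exception {
        String sentence = "The quick brown fox jumps over the lazy dog .";
        String[] tokens = sentence.split(" ");

        ParseHandler ph = new ParseHandler();
        Span[] chunks = ph.sentenceChunker(sentence);
        for (Span chunk : chunks) {
            System.out.println(chunk.getType() + ": " + ParseHandler.spanToPhrase(chunk, tokens));
        }
    }
}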