package edu.cmu.geolocator.parser.Universal;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import edu.cmu.geolocator.io.PipeLineAnnotate;
import edu.cmu.geolocator.model.LocEntityAnnotation;
import edu.cmu.geolocator.model.Sentence;
import edu.cmu.geolocator.model.Token;
import edu.cmu.geolocator.model.Tweet;
import edu.cmu.geolocator.nlp.ner.FeatureExtractor.ACE_En_FeatureGenerator;
import edu.cmu.geolocator.parser.ParserFactory;
import edu.cmu.minorthird.classify.ClassLabel;
import edu.cmu.minorthird.classify.Example;
import edu.cmu.minorthird.classify.Feature;
import edu.cmu.minorthird.classify.MutableInstance;
import edu.cmu.minorthird.classify.sequential.CMM;
import edu.cmu.minorthird.classify.sequential.SequenceClassifier;
import edu.cmu.minorthird.util.IOUtil;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.util.CoreMap;

/**
 * English named-entity recognizer for locations, backed by a serialized
 * minorthird {@link CMM} sequence model trained on ACE data.
 *
 * <p>Pipeline: the tweet's sentence is re-tokenized with the Stanford
 * pipeline ({@link PipeLineAnnotate}), per-token features are generated by
 * {@link ACE_En_FeatureGenerator}, the CMM assigns a label to every token,
 * and maximal runs of non-{@code "O"} labels are collected into
 * {@link LocEntityAnnotation} spans.
 *
 * <p>NOTE(review): {@link #getInstance()} loads the model from the relative
 * path {@code 20141115ACE-CRF.model120} — presumably relative to the working
 * directory; verify deployment layout.
 */
public class ACE_MTNERParser {

    /** Serialized sequential classifier; remains {@code null} if loading fails. */
    CMM model = null;

    /** Generates per-token feature vectors consumed by {@link #model}. */
    ACE_En_FeatureGenerator fg;

    /** Lazily-initialized singleton instance. */
    private static ACE_MTNERParser fineenglishparser;

    // Kept for interface compatibility; parse() uses a local array instead.
    Example[] examples;

    /** Sentence of the tweet most recently passed to {@link #parse(Tweet)}. */
    Sentence tweetSentence;

    /**
     * Returns the shared parser instance, creating it on first use.
     * Synchronized so concurrent first calls cannot double-initialize.
     *
     * @return the singleton, or {@code null} if model loading failed
     */
    public static synchronized ACE_MTNERParser getInstance() {
        if (fineenglishparser == null) {
            try {
                fineenglishparser = new ACE_MTNERParser(
                        "20141115ACE-CRF.model120",
                        new ACE_En_FeatureGenerator("res/"));
            } catch (Exception e) {
                // BUG FIX: message previously said "Spanish NER Model" in
                // this English parser — corrected to avoid misleading logs.
                System.err.println("English ACE NER model file not found");
                e.printStackTrace();
            }
        }
        return fineenglishparser;
    }

    /**
     * Loads the serialized CMM model and stores the feature generator.
     * On an I/O failure the error is logged and {@link #model} stays
     * {@code null} (original best-effort behavior preserved).
     *
     * @param modelname path to the serialized model file
     * @param featureg  feature generator to use for all parses
     */
    ACE_MTNERParser(String modelname, ACE_En_FeatureGenerator featureg) {
        try {
            model = (CMM) IOUtil.loadSerialized(new java.io.File(modelname));
            this.fg = featureg;
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts location entities from the tweet's sentence and returns them.
     * Token NE predictions are also written back onto the sentence tokens.
     *
     * @param tweet tweet whose sentence is analyzed
     * @return list of detected location-entity annotations (possibly empty)
     * @throws IOException propagated from downstream processing
     */
    public List<LocEntityAnnotation> parse(Tweet tweet) throws IOException {
        tweetSentence = tweet.getSentence();

        // Re-tokenize with the Stanford pipeline so character offsets and
        // lemmas are available on each token.
        ArrayList<Sentence> sentences =
                tokenize(tweetSentence.getSentenceString());

        List<LocEntityAnnotation> locs = new ArrayList<LocEntityAnnotation>();
        int sentid = 0;
        for (Sentence sent : sentences) {
            ClassLabel[] resultlabels = classifyTokens(sent, sentid);
            collectEntities(sent, resultlabels, locs);
            sentid++;
        }
        return locs;
    }

    /**
     * Runs the Stanford pipeline over raw text and converts each sentence
     * into the project's {@link Sentence}/{@link Token} model, preserving
     * character offsets and lemmas.
     */
    private ArrayList<Sentence> tokenize(String text) {
        PipeLineAnnotate pla = new PipeLineAnnotate(text);
        List<CoreMap> NLPsents = pla.getSentences();
        ArrayList<Sentence> sentences = new ArrayList<Sentence>();
        for (CoreMap NLPsentence : NLPsents) {
            Sentence mysentence =
                    new Sentence(NLPsentence.get(TextAnnotation.class));
            List<CoreLabel> tokens = NLPsentence.get(TokensAnnotation.class);
            Token[] tokensArray = new Token[tokens.size()];
            int j = 0;
            for (CoreLabel token : tokens) {
                Token myToken =
                        new Token(token.get(TextAnnotation.class), j + "", j);
                myToken.setLemma(token.lemma());
                myToken.setStart(token.beginPosition());
                myToken.setEnd(token.endPosition());
                tokensArray[j] = myToken;
                j++;
            }
            // ROBUSTNESS: skip empty sentences — the original indexed
            // tokensArray[0] unconditionally and could throw.
            if (tokensArray.length == 0)
                continue;
            mysentence.setTokens(tokensArray);
            mysentence.setStart(tokensArray[0].getStart());
            mysentence.setEnd(tokensArray[tokensArray.length - 1].getEnd());
            sentences.add(mysentence);
        }
        return sentences;
    }

    /**
     * Builds a feature {@link Example} per token, classifies the sequence
     * with the CMM, and writes each predicted label back onto its token.
     *
     * @return the per-token predicted labels, aligned with the tokens
     */
    private ClassLabel[] classifyTokens(Sentence sent, int sentid) {
        int n = sent.getTokens().length;
        Example[] exp = new Example[n];
        List<ArrayList<Feature>> tweetfeatures = fg.extractFeature(sent);
        for (int tokid = 0; tokid < n; tokid++) {
            String ne = sent.getTokens()[tokid].getNE();
            ClassLabel lab = new ClassLabel(ne == null ? "" : ne);
            MutableInstance inst =
                    new MutableInstance("ACE-NER", sentid + "-" + tokid);
            // BUG FIX: the loop bound used get(0).size() while indexing
            // get(tokid) — wrong when tokens have differing feature counts.
            for (int j = 0; j < tweetfeatures.get(tokid).size(); j++) {
                inst.addBinary(tweetfeatures.get(tokid).get(j));
            }
            exp[tokid] = new Example(inst, lab);
        }
        ClassLabel[] resultlabels = model.classification(exp);
        for (int tokid = 0; tokid < n; tokid++) {
            sent.getTokens()[tokid]
                    .setNEprediction(resultlabels[tokid].bestClassName());
        }
        return resultlabels;
    }

    /**
     * Collapses maximal runs of non-{@code "O"} labels into
     * {@link LocEntityAnnotation} spans with token positions, appending
     * them to {@code locs}. The span carries the label of its last token
     * (original behavior — adjacent different labels are merged).
     */
    private void collectEntities(Sentence sent, ClassLabel[] resultlabels,
            List<LocEntityAnnotation> locs) {
        int startpos = -1;
        String current = "O", previous = "O";
        for (int k = 0; k < resultlabels.length; k++) {
            if (k > 0)
                previous = current;
            current = resultlabels[k].bestClassName();
            if (current.equals("O")) {
                if (!previous.equals("O")) {
                    // Span just closed: emit [startpos, k-1].
                    addEntity(sent, startpos, k - 1, previous, locs);
                }
            } else if (previous.equals("O")) {
                startpos = k; // span opens here
            }
        }
        // BUG FIX: a span reaching the end of the sentence was never
        // flushed by the original loop — emit it here.
        if (!current.equals("O")) {
            addEntity(sent, startpos, resultlabels.length - 1, current, locs);
        }
    }

    /** Builds one annotation over tokens [startpos, endpos] labeled {@code type}. */
    private void addEntity(Sentence sent, int startpos, int endpos,
            String type, List<LocEntityAnnotation> locs) {
        Token[] t = new Token[endpos - startpos + 1];
        for (int i = startpos; i <= endpos; i++) {
            t[i - startpos] = sent.getTokens()[i].setNE(type);
        }
        LocEntityAnnotation le =
                new LocEntityAnnotation(startpos, endpos, type, t);
        // Fixed confidence for the NE type; may be replaced by a real
        // model probability later.
        le.setNETypeProb(0.95);
        locs.add(le);
    }

    /**
     * Interactive driver: reads lines from stdin, parses each with the
     * aggregate English parser, and prints the matches and elapsed millis.
     */
    public static void main(String argv[]) throws IOException {
        Tweet t = new Tweet("I live in Pittsburgh. I am going to new york.");
        // FIX: close the reader via try-with-resources.
        try (BufferedReader s = new BufferedReader(
                new InputStreamReader(System.in, "utf-8"))) {
            System.out.println(">");
            String ss;
            // BUG FIX: readLine() returns null at EOF; the original called
            // ss.length() unconditionally and would throw NPE / spin forever.
            while ((ss = s.readLine()) != null) {
                if (ss.length() == 0)
                    continue;
                t.setSentence(ss);
                long stime = System.currentTimeMillis(); // FIX: millis are long
                List<LocEntityAnnotation> matches =
                        ParserFactory.getEnAggrParser().parse(t);
                long etime = System.currentTimeMillis();
                if (matches == null) {
                    System.out.println("No results.");
                } else {
                    System.out.println(matches);
                }
                System.out.println((etime - stime) + "\n>");
            }
        }
    }
}