package edu.cmu.geolocator.nlp.tokenizer; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with this * work for additional information regarding copyright ownership. The ASF * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ /* * TweetMotif is licensed under the Apache License 2.0: * http://www.apache.org/licenses/LICENSE-2.0.html Copyright Brendan O'Connor, * Michel Krieger, and David Ahn, 2009-2010. */ /* * Scala verion of TweetMotif is licensed under the Apache License 2.0: * http://www.apache.org/licenses/LICENSE-2.0.html Copyright Jason Baldridge, * and David Snyder, 2011. */ /* * A direct port to Java from Scala version of Twitter tokenizer at * https://bitbucket.org/jasonbaldridge/twokenize Original Python version * TweetMotif can be found at https://github.com/brendano/tweetmotif * * Author: Vinh Khuc (khuc@cse.ohio-state.edu) July 2011 */ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import edu.cmu.geolocator.model.Sentence; import edu.cmu.geolocator.model.Token; /* * The tokenizer from Noah's Ark. 
 * Modified by Wei Zhang.
 */
public class EuroLangTwokenizer {

    // Word-final contractions: group 1 = stem, group 2 = clitic ('t, 've,
    // 'll, 'd, 're, 's, 'm). Accepts straight, curly and prime apostrophes.
    static Pattern Contractions = Pattern
            .compile("(?i)(\\w+)(['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$");
    // Any run of whitespace, including Unicode space separators (Zs).
    static Pattern Whitespace = Pattern.compile("[\\s\\p{Zs}]+");

    // @ Wei Zhang
    // added question marks and exclamation mark, and long dash.
    // added ( and ) as punctuations
    static String punctChars = "['\"“”‘’\\|.—¿?¡!…,:;]";
    // static String punctSeq = punctChars+"+"; //'anthem'. => ' anthem '.
    // Runs of punctuation kept as single tokens: 'anthem'. => ' anthem ' .
    static String punctSeq = "['\"“”‘’/_]+|[.¿?¡!,…]+|[:;]+";
    // The HTML character entities assumed to survive un-escaping.
    static String entity = "&(?:amp|lt|gt|quot);";

    // URLs
    // BTO 2012-06: everyone thinks the daringfireball regex should be better,
    // but they're wrong. If you actually empirically test it the results are
    // bad. Please see https://github.com/brendano/ark-tweet-nlp/pull/9
    static String urlStart1 = "(?:https?://|\\bwww\\.)";
    static String commonTLDs = "(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)";
    // ISO country-code TLDs. TODO: remove obscure country domains?
    static String ccTLDs = "(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|"
            + "bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|"
            + "er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|"
            + "hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|"
            + "lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|"
            + "nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|"
            + "sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|"
            + "va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)";
    // Bare host ending in a known TLD, e.g. "example.co.uk".
    static String urlStart2 = "\\b(?:[A-Za-z\\d-])+(?:\\.[A-Za-z0-9]+){0,3}\\."
            + "(?:" + commonTLDs + "|" + ccTLDs + ")" + "(?:\\." + ccTLDs + ")?(?=\\W|$)";
    // Wei Zhang added urlStart3 for HTTP://
    static String urlStart3 = "(?:HTTPS?://|\\bwww\\.)";
    // ////////
    static String urlBody = "(?:[^\\.\\s<>][^\\s<>]*?)?";
    static String urlExtraCrapBeforeEnd = "(?:" + punctChars + "|" + entity + ")+?";
    static String urlEnd = "(?:\\.\\.+|[<>]|\\s|$)";
    // Full URL recognizer: one of the three start alternatives, a lazy body,
    // and a lookahead for trailing punctuation / end-of-token.
    public static String url = "(?:" + urlStart1 + "|" + urlStart2 + "|" + urlStart3 + ")" + urlBody
            + "(?=(?:" + urlExtraCrapBeforeEnd + ")?" + urlEnd + ")";

    // Numeric
    static String timeLike = "\\d+(?::\\d+){1,2}"; // e.g. 12:53 or 1:02:03
    // static String numNum = "\\d+\\.\\d+";
    static String numberWithCommas = "(?:(?<!\\d)\\d{1,3},)+?\\d{3}" + "(?=(?:[^,\\d]|$))";
    static String numComb = "\\p{Sc}?\\d+(?:\\.\\d+)+%?"; // e.g. $3.50, 12.5%

    // Abbreviations
    static String boundaryNotDot = "(?:$|\\s|[“\\u0022?!,:;]|" + entity + ")";
    static String aa1 = "(?:[A-Za-z]\\.){2,}(?=" + boundaryNotDot + ")"; // e.g. U.N.
    static String aa2 = "[^A-Za-z](?:[A-Za-z]\\.){1,}[A-Za-z](?=" + boundaryNotDot + ")";
    static String standardAbbreviations = "\\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\\.";
    static String arbitraryAbbrev = "(?:" + aa1 + "|" + aa2 + "|" + standardAbbreviations + ")";

    static String separators = "(?:--+|―|—|~|–|=)";
    static String decorations = "(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\\u2639-\\u263b]+|[\\ue001-\\uebbb]+)";
    static String thingsThatSplitWords = "[^\\s\\.,?\"\\-]";
    static String embeddedApostrophe = thingsThatSplitWords + "+['’′]" + thingsThatSplitWords + "*";

    /**
     * Joins the given regex fragments into one non-capturing alternation:
     * OR(a, b, c) yields "(?:a|b|c)".
     */
    public static String OR(String... parts) {
        String prefix = "(?:";
        StringBuilder sb = new StringBuilder();
        for (String s : parts) {
            sb.append(prefix);
            prefix = "|";
            sb.append(s);
        }
        sb.append(")");
        return sb.toString();
    }

    // Emoticons
    static String normalEyes = "(?iu)[:=]"; // 8 and x are eyes but cause problems
    static String wink = "[;]";
    static String noseArea = "(?:|-|[^a-zA-Z0-9 ])"; // doesn't get :'-(
    static String happyMouths = "[D\\)\\]\\}]+";
    static String sadMouths = "[\\(\\[\\{]+";
    static String tongue = "[pPd3]+";
    // remove forward slash if http://'s aren't cleaned
    static String otherMouths = "(?:[oO]+|[/\\\\]+|[vV]+|[Ss]+|[|]+)";

    // mouth repetition examples:
    // @aliciakeys Put it in a love song :-))
    // @hellocalyclops =))=))=)) Oh well
    static String bfLeft = "(♥|0|o|°|v|\\$|t|x|;|\\u0CA0|@|ʘ|•|・|◕|\\^|¬|\\*)";
    static String bfCenter = "(?:[\\.]|[_-]+)";
    // Back-reference; its group number is rewritten via
    // eastEmote.replaceFirst("2", "1") below, depending on the embedding.
    static String bfRight = "\\2";
    static String s3 = "(?:--['\"])";
    static String s4 = "(?:<|<|>|>)[\\._-]+(?:<|<|>|>)";
    static String s5 = "(?:[.][_]+[.])";
    static String basicface = "(?:(?i)" + bfLeft + bfCenter + bfRight + ")|" + s3 + "|" + s4 + "|" + s5;
    // NOTE(review): these character classes look encoding-mangled (fullwidth
    // characters collapsed to ASCII); kept byte-identical to the original.
    static String eeLeft = "[\\\\\ƪԄ\\((<>;ヽ\\-=~\\*]+";
    static String eeRight = "[\\-=\\);'\\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+";
    static String eeSymbol = "[^A-Za-z0-9\\s\\(\\)\\*:=-]";
    static String eastEmote = eeLeft + "(?:" + basicface + "|" + eeSymbol + ")+" + eeRight;

    public static String emoticon = OR(
            // Standard version :) :( :] :D :P
            "(?:>|>)?" + OR(normalEyes, wink) + OR(noseArea, "[Oo]")
                    + OR(tongue + "(?=\\W|$|RT|rt|Rt)", otherMouths + "(?=\\W|$|RT|rt|Rt)", sadMouths, happyMouths),
            // reversed version (: D: use positive lookbehind to remove
            // "(word):" because eyes on the right side is more ambiguous with
            // the standard usage of : ;
            "(?<=(?: |^))" + OR(sadMouths, happyMouths, otherMouths) + noseArea + OR(normalEyes, wink) + "(?:<|<)?",
            // inspired by
            // http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
            eastEmote.replaceFirst("2", "1"), basicface
            // iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
            // TODO should try a big precompiled lexicon from Wikipedia, Dan
            // Ramage told me (BTO) he does this
    );

    static String Hearts = "(?:<+/?3+)+"; // the other hearts are in decorations
    static String Arrows = "(?:<*[-―—=]*>+|<+[-―—=]*>*)|\\p{InArrows}+";

    // BTO 2011-06: restored Hashtag, AtMention protection (dropped in
    // original scala port) because it fixes
    // "hello (#hashtag)" ==> "hello (#hashtag )" WRONG
    // "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT
    // "hello (@person)" ==> "hello (@person )" WRONG
    // "hello (@person)" ==> "hello ( @person )" RIGHT
    // ... Some sort of weird interaction with edgepunct I guess, because
    // edgepunct has poor content-symbol detection.
    // This also gets #1 #40 which probably aren't hashtags .. but good as
    // tokens. If you want good hashtag identification, use a different regex.
    static String Hashtag = "#[a-zA-Z0-9_]+"; // optional: lookbehind for \b
    // optional: lookbehind for \b, max length 15
    static String AtMention = "[@@][a-zA-Z0-9_]+";

    // I was worried this would conflict with at-mentions
    // but seems ok in sample of 5800: 7 changes all email fixes
    // http://www.regular-expressions.info/email.html
    static String Bound = "(?:\\W|^|$)";
    public static String Email = "(?<=" + Bound + ")[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,4}(?=" + Bound + ")";

    // We will be tokenizing using these regexps as delimiters.
    // Additionally, these things are "protected", meaning they shouldn't be
    // further split themselves.
    static Pattern Protected = Pattern.compile(OR(Hearts, url, Email, timeLike,
            // numNum,
            numberWithCommas, numComb, emoticon, Arrows, entity, punctSeq, arbitraryAbbrev,
            separators, decorations, embeddedApostrophe, Hashtag, AtMention));

    // Edge punctuation
    // Want: 'foo' => ' foo '
    // While also: don't => don't
    // the first is considered "edge punctuation".
    // the second is word-internal punctuation -- don't want to mess with it.
    // BTO (2011-06): the edgepunct system seems to be the #1 source of
    // problems these days. I remember it causing lots of trouble in the past
    // as well. Would be good to revisit or eliminate.
    // Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
    static String edgePunctChars = "'\"“”‘’«»{}\\(\\)\\[\\]\\*&"; // add \\p{So}? (symbols)
    static String edgePunct = "[" + edgePunctChars + "]";
    static String notEdgePunct = "[a-zA-Z0-9]"; // content characters
    // @Wei Zhang: Fixed bug ??(peru) ->?? ( peru ).
    // added ! and ? and ¿ and ¡.
    // colon here gets "(hello):" ==> "( hello ):"
    static String offEdge = "(^|$|:|;|!|\\?|¡|¿|\\s|\\.|,)";
    static Pattern EdgePunctLeft = Pattern.compile(offEdge + "(" + edgePunct + "+)(" + notEdgePunct + ")");
    static Pattern EdgePunctRight = Pattern.compile("(" + notEdgePunct + ")(" + edgePunct + "+)" + offEdge);

    /**
     * Inserts a space between word-external ("edge") punctuation and the word
     * it touches, so 'foo' becomes ' foo ' while word-internal punctuation
     * such as don't is left untouched.
     *
     * @param input text to process
     * @return the text with spaces inserted around edge punctuation
     */
    public static String splitEdgePunct(String input) {
        Matcher m1 = EdgePunctLeft.matcher(input);
        input = m1.replaceAll("$1$2 $3");
        m1 = EdgePunctRight.matcher(input);
        input = m1.replaceAll("$1 $2$3");
        return input;
    }

    // Minimal 2-tuple; used below to hold (start, end) match spans.
    private static class Pair<T1, T2> {
        public T1 first;
        public T2 second;

        public Pair(T1 x, T2 y) {
            first = x;
            second = y;
        }
    }

    // The main work of tokenizing a tweet.
    private static List<String> simpleTokenize(String text) {
        // Do the no-brainers first
        String splitPunctText = splitEdgePunct(text);
        int textLength = splitPunctText.length();

        // BTO: the logic here got quite convoluted via the Scala porting
        // detour. It would be good to switch back to a nice simple procedural
        // style like in the Python version ... Scala is such a pain. Never again.

        // Find the matches for subsequences that should be protected,
        // e.g. URLs, 1.0, U.N.K.L.E., 12:53
        Matcher matches = Protected.matcher(splitPunctText);
        // Storing as List[List[String]] to make zip easier later on
        List<List<String>> bads = new ArrayList<List<String>>(); // linked list?
        List<Pair<Integer, Integer>> badSpans = new ArrayList<Pair<Integer, Integer>>();
        while (matches.find()) {
            // The spans of the "bads" should not be split.
            if (matches.start() != matches.end()) { // unnecessary?
                List<String> bad = new ArrayList<String>(1);
                bad.add(splitPunctText.substring(matches.start(), matches.end()));
                bads.add(bad);
                badSpans.add(new Pair<Integer, Integer>(matches.start(), matches.end()));
            }
        }

        // Create a list of indices to create the "goods", which can be split.
        // We are taking "bad" spans like
        // List((2,5), (8,10))
        // to create
        // List(0, 2, 5, 8, 10, 12)
        // where, e.g., "12" here would be the textLength
        // has an even length and no indices are the same
        List<Integer> indices = new ArrayList<Integer>(2 + 2 * badSpans.size());
        indices.add(0);
        for (Pair<Integer, Integer> p : badSpans) {
            indices.add(p.first);
            indices.add(p.second);
        }
        indices.add(textLength);

        // Group the indices and map them to their respective portion of the
        // string
        List<List<String>> splitGoods = new ArrayList<List<String>>(indices.size() / 2);
        for (int i = 0; i < indices.size(); i += 2) {
            String goodstr = splitPunctText.substring(indices.get(i), indices.get(i + 1));
            List<String> splitstr = Arrays.asList(goodstr.trim().split(" "));
            splitGoods.add(splitstr);
        }

        // Reinterpolate the 'good' and 'bad' Lists, ensuring that
        // additonal tokens from last good item get included
        List<String> zippedStr = new ArrayList<String>();
        int i;
        for (i = 0; i < bads.size(); i++) {
            zippedStr = addAllnonempty(zippedStr, splitGoods.get(i));
            zippedStr = addAllnonempty(zippedStr, bads.get(i));
        }
        // there is always one more "good" span than "bad" span
        zippedStr = addAllnonempty(zippedStr, splitGoods.get(i));

        // BTO: our POS tagger wants "ur" and "you're" to both be one token.
        // Uncomment to get "you 're"
        ArrayList<String> splitStr = new ArrayList<String>(zippedStr.size());
        for (String tok : zippedStr)
            splitStr.addAll(splitToken(tok));
        zippedStr = splitStr;

        return zippedStr;
    }

    // Appends the trimmed, non-empty entries of 'smaller' onto 'master';
    // returns 'master' (mutated in place).
    private static List<String> addAllnonempty(List<String> master, List<String> smaller) {
        for (String s : smaller) {
            String strim = s.trim();
            if (strim.length() > 0)
                master.add(strim);
        }
        return master;
    }

    /** "foo bar " => "foo bar" */
    public static String squeezeWhitespace(String input) {
        return Whitespace.matcher(input).replaceAll(" ").trim();
    }

    // Final pass tokenization based on special patterns
    private static List<String> splitToken(String token) {
        Matcher m = Contractions.matcher(token);
        if (m.find()) {
            // Split a trailing contraction off its stem, e.g. "we're" => "we" "'re".
            String[] contract = { m.group(1), m.group(2) };
            // System.out.println(Arrays.asList(contract).toString());
            return Arrays.asList(contract);
        }
        String[] contract = { token };
        // System.out.println(Arrays.asList(contract).toString());
        return Arrays.asList(contract);
    }

    // @Wei Zhang
    // modified return type from list to string array
    /** Assume 'text' has no HTML escaping. **/
    public static List<String> tokenize(String text) {
        return simpleTokenize(squeezeWhitespace(text));
    }

    /**
     * Take charge of generating tokens in a sentence. A raw sentence won't
     * have tokens until it is tokenized.
* @param sentence * @return */ public static edu.cmu.geolocator.model.Sentence tokenize(edu.cmu.geolocator.model.Sentence sentence) { String sSent = sentence.getSentenceString(); List<String> tokenized = tokenize(sSent); List<Token> tokens = new ArrayList<Token>(); Token t; for (int i = 0; i < tokenized.size(); i++) { tokens.add(new Token(tokenized.get(i),sentence.getId(),i)); } sentence.setTokens(tokens.toArray(new Token[]{})); return sentence; } public static void main(String[] args) throws IOException { BufferedReader br = new BufferedReader(new InputStreamReader(System.in, "utf-8")); System.out.print("> "); // Read user input String inputStr = br.readLine(); while (!inputStr.equals("")) { // inputStr = "terremoto/earthquake"; List<String> tokens = EuroLangTwokenizer.tokenize(inputStr); for (String token : tokens) { System.out.print("[" + token + "]"); } System.out.print("\n> "); inputStr = br.readLine(); } br.close(); } }