package com.twitter; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; /** * A class to extract usernames, lists, hashtags and URLs from Tweet text. */ public class Extractor { public static class Entity { public enum Type { URL, HASHTAG, MENTION, CASHTAG } int start; int end; final String value; // listSlug is used to store the list portion of @mention/list. final String listSlug; final Type type; String displayURL = null; String expandedURL = null; public Entity(int start, int end, String value, String listSlug, Type type) { this.start = start; this.end = end; this.value = value; this.listSlug = listSlug; this.type = type; } public Entity(int start, int end, String value, Type type) { this(start, end, value, null, type); } public Entity(Matcher matcher, Type type, int groupNumber) { // Offset -1 on start index to include @, # symbols for mentions and // hashtags this(matcher, type, groupNumber, -1); } public Entity(Matcher matcher, Type type, int groupNumber, int startOffset) { this(matcher.start(groupNumber) + startOffset, matcher .end(groupNumber), matcher.group(groupNumber), type); } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof Entity)) { return false; } Entity other = (Entity) obj; return this.type.equals(other.type) && this.start == other.start && this.end == other.end && this.value.equals(other.value); } @Override public int hashCode() { return this.type.hashCode() + this.value.hashCode() + this.start + this.end; } @Override public String toString() { return value + "(" + type + ") [" + start + "," + end + "]"; } public Integer getStart() { return start; } public Integer getEnd() { return end; } public String getValue() { return value; } public String getListSlug() { return listSlug; } public Type getType() { return type; } public String getDisplayURL() { return displayURL; } public void setDisplayURL(String displayURL) { this.displayURL = displayURL; } public String getExpandedURL() { return expandedURL; } public void setExpandedURL(String expandedURL) { this.expandedURL = expandedURL; } } private boolean extractURLWithoutProtocol = true; /** * Create a new extractor. */ public Extractor() { } private static void removeOverlappingEntities(List<Entity> entities) { // sort by index Collections.sort(entities, new Comparator<Entity>() { @Override public int compare(Entity e1, Entity e2) { return e1.start - e2.start; } }); // Remove overlapping entities. // Two entities overlap only when one is URL and the other is // hashtag/mention // which is a part of the URL. When it happens, we choose URL over // hashtag/mention // by selecting the one with smaller start index. if (!entities.isEmpty()) { Iterator<Entity> it = entities.iterator(); Entity prev = it.next(); while (it.hasNext()) { Entity cur = it.next(); if (prev.getEnd() > cur.getStart()) { it.remove(); } else { prev = cur; } } } } /** * Extract URLs, @mentions, lists and #hashtag from a given text/tweet. * * @param text text of tweet * @return list of extracted entities */ public List<Entity> extractEntitiesWithIndices(String text) { List<Entity> entities = new ArrayList<Entity>(); entities.addAll(extractURLsWithIndices(text)); entities.addAll(extractHashtagsWithIndices(text, false)); entities.addAll(extractMentionsOrListsWithIndices(text)); entities.addAll(extractCashtagsWithIndices(text)); removeOverlappingEntities(entities); return entities; } /** * Extract @username references from Tweet text. A mention is an occurance * of @username anywhere in a Tweet. * * @param text of the tweet from which to extract usernames * @return List of usernames referenced (without the leading @ sign) */ public List<String> extractMentionedScreennames(String text) { if (text == null || text.isEmpty()) { return Collections.emptyList(); } List<String> extracted = new ArrayList<String>(); for (Entity entity : extractMentionedScreennamesWithIndices(text)) { extracted.add(entity.value); } return extracted; } /** * Extract @username references from Tweet text. A mention is an occurance * of @username anywhere in a Tweet. * * @param text of the tweet from which to extract usernames * @return List of usernames referenced (without the leading @ sign) */ List<Entity> extractMentionedScreennamesWithIndices(String text) { List<Entity> extracted = new ArrayList<Entity>(); for (Entity entity : extractMentionsOrListsWithIndices(text)) { if (entity.listSlug == null) { extracted.add(entity); } } return extracted; } public List<Entity> extractMentionsOrListsWithIndices(String text) { if (text == null || text.isEmpty()) { return Collections.emptyList(); } // Performance optimization. // If text doesn't contain @/@ at all, the text doesn't // contain @mention. So we can simply return an empty list. boolean found = false; for (char c : text.toCharArray()) { if (c == '@' || c == '@') { found = true; break; } } if (!found) { return Collections.emptyList(); } List<Entity> extracted = new ArrayList<Entity>(); Matcher matcher = Regex.VALID_MENTION_OR_LIST.matcher(text); while (matcher.find()) { String after = text.substring(matcher.end()); if (!Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) { if (matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST) == null) { extracted.add(new Entity(matcher, Entity.Type.MENTION, Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME)); } else { extracted .add(new Entity( matcher.start(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME) - 1, matcher.end(Regex.VALID_MENTION_OR_LIST_GROUP_LIST), matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME), matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST), Entity.Type.MENTION)); } } } return extracted; } /** * Extract a @username reference from the beginning of Tweet text. A reply * is an occurance of @username at the beginning of a Tweet, preceded by 0 * or more spaces. * * @param text of the tweet from which to extract the replied to username * @return username referenced, if any (without the leading @ sign). Returns * null if this is not a reply. */ public String extractReplyScreenname(String text) { if (text == null) { return null; } Matcher matcher = Regex.VALID_REPLY.matcher(text); if (matcher.find()) { String after = text.substring(matcher.end()); if (Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) { return null; } else { return matcher.group(Regex.VALID_REPLY_GROUP_USERNAME); } } else { return null; } } /** * Extract URL references from Tweet text. * * @param text of the tweet from which to extract URLs * @return List of URLs referenced. */ public List<String> extractURLs(String text) { if (text == null || text.isEmpty()) { return Collections.emptyList(); } List<String> urls = new ArrayList<String>(); for (Entity entity : extractURLsWithIndices(text)) { urls.add(entity.value); } return urls; } /** * Extract URL references from Tweet text. * * @param text of the tweet from which to extract URLs * @return List of URLs referenced. */ public List<Entity> extractURLsWithIndices(String text) { if (text == null || text.isEmpty() || (extractURLWithoutProtocol ? text.indexOf('.') : text .indexOf(':')) == -1) { // Performance optimization. // If text doesn't contain '.' or ':' at all, text doesn't contain // URL, // so we can simply return an empty list. return Collections.emptyList(); } List<Entity> urls = new ArrayList<Entity>(); Matcher matcher = Regex.VALID_URL.matcher(text); while (matcher.find()) { if (matcher.group(Regex.VALID_URL_GROUP_PROTOCOL) == null) { // skip if protocol is not present and // 'extractURLWithoutProtocol' is false // or URL is preceded by invalid character. if (!extractURLWithoutProtocol || Regex.INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN .matcher( matcher.group(Regex.VALID_URL_GROUP_BEFORE)) .matches()) { continue; } } String url = matcher.group(Regex.VALID_URL_GROUP_URL); int start = matcher.start(Regex.VALID_URL_GROUP_URL); int end = matcher.end(Regex.VALID_URL_GROUP_URL); Matcher tco_matcher = Regex.VALID_TCO_URL.matcher(url); if (tco_matcher.find()) { // In the case of t.co URLs, don't allow additional path // characters. url = tco_matcher.group(); end = start + url.length(); } urls.add(new Entity(start, end, url, Entity.Type.URL)); } return urls; } /** * Extract #hashtag references from Tweet text. * * @param text of the tweet from which to extract hashtags * @return List of hashtags referenced (without the leading # sign) */ public List<String> extractHashtags(String text) { if (text == null || text.isEmpty()) { return Collections.emptyList(); } List<String> extracted = new ArrayList<String>(); for (Entity entity : extractHashtagsWithIndices(text)) { extracted.add(entity.value); } return extracted; } /** * Extract #hashtag references from Tweet text. * * @param text of the tweet from which to extract hashtags * @return List of hashtags referenced (without the leading # sign) */ public List<Entity> extractHashtagsWithIndices(String text) { return extractHashtagsWithIndices(text, true); } /** * Extract #hashtag references from Tweet text. * * @param text of the tweet from which to extract hashtags * @param checkUrlOverlap if true, check if extracted hashtags overlap URLs and remove * overlapping ones * @return List of hashtags referenced (without the leading # sign) */ private List<Entity> extractHashtagsWithIndices(String text, boolean checkUrlOverlap) { if (text == null || text.isEmpty()) { return Collections.emptyList(); } // Performance optimization. // If text doesn't contain #/# at all, text doesn't contain // hashtag, so we can simply return an empty list. boolean found = false; for (char c : text.toCharArray()) { if (c == '#' || c == '#') { found = true; break; } } if (!found) { return Collections.emptyList(); } List<Entity> extracted = new ArrayList<Entity>(); Matcher matcher = Regex.VALID_HASHTAG.matcher(text); while (matcher.find()) { String after = text.substring(matcher.end()); if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) { extracted.add(new Entity(matcher, Entity.Type.HASHTAG, Regex.VALID_HASHTAG_GROUP_TAG)); } } if (checkUrlOverlap) { // extract URLs List<Entity> urls = extractURLsWithIndices(text); if (!urls.isEmpty()) { extracted.addAll(urls); // remove overlap removeOverlappingEntities(extracted); // remove URL entities Iterator<Entity> it = extracted.iterator(); while (it.hasNext()) { Entity entity = it.next(); if (entity.getType() != Entity.Type.HASHTAG) { it.remove(); } } } } return extracted; } /** * Extract $cashtag references from Tweet text. * * @param text of the tweet from which to extract cashtags * @return List of cashtags referenced (without the leading $ sign) */ public List<String> extractCashtags(String text) { if (text == null || text.isEmpty()) { return Collections.emptyList(); } List<String> extracted = new ArrayList<String>(); for (Entity entity : extractCashtagsWithIndices(text)) { extracted.add(entity.value); } return extracted; } /** * Extract $cashtag references from Tweet text. * * @param text of the tweet from which to extract cashtags * @return List of cashtags referenced (without the leading $ sign) */ public List<Entity> extractCashtagsWithIndices(String text) { if (text == null || text.isEmpty()) { return Collections.emptyList(); } // Performance optimization. // If text doesn't contain $, text doesn't contain // cashtag, so we can simply return an empty list. if (text.indexOf('$') == -1) { return Collections.emptyList(); } List<Entity> extracted = new ArrayList<Entity>(); Matcher matcher = Regex.VALID_CASHTAG.matcher(text); while (matcher.find()) { extracted.add(new Entity(matcher, Entity.Type.CASHTAG, Regex.VALID_CASHTAG_GROUP_CASHTAG)); } return extracted; } public void setExtractURLWithoutProtocol(boolean extractURLWithoutProtocol) { this.extractURLWithoutProtocol = extractURLWithoutProtocol; } public boolean isExtractURLWithoutProtocol() { return extractURLWithoutProtocol; } /* * Modify Unicode-based indices of the entities to UTF-16 based indices. * * In UTF-16 based indices, Unicode supplementary characters are counted as * two characters. * * This method requires that the list of entities be in ascending order by * start index. * * @param text original text * * @param entities entities with Unicode based indices */ public void modifyIndicesFromUnicodeToUTF16(String text, List<Entity> entities) { IndexConverter convert = new IndexConverter(text); for (Entity entity : entities) { entity.start = convert.codePointsToCodeUnits(entity.start); entity.end = convert.codePointsToCodeUnits(entity.end); } } /* * Modify UTF-16-based indices of the entities to Unicode-based indices. * * In Unicode-based indices, Unicode supplementary characters are counted as * single characters. * * This method requires that the list of entities be in ascending order by * start index. * * @param text original text * * @param entities entities with UTF-16 based indices */ public void modifyIndicesFromUTF16ToToUnicode(String text, List<Entity> entities) { IndexConverter convert = new IndexConverter(text); for (Entity entity : entities) { entity.start = convert.codeUnitsToCodePoints(entity.start); entity.end = convert.codeUnitsToCodePoints(entity.end); } } /** * An efficient converter of indices between code points and code units. */ private static final class IndexConverter { final String text; // Keep track of a single corresponding pair of code unit and code point // offsets so that we can re-use counting work if the next requested // entity is near the most recent entity. int codePointIndex = 0; int charIndex = 0; IndexConverter(String text) { this.text = text; } /** * @param charIndex Index into the string measured in code units. * @return The code point index that corresponds to the specified * character index. */ int codeUnitsToCodePoints(int charIndex) { if (charIndex < this.charIndex) { this.codePointIndex -= text.codePointCount(charIndex, this.charIndex); } else { this.codePointIndex += text.codePointCount(this.charIndex, charIndex); } this.charIndex = charIndex; // Make sure that charIndex never points to the second code unit of // a // surrogate pair. if (charIndex > 0 && Character.isSupplementaryCodePoint(text .codePointAt(charIndex - 1))) { this.charIndex -= 1; } return this.codePointIndex; } /** * @param codePointIndex Index into the string measured in code points. * @return the code unit index that corresponds to the specified code * point index. */ int codePointsToCodeUnits(int codePointIndex) { // Note that offsetByCodePoints accepts negative indices. this.charIndex = text.offsetByCodePoints(this.charIndex, codePointIndex - this.codePointIndex); this.codePointIndex = codePointIndex; return this.charIndex; } } }