package com.twitter; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.regex.Matcher; /** * A class to extract usernames, lists, hashtags and URLs from Tweet text. */ public class Extractor { protected boolean extractURLWithoutProtocol = true; /** * Fullwidth at sign: '@' */ private static final char FULLWIDTH_AT_SIGN = '\uff20'; /** * Fullwidth number sign: '#' */ private static final char FULLWIDTH_NUMBER_SIGN = '\uff03'; /** * Create a new extractor. */ public Extractor() { } /** * Extract $cashtag references from Tweet text. * * @param text of the tweet from which to extract cashtags * @return List of cashtags referenced (without the leading $ sign) */ public List<String> extractCashtags(final String text) { if (text == null || text.length() == 0) return Collections.emptyList(); final ArrayList<String> extracted = new ArrayList<String>(); for (final Entity entity : extractCashtagsWithIndices(text)) { extracted.add(entity.value); } return extracted; } /** * Extract $cashtag references from Tweet text. * * @param text of the tweet from which to extract cashtags * @return List of cashtags referenced (without the leading $ sign) */ public List<Entity> extractCashtagsWithIndices(final String text) { if (text == null || text.length() == 0) return Collections.emptyList(); // Performance optimization. // If text doesn't contain $, text doesn't contain // cashtag, so we can simply return an empty list. if (text.indexOf('$') == -1) return Collections.emptyList(); final ArrayList<Entity> extracted = new ArrayList<Entity>(); final Matcher matcher = Regex.VALID_CASHTAG.matcher(text); while (matcher.find()) { extracted.add(new Entity(matcher, Entity.Type.CASHTAG, Regex.VALID_CASHTAG_GROUP_CASHTAG_FULL)); } return extracted; } /** * Extract URLs, @mentions, lists and #hashtag from a given text/tweet. * * @param text text of tweet * @return list of extracted entities */ public List<Entity> extractEntitiesWithIndices(final String text) { final ArrayList<Entity> entities = new ArrayList<Entity>(); entities.addAll(extractURLsWithIndices(text)); entities.addAll(extractHashtagsWithIndices(text, false)); entities.addAll(extractMentionsOrListsWithIndices(text)); entities.addAll(extractCashtagsWithIndices(text)); removeOverlappingEntities(entities); return entities; } /** * Extract #hashtag references from Tweet text. * * @param text of the tweet from which to extract hashtags * @return List of hashtags referenced (without the leading # sign) */ public List<String> extractHashtags(final String text) { return extractHashtags(text, true); } public List<String> extractHashtags(final String text, final boolean exclude_duplicate) { if (text == null || text.length() == 0) return Collections.emptyList(); final ArrayList<String> extracted = new ArrayList<String>(); for (final Entity entity : extractHashtagsWithIndices(text)) { if (!exclude_duplicate || !extracted.contains(entity.value)) { extracted.add(entity.value); } } return extracted; } /** * Extract #hashtag references from Tweet text. * * @param text of the tweet from which to extract hashtags * @return List of hashtags referenced (without the leading # sign) */ public List<Entity> extractHashtagsWithIndices(final String text) { return extractHashtagsWithIndices(text, true); } /** * Extract @username references from Tweet text. A mention is an occurance * of @username anywhere in a Tweet. * * @param text of the tweet from which to extract usernames * @return List of usernames referenced (without the leading @ sign) */ public Set<String> extractMentionedScreennames(final String text) { return extractMentionedScreennames(text, true); } public Set<String> extractMentionedScreennames(final String text, final boolean exclude_duplicate) { if (text == null || text.length() == 0) return Collections.emptySet(); final Set<String> extracted = new HashSet<String>(); for (final Entity entity : extractMentionedScreennamesWithIndices(text)) { if (!exclude_duplicate || !extracted.contains(entity.value)) { extracted.add(entity.value); } } return extracted; } /** * Extract @username references from Tweet text. A mention is an occurance * of @username anywhere in a Tweet. * * @param text of the tweet from which to extract usernames * @return List of usernames referenced (without the leading @ sign) */ public List<Entity> extractMentionedScreennamesWithIndices(final String text) { final ArrayList<Entity> extracted = new ArrayList<Entity>(); for (final Entity entity : extractMentionsOrListsWithIndices(text)) { if (entity.listSlug == null) { extracted.add(entity); } } return extracted; } public List<Entity> extractMentionsOrListsWithIndices(final String text) { if (text == null || text.length() == 0) return Collections.emptyList(); // Performance optimization. // If text doesn't contain @ at all, the text doesn't // contain @mention. So we can simply return an empty list. boolean found = false; for (final char c : text.toCharArray()) { if (c == '@' || c == FULLWIDTH_AT_SIGN) { found = true; break; } } if (!found) return Collections.emptyList(); final ArrayList<Entity> extracted = new ArrayList<Entity>(); final Matcher matcher = Regex.VALID_MENTION_OR_LIST.matcher(text); while (matcher.find()) { final String after = text.substring(matcher.end()); if (!Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) { if (matcher.group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST) == null) { extracted.add(new Entity(matcher, Entity.Type.MENTION, Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME)); } else { extracted.add(new Entity(matcher.start(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME) - 1, matcher .end(Regex.VALID_MENTION_OR_LIST_GROUP_LIST), matcher .group(Regex.VALID_MENTION_OR_LIST_GROUP_USERNAME), matcher .group(Regex.VALID_MENTION_OR_LIST_GROUP_LIST), Entity.Type.MENTION)); } } } return extracted; } /** * Extract a @username reference from the beginning of Tweet text. A reply * is an occurance of @username at the beginning of a Tweet, preceded by 0 * or more spaces. * * @param text of the tweet from which to extract the replied to username * @return username referenced, if any (without the leading @ sign). Returns * null if this is not a reply. */ public String extractReplyScreenname(final String text) { if (text == null) return null; final Matcher matcher = Regex.VALID_REPLY.matcher(text); if (matcher.find()) { final String after = text.substring(matcher.end()); if (Regex.INVALID_MENTION_MATCH_END.matcher(after).find()) return null; else return matcher.group(Regex.VALID_REPLY_GROUP_USERNAME); } else return null; } /** * Extract URL references from Tweet text. * * @param text of the tweet from which to extract URLs * @return List of URLs referenced. */ public List<String> extractURLs(final String text) { if (text == null || text.length() == 0) return Collections.emptyList(); final ArrayList<String> urls = new ArrayList<String>(); for (final Entity entity : extractURLsWithIndices(text)) { urls.add(entity.value); } return urls; } /** * Extract URL references from Tweet text. * * @param text of the tweet from which to extract URLs * @return List of URLs referenced. */ public List<Entity> extractURLsWithIndices(final String text) { if (text == null || text.length() == 0 || (extractURLWithoutProtocol ? text.indexOf('.') : text.indexOf(':')) == -1) // Performance // optimization. // If text doesn't contain '.' or ':' at all, text doesn't contain // URL, // so we can simply return an empty list. return Collections.emptyList(); final ArrayList<Entity> urls = new ArrayList<Entity>(); final Matcher matcher = Regex.VALID_URL.matcher(text); while (matcher.find()) { if (matcher.group(Regex.VALID_URL_GROUP_PROTOCOL) == null) { // skip if protocol is not present and // 'extractURLWithoutProtocol' is false // or URL is preceded by invalid character. if (!extractURLWithoutProtocol || Regex.INVALID_URL_WITHOUT_PROTOCOL_MATCH_BEGIN.matcher( matcher.group(Regex.VALID_URL_GROUP_BEFORE)).matches()) { continue; } } String url = matcher.group(Regex.VALID_URL_GROUP_URL); final int start = matcher.start(Regex.VALID_URL_GROUP_URL); int end = matcher.end(Regex.VALID_URL_GROUP_URL); final Matcher tco_matcher = Regex.VALID_TCO_URL.matcher(url); if (tco_matcher.find()) { // In the case of t.co URLs, don't allow additional path // characters. url = tco_matcher.group(); end = start + url.length(); } urls.add(new Entity(start, end, url, Entity.Type.URL)); } return urls; } public boolean isExtractURLWithoutProtocol() { return extractURLWithoutProtocol; } /* * Modify Unicode-based indices of the entities to UTF-16 based indices. * * In UTF-16 based indices, Unicode supplementary characters are counted as * two characters. * * This method requires that the list of entities be in ascending order by * start index. * * @param text original text * * @param entities entities with Unicode based indices */ public void modifyIndicesFromUnicodeToUTF16(final String text, final List<Entity> entities) { final IndexConverter convert = new IndexConverter(text); for (final Entity entity : entities) { entity.start = convert.codePointsToCodeUnits(entity.start); entity.end = convert.codePointsToCodeUnits(entity.end); } } /* * Modify UTF-16-based indices of the entities to Unicode-based indices. * * In Unicode-based indices, Unicode supplementary characters are counted as * single characters. * * This method requires that the list of entities be in ascending order by * start index. * * @param text original text * * @param entities entities with UTF-16 based indices */ public void modifyIndicesFromUTF16ToToUnicode(final String text, final List<Entity> entities) { final IndexConverter convert = new IndexConverter(text); for (final Entity entity : entities) { entity.start = convert.codeUnitsToCodePoints(entity.start); entity.end = convert.codeUnitsToCodePoints(entity.end); } } public void setExtractURLWithoutProtocol(final boolean extractURLWithoutProtocol) { this.extractURLWithoutProtocol = extractURLWithoutProtocol; } /** * Extract #hashtag references from Tweet text. * * @param text of the tweet from which to extract hashtags * @param checkUrlOverlap if true, check if extracted hashtags overlap URLs * and remove overlapping ones * @return List of hashtags referenced (without the leading # sign) */ private List<Entity> extractHashtagsWithIndices(final String text, final boolean checkUrlOverlap) { if (text == null || text.length() == 0) return Collections.emptyList(); // Performance optimization. // If text doesn't contain # at all, text doesn't contain // hashtag, so we can simply return an empty list. boolean found = false; for (final char c : text.toCharArray()) { if (c == '#' || c == FULLWIDTH_NUMBER_SIGN) { found = true; break; } } if (!found) return Collections.emptyList(); final ArrayList<Entity> extracted = new ArrayList<Entity>(); final Matcher matcher = Regex.VALID_HASHTAG.matcher(text); while (matcher.find()) { final String after = text.substring(matcher.end()); if (!Regex.INVALID_HASHTAG_MATCH_END.matcher(after).find()) { extracted.add(new Entity(matcher, Entity.Type.HASHTAG, Regex.VALID_HASHTAG_GROUP_TAG)); } } if (checkUrlOverlap) { // extract URLs final List<Entity> urls = extractURLsWithIndices(text); if (!urls.isEmpty()) { extracted.addAll(urls); // remove overlap removeOverlappingEntities(extracted); // remove URL entities final Iterator<Entity> it = extracted.iterator(); while (it.hasNext()) { final Entity entity = it.next(); if (entity.getType() != Entity.Type.HASHTAG) { it.remove(); } } } } return extracted; } private void removeOverlappingEntities(final List<Entity> entities) { // sort by index Collections.<Entity> sort(entities, new Comparator<Entity>() { @Override public int compare(final Entity e1, final Entity e2) { return e1.start - e2.start; } }); // Remove overlapping entities. // Two entities overlap only when one is URL and the other is // hashtag/mention // which is a part of the URL. When it happens, we choose URL over // hashtag/mention // by selecting the one with smaller start index. if (!entities.isEmpty()) { final Iterator<Entity> it = entities.iterator(); Entity prev = it.next(); while (it.hasNext()) { final Entity cur = it.next(); if (prev.getEnd() > cur.getStart()) { it.remove(); } else { prev = cur; } } } } public static class Entity { protected int start; protected int end; protected final String value; // listSlug is used to store the list portion of @mention/list. protected final String listSlug; protected final Type type; protected String displayURL = null; protected String expandedURL = null; public Entity(final int start, final int end, final String value, final String listSlug, final Type type) { this.start = start; this.end = end; this.value = value; this.listSlug = listSlug; this.type = type; } public Entity(final int start, final int end, final String value, final Type type) { this(start, end, value, null, type); } public Entity(final Matcher matcher, final Type type, final int groupNumber) { // Offset -1 on start index to include @, # symbols for mentions and // hashtags this(matcher, type, groupNumber, -1); } public Entity(final Matcher matcher, final Type type, final int groupNumber, final int startOffset) { this(matcher.start(groupNumber) + startOffset, matcher.end(groupNumber), matcher.group(groupNumber), type); } @Override public boolean equals(final Object obj) { if (this == obj) return true; if (!(obj instanceof Entity)) return false; final Entity other = (Entity) obj; if (type.equals(other.type) && start == other.start && end == other.end && value.equals(other.value)) return true; else return false; } public String getDisplayURL() { return displayURL; } public Integer getEnd() { return end; } public String getExpandedURL() { return expandedURL; } public String getListSlug() { return listSlug; } public Integer getStart() { return start; } public Type getType() { return type; } public String getValue() { return value; } @Override public int hashCode() { return type.hashCode() + value.hashCode() + start + end; } public void setDisplayURL(final String displayURL) { this.displayURL = displayURL; } public void setExpandedURL(final String expandedURL) { this.expandedURL = expandedURL; } @Override public String toString() { return value + "(" + type + ") [" + start + "," + end + "]"; } public enum Type { URL, HASHTAG, MENTION, CASHTAG } } /** * An efficient converter of indices between code points and code units. */ private static final class IndexConverter { protected final String text; // Keep track of a single corresponding pair of code unit and code point // offsets so that we can re-use counting work if the next requested // entity is near the most recent entity. protected int codePointIndex = 0; protected int charIndex = 0; IndexConverter(final String text) { this.text = text; } /** * @param codePointIndex Index into the string measured in code points. * @return the code unit index that corresponds to the specified code * point index. */ int codePointsToCodeUnits(final int codePointIndex) { // Note that offsetByCodePoints accepts negative indices. charIndex = text.offsetByCodePoints(charIndex, codePointIndex - this.codePointIndex); this.codePointIndex = codePointIndex; return charIndex; } /** * @param charIndex Index into the string measured in code units. * @return The code point index that corresponds to the specified * character index. */ int codeUnitsToCodePoints(final int charIndex) { if (charIndex < this.charIndex) { codePointIndex -= text.codePointCount(charIndex, this.charIndex); } else { codePointIndex += text.codePointCount(this.charIndex, charIndex); } this.charIndex = charIndex; // Make sure that charIndex never points to the second code unit of // a // surrogate pair. if (charIndex > 0 && Character.isSupplementaryCodePoint(text.codePointAt(charIndex - 1))) { this.charIndex -= 1; } return codePointIndex; } } }