/*
* Copyright (c) 2010 Lockheed Martin Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.eurekastreams.server.domain.strategies;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Extracts hashtags from a body of text. A hash is not treated as the start of a hashtag when it is
 * preceded by a disallowed character, when it is part of a URL, or when it sits inside a hyperlink
 * ({@code <a ...>}) tag.
 */
public class HashTagExtractor
{
    /**
     * The valid characters in a hashtag. Note that '#' itself is included, so a run such as
     * {@code #a#b} is consumed as one hashtag.
     */
    private static final String VALID_HASHTAG_CHARS = "0123456789abcdefghijklmnopqrstuvwxyz#"
            + "ABCDEFGHIJKLMNOPQRSTUVWXYZ-_";

    /**
     * The characters allowed immediately before a hashtag.
     */
    private static final String VALID_CHARS_BEFORE_HASHTAG = "-.,<>()#[]@!$&'()*+,;=% \t\"\n";

    /**
     * The characters considered part of a url when scanning backwards from a hash.
     */
    private static final String VALID_URL_CHARACTERS = VALID_HASHTAG_CHARS + ".~:/?#[]@!$&'()*+,;=%";

    /**
     * The text that opens a hyperlink tag (matched case-insensitively).
     */
    private static final String ANCHOR_OPEN = "<a ";

    /**
     * The text that closes a hyperlink tag (matched case-insensitively).
     */
    private static final String ANCHOR_CLOSE = "</a>";

    /**
     * Extract the next hashtag, starting the search at the input starting position.
     *
     * A hash at index 0 is always a candidate. Any other hash is a candidate only when the character
     * before it is one of the allowed preceding characters and the hash is neither inside a url nor
     * inside a hyperlink tag. A candidate followed by no valid hashtag characters is skipped.
     *
     * @param content
     *            the content to search for hashtags; may be null
     * @param inStartingIndex
     *            the index at which to start looking for a hash
     * @return a Substring built from the hashtag's start index, its length and its text (including the
     *         leading '#'), or null if no hashtag was found
     */
    public Substring extract(final String content, final int inStartingIndex)
    {
        if (content == null)
        {
            // no content, no hashtags
            return null;
        }

        final int contentLength = content.length();
        int pos = inStartingIndex;
        int hashPos;
        while ((hashPos = content.indexOf('#', pos)) > -1)
        {
            // whatever happens below, the search never needs to revisit this hash
            pos = hashPos + 1;

            boolean candidate = hashPos == 0
                    || (isValidCharBeforeHashtag(content.charAt(hashPos - 1))
                            && !isInAUrl(content, hashPos) && !isInHrefBlock(content, hashPos));
            if (!candidate)
            {
                continue;
            }

            // walk forward to find where the hashtag ends
            while (pos < contentLength && isValidHashTagCharacter(content.charAt(pos)))
            {
                pos++;
            }

            if (pos > hashPos + 1)
            {
                return new Substring(hashPos, pos - hashPos, content.substring(hashPos, pos));
            }
            // otherwise it was a lone hash - keep looking after it
        }
        return null;
    }

    /**
     * Extract all hashtags from the input content.
     *
     * @param content
     *            the content to extract hashtags from
     * @return a list of the distinct hashtags found (duplicates are collapsed; the order is that of the
     *         backing HashSet, not the order of appearance); empty if there are none
     */
    public List<String> extractAll(final String content)
    {
        Set<String> hashTagContents = new HashSet<String>();

        int position = 0;
        Substring hashTag = extract(content, position);
        while (hashTag != null)
        {
            hashTagContents.add(hashTag.getContent());

            // resume right after the hashtag just found
            position = hashTag.getStartIndex() + hashTag.getLength();
            hashTag = extract(content, position);
        }
        return new ArrayList<String>(hashTagContents);
    }

    /**
     * Check if the character at a specific position in content is inside a url.
     *
     * The text immediately before the position is scanned backwards for as long as it consists of url
     * characters; that run is treated as a url if it contains "://" or "www." (case-insensitively).
     *
     * @param content
     *            the content to check
     * @param pos
     *            the position to check
     * @return whether the character at a specific position in content is inside a url
     */
    private boolean isInAUrl(final String content, final int pos)
    {
        // find the start of the block of potential url text we're currently in
        int blockStartPos = pos;
        for (int p = pos - 1; p >= 0; p--)
        {
            char c = content.charAt(p);
            if (VALID_URL_CHARACTERS.indexOf(c) == -1)
            {
                break;
            }
            if (c == '#')
            {
                // if there's another hash before this hash in the block, it's hashtaggable
                return false;
            }
            blockStartPos = p;
        }

        String block = content.substring(blockStartPos, pos).toLowerCase();
        return block.contains("://") || block.contains("www.");
    }

    /**
     * Test whether the character at the input position is inside a hyperlink tag.
     *
     * Scans backwards from the position for the nearest hyperlink marker that lies entirely before it:
     * finding a closing marker first means the position is outside a hyperlink, finding an opening
     * marker first means it is inside one.
     *
     * @param content
     *            the content to check
     * @param pos
     *            the position to check in the content
     * @return whether the character at the input position is inside a hyperlink tag
     */
    private boolean isInHrefBlock(final String content, final int pos)
    {
        for (int p = pos - 1; p >= 0; p--)
        {
            if (content.charAt(p) != '<')
            {
                // both markers start with '<', so nothing can match here
                continue;
            }
            // only accept markers that end at or before pos
            if (p + ANCHOR_CLOSE.length() <= pos
                    && content.regionMatches(true, p, ANCHOR_CLOSE, 0, ANCHOR_CLOSE.length()))
            {
                return false;
            }
            if (p + ANCHOR_OPEN.length() <= pos
                    && content.regionMatches(true, p, ANCHOR_OPEN, 0, ANCHOR_OPEN.length()))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Check if the input character is a valid character right before a hashtag.
     *
     * @param inChar
     *            the char to test
     * @return whether the input char is a valid character right before a hashtag
     */
    private boolean isValidCharBeforeHashtag(final char inChar)
    {
        return VALID_CHARS_BEFORE_HASHTAG.indexOf(inChar) != -1;
    }

    /**
     * Check if the input character is a valid hashtag character.
     *
     * @param inChar
     *            the char to check
     * @return whether the input char is valid in a hashtag
     */
    private boolean isValidHashTagCharacter(final char inChar)
    {
        return VALID_HASHTAG_CHARS.indexOf(inChar) > -1;
    }
}