/**
*
*/
package com.personalityextractor.entity.extractor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Baseline entity extractor that treats runs of capitalised words as
 * candidate entities.
 *
 * <p>The input is stripped of {@code @mention} tokens, split into segments on
 * punctuation, and each segment is scanned word by word. Consecutive words
 * starting with an upper-case ASCII letter are joined into one entity; a glue
 * word (see {@link #glueWords}) may sit between two capitalised words, as in
 * "Age of Empires". The first word of every segment is never treated as
 * capitalised, since sentence-initial capitals carry no signal.
 *
 * @author semanticvoid
 *
 */
public class BaselineExtractor implements IEntityExtractor {

    /**
     * Common stop words. Not referenced by the extraction logic in this
     * class; kept because the field is visible to the rest of the package.
     */
    final static List<String> stopWords = Arrays.asList(
            "a", "an", "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "such",
            "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "we", "was", "will", "with", "most", "needs"
    );

    /**
     * Words allowed inside an entity between capitalised words
     * (compared case-insensitively).
     */
    final static List<String> glueWords = Arrays.asList(
            "of", "&", "an"
    );

    /** Matches {@code @name} style mentions, which are removed before extraction. */
    private static final Pattern MENTION = Pattern.compile("@[a-zA-Z0-9_]+");

    /** Matches a word whose first character is an upper-case ASCII letter. */
    private static final Pattern CAPITALIZED = Pattern.compile("^[A-Z]+.*");

    /** Punctuation on which the text is cut into independently scanned segments. */
    private static final Pattern SEGMENT_DELIMITERS = Pattern.compile("[.,:()?!'\";]+");

    /** Whitespace or colons separate words. */
    private static final Pattern WORD_DELIMITERS = Pattern.compile("\\s+|[:]+");

    /**
     * Texts whose share of capitalised words reaches this value yield no
     * entities at all (capitalisation is then not a usable signal).
     */
    // TODO needs data analysis for this threshold
    private static final double MAX_CAPS_RATIO = 0.75;

    /* (non-Javadoc)
     * @see com.personalityextractor.entity.extractor.IEntityExtractor#extract(java.lang.String)
     */
    /**
     * Extracts candidate entities from {@code text}.
     *
     * @param text the raw text; {@code null} is treated as empty input
     * @return the entities in order of appearance (duplicates are kept);
     *         empty when the text is {@code null}, contains no words, or has a
     *         capitalised-word ratio of at least {@code 0.75}
     */
    @Override
    public ArrayList<String> extract(String text) {
        ArrayList<String> entities = new ArrayList<String>();
        if (text == null) {
            return entities;
        }

        String cleaned = MENTION.matcher(text).replaceAll("");

        // The ratio is computed once over the whole text, not per segment.
        String[] allWords = WORD_DELIMITERS.split(cleaned);
        if (allWords.length == 0) {
            // Nothing but delimiters: avoids a 0/0 ratio and there is nothing to extract.
            return entities;
        }
        int capsCount = 0;
        for (String word : allWords) {
            if (CAPITALIZED.matcher(word).matches()) {
                capsCount++;
            }
        }
        double ratio = (double) capsCount / (double) allWords.length;
        if (ratio >= MAX_CAPS_RATIO) {
            return entities;
        }

        for (String segment : SEGMENT_DELIMITERS.split(cleaned)) {
            collectEntities(WORD_DELIMITERS.split(segment), entities);
        }
        return entities;
    }

    /**
     * Scans the words of one segment and appends every capitalised run found
     * to {@code entities}.
     */
    private static void collectEntities(String[] words, List<String> entities) {
        List<String> run = new ArrayList<String>();
        // Number of glue words at the tail of 'run' not yet followed by a capitalised word.
        int trailingGlue = 0;
        boolean first = true;

        for (String word : words) {
            if (word.isEmpty()) {
                // Produced by leading delimiters; does not count as the segment's first word.
                continue;
            }
            if (!first && CAPITALIZED.matcher(word).matches()) {
                run.add(word);
                trailingGlue = 0;
            } else if (!run.isEmpty() && isGlueWord(word)) {
                // Glue is only accepted once a run has started.
                run.add(word);
                trailingGlue++;
            } else {
                flush(run, trailingGlue, entities);
                run.clear();
                trailingGlue = 0;
            }
            first = false;
        }
        flush(run, trailingGlue, entities);
    }

    /**
     * Joins the words of {@code run} with single spaces, leaving out the last
     * {@code trailingGlue} words, and appends the result to {@code entities}
     * when it is non-empty. Dropping the dangling glue keeps "Bank of" from
     * being reported when no capitalised word followed the "of".
     */
    private static void flush(List<String> run, int trailingGlue, List<String> entities) {
        // A run always starts with a capitalised word, so 'end' is >= 1 for a non-empty run.
        int end = run.size() - trailingGlue;
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < end; i++) {
            if (i > 0) {
                joined.append(' ');
            }
            joined.append(run.get(i));
        }
        String entity = joined.toString().trim();
        if (!entity.isEmpty()) {
            entities.add(entity);
        }
    }

    /** Returns whether {@code word} equals one of {@link #glueWords}, ignoring case. */
    private static boolean isGlueWord(String word) {
        for (String glue : glueWords) {
            if (glue.equalsIgnoreCase(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Demo entry point: runs the extractor over a few sample sentences and
     * prints each extracted entity, wrapped in single quotes, on its own line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        IEntityExtractor e = new BaselineExtractor();
        List<String> sentences = Arrays.asList(
                "Rest in Peace!",
                "New blog post: 50 days with Google Nexus S: http://www.venu.in/blog/?p=314",
                "@dpolice Hard to say. If the user is geeky - Nexus S . Otherwise iPhone 4 . :) Both are great phones.",
                "About to embark on the unthinkable... Driving to New York City. Wish me luck.",
                "Best part of The Hurt Locker ? The lack of background music! Silence speaks quite loudly in this movie.",
                "I'm playing the Age of Empires.",
                "iTunes / ipod ecosystem needs to learn a thing or two from Doggcatcher. Seriously. This is the best solution for podcast listeners out there.",
                "loved India New Land of Opportunity on Boxee http://bit.ly/ghYcfj",
                "@vjvegi Why this comment about Pakistan all of a sudden? :)",
                "Swapped the Elantra with a Santa Fe to deal with all that snow on the roads."
        );
        for (String sentence : sentences) {
            List<String> entities = e.extract(sentence);
            for (String entity : entities) {
                System.out.println("'" + entity + "'");
            }
        }
    }
}