package com.personalityextractor.entity.extractor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Entity extractor that proposes every contiguous run of non-stop-words in a
 * line as a candidate entity.
 *
 * <p>The line is split on whitespace and punctuation, stop words are dropped,
 * and the remaining words are lower-cased. Each maximal run of words that is
 * not interrupted by a stop word yields all of its contiguous sub-sequences,
 * e.g. {@code "New York City"} produces {@code "new"}, {@code "new york"},
 * {@code "new york city"}, {@code "york"}, {@code "york city"} and
 * {@code "city"}.
 *
 * @author semanticvoid
 */
public class ConsecutiveWordsEntityExtractor implements IEntityExtractor {

    /** Words that never take part in an entity and that terminate a run of words. */
    final static List<String> stopWords = Arrays.asList(
            "a", "an", "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "such",
            "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with", "most", "needs"
    );

    /**
     * One or more consecutive word delimiters: any whitespace plus the listed
     * punctuation characters.
     *
     * <p>The hyphen is deliberately placed last in the character class so it is
     * a literal. The previous pattern contained {@code )-+}, which the regex
     * engine reads as the range from ')' to '+' (i.e. ')', '*', '+'), so '-'
     * was never treated as a delimiter. '*' is listed explicitly so that every
     * character that used to split still splits.
     */
    private static final Pattern DELIMITERS =
            Pattern.compile("[\\s:;'\"?/><,.!@#$%^&()*+=~`{}|-]+");

    /**
     * Extracts all candidate entities from a line of text.
     *
     * @param line the text to analyse; may be {@code null}
     * @return the candidate entities in order of appearance (duplicates are
     *         kept), an empty list if the line holds no usable words, or
     *         {@code null} if {@code line} is {@code null}
     * @see com.personalityextractor.entity.extractor.IEntityExtractor#extract(java.lang.String)
     */
    @Override
    public ArrayList<String> extract(String line) {
        if (line == null) {
            return null;
        }

        ArrayList<String> allEntities = new ArrayList<String>();
        // Words collected since the last stop word (or since the start of the line).
        List<String> run = new ArrayList<String>();

        for (String word : DELIMITERS.split(line)) {
            // split() yields a leading empty token when the line starts with a
            // delimiter; like a stop word, it simply ends the current run.
            if (word.length() == 0 || isStopWord(word)) {
                allEntities.addAll(formEntities(run));
                run.clear();
                continue;
            }
            // Locale.ROOT keeps the result independent of the JVM default
            // locale (e.g. "I" must not become a dotless i under Turkish).
            run.add(word.toLowerCase(Locale.ROOT));
        }

        // Flush the run that reaches the end of the line.
        allEntities.addAll(formEntities(run));
        return allEntities;
    }

    /**
     * Tells whether a word is one of the {@link #stopWords}, ignoring case.
     *
     * @param word the word to check
     * @return {@code true} if the word matches a stop word case-insensitively
     */
    private static boolean isStopWord(String word) {
        for (String stopWord : stopWords) {
            if (word.equalsIgnoreCase(stopWord)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds every contiguous sub-sequence of the given words, joined by
     * single spaces. For each start index the sub-sequences are emitted from
     * shortest to longest.
     *
     * @param words a run of consecutive words; may be empty
     * @return the joined sub-sequences; empty if {@code words} is empty
     */
    private ArrayList<String> formEntities(List<String> words) {
        ArrayList<String> entities = new ArrayList<String>();

        for (int start = 0; start < words.size(); start++) {
            StringBuilder phrase = new StringBuilder();
            for (int end = start; end < words.size(); end++) {
                if (end > start) {
                    phrase.append(' ');
                }
                phrase.append(words.get(end));
                entities.add(phrase.toString());
            }
        }

        return entities;
    }

    /**
     * Small manual demo: prints the entities extracted from a few sample
     * sentences, one per line, wrapped in single quotes.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        IEntityExtractor e = new ConsecutiveWordsEntityExtractor();

        List<String> sentences = Arrays.asList(
                "Rest in Peace!",
                "New blog post: 50 days with Google Nexus S: http://www.venu.in/blog/?p=314",
                "@dpolice Hard to say. If the user is geeky - Nexus S . Otherwise iPhone 4 . :) Both are great phones.",
                "About to embark on the unthinkable... Driving to New York City. Wish me luck.",
                "Best part of The Hurt Locker ? The lack of background music! Silence speaks quite loudly in this movie.",
                "I'm playing the Age of Empires.",
                "iTunes / ipod ecosystem needs to learn a thing or two from Doggcatcher. Seriously. This is the best solution for podcast listeners out there.",
                "loved India New Land of Opportunity on Boxee http://bit.ly/ghYcfj",
                "@vjvegi Why this comment about Pakistan all of a sudden? :)",
                "Swapped the Elantra with a Santa Fe to deal with all that snow on the roads."
        );

        for (String sentence : sentences) {
            List<String> entities = e.extract(sentence);
            for (String entity : entities) {
                System.out.println("'" + entity + "'");
            }
        }
    }
}