package com.personalityextractor.url.data; import java.io.BufferedReader; import java.io.FileReader; import java.util.ArrayList; import java.util.List; import com.personalityextractor.commons.data.Tweet; import com.personalityextractor.entity.WikipediaEntity; import com.personalityextractor.entity.extractor.EntityExtractFactory; import com.personalityextractor.entity.extractor.IEntityExtractor; import com.personalityextractor.entity.extractor.EntityExtractFactory.Extracter; import com.personalityextractor.entity.resolver.ViterbiResolver; import com.personalityextractor.url.HTMLParser.Readability.Readability; import cs224n.util.Counter; import cs224n.util.PriorityQueue; public class URLEntityExtractor { ViterbiResolver vr = new ViterbiResolver(); public static List<String> extractEntitiesinTitle(String urlStr, IEntityExtractor extractor) { if(extractor==null) extractor = EntityExtractFactory.produceExtractor(Extracter.NOUNPHRASE); ArrayList<String> entities = new ArrayList<String>(); String urlContent = URLContent.fetchURLContent(urlStr); if(urlContent==null) return entities; String title = URLContent.fetchTitleString(urlContent); if(title==null) return entities; Tweet t = new Tweet(title); for (String sentence : t.getSentences()) { entities.addAll(extractor.extract(sentence)); } return entities; } public static List<String> extractTopEntities(String url, IEntityExtractor extractor){ if(extractor==null) extractor = EntityExtractFactory.produceExtractor(Extracter.PROPERNOUNPHRASE); ArrayList<String> topEntities = new ArrayList<String>(); Readability read = new Readability(); String text = read.removeHTML(url); if(text==null) return topEntities; String[] lines = text.split("\n"); Counter<String> entities = new Counter<String>(); int line_count =0; for(String line : lines){ line = line.trim(); if(line.length()==0) continue; if(line_count>2) break; line_count++; System.out.println(line); List<String> ents = extractor.extract(line); System.out.println(ents); if(ents!=null){ entities.incrementAll(ents, 1.0); } } PriorityQueue<String> pq = entities.asPriorityQueue(); int count=0; double leastCount = 0; while(pq.hasNext() && count <2){ count++; leastCount = (entities.getCount(pq.next())); } for(String ent : entities.keySet()){ if(entities.getCount(ent)>=leastCount){ topEntities.add(ent); } } return topEntities; } public List<WikipediaEntity> resolveEntitiesinTitle(List<String> entities){ return vr.resolve(entities); } public List<String> readLinesinFile(String file){ List<String> lines = new ArrayList<String>(); try{ BufferedReader br = new BufferedReader(new FileReader(file)); String line=""; while((line= br.readLine())!=null){ lines.add(line.trim()); } }catch(Exception e){ e.printStackTrace(); } return lines; } public static void main(String[] args) { URLEntityExtractor uee = new URLEntityExtractor(); //System.out.println(uee.extractTopEntities("http://www.pcmag.com/article2/0,2817,2394487,00.asp#fbid=iY_0drVV-th")); } }