package bg.bozho.ikratko.other;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.collections4.trie.PatriciaTrie;
import bg.bozho.ikratko.Checker;
import static bg.bozho.ikratko.Checker.*;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import com.sun.syndication.io.XmlReader;
/**
 * Command-line tool that compares the vocabulary of several Bulgarian news sites.
 *
 * <p>The RSS feed of every site is polled a fixed number of times. The descriptions of
 * all entries not seen before are split into words, and for every site the tool prints
 * the number of dictionary words, the number of distinct base forms ("roots") those
 * words map to, the roots/words ratio and the number of entries processed.
 */
public class NewsSitesVocabulary {

    /** How many times every feed is polled. */
    private static final int POLL_ROUNDS = 10;

    /** Pause between two polling rounds (30 minutes). */
    private static final long POLL_INTERVAL_MILLIS = 30L * 60 * 1000;

    /**
     * Contents of a regex character class matching a Unicode-aware "word" character.
     * Shared by the two patterns below so that they agree on what a word is.
     */
    private static final String WORD_CHARS =
            "\\pL\\pM\\p{Nd}\\p{Nl}\\p{Pc}[\\p{InEnclosedAlphanumerics}&&\\p{So}]";

    /** One or more non-word characters; used to split the collected text into words. */
    private static final Pattern NON_WORD_RUN = Pattern.compile("[^" + WORD_CHARS + "]+");

    /**
     * A trailing, possibly cut-off word followed by an ellipsis ("..." or U+2026) at the
     * very end of a description. Anchored at the end so that only the final fragment is
     * removed and the rest of the text is left untouched.
     */
    private static final Pattern TRUNCATED_TAIL =
            Pattern.compile(" [" + WORD_CHARS + "]*(?:\\.{3}|\u2026)\\s*$");

    /** Site display name -> RSS feed URL, in the order the report is printed. */
    private static final Map<String, String> feeds = Maps.newLinkedHashMap();
    static {
        feeds.put("24 часа", "http://www.24chasa.bg/Rss.asp");
        feeds.put("Блиц", "http://www.blitz.bg/rss.php?news");
        feeds.put("БиНюз", "http://www.bnews.bg/rss.php");
        feeds.put("Вести", "http://www.vesti.bg/rss");
        feeds.put("Дарик нюз", "http://dariknews.bg/rss.php");
        feeds.put("Дневник", "http://www.dnevnik.bg/rss/");
        feeds.put("Днес.бг", "http://rss.dnes.bg/c/33162/f/539026/index.rss?today");
        feeds.put("Капитал", "http://www.capital.bg/rss/");
        feeds.put("Монитор", "http://monitor.bg/rss?id=1");
        feeds.put("ПИК", "http://pik.bg/rss/index/2");
        feeds.put("Сега", "http://www.segabg.com/rss20.xml");
        feeds.put("Стандарт", "http://www.standartnews.com/rss.php?p=1");
        feeds.put("Труд", "http://www.trud.bg/rss.asp");
        feeds.put("Хроникъл", "http://chronicle.bg/feed/");
    }

    /**
     * Polls all feeds {@link #POLL_ROUNDS} times, pausing {@link #POLL_INTERVAL_MILLIS}
     * between rounds, and prints the per-site vocabulary report to standard output.
     *
     * @param args ignored
     * @throws Exception if initialization fails or the thread is interrupted while
     *         waiting between rounds; failures of individual feeds are only logged
     */
    public static void main(String[] args) throws Exception {
        new Checker().initialize();
        PatriciaTrie<String> forms = getFormsDictionaryReferencingBaseForm();
        System.out.println(StringUtils.rightPad("Издание", 30) + "думи |корени| к/д | статии");

        // URIs of entries already processed; shared by all sites and all rounds
        Set<String> visited = Sets.newHashSet();
        Map<String, List<String>> accumulatedWords = Maps.newLinkedHashMap();
        Map<String, Integer> entriesCounts = Maps.newHashMap();
        for (String site : feeds.keySet()) {
            accumulatedWords.put(site, Lists.<String>newArrayList());
            entriesCounts.put(site, 0);
        }

        for (int round = 0; round < POLL_ROUNDS; round++) {
            for (Map.Entry<String, String> feed : feeds.entrySet()) {
                String site = feed.getKey();
                try {
                    FeedBatch batch = fetchNewEntries(feed.getValue(), visited);
                    // Commit only after the whole feed was processed, so that a failure
                    // half-way does not mark entries as seen without recording their text.
                    visited.addAll(batch.uris);
                    accumulatedWords.get(site).addAll(Arrays.asList(NON_WORD_RUN.split(batch.text)));
                    entriesCounts.put(site, entriesCounts.get(site) + batch.entriesCount);
                } catch (Exception ex) {
                    // best effort: a broken feed must not abort the whole run
                    System.err.println("Failed to process feed of " + site + " (" + feed.getValue() + ")");
                    ex.printStackTrace();
                }
            }
            // no point in waiting after the final round - the report can be printed right away
            if (round < POLL_ROUNDS - 1) {
                Thread.sleep(POLL_INTERVAL_MILLIS);
            }
        }

        printReport(accumulatedWords, entriesCounts, forms);
    }

    /** Result of reading one feed once: the entries that were not seen before. */
    private static final class FeedBatch {
        /** URIs of the entries encountered for the first time in this read. */
        final Set<String> uris;
        /** Space-separated descriptions of the new entries. */
        final String text;
        /** Number of new entries that contributed text. */
        final int entriesCount;

        FeedBatch(Set<String> uris, String text, int entriesCount) {
            this.uris = uris;
            this.text = text;
            this.entriesCount = entriesCount;
        }
    }

    /**
     * Downloads and parses the feed at {@code url} and collects the descriptions of the
     * entries whose URI is not in {@code visited}. {@code visited} is not modified.
     *
     * @param url the feed URL
     * @param visited URIs of entries processed earlier
     * @return the new entries' URIs, their concatenated text and their count
     * @throws Exception if the feed cannot be downloaded or parsed
     */
    private static FeedBatch fetchNewEntries(String url, Set<String> visited) throws Exception {
        SyndFeed feed;
        // XmlReader detects the encoding declared by the feed instead of decoding the
        // bytes with the platform default charset.
        try (InputStream in = new URL(url).openStream(); XmlReader reader = new XmlReader(in)) {
            feed = new SyndFeedInput().build(reader);
        }

        // SyndFeed.getEntries() returns a raw List
        @SuppressWarnings("unchecked")
        List<SyndEntry> entries = feed.getEntries();

        Set<String> newUris = Sets.newHashSet();
        StringBuilder text = new StringBuilder();
        int entriesCount = 0;
        for (SyndEntry entry : entries) {
            String uri = entry.getUri();
            // skip entries seen in earlier reads as well as duplicates within this feed
            if (visited.contains(uri) || !newUris.add(uri)) {
                continue;
            }
            String description = entry.getDescription() == null ? null : entry.getDescription().getValue();
            if (description == null) {
                // nothing to analyse for entries without a description
                continue;
            }
            entriesCount++;
            // add the text, stripping the last unfinished word and the ellipsis after it
            text.append(' ').append(TRUNCATED_TAIL.matcher(description).replaceFirst(""));
        }
        return new FeedBatch(newUris, text.toString(), entriesCount);
    }

    /**
     * Prints one report line per site: words counted, distinct roots, roots/words ratio
     * and number of entries.
     *
     * <p>A word is counted only if it is not blank, starts with a lowercase letter and is
     * a key of {@code Checker.formsDictionary}.
     */
    private static void printReport(Map<String, List<String>> accumulatedWords,
            Map<String, Integer> entriesCounts, PatriciaTrie<String> forms) {
        for (Map.Entry<String, List<String>> siteWords : accumulatedWords.entrySet()) {
            String site = siteWords.getKey();
            Set<String> roots = Sets.newHashSet();
            int totalWords = 0;
            for (String word : siteWords.getValue()) {
                if (StringUtils.isNotBlank(word)
                        && Character.isLowerCase(word.charAt(0))
                        && Checker.formsDictionary.containsKey(word)) {
                    totalWords++;
                    // NOTE(review): forms.get(word) returns null when the trie has no
                    // mapping for the word; that null is then counted as one root.
                    roots.add(forms.get(word));
                }
            }
            // avoid printing NaN for a site that yielded no known words
            double ratio = totalWords == 0 ? 0.0 : (double) roots.size() / totalWords;
            System.out.println(StringUtils.rightPad(site, 30)
                    + StringUtils.rightPad(String.valueOf(totalWords), 4) + " | "
                    + StringUtils.rightPad(String.valueOf(roots.size()), 4) + " | "
                    + String.format("%.2f", ratio) + " | "
                    + entriesCounts.get(site));
        }
    }

    /**
     * Builds a trie that maps every word form to its base form.
     *
     * <p>Calls {@code Checker.load()} and then walks {@code Checker.dictionary}
     * (base form -> inflection class names). For every inflection class, each ending the
     * base form ends with is replaced by the suffixes registered for that ending, and
     * every resulting form is mapped to the base form. Base forms with no inflection
     * classes, or with an unknown inflection class, map to themselves. Finally all forms
     * of the verb "to be" are mapped to "съм", overriding earlier mappings.
     *
     * <p>Side effect: sets {@code Checker.dictionary} to {@code null} before returning.
     *
     * @return the trie from word form to base form
     */
    public static PatriciaTrie<String> getFormsDictionaryReferencingBaseForm() {
        PatriciaTrie<String> trie = new PatriciaTrie<>();
        load();
        for (Map.Entry<String, Set<String>> word : dictionary.entrySet()) {
            String baseForm = word.getKey();
            Set<String> classNames = word.getValue();
            if (classNames.isEmpty()) {
                trie.put(baseForm, baseForm);
                continue;
            }
            for (String inflectionClass : classNames) {
                Multimap<String, String> inflections = inflectionClasses.get(inflectionClass);
                if (inflections == null) {
                    trie.put(baseForm, baseForm);
                    continue;
                }
                for (String ending : inflections.keySet()) {
                    if (!baseForm.endsWith(ending)) {
                        continue;
                    }
                    // the part of the base form that precedes the matched ending
                    String stem = baseForm.substring(0, baseForm.length() - ending.length());
                    trie.put(baseForm, baseForm);
                    for (String suffix : inflections.get(ending)) {
                        trie.put(stem + suffix, baseForm);
                    }
                }
            }
        }
        // override the forms of the verb "to be"
        for (String sgForm : toBeFormsSg) {
            trie.put(sgForm, "съм");
        }
        for (String plForm : toBeFormsPl) {
            trie.put(plForm, "съм");
        }
        dictionary = null; // eligible for GC. TODO can merge these two load methods, but it's easier not to, for now
        return trie;
    }
}