package net.yacy.data.ymark; import java.io.IOException; import java.net.MalformedURLException; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ArrayBlockingQueue; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser.Failure; import net.yacy.document.SentenceReader; import net.yacy.document.WordTokenizer; import net.yacy.kelondro.data.word.Word; import net.yacy.repository.LoaderDispatcher; public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler { private static final String EMPTY_STRING = new String(); public final static String SPACE = " "; public final static String POISON = ""; public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo", "and", "with", "the", "gt", "lt")); private final ArrayBlockingQueue<String> bmkQueue; private final YMarkTables ymarks; private final String bmk_user; private final LoaderDispatcher loader; private final boolean merge; public YMarkAutoTagger(final ArrayBlockingQueue<String> bmkQueue, final LoaderDispatcher loader, final YMarkTables ymarks, final String bmk_user, final boolean merge) { this.bmkQueue = bmkQueue; this.ymarks = ymarks; this.bmk_user = bmk_user; this.loader = loader; this.merge = merge; } public YMarkAutoTagger(final LoaderDispatcher loader, final YMarkTables ymarks, final String bmk_user) { this.bmkQueue = new ArrayBlockingQueue<String>(1); this.ymarks = ymarks; this.bmk_user = bmk_user; this.loader = loader; this.merge = true; } private static Document loadDocument(final String url, final LoaderDispatcher loader, ClientIdentification.Agent agent) throws IOException { DigestURL uri; Response response; try { uri = new DigestURL(url); } catch (final MalformedURLException e) { ConcurrentLog.warn(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to malformed url: "+url); return null; } response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, agent); try { return Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); } catch (final Failure e) { ConcurrentLog.warn(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to a parser failure for url: "+url); return null; } } public static String autoTag(final Document document, final int max, final TreeMap<String, YMarkTag> tags) { final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>(); StringBuilder token; if(document == null) { return EMPTY_STRING; } //get words from document final Map<String, Word> words = new Condenser(document, null, true, true, LibraryProvider.dymLib, false, false, 0).words(); // generate potential tags from document title, description and subject final int bufferSize = document.dc_title().length() + document.dc_description().length + document.dc_subject(' ').length() + 32; final StringBuilder buffer = new StringBuilder(bufferSize); final StringBuilder pwords = new StringBuilder(1000); buffer.append(document.dc_title().toLowerCase()); for (String s:document.dc_description()) buffer.append(s.toLowerCase()); buffer.append(document.dc_subject(' ').toLowerCase()); int score = 0; // get phrases final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2); phrases.putAll(getPhrases(document, 3)); final Iterator<String> iter = phrases.keySet().iterator(); while(iter.hasNext()) { score = 10; final String phrase = iter.next(); if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) { score = phrases.get(phrase).size() * CommonPattern.SPACE.split(phrase).length * 20; } if(isDigitSpace(phrase)) { score = 10; } if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) { score = score * 10; } if (tags.containsKey(phrase)) { score = score * 20; } topwords.add(new YMarkTag(phrase, score)); pwords.append(phrase); pwords.append(' '); } // loop through potential tag and rank them WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib); try { while (tokens.hasMoreElements()) { score = 0; token = tokens.nextElement(); // check if the token appears in the text if (words.containsKey(token.toString())) { final Word word = words.get(token.toString()); // token appears in text and matches an existing bookmark tag if (tags.containsKey(token.toString())) { score = word.occurrences() * tags.get(token.toString()).size() * 200; } // token appears in text and has more than 3 characters else if (token.length()>3) { score = word.occurrences() * 100; } // if token is already part of a phrase, reduce score if(pwords.toString().indexOf(token.toString())>1) { score = score / 3; } topwords.add(new YMarkTag(token.toString(), score)); } } } finally { tokens.close(); tokens = null; } score = 0; buffer.setLength(0); for(final YMarkTag tag : topwords) { if(score < max) { if(tag.size() > 100) { buffer.append(tag.name()); buffer.append(YMarkUtil.TAGS_SEPARATOR); score++; } } else { break; } } final String clean = YMarkUtil.cleanTagsString(buffer.toString()); if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) { return MultiProtocolURL.getFileExtension(document.dc_source().getFileName()); } return clean; } private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) { final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>(); final StringBuilder phrase = new StringBuilder(128); WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib); try { StringBuilder token; int count = 0; // loop through text while(tokens.hasMoreElements()) { token = tokens.nextElement(); if(stopwords.contains(token.toString()) || isDigitSpace(token.toString())) continue; // if we have a full phrase, delete the first token count++; if(count > size) phrase.delete(0, phrase.indexOf(SPACE)+1); // append new token if(phrase.length() > 1) phrase.append(SPACE); phrase.append(token); if(count >= size) { // make sure we really have a phrase if(phrases.containsKey(phrase.toString())) { phrases.get(phrase.toString()).inc(); } else { phrases.put(phrase.toString(), new YMarkTag(phrase.toString())); } } } return phrases; } finally { tokens.close(); tokens = null; } } public static String autoTag(final String url, final LoaderDispatcher loader, ClientIdentification.Agent agent, final int max, final TreeMap<String, YMarkTag> tags) { Document document = null; String exception = "/IOExceptions"; try { document = loadDocument(url, loader, agent); } catch (final IOException e) { exception = e.getMessage(); int start = exception.indexOf('\'')+9; int end = exception.indexOf('\'', start); if(start >= 0 && end > 0 && start < exception.length() && end < exception.length()) exception = "/IOExceptions/" + exception.substring(start, end); } return (document != null) ? autoTag(document, max, tags) : exception; } public static boolean isDigitSpace(String str) { if (str == null) { return false; } int sz = str.length(); for (int i = 0; i < sz; i++) { if ((Character.isDigit(str.charAt(i)) == false) && (str.charAt(i) != ' ')) { return false; } } return true; } @Override public void run() { Thread.currentThread().setUncaughtExceptionHandler(this); String url = null; String tagString; Iterator<String> tit; try { final TreeMap<String, YMarkTag> tags = this.ymarks.getTags(this.bmk_user); while((url = this.bmkQueue.take()) != POISON) { tagString = autoTag(url, this.loader, ClientIdentification.yacyInternetCrawlerAgent, 5, tags); if (tagString.startsWith("/IOExceptions")) { this.ymarks.addFolder(this.bmk_user, url, tagString); tagString = ""; } // update tags this.ymarks.addTags(this.bmk_user, url, tagString, this.merge); // update tags tit = YMarkUtil.keysStringToSet(tagString).iterator(); while(tit.hasNext()) { final String tag = tit.next(); if(tags.containsKey(tag)) { tags.get(tag).inc(); } else { tags.put(tag, new YMarkTag(tag)); } } } } catch (final InterruptedException e) { ConcurrentLog.logException(e); } catch (final IOException e) { ConcurrentLog.warn(YMarkTables.BOOKMARKS_LOG.toString(), "autoTagger - IOException for URL: "+url); } finally { } } @Override public void uncaughtException(final Thread t, final Throwable e) { ConcurrentLog.warn(YMarkTables.BOOKMARKS_LOG, "I caught an uncaughtException in thread "+t.getName()); ConcurrentLog.logException(e); } }