/**
 * TokenizedStringNavigator.java
 * (C) 2017 by reger24; https://github.com/reger24
 *
 * This is a part of YaCy, a peer-to-peer based web search engine
 *
 * LICENSE
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.
 * If not, see <http://www.gnu.org/licenses/>.
 */
package net.yacy.search.navigator;

import java.util.Collection;
import java.util.Locale;
import java.util.StringTokenizer;

import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;

/**
 * Search navigator for string entries based on ScoreMap to count and
 * order the result list by counted occurrence. The string values are tokenized
 * and each word is added (lowercased) to the score map.
 */
public class TokenizedStringNavigator extends StringNavigator implements Navigator {

    /** Characters that separate tokens inside a field value: space, comma and semicolon. */
    private static final String TOKEN_DELIMITERS = " ,;";

    /**
     * Creates a navigator for the given field.
     *
     * @param title title handed to the parent navigator
     * @param field schema field whose values are tokenized and counted
     */
    public TokenizedStringNavigator(String title, CollectionSchema field) {
        super(title, field);
    }

    /**
     * Increase the score for the key values contained in the defined field in
     * the doc. The value string is tokenized using delimiter " ,;".
     * <p>
     * The field value may be a single String or a Collection; in a Collection
     * only String elements are considered. Values of any other type are
     * ignored rather than cast, so a non-String field value does not cause a
     * ClassCastException. Nothing happens if no field is configured or the
     * doc has no value for it.
     *
     * @param doc Solrdocument with field for the key content
     */
    @Override
    public void incDoc(URIMetadataNode doc) {
        if (field == null) {
            return;
        }
        final Object val = doc.getFieldValue(field.getSolrFieldName());
        if (val instanceof Collection) {
            for (final Object obj : (Collection<?>) val) {
                if (obj instanceof String) {
                    incTokens((String) obj);
                }
            }
        } else if (val instanceof String) {
            // instanceof also rejects null, so no separate null check is needed
            incTokens((String) val);
        }
    }

    /**
     * Splits the text into lowercased tokens and increases the score of every
     * token that is longer than one character and not contained in
     * {@link Switchboard#stopwords}.
     *
     * @param text raw field value, must not be null
     */
    private void incTokens(final String text) {
        if (text.isEmpty()) {
            return;
        }
        // Locale.ROOT keeps the lowercased keys independent of the JVM default
        // locale (e.g. Turkish would otherwise map 'I' to a dotless 'i').
        // StringTokenizer faster than regex pattern
        final StringTokenizer tokens = new StringTokenizer(text.toLowerCase(Locale.ROOT), TOKEN_DELIMITERS);
        while (tokens.hasMoreTokens()) {
            final String word = tokens.nextToken();
            if (word.length() > 1 && !Switchboard.stopwords.contains(word)) {
                this.inc(word);
            }
        }
    }
}