// License: GPL. For details, see LICENSE file. package org.openstreetmap.josm.plugins.osmrec.features; import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.lucene.analysis.el.GreekAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.util.Version; import org.openstreetmap.josm.plugins.osmrec.container.OSMWay; import org.openstreetmap.josm.plugins.osmrec.extractor.LanguageDetector; import com.cybozu.labs.langdetect.LangDetectException; import de.bwaldvogel.liblinear.FeatureNode; /** * Constructs the textual features from the given textual list. * * @author imis-nkarag */ public class TextualFeatures { private int id; private int numberOfFeatures; private final List<String> textualList; private static String language; private final QueryParser greekParser; private final QueryParser englishParser; private final LanguageDetector languageDetector; public TextualFeatures(int id, List<String> textualList, LanguageDetector languageDetector) { this.id = id; this.textualList = textualList; this.languageDetector = languageDetector; numberOfFeatures = textualList.size() + id; GreekAnalyzer greekAnalyzer = new GreekAnalyzer(Version.LUCENE_36); greekParser = new QueryParser(Version.LUCENE_36, "", greekAnalyzer); EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer(Version.LUCENE_36); englishParser = new QueryParser(Version.LUCENE_36, "", englishAnalyzer); } public void createTextualFeatures(OSMWay wayNode) { //namesList.indexOf(name) this index can be zero. //In that case it conflicts the previous geometry id, so we increment id. //idWords: populated with the ID that will be ginen as a feature, mapped with the word found. //Chose to store the name for future use. Map<Integer, String> idWords = new TreeMap<>(); Map<String, String> tags = wayNode.getTagKeyValue(); if (tags.keySet().contains("name")) { String nameTag = tags.get("name"); //get the value of the name tag of the current node String[] nameTagSplitList = nameTag.split("\\s"); //split the value to compare individually //with the namesList String lang = ""; try { lang = detectLanguage(nameTag); } catch (LangDetectException ex) { Logger.getLogger(TextualFeatures.class.getName()).log(Level.SEVERE, null, ex); } for (String split : nameTagSplitList) { try { //TOGGLE split = split.replaceAll("[-+.^:,?;'{}\"!()\\[\\]]", ""); if (lang.equals("el")) { split = stemGreek(split); } else { split = stemEnglish(split); } if (textualList.contains(split)) { int currentID = textualList.indexOf(split) + id; idWords.put(currentID, split); } } catch (ParseException ex) { Logger.getLogger(TextualFeatures.class.getName()).log(Level.SEVERE, null, ex); } } for (Integer wordID : idWords.keySet()) { wayNode.getFeatureNodeList().add(new FeatureNode(wordID, 1.0)); //System.out.println(wordID); } //System.out.println("until textual " + wayNode.getFeatureNodeList()); } } public int getLastID() { return numberOfFeatures; } private String detectLanguage(String nameTag) throws LangDetectException { //detect language if (!nameTag.isEmpty()) { language = languageDetector.detect(nameTag); return language; } else { return "no_lang"; } } private String stemGreek(String word) throws ParseException { String stemmedWord; if (!word.isEmpty()) { stemmedWord = greekParser.parse(word).toString(); } else { return word; } return stemmedWord; } private String stemEnglish(String word) throws ParseException { String stemmedWord; if (!word.isEmpty()) { stemmedWord = englishParser.parse(word).toString(); } else { return word; } return stemmedWord; } }