// License: GPL. For details, see LICENSE file.
package org.openstreetmap.josm.plugins.osmrec.extractor;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.Version;

/**
 * Analyzes textual information: language detection, stop words removal and stemming based on
 * the detected language.
 * Provides methods for retrieving the textual list ordered by frequency and the top-K terms.
 *
 * <p>The result accessors return empty lists until {@link #runAnalysis()} has been executed.
 *
 * @author imis-nkarag
 */
public class Analyzer {

    private static final Logger LOGGER = Logger.getLogger(Analyzer.class.getName());

    /** Classpath location of the stop words file, one stop word per line. */
    private static final String STOP_WORDS_RESOURCE = "/resources/files/stopWords.txt";

    /** Number of leading names handed to the language detector. */
    private static final int LANGUAGE_SAMPLE_SIZE = 10;

    /** Language used when no other language is detected more often, and for unknown codes. */
    private static final String DEFAULT_LANGUAGE = "en";

    /** Language codes that are counted during detection; every other detector result is ignored. */
    private static final String[] SUPPORTED_LANGUAGES = {"en", "el", "de", "es", "ru", "hi", "zh", "tr", "fr"};

    /** Punctuation characters stripped from every name; compiled once instead of once per name. */
    private static final Pattern PUNCTUATION = Pattern.compile("[-+.^:,?;'{}\"!()\\[\\]]");

    /** Orders entries by descending frequency. */
    private static final Comparator<Map.Entry<String, Integer>> BY_FREQUENCY_DESCENDING =
            new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    return o2.getValue().compareTo(o1.getValue());
                }
            };

    // Shared by all instances and filled by loadStopWords() without synchronization.
    // TODO: add the greek list to the same file.
    private static final Set<String> stopWordsList = new HashSet<>();

    private final String osmFilePath;
    private final LanguageDetector languageDetector;

    // Starts out empty (not null) so the accessors are safe to call before runAnalysis().
    private List<Map.Entry<String, Integer>> frequencies = new ArrayList<>();

    /**
     * Creates an analyzer for the given OSM file.
     *
     * @param osmFilePath path handed to the {@code FrequenceExtractor} in {@link #runAnalysis()}
     * @param languageDetector detector used to determine the dominant language of the names
     */
    public Analyzer(String osmFilePath, LanguageDetector languageDetector) {
        this.osmFilePath = osmFilePath;
        this.languageDetector = languageDetector;
    }

    /**
     * Extracts the name frequencies from the OSM file, drops stop words, normalizes and stems the
     * remaining names with the analyzer of the dominant language, and stores the result sorted by
     * descending frequency.
     */
    public void runAnalysis() {
        FrequenceExtractor frequenceExtractor = new FrequenceExtractor(osmFilePath);
        frequenceExtractor.parseDocument();
        Set<Map.Entry<String, Integer>> frequencyEntries = frequenceExtractor.getFrequency().entrySet();

        loadStopWords();

        List<String> sampleList = new ArrayList<>();
        List<Map.Entry<String, Integer>> normalizedList = new ArrayList<>();
        for (Map.Entry<String, Integer> frequencyEntry : frequencyEntries) {
            String name = frequencyEntry.getKey();
            // The first names (stop words included) serve as the language detection sample.
            if (sampleList.size() < LANGUAGE_SAMPLE_SIZE) {
                sampleList.add(name);
            }
            // NOTE(review): the stop word lookup uses the raw name, i.e. before lower-casing and
            // punctuation removal - confirm this matches the casing used in the stop words file.
            if (!stopWordsList.contains(name)) {
                normalizedList.add(new SimpleEntry<>(normalize(name), frequencyEntry.getValue()));
            }
        }

        String dominantLanguage = detectDominantLanguage(sampleList);
        List<Map.Entry<String, Integer>> stemmedList = stem(normalizedList, dominantLanguage);
        Collections.sort(stemmedList, BY_FREQUENCY_DESCENDING);
        setFrequencies(stemmedList);
    }

    /**
     * Lower-cases a name and strips punctuation from it.
     *
     * @param name raw name
     * @return normalized name, possibly empty
     */
    private static String normalize(String name) {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. "I" must not become a dotless i when running under a Turkish locale).
        String lowerCased = name.toLowerCase(Locale.ROOT);
        return PUNCTUATION.matcher(lowerCased).replaceAll("");
    }

    /**
     * Runs the language detector on every non-empty sample and returns the supported language
     * that was detected most often.
     *
     * @param samples names used for detection
     * @return code of the dominant language; {@code "en"} unless another supported language was
     *         detected strictly more often
     */
    private String detectDominantLanguage(List<String> samples) {
        Map<String, Integer> langs = new HashMap<>();
        for (String supported : SUPPORTED_LANGUAGES) {
            langs.put(supported, 0);
        }

        for (String word : samples) {
            if (word.isEmpty()) {
                continue;
            }
            String lang = languageDetector.detect(word);
            // Unsupported codes (and a null result) have no counter and are skipped.
            Integer count = langs.get(lang);
            if (count != null) {
                langs.put(lang, count + 1);
            }
        }

        String dominantLanguage = DEFAULT_LANGUAGE;
        int maxLangFreq = langs.get(DEFAULT_LANGUAGE);
        for (Entry<String, Integer> lang : langs.entrySet()) {
            if (lang.getValue() > maxLangFreq) {
                maxLangFreq = lang.getValue();
                dominantLanguage = lang.getKey();
            }
        }
        return dominantLanguage;
    }

    /**
     * Creates the Lucene analyzer for a language code.
     *
     * @param language language code such as {@code "en"} or {@code "el"}
     * @return the matching analyzer; a {@code StandardAnalyzer} for {@code "zh"} and an
     *         {@code EnglishAnalyzer} for {@code "en"} and any unknown code
     */
    private static org.apache.lucene.analysis.Analyzer createAnalyzer(String language) {
        switch (language) {
            case "el":
                return new GreekAnalyzer(Version.LUCENE_36);
            case "de":
                return new GermanAnalyzer(Version.LUCENE_36);
            case "es":
                return new SpanishAnalyzer(Version.LUCENE_36);
            case "ru":
                return new RussianAnalyzer(Version.LUCENE_36);
            case "fr":
                return new FrenchAnalyzer(Version.LUCENE_36);
            case "zh":
                return new StandardAnalyzer(Version.LUCENE_36);
            case "tr":
                return new TurkishAnalyzer(Version.LUCENE_36);
            case "hi":
                return new HindiAnalyzer(Version.LUCENE_36);
            case "en":
            default:
                return new EnglishAnalyzer(Version.LUCENE_36);
        }
    }

    /**
     * Stems every non-empty term by parsing it with a query parser backed by the analyzer of the
     * given language. Terms whose parsed form is empty are dropped; terms that cannot be parsed
     * are logged and skipped.
     *
     * @param normalizedList entries of normalized term and frequency
     * @param language language code selecting the analyzer
     * @return new list of stemmed term and frequency entries, in input order
     */
    private static List<Map.Entry<String, Integer>> stem(
            List<Map.Entry<String, Integer>> normalizedList, String language) {
        List<Map.Entry<String, Integer>> stemmedList = new ArrayList<>();
        // The analyzer is Closeable; release it once all terms have been processed.
        try (org.apache.lucene.analysis.Analyzer analyzer = createAnalyzer(language)) {
            QueryParser parser = new QueryParser(Version.LUCENE_36, "", analyzer);
            for (Map.Entry<String, Integer> entry : normalizedList) {
                String term = entry.getKey();
                if (term.isEmpty()) {
                    continue;
                }
                try {
                    String stemmedWord = parser.parse(term).toString();
                    if (!stemmedWord.isEmpty()) {
                        SimpleEntry<String, Integer> stemmed = new SimpleEntry<>(stemmedWord, entry.getValue());
                        stemmedList.add(stemmed);
                    }
                } catch (ParseException ex) {
                    LOGGER.log(Level.SEVERE, "Could not stem term: " + term, ex);
                }
            }
        }
        return stemmedList;
    }

    /**
     * Adds every line of the stop words resource to the shared stop words set.
     * A missing or unreadable resource is logged; the analysis then continues with whatever
     * stop words are already present in the set.
     */
    private void loadStopWords() {
        InputStream stream = Analyzer.class.getResourceAsStream(STOP_WORDS_RESOURCE);
        if (stream == null) {
            // getResourceAsStream signals a missing resource with null, not with an exception.
            LOGGER.log(Level.WARNING, "Stop words resource not found: {0}", STOP_WORDS_RESOURCE);
            return;
        }
        // Explicit charset so non-ASCII stop words do not depend on the platform default.
        // NOTE(review): assumes the resource is stored as UTF-8 - confirm.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
            String strLine;
            while ((strLine = br.readLine()) != null) {
                stopWordsList.add(strLine);
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Could not read stop words resource " + STOP_WORDS_RESOURCE, ex);
        }
    }

    private void setFrequencies(List<Map.Entry<String, Integer>> frequencies) {
        this.frequencies = frequencies;
    }

    /**
     * Returns all analyzed terms with their frequency.
     *
     * @return unmodifiable list sorted by descending frequency; empty before {@link #runAnalysis()}
     */
    public List<Map.Entry<String, Integer>> getFrequencies() {
        return Collections.unmodifiableList(frequencies);
    }

    /**
     * Returns the {@code topK} most frequent terms.
     *
     * @param topK maximum number of entries to return, must not be negative
     * @return unmodifiable list of at most {@code topK} entries, most frequent first
     * @throws IllegalArgumentException if {@code topK} is negative
     */
    public List<Map.Entry<String, Integer>> getTopKMostFrequent(int topK) {
        if (topK < 0) {
            throw new IllegalArgumentException("topK must not be negative: " + topK);
        }
        if (topK >= frequencies.size()) {
            return Collections.unmodifiableList(frequencies);
        }
        // Wrap the view so callers cannot modify the internal list through it.
        return Collections.unmodifiableList(frequencies.subList(0, topK));
    }

    /**
     * Returns the terms that occur more often than the given threshold.
     *
     * @param minFrequency exclusive lower bound: only entries with a strictly greater frequency
     *        are returned. NOTE(review): the parameter name suggests an inclusive bound - confirm.
     * @return new list of the matching entries, most frequent first
     */
    public List<Map.Entry<String, Integer>> getWithFrequency(int minFrequency) {
        List<Map.Entry<String, Integer>> withFrequency = new ArrayList<>();
        for (Map.Entry<String, Integer> entry : frequencies) {
            // The list is sorted by descending frequency, so the first miss ends the search.
            if (entry.getValue() <= minFrequency) {
                break;
            }
            withFrequency.add(entry);
        }
        return withFrequency;
    }
}