package com.s24.wiki.exports;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;

import com.s24.wiki.GermanGrammarPageParser;
import com.s24.wiki.GermanNounPageParser;
import com.s24.wiki.GermanSubwordPageParser;
import com.s24.wiki.GermanWikiParser;
import com.s24.wiki.PageParserCallback;

/**
 * Parses a German Wiktionary dump and exports three artifacts for Solr:
 * <ul>
 * <li>{@code target/subword_de.txt} — a Solr synonym file mapping compound
 * words to their (one level transitively expanded) subwords,</li>
 * <li>{@code target/stem_de.sql} — SQL statements refreshing grammar-based
 * stemming normalizations,</li>
 * <li>{@code target/nouns_de.sql} — SQL statements refreshing the list of
 * German nouns.</li>
 * </ul>
 *
 * @author Shopping24 GmbH, Torsten Bøgh Köster (@tboeghk)
 */
public class SolrGermanSynonyms {

   private static final String stem = "target/stem_de.sql";
   private static final String subword = "target/subword_de.txt";
   private static final String nouns = "target/nouns_de.sql";

   /** Single output charset; previously the truncating first write used the platform default. */
   private static final String OUTPUT_ENCODING = "utf-8";

   /**
    * Parses the dump named by {@code args[0]} and writes the three export
    * files into {@code target/}.
    *
    * @param args {@code args[0]} is the path to the German Wiktionary dump file.
    */
   public static void main(String[] args) {
      if (args.length < 1) {
         System.err.println("Usage: SolrGermanSynonyms <path-to-wiktionary-dump>");
         return;
      }

      // collectors filled by the parser callbacks below; TreeMaps keep keys sorted
      final Map<String, Pair<List<String>, List<String>>> subwordMap =
            new TreeMap<String, Pair<List<String>, List<String>>>();
      final Map<String, String> stemMap = new TreeMap<String, String>();
      final Collection<String> nounsFound = new HashSet<>();

      // create parser & add parser callbacks
      GermanWikiParser wp = new GermanWikiParser();
      wp.addParser(new GermanSubwordPageParser(new PageParserCallback() {
         @Override
         public void callback(List<String> left, List<String> right) {
            // keep only entries that actually decompose into more parts than the headword
            if (left.size() < right.size()) {
               subwordMap.put(left.get(0), Pair.of(left, right));
            }
         }
      }));
      wp.addParser(new GermanGrammarPageParser(new PageParserCallback() {
         @Override
         public void callback(List<String> left, List<String> right) {
            // keep only entries where several inflected forms collapse to fewer stems
            if (left.size() > right.size()) {
               stemMap.put(StringUtils.join(left, ","), StringUtils.join(right, ","));
            }
         }
      }));
      wp.addParser(new GermanNounPageParser(new PageParserCallback() {
         @Override
         public void callback(List<String> left, List<String> right) {
            nounsFound.addAll(left);
         }
      }));

      // parse dump
      wp.parse(args[0]);

      writeSubwords(subwordMap, new File(subword));
      writeStems(stemMap, new File(stem));
      writeNouns(nounsFound, new File(nouns));
   }

   /**
    * Writes a Solr synonym file ({@code left => right}) where each right-hand
    * side is expanded one level transitively: if a subword is itself a key in
    * the map, its own subwords are merged in as well.
    *
    * @param subwordMap compound word to (left terms, subword terms) pairs
    * @param out destination file, overwritten
    */
   private static void writeSubwords(Map<String, Pair<List<String>, List<String>>> subwordMap, File out) {
      try {
         SortedSet<String> keys = new TreeSet<String>(subwordMap.keySet());
         StringBuilder lines = new StringBuilder();
         for (String key : keys) {
            Pair<List<String>, List<String>> pair = subwordMap.get(key);
            // TreeSet deduplicates and sorts the expanded right-hand side
            SortedSet<String> expanded = new TreeSet<String>(pair.getRight());
            for (String part : pair.getRight()) {
               Pair<List<String>, List<String>> subPair = subwordMap.get(part);
               if (subPair != null) {
                  expanded.addAll(subPair.getRight());
               }
            }
            lines.append(StringUtils.join(pair.getLeft(), ","))
                 .append(" => ")
                 .append(StringUtils.join(expanded, ","))
                 .append('\n');
         }
         // single write instead of one append per entry; consistent utf-8 encoding
         FileUtils.writeStringToFile(out, lines.toString(), OUTPUT_ENCODING);
      } catch (IOException e) {
         e.printStackTrace();
      }
   }

   /**
    * Writes the stemming normalizations as SQL: a delete of the previous
    * import followed by one insert per (origin, normalization) pair.
    *
    * @param stemMap inflected-forms key to stem-forms value
    * @param out destination file, overwritten
    */
   private static void writeStems(Map<String, String> stemMap, File out) {
      try {
         StringBuilder sql = new StringBuilder(
               "delete from normalization where type='stemming-wiktionary-de';\n");
         for (String key : new TreeSet<String>(stemMap.keySet())) {
            sql.append("insert into normalization (type, origin, normalization, modified_by, modified_at, active) values ('stemming-wiktionary-de', '")
               .append(escapeSqlLiteral(key))
               .append("','")
               .append(escapeSqlLiteral(stemMap.get(key)))
               .append("','TorstenKoester',now(),true);\n");
         }
         FileUtils.writeStringToFile(out, sql.toString(), OUTPUT_ENCODING);
      } catch (IOException e) {
         e.printStackTrace();
      }
   }

   /**
    * Writes the collected nouns as SQL: a delete of the previous import
    * followed by one insert per noun, sorted alphabetically.
    *
    * @param nounsFound nouns gathered from the dump (unordered, deduplicated)
    * @param out destination file, overwritten
    */
   private static void writeNouns(Collection<String> nounsFound, File out) {
      try {
         List<String> sortedNouns = new ArrayList<>(nounsFound);
         Collections.sort(sortedNouns);
         StringBuilder sql = new StringBuilder(
               "delete from lists where type='noun-wiktionary-de';\n");
         for (String noun : sortedNouns) {
            sql.append("insert into lists (type, entry, modified_by, modified_at, active) values ('noun-wiktionary-de', '")
               .append(escapeSqlLiteral(noun.trim()))
               .append("','TorstenKoester',now(),true);\n");
         }
         FileUtils.writeStringToFile(out, sql.toString(), OUTPUT_ENCODING);
      } catch (IOException e) {
         e.printStackTrace();
      }
   }

   /**
    * Doubles single quotes so that wiki-derived terms cannot terminate the
    * generated SQL string literal early (the values are concatenated, not
    * bound as parameters).
    */
   private static String escapeSqlLiteral(String value) {
      return value.replace("'", "''");
   }
}