package com.s24.wiki.exports;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;

import com.s24.wiki.EnglishGrammarPageParser;
import com.s24.wiki.EnglishWikiParser;
import com.s24.wiki.PageParserCallback;

/**
 * Command line tool that runs an {@link EnglishWikiParser} over a dump file and
 * exports the pairs reported by an {@link EnglishGrammarPageParser} as a SQL
 * script for the <code>normalization</code> table.
 * <p>
 * The generated script first deletes all rows of type
 * <code>stemming-wiktionary-irregular-en</code> and then inserts one row per
 * collected mapping, ordered by the mapping key.
 *
 * @author Shopping24 GmbH, Torsten Bøgh Köster (@tboeghk)
 */
public class SolrEnglishSynonyms {

   /** Path of the generated SQL script, relative to the working directory. */
   private static final String stem = "target/stem_en.sql";

   /**
    * Parses the dump and writes the SQL script.
    *
    * @param args
    *           <code>args[0]</code> is handed to {@link EnglishWikiParser#parse}
    *           as the dump to read
    * @throws IllegalArgumentException
    *            if no dump argument was given
    * @throws IllegalStateException
    *            if the SQL script could not be written
    */
   public static void main(String[] args) {
      if (args == null || args.length < 1) {
         throw new IllegalArgumentException("Usage: SolrEnglishSynonyms <wiki dump>");
      }

      // configure sinks; a TreeMap keeps the output sorted by key
      final File stemout = new File(stem);
      final Map<String, String> stemmap = new TreeMap<String, String>();

      // create parser & add parser callbacks
      EnglishWikiParser wp = new EnglishWikiParser();
      // NOTE(review): the meaning of the trailing 'false' flag is defined by
      // EnglishGrammarPageParser and is not visible from this file.
      wp.addParser(new EnglishGrammarPageParser(new PageParserCallback() {
         @Override
         public void callback(List<String> left, List<String> right) {
            // Only keep pairs where the left side has more entries than the
            // right side. The right side must not be empty, because its first
            // entry becomes the mapping value.
            if (left.size() > right.size() && !right.isEmpty()) {
               stemmap.put("\"" + StringUtils.join(left, "\",\"") + "\"", right.get(0));
            }
         }
      }, false));

      // parse dump
      wp.parse(args[0]);

      // write stems in a single pass instead of re-opening the file per row
      try {
         FileUtils.writeStringToFile(stemout, buildSql(stemmap), "utf-8");
      } catch (IOException e) {
         // do not report success when the export could not be written
         throw new IllegalStateException("Could not write " + stemout.getPath(), e);
      }
   }

   /**
    * Renders the complete SQL script: one delete statement followed by one
    * insert statement per map entry, in the iteration order of the map.
    */
   private static String buildSql(Map<String, String> stems) {
      StringBuilder sql = new StringBuilder();
      sql.append("delete from normalization where type='stemming-wiktionary-irregular-en';\n");

      for (Map.Entry<String, String> entry : stems.entrySet()) {
         sql.append("insert into normalization (type, origin, normalization, modified_by, modified_at, active) values ('stemming-wiktionary-irregular-en', '{")
               .append(escapeSqlLiteral(entry.getKey()))
               .append("}','")
               .append(escapeSqlLiteral(entry.getValue()))
               .append("','TorstenKoester',now(),true);\n");
      }
      return sql.toString();
   }

   /**
    * Doubles single quotes so that the value can be embedded in a single-quoted
    * SQL string literal without terminating it early.
    */
   private static String escapeSqlLiteral(String value) {
      // String.valueOf mirrors the original concatenation, which rendered a
      // null value as the text "null".
      return String.valueOf(value).replace("'", "''");
   }
}