package org.adsabs.solr.analysis;

import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.solr.analysis.WriteableExplicitSynonymMap;
import org.apache.solr.analysis.WriteableSynonymMap;
import org.apache.solr.analysis.author.AuthorQueryVariations;
import org.apache.solr.analysis.author.AuthorUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool that reads groups of curated author names from a text
 * file, expands every group into a key -&gt; synonyms mapping and persists the
 * result through a {@link WriteableSynonymMap}.
 *
 * <p>Input format: one author name per line; groups are separated by blank
 * lines.
 */
public class ProcessCuratedAuthorSynonyms {

  public static final Logger log = LoggerFactory.getLogger(ProcessCuratedAuthorSynonyms.class);

  /**
   * Splits the input into groups of trimmed, non-empty lines. A blank line
   * (after trimming) ends the current group.
   *
   * <p>Empty groups are never returned: leading, trailing or repeated blank
   * lines are ignored.
   *
   * <p>If reading fails, the error is logged and the groups collected so far
   * are returned (the group being read when the failure happened is dropped).
   *
   * @param in reader positioned at the start of the curated synonym data;
   *     it is not closed by this method
   * @return the groups in input order, each group in line order
   */
  public static ArrayList<List<String>> parseGroups(BufferedReader in) {
    ArrayList<List<String>> groups = new ArrayList<List<String>>();
    List<String> current = new ArrayList<String>();
    try {
      String line;
      while ((line = in.readLine()) != null) {
        String trimmed = line.trim();
        if (trimmed.length() > 0) {
          current.add(trimmed);
        } else if (!current.isEmpty()) {
          // a blank line closes the group, but only if something was collected
          groups.add(current);
          current = new ArrayList<String>();
        }
      }
      if (!current.isEmpty()) {
        groups.add(current);
      }
    } catch (IOException e) {
      log.error("Error reading author synonym groups; returning the groups read so far", e);
    }
    return groups;
  }

  /**
   * Passes every name through {@link AuthorUtils#normalizeAuthor}.
   *
   * @param l author names to normalize
   * @return a new list holding the normalized names, in the same order as
   *     {@code l}
   */
  public static List<String> normalize(final List<String> l) {
    List<String> normalized = new ArrayList<String>(l.size());
    for (String name : l) {
      log.debug("normalizing {}", name);
      String result = AuthorUtils.normalizeAuthor(name);
      log.debug("normalized: {}", result);
      normalized.add(result);
    }
    return normalized;
  }

  /**
   * Expands one group of equivalent author names into a synonym mapping.
   *
   * <p>Steps:
   * <ol>
   *   <li>normalize every name of the group;</li>
   *   <li>extend the list with whatever
   *       {@code AuthorUtils.getAsciiTransliteratedVariants} returns for each
   *       normalized name;</li>
   *   <li>generate the variations of each of those names with
   *       {@code AuthorQueryVariations.generateSynonymVariations};</li>
   *   <li>map every name to all <em>other</em> names of the extended list
   *       plus the variations of those other names.</li>
   * </ol>
   *
   * <p>Names that cannot be parsed are logged and contribute no variations;
   * they still appear as keys and as plain synonyms of the other names.
   *
   * @param group the curated names that are considered equivalent
   * @return map from each (normalized or transliterated) name to its synonyms
   */
  public static HashMap<String,HashSet<String>> transformGroup(List<String> group) {
    log.debug("group: {}", group);

    // normalize incoming author names
    List<String> normalized = normalize(group);
    log.debug("Normalized group: {}", normalized);

    // expanded list with all auto-generated synonym possibilities
    List<String> withAutoSynonyms = new ArrayList<String>();
    for (String name : normalized) {
      withAutoSynonyms.addAll(AuthorUtils.getAsciiTransliteratedVariants(name));
      withAutoSynonyms.add(name);
    }
    log.debug("withAutoSynonyms: {}", withAutoSynonyms);

    // build a map of name -> variations to be used later
    HashMap<String,HashSet<String>> variationsMap = new HashMap<String,HashSet<String>>();
    for (String name : withAutoSynonyms) {
      HashMap<String,String> parsedAuthor;
      try {
        parsedAuthor = AuthorUtils.parseAuthor(name);
      } catch (Exception e) {
        // hand the exception to the logger so the real stack trace is recorded
        log.error("Error parsing " + name, e);
        continue;
      }
      variationsMap.put(name, AuthorQueryVariations.generateSynonymVariations(parsedAuthor));
    }
    log.debug("variations: {}", variationsMap);

    // build hashmap where keys are each author name from withAutoSynonyms
    // and values are the set of variations generated from each of the remaining names
    HashMap<String,HashSet<String>> transformed = new HashMap<String,HashSet<String>>();
    for (String synonymKey : withAutoSynonyms) {
      log.debug("working on {}", synonymKey);

      // create list that includes all of withAutoSynonyms except the current synonymKey
      List<String> theRest = new ArrayList<String>();
      for (String candidate : withAutoSynonyms) {
        if (!candidate.equals(synonymKey)) {
          theRest.add(candidate);
        }
      }
      // sorting only makes the debug output stable; the result is a set
      Collections.sort(theRest);
      log.debug("the rest: {}", theRest);

      HashSet<String> synonymValues = new HashSet<String>();
      for (String other : theRest) {
        synonymValues.add(other);
        HashSet<String> variations = variationsMap.get(other);
        if (variations != null) {
          synonymValues.addAll(variations);
        }
      }
      log.debug("synonymValues: {}", synonymValues);

      transformed.put(synonymKey, synonymValues);
    }
    return transformed;
  }

  /**
   * Reads all groups from {@code in}, transforms each of them with
   * {@link #transformGroup(List)} and persists the combined mapping.
   *
   * @param in reader supplying the curated groups; not closed by this method
   * @param outFile output location handed to {@code WriteableSynonymMap.setOutput}
   * @throws RuntimeException wrapping the {@link IOException} if persisting fails
   */
  public static void processSynonyms(BufferedReader in, String outFile) {
    WriteableSynonymMap synMap = new WriteableExplicitSynonymMap();
    synMap.setOutput(outFile);

    for (List<String> group : parseGroups(in)) {
      for (Map.Entry<String,HashSet<String>> entry : transformGroup(group).entrySet()) {
        synMap.put(entry.getKey(), entry.getValue());
      }
    }

    try {
      synMap.persist(false);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Entry point: {@code ProcessCuratedAuthorSynonyms <infile> <outfile>}.
   * Prints the usage and exits with status 1 when fewer than two arguments
   * are given.
   *
   * @param args input file path followed by output file path
   * @throws RuntimeException if the input file cannot be opened
   */
  public static void main(String[] args) {
    if (args.length < 2) {
      System.out.println("Usage: ProcessCuratedAuthorSynonyms <infile> <outfile>");
      System.exit(1);
    }

    File inFile = new File(args[0]);
    BufferedReader in;
    try {
      // NOTE(review): the file is decoded with the platform default charset;
      // confirm whether the curated files should be read as UTF-8 explicitly.
      in = new BufferedReader(new InputStreamReader(new FileInputStream(inFile)));
    } catch (FileNotFoundException e) {
      throw new RuntimeException("invalid input file: " + args[0], e);
    }

    try {
      processSynonyms(in, args[1]);
    } finally {
      // always release the file handle, even when processing fails
      try {
        in.close();
      } catch (IOException e) {
        log.warn("Could not close input file " + args[0], e);
      }
    }
  }
}