package org.gbif.nub.lookup.fuzzy;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang3.StringUtils;
/**
 * Simple manual utility to merge new synonym entries into a single, clean dictionary file
 * to be picked up by the nub lookup and hosted at
 * http://rs.gbif.org/dictionaries/synonyms/
 * <p>
 * The input is a text file with one entry per line in the form
 * {@code SYNONYM<tab>ACCEPTED}. Synonyms are upper-cased and trimmed, exact duplicate entries
 * are dropped, and a synonym that maps to more than one accepted value is reported on stdout
 * as a conflict (all of its values are still written). The result is sorted by synonym and
 * written next to the input file under the same name with a {@code -2} suffix.
 * <p>
 * Both files are read and written as UTF-8 so the result does not depend on the platform
 * default charset.
 */
public class SynonymMerger {

  /** Dictionary that is processed when no path is given on the command line. */
  private static final String DEFAULT_DICTIONARY =
      "/Users/markus/code/rs.gbif.org/dictionaries/synonyms/family.txt";

  /**
   * Merges the entries of one dictionary file and writes the cleaned copy beside it.
   *
   * @param args optional; {@code args[0]} is the path of the dictionary file to clean up.
   *             Without arguments the built-in default dictionary path is used.
   * @throws Exception if the input cannot be read or the output cannot be written
   */
  public static void main (String[] args) throws Exception {
    // args was ignored before, so tolerate null/empty and fall back to the historic default
    String path = args != null && args.length > 0 ? args[0] : DEFAULT_DICTIONARY;
    File f = new File(path);

    Map<String, Set<String>> vals = readEntries(f);

    // write
    File fo = new File(f.getParent(), f.getName()+"-2");
    writeEntries(fo, vals);
    System.out.println("DONE");
  }

  /**
   * Reads all two-column lines of the given file into a map sorted by synonym.
   * Blank lines are skipped silently; lines that do not have exactly two tab separated
   * columns are reported on stdout and skipped.
   *
   * @param source tab separated dictionary file to read
   * @return upper-cased synonym mapped to the set of accepted values found for it
   * @throws IOException if the file cannot be opened or closed
   */
  private static Map<String, Set<String>> readEntries(File source) throws IOException {
    Map<String, Set<String>> entries = new TreeMap<String, Set<String>>();
    // closing the reader releases the file handle even if parsing fails half way
    try (Reader reader = new InputStreamReader(new FileInputStream(source), StandardCharsets.UTF_8)) {
      LineIterator iter = new LineIterator(reader);
      while (iter.hasNext()) {
        String line = iter.next();
        if (StringUtils.isBlank(line)) {
          continue;
        }
        String[] cols = line.split("\t");
        if (cols.length != 2) {
          System.out.println("IGNORE LINE: "+line);
          continue;
        }
        // Locale.ROOT keeps the keys identical regardless of the JVM default locale
        String syn = cols[0].toUpperCase(Locale.ROOT).trim();
        String acc = cols[1].trim();

        Set<String> accepted = entries.get(syn);
        if (accepted == null) {
          accepted = new HashSet<String>();
          accepted.add(acc);
          entries.put(syn, accepted);
        } else if (accepted.add(acc)) {
          // add() returned true: a different accepted value already existed for this synonym
          System.out.println("CONFLICT for " + syn);
          System.out.println(" " + accepted);
        }
        // otherwise it is the very same entry again, just ignore it
      }
    }
    return entries;
  }

  /**
   * Writes one {@code SYNONYM<tab>ACCEPTED} line per entry, in the iteration order of the map.
   *
   * @param target file to create or overwrite
   * @param entries synonym to accepted values as produced by {@link #readEntries(File)}
   * @throws IOException if the file cannot be written
   */
  private static void writeEntries(File target, Map<String, Set<String>> entries) throws IOException {
    try (Writer out = new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8)) {
      for (Map.Entry<String, Set<String>> entry : entries.entrySet()) {
        for (String x : entry.getValue()) {
          out.write(entry.getKey());
          out.write("\t");
          out.write(x);
          out.write("\n");
        }
      }
    }
  }
}