package org.gbif.checklistbank.authorship;
import org.gbif.checklistbank.iterable.ColumnExtractor;
import org.gbif.utils.file.FileUtils;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import com.google.common.base.Joiner;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.io.LineIterator;
import org.junit.Test;
import static org.junit.Assert.assertTrue;
/**
* Utility that reads a stream of author names and splits them into sets of names that are classified as the same name by the author comparator.
*/
public class AuthorBucketerTest {
public static Map<String, Set<String>> clusterNames(Iterator<String> authors) {
Map<String, Set<String>> buckets = Maps.newHashMap();
AuthorComparator comp = AuthorComparator.createWithAuthormap();
while (authors.hasNext()) {
String author = authors.next();
String match = null;
for (String x : buckets.keySet()) {
if (comp.compareStrict(author, null, x, null)) {
match = x;
break;
}
}
if (match == null) {
// new bucket
buckets.put(author, Sets.newHashSet(author));
} else {
buckets.get(match).add(author);
}
}
return buckets;
}
@Test
public void testAuthormap() throws Exception {
LineIterator iter = FileUtils.getLineIterator(FileUtils.classpathStream("authorship/authormap.txt"));
int lines = 0;
while (iter.hasNext()) {
lines++;
iter.next();
}
iter = FileUtils.getLineIterator(FileUtils.classpathStream("authorship/authormap.txt"));
Map<String, Set<String>> buckets = clusterNames(new ColumnExtractor(iter, '\t', 0));
Joiner join = Joiner.on("; ").skipNulls();
for (Map.Entry<String, Set<String>> entry : buckets.entrySet()) {
if (entry.getValue().size() > 1) {
System.out.println(entry.getKey());
System.out.println(" " + join.join(entry.getValue()));
}
}
System.out.println("Lines: " + lines + ", buckets: " + buckets.size());
assertTrue(buckets.size() > 3212);
}
}