package org.apache.solr.analysis.author;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import java.text.*;
/**
 * Static helpers for working with author names: normalization of the raw
 * name string, splitting into parts, ASCII folding / accent transliteration,
 * and generation of spelling variants for transliterated Russian names.
 *
 * All methods are static and keep no state between calls.
 */
public class AuthorUtils {

  // String labels exposed for other classes; this class does not use them itself.
  public static final String AUTHOR_QUERY_VARIANT = "AUTHOR_QUERY_VARIANT";
  public static final String AUTHOR_INPUT = "AUTHOR_INPUT";
  public static final String AUTHOR_TRANSLITERATED = "AUTHOR_TRANSLITERATED";
  public static final String AUTHOR_CURATED_SYN = "AUTHOR_CURATED_SYN";

  // matches the period behind a single-letter initial, e.g. "B." => "B"
  static final Pattern n0 = Pattern.compile("(?<=\\b\\p{L})\\.(?=\\s*\\b)");

  // matches every character that is NOT allowed in an author name; allowed
  // are: comma, hyphen, whitespace, unicode digits, letters and combining marks
  // earlier attempts, kept for reference (they could miss diacritics):
  //   \P{M}\p{M}*+
  //   [^,\\-\\w\\s\\{N}\\p{L}\\p{M}*+]
  static final Pattern n1 = Pattern.compile("[^,\\-\\s\\p{N}\\p{L}\\p{M}]");

  // matches a run of whitespace (used to collapse it into one space)
  static final Pattern n2 = Pattern.compile("\\s+");

  // matches a comma that is not preceded by a backslash, plus trailing whitespace
  static final Pattern n3 = Pattern.compile("(?<!\\\\),\\s*");

  // matches an apostrophe that follows a letter, plus trailing whitespace
  static final Pattern n4 = Pattern.compile("(?<=\\p{L})\\'\\s*");

  /**
   * Normalizes a raw author string.
   *
   * Steps, in order: an apostrophe after a letter becomes a hyphen; the
   * period after a single-letter initial becomes a space; every character
   * outside the allowed set is removed; commas are rewritten as ", ";
   * whitespace is collapsed; a trailing comma is appended when the name
   * contains none; finally hyphens are turned into spaces and whitespace is
   * collapsed again.
   *
   * @param a the raw author name; must not be null
   * @return the normalized name, always containing at least one comma
   */
  public static String normalizeAuthor(String a) {
    a = n4.matcher(a).replaceAll("-");
    a = n0.matcher(a).replaceAll(" ");
    // backslashes are not in the allowed set, so they are gone after this step
    a = n1.matcher(a).replaceAll("");
    a = n3.matcher(a).replaceAll(", ");
    a = n2.matcher(a.trim()).replaceAll(" ");
    if (!a.contains(",")) {
      a = a + ",";
    }
    // do this at the end, we want to see the space instead of '-'
    a = a.replace('-', ' ');
    // replacing hyphens may have produced double spaces; normalize once again
    return n2.matcher(a.trim()).replaceAll(" ");
  }

  /**
   * Parses an author name after normalizing it; shorthand for
   * {@code parseAuthor(a, true)}.
   *
   * This became obsolete when the python name parser library was included
   * (it does a MUCH better job of parsing names).
   *
   * TODO: kill AuthorUtils.parseAuthor
   *
   * @param a the author name, may be null or empty
   * @return the parsed name parts, see {@link #parseAuthor(String, boolean)}
   */
  public static HashMap<String,String> parseAuthor(String a) {
    return parseAuthor(a, true);
  }

  /**
   * Parses an author name with {@code NameParser} and returns its parts.
   *
   * @param a the author name; null or empty yields an empty map
   * @param normalize when true, the name first goes through
   *        {@link #normalizeAuthor(String)}
   * @return a map holding the non-null parts under the keys "title",
   *         "first", "middle", "last" and "suffix"
   * @throws RuntimeException wrapping any exception thrown by the parser
   */
  public static HashMap<String,String> parseAuthor(String a, boolean normalize) {
    HashMap<String,String> parsed = new HashMap<String,String>();
    if (a == null || a.length() == 0) {
      return parsed;
    }
    if (normalize) {
      a = AuthorUtils.normalizeAuthor(a);
    }
    NameParser np = new NameParser();
    String[] p;
    try {
      p = np.parseName(a);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    if (p == null) {
      return parsed;
    }
    // the parser output is read positionally, in this order
    String[] keys = {"title", "first", "middle", "last", "suffix"};
    // never read past the end of a shorter-than-expected parser result
    int limit = Math.min(keys.length, p.length);
    for (int i = 0; i < limit; i++) {
      if (p[i] != null) {
        parsed.put(keys[i], p[i]);
      }
    }
    return parsed;
  }

  /**
   * Collects the transliterated variants of a name: the ASCII-folded form,
   * the accent-transliterated form, and the Russian-name variants generated
   * from the input and from both of those forms.
   *
   * @param a the author name; must not be null
   * @return the distinct variants in no particular order; the input itself
   *         is never part of the result
   */
  public static ArrayList<String> getAsciiTransliteratedVariants(String a) {
    HashSet<String> synonyms = new HashSet<String>();
    // downgrade to ascii
    String downgraded = foldToAscii(a);
    synonyms.add(downgraded);
    // transliterate accents
    String transAcc = transliterateAccents(a);
    synonyms.add(transAcc);
    // handle russian name stuff
    synonyms.addAll(transliterateRussianNames(new String[] {a, downgraded, transAcc}));
    // remove the original input from the set
    synonyms.remove(a);
    return new ArrayList<String>(synonyms);
  }

  /**
   * Folds a string to ASCII with Lucene's {@code ASCIIFoldingFilter}.
   *
   * @param a the text to fold; must not be null
   * @return the folded text with leading and trailing whitespace trimmed
   */
  public static String foldToAscii(String a) {
    char[] in = a.toCharArray();
    // folding can expand a single char into several, hence the larger buffer
    char[] out = new char[in.length * 4];
    int outPos = ASCIIFoldingFilter.foldToASCII(in, 0, out, 0, in.length);
    // only the first outPos chars were written; the rest is NUL padding
    return new String(out, 0, outPos).trim();
  }

  /**
   * Splits a name into parts (separated by the first comma and then by spaces).
   * The comma is retained on the first part; spaces between the parts are
   * removed, e.g. 'john , james' becomes: ['john,', 'james'].
   *
   * Runs of several spaces count as one separator, so no empty parts are
   * produced between names. A name without any comma is simply split on
   * spaces; an empty or blank name yields a single empty string.
   *
   * @param name the name to split; must not be null
   * @return the parts of the name, the first one ending with ',' when the
   *         name contained a comma
   */
  public static String[] splitName(String name) {
    int comma = name.indexOf(',');
    if (comma < 0) {
      return name.trim().split(" +");
    }
    String surname = name.substring(0, comma).trim() + ",";
    String rest = name.substring(comma + 1).trim();
    if (rest.length() == 0) {
      // nothing follows the comma
      return new String[] {surname};
    }
    String[] given = rest.split(" +");
    String[] out = new String[given.length + 1];
    out[0] = surname;
    System.arraycopy(given, 0, out, 1, given.length);
    return out;
  }

  /**
   * Rewrites selected accents as an extra ASCII letter. The text is first
   * decomposed (NFD), then these combining marks are replaced:
   * U+0308 (diaeresis) by E, U+030A (ring above) by A, U+0301 (acute) by E
   * and U+030C (caron) by H. The inserted letter is lower-cased unless the
   * preceding character is upper case. All other characters are copied
   * unchanged.
   *
   * @param a the text to transliterate; must not be null
   * @return the transliterated text with leading and trailing whitespace trimmed
   */
  static String transliterateAccents(String a) {
    String decomposed = Normalizer.normalize(a, Normalizer.Form.NFD);
    // exactly one char is emitted per decomposed char
    StringBuilder out = new StringBuilder(decomposed.length());
    for (int i = 0; i < decomposed.length(); i++) {
      final char c = decomposed.charAt(i);
      if (c < '\u0080') {
        out.append(c);
        continue;
      }
      char replacement;
      switch (c) {
        case '\u0308':
          replacement = 'E';
          break;
        case '\u030a':
          replacement = 'A';
          break;
        case '\u0301':
          replacement = 'E';
          break;
        case '\u030c':
          replacement = 'H';
          break;
        default:
          // not one of the handled marks: keep the char as it is
          out.append(c);
          continue;
      }
      // the char before the mark is the 1st part of the decomposed char
      char base = (i > 0) ? decomposed.charAt(i - 1) : '\0';
      if (base != '\0' && !Character.isUpperCase(base)) {
        replacement = Character.toLowerCase(replacement);
      }
      out.append(replacement);
    }
    return out.toString().trim();
  }

  /**
   * Generates spelling variants of transliterated Russian names. For every
   * input string the rules are applied one after another, and each rule
   * sees the input together with all variants produced by the rules before it.
   *
   * XXX: this doesn't look right to me, the fifth step gets (possibly)
   * 5 times more items than the first step
   *
   * @param in the names to expand; neither the array nor its elements may be null
   * @return the inputs together with all generated variants
   */
  public static HashSet<String> transliterateRussianNames(String[] in) {
    HashSet<String> synonyms = new HashSet<String>();
    for (String s : in) {
      HashSet<String> syn = new HashSet<String>();
      syn.add(s);
      // each rule consumes its iterator completely before syn is modified
      syn.addAll(translitRussianApostrophes(syn.iterator()));
      syn.addAll(translitRussianLastNames1(syn.iterator()));
      syn.addAll(translitRussianLastNames2(syn.iterator()));
      syn.addAll(translitRussianLastNames3(syn.iterator()));
      syn.addAll(translitRussianLastNames4(syn.iterator()));
      syn.addAll(translitRussianLastNames5(syn.iterator()));
      syn.addAll(translitRussianFirstNames(syn.iterator()));
      synonyms.addAll(syn);
    }
    return synonyms;
  }

  /**
   * Shared engine of the translitRussian* rules: for every name in which
   * the pattern is found, adds one variant per replacement, with all
   * matches of the pattern replaced. Names without a match contribute nothing.
   */
  private static HashSet<String> expandVariants(Pattern pattern, Iterator<String> itr,
      String... replacements) {
    HashSet<String> syn = new HashSet<String>();
    while (itr.hasNext()) {
      Matcher m = pattern.matcher(itr.next());
      if (m.find()) {
        for (String replacement : replacements) {
          // replaceAll resets the matcher, so the find() above does not skip anything
          syn.add(m.replaceAll(replacement));
        }
      }
    }
    return syn;
  }

  /*
   * take care of russian apostrophes:
   * 'E == E == IE == YE
   * note that we do not index 'E since the search
   * engine simply strips all apostrophes
   */
  static final Pattern p0 = Pattern.compile("(?<=\\w{2})'(?=[Ee])");

  /**
   * Replaces an apostrophe that follows two word characters and precedes
   * E/e by "I", by "Y" and by nothing.
   *
   * @param itr the names to expand
   * @return the variants of the names that matched
   */
  static HashSet<String> translitRussianApostrophes(Iterator<String> itr) {
    return expandVariants(p0, itr, "I", "Y", "");
  }

  /* russian last names I:
   * [^IJY]EV$ == IEV$ == YEV$ == JEV$
   * [^IJY]EVA$ == IEVA$ == YEVA$ == JEVA$
   */
  static final Pattern p1 = Pattern.compile("(?<![IJY])EV(?=A?,)");

  /**
   * Replaces a last-name ending EV / EVA (not preceded by I, J or Y and
   * followed by the comma) by IEV, YEV and JEV. Upper case only.
   *
   * @param itr the names to expand
   * @return the variants of the names that matched
   */
  static HashSet<String> translitRussianLastNames1(Iterator<String> itr) {
    return expandVariants(p1, itr, "IEV", "YEV", "JEV");
  }

  /* russian last names II:
   * ([NRBO])IA$ == $1IIA$ == $1IYA$
   */
  static final Pattern p2 = Pattern.compile("(?<=[NRBO])I(?=A,)");

  /**
   * Replaces the I of a last-name ending IA (preceded by N, R, B or O and
   * followed by the comma) by II and IY. Upper case only.
   *
   * @param itr the names to expand
   * @return the variants of the names that matched
   */
  static HashSet<String> translitRussianLastNames2(Iterator<String> itr) {
    return expandVariants(p2, itr, "II", "IY");
  }

  /* russian last names III:
   * ([DHKLMNPSZ])IAN$ == $1YAN$ == $1JAN$
   */
  static final Pattern p3 = Pattern.compile("(?<=[DHKLMNPSZ])[IJY](?=AN,)");

  /**
   * Replaces the I/J/Y of a last-name ending IAN, JAN or YAN (preceded by
   * one of DHKLMNPSZ and followed by the comma) by I, J and Y. Upper case only.
   *
   * @param itr the names to expand
   * @return the variants of the names that matched
   */
  static HashSet<String> translitRussianLastNames3(Iterator<String> itr) {
    return expandVariants(p3, itr, "I", "J", "Y");
  }

  /* russian last names IV:
   * AIA$ == AYA$ == AJA$
   */
  static final Pattern p4 = Pattern.compile("(?<=[KNV]A)[IJY](?=A,)");

  /**
   * Replaces the I/J/Y of a last-name ending AIA, AJA or AYA (preceded by
   * K, N or V and followed by the comma) by I, J and Y. Upper case only.
   *
   * @param itr the names to expand
   * @return the variants of the names that matched
   */
  static HashSet<String> translitRussianLastNames4(Iterator<String> itr) {
    return expandVariants(p4, itr, "I", "J", "Y");
  }

  /* russian last names V:
   * KI$ == KII$ == KIJ$ == KIY$ = KYI$
   * VI$ == VII$ == VIJ$ == VIY$ = VYI$
   */
  static final Pattern p5 = Pattern.compile("(?<=[KV])I(?=,)");

  /**
   * Replaces the final I of a last name ending in KI or VI (followed by
   * the comma) by I, Y, YI, IY, IJ and II. Upper case only.
   *
   * @param itr the names to expand
   * @return the variants of the names that matched
   */
  static HashSet<String> translitRussianLastNames5(Iterator<String> itr) {
    return expandVariants(p5, itr, "I", "Y", "YI", "IY", "IJ", "II");
  }

  /* russian first names
   * ^IU == ^YU
   * ^IA == ^YA
   */
  static final Pattern p6 = Pattern.compile("(?<=, )[YI](?=[AU])");

  /**
   * Replaces a Y or I that directly follows ", " and precedes A or U by
   * I and Y. Upper case only.
   *
   * @param itr the names to expand
   * @return the variants of the names that matched
   */
  static HashSet<String> translitRussianFirstNames(Iterator<String> itr) {
    return expandVariants(p6, itr, "I", "Y");
  }
}