package org.apache.solr.analysis.author;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Creates variations of an author name. It receives a name such as:
* John, K
*/
public class AuthorQueryVariations {

  public static final Logger log = LoggerFactory.getLogger(AuthorQueryVariations.class);

  /**
   * Parses {@code authorString} with {@code AuthorUtils.parseAuthor} and returns the
   * complete set of query-time patterns: the base (synonym) variations produced by
   * {@link #generateSynonymVariations(HashMap)} plus the extra, query-only patterns
   * added by {@link #generateNameVariations(HashMap)}.
   *
   * <p>Example, assuming the parser yields last=HECTOR, first=GOMEZ, middle=Q:
   * <pre>
   * HECTOR,
   * HECTOR, GOMEZ
   * HECTOR, G
   * HECTOR, GOMEZ Q.*   (extra, query-only)
   * HECTOR, G Q.*       (extra, query-only)
   * </pre>
   *
   * <p>Example, assuming the parser yields last=GOMEZ, first=H, middle=QUINTERO:
   * <pre>
   * GOMEZ,
   * GOMEZ, H\w*
   * GOMEZ, H\w* QUINTERO\b.*
   * GOMEZ, H\w* Q\b.*   (extra, query-only)
   * </pre>
   *
   * @param authorString
   *          author name
   * @return insertion-ordered set of patterns; when the name cannot be parsed, or the
   *         parse result has no last name, the set contains only {@code authorString}
   */
  public static HashSet<String> getQueryVariationsInclRegex(String authorString) {
    HashMap<String,String> parsedAuthor = AuthorUtils.parseAuthor(authorString);
    HashSet<String> variations = new LinkedHashSet<String>();
    if (!hasLastName(parsedAuthor)) {
      // nothing to anchor a pattern on; hand back the raw input instead of "null, ..."
      variations.add(authorString);
      return variations;
    }
    return generateNameVariations(parsedAuthor, variations);
  }

  /**
   * Builds the complete query-time pattern set (base variations plus query-only
   * extras) for an already parsed name.
   *
   * @param parsedAuthor
   *          name parts; the keys read are "last", "first" and "middle"
   * @return new insertion-ordered set of patterns; empty when there is no last name
   */
  public static HashSet<String> generateNameVariations(HashMap<String,String> parsedAuthor) {
    HashSet<String> variations = new LinkedHashSet<String>();
    return generateNameVariations(parsedAuthor, variations);
  }

  /**
   * Adds the base variations (via
   * {@link #generateSynonymVariations(HashMap, HashSet)}) and then the patterns that
   * are needed only for the query phase.
   *
   * @param parsedAuthor
   *          name parts; the keys read are "last", "first" and "middle"
   * @param variations
   *          set to append to
   * @return the same {@code variations} instance
   */
  protected static HashSet<String> generateNameVariations(
      HashMap<String,String> parsedAuthor,
      HashSet<String> variations) {
    // get the base variations
    generateSynonymVariations(parsedAuthor, variations);

    // add the variations that are needed only for the query phase
    String last = parsedAuthor.get("last");
    String first = parsedAuthor.get("first");
    String middle = parsedAuthor.get("middle");
    if (last == null || first == null) {
      // without a last name every pattern would start with the literal "null"
      return variations;
    }

    if (middle == null) {
      if (first.length() > 1) {
        // full first name: also match anything starting with its initial
        variations.add(last + ", " + first.substring(0, 1) + "\\b.*");
      }
      return variations;
    }

    if (first.length() > 1) {
      if (middle.length() > 1) {
        // full first + full middle: also accept the middle reduced to its initial
        variations.add(last + ", " + first + " " + middle.substring(0, 1) + "\\b.*");
      } else {
        // middle is an initial: treat it as a prefix, with full first and first initial
        variations.add(last + ", " + first + " " + middle + ".*");
        variations.add(last + ", " + first.substring(0, 1) + " " + middle + ".*");
      }
    } else if (middle.length() > 1) {
      // first is an initial, middle is full: also accept the middle initial
      variations.add(last + ", " + first + "\\w* " + middle.substring(0, 1) + "\\b.*");
    }
    return variations;
  }

  /**
   * Parses {@code authorString} with {@code AuthorUtils.parseAuthor} and returns only
   * the base (synonym) variations, i.e. the output of
   * {@link #generateSynonymVariations(HashMap)} without the query-only extras.
   *
   * <p>Example, assuming the parser yields last=HECTOR, first=GOMEZ, middle=Q:
   * <pre>
   * HECTOR,
   * HECTOR, GOMEZ
   * HECTOR, G
   * </pre>
   *
   * @param authorString
   *          name in the natural form
   * @return insertion-ordered set of patterns; when the name cannot be parsed, or the
   *         parse result has no last name, the set contains only {@code authorString}
   */
  public static HashSet<String> getQueryVariations(String authorString) {
    HashMap<String,String> parsedAuthor = AuthorUtils.parseAuthor(authorString);
    HashSet<String> variations = new LinkedHashSet<String>();
    if (!hasLastName(parsedAuthor)) {
      variations.add(authorString);
      return variations;
    }
    return generateSynonymVariations(parsedAuthor, variations);
  }

  /**
   * Builds the base (synonym) variations for an already parsed name.
   *
   * <p>The process that transforms the curated synonyms uses <em>only</em> the
   * variations generated here. This limited set is also part of the variations used at
   * query time, but DON'T ADD THINGS HERE that are only necessary for the query
   * phase; put those into {@link #generateNameVariations(HashMap, HashSet)}.
   *
   * @param parsedAuthor
   *          name parts; the keys read are "last", "first" and "middle"
   * @return new insertion-ordered set of patterns; empty when there is no last name
   */
  public static HashSet<String> generateSynonymVariations(HashMap<String,String> parsedAuthor) {
    HashSet<String> variations = new LinkedHashSet<String>();
    return generateSynonymVariations(parsedAuthor, variations);
  }

  /**
   * Appends the base (synonym) variations of the parsed name to {@code variations}.
   *
   * @param parsedAuthor
   *          name parts; the keys read are "last", "first" and "middle"
   * @param variations
   *          set to append to
   * @return the same {@code variations} instance; left untouched when
   *         {@code parsedAuthor} has no "last" entry
   */
  protected static HashSet<String> generateSynonymVariations(
      HashMap<String,String> parsedAuthor,
      HashSet<String> variations) {
    String last = parsedAuthor.get("last");
    String first = parsedAuthor.get("first");
    String middle = parsedAuthor.get("middle");
    if (last == null) {
      // string conversion would turn the missing surname into the literal "null"
      return variations;
    }

    if (parsedAuthor.size() == 1) {
      variations.add(last + ",.*"); // all we got was last name
    } else {
      variations.add(last + ",");
    }

    if (first == null) {
      return variations;
    }

    if (middle == null) {
      if (first.length() > 1) {
        variations.add(last + ", " + first + "\\b.*");
        variations.add(last + ", " + first.substring(0, 1));
      } else if (first.length() == 1) {
        variations.add(last + ", " + first + ".*");
      }
      return variations;
    }

    if (first.length() > 1) {
      String firstInitial = first.substring(0, 1);
      variations.add(last + ", " + first);
      variations.add(last + ", " + firstInitial);
      if (middle.length() > 1) {
        variations.add(last + ", " + first + " " + middle + "\\b.*");
        variations.add(last + ", " + firstInitial + " " + middle.substring(0, 1) + "\\b.*");
      }
      // A single-letter middle adds nothing here: those prefix patterns are
      // query-only and are produced by generateNameVariations.
    } else {
      variations.add(last + ", " + first + "\\w*");
      if (middle.length() > 1) {
        variations.add(last + ", " + first + "\\w* " + middle + "\\b.*");
      } else if (middle.length() == 1) {
        variations.add(last + ", " + first + "\\w* " + middle + ".*");
      }
    }
    return variations;
  }

  /** Tells whether the parse result exists and carries a "last" entry. */
  private static boolean hasLastName(HashMap<String,String> parsedAuthor) {
    return parsedAuthor != null && parsedAuthor.get("last") != null;
  }
}