package ecologylab.bigsemantics.metametadata.fieldparsers;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Field parser that splits an ACM-style bibliographic reference string into an author list, a
 * title, and the remaining text.
 * <p>
 * The result is a map that may contain the keys {@link #AUTHOR_LIST}, {@link #TITLE} and
 * {@link #OTHER}. Two reference layouts are recognized heuristically (see {@link Flavor}); any
 * other input is returned unchanged as the title.
 */
public class FieldParserForAcmReferences extends FieldParser
{

  /** The reference layouts this parser distinguishes. */
  public static enum Flavor
  {
    /** Layout could not be determined; the whole input is treated as the title. */
    UNKNOWN,
    /** Comma-separated "authors, title, other" layout, handled by {@code parseStandard}. */
    ACM_STANDARD,
    /** "Lastname, I., ... Title. Other" layout, handled by {@code parseBrief}. */
    BRIEF,
  }

  /** Result key for whatever follows the title. */
  public static final String OTHER = "$other";

  /** Result key for the title of the referenced work. */
  public static final String TITLE = "$title";

  /** Result key for the author list. */
  public static final String AUTHOR_LIST = "$author_list";

  /** Brief-format titles are scanned for a terminator only after this many characters. */
  private static final int MINIMAL_TITLE_LENGTH = 10;

  /** Marker that ends an abbreviated author list in the brief format. */
  private static final String ET_AL = "et al.";

  /**
   * Matches one author in the brief format, e.g. {@code "Smith, S. M.,"}; group 1 is the author
   * without the surrounding separators. Compiled once because {@link Pattern} is immutable.
   */
  private static final Pattern AUTHOR_PATTERN = Pattern.compile("\\s*(?:and\\s*)?((?:v[ao]n\\s*)?[A-Z][a-z]+(-[A-Z][a-z]+)?, [A-Z]\\.(?:\\s*[A-Z]\\.)?)(?:\\s*,)?");

  /**
   * Matches a whole input that starts (optionally after a numeric label such as {@code "12. "})
   * with {@code "Lastname, I."}, which is taken as the signature of the brief format.
   */
  private static final Pattern BRIEF_PREFIX_PATTERN = Pattern.compile("^(\\d+\\.\\s*)?[A-Z][a-z]+, [A-Z]\\..*");

  /**
   * Parses a reference string into author list, title and remainder.
   *
   * @param parserElement
   *          not used by this implementation.
   * @param input
   *          the reference text; may be null or empty.
   * @return a new map; empty when {@code input} is null or empty, otherwise containing whichever
   *         of {@link #AUTHOR_LIST}, {@link #TITLE} and {@link #OTHER} could be extracted. For an
   *         unrecognized layout only {@link #TITLE} is set, to the whole input.
   */
  @Override
  public Map<String, String> getKeyValuePairResult(FieldParserElement parserElement, String input)
  {
    Map<String, String> result = new HashMap<String, String>();
    if (input == null || input.length() == 0)
      return result;

    switch (detectFlavor(input))
    {
    case ACM_STANDARD:
      parseStandard(input, result);
      break;
    case BRIEF:
      parseBrief(input, result);
      break;
    default:
      result.put(TITLE, input);
    }
    return result;
  }

  /**
   * Guesses the layout of a reference.
   *
   * @param input
   *          non-null reference text.
   * @return {@link Flavor#ACM_STANDARD} if the text contains {@code " , "} or {@code "doi>"} or
   *         contains no period at all; otherwise {@link Flavor#BRIEF} if it starts like
   *         {@code "Lastname, I."}; otherwise {@link Flavor#UNKNOWN}.
   */
  private static Flavor detectFlavor(String input)
  {
    if (input.contains(" , ") || input.contains("doi>") || !input.contains("."))
      return Flavor.ACM_STANDARD;
    if (BRIEF_PREFIX_PATTERN.matcher(input).matches())
      return Flavor.BRIEF;
    return Flavor.UNKNOWN;
  }

  /**
   * Parse ACM references in standard format.
   * <p>
   * The input is split at the first comma that directly follows a non-space character and is
   * followed by whitespace: the left part is the author list. The right part is split again at the
   * first {@code ", "} followed by an upper-case ASCII letter into title and remainder. If the
   * first split fails nothing is stored; if only the second split fails only the author list is
   * stored.
   *
   * @param input
   *          non-null reference text.
   * @param result
   *          map that receives the extracted values.
   */
  private void parseStandard(String input, Map<String, String> result)
  {
    // Authors are separated by " , " (space before the comma), so the look-behind for a
    // non-space character skips those separators and finds the end of the author list.
    String[] authorListAndOther = input.split("(?<=\\S),\\s", 2);
    if (authorListAndOther.length != 2)
      return;

    result.put(AUTHOR_LIST, authorListAndOther[0].trim());

    String[] titleAndOther = authorListAndOther[1].split(",\\s(?=[A-Z])", 2);
    if (titleAndOther.length != 2)
      return;

    result.put(TITLE, trimUntilLetter(titleAndOther[0]).trim());
    result.put(OTHER, trimUntilLetter(titleAndOther[1]).trim());
  }

  /**
   * Parse ACM references in a non-standard, brief format.
   * <p>
   * The author list is either everything up to and including the first {@code "et al."}, or the
   * run of adjacent author matches at the start of the input joined with {@code " , "}. The title
   * then runs to the first comma or period found at least {@link #MINIMAL_TITLE_LENGTH} characters
   * after the author list, extended by one more segment when the following text starts with a
   * lower-case letter and is not {@code "in "}. Everything after the title is stored as
   * {@link #OTHER}.
   *
   * @param input
   *          the reference text; nothing is stored when null.
   * @param result
   *          map that receives the extracted values.
   */
  private void parseBrief(String input, Map<String, String> result)
  {
    if (input == null)
    {
      return;
    }

    // Drop any leading label such as "12. " before looking for authors.
    input = trimUntilLetter(input);

    String authorList = null;
    int nextPos = 0;
    int etAlPos = input.indexOf(ET_AL);
    if (etAlPos >= 0)
    {
      nextPos = etAlPos + ET_AL.length();
      authorList = input.substring(0, nextPos);
    }
    else
    {
      Matcher m = AUTHOR_PATTERN.matcher(input);
      StringBuilder authors = new StringBuilder();
      while (m.find())
      {
        if (nextPos != m.start()) // assume that author names must be adjacent
          break;
        if (nextPos > 0)
          authors.append(" , ");
        authors.append(m.group(1));
        nextPos = m.end();
      }
      authorList = authors.toString();
    }
    result.put(AUTHOR_LIST, authorList);

    int beginTitle = nextPos;
    // Skip a minimal number of characters so that early punctuation does not end the title.
    nextPos = Math.min(nextPos + MINIMAL_TITLE_LENGTH, input.length());
    nextPos = skipCharsUntil(input, nextPos, ",.");
    int endTitle = nextPos;
    nextPos = skipChars(input, nextPos, ", .");

    // A lower-case continuation (other than "in ...") is taken to still belong to the title.
    // The bounds check matters: when the title's terminator is the last thing in the input,
    // nextPos equals input.length() and charAt would throw StringIndexOutOfBoundsException.
    if (nextPos < input.length()
        && Character.isLowerCase(input.charAt(nextPos))
        && !input.startsWith("in ", nextPos))
    {
      endTitle = skipCharsUntil(input, nextPos, ",.");
      nextPos = skipChars(input, endTitle, ", .");
    }

    String title = input.substring(beginTitle, endTitle);
    result.put(TITLE, trimUntilLetter(title).trim());

    String other = input.substring(nextPos);
    result.put(OTHER, trimUntilLetter(other).trim());
  }

  /**
   * Advances past characters that belong to a given set.
   *
   * @param s
   *          string to scan.
   * @param start
   *          index to start scanning from.
   * @param chars
   *          the set of characters to skip.
   * @return index of the first character at or after {@code start} that is not in {@code chars},
   *         or {@code start} itself if it is not below {@code s.length()}; {@code s.length()} if
   *         scanning reaches the end.
   */
  private static int skipChars(String s, int start, String chars)
  {
    while (start < s.length() && chars.indexOf(s.charAt(start)) >= 0)
      start++;
    return start;
  }

  /**
   * Advances until a character from a given set is found.
   *
   * @param s
   *          string to scan.
   * @param start
   *          index to start scanning from.
   * @param chars
   *          the set of characters to stop at.
   * @return index of the first character at or after {@code start} that is in {@code chars}, or
   *         {@code start} itself if it is not below {@code s.length()}; {@code s.length()} if
   *         scanning reaches the end.
   */
  private static int skipCharsUntil(String s, int start, String chars)
  {
    while (start < s.length() && chars.indexOf(s.charAt(start)) < 0)
      start++;
    return start;
  }

  /**
   * Removes everything before the first letter.
   *
   * @param s
   *          non-null string.
   * @return the suffix of {@code s} starting at its first letter (per
   *         {@link Character#isLetter(char)}), or the empty string if it contains no letter.
   */
  private static String trimUntilLetter(String s)
  {
    int p = 0;
    while (p < s.length() && !Character.isLetter(s.charAt(p)))
      p++;
    return s.substring(p);
  }

  /**
   * Manual smoke test: prints the author-pattern groups for one sample string, then the parse
   * result for a list of sample references.
   *
   * @param args
   *          ignored.
   */
  public static void main(String[] args)
  {
    String[] tests = {
        "George W. Furnas , Samuel J. Rauch, Considerations for information environments and the NaviQue workspace, Proceedings of the third ACM conference on Digital libraries, p.79-88, June 23-26, 1998, Pittsburgh, Pennsylvania, United States [doi>10.1145/276675.276684]",
        "Miller, G.A., The Magical number seven, plus or minus two: some limits on our capacity for processing information, Psychology Review, 63, 81--97, 1956. ",
        "Hamming, R. The Art of Doing Science and Engineering: Learning to Learn. CRC Press, 1997, 35. {The original maxim is, of course, \"The purpose of computing is insight, not numbers.\"} ",
        "Karlson, A., Piatko, C., and Gersh, J. Semantic navigation in complex graphs. Interactive poster and demonstration. Abstract published in IEEE Symposium on Information Visualization Poster Compendium (Seattle, WA), 2003, 84--85. ",
        "Oxford English Dictionary on Compact Disk, 2nd Edition. Oxford: Oxford University Press, 1992.",
        "Smith, S. M., Getting Into and Out of Mental Ruts: A theory of Fixation, Incubation, and Insight in Sternberg, R J. and Davidson, J., The Nature of Insight, Cambridge, MA, MIT Press, 1994, 121--149. ",
        "Smith, S. M., Dodds, R. A., Incubation. in Runco, M.A., Pritzker, S. R., eds., Encyclopedia of Creativity, Volume 2. San Diego: Assoc Press, 1999, 39--44. ",
        "Smith, S.M., Blankenship, S.E., Incubation and the Persistence of Fixation in Problem Solving, Am Journ Psychology, 104, 1991, 61--87. ",
        "Shah, J.J., Smith, S.M., Vargas-Hernandez, N. Metrics for Measuring Ideation Effectiveness. Design Studies, 24, 2003, 111--134.",
        "Sperling, G. The information available in brief visual presentations. Psychological Monographs, 74:48.",
        "Newell, A., Shaw, J. C., Simon, H. A. The process of creative thinking. In Gruber, H. E., Terrell, G., Wertheimer, M., eds., Contemporary approaches to creative thinking, New York: Atherton Press, 1962.",
    };

    FieldParserForAcmReferences f = new FieldParserForAcmReferences();

    String s = "Smith, S. M., Dodds, R. A., Incubation.";
    Matcher m = AUTHOR_PATTERN.matcher(s);
    while (m.find())
    {
      for (int g = 0; g <= m.groupCount(); ++g)
        System.out.println("Group " + g + ": " + m.group(g));
      System.out.println();
    }

    for (String test : tests)
    {
      Map<String, String> kv = f.getKeyValuePairResult(null, test);
      System.out.println("authors: " + kv.get(AUTHOR_LIST));
      System.out.println("title: " + kv.get(TITLE));
      System.out.println("other: " + kv.get(OTHER));
      System.out.println();
    }
  }
}