/*
* EuroCarbDB, a framework for carbohydrate bioinformatics
*
* Copyright (c) 2006-2009, Eurocarb project, or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
* A copy of this license accompanies this distribution in the file LICENSE.txt.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* Last commit: $Rev: 1210 $ by $Author: glycoslave $ on $Date:: 2009-06-12 #$
*/
package org.eurocarbdb.dataaccess.core;
// stdlib imports
import java.util.List;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Collections;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.regex.MatchResult;
import java.io.Serializable;
// 3rd party imports
import org.apache.log4j.Logger;
// eurocarb imports
// static imports
import static org.eurocarbdb.util.JavaUtils.*;
import static org.eurocarbdb.util.StringUtils.join;
/**
*   Simple wrapper class representing a {@link JournalReference} or
*   {@link Reference} author, along with various methods to export/parse
*   an Author to/from a {@link String}.
*<p>
*   An Author is a lastname plus zero or more firstnames; a firstname may
*   be a bare initial. The firstname array is copied on the way in and on
*   the way out, so an Author cannot be changed after construction.
*</p>
*<p>
*   Parsing is <em>not</em> thread-safe: every {@link Format} constant keeps
*   the result of its most recent {@link Format#matches match} in a field,
*   so concurrent parses through the same constant can overwrite each
*   other's result.
*</p>
*
*   @see Author.Format
*   @author mjh
*/
public class Author implements Serializable
{
    // INNER CLASSES

    /**
    *   Enumeration of string formats to match a person's name.
    *   Formats are intended to be matched in declared order, using
    *   the {@link #matches} method, ie:<br/>
    *<pre>
    *       String input = ...;
    *       for ( Format format : Format.values() )
    *           if ( format.matches( input ) )
    *               Author a = format.getAuthor();
    *</pre>
    *   Similarly, formats are intended to be bi-directional, ie:
    *   an {@link Author}'s name ought to be able to be produced in a
    *   given {@link Format} using the {@link Author#toString(Format)}
    *   method, ie:
    *<pre>
    *       Author a = ...;
    *       for ( Format format : Format.values() )
    *           System.out.println( a.toString( format ) );
    *</pre>
    *   In every format a "word" is a run of letters, digits, underscores
    *   and apostrophes, so that names such as <tt>O'Brien</tt> are kept
    *   intact.
    *
    *   @see Author#toString(Format)
    */
    public enum Format
    {
        /**
        *   Format spec to match a string with lastname first, then an
        *   optional comma, then 1-3 initials, with or without fullstops,
        *   eg: <tt>Harrison MJ</tt>, <tt>Harrison, MJ</tt>, or <tt>Harrison, M.J.</tt>.
        *   Initials need to be capitalised to be recognised as such.
        *   Every initial becomes one firstname of the resulting Author.
        */
        Lastname_First_Then_Initials
        (
            "("                     //--- start capture group 1 ---
            +   "[\\w']+"           //  a word, then...
            +   "(?:"               //  (a grouping of)
            +       "(?:-|\\s)"     //  ...either hyphen or space
            +       "[\\w']+"       //  ...then another word
            +   "){0,3}"            //  ...up to 3 (possibly hyphenated) words
            + ")"                   //--- end capture group 1 ---
            + "\\s*,?\\s*"          // optional comma/spaces
            + "("                   //--- start capture group 2 ---
            +   "\\b"               //  start of a word (boundary)
            +   "(?:"               //  (a grouping of)
            +       "[A-Z]"         //  ...an initial
            +       "\\.?"          //  ...then maybe a fullstop
            +   "){1,3}"            //  ...and between 1-3 of them (initials)
            + ")"                   //--- end capture group 2 ---
        )
        {
            @Override
            void convertMatchResult2Author( MatchResult result )
            {
                String lname = result.group(1);
                String initials = result.group(2);

                if ( log.isTraceEnabled() )
                    log.trace( "creating Author: lastname="
                        + lname
                        + ", initials="
                        + initials
                    );

                // capture group 2 can only hold capital letters and
                // fullstops, so once the fullstops are gone every remaining
                // character is exactly one initial -- this way "MJ", "M.J"
                // and "M.J." all yield the same two firstnames.
                String letters = initials.replace( ".", "" );
                String[] fnames = new String[ letters.length() ];
                for ( int i = 0; i < fnames.length; i++ )
                    fnames[i] = String.valueOf( letters.charAt(i) );

                this.author = new Author( lname, fnames );
            }
        }
        ,

        /**
        *   Format spec to match a string with lastname first, then a comma,
        *   then a list of firstnames, eg: <tt>Harrison, Mathew John</tt>
        */
        Lastname_First_Then_Firstnames
        (
            "("                     //--- start capture group 1 ---
            +   "[\\w']+"           //  a word, then...
            +   "(?:"               //  (a grouping of)
            +       "(?:-|\\s)"     //  ...either hyphen or space
            +       "[\\w']+"       //  ...then another word
            +   "){0,3}"            //  ...up to 3 (possibly hyphenated) words
            + ")"                   //--- end capture group 1 ---
            + "\\s*,\\s*"           // required comma, optional space
            + "("                   //--- start capture group 2 ---
            +   "[\\w']+"           //  a word...
            +   "(?:"               //  (a grouping of)
            +       "(?:-|\\s)+"    //  ...hyphen or space
            +       "[\\w']+"       //  ...then another word
            +   "){0,3}"            //  ...and up to 3 (possibly hyphenated) words
            + ")"                   //--- end capture group 2 ---
        )
        {
            @Override
            void convertMatchResult2Author( MatchResult result )
            {
                String lname = result.group(1);
                String fnames = result.group(2);

                if ( log.isTraceEnabled() )
                    log.trace( "creating Author: lastname="
                        + lname
                        + ", firstnames="
                        + fnames
                    );

                // hyphenated firstnames stay together; only whitespace
                // separates one firstname from the next
                this.author = new Author( lname, fnames.split("\\s+") );
            }
        }
        ,

        /**
        *   Format spec to match a string containing a regular list of
        *   first names then a lastname eg: <tt>Mathew John Harrison</tt>.
        *   A single word is taken to be a lastname without firstnames, and
        *   lastname prefixes such as <tt>de la</tt> or <tt>mc</tt> that
        *   directly precede the final word are made part of the lastname.
        */
        Firstnames_Then_Lastname
        (
            "("                     // match
            +   "[\\w']+"           // any number
            +   "(?:"               // of possibly
            +       "(?:-|\\s)"     // hyphenated
            +       "[\\w']+"       // words and
            +   ")*"                // then split
            + ")"                   // them later
        )
        {
            @Override
            void convertMatchResult2Author( MatchResult result )
            {
                // the pattern guarantees at least one word
                String[] names = result.group().split("\\s+");

                // the lastname is the final word plus any unbroken run of
                // lastname prefixes immediately in front of it
                int lastname_start = names.length - 1;
                while ( lastname_start > 0
                    &&  isLastnamePrefix( names[lastname_start - 1] ) )
                {
                    if ( log.isTraceEnabled() )
                        log.trace( "assuming "
                            + names[lastname_start - 1]
                            + " is part of lastname"
                        );
                    lastname_start--;
                }

                StringBuilder lastname = new StringBuilder( names[lastname_start] );
                for ( int i = lastname_start + 1; i < names.length; i++ )
                    lastname.append(' ').append( names[i] );

                // everything in front of the lastname is a firstname;
                // this is an empty array for a single-word name
                String[] fnames = new String[ lastname_start ];
                System.arraycopy( names, 0, fnames, 0, lastname_start );

                if ( log.isTraceEnabled() )
                {
                    if ( fnames.length == 0 )
                        log.trace( "creating Author: lastname="
                            + lastname
                            + ", no firstnames"
                        );
                    else
                        log.trace( "creating Author: lastname="
                            + lastname
                            + ", firstnames="
                            + join(" ", fnames)
                        );
                }

                this.author = new Author( lastname.toString(), fnames );
            }
        }
        ; // end enumeration of Formats

        /** Compiled regular expression for this format; null if none was given. */
        Pattern pattern = null;

        /** Result of the most recent successful {@link #matches match}, else null. */
        Author author = null;

        /** Constructor; compiles the given regular expression if there is one. */
        Format( String regexp )
        {
            if ( regexp != null && regexp.length() > 0 )
                this.pattern = Pattern.compile( regexp );
        }

        /**
        *   Convert a {@link MatchResult} to an {@link Author} object;
        *   intended to be overridden by the individual (anonymous) Enum
        *   instances, which store the result in {@link #author}.
        */
        abstract void convertMatchResult2Author( MatchResult result );

        /**
        *   Returns true if the given word is one of the known lastname
        *   prefixes ('de', 'la', 'mc', ...), compared case-insensitively.
        */
        private static boolean isLastnamePrefix( String word )
        {
            for ( String prefix : Awkward_Names )
                if ( prefix.equalsIgnoreCase( word ) )
                    return true;

            return false;
        }

        /**
        *   Returns an {@link Author} object from the input string most
        *   recently passed to the {@link #matches} method, or null if that
        *   string did not match (or nothing has been matched yet).
        */
        public Author getAuthor() {  return author;  }

        /**
        *   Returns true if the whole of the passed input String is matched
        *   by the current format, in which case the parsed Author is
        *   available from {@link #getAuthor}. A null input never matches.
        */
        public boolean matches( String input )
        {
            // forget the previous result first, so that a failed match can
            // never leave a stale Author behind for getAuthor()
            this.author = null;

            if ( log.isTraceEnabled() )
                log.trace("trying to match '" + input + "' against " + this );

            if ( this.pattern == null || input == null )
                return false;

            Matcher m = this.pattern.matcher( input );
            if ( ! m.matches() )
            {
                log.trace("match failed");
                return false;
            }

            log.trace("match succeeded");
            convertMatchResult2Author( m.toMatchResult() );
            return true;
        }

    } // end enum Format --------------------------------------------


    //~~~~~~~~~~~~~~~~~~~~~~~ STATIC FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~

    /** Logging handle. */
    protected static final Logger log
        = Logger.getLogger( Author.class.getName() );

    /** {@link Pattern} used to {@link Pattern#split split} a
    *   {@link String} containing potentially multiple {@link Author}s;
    *   the default pattern is to split on a semicolon ';'. */
    public static Pattern Multiple_Authors
        = Pattern.compile("\\s*;\\s*");

    /** Words that are treated as part of a lastname when they directly
    *   precede it, compared case-insensitively. */
    private static final String[] Awkward_Names =
        { "della" , "de" , "di" , "du" , "le", "la" , "mc" , "mac" };


    //~~~~~~~~~~~~~~~~~~~~~~~~ FIELDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    /** Author's lastname */
    private final String lastname;

    /** List of firstnames -- a firstname may be an initial only!
    *   Never null, but may be empty. */
    private final String[] firstnames;


    //~~~~~~~~~~~~~~~~~~~~~~ CONSTRUCTORS ~~~~~~~~~~~~~~~~~~~~~~~~~

    /**
    *   Attempts to create an Author by parsing the given text string,
    *   which is assumed to be the author's full name.
    *
    *   @see #parseAuthor(String)
    *   @see #parseAuthor(String,Format)
    *   @throws IllegalArgumentException
    *   if passed {@link String} is null or zero-length, or if no
    *   {@link Format} is able to parse it.
    */
    public Author( String name )
    {
        Author their = parseAuthor( name );
        if ( their == null )
            throw new IllegalArgumentException(
                "Could not parse an author from text string '" + name + "'" );

        this.lastname = their.lastname;
        this.firstnames = their.firstnames;
    }

    /**
    *   Explicit lastname/firstnames constructor. Initials
    *   are acceptable as firstnames.
    *
    *   @param last_name
    *   the author's lastname.
    *   @param first_names
    *   the author's firstnames in order; may be omitted, empty or null
    *   for an author known by lastname only. The array is copied.
    */
    public Author( String last_name, String... first_names )
    {
        this.lastname = last_name;
        this.firstnames = ( first_names == null )
            ? new String[0]
            : first_names.clone();
    }


    //~~~~~~~~~~~~~~~~~~~~~~ STATIC METHODS ~~~~~~~~~~~~~~~~~~~~~~~

    /**
    *   Checks and trims raw author text prior to parsing.
    *   Relies on the statically imported checkNotNull/checkNotEmpty
    *   helpers to reject null and empty text.
    */
    private static String cleanAuthorText( String text )
    {
        checkNotNull( text );
        String author_text = text.trim();
        checkNotEmpty( author_text );
        return author_text;
    }

    /**
    *   Attempts to parse an {@link Author} from a text string of
    *   arbitrary name format, by trying every {@link Format} in
    *   declaration order and using the first one that matches.
    *
    *   @throws IllegalArgumentException
    *   if passed {@link String} is null, zero-length, or contains
    *   only space characters.
    *   @return Author object if parseable, null if not.
    */
    public static Author parseAuthor( String text )
    {
        String author_text = cleanAuthorText( text );

        for ( Format format : Format.values() )
        {
            if ( format.matches( author_text ) )
                return format.getAuthor();
        }

        if ( log.isTraceEnabled() )
            log.trace( "Could not parse an author from text string at '"
                + author_text
                + "'"
            );

        return null;
    }

    /**
    *   Attempts to parse an {@link Author} from a text string using
    *   the given {@link Format}.
    *
    *   @throws IllegalArgumentException
    *   if passed {@link String} is null, zero-length, or contains
    *   only space characters, or if the passed {@link Format} is null.
    *   @return Author object if parseable, null if not.
    */
    public static Author parseAuthor( String text, Format format )
    {
        if ( format == null )
            throw new IllegalArgumentException("Format cannot be null");

        String author_text = cleanAuthorText( text );

        if ( format.matches( author_text ) )
            return format.getAuthor();

        if ( log.isTraceEnabled() )
            log.trace( "Could not parse an author from text string at '"
                + author_text
                + "' using format "
                + format
            );

        return null;
    }

    /**
    *   Parses a string for Authors, returning the list of Authors
    *   found. Entries that are blank or cannot be parsed are logged
    *   and left out of the returned list.
    *   @see #Multiple_Authors
    */
    public static List<Author> parseAuthorList( String author_list_string )
    {
        checkNotNull( author_list_string );
        checkNotEmpty( author_list_string );

        String[] authors = Multiple_Authors.split( author_list_string );

        if ( log.isDebugEnabled() )
            log.debug( "input authorlist string '"
                + author_list_string
                + "' parses into list: "
                + join(", ", authors)
            );

        List<Author> authorlist = new ArrayList<Author>( authors.length );
        for ( String author_string : authors )
        {
            // a whitespace-only entry would be rejected by parseAuthor
            // with an exception, so treat it the same as an empty one
            if ( author_string.trim().length() == 0 )
            {
                log.warn("Encountered zero-length author name, skipping...");
                continue;
            }

            Author a = parseAuthor( author_string );
            if ( a == null )
            {
                log.warn(
                    "! could not parse an Author from string '"
                    + author_string
                    + "' -- skipping..."
                );
                continue;
            }

            authorlist.add( a );
        }

        return authorlist;
    }

    /**
    *   Parses a string for Authors using the given {@link Format},
    *   returning the list of Authors found. Blank entries are left out;
    *   unlike {@link #parseAuthorList(String)}, an entry that cannot be
    *   parsed is represented in the returned list by a placeholder Author
    *   named <tt>&lt;name&gt; &lt;surname&gt;</tt>.
    *   @see #Multiple_Authors
    */
    public static List<Author> parseAuthorList( String author_list_string, Format format )
    {
        checkNotNull( author_list_string );
        checkNotEmpty( author_list_string );

        String[] authors = Multiple_Authors.split( author_list_string );

        if ( log.isTraceEnabled() )
            log.trace( "input authorlist string '"
                + author_list_string
                + "' parses into list: "
                + join(", ", authors)
            );

        List<Author> authorlist = new ArrayList<Author>( authors.length );
        for ( String author_string : authors )
        {
            // a whitespace-only entry would be rejected by parseAuthor
            // with an exception, so treat it the same as an empty one
            if ( author_string.trim().length() == 0 )
            {
                log.warn("Encountered zero-length author name, skipping...");
                continue;
            }

            Author a = parseAuthor( author_string, format );
            if ( a == null )
            {
                log.warn("Couldn't parse an Author from string: " + author_string );
                authorlist.add( new Author("<surname>","<name>") );
            }
            else
                authorlist.add( a );
        }

        return authorlist;
    }


    //~~~~~~~~~~~~~~~~~~~~~~~~~ METHODS ~~~~~~~~~~~~~~~~~~~~~~~~~~~

    /**
    *   Returns a copy of all firstnames, which may be initials if this
    *   Author was constructed using only firstname initials. The array
    *   is empty, never null, for an Author without firstnames.
    */
    public String[] getAllFirstnames()
    {
        return firstnames.clone();
    }

    /**
    *   Returns Author's (first) firstname; the statically imported
    *   checkNotEmpty helper is applied to the firstname list first.
    */
    public String getFirstname()
    {
        checkNotEmpty( firstnames );
        return firstnames[0];
    }

    /**
    *   Returns the list of initials for all firstnames, in order.
    *   Null or empty firstnames contribute no initial; the array is
    *   empty if there are no firstnames at all.
    */
    public char[] getFirstnameInitials()
    {
        StringBuilder initials = new StringBuilder( firstnames.length );
        for ( String name : firstnames )
        {
            if ( name != null && name.length() > 0 )
                initials.append( name.charAt(0) );
        }

        return initials.toString().toCharArray();
    }

    /**
    *   Returns this {@link Author}'s initials as an (uppercase)
    *   {@link String}, ie: <tt>Mathew John Harrison</tt> returns
    *   <tt>"MJ"</tt>
    */
    public String getFirstnameInitialsString()
    {
        return new String( getFirstnameInitials() ).toUpperCase();
    }

    /**
    *   Returns author lastname, after applying the statically imported
    *   checkNotEmpty helper to it.
    */
    public String getLastname()
    {
        checkNotEmpty( lastname );
        return lastname;
    }

    /**
    *   Returns this Author's name as lastname, a space, then the
    *   initials of all firstnames without separators, eg:
    *   <tt>Harrison MJ</tt>. Only the lastname is returned when there
    *   are no initials.
    *   @see Format#Lastname_First_Then_Initials
    */
    public String toCitationString()
    {
        StringBuilder sb = new StringBuilder( getLastname() );

        char[] initials = getFirstnameInitials();
        if ( initials.length > 0 )
            sb.append(' ').append( initials );

        return sb.toString();
    }

    /**
    *   Returns this Author's name as all firstnames then the lastname,
    *   separated by spaces, eg: <tt>Mathew John Harrison</tt>. Only the
    *   lastname is returned when there are no firstnames.
    *   @see Format#Firstnames_Then_Lastname
    */
    @Override
    public String toString()
    {
        String[] names = getAllFirstnames();
        if ( names == null || names.length == 0 )
            return getLastname();

        return join(" ", names)
            + " "
            + getLastname()
            ;
    }

    /**
    *   Returns this Author's name in the given {@link Format}:
    *<ul>
    *   <li>{@link Format#Lastname_First_Then_Initials} gives
    *       {@link #toCitationString()}, eg: <tt>Harrison MJ</tt></li>
    *   <li>{@link Format#Lastname_First_Then_Firstnames} gives
    *       eg: <tt>Harrison, Mathew John</tt>, or only the lastname when
    *       there are no firstnames</li>
    *   <li>{@link Format#Firstnames_Then_Lastname}, or a null format,
    *       gives {@link #toString()}</li>
    *</ul>
    */
    public String toString( Format f )
    {
        if ( f == null )
            return toString();

        switch ( f )
        {
            case Lastname_First_Then_Initials:
                return toCitationString();

            case Lastname_First_Then_Firstnames:
                if ( firstnames.length == 0 )
                    return getLastname();
                return getLastname() + ", " + join(" ", firstnames);

            case Firstnames_Then_Lastname:
            default:
                return toString();
        }
    }

} // end class Author