/*
* Copyright (C) 2011 Laurent Caillette
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.novelang.build.antlr;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.novelang.parser.shared.Lexeme;
/**
 * Scans grammar source text for single-character token declarations and turns each of them
 * into a {@link Lexeme}.
 * <p>
 * A declaration has the shape {@code NAME : 'c' ;} where the quoted literal is a plain
 * character, a backslash-escaped character, or a backslash followed by {@code u} and four
 * hexadecimal digits. An optional trailing {@code //} comment may carry an HTML entity name
 * and a double-quoted alphanumeric name; both are passed to the {@link Lexeme} constructor.
 *
 * @author Laurent Caillette
 */
/*static*/ class LexemeDeclarationExtractor {

  private static final Logger LOGGER = LoggerFactory.getLogger( LexemeDeclarationExtractor.class ) ;

  /**
   * Matches one token declaration, optionally followed by a trailing {@code //} comment.
   * Groups:
   * <ol>
   *   <li>Token name.</li>
   *   <li>Token literal as single character.</li>
   *   <li>Token literal as escaped character (a backslash followed by one character).</li>
   *   <li>Token literal as unicode (a backslash, the letter {@code u}, four hex digits).</li>
   *   <li>Named HTML entity in the comment (optional).</li>
   *   <li>Double-quoted alphanumeric name in the comment (optional).</li>
   * </ol>
   * Groups 2 to 4 are alternatives: exactly one of them is non-null for a given match.
   */
  private static final Pattern TOKENS_DECLARATIONS = Pattern.compile(
      "([A-Z0-9_]+) *: *(?:'(.)'|'(\\\\.)'|'(\\\\u[a-fA-F0-9]{4})') *;(?: *//(?: *&([A-Za-z0-9]+);)?+(?: *\"([a-zA-Z0-9]+)\")?+)?"
  ) ;

  static {
    // SLF4J formats the argument lazily, so there is no need to call toString() here.
    LOGGER.debug( "Crafted regex {}", TOKENS_DECLARATIONS ) ;
  }

  /**
   * One converter per literal-capturing group of the regex, in group order.
   */
  private final CharacterConverter[] converters ;

  /**
   * @param converters the converters to apply, where the converter at index {@code i} handles
   *     the regex group at index {@code i + 2}. The array is copied.
   */
  public LexemeDeclarationExtractor( final CharacterConverter... converters ) {
    this.converters = converters.clone() ;
  }

  /**
   * Finds every token declaration in the given text.
   *
   * @param grammar the text to scan.
   * @return a new mutable set holding one {@link Lexeme} per declaration found;
   *     empty if there is none.
   */
  public static Set< Lexeme > extractLexemeDeclarations( final String grammar ) {
    final Matcher matcher = TOKENS_DECLARATIONS.matcher( grammar ) ;
    final Set< Lexeme > declarations = Sets.newHashSet() ;
    while( matcher.find() ) {
      final Lexeme declaration = extractLexemeDeclaration( matcher ) ;
      if( null != declaration ) {
        declarations.add( declaration ) ;
      }
    }
    return declarations ;
  }

  /**
   * Builds a {@link Lexeme} from the current match of the given matcher.
   *
   * @param matcher a matcher positioned on a successful match, whose pattern has one group
   *     for the token name, one group per converter, and two trailing groups for the comment.
   * @return the lexeme, or {@code null} if none of the literal groups captured anything.
   * @throws IllegalArgumentException if the group count does not fit the converter count.
   */
  private Lexeme extract( final Matcher matcher ) {
    // Token name + one group per converter + HTML entity name + quoted name.
    final int expectedGroupCount = converters.length + 3 ;
    Preconditions.checkArgument(
        matcher.groupCount() == expectedGroupCount,
        // Guava's message template only understands %s placeholders.
        "Matcher has %s groups (including lexeme name and comment) against %s converters",
        matcher.groupCount(),
        converters.length
    ) ;

    final String tokenName = matcher.group( 1 ) ;
    // The two comment-related groups are always the last ones.
    final String htmlEntityName = matcher.group( matcher.groupCount() - 1 ) ;
    final String ascii62 = matcher.group( matcher.groupCount() ) ;

    for( int converterIndex = 0 ; converterIndex < converters.length ; converterIndex++ ) {
      // Group 0 is the whole match and group 1 the token name, hence the offset.
      final int groupIndex = converterIndex + 2 ;
      final String match = matcher.group( groupIndex ) ;
      if( match != null ) {
        final Character character = converters[ converterIndex ].convert( match ) ;
        return new Lexeme( tokenName, character, htmlEntityName, ascii62 ) ;
      }
    }
    return null ;
  }

  /**
   * Applies {@link #DECLARATION_EXTRACTOR} to the current match of the given matcher.
   */
  private static Lexeme extractLexemeDeclaration( final Matcher matcher ) {
    return DECLARATION_EXTRACTOR.extract( matcher ) ;
  }

  /**
   * Declaration order matters: it corresponds to the group index in the regex.
   */
  private static final LexemeDeclarationExtractor DECLARATION_EXTRACTOR =
      new LexemeDeclarationExtractor(
          new LexemeDeclarationExtractor.LitteralCharacterConverter(),
          new LexemeDeclarationExtractor.EscapedCharacterConverter(),
          new LexemeDeclarationExtractor.UnicodeCharacterConverter()
      )
  ;

// ====================
// Character converters
// ====================

  /**
   * Turns the text captured by one regex group into the character it denotes.
   */
  interface CharacterConverter {
    Character convert( String declaration ) ;
  }

  /**
   * Converts a literal made of exactly one character into that character.
   */
  public static class LitteralCharacterConverter implements CharacterConverter {
    /**
     * @throws IllegalArgumentException if the given text is not exactly one character long.
     */
    @Override
    public Character convert( final String characterDeclaration ) {
      if( 1 != characterDeclaration.length() ) {
        throw new IllegalArgumentException(
            "Should contains one character only, was: '" + characterDeclaration + "'" ) ;
      }
      return characterDeclaration.charAt( 0 ) ;
    }
  }

  /**
   * Converts a literal made of a backslash, the letter {@code u} and hexadecimal digits
   * into the character with that code.
   */
  public static class UnicodeCharacterConverter implements CharacterConverter {
    /**
     * @throws IllegalArgumentException if the given text lacks the expected two-character
     *     prefix.
     * @throws NumberFormatException if what follows the prefix is not a hexadecimal number.
     */
    @Override
    public Character convert( final String characterDeclaration ) {
      if( ! characterDeclaration.startsWith( "\\u" ) ) {
        throw new IllegalArgumentException(
            "Should be unicode starting with '\\u', was: '" + characterDeclaration + "'" ) ;
      }
      // A leading '#' makes Integer.decode read the digits as hexadecimal.
      final String hex = "#" +
          characterDeclaration.substring( 2, characterDeclaration.length() ) ;
      final Integer decoded = Integer.decode( hex ) ;
      return ( char ) decoded.intValue() ;
    }
  }

  /**
   * Converts a literal made of a backslash and one character into that second character.
   * The character is returned as is: no escape sequence is interpreted.
   */
  public static class EscapedCharacterConverter implements CharacterConverter {
    /**
     * @throws IllegalArgumentException if the given text does not start with a backslash
     *     or is not exactly two characters long.
     */
    @Override
    public Character convert( final String characterDeclaration ) {
      if( ! characterDeclaration.startsWith( "\\" ) ) {
        throw new IllegalArgumentException(
            "Should be escaped starting with '\\', was: '" + characterDeclaration + "'" ) ;
      }
      if( characterDeclaration.length() != 2 ) {
        throw new IllegalArgumentException(
            "Should start with '\\' then 1 character, was: '" + characterDeclaration + "'" ) ;
      }
      return characterDeclaration.charAt( 1 ) ;
    }
  }
}