/*
* Copyright (C) 2011 Laurent Caillette
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation, either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.novelang.parser;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import org.novelang.logger.Logger;
import org.novelang.logger.LoggerFactory;
import org.novelang.parser.shared.Lexeme;
/**
 * Table of escaped symbols, and helpers turning escape sequences back into characters.
 * <p>
 * Two lookup tables are built once, at class initialization, from
 * {@code GeneratedLexemes.getLexemes()}: one keyed by the escape name derived from each
 * lexeme's Unicode name (see {@link #unicodeUpperNameToEscapeName(String)}), and one keyed
 * by the lexeme's HTML entity name, for lexemes that define one.
 * Lookups try the Unicode-derived name first and fall back on the HTML entity name.
 * <p>
 * This is a static-only utility class; it cannot be instantiated.
 *
 * @author Laurent Caillette
 */
public final class SourceUnescape {

  private static final Logger LOGGER = LoggerFactory.getLogger( SourceUnescape.class ) ;

  /**
   * Escape name derived from the Unicode name, to the character. Immutable.
   */
  private static final Map< String, Character > UNICODE_ESCAPES ;

  /**
   * HTML entity name to the character, only for lexemes declaring one. Immutable.
   */
  private static final Map< String, Character > HTML_ENTITY_NAMES_ESCAPES ;

  /**
   * Left-pointing double angle quotation mark "«".
   * Must be the same as declared in the grammar!
   */
  public static final Character ESCAPE_START = '\u00ab' ; // «

  /**
   * Right-pointing double angle quotation mark "»".
   * Must be the same as declared in the grammar!
   */
  public static final Character ESCAPE_END = '\u00bb' ; // »

  private SourceUnescape() { }

  static {
    final Map< String, Character > byUnicodeName = Maps.newHashMap() ;
    final Map< String, Character > byHtmlEntityName = Maps.newHashMap() ;
    for( final Lexeme lexeme : GeneratedLexemes.getLexemes().values() ) {
      final Character character = lexeme.getCharacter() ;
      byUnicodeName.put( unicodeUpperNameToEscapeName( lexeme.getUnicodeName() ), character ) ;
      final String htmlEntityName = lexeme.getHtmlEntityName() ;
      if( htmlEntityName != null ) {
        byHtmlEntityName.put( htmlEntityName, character ) ;
      }
    }
    UNICODE_ESCAPES = ImmutableMap.copyOf( byUnicodeName ) ;
    HTML_ENTITY_NAMES_ESCAPES = ImmutableMap.copyOf( byHtmlEntityName ) ;
  }

  /**
   * Returns the table of escapes keyed by the name derived from the Unicode name.
   *
   * @return an immutable map; the same instance is returned by every call, which is safe
   *     because it cannot be modified.
   */
  public static Map< String, Character > getMainCharacterEscapes() {
    // UNICODE_ESCAPES is already an ImmutableMap, so there is no need to copy it again.
    return UNICODE_ESCAPES ;
  }

  /**
   * Resolves one escape name (without its delimiters) to the character it stands for.
   * The Unicode-derived names are searched first, then the HTML entity names.
   *
   * @param escaped the escape name, like the text found between {@link #ESCAPE_START}
   *     and {@link #ESCAPE_END}.
   * @return the corresponding character, never {@code null}.
   * @throws NoUnescapedCharacterException if neither table knows the given name.
   *     A warning is logged before throwing.
   */
  public static Character unescapeCharacter( final String escaped )
      throws NoUnescapedCharacterException
  {
    Character unescaped = UNICODE_ESCAPES.get( escaped ) ;
    if( unescaped == null ) {
      unescaped = HTML_ENTITY_NAMES_ESCAPES.get( escaped ) ;
    }
    if( unescaped == null ) {
      final NoUnescapedCharacterException exception =
          new NoUnescapedCharacterException( escaped ) ;
      LOGGER.warn( exception, "Unsupported symbol" ) ;
      throw exception ;
    }
    LOGGER.debug( "Escaped: '", escaped, "'" ) ;
    return unescaped ;
  }

  /**
   * Matches an escape name enclosed by {@link #ESCAPE_START} and {@link #ESCAPE_END}.
   * Group 2 captures the name alone: word characters, optionally split by single hyphens.
   * Must be declared after the two delimiter constants because static initializers run
   * in textual order.
   */
  private static final Pattern PLAIN_ESCAPE_PATTERN =
      Pattern.compile( "(" + ESCAPE_START + "(\\w+(?:-\\w+)*)" + ESCAPE_END + ")" ) ;

  /**
   * Matches an HTML-like entity ({@code &name;}). Within this class it is only logged at
   * initialization; {@link #unescapeText(String)} does not use it.
   */
  private static final Pattern HTML_ESCAPE_PATTERN =
      Pattern.compile( "(\\&(\\w+);)" ) ;

  static {
    LOGGER.debug( "Crafted regex ", PLAIN_ESCAPE_PATTERN.pattern() ) ;
    LOGGER.debug( "Crafted regex ", HTML_ESCAPE_PATTERN.pattern() ) ;
  }

  /**
   * Replaces every escape sequence delimited by {@link #ESCAPE_START} and
   * {@link #ESCAPE_END} with the character it stands for. Everything else is copied
   * unchanged.
   *
   * @param text the text to process; must not be {@code null}.
   * @return the text with every delimited escape sequence replaced.
   * @throws NoUnescapedCharacterException if an escape name is unknown, as thrown by
   *     {@link #unescapeCharacter(String)}.
   */
  public static String unescapeText( final String text )
      throws NoUnescapedCharacterException
  {
    final Matcher matcher = PLAIN_ESCAPE_PATTERN.matcher( text ) ;
    final StringBuilder buffer = new StringBuilder( text.length() ) ;
    int keepFrom = 0 ; // Index of the first character not copied into the buffer yet.
    while( matcher.find() ) {
      // Copy the plain text between the previous escape (or text start) and this one.
      // Appending an empty range is a no-op so no guard is needed.
      buffer.append( text, keepFrom, matcher.start() ) ;
      // unescapeCharacter never returns null: it throws for an unknown name.
      buffer.append( unescapeCharacter( matcher.group( 2 ) ) ) ;
      keepFrom = matcher.end() ;
    }
    // Copy whatever follows the last escape (the whole text if there was none).
    buffer.append( text, keepFrom, text.length() ) ;
    return buffer.toString() ;
  }

  /**
   * Derives an escape name from an upper-case Unicode name, by lower-casing it and
   * replacing each underscore by a hyphen.
   * Lower-casing uses {@link Locale#ROOT} so the result does not depend on the default
   * locale of the JVM (with a Turkish locale, {@code 'I'} would otherwise become a
   * dotless {@code 'ı'} and break lookups).
   *
   * @param upperName the name to convert; must not be {@code null}.
   * @return the lower-cased, hyphenated name.
   */
  public static String unicodeUpperNameToEscapeName( final String upperName ) {
    return upperName.toLowerCase( Locale.ROOT ).replace( '_', '-' ) ;
  }
}