/* $HeadURL$ * $Id$ * * Copyright (c) 2006-2010 by Public Library of Science * http://plos.org * http://ambraproject.org * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ambraproject.util; import freemarker.core.Environment; import freemarker.template.TemplateDirectiveModel; import freemarker.template.TemplateModel; import freemarker.template.TemplateDirectiveBody; import freemarker.template.TemplateException; import freemarker.template.TemplateModelException; import java.util.Map; import java.util.regex.Pattern; import java.io.IOException; import java.io.Writer; import java.nio.charset.CharacterCodingException; import java.nio.charset.CharsetEncoder; import java.nio.charset.CharsetDecoder; import java.nio.charset.Charset; import java.nio.charset.CodingErrorAction; import java.nio.CharBuffer; import java.nio.ByteBuffer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Freemarker directive for formatting and removing characters that may break javascript * */ public class SimpleTextDirective implements TemplateDirectiveModel { private static final Logger log = LoggerFactory.getLogger(SimpleTextDirective.class); private static final Pattern[] PATTERNS = { Pattern.compile("[\\n\\r]"), Pattern.compile("[“”\"]"), // Smart Quotes Pattern.compile("\u00a9"), // Copyright Pattern.compile("\u00ae"), // Registered Trademark Pattern.compile("\u2122"), // Trademark Pattern.compile("\u2013"), // ndash Pattern.compile("\u2014") // mdash }; private static final String[] REPLACEMENTS = { "", """, "©", "®", "™", "–", "—" }; public void execute(Environment environment, Map params, TemplateModel[] loopVars, TemplateDirectiveBody body) throws TemplateException, IOException { if (!params.isEmpty()) { throw new TemplateModelException( "ArticleFormattingDirective doesn't allow parameters."); } if (loopVars.length != 0) { throw new TemplateModelException( "ArticleFormattingDirective doesn't allow loop variables."); } if (body != null) { body.render(new AmbraTextWriter(environment.getOut())); } } private static class AmbraTextWriter extends Writer { private final Writer out; AmbraTextWriter(Writer out) { this.out = out; } public void write(char[] chars, int off, int len) throws IOException { out.write(plainText(new String(chars, off, len))); } public void flush() throws IOException { out.flush(); } public void close() throws IOException { out.close(); } } /** * This will convert a string to use the US-ASCII character set * It removes carrige returns, it replaces some characters with appropriate html entity codes * and it removes anything it doesn't understand * * @param str input string. * @return converted string. */ public static String plainText(String str) { if (str == null) return null; String result = str; //First find any known characters and replace them logically for (int i = 0; i < PATTERNS.length; i++) { result = PATTERNS[i].matcher(result).replaceAll(REPLACEMENTS[i]); } //Now, if there is any other characters we don't yet know about //Let's handle it somewhat gracefully Charset charset = Charset.forName("US-ASCII"); CharsetDecoder decoder = charset.newDecoder(); CharsetEncoder encoder = charset.newEncoder(); //Basically ignore weird characters encoder.onUnmappableCharacter(CodingErrorAction.IGNORE); ByteBuffer bbuf; try { bbuf = encoder.encode(CharBuffer.wrap(result)); CharBuffer cbuf = decoder.decode(bbuf); result = cbuf.toString(); } catch (CharacterCodingException ex) { //Lets not bring down the whole process if we error out log.error("Error trying to decode string: '" + str + "'", ex); return "?"; } return result; } }