/*=============================================================================#
 # Copyright (c) 2015-2016 Stephan Wahlbrink (WalWare.de) and others.
 # All rights reserved. This program and the accompanying materials
 # are made available under the terms of the Eclipse Public License v1.0
 # which accompanies this distribution, and is available at
 # http://www.eclipse.org/legal/epl-v10.html
 #
 # Contributors:
 #     Stephan Wahlbrink - initial API and implementation
 #=============================================================================*/

package de.walware.docmlet.wikitext.commonmark.core;

import static de.walware.docmlet.wikitext.internal.commonmark.core.CommonRegex.HTML_ENTITY_PATTERN;
import static de.walware.docmlet.wikitext.internal.commonmark.core.CommonRegex.HTML_ENTITY_REGEX;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.walware.ecommons.text.core.util.HtmlUtils;

import com.google.common.escape.Escaper;


/**
 * Text helpers for the CommonMark parser: resolving HTML entity references, removing backslash
 * escapes, collapsing whitespace and classifying characters.
 * <p>
 * An instance reuses its regex matchers and one scratch {@link StringBuilder} across calls and
 * performs no synchronization, so a single instance must not be used by several threads at the
 * same time.</p>
 */
public class ParseHelper {
	
	
	/** Group 1: the character following a backslash; group 2: group 1 of the entity regex. */
	private static final Pattern ESCAPING_PATTERN= Pattern.compile("\\\\(.)" + "|" + HTML_ENTITY_REGEX);
	
	private static final Pattern WHITESPACE_PATTERN= Pattern.compile("\\s+");
	
	/** Replacement character (U+FFFD) substituted for entity references which cannot be used. */
	private static final String REPLACEMENT_CHARACTER= "\uFFFD"; //$NON-NLS-1$
	
	
	private final Matcher escapingMatcher= ESCAPING_PATTERN.matcher("");
	
	private final Matcher htmlEntityMatcher= HTML_ENTITY_PATTERN.matcher("");
	
	private final Matcher whitespaceMatcher= WHITESPACE_PATTERN.matcher("");
	
	private final StringBuilder tmpBuilder= new StringBuilder(0x40);
	
	
	public ParseHelper() {
	}
	
	
	/**
	 * Returns the matcher for HTML entity references owned by this helper.
	 * <p>
	 * It is the same instance which {@link #replaceHtmlEntities(String, Escaper)} resets and
	 * iterates, so its state is overwritten by that method.</p>
	 * 
	 * @return the shared matcher
	 */
	public Matcher getHtmlEntityMatcher() {
		return this.htmlEntityMatcher;
	}
	
	private Matcher getEscapingMatcher() {
		return this.escapingMatcher;
	}
	
	private Matcher getWhitespaceMatcher() {
		return this.whitespaceMatcher;
	}
	
	/** Returns the shared scratch buffer, emptied. */
	private StringBuilder getTmpBuilder() {
		this.tmpBuilder.setLength(0);
		return this.tmpBuilder;
	}
	
	
	/**
	 * Resolves a single HTML entity reference by {@link HtmlUtils#resolveEntity(String)}.
	 * <p>
	 * If the resolver rejects the reference with an {@link IllegalArgumentException}, or if the
	 * resolved text starts with the NUL character, the replacement character U+FFFD is returned
	 * instead.</p>
	 * 
	 * @param reference the entity reference, in the form captured by group 1 of the entity
	 *     pattern
	 * @return the resolved text, the replacement character, or <code>null</code> if the resolver
	 *     returns <code>null</code> (callers in this class then leave the source text unchanged)
	 */
	public String resolveHtmlEntity(final String reference) {
		final String replacement;
		try {
			replacement= HtmlUtils.resolveEntity(reference);
		}
		catch (final IllegalArgumentException e) {
			return REPLACEMENT_CHARACTER;
		}
		// isEmpty() check: charAt(0) on an empty result would throw an IndexOutOfBoundsException
		if (replacement != null && !replacement.isEmpty() && replacement.charAt(0) == 0) {
			return REPLACEMENT_CHARACTER;
		}
		return replacement;
	}
	
	/**
	 * Replaces all resolvable HTML entity references in the text.
	 * 
	 * @param text the source text
	 * @param escaper optional escaper applied to each resolved replacement before it is
	 *     inserted, or <code>null</code> to insert the replacement as is
	 * @return the text with the references replaced, or the specified <code>text</code> instance
	 *     itself if nothing was replaced
	 */
	public String replaceHtmlEntities(final String text, final Escaper escaper) {
		final StringBuilder sb= getTmpBuilder();
		final Matcher matcher= getHtmlEntityMatcher().reset(text);
		int lastEnd= 0;
		while (matcher.find()) {
			try {
				final int start= matcher.start();
				final String reference= matcher.group(1);
				String replacement= resolveHtmlEntity(reference);
				if (replacement != null) {
					if (escaper != null) {
						replacement= escaper.escape(replacement);
					}
					if (lastEnd < start) {
						sb.append(text, lastEnd, start);
					}
					sb.append(replacement);
					lastEnd= matcher.end();
				}
			}
			catch (final IllegalArgumentException ignored) {
				// Deliberately best effort: if resolving or escaping rejects the replacement,
				// nothing has been appended yet and the reference stays in the text as written.
			}
		}
		if (lastEnd == 0) { // lastEnd is advanced by replacements only
			return text;
		}
		if (lastEnd < text.length()) {
			sb.append(text, lastEnd, text.length());
		}
		return sb.toString();
	}
	
	/**
	 * Removes backslash escapes and replaces HTML entity references in the text.
	 * <p>
	 * A backslash is removed only if it is followed by an ASCII punctuation character (see
	 * {@link #isAsciiPunctuation(char)}); any other backslash sequence is kept unchanged.
	 * Entity references are resolved by {@link #resolveHtmlEntity(String)}.</p>
	 * 
	 * @param text the source text
	 * @return the processed text, or the specified <code>text</code> instance itself if nothing
	 *     was changed
	 */
	public String replaceEscaping(final String text) {
		final StringBuilder sb= getTmpBuilder();
		final Matcher matcher= getEscapingMatcher().reset(text);
		int lastEnd= 0;
		while (matcher.find()) {
			final int start= matcher.start();
			
			final int escapedIdx= matcher.start(1);
			if (escapedIdx >= 0) { // backslash alternative matched
				if (isAsciiPunctuation(text.charAt(escapedIdx))) {
					if (lastEnd < start) {
						sb.append(text, lastEnd, start);
					}
					// Skip the backslash only; the escaped character itself is copied together
					// with the following plain text.
					lastEnd= escapedIdx;
				}
				continue;
			}
			
			// entity alternative matched
			try {
				final String reference= matcher.group(2);
				final String replacement= resolveHtmlEntity(reference);
				if (replacement != null) {
					if (lastEnd < start) {
						sb.append(text, lastEnd, start);
					}
					sb.append(replacement);
					lastEnd= matcher.end();
				}
			}
			catch (final IllegalArgumentException e) {
				// Only reachable if an overriding resolveHtmlEntity throws; the implementation in
				// this class already maps this exception to the replacement character.
				if (lastEnd < start) {
					sb.append(text, lastEnd, start);
				}
				sb.append(REPLACEMENT_CHARACTER);
				lastEnd= matcher.end();
			}
		}
		if (lastEnd == 0) { // lastEnd is advanced by changes only
			return text;
		}
		if (lastEnd < text.length()) {
			sb.append(text, lastEnd, text.length());
		}
		return sb.toString();
	}
	
	/**
	 * Collapses each run of whitespace (regex {@code \s+}) inside the text to a single space
	 * and removes whitespace at the beginning and at the end of the text.
	 * 
	 * @param text the source text
	 * @return the collapsed text, or the specified <code>text</code> instance itself if it
	 *     contains no whitespace
	 */
	public String collapseWhitespace(final String text) {
		final StringBuilder sb= getTmpBuilder();
		final Matcher matcher= getWhitespaceMatcher().reset(text);
		int lastEnd= 0;
		while (matcher.find()) {
			final int start= matcher.start();
			if (start == 0) { // leading whitespace: drop
				lastEnd= matcher.end();
				continue;
			}
			sb.append(text, lastEnd, start);
			lastEnd= matcher.end();
			if (lastEnd == text.length()) { // trailing whitespace: drop
				break;
			}
			sb.append(' ');
		}
		if (lastEnd == 0) { // no whitespace found
			return text;
		}
		if (lastEnd < text.length()) {
			sb.append(text, lastEnd, text.length());
		}
		return sb.toString();
	}
	
	
	/**
	 * Tests if the character is whitespace: tab, line feed, carriage return, form feed, space,
	 * or a character from U+00A0 upwards of the Unicode category space separator (Zs).
	 * 
	 * @param ch the character to test
	 * @return <code>true</code> if the character is whitespace, otherwise <code>false</code>
	 */
	public boolean isUnicodeWhitespace(final char ch) {
		if (ch < 0xA0) {
			switch (ch) {
			case '\t':
			case '\n':
			case '\r':
			case '\f':
			case '\u0020':
				return true;
			default:
				return false;
			}
		}
		else {
			return (Character.getType(ch) == Character.SPACE_SEPARATOR);
		}
	}
	
	/**
	 * Tests if the character is one of the 32 ASCII punctuation characters
	 * <code>!"#$%&amp;'()*+,-./:;&lt;=&gt;?@[\]^_`{|}~</code>.
	 * 
	 * @param ch the character to test
	 * @return <code>true</code> if the character is ASCII punctuation, otherwise
	 *     <code>false</code>
	 */
	public boolean isAsciiPunctuation(final char ch) {
		switch (ch) {
		case '!':
		case '"':
		case '#':
		case '$':
		case '%':
		case '&':
		case '\'':
		case '(':
		case ')':
		case '*':
		case '+':
		case ',':
		case '-':
		case '.':
		case '/':
		case ':':
		case ';':
		case '<':
		case '=':
		case '>':
		case '?':
		case '@':
		case '[':
		case '\\':
		case ']':
		case '^':
		case '_':
		case '`':
		case '{':
		case '|':
		case '}':
		case '~':
			return true;
		default:
			return false;
		}
	}
	
	/**
	 * Tests if the character is punctuation: ASCII punctuation (see
	 * {@link #isAsciiPunctuation(char)}) or a character of one of the Unicode categories
	 * Pd, Ps, Pe, Pc, Po, Pi or Pf.
	 * 
	 * @param ch the character to test
	 * @return <code>true</code> if the character is punctuation, otherwise <code>false</code>
	 */
	public boolean isPunctuation(final char ch) {
		if (isAsciiPunctuation(ch)) {
			return true;
		}
		final int type= Character.getType(ch);
		switch (type) {
		case Character.DASH_PUNCTUATION:
		case Character.START_PUNCTUATION:
		case Character.END_PUNCTUATION:
		case Character.CONNECTOR_PUNCTUATION:
		case Character.OTHER_PUNCTUATION:
		case Character.INITIAL_QUOTE_PUNCTUATION:
		case Character.FINAL_QUOTE_PUNCTUATION:
			return true;
		default:
			return false;
		}
	}
	
}