/*
* Copyright (C) 2015-2017 Emanuel Moecklin
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.onegravity.rteditor.converter;
import android.text.Annotation;
import android.text.Editable;
import android.text.Html;
import android.text.Spanned;
import com.onegravity.rteditor.api.format.RTHtml;
import com.onegravity.rteditor.api.format.RTPlainText;
import com.onegravity.rteditor.api.media.RTAudio;
import com.onegravity.rteditor.api.media.RTImage;
import com.onegravity.rteditor.api.media.RTVideo;
import org.xml.sax.XMLReader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
/**
* Converts html to plain text
*/
public class ConverterHtmlToText {
/**
* When generating previews, Spannable objects that can't be converted into a String are
* represented as 0xfffc. When displayed, these show up as undisplayed squares. These constants
* define the object character and the replacement character.
*/
private static final char PREVIEW_OBJECT_CHARACTER = (char) 0xfffc;
private static final char PREVIEW_OBJECT_REPLACEMENT = (char) 0x20; // space
/**
* toHtml() converts non-breaking spaces into the UTF-8 non-breaking space, which doesn't get
* rendered properly in some clients. Replace it with a simple space.
*/
private static final char NBSP_CHARACTER = (char) 0x00a0; // utf-8 non-breaking space
private static final char NBSP_REPLACEMENT = (char) 0x20; // space
public static RTPlainText convert(RTHtml<? extends RTImage, ? extends RTAudio, ? extends RTVideo> input) {
String result = Html.fromHtml(input.getText(), null, new HtmlToTextTagHandler())
.toString()
.replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)
.replace(NBSP_CHARACTER, NBSP_REPLACEMENT);
return new RTPlainText(result);
}
public static String convert(String text) {
return Html.fromHtml(text, null, new HtmlToTextTagHandler())
.toString()
.replace(PREVIEW_OBJECT_CHARACTER, PREVIEW_OBJECT_REPLACEMENT)
.replace(NBSP_CHARACTER, NBSP_REPLACEMENT);
}
/**
* Custom tag handler to use when converting HTML messages to text. It currently handles text
* representations of HTML tags that Android's built-in parser doesn't understand and hides code
* contained in STYLE and SCRIPT blocks.
*/
private static class HtmlToTextTagHandler implements Html.TagHandler {
// List of tags whose content should be ignored.
private static final Set<String> TAGS_WITH_IGNORED_CONTENT;
static {
Set<String> set = new HashSet<String>();
set.add("style");
set.add("script");
set.add("title");
set.add("!"); // comments
TAGS_WITH_IGNORED_CONTENT = Collections.unmodifiableSet(set);
}
@Override
public void handleTag(boolean opening, String tag, Editable output, XMLReader xmlReader) {
tag = tag.toLowerCase(Locale.US);
if (tag.equals("hr") && opening) {
// In the case of an <hr>, replace it with a bunch of underscores. This is roughly
// the behaviour of Outlook in Rich Text mode.
output.append("_____________________________________________\n");
} else if (TAGS_WITH_IGNORED_CONTENT.contains(tag)) {
handleIgnoredTag(opening, output);
}
}
private static final String IGNORED_ANNOTATION_KEY = "RT_ANNOTATION";
private static final String IGNORED_ANNOTATION_VALUE = "hiddenSpan";
/**
* When we come upon an ignored tag, we mark it with an Annotation object with a specific key
* and value as above. We don't really need to be checking these values since Html.fromHtml()
* doesn't use Annotation spans, but we should do it now to be safe in case they do start using
* it in the future.
*
* @param opening If this is an opening tag or not.
* @param output Spannable string that we're working with.
*/
private void handleIgnoredTag(boolean opening, Editable output) {
int len = output.length();
if (opening) {
output.setSpan(new Annotation(IGNORED_ANNOTATION_KEY, IGNORED_ANNOTATION_VALUE), len,
len, Spanned.SPAN_MARK_MARK);
} else {
Object start = getOpeningAnnotation(output);
if (start != null) {
int where = output.getSpanStart(start);
// Remove the temporary Annotation span.
output.removeSpan(start);
// Delete everything between the start of the Annotation and the end of the string
// (what we've generated so far).
output.delete(where, len);
}
}
}
/**
* Fetch the matching opening Annotation object and verify that it's the one added by us.
*
* @param output Spannable string we're working with.
* @return Starting Annotation object.
*/
private Object getOpeningAnnotation(Editable output) {
Object[] objs = output.getSpans(0, output.length(), Annotation.class);
for (int i = objs.length - 1; i >= 0; i--) {
Annotation span = (Annotation) objs[i];
if (output.getSpanFlags(objs[i]) == Spanned.SPAN_MARK_MARK
&& span.getKey().equals(IGNORED_ANNOTATION_KEY)
&& span.getValue().equals(IGNORED_ANNOTATION_VALUE)) {
return objs[i];
}
}
return null;
}
}
}