/**
*
* Copyright
* 2009-2015 Jayway Products AB
* 2016-2017 Föreningen Sambruk
*
* Licensed under AGPL, Version 3.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.gnu.org/licenses/agpl.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package se.streamsource.streamflow.util;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.mozilla.universalchardet.UniversalDetector;
import org.restlet.engine.http.header.ContentType;
import org.xml.sax.ContentHandler;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
*
* @author henrikreinhold
*
*/
public class Translator
{
public static String HTML = "text/html";
public static String PLAIN = "text/plain";
public static String translate(String text, Map<String, String> translations)
{
return translate(text, translations, null);
}
public static String translate(String text, Map<String, String> translations, Map<String,String> variables)
{
if( text.startsWith("{") && text.endsWith("}") )
{
String[] tokens = text.substring(1,text.length()-1).split( "," );
String key = tokens[0];
String translation = translations.get(key);
if (Strings.empty(translation))
return "";
String[] args = new String[tokens.length-1];
System.arraycopy(tokens, 1, args, 0, args.length);
if (variables == null)
{
variables = new HashMap<String,String>();
}
for (String arg : args)
{
String[] variable = arg.split("=", 2);
if (variable.length == 2)
variables.put(variable[0],variable[1]);
}
return MessageTemplate.text(translation, variables);
} else
return text;
}
public static String htmlToText( String html )
{
String result = html;
String encoding = "";
// if HTML contains encoding information we do not have to guess encoding!
Document doc = Jsoup.parse( result );
Element meta = doc.select("meta[http-equiv]").first();
if( meta != null )
{
String contentString = meta.attr("content");
contentString = contentString.indexOf(' ') != -1 ? contentString.replace(' ', ';' ) : contentString;
ContentType contentType = new ContentType( contentString );
encoding = contentType.getCharacterSet() != null ? contentType.getCharacterSet().getName() : "";
}
ContentHandler handler = new BodyContentHandler( );
Metadata metadata = new Metadata();
try
{
// if we already found an encoding - don't guess
new HtmlParser().parse( IOUtils.toInputStream( result, (!Strings.empty(encoding) ? encoding : guessEncoding( result )) ), handler, metadata, new ParseContext());
result = handler.toString();
// replace "EN DASH" unicode char with -
// since Pdfbox COSString would interpret a string containing dash as UTF-16
if( result.indexOf("\u2013") != 1 )
{
result = result.replace("\u2013", "-" );
}
} catch (Exception e)
{
//do nothing
}
return result;
}
public static String cleanHtml( String html ) throws IOException
{
/*Tidy tidy = new Tidy();
tidy.setWord2000( true );
tidy.setQuiet( true );
tidy.setShowWarnings( false );
tidy.getConfiguration().printConfigOptions( new OutputStreamWriter( System.out ), true );
ByteArrayOutputStream baos = new ByteArrayOutputStream();
StringReader reader = new StringReader( html );
tidy.parse( reader, baos );
String result = baos.toString("UTF-8");
baos.close();
reader.close();
return result;*/
return html.replaceAll("<o:p>(\\s| )*</o:p>", "" );
}
public static String guessEncoding( String input) {
byte [] bytes = input.getBytes();
String DEFAULT_ENCODING = "UTF-8";
UniversalDetector detector = new UniversalDetector(null);
detector.handleData(bytes, 0, bytes.length);
detector.dataEnd();
String encoding = detector.getDetectedCharset();
detector.reset();
if (encoding == null) {
encoding = DEFAULT_ENCODING;
}
return encoding;
}
}