package org.openedit.entermedia.util;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.openedit.Data;
import org.openedit.data.Searcher;
import com.openedit.WebPageRequest;
import com.openedit.hittracker.HitTracker;
import com.openedit.hittracker.SearchQuery;
/**
 * HTML helpers: truncating markup without leaving tags unclosed, stripping
 * tags, wrapping plain text in paragraphs, and extracting keywords from field
 * values with a Lucene analyzer.
 */
public class HtmlUtil {
    /** Maximum visible length used by {@link #trimHtml} when the request gives none. */
    public static final int DEFAULT_HTML_LENGTH = 256;

    /** Characters reserved up front for the "..." suffix when truncating. */
    private static final int SUFFIX_LENGTH = 3;

    /** A word may overrun the limit by at most this many characters before it is cut. */
    private static final int MAX_WORD_OVERRUN = 6;

    /** Any tag; DOTALL so tags that span several lines are matched as well. */
    private static final Pattern ANY_TAG = Pattern.compile("<.*?>", Pattern.DOTALL);

    /*
     * Splits markup into tokens that each start with an (optional) tag.
     * For example, "One, <b>Two</b>, Three" produces the following:
     * One,
     * <b>Two
     * </b>, Three
     */
    private static final Pattern TAG_TOKEN = Pattern.compile("(<.+?>)?([^<>]*)", Pattern.DOTALL);

    /*
     * Tags that never get a closing tag: the void elements (img, br, ...) in
     * any letter case, plus anything written in self-closed form ("<x ... />").
     * The name must be followed by whitespace, "/" or ">" so that e.g.
     * <colgroup> is not mistaken for <col>.
     */
    private static final Pattern EMPTY_TAG = Pattern.compile(
            "^<\\s*(?:(?:img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(?:[\\s/].*)?|.+/\\s*)>$",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /* Closing tags (including h1-h6), allowing spaces inside the brackets. */
    private static final Pattern CLOSING_TAG = Pattern.compile("^<\\s*/\\s*([a-zA-Z]+[1-6]?)\\s*>$");

    /* Opening tags (including h1-h6), allowing spaces inside the brackets. */
    private static final Pattern OPENING_TAG = Pattern.compile("^<\\s*([a-zA-Z]+[1-6]?).*?>$", Pattern.DOTALL);

    /* HTML entities: named (&gt;), decimal (&#160;) and hexadecimal (&#xA0;). */
    private static final Pattern ENTITY = Pattern.compile(
            "&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};", Pattern.CASE_INSENSITIVE);

    /* Tag matcher used by getShortenedHTML; tolerates '>' inside quoted attribute values. */
    private static final Pattern QUOTE_AWARE_TAG = Pattern.compile("<(\"[^\"]*\"|'[^']*'|[^'\">])*>");

    private static final Pattern CONTAINS_DIGIT = Pattern.compile("\\d");

    /**
     * Truncates the first non-empty HTML field of the current record and
     * publishes the result as the page value "trimmed".
     * <p>
     * Reads the request values "maxlength" (optional, falls back to
     * {@link #DEFAULT_HTML_LENGTH} when missing or not a number) and
     * "htmlfields" (comma separated field names). The record is taken from
     * the page value "item", or "data" when "item" is absent. Does nothing
     * when no field list or record is available.
     *
     * @param inRequest the current request
     */
    public void trimHtml(WebPageRequest inRequest) {
        int length = DEFAULT_HTML_LENGTH;
        String maxlength = inRequest.findValue("maxlength");
        if (maxlength != null && !maxlength.trim().isEmpty()) {
            try {
                length = Integer.parseInt(maxlength.trim());
            } catch (NumberFormatException ignored) {
                // not a number: keep the default length
            }
        }
        String htmlfields = inRequest.findValue("htmlfields");
        if (htmlfields == null) {
            return;
        }
        Data item = (Data) inRequest.getPageValue("item");
        if (item == null) {
            item = (Data) inRequest.getPageValue("data");
        }
        if (item == null) {
            return;
        }
        for (String rawField : htmlfields.split(",")) {
            String field = rawField.trim(); // tolerate "a, b" style lists
            if (field.isEmpty()) {
                continue;
            }
            String value = item.get(field);
            if (value == null || value.isEmpty()) {
                continue;
            }
            String html;
            try {
                // html = getShortenedHTML(value, length);
                html = truncateHTML(value, length);
            } catch (RuntimeException ignored) {
                // best effort: a field that cannot be truncated is skipped
                continue;
            }
            if (html == null) {
                continue;
            }
            inRequest.putPageValue("trimmed", html);
            break; // only the first usable field is published
        }
    }

    /**
     * Legacy tag-aware shortener; {@link #trimHtml} uses
     * {@link #truncateHTML(String, int)} instead.
     * <p>
     * Walks the tags of the input, copies the text between them until roughly
     * {@code inMaxLength} characters have been emitted, and closes whatever
     * elements are still open. Input without any tag is returned unchanged.
     * <p>
     * NOTE(review): apart from tracking element names (see below) the original
     * behaviour is kept as-is, including these quirks: text before the first
     * tag and after the last tag is not copied; text preceding a start tag is
     * emitted after that tag; end tags and self-closed tags that are not
     * preceded by text are not emitted; text is trimmed before it is copied.
     *
     * @param inHTML the markup to shorten
     * @param inMaxLength approximate maximum number of text characters
     * @return the shortened markup
     * @throws Exception declared for compatibility with existing overrides/callers
     */
    protected String getShortenedHTML(String inHTML, int inMaxLength) throws Exception {
        StringBuilder content = new StringBuilder();
        Stack<String> stack = new Stack<String>();
        int contentCount = 0;
        int lastStart = -1;
        Matcher matcher = QUOTE_AWARE_TAG.matcher(inHTML);
        while (matcher.find()) {
            String tag = matcher.group();
            String htmlContent = null;
            if (lastStart > 0 && lastStart < matcher.start()) {
                String substring = inHTML.substring(lastStart, matcher.start()).trim();
                if (!substring.isEmpty()) {
                    if ((contentCount + substring.length()) < inMaxLength) {
                        htmlContent = substring;
                    } else {
                        // over budget: cut at a word boundary
                        htmlContent = clipAtWord(substring, inMaxLength - contentCount);
                    }
                    contentCount += htmlContent.length();
                }
            }
            lastStart = matcher.end();
            if (tag.endsWith("/>")) { // solo tag like <br/>
                if (htmlContent != null) {
                    content.append(htmlContent);
                    content.append(tag);
                    if (contentCount > inMaxLength) {
                        break;
                    }
                }
            } else if (tag.startsWith("</")) { // end tag
                if (!stack.isEmpty() && elementName(tag).equals(stack.peek())) {
                    stack.pop();
                }
                if (htmlContent != null) {
                    content.append(htmlContent);
                    content.append(tag);
                    if (contentCount > inMaxLength) {
                        break;
                    }
                }
            } else { // start tag
                // Track the element name only; pushing the attributes as well
                // made "</a>" never match and produced closers like </a href="x">.
                stack.push(elementName(tag));
                content.append(tag);
                if (htmlContent != null) {
                    content.append(htmlContent);
                    if (contentCount > inMaxLength) {
                        break;
                    }
                }
            }
        }
        while (!stack.isEmpty()) {
            content.append("</").append(stack.pop()).append(">");
        }
        if (content.length() == 0) {
            content.append(inHTML); // input did not have any tags, return as whole
        }
        return content.toString();
    }

    /** Copies whole words of {@code text} until more than {@code budget} characters are used, then adds an ellipsis. */
    private static String clipAtWord(String text, int budget) {
        StringBuilder buf = new StringBuilder();
        for (String token : text.split("\\s")) {
            buf.append(token);
            if (buf.length() > budget) {
                buf.append(token.endsWith(".") ? ".." : "...");
                break;
            }
            buf.append(" ");
        }
        return buf.toString();
    }

    /** Returns the element name of a start or end tag, without brackets, slash or attributes. */
    private static String elementName(String tag) {
        String inner = tag.replace("<", "").replace(">", "").replace("/", " ").trim();
        return inner.split("\\s+")[0];
    }

    /**
     * Truncates the HTML stored in one field of a record.
     *
     * @param inData the record to read from
     * @param inField name of the field holding the markup
     * @param inLength maximum number of visible characters
     * @return see {@link #truncateHTML(String, int)}
     */
    public String truncateHTML(Data inData, String inField, int inLength) {
        return truncateHTML(inData.get(inField), inLength);
    }

    /**
     * Truncates markup to about {@code length} visible characters.
     * <p>
     * Tags are copied without being counted, an entity counts as a single
     * character, and three characters of the budget are reserved for the
     * "..." suffix. The cut is moved to the end of the current word when that
     * is at most six characters away. Elements still open at the cut are
     * closed after the suffix.
     *
     * @param text the markup to truncate
     * @param length maximum number of visible characters
     * @return {@code null} for null/empty input; the input itself when its
     *         tag-free text already fits; otherwise the truncated markup
     */
    public String truncateHTML(String text, int length) {
        if (text == null || text.length() == 0) {
            return null;
        }
        // if the plain text is shorter than the maximum length, return the whole text
        if (ANY_TAG.matcher(text).replaceAll("").length() <= length) {
            return text;
        }
        StringBuilder result = new StringBuilder();
        boolean trimmed = false;
        int totalLength = SUFFIX_LENGTH;
        // most recently opened element first
        List<String> openTags = new ArrayList<String>();
        // set once text has been cut at its very end; the next tag that brings text stops the copy
        boolean proposingChop = false;
        Matcher tagMatcher = TAG_TOKEN.matcher(text);
        while (tagMatcher.find()) {
            String tagText = tagMatcher.group(1);
            String plainText = tagMatcher.group(2);
            boolean hasTag = tagText != null && tagText.length() > 0;
            boolean hasText = plainText != null && plainText.length() > 0;
            if (proposingChop && hasTag && hasText) {
                trimmed = true;
                break;
            }
            // tags are added to the output without being counted
            if (hasTag) {
                trackTag(tagText, openTags);
                result.append(tagText);
            }
            // length of the text part, counting every entity (e.g. &nbsp;) as one character
            int contentLength = ENTITY.matcher(plainText).replaceAll(" ").length();
            if (totalLength + contentLength > length) {
                // Clamp: for length < 3 the reserved suffix alone exceeds the
                // limit and a negative budget would lead to substring(0, -n).
                int budget = Math.max(0, length - totalLength);
                int chopPosition = findChopPosition(plainText, budget);
                if (plainText.length() >= chopPosition) {
                    result.append(plainText.substring(0, chopPosition));
                    proposingChop = true;
                    if (chopPosition < plainText.length()) {
                        trimmed = true;
                        break; // maximum length is reached, so get off the loop
                    }
                } else {
                    result.append(plainText);
                }
            } else {
                result.append(plainText);
                totalLength += contentLength;
            }
            // if the maximum length is reached, get off the loop
            if (totalLength >= length) {
                trimmed = true;
                break;
            }
        }
        if (trimmed) {
            appendSuffix(result);
        }
        for (String openTag : openTags) {
            result.append("</").append(openTag).append(">");
        }
        return result.toString();
    }

    /** Updates the list of open elements for one tag: void/self-closed tags are ignored, closing tags remove, opening tags add. */
    private void trackTag(String tagText, List<String> openTags) {
        if (EMPTY_TAG.matcher(tagText).find()) {
            return;
        }
        Matcher closing = CLOSING_TAG.matcher(tagText);
        if (closing.find()) {
            openTags.remove(closing.group(1).toLowerCase(java.util.Locale.ENGLISH));
            return;
        }
        Matcher opening = OPENING_TAG.matcher(tagText);
        if (opening.find()) {
            openTags.add(0, opening.group(1).toLowerCase(java.util.Locale.ENGLISH));
        }
    }

    /** Returns the index in {@code plainText} at which to cut so that about {@code visibleBudget} visible characters remain. */
    private int findChopPosition(String plainText, int visibleBudget) {
        int remaining = visibleBudget;
        int entitiesLength = 0;
        Matcher entityMatcher = ENTITY.matcher(plainText);
        while (entityMatcher.find()) {
            // only entities that start inside the remaining budget widen the cut
            if (entityMatcher.start() + 1 - entitiesLength > remaining) {
                break;
            }
            remaining--;
            entitiesLength += entityMatcher.group().length();
        }
        // keep us from chopping words in half
        int chopPosition = remaining + entitiesLength;
        int endOfWordPosition = plainText.indexOf(" ", chopPosition - 1);
        if (endOfWordPosition == -1) {
            endOfWordPosition = plainText.length();
        }
        int overrun = endOfWordPosition - chopPosition;
        if (overrun > MAX_WORD_OVERRUN) { // chop the word if it's extra long
            overrun = 0;
        }
        return chopPosition + overrun;
    }

    /**
     * Pads the buffer so that it ends with exactly three dots, reusing up to
     * two dots that are already at its end.
     *
     * @param buf the buffer to extend in place
     */
    protected void appendSuffix(StringBuilder buf) {
        int trailingDots = 0;
        for (int i = buf.length() - 1; i >= 0 && trailingDots < 3 && buf.charAt(i) == '.'; i--) {
            trailingDots++;
        }
        for (int i = trailingDots; i < 3; i++) {
            buf.append('.');
        }
    }

    /**
     * Truncates markup like {@link #truncateHTML(String, int)} and then removes
     * all tags from the result.
     *
     * @param inHTML the markup to truncate
     * @param inMaxLength maximum number of visible characters
     * @return the truncated text without tags, or {@code null} for null/empty input
     */
    public String truncateHTMLtoPlainText(String inHTML, int inMaxLength) {
        String html = truncateHTML(inHTML, inMaxLength);
        if (html == null) {
            return null;
        }
        return ANY_TAG.matcher(html).replaceAll("");
    }

    /**
     * Appends {@code inHTML} with all tags removed to {@code buf}. Entities are
     * left as they are. A null input appends nothing.
     *
     * @param inHTML the markup to strip, may be null
     * @param buf receives the stripped text
     */
    public void stripHTML(String inHTML, StringBuilder buf) {
        if (inHTML == null) {
            return;
        }
        buf.append(ANY_TAG.matcher(inHTML).replaceAll(""));
    }

    /**
     * Wraps every line of the text in a {@code <p>} element. The line content
     * is inserted as-is (it is not escaped). Both "\n" and "\r\n" line breaks
     * are recognised.
     *
     * @param inText the text to convert, may be null
     * @return the paragraphs, or an empty string for null input
     */
    public String toHTML(String inText) {
        if (inText == null) {
            return "";
        }
        StringBuilder buf = new StringBuilder();
        for (String line : inText.split("\r?\n")) {
            buf.append("<p>").append(line).append("</p>");
        }
        return buf.toString();
    }

    /**
     * Collects the distinct keywords found in the given fields of a record.
     * Each field value is stripped of tags and tokenized by
     * {@link #getAllKeywords}.
     *
     * @param inCatalogId catalog id handed on to the stop word check
     * @param inData the record to read from
     * @param inFields names of the fields to scan
     * @return the distinct keywords, in no particular order
     * @throws Exception if tokenizing fails
     */
    public ArrayList<String> generateKeywords(String inCatalogId, Data inData, ArrayList<String> inFields) throws Exception {
        Set<String> unique = new HashSet<String>();
        for (String field : inFields) {
            String value = inData.get(field);
            if (value == null || value.isEmpty()) {
                continue;
            }
            StringBuilder buf = new StringBuilder();
            stripHTML(value, buf);
            unique.addAll(getAllKeywords(inCatalogId, buf.toString()));
        }
        return new ArrayList<String>(unique);
    }

    /**
     * Tokenizes the text with a Lucene {@link StandardAnalyzer} and returns
     * the tokens that are not empty, contain no digit and are not stop words.
     * Duplicates are kept.
     *
     * @param inCatalogId catalog id handed on to {@link #isStopWord}
     * @param inValue the plain text to tokenize, may be null
     * @return the accepted tokens in order of appearance
     * @throws Exception if the analyzer fails
     */
    protected ArrayList<String> getAllKeywords(String inCatalogId, String inValue) throws Exception {
        ArrayList<String> keywords = new ArrayList<String>();
        if (inValue == null) {
            return keywords;
        }
        Analyzer analyzer = null;
        TokenStream stream = null;
        try {
            analyzer = new StandardAnalyzer(Version.LUCENE_40);
            /*new FullTextAnalyzer(Version.LUCENE_40);*/
            /*new EnglishAnalyzer(Version.LUCENE_40); */
            stream = analyzer.tokenStream(null, new StringReader(inValue));
            // the attribute instance is reused for every token, so fetch it once
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                String str = term.toString().trim();
                if (str.isEmpty() || CONTAINS_DIGIT.matcher(str).find() || isStopWord(inCatalogId, str)) {
                    continue;
                }
                keywords.add(str);
            }
            stream.end(); // part of the TokenStream consumer workflow, before close()
        } finally {
            try {
                if (stream != null) {
                    stream.close();
                }
            } catch (Exception ignored) {
                // best-effort cleanup
            }
            try {
                if (analyzer != null) {
                    analyzer.close();
                }
            } catch (Exception ignored) {
                // best-effort cleanup
            }
        }
        return keywords;
    }

    /**
     * Tells whether a word must be left out of the keywords. Currently only
     * words containing a digit are rejected.
     *
     * @param inCatalogId catalog id, unused until the lookup below is enabled
     * @param inWord the word to check
     * @return {@code true} when the word should be skipped
     */
    protected boolean isStopWord(String inCatalogId, String inWord) {
        if (CONTAINS_DIGIT.matcher(inWord).find()) {
            return true;
        }
        // TODO: per-catalog stop word lookup, disabled for now:
        // Searcher searcher = getSearcherManager().getSearcher(inCatalogId, "stopword");
        // SearchQuery query = searcher.createSearchQuery();
        // query.addMatches("name",inWord);
        // HitTracker hits = searcher.search(query);
        // return hits.size() > 0;
        return false;
    }

    /*protected Set<String> getStopWords(String inCatalogId) throws Exception{
    Set<String> set = new HashSet<String>();
    Searcher searcher = getSearcherManager().getSearcher(inCatalogId, "stopword");
    HitTracker hits = searcher.getAllHits();
    Iterator<?> itr = hits.iterator();
    while(itr.hasNext()){
    String name = ((Data) itr.next()).getName();
    set.add(name);
    }
    return set;
    }*/
}