/*
 * HtmlUtil.java example
 *
 * Explorer
 * extension-aws-master
 * - src
 * - webapp
 *   - WEB-INF
 *     - base
 *       - aws
 *         src
 *         model
 *         amazon
 *         S3RepositoryTest.java
 */
package org.openedit.entermedia.util;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.openedit.Data;
import org.openedit.data.Searcher;

import com.openedit.WebPageRequest;
import com.openedit.hittracker.HitTracker;
import com.openedit.hittracker.SearchQuery;


/**
 * Helpers for working with HTML fragments: truncating markup to a
 * visible-character budget without leaving tags open, stripping tags, wrapping
 * plain text in paragraphs and extracting keywords from field values.
 */
public class HtmlUtil {
	
	/** Visible-character budget used by {@link #trimHtml(WebPageRequest)} when the request supplies no usable "maxlength". */
	public static final int DEFAULT_HTML_LENGTH = 256;
	
	/** Characters reserved up front for the "..." suffix while truncating. */
	private static final int ELLIPSIS_LENGTH = 3;
	
	/** A cut is moved forward to the end of the current word only if that costs at most this many characters. */
	private static final int MAX_WORD_EXTENSION = 6;
	
	/** Any tag; DOTALL so that a tag spanning several lines is still one tag. */
	private static final Pattern ANY_TAG = Pattern.compile("<.*?>", Pattern.DOTALL);
	
	/**
	 * Tokenizer: an optional tag followed by the text up to the next angle bracket.
	 * "One, &lt;b&gt;Two&lt;/b&gt;, Three" yields "One, ", "&lt;b&gt;Two" and "&lt;/b&gt;, Three".
	 */
	private static final Pattern TAG_THEN_TEXT = Pattern.compile("(<.+?>)?([^<>]*)", Pattern.DOTALL);
	
	/**
	 * Elements that never take a closing tag (img, br, ...). Case-insensitive, and
	 * the look-ahead keeps "col" from matching "colgroup" or "frame" from matching
	 * "frameset".
	 */
	private static final Pattern VOID_TAG = Pattern.compile(
			"^<\\s*(img|br|input|hr|area|base|basefont|col|frame|isindex|link|meta|param)(?=[\\s/>]).*>$",
			Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
	
	/** Closing tag (including h1-h6), tolerating spaces inside the brackets. */
	private static final Pattern CLOSING_TAG = Pattern.compile("^<\\s*/\\s*([a-zA-Z]+[1-6]?)\\s*>$");
	
	/** Opening tag (including h1-h6), tolerating leading space and any attributes. */
	private static final Pattern OPENING_TAG = Pattern.compile("^<\\s*([a-zA-Z]+[1-6]?).*?>$", Pattern.DOTALL);
	
	/** Named, decimal and hexadecimal character references, e.g. &amp;nbsp; &amp;#160; &amp;#xA0;. */
	private static final Pattern ENTITY = Pattern.compile(
			"&[0-9a-z]{2,8};|&#[0-9]{1,7};|&#x[0-9a-f]{1,6};", Pattern.CASE_INSENSITIVE);
	
	/** Used to reject keyword candidates that contain a digit. */
	private static final Pattern DIGIT = Pattern.compile("\\d");
	
	/** Tag matcher used by {@link #getShortenedHTML(String, int)}; quoted attribute values may contain '&gt;'. */
	private static final Pattern QUOTE_AWARE_TAG = Pattern.compile("<(\"[^\"]*\"|'[^']*'|[^'\">])*>");
	
	/**
	 * Truncates the first non-empty HTML field of the current record and stores
	 * the result as the page value "trimmed".
	 * <p>
	 * Request inputs: "htmlfields" (comma-separated field names, required),
	 * "maxlength" (optional integer, defaults to {@link #DEFAULT_HTML_LENGTH})
	 * and the page value "item" or, failing that, "data". Nothing is stored when
	 * any required input is missing or no listed field yields a result.
	 *
	 * @param inRequest the current request
	 */
	public void trimHtml(WebPageRequest inRequest){
		int length = parseMaxLength(inRequest.findValue("maxlength"));
		String htmlfields = inRequest.findValue("htmlfields");
		if (htmlfields == null){
			return;
		}
		Data item = (Data) inRequest.getPageValue("item");
		if (item == null){
			item = (Data) inRequest.getPageValue("data");
		}
		if (item == null){
			return;
		}
		for (String rawField : htmlfields.split(",")){
			// "title, body" must address the field "body", not " body"
			String field = rawField.trim();
			if (field.isEmpty()){
				continue;
			}
			String value = item.get(field);
			if (value == null || value.isEmpty()){
				continue;
			}
			String html = null;
			try{
				html = truncateHTML(value, length);
			}catch (RuntimeException ignored){
				// best effort: a field that cannot be truncated is skipped and the next one is tried
			}
			if (html == null){
				continue;
			}
			inRequest.putPageValue("trimmed", html);
			break;
		}
	}
	
	/**
	 * Parses the "maxlength" request value.
	 *
	 * @return the parsed integer, or {@link #DEFAULT_HTML_LENGTH} when the value is missing, blank or not a number
	 */
	private static int parseMaxLength(String inValue){
		if (inValue == null){
			return DEFAULT_HTML_LENGTH;
		}
		String candidate = inValue.trim();
		if (candidate.isEmpty()){
			return DEFAULT_HTML_LENGTH;
		}
		try{
			return Integer.parseInt(candidate);
		}catch (NumberFormatException ignored){
			// a malformed limit is not an error for the page; fall back to the default
			return DEFAULT_HTML_LENGTH;
		}
	}
	
	/**
	 * Older tag-by-tag shortener; nothing in this class calls it any more
	 * ({@link #truncateHTML(String, int)} is used instead). Its behaviour is
	 * intentionally left as it was.
	 * <p>
	 * NOTE(review): known limitations visible in the code below - text before the
	 * first tag and after the last tag is never copied; closing and self-closing
	 * tags are only emitted when text directly precedes them; start tags are
	 * remembered together with their attributes, so a tag with attributes is never
	 * matched by its end tag and is closed as "&lt;/name attr=...&gt;".
	 *
	 * @param inHTML the markup to shorten
	 * @param inMaxLength approximate number of text characters to keep
	 * @return the shortened markup, or {@code inHTML} unchanged when nothing was assembled
	 * @throws Exception declared for subclasses; this implementation throws no checked exception
	 */
	protected String getShortenedHTML(String inHTML, int inMaxLength) throws Exception{
		StringBuilder content = new StringBuilder();
		Stack<String> stack = new Stack<String>();
		int contentCount = 0;
		int lastStart = -1;
		String input = inHTML;
		Matcher matcher = QUOTE_AWARE_TAG.matcher(input);
		while (matcher.find()){
			String tag = matcher.group();
			String htmlContent = null;
			if (lastStart > 0 && lastStart < matcher.start()){
				String substring = input.substring(lastStart, matcher.start()).trim();
				if (!substring.isEmpty()){
					if ( (contentCount + substring.length()) < inMaxLength){
						contentCount += substring.length();
						htmlContent = substring;
					} else {
						// over budget: keep whole words until the remaining allowance is used up
						int delta = (inMaxLength - contentCount);
						String [] tokens = substring.split("\\s");
						StringBuilder buf = new StringBuilder();
						for (String token:tokens){
							buf.append(token);
							if (buf.length() > delta){
								// a sentence-ending "." already supplies the first dot of the ellipsis
								buf.append(token.endsWith(".") ? ".." : "...");
								break;
							}
							buf.append(" ");
						}
						contentCount += buf.length();
						htmlContent = buf.toString();
					}
				}
			}
			lastStart = matcher.end();
			if (tag.endsWith("/>")){// solo tag like <br/>
				if (htmlContent!=null){// text first, then the tag that followed it
					content.append(htmlContent);
					content.append(tag);
					if (contentCount > inMaxLength){
						break;
					}
				}
			} else if (tag.startsWith("</")){// end tag
				String endtag = tag.replace("</", "").replace(">", "");
				if (!stack.isEmpty() && endtag.equals(stack.peek())){
					stack.pop();
				}
				if (htmlContent!=null){// text first, then the tag that followed it
					content.append(htmlContent);
					content.append(tag);
					if (contentCount > inMaxLength){
						break;
					}
				}
			} else {//start tag
				String starttag = tag.replace("<", "").replace(">", "");
				stack.push(starttag);
				content.append(tag);
				if (htmlContent!=null){
					content.append(htmlContent);
					if (contentCount > inMaxLength){
						break;
					}
				}
			}
		}
		// close whatever is still open, innermost first
		while(!stack.isEmpty()){
			content.append("</").append(stack.pop()).append(">");
		}
		if (content.length() == 0){
			content.append(inHTML);// nothing was assembled (e.g. no tags), return the input whole
		}
		return content.toString();
	}
	
	/**
	 * Convenience overload that truncates the value of one field.
	 *
	 * @param inData record to read from; must not be null
	 * @param inField name of the field holding HTML
	 * @param inLength maximum number of visible characters
	 * @return see {@link #truncateHTML(String, int)}
	 */
	public String truncateHTML(Data inData, String inField, int inLength){
		return truncateHTML(inData.get(inField),inLength);
	}
	
	/**
	 * Shortens an HTML fragment to roughly {@code length} visible characters
	 * (tags are not counted, a character reference counts as one character, three
	 * characters are reserved for the ellipsis). The cut is moved to a word
	 * boundary when that is close by, "..." is appended, and every element still
	 * open at the cut is closed so the result stays well-nested.
	 * <p>
	 * Example: {@code truncateHTML("One, <b>Two</b>, Three", 10)} gives
	 * {@code "One, <b>Two...</b>"}.
	 *
	 * @param text the HTML to shorten
	 * @param length maximum number of visible characters
	 * @return {@code null} for null or empty input, {@code text} itself when its
	 *         visible text already fits, otherwise the shortened markup
	 */
	public String truncateHTML(String text, int length) {
		if (text == null || text.length() == 0) {
			return null;
		}
		// if the plain text is not longer than the maximum length, return the whole text
		if (ANY_TAG.matcher(text).replaceAll("").length() <= length) {
			return text;
		}
		
		StringBuilder result = new StringBuilder();
		// elements opened but not yet closed, innermost first
		List<String> openTags = new ArrayList<String>();
		int totalLength = ELLIPSIS_LENGTH;
		boolean trimmed = false;
		// set once a cut position has been used up to the end of a text run; the
		// next run that carries text ends the output
		boolean proposingChop = false;
		
		Matcher tagMatcher = TAG_THEN_TEXT.matcher(text);
		while (tagMatcher.find()) {
			String tagText = tagMatcher.group(1);
			String plainText = tagMatcher.group(2);
			boolean hasTag = tagText != null && tagText.length() != 0;
			boolean hasText = plainText != null && plainText.length() != 0;
			
			if (proposingChop && hasTag && hasText) {
				trimmed = true;
				break;
			}
			
			// tags are copied through uncounted; only their nesting is tracked
			if (hasTag) {
				trackTag(tagText, openTags);
				result.append(tagText);
			}
			
			// visible length of this text run, each character reference counting as one
			int contentLength = ENTITY.matcher(plainText).replaceAll(" ").length();
			if (totalLength + contentLength > length) {
				int chopPosition = findChopPosition(plainText, length - totalLength);
				if (plainText.length() >= chopPosition) {
					result.append(plainText.substring(0, chopPosition));
					proposingChop = true;
					if (chopPosition < plainText.length()) {
						trimmed = true;
						break; // maximum length is reached
					}
				} else {
					result.append(plainText);
				}
			} else {
				result.append(plainText);
				totalLength += contentLength;
			}
			if (totalLength >= length) {
				trimmed = true;
				break;
			}
		}
		if (trimmed) {
			appendSuffix(result);
		}
		for (String openTag : openTags) {
			result.append("</").append(openTag).append(">");
		}
		return result.toString();
	}
	
	/**
	 * Updates the list of open elements for one tag: void elements are ignored, a
	 * closing tag removes the most recently opened element of that name, an
	 * opening tag is put at the front of the list.
	 */
	private static void trackTag(String inTag, List<String> inOpenTags){
		if (VOID_TAG.matcher(inTag).find()) {
			return;
		}
		Matcher closing = CLOSING_TAG.matcher(inTag);
		if (closing.find()) {
			// Locale.ROOT: tag names must not be subject to locale-specific case rules
			inOpenTags.remove(closing.group(1).toLowerCase(Locale.ROOT));
			return;
		}
		Matcher opening = OPENING_TAG.matcher(inTag);
		if (opening.find()) {
			inOpenTags.add(0, opening.group(1).toLowerCase(Locale.ROOT));
		}
	}
	
	/**
	 * Works out how many raw characters of a text run to keep.
	 *
	 * @param inPlainText text run without tags
	 * @param inBudget visible characters still available; values below zero are treated as zero
	 * @return index in {@code inPlainText} at which to cut; never negative
	 */
	private static int findChopPosition(String inPlainText, int inBudget){
		// clamp: a limit smaller than the ellipsis must not produce a negative index
		int remaining = Math.max(0, inBudget);
		int entityChars = 0;
		Matcher entity = ENTITY.matcher(inPlainText);
		while (entity.find()) {
			// only references that start inside the remaining budget count; each uses
			// one visible character but several raw ones
			if (entity.start() + 1 - entityChars > remaining) {
				break;
			}
			remaining--;
			entityChars += entity.end() - entity.start();
		}
		
		// keep us from chopping words in half
		int chop = remaining + entityChars;
		int endOfWord = inPlainText.indexOf(' ', chop - 1);
		if (endOfWord == -1) {
			endOfWord = inPlainText.length();
		}
		int extension = endOfWord - chop;
		if (extension > MAX_WORD_EXTENSION) { // chop the word if it's extra long
			extension = 0;
		}
		return chop + extension;
	}
	
	/**
	 * Pads trailing dots so that the buffer ends with exactly an ellipsis: nothing
	 * is added after "...", one dot after "..", two after "." and three otherwise.
	 *
	 * @param buf buffer to extend in place
	 */
	protected void appendSuffix(StringBuilder buf){
		int dots = 0;
		for (int i = buf.length() - 1; i >= 0 && dots < 3 && buf.charAt(i) == '.'; i--){
			dots++;
		}
		for (; dots < 3; dots++){
			buf.append('.');
		}
	}
	
	/**
	 * Truncates like {@link #truncateHTML(String, int)} and then removes all tags.
	 *
	 * @param inHTML the HTML to shorten
	 * @param inMaxLength maximum number of visible characters
	 * @return the shortened plain text, or {@code null} for null or empty input
	 */
	public String truncateHTMLtoPlainText(String inHTML, int inMaxLength){
		String html = truncateHTML(inHTML,inMaxLength);
		if (html == null){
			return null;
		}
		return ANY_TAG.matcher(html).replaceAll("");
	}
	
	/**
	 * Appends {@code inHTML} with every tag removed to {@code buf}. Character
	 * references are left as they are. A null input appends nothing.
	 *
	 * @param inHTML markup to strip, may be null
	 * @param buf receives the tag-free text
	 */
	public void stripHTML(String inHTML, StringBuilder buf){
		if (inHTML == null){
			return;
		}
		buf.append(ANY_TAG.matcher(inHTML).replaceAll(""));
	}
	
	/**
	 * Wraps every line of {@code inText} in a paragraph element. The text is
	 * inserted as-is, without HTML escaping.
	 *
	 * @param inText text with LF or CRLF line breaks, may be null
	 * @return the concatenated paragraphs; an empty string for null input
	 */
	public String toHTML(String inText)
	{
		StringBuilder buf = new StringBuilder();
		if (inText == null)
		{
			return buf.toString();
		}
		// accept CRLF too so that no stray carriage return ends up inside a paragraph
		for (String line : inText.split("\\r?\\n"))
		{
			buf.append("<p>").append(line).append("</p>");
		}
		return buf.toString();
	}
	
	/**
	 * Collects the distinct keywords found in the given fields of a record. Each
	 * field value has its tags removed before being tokenized by
	 * {@link #getAllKeywords(String, String)}.
	 *
	 * @param inCatalogId catalog id, passed through to the keyword filter
	 * @param inData record to read the fields from
	 * @param inFields names of the fields to scan
	 * @return distinct keywords in the order they were first encountered
	 * @throws Exception if tokenizing a value fails
	 */
	public ArrayList<String> generateKeywords(String inCatalogId, Data inData, ArrayList<String> inFields) throws Exception{
		Set<String> seen = new HashSet<String>();
		ArrayList<String> keywords = new ArrayList<String>();
		for (String field : inFields){
			String value = inData.get(field);
			if (value == null || value.isEmpty()){
				continue;
			}
			StringBuilder plain = new StringBuilder();
			stripHTML(value, plain);
			for (String keyword : getAllKeywords(inCatalogId, plain.toString())){
				// the set only de-duplicates; the list keeps a stable, first-seen order
				if (seen.add(keyword)){
					keywords.add(keyword);
				}
			}
		}
		return keywords;
	}
	
	/**
	 * Tokenizes a plain-text value with Lucene's {@link StandardAnalyzer} and
	 * returns every token that is non-empty, contains no digit and is not
	 * rejected by {@link #isStopWord(String, String)}. Duplicates are kept.
	 *
	 * @param inCatalogId catalog id, passed through to {@link #isStopWord(String, String)}
	 * @param inValue text to tokenize; null or empty yields an empty list
	 * @return the accepted tokens in document order
	 * @throws Exception if the token stream fails
	 */
	protected ArrayList<String> getAllKeywords(String inCatalogId, String inValue) throws Exception{
		ArrayList<String> keywords = new ArrayList<String>();
		if (inValue == null || inValue.isEmpty()){
			return keywords;
		}
		Analyzer analyzer = null;
		TokenStream stream = null;
		try{
			analyzer = new StandardAnalyzer(Version.LUCENE_40);
					/*new FullTextAnalyzer(Version.LUCENE_40);*/
					/*new EnglishAnalyzer(Version.LUCENE_40); */
			stream = analyzer.tokenStream(null, new StringReader(inValue));
			// the attribute instance is reused for every token, so it is looked up once
			CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
			stream.reset();
			while (stream.incrementToken()) {
				String word = term.toString().trim();
				if (word.isEmpty() || DIGIT.matcher(word).find() || isStopWord(inCatalogId, word)){
					continue;
				}
				keywords.add(word);
			}
			// TokenStream workflow: end() after the last incrementToken(), before close()
			stream.end();
		} finally {
			try{
				if (stream!=null) stream.close();
			}catch (Exception ignored){
				// closing is best effort; the keywords are already collected
			}
			try{
				if (analyzer!=null) analyzer.close();
			}catch (Exception ignored){
				// closing is best effort
			}
		}
		return keywords;
	}
	
	/**
	 * Decides whether a token must be excluded from the keywords. At present only
	 * tokens containing a digit are rejected; the catalog-specific stop-word
	 * lookup is disabled (kept below for reference).
	 *
	 * @param inCatalogId catalog id; unused while the lookup is disabled
	 * @param inWord token to test
	 * @return {@code true} if the token contains a digit
	 */
	protected boolean isStopWord(String inCatalogId, String inWord){
		if (DIGIT.matcher(inWord).find()){
			return true;
		}
//		Searcher searcher = getSearcherManager().getSearcher(inCatalogId, "stopword");
//		SearchQuery query = searcher.createSearchQuery();
//		query.addMatches("name",inWord);
//		HitTracker hits = searcher.search(query);
//		return hits.size() > 0;
		return false;
	}
	
	/*protected Set<String> getStopWords(String inCatalogId) throws Exception{
		Set<String> set = new HashSet<String>();
		Searcher searcher = getSearcherManager().getSearcher(inCatalogId, "stopword");
		HitTracker hits = searcher.getAllHits();
		Iterator<?> itr = hits.iterator();
		while(itr.hasNext()){
			String name = ((Data) itr.next()).getName();
			set.add(name);
		}
		return set;
	}*/

}