/* * $Id: HtmlReferenceRewriter.java,v 1.10.2.1 2007/01/12 19:31:41 idegaweb Exp $ * Created on 3.6.2004 * * Copyright (C) 2004-2005 Idega Software hf. All Rights Reserved. * * This software is the proprietary information of Idega hf. Use is subject to * license terms. */ package com.idega.util; import java.io.BufferedReader; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.URL; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.idega.core.builder.data.ICPage; /** * <p> * This class takes in a source (Reader) of a HTML document parses it and rewrites relative URLs (that are referencing URLs within the same host) * to be an abolute URL with http://[hostname]/[oldurl] * </p> * Last modified: $Date: 2007/01/12 19:31:41 $ by $Author: idegaweb $ * * @author <a href="mailto:tryggvil@idega.com">Tryggvi Larusson</a> * @version $Revision: 1.10.2.1 $ */ public class HtmlReferenceRewriter { private Reader input; private Writer output; private String urlPrefix; private List patterns; private boolean rewriteOptionValues=false; private static String SLASH="/"; /** * @return Returns the rewriteOptionValues. */ public boolean isRewriteOptionValues() { return this.rewriteOptionValues; } /** * Sets if to treat option values (in a select tag) as URLs and rewrite them also * @param rewriteOptionValues The rewriteOptionValues to set. */ public void setRewriteOptionValues(boolean rewriteOptionValues) { this.rewriteOptionValues = rewriteOptionValues; } /** * @return Returns the patterns. */ public List getPatterns() { if(this.patterns==null){ this.patterns = new ArrayList(); Pattern p1 = Pattern.compile("(<a[^>]+href=\")([^#][^\"]+)([^>]+>)",Pattern.CASE_INSENSITIVE); this.patterns.add(p1); Pattern p2 = Pattern.compile("(<link[^>]+href=\")([^#][^\"]+)([^>]+>)",Pattern.CASE_INSENSITIVE); this.patterns.add(p2); Pattern p3 = Pattern.compile("(<img[^>]+src=\")([^#][^\"]+)([^>]+>)",Pattern.CASE_INSENSITIVE); this.patterns.add(p3); Pattern p4 = Pattern.compile("(<script[^>]+src=\")([^#][^\"]+)([^>]+>)",Pattern.CASE_INSENSITIVE); this.patterns.add(p4); Pattern p5 = Pattern.compile("(<input[^>]+src=\")([^#][^\"]+)([^>]+>)",Pattern.CASE_INSENSITIVE); this.patterns.add(p5); Pattern p6 = Pattern.compile("(<form[^>]+action=\")([^#][^\"]+)([^>]+>)",Pattern.CASE_INSENSITIVE); this.patterns.add(p6); Pattern p7 = Pattern.compile("(<embed[^>]+src=\")([^#][^\"]+)([^>]+>)",Pattern.CASE_INSENSITIVE); this.patterns.add(p7); if(this.isRewriteOptionValues()){ Pattern p8 = Pattern.compile("(<option[^>]+value=\")([^#][^\"]+)([^>]+>)",Pattern.CASE_INSENSITIVE); this.patterns.add(p8); } Pattern p9 = Pattern.compile("(<div[^>]+url\\()([^#][^\\)]+)([^>]+>)",Pattern.CASE_INSENSITIVE); this.patterns.add(p9); } return this.patterns; } /** * @param patterns The patterns to set. */ public void setPatterns(List patterns) { this.patterns = patterns; } public static void main(String[] args) throws Exception{ // performReykjavikNetworkTestToFile(); travelTest(); } public static void travelTest()throws Exception{ HtmlReferenceRewriter instance = new HtmlReferenceRewriter(); String fromFile = "/Users/gimmi/Desktop/explore/ExploreIceland.html"; String toFile = "/Users/gimmi/Desktop/explore/expTest.html"; String urlPrefix = "http://www.exploreiceland.is/"; FileReader reader = new FileReader(fromFile); Reader input = new BufferedReader(reader); FileWriter output = new FileWriter(toFile); instance.setInput(input); instance.setOutput(output); instance.setUrlPrefix(urlPrefix); instance.setRewriteOptionValues(true); instance.process(); } public static void performReykjavikFileTest()throws Exception{ HtmlReferenceRewriter instance = new HtmlReferenceRewriter(); String fromFile = "/Users/tryggvil/Documents/Reykjavik/rrvk-dtemplate.html"; String toFile = "/Users/tryggvil/Documents/Reykjavik/rvktest.html"; String urlPrefix = "http://www.rvk.is/"; FileReader reader = new FileReader(fromFile); Reader input = new BufferedReader(reader); FileWriter output = new FileWriter(toFile); instance.setInput(input); instance.setOutput(output); instance.setUrlPrefix(urlPrefix); instance.setRewriteOptionValues(true); instance.process(); } public static void performReykjavikNetworkTestToFile()throws Exception{ String sUrl = "http://nobel.idega.is/rvk/template.html"; URL url = new URL(sUrl); InputStream iStream = url.openStream(); InputStreamReader iReader = new InputStreamReader(iStream); HtmlReferenceRewriter instance = new HtmlReferenceRewriter(); String toFile = "/Users/tryggvil/Documents/Reykjavik/rvktest2.html"; String urlPrefix = "http://www.rvk.is/"; Reader input = new BufferedReader(iReader); FileWriter output = new FileWriter(toFile); instance.setInput(input); instance.setOutput(output); instance.setUrlPrefix(urlPrefix); instance.setRewriteOptionValues(true); instance.process(); } public static void performReykjavikNetworkTestToIBPageTemplate()throws Exception{ String sUrl = "http://nobel.idega.is/rvk/template.html"; URL url = new URL(sUrl); InputStream iStream = url.openStream(); InputStreamReader iReader = new InputStreamReader(iStream); HtmlReferenceRewriter instance = new HtmlReferenceRewriter(); String urlPrefix = "http://www.rvk.is/"; String pageKey = "101"; //ServletContext application = null; //IWApplicationContext iwac = IWMainApplication.getIWMainApplication(application).getIWApplicationContext(); //BuilderLogic.getInstance().getIBXMLPage(pageKey). ICPage ibpage = ((com.idega.core.builder.data.ICPageHome) com.idega.data.IDOLookup.getHome(ICPage.class)).findByPrimaryKey(new Integer(pageKey)); ibpage.setFormat("HTML"); OutputStream outStream = ibpage.getPageValueForWrite(); Reader input = new BufferedReader(iReader); Writer output = new OutputStreamWriter(outStream); instance.setInput(input); instance.setOutput(output); instance.setUrlPrefix(urlPrefix); instance.process(); ibpage.store(); //PageCacher.flagPageInvalid(pageKey); //PageCacher.flagAllPagesInvalid(); } /** * Execute the processing. Read the input, search/replace and write to the output. * This method should be called last, after all set methods are called. */ public void process() { Reader reader = getInput(); StringBuffer sb = new StringBuffer(); int buffersize = 1000; char[] buffer = new char[buffersize]; try { int read = reader.read(buffer); while(read!=-1){ sb.append(buffer,0,read); read = reader.read(buffer); } reader.close(); } catch (IOException e) { e.printStackTrace(); } StringBuffer outString = null; Iterator patternIter = getPatterns().iterator(); StringBuffer replaceBuffer = sb; while (patternIter.hasNext()) { outString = new StringBuffer(); Pattern p = (Pattern) patternIter.next(); Matcher m = p.matcher(replaceBuffer); while (m.find()) { // this pattern matches. int groupCount = m.groupCount(); for(int i=0;i<= groupCount;i++){ String s = m.group(i); System.out.println(s); } String url = m.group(2); if(getIfRewriteURL(url)){ //if this is a relative url: m.appendReplacement(outString,"$1"+getRewrittenURL(url)+"$3"); } else{ //Do not replace the url m.appendReplacement(outString,"$0"); } } m.appendTail(outString); replaceBuffer=new StringBuffer(outString.toString()); } String utfString; try { utfString = new String(outString.toString().getBytes("UTF-8"),"UTF-8"); StringReader sr = new StringReader(utfString); System.out.println("[HTMLReferenceWriter] The final html string in unicode:\n"+utfString); Writer out = getOutput(); int bufferlength=1000; char[] buf = new char[bufferlength]; int read = sr.read(buf); while (read!=-1){ out.write(buf,0,read); read = sr.read(buf); } sr.close(); out.close(); } catch (UnsupportedEncodingException e1) { e1.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } /** * Gets the rewritten URL. this can be overridden */ public String getRewrittenURL(String relativeURL){ String urlPrefix = getUrlPrefix(); if(relativeURL.startsWith(SLASH)&&urlPrefix.endsWith(SLASH)){ return urlPrefix+relativeURL.substring(1,relativeURL.length()); } else{ return this.urlPrefix+relativeURL; } } /** * Gets if th URL is appropriate to be rewritten<br> * e.g. if it does not contain http:, javascript:,mailto: or # prefixes * @param url the found url in the source * @return */ public boolean getIfRewriteURL(String url){ // not if it starts with these prefixes:: return !(url.startsWith("http:")||url.startsWith("javascript:")||url.startsWith("mailto:")||url.startsWith("#")); } /** * @return Returns the input. */ public Reader getInput() { return this.input; } /** * Set the Input (file or stream) * @param input The input to set. */ public void setInput(Reader input) { this.input = input; } /** * @return Returns the output. */ public Writer getOutput() { return this.output; } /** * Set the Output (file or stream) to write the rewritten HTML to. * @param output The output to set. */ public void setOutput(Writer output) { this.output = output; } /** * Returns the set URLPrefix and appends a "/" to the end if it is not set. * @return Returns the urlPrefix. */ public String getUrlPrefix() { if(!this.urlPrefix.endsWith(SLASH)){ return this.urlPrefix+SLASH; } return this.urlPrefix; } /** * @param urlPrefix The urlPrefix to set. */ public void setUrlPrefix(String urlPrefix) { this.urlPrefix = urlPrefix; } }