/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.CharBuffer;
import org.junit.Test;
/**
 * Collapses relative path segments in URL path strings.
 *
 * <p>{@link #normalizeString(String)} is the entry point. The remaining static
 * helpers edit a {@link CharBuffer} in place; they treat the buffer's
 * <em>limit</em> as the end of the text and use absolute indices for the
 * ranges they modify.
 *
 * @author rana
 */
public class URLNormalizer {

  /** Replacement written wherever a dot segment is collapsed. */
  static final char[] slash = "/".toCharArray();
  /** Leading "current directory" segment. */
  static final char[] dotSlash = "./".toCharArray();
  /** Leading "parent directory" segment. */
  static final char[] dotDotSlash = "../".toCharArray();
  /** Embedded "current directory" segment. */
  static final char[] slashDotSlash = "/./".toCharArray();
  /** Embedded "parent directory" segment. */
  static final char[] slashDotDotSlash = "/../".toCharArray();

  /**
   * Deletes {@code count} characters starting at absolute index {@code start},
   * shifting the remaining text left and shrinking the buffer's limit.
   *
   * <p>On return the buffer's position equals its new limit.
   *
   * @param buffer the buffer to edit in place
   * @param start absolute index of the first character to delete
   * @param count number of characters to delete
   * @throws IllegalArgumentException if the range is negative or extends past the limit
   */
  static void removeCharsAt(CharBuffer buffer, int start, int count) {
    checkRange(buffer, start, count);
    moveTail(buffer, start + count, -count);
    buffer.position(buffer.limit());
  }

  /**
   * Replaces {@code count} characters starting at absolute index {@code start}
   * with {@code newCharSequence}. The text after the replaced range is shifted
   * left or right as needed and the buffer's limit is adjusted by the
   * difference in length.
   *
   * <p>On return the buffer's position is just past the inserted characters.
   *
   * @param buffer the buffer to edit in place
   * @param start absolute index of the first character to replace
   * @param count number of characters to replace
   * @param newCharSequence the characters to write in their place
   * @throws IllegalArgumentException if the range is negative or extends past the limit
   * @throws java.nio.BufferOverflowException if the replacement is longer than the
   *         replaced range and the buffer has no spare capacity for the difference
   */
  static void replaceCharsAt(CharBuffer buffer, int start, int count, char[] newCharSequence) {
    checkRange(buffer, start, count);
    // Make room (or close the gap) first so the tail is never overwritten.
    moveTail(buffer, start + count, newCharSequence.length - count);
    buffer.position(start);
    buffer.put(newCharSequence);
  }

  /** Rejects ranges that are negative or that reach past the buffer's limit. */
  private static void checkRange(CharBuffer buffer, int start, int count) {
    if (start < 0 || count < 0 || count > buffer.limit() - start) {
      throw new IllegalArgumentException(
          "range start=" + start + " count=" + count + " is outside limit " + buffer.limit());
    }
  }

  /**
   * Moves the characters in {@code [tailStart, limit)} by {@code delta}
   * positions (negative = left, positive = right) and adjusts the limit by the
   * same amount. Copies element by element in the direction that is safe for
   * overlapping ranges.
   */
  private static void moveTail(CharBuffer buffer, int tailStart, int delta) {
    int oldLimit = buffer.limit();
    int newLimit = oldLimit + delta;
    if (delta > 0) {
      if (newLimit > buffer.capacity()) {
        throw new java.nio.BufferOverflowException();
      }
      // Grow the limit first so the absolute puts below are in bounds,
      // then copy back-to-front so no source character is clobbered.
      buffer.limit(newLimit);
      for (int i = oldLimit - 1; i >= tailStart; i--) {
        buffer.put(i + delta, buffer.get(i));
      }
    } else if (delta < 0) {
      for (int i = tailStart; i < oldLimit; i++) {
        buffer.put(i + delta, buffer.get(i));
      }
      buffer.limit(newLimit);
    }
  }

  /**
   * Copies {@code source} into a newly allocated, writable buffer.
   *
   * @param source the string to copy
   * @return a buffer whose capacity and limit equal {@code source.length()},
   *         positioned at 0 so that it reads back as {@code source}
   */
  static CharBuffer copyString(String source) {
    CharBuffer buffer = CharBuffer.allocate(source.length());
    buffer.put(source);
    // Without the flip the buffer would have nothing remaining and read as empty.
    buffer.flip();
    return buffer;
  }

  /**
   * Normalizes a path by applying, in order:
   * <ol>
   *   <li>every "/./" is replaced by "/";</li>
   *   <li>every "/../" is removed together with the path segment that precedes
   *       it, keeping the slash in front of that segment;</li>
   *   <li>a leading "./" is replaced by "/";</li>
   *   <li>a leading "../" is replaced by "/".</li>
   * </ol>
   *
   * <p>NOTE(review): when a "/../" has no slash anywhere before it (for example
   * {@code "aa/../foo"}), only the "/../" itself is replaced by "/" and the
   * preceding text is kept. This matches the historical behaviour of this
   * method; confirm whether relative inputs are expected by callers.
   *
   * @param input the path to normalize
   * @return {@code input} itself when it contains none of the patterns above,
   *         otherwise a new string with the patterns collapsed
   * @throws NullPointerException if {@code input} is null
   */
  public static String normalizeString(String input) {
    if (!hasDotSegments(input)) {
      // Nothing to rewrite: avoid allocating a copy.
      return input;
    }
    CharBuffer buffer = copyString(input);
    collapseCurrentDirSegments(buffer);
    resolveParentDirSegments(buffer);
    replaceLeadingSegment(buffer, dotSlash);
    replaceLeadingSegment(buffer, dotDotSlash);
    buffer.position(0);
    return buffer.toString();
  }

  /** Returns true when {@code path} contains at least one pattern that normalizeString rewrites. */
  private static boolean hasDotSegments(String path) {
    return indexOf(path, null, slashDotSlash, 0, slashDotSlash.length) != -1
        || indexOf(path, null, slashDotDotSlash, 0, slashDotDotSlash.length) != -1
        || startsWith(path, dotSlash)
        || startsWith(path, dotDotSlash);
  }

  /** Returns true when {@code source} begins with the characters of {@code prefix}. */
  private static boolean startsWith(CharSequence source, char[] prefix) {
    if (source.length() < prefix.length) {
      return false;
    }
    for (int i = 0; i < prefix.length; i++) {
      if (source.charAt(i) != prefix[i]) {
        return false;
      }
    }
    return true;
  }

  /** Replaces every "/./" in {@code buffer} with a single "/". */
  private static void collapseCurrentDirSegments(CharBuffer buffer) {
    buffer.position(0);
    int index;
    while ((index = indexOf(buffer, buffer, slashDotSlash, 0, slashDotSlash.length)) != -1) {
      replaceCharsAt(buffer, index, slashDotSlash.length, slash);
      // Resume at the surviving slash: it may start another "/./".
      buffer.position(index);
    }
  }

  /** Removes every "/../" in {@code buffer} along with the segment in front of it. */
  private static void resolveParentDirSegments(CharBuffer buffer) {
    buffer.position(0);
    int index;
    while ((index = indexOf(buffer, buffer, slashDotDotSlash, 0, slashDotDotSlash.length)) != -1) {
      // Walk backwards to the slash that opens the preceding segment.
      int previousSlash = index - 1;
      while (previousSlash >= 0 && buffer.get(previousSlash) != '/') {
        previousSlash--;
      }
      if (previousSlash >= 0) {
        // Drop "<segment>/../" but keep the slash in front of the segment.
        int removeFrom = previousSlash + 1;
        removeCharsAt(buffer, removeFrom, index + slashDotDotSlash.length - removeFrom);
        buffer.position(previousSlash);
      } else {
        // No earlier slash: just collapse the pattern itself to "/".
        replaceCharsAt(buffer, index, slashDotDotSlash.length, slash);
        buffer.position(index);
      }
    }
  }

  /**
   * Replaces {@code segment} with "/" when the buffer starts with it. A single
   * check suffices: after the replacement the text starts with '/', so it can
   * no longer start with "./" or "../".
   */
  private static void replaceLeadingSegment(CharBuffer buffer, char[] segment) {
    buffer.position(0);
    if (startsWith(buffer, segment)) {
      replaceCharsAt(buffer, 0, segment.length, slash);
    }
  }

  /**
   * Finds the first occurrence of a character pattern in {@code source}.
   *
   * <p>{@code source} is read through {@link CharSequence#charAt(int)}, so when
   * it is a {@link CharBuffer} the search starts at the buffer's current
   * position. Pass that same buffer as {@code buffer} to get the match back as
   * an absolute buffer index; pass {@code null} when {@code source} is not a
   * buffer.
   *
   * @param source the characters being searched
   * @param buffer the buffer backing {@code source}, or {@code null}; when
   *        non-null its current position is added to the returned index
   * @param target the characters being searched for
   * @param targetOffset index in {@code target} of the first pattern character
   * @param targetCount number of pattern characters
   * @return the index of the first match (offset by {@code buffer.position()}
   *         when {@code buffer} is non-null), or -1 if there is no match or
   *         {@code targetCount} is 0
   */
  static int indexOf(CharSequence source, CharBuffer buffer, char[] target, int targetOffset, int targetCount) {
    if (targetCount == 0) {
      return -1;
    }
    char first = target[targetOffset];
    int lastStart = source.length() - targetCount;
    for (int i = 0; i <= lastStart; i++) {
      if (source.charAt(i) != first) {
        continue;
      }
      // First character matched; compare the rest of the pattern.
      int matched = 1;
      while (matched < targetCount
          && source.charAt(i + matched) == target[targetOffset + matched]) {
        matched++;
      }
      if (matched == targetCount) {
        return (buffer != null) ? buffer.position() + i : i;
      }
    }
    return -1;
  }

  /**
   * Prints the normalized form of a few sample paths together with the time
   * each call took, in nanoseconds. Makes no assertions.
   */
  @Test
  public void testNormalizer() throws Exception {
    // Further sample inputs that exercise the individual passes:
    //   "/aa/bb/../../cc/../foo.html", "aa/bb/../../cc/../foo.html", "/xx/../",
    //   "./xx/.././././foo/././bar.html", ".././././bar../html",
    //   "/dyn-js/backlink.js?blogID=8100929&postID=9203679950426132998"
    String tests[] = { "/build_research.htm" };
    for (int i = 0; i < tests.length; ++i) {
      long startTime = System.nanoTime();
      String normalized = normalizeString(tests[i]);
      long endTime = System.nanoTime();
      System.out.println("test:" + i +" orig:" + tests[i] + " result: "+ normalized + " took:" + (endTime-startTime) );
    }
  }

  /**
   * Reads URLs line by line from the classpath resource {@code big_list.txt}
   * and prints every URL whose path is changed by normalization. Lines that
   * are not valid URLs are reported and skipped.
   *
   * @throws FileNotFoundException if the resource is not on the classpath
   */
  // @Test
  public void testNormalizer2() throws Exception {
    URL resourceURL = ClassLoader.getSystemResource("big_list.txt");
    if (resourceURL == null) {
      throw new FileNotFoundException("big_list.txt not found on the classpath");
    }
    InputStream stream = resourceURL.openStream();
    BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(stream)));
    try {
      String line;
      while ((line = reader.readLine()) != null) {
        try {
          URL theURL = new URL(line);
          String path = theURL.getPath();
          long startTime = System.nanoTime();
          String normalizedPath = normalizeString(path);
          long endTime = System.nanoTime();
          if (!path.equals(normalizedPath)) {
            System.out.println("URL:" + theURL.toString() + " Normalized to:" + normalizedPath + " Took:" + (endTime-startTime));
          }
        }
        catch (MalformedURLException e) {
          System.out.println("Malformed URL:" + line);
        }
      }
    }
    finally {
      // Closing the reader also closes the underlying resource stream.
      reader.close();
    }
  }
}