/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.resourcestore.resourcefile;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.wayback.util.ByteOp;
/**
 * Extracts the {@code href} attribute values of anchor ({@code <a>}) tags
 * from the content served at a URL.
 *
 * <p>Extraction is done with a single case-insensitive regular expression
 * applied to the whole document text; no HTML parser is involved, so only
 * tags that fit the pattern built below are recognized.
 *
 * @author brad
 * @version $Date$, $Revision$
 */
public class UrlLinkExtractor {
    /** Double-quoted value, e.g. {@code "x"}; may not contain a double quote or {@code >}. */
    private static final String QUOTED_ATTR_VALUE = "(?:\"[^\">]*\")";
    /**
     * Value wrapped in backslash-escaped double quotes (backslash, quote,
     * text, backslash, quote); the text may not contain {@code >} or a
     * backslash.
     */
    private static final String ESC_QUOTED_ATTR_VALUE = "(?:\\\\\"[^>\\\\]*\\\\\")";
    /** Single-quoted value, e.g. {@code 'x'}; may not contain a single quote or {@code >}. */
    private static final String APOSED_ATTR_VALUE = "(?:'[^'>]*')";
    /** Unquoted value: one or more characters that are not whitespace, {@code >} or a quote. */
    private static final String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)";
    /** Alternation of every supported value form; the quoted forms are tried before the raw one. */
    private static final String ANY_ATTR_VALUE = QUOTED_ATTR_VALUE + "|"
            + APOSED_ATTR_VALUE + "|" + ESC_QUOTED_ATTR_VALUE + "|"
            + RAW_ATTR_VALUE;

    /** Name of the tag whose attribute is extracted. */
    private static final String TAG_NAME = "a";
    /** Name of the attribute whose value is extracted. */
    private static final String ATTR_NAME = "href";
    /**
     * Matches an opening tag up to and including the wanted attribute;
     * capturing group 1 is the attribute value with its quoting still attached.
     */
    private static final String TAG_PATTERN_STRING = "<\\s*" + TAG_NAME +
            "\\s+[^>]*\\b" + ATTR_NAME +
            "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?";
    /** Compiled once; {@link Pattern} instances are immutable and reusable. */
    private static final Pattern TAG_PATTERN = Pattern.compile(TAG_PATTERN_STRING,
            Pattern.CASE_INSENSITIVE);

    /** Number of characters requested from the reader per read call. */
    private static final int READ_SIZE = 2048;

    /**
     * Downloads the content at {@code url} and returns the {@code href}
     * values of the anchor tags found in it.
     *
     * <p>The whole content is buffered in memory and decoded with
     * {@code ByteOp.UTF8} (defined outside this file; presumably UTF-8).
     * The underlying stream is always closed before this method returns.
     *
     * @param url location of the document to scan
     * @return the attribute values in document order, with surrounding
     *         quotes removed; empty if nothing matched
     * @throws IOException if {@code url} is malformed, or the content cannot
     *         be opened or read
     */
    public static List<String> extractLinks(final String url) throws IOException {
        final InputStream is = new URL(url).openStream();
        try {
            final InputStreamReader isr = new InputStreamReader(is, ByteOp.UTF8);
            final StringBuilder sb = new StringBuilder(READ_SIZE);
            final char[] cbuf = new char[READ_SIZE];
            int amt;
            while ((amt = isr.read(cbuf, 0, READ_SIZE)) != -1) {
                // Append straight from the buffer; no intermediate String.
                sb.append(cbuf, 0, amt);
            }
            return extractAnchors(sb);
        } finally {
            // Closing the raw stream releases the connection even when
            // creating the reader or reading from it failed.
            is.close();
        }
    }

    /**
     * Collects group 1 of every pattern match in {@code text}, unquoted.
     *
     * @param text document text to scan
     * @return unquoted attribute values, in order of appearance
     */
    private static List<String> extractAnchors(final CharSequence text) {
        final Matcher m = TAG_PATTERN.matcher(text);
        final List<String> anchors = new ArrayList<String>();
        // find() resumes after the previous match. At most one whitespace or
        // '>' character beyond the value is skipped, and neither can start a
        // new match (which must begin with '<').
        while (m.find()) {
            anchors.add(trimAttr(m.group(1)));
        }
        return anchors;
    }

    /**
     * Removes the quoting around an attribute value captured by the pattern.
     *
     * <p>Matching double or single quotes are stripped, as is a matching
     * pair of backslash-escaped double quotes. Anything else is returned
     * unchanged; in particular a raw value that merely begins with a
     * backslash is left intact instead of being sliced as if it were
     * escape-quoted.
     *
     * @param attr captured attribute value
     * @return {@code attr} without its surrounding quotes
     */
    private static String trimAttr(final String attr) {
        final int length = attr.length();
        if (length >= 2) {
            final char first = attr.charAt(0);
            final boolean quoteChar = first == '"' || first == '\'';
            if (quoteChar && attr.charAt(length - 1) == first) {
                return attr.substring(1, length - 1);
            }
        }
        // Escaped quoting needs two characters on each side, hence length >= 4.
        if (length >= 4 && attr.startsWith("\\\"") && attr.endsWith("\\\"")) {
            return attr.substring(2, length - 2);
        }
        return attr;
    }
}