package org.archive.surt;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
/**
* The new SURT tokenizer breaks a SURT up into tokens.
*
* For example "http://(org,archive,www,)/path/file.html?query#anchor" is broken up into:
*
* ["http://("
* "org,"
* "archive,"
* "www,"
* ")/"
* "path/"
* "file.html"
* "?query"
* "#anchor"]
*
* @author aosborne
*
*/
public class NewSurtTokenizer implements Iterable<String> {
private String surt;
private int endOfAuthority;
private int endOfPath;
private int surtLength;
private int preTabLength;
public NewSurtTokenizer(String surt) {
super();
this.surt = surt;
surtLength = surt.length();
if (surt.charAt(surtLength - 1) == '\t') {
preTabLength = surtLength - 1;
} else {
preTabLength = surtLength;
}
endOfAuthority = surt.indexOf(')');
if (endOfAuthority == -1) {
endOfAuthority = surtLength;
}
int hash = surt.indexOf('#');
int question = surt.indexOf('?');
if (hash == -1) {
endOfPath = question;
} else if (question == -1) {
endOfPath = hash;
} else {
endOfPath = hash < question ? hash : question;
}
if (endOfPath == -1) {
endOfPath = surtLength;
}
}
private class NewSurtTokenizerIterator implements Iterator<String> {
int pos = 0;
public boolean hasNext() {
return pos < surtLength;
}
private int nextPieceEnd() {
// Special case: If the SURT ends with a tab, we treat that as an extra token.
// A trailing tab is sometimes used (for better or worse) to make a distinction between
// and exact match and prefix match.
if (pos >= preTabLength && pos < surtLength) {
return surtLength;
}
// Scheme: "http://(..."
if (pos == 0) {
int i = surt.indexOf('(');
if (i == -1) {
return preTabLength;
}
return i + 1; // "http://("
}
// Host components: "foo,..."
if (pos < endOfAuthority || endOfAuthority == -1) {
int endOfHostComponent = surt.indexOf(',', pos);
if (endOfHostComponent == -1) {
return preTabLength;
} else {
return endOfHostComponent + 1;
}
}
// Host index: ")/..."
if (pos == endOfAuthority) {
return pos + 2;
}
// Path segments: "directory/"
if (pos < endOfPath || endOfPath == -1) {
int endOfPathSegment = surt.indexOf('/', pos);
if (endOfPathSegment < endOfPath && endOfPathSegment != -1) {
return endOfPathSegment + 1;
} else if (endOfPath != -1) { // file: "hello.html"
return endOfPath;
} else {
return preTabLength;
}
}
// Query string
if (surt.charAt(pos) == '?') {
int endOfQuery = surt.indexOf('#');
if (endOfQuery != -1) {
return endOfQuery;
} else {
return preTabLength;
}
}
// Anchor "#boo"
return preTabLength;
}
public String next() {
int pieceEnd = nextPieceEnd();
String piece = surt.substring(pos, pieceEnd);
pos = pieceEnd;
return piece;
}
public void remove() {
// TODO Auto-generated method stub
}
}
public Iterator<String> iterator() {
return new NewSurtTokenizerIterator();
}
public List<String> toList() {
List<String> list = new ArrayList<String>();
for (String piece: this) {
list.add(piece);
}
return list;
}
public String[] toArray() {
return (String[]) toList().toArray();
}
/**
* Return a list of searches in order of decreasing length. For example
* given the surt "(org,archive,)/fishing" return:
*
* [ "(org,archive,)/fishing",
* "(org,archive,)/",
* "(org,archive,",
* "(org,",
* "("
* ]
* @return
*/
public List<String> getSearchList() {
List<String> searches = new ArrayList<String>();
String running = "";
for (String token: this) {
running += token;
searches.add(0, running);
}
return searches;
}
}