// NewSurtTokenizer.java example
//
// Explorer
// wayback-machine-master
package org.archive.surt;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

/**
 * The new SURT tokenizer breaks a SURT up into tokens.
 * 
 * For example "http://(org,archive,www,)/path/file.html?query#anchor" is broken up into:
 * 
 * ["http://(" 
 *  "org,"
 *  "archive," 
 *  "www," 
 *  ")/"
 *  "path/"
 *  "file.html"
 *  "?query"
 *  "#anchor"]
 * 
 * If the SURT ends with a tab character, the tab is always emitted as one
 * extra, final token of its own. Concatenating all tokens in order always
 * reproduces the original SURT.
 * 
 * @author aosborne
 *
 */
public class NewSurtTokenizer implements Iterable<String> {
    /** The SURT being tokenized. */
    private final String surt;
    /** Index of the first ')' in the SURT, or the SURT length if there is none. */
    private final int endOfAuthority;
    /** Index of the first '?' or '#', or {@link #preTabLength} if there is neither. */
    private final int endOfPath;
    /** Total length of the SURT, including any trailing tab. */
    private final int surtLength;
    /** Length of the SURT excluding a single trailing tab, if present. */
    private final int preTabLength;

    /**
     * Prepares a tokenizer for the given SURT.
     * 
     * @param surt the SURT to tokenize; may be empty, must not be null
     * @throws NullPointerException if surt is null
     */
    public NewSurtTokenizer(String surt) {
        this.surt = surt;
        surtLength = surt.length();

        // Guard against the empty string: there is no last character to inspect.
        if (surtLength > 0 && surt.charAt(surtLength - 1) == '\t') {
            preTabLength = surtLength - 1;
        } else {
            preTabLength = surtLength;
        }

        int closeParen = surt.indexOf(')');
        endOfAuthority = (closeParen == -1) ? surtLength : closeParen;

        int hash = surt.indexOf('#');
        int question = surt.indexOf('?');
        int pathEnd;
        if (hash == -1) {
            pathEnd = question;
        } else if (question == -1) {
            pathEnd = hash;
        } else {
            pathEnd = Math.min(hash, question);
        }
        // With no query or anchor the path runs up to (but not including) a
        // trailing tab, so that the tab is still emitted as its own token.
        endOfPath = (pathEnd == -1) ? preTabLength : pathEnd;
    }

    /**
     * Iterator that walks the enclosing tokenizer's SURT from left to right,
     * returning one token per call to {@link #next()}.
     */
    private class NewSurtTokenizerIterator implements Iterator<String> {
        /** Index in the SURT of the first character of the next token. */
        int pos = 0;

        public boolean hasNext() {
            return pos < surtLength;
        }

        /**
         * Computes the exclusive end index of the token starting at pos.
         * Must only be called while hasNext() is true; every branch returns
         * a value strictly greater than pos.
         */
        private int nextPieceEnd() {
            // Special case: If the SURT ends with a tab, we treat that as an extra token.
            // A trailing tab is sometimes used (for better or worse) to make a distinction
            // between an exact match and a prefix match.
            if (pos >= preTabLength && pos < surtLength) {
                return surtLength;
            }

            // Scheme: "http://(..."
            if (pos == 0) {
                int i = surt.indexOf('(');
                if (i == -1) {
                    return preTabLength;
                }
                return i + 1; // "http://("
            }

            // Host components: "foo,..."
            if (pos < endOfAuthority) {
                int endOfHostComponent = surt.indexOf(',', pos);
                if (endOfHostComponent == -1) {
                    return preTabLength;
                }
                return endOfHostComponent + 1;
            }

            // Host index: ")/..."
            if (pos == endOfAuthority) {
                // Only consume the slash if it is really there; a SURT may end
                // right after ')' or continue with a tab, '?' or '#'.
                if (pos + 1 < surtLength && surt.charAt(pos + 1) == '/') {
                    return pos + 2;
                }
                return pos + 1;
            }

            // Path segments: "directory/"
            if (pos < endOfPath) {
                int endOfPathSegment = surt.indexOf('/', pos);
                if (endOfPathSegment != -1 && endOfPathSegment < endOfPath) {
                    return endOfPathSegment + 1;
                }
                return endOfPath; // file: "hello.html"
            }

            // Query string
            if (surt.charAt(pos) == '?') {
                // Search from pos so an earlier '#' can never yield an end before pos.
                int endOfQuery = surt.indexOf('#', pos);
                if (endOfQuery != -1) {
                    return endOfQuery;
                }
                return preTabLength;
            }

            // Anchor "#boo"
            return preTabLength;
        }

        /**
         * Returns the next token.
         * 
         * @throws java.util.NoSuchElementException if all tokens have been returned
         */
        public String next() {
            if (!hasNext()) {
                throw new java.util.NoSuchElementException();
            }
            int pieceEnd = nextPieceEnd();
            String piece = surt.substring(pos, pieceEnd);
            pos = pieceEnd;
            return piece;
        }

        /**
         * Not supported: tokens are derived from an immutable string.
         * 
         * @throws UnsupportedOperationException always
         */
        public void remove() {
            throw new UnsupportedOperationException("remove");
        }

    }

    /**
     * Returns a fresh iterator over the tokens of the SURT, in order.
     */
    public Iterator<String> iterator() {
        return new NewSurtTokenizerIterator();
    }

    /**
     * Returns all tokens of the SURT, in order, as a new list.
     */
    public List<String> toList() {
        List<String> list = new ArrayList<String>();
        for (String piece : this) {
            list.add(piece);
        }
        return list;
    }

    /**
     * Returns all tokens of the SURT, in order, as a new array.
     */
    public String[] toArray() {
        // The no-arg toArray() returns Object[], which cannot be cast to
        // String[]; the typed overload allocates an array of the right type.
        return toList().toArray(new String[0]);
    }

    /**
     * Return a list of searches in order of decreasing length.  For example
     * given the surt "(org,archive,)/fishing" return:
     * 
     * [ "(org,archive,)/fishing",
     *   "(org,archive,)/",
     *   "(org,archive,",
     *   "(org,",
     *   "("
     * ]
     * @return every token-aligned prefix of the SURT, longest first; empty
     *         if the SURT is empty
     */
    public List<String> getSearchList() {
        List<String> searches = new ArrayList<String>();
        StringBuilder running = new StringBuilder(surtLength);
        for (String token : this) {
            running.append(token);
            // Insert at the front so the longest prefix ends up first.
            searches.add(0, running.toString());
        }
        return searches;
    }

}