package edu.cmu.minorthird.text;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Annotate substrings that are legal URLs.
*
*
* @author William Cohen
*/
public class URLAnnotator extends AbstractAnnotator
{
static final Pattern URL_CANDIDATE = Pattern.compile("\\b(\\w+:)?/[/\\w;:\\@\\$\\-~#%\\?\\&\\+=\\.]+");
static final String URL_SPANTYPE = "URL";
static final String URL_ANNOTATION_TYPE = "URL";
@Override
protected void doAnnotate(MonotonicTextLabels labels)
{
for (Iterator<Span> i=labels.getTextBase().documentSpanIterator(); i.hasNext(); ) {
Span docSpan = i.next();
String docString = docSpan.getDocumentContents();
Matcher m = URL_CANDIDATE.matcher(docString);
while (m.find()) {
int lo = m.start();
int hi = m.end();
if (validURL( docString.substring(lo,hi) )) {
labels.addToType( docSpan.charIndexSubSpan(lo,hi), URL_SPANTYPE );
}
}
}
labels.setAnnotatedBy("URL");
}
private boolean validURL(String s)
{
try {
new URL(s);
return true;
} catch (MalformedURLException ex) {
}
return false;
}
@Override
public String explainAnnotation(TextLabels labels,Span documentSpan)
{
return "no explanation available";
}
}