/*
* WPCleaner: A tool to help on Wikipedia maintenance tasks.
* Copyright (C) 2013 Nicolas Vervelle
*
* See README.txt file for licensing information.
*/
package org.wikipediacleaner.utils;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.wikipediacleaner.api.check.HtmlCharacters;
/**
 * A text provider for URL titles.
 * <p>
 * The page designated by the URL is requested with an HTTP GET, only the
 * beginning of the response body is examined, and the contents of the
 * <code>&lt;title&gt;</code> element and of the <code>title</code> and
 * <code>description</code> meta tags are proposed as possible texts.
 * Retrieval is best-effort: any failure simply results in fewer (or no) texts.
 */
public class TextProviderUrlTitle implements TextProvider {

  private final static Logger log = Logger.getLogger(TextProviderUrlTitle.class.getName());

  /**
   * Maximum size of URL contents read to find title.
   */
  private final static int MAXIMUM_SIZE = 10000;

  /**
   * Charset used for the first decoding attempt of the page contents.
   */
  private final static Charset UTF8 = Charset.forName("UTF-8");

  /**
   * Pattern used to find a charset declared in the page itself.
   */
  private final static Pattern PATTERN_CHARSET = Pattern.compile(
      "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=([^\"]+?)\"",
      Pattern.CASE_INSENSITIVE);

  /**
   * Patterns used to find possible titles, in the order in which results are returned.
   */
  private final static Pattern[] PATTERNS_TITLE = {
    Pattern.compile("<title>(.+?)</title>", Pattern.CASE_INSENSITIVE),
    Pattern.compile("<meta name=\"title\" content=\"([^\"]+?)\"", Pattern.CASE_INSENSITIVE),
    Pattern.compile("<meta name=\"description\" content=\"([^\"]+?)\"", Pattern.CASE_INSENSITIVE)
  };

  /**
   * URL.
   */
  private final String url;

  /**
   * @param url URL (may be <code>null</code>, in which case no text is provided).
   */
  public TextProviderUrlTitle(String url) {
    this.url = url;
  }

  /**
   * Retrieves the beginning of the page and extracts possible titles from it.
   *
   * @return Possible texts (never <code>null</code>, empty if the URL is
   *         <code>null</code>, the request fails or no title is found).
   */
  @Override
  public Collection<String> getTexts() {
    log.fine("IN");
    Collection<String> result = new ArrayList<String>();
    if (url != null) {
      GetMethod method = null;
      InputStream is = null;
      try {
        log.fine("new HttpClient()");
        HttpClient httpClient = new HttpClient();
        method = new GetMethod(url);
        int statusCode = httpClient.executeMethod(method);
        if (statusCode == HttpStatus.SC_OK) {
          log.fine("HttpStatus.SC_OK");
          is = method.getResponseBodyAsStream();
          if (is != null) {
            byte[] buffer = new byte[MAXIMUM_SIZE];
            int size = readPrefix(is, buffer);
            if (size > 0) {
              result.addAll(extractTitles(decode(buffer, size)));
            }
          }
        }
      } catch (IOException ex) {
        // Best effort: a network problem only means that no text is proposed
        log.fine("IOException: " + ex);
      } catch (Exception ex) {
        // Best effort: for example an invalid URL must not break the caller
        log.fine("Exception: " + ex);
      } finally {
        log.fine("finally");
        if (is != null) {
          try {
            is.close();
          } catch (IOException ex) {
            // Nothing to do
          }
        }
        if (method != null) {
          method.releaseConnection();
        }
      }
    }
    log.fine("OUT");
    return result;
  }

  /**
   * Reads bytes from the stream until the buffer is full or the stream ends.
   * A single call to <code>read</code> may return fewer bytes than available,
   * hence the loop.
   *
   * @param is Stream to read from.
   * @param buffer Buffer receiving the bytes.
   * @return Number of bytes actually stored in the buffer (0 for an empty stream).
   * @throws IOException If reading fails.
   */
  private static int readPrefix(InputStream is, byte[] buffer) throws IOException {
    int total = 0;
    while (total < buffer.length) {
      int count = is.read(buffer, total, buffer.length - total);
      if (count < 0) {
        break;
      }
      total += count;
    }
    return total;
  }

  /**
   * Converts the raw bytes into text, with every whitespace character replaced by a space.
   * The bytes are first decoded as UTF-8; if the page declares another supported
   * charset in a Content-Type meta tag, they are decoded again with that charset.
   *
   * @param bytes Raw bytes.
   * @param size Number of meaningful bytes.
   * @return Decoded text.
   */
  private static String decode(byte[] bytes, int size) {
    String text = new String(bytes, 0, size, UTF8).replaceAll("\\s", " ");
    Matcher m = PATTERN_CHARSET.matcher(text);
    if (m.find()) {
      String charsetName = m.group(1).trim();
      try {
        Charset charset = Charset.forName(charsetName);
        if (!charset.equals(UTF8)) {
          text = new String(bytes, 0, size, charset).replaceAll("\\s", " ");
        }
      } catch (IllegalArgumentException e) {
        // Illegal or unsupported charset name: keep the UTF-8 decoding
        log.fine("Unusable charset: " + charsetName);
      }
    }
    return text;
  }

  /**
   * Extracts the possible titles from the text, at most one for each title pattern.
   *
   * @param text Beginning of the page.
   * @return Titles found, with HTML characters decoded; empty titles are skipped.
   */
  private static Collection<String> extractTitles(String text) {
    Collection<String> titles = new ArrayList<String>();
    for (Pattern pTitle : PATTERNS_TITLE) {
      Matcher m = pTitle.matcher(text);
      if (m.find()) {
        String title = decodeHtmlCharacters(m.group(1).trim());
        if (title.length() > 0) {
          titles.add(title);
        }
      }
    }
    return titles;
  }

  /**
   * Replaces numeric (<code>&amp;#NNN;</code>) and named (<code>&amp;name;</code>)
   * character references by the matching character, for every known HTML character
   * except {@link HtmlCharacters#SYMBOL_AMPERSAND}, which is left untouched.
   *
   * @param title Title as found in the page.
   * @return Title with character references replaced.
   */
  private static String decodeHtmlCharacters(String title) {
    for (HtmlCharacters htmlChar : HtmlCharacters.values()) {
      if (!HtmlCharacters.SYMBOL_AMPERSAND.equals(htmlChar)) {
        String value = "" + htmlChar.getValue();
        // Literal replacement: neither the reference nor the value is a regular expression
        title = title.replace("&#" + htmlChar.getNumber() + ";", value);
        if (htmlChar.getNumber() != htmlChar.getAlternativeNumber()) {
          title = title.replace("&#" + htmlChar.getAlternativeNumber() + ";", value);
        }
        if (htmlChar.getName() != null) {
          title = title.replace("&" + htmlChar.getName() + ";", value);
        }
      }
    }
    return title;
  }
}