/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.wikipedia.producer.meta;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.util.DateParseException;
import org.apache.commons.httpclient.util.DateUtil;
/**
* @version $Revision$
*/
public class RssMetaParser {
private static Pattern PATT_RSS = Pattern.compile(
"<channel>\\s*<title>(.+)</title>\\s*<link>(.+/(\\w+)/(\\d+))</link>.*" +
"<item>.*<pubDate>(.+)</pubDate>\\s*</item>\\s*</channel>", Pattern.DOTALL);
private final URL fileUrl;
private final URL md5Url;
private final Date modifiedDate;
private String fileName;
public RssMetaParser(String rss) throws MalformedURLException {
final Matcher matcher = PATT_RSS.matcher(rss);
if (!matcher.find()) {
throw new IllegalArgumentException("input does not match pattern '" + PATT_RSS.pattern() + "' was: [" +
rss + ']');
}
final URL baseUrl = new URL(matcher.group(2) + '/');
final String filePrefix = matcher.group(3) + '-' + matcher.group(4);
fileName = filePrefix + '-' + matcher.group(1);
fileUrl = new URL(baseUrl, fileName);
md5Url = new URL(baseUrl, filePrefix + "-md5sums.txt");
try {
modifiedDate = DateUtil.parseDate(matcher.group(5));
} catch (DateParseException e) {
throw new IllegalArgumentException(e);
}
}
public URL getFileUrl() {
return fileUrl;
}
public URL getMd5Url() {
return md5Url;
}
public String findMd5(String md5sums) {
if (md5sums != null) {
final Matcher matcher = Pattern.compile("^([0-9a-fA-F]{32})\\s+" + fileName + '$', Pattern.MULTILINE).matcher(md5sums);
if (matcher.find()) {
return matcher.group(1);
}
}
return null;
}
public Date getModifiedDate() {
return modifiedDate;
}
}