/* * Copyright 2007 T-Rank AS * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package no.trank.openpipe.wikipedia.step; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.nio.charset.Charset; import no.trank.openpipe.api.BasePipelineStep; import no.trank.openpipe.api.PipelineException; import no.trank.openpipe.api.PipelineStepStatus; import no.trank.openpipe.api.document.Document; import no.trank.openpipe.config.annotation.NotEmpty; import no.trank.openpipe.config.annotation.NotNull; /** * Makes a wikipedia url from the title. * <br/><br/> * Prerequisite: Titlefield must be set. * <br/><br/> * If baseUrl is set to http://en.wikipedia.org/wiki/, the url for the page with title: Hjalmar Johansen will * become: http://en.wikipedia.org/wiki/Hjalmar_Johansen * * @version $Revision$ */ public class WikipediaUrlBuilder extends BasePipelineStep { @NotNull private String baseUrl; @NotEmpty private String titleField = "title"; @NotEmpty private String urlField = "url"; @NotEmpty private String urlEncoding = "UTF-8"; @Override public void prepare() throws PipelineException { super.prepare(); Charset.forName(urlEncoding); } @Override public PipelineStepStatus execute(Document doc) throws PipelineException { try { final String title = doc.getFieldValue(titleField); if (title != null) { doc.setFieldValue(urlField, buildUrl(title)); } return PipelineStepStatus.DEFAULT; } catch (UnsupportedEncodingException e) { // Should never happen since this has been checked in prepare throw new PipelineException("Unsupported urlEncoding", e); } } private String buildUrl(String title) throws UnsupportedEncodingException { int startIdx = 0; final StringBuilder sb = new StringBuilder(title.length() + 16 + baseUrl.length()); sb.append(baseUrl); for (int i = 0; i < title.length(); i++) { final char c = title.charAt(i); if (c == '/') { if (startIdx < i) { sb.append(URLEncoder.encode(title.substring(startIdx, i), urlEncoding)); } sb.append(c); startIdx = i + 1; } else if (c == ' ') { if (startIdx < i) { sb.append(URLEncoder.encode(title.substring(startIdx, i), urlEncoding)); } sb.append('_'); startIdx = i + 1; } } if (startIdx < title.length()) { sb.append(URLEncoder.encode(title.substring(startIdx), urlEncoding)); } return sb.toString(); } /** * Gets the url encoding to use for the created url. * * @return the url encoding to use for the created url. */ public String getUrlEncoding() { return urlEncoding; } /** * Sets the url encoding to use for the created url. * * @param urlEncoding the url encoding to use for the created url. */ public void setUrlEncoding(String urlEncoding) { this.urlEncoding = urlEncoding; } /** * Gets the base url to the wikipedia site. * * @return the base url to the wikipedia site */ public String getBaseUrl() { return baseUrl; } /** * Sets the base url to the wikipedia site * @param baseUrl the base url to the wikipedia site. */ public void setBaseUrl(String baseUrl) { this.baseUrl = baseUrl; } /** * Gets the fieldName for the title of the document. * * @return the fieldName for the title of the document */ public String getTitleField() { return titleField; } /** * Sets the fieldName for the title of the document. * * @param titleField the fieldName for the title of the document */ public void setTitleField(String titleField) { this.titleField = titleField; } /** * Gets the field name of the field that the url should be set. * * @return the field name of the field that the url should be set */ public String getUrlField() { return urlField; } /** * Sets the field name of the field that the url should be set. * * @param urlField the field name of the field that the url should be set */ public void setUrlField(String urlField) { this.urlField = urlField; } @Override public String getRevision() { return "$Revision$"; } }