/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.beans.standardize; import java.net.URI; import java.net.URISyntaxException; import javax.inject.Named; import org.datacleaner.api.Categorized; import org.datacleaner.api.Configured; import org.datacleaner.api.Description; import org.datacleaner.api.InputColumn; import org.datacleaner.api.InputRow; import org.datacleaner.api.OutputColumns; import org.datacleaner.api.Transformer; import org.datacleaner.components.categories.TextCategory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Strings; @Named("URL parser") @Description("Retrieve the individual parts of an URL, including protocol, domain, port, path and querystring.") @Categorized({ TextCategory.class }) public class UrlStandardizerTransformer implements Transformer { private static final Logger logger = LoggerFactory.getLogger(UrlStandardizerTransformer.class); @Configured InputColumn<String> inputColumn; @Override public OutputColumns getOutputColumns() { return new OutputColumns(String.class, new String[] { "Protocol", "Domain", "Port", "Path", "Querystring" }); } @Override public String[] transform(final InputRow inputRow) { final String value = inputRow.getValue(inputColumn); return transform(value); } public String[] transform(final String value) { String protocol = null; String host = null; String port = null; String path = null; String queryString = null; if (value != null) { try { final URI url = new URI(value); protocol = url.getScheme(); host = url.getHost(); if (url.getPort() != -1) { port = Integer.toString(url.getPort()); } if (!Strings.isNullOrEmpty(url.getPath())) { path = url.getPath(); } queryString = url.getRawQuery(); } catch (final URISyntaxException e) { logger.info("Throwing away illegal URL \"{}\"", value); } } return new String[] { protocol, host, port, path, queryString }; } }