/** * AnalyzerBeans * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.eobjects.analyzer.beans.standardize; import java.util.ArrayList; import java.util.List; import org.eobjects.analyzer.beans.api.Categorized; import org.eobjects.analyzer.beans.api.Configured; import org.eobjects.analyzer.beans.api.Description; import org.eobjects.analyzer.beans.api.Initialize; import org.eobjects.analyzer.beans.api.OutputColumns; import org.eobjects.analyzer.beans.api.Transformer; import org.eobjects.analyzer.beans.api.TransformerBean; import org.eobjects.analyzer.beans.categories.MatchingAndStandardizationCategory; import org.eobjects.analyzer.data.InputColumn; import org.eobjects.analyzer.data.InputRow; import org.eobjects.analyzer.util.HasGroupLiteral; import org.eobjects.analyzer.util.NamedPattern; import org.eobjects.analyzer.util.NamedPatternMatch; @TransformerBean("URL standardizer") @Description("Retrieve the individual parts of an URL, including protocol, domain, port, path and querystring.") @Categorized({ MatchingAndStandardizationCategory.class }) public class UrlStandardizerTransformer implements Transformer<String> { public static final String[] PATTERNS = { "PROTOCOL://DOMAIN:PORTPATH\\?QUERYSTRING", "PROTOCOL://DOMAINPATH\\?QUERYSTRING", "PROTOCOL://DOMAIN:PORTPATH", "PROTOCOL://DOMAIN:PORT\\?QUERYSTRING", "PROTOCOL://DOMAIN\\?QUERYSTRING", "PROTOCOL://DOMAINPATH", "PROTOCOL://DOMAIN:PORT", "PROTOCOL://DOMAIN" }; public static enum UrlPart implements HasGroupLiteral { PROTOCOL, DOMAIN, PORT, PATH, QUERYSTRING; @Override public String getGroupLiteral() { if (this == DOMAIN) { return "([a-zA-Z0-9\\._\\-@]+)"; } if (this == PORT) { return "([0-9]+)"; } if (this == PATH) { return "(/[a-zA-Z0-9\\._\\-/#:%]+)"; } if (this == QUERYSTRING) { return "([a-zA-Z0-9\\.=\\?_\\-/%]+)"; } return null; } } @Configured InputColumn<String> inputColumn; private List<NamedPattern<UrlPart>> namedPatterns; @Initialize public void init() { namedPatterns = new ArrayList<NamedPattern<UrlPart>>(PATTERNS.length); for (String pattern : PATTERNS) { namedPatterns.add(new NamedPattern<UrlPart>(pattern, UrlPart.class)); } } @Override public OutputColumns getOutputColumns() { return new OutputColumns("Protocol", "Domain", "Port", "Path", "Querystring"); } @Override public String[] transform(InputRow inputRow) { String value = inputRow.getValue(inputColumn); return transform(value); } public String[] transform(String value) { String protocol = null; String domain = null; String port = null; String path = null; String queryString = null; if (value != null) { for (NamedPattern<UrlPart> namedPattern : namedPatterns) { NamedPatternMatch<UrlPart> match = namedPattern.match(value); if (match != null) { protocol = match.get(UrlPart.PROTOCOL); domain = match.get(UrlPart.DOMAIN); port = match.get(UrlPart.PORT); path = match.get(UrlPart.PATH); queryString = match.get(UrlPart.QUERYSTRING); break; } } } return new String[] { protocol, domain, port, path, queryString }; } }