/**
 * License Agreement for OpenSearchServer
 * <p>
 * Copyright (C) 2010-2016 Emmanuel Keller / Jaeksoft
 * <p>
 * http://www.open-search-server.com
 * <p>
 * This file is part of OpenSearchServer.
 * <p>
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * <p>
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * <p>
 * You should have received a copy of the GNU General Public License
 * along with OpenSearchServer.
 * If not, see <http://www.gnu.org/licenses/>.
 **/
package com.jaeksoft.searchlib.crawler.web.database;

import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.web.database.UrlFilterItem.Type;
import com.jaeksoft.searchlib.util.ReadWriteLock;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XPathParser;
import com.jaeksoft.searchlib.util.XmlWriter;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;
import java.io.File;
import java.io.IOException;
import java.util.SortedSet;
import java.util.TreeSet;

/**
 * A sorted collection of {@link UrlFilterItem} loaded from an XML configuration
 * file, plus static helpers that apply an array of filters to a URL string.
 * <p>
 * Access to the filter set and to its cached array view is guarded by an
 * internal read/write lock.
 */
public class UrlFilterList implements XmlWriter.Interface {

	private final ReadWriteLock rwl = new ReadWriteLock();

	private final File configFile;

	private TreeSet<UrlFilterItem> filterSet;

	/** Cached array view of {@link #filterSet}; reset to null on every change. */
	private UrlFilterItem[] array;

	/**
	 * Creates the list and loads it from the given file when that file exists.
	 *
	 * @param indexDir the directory holding the configuration file
	 * @param filename the name of the configuration file inside {@code indexDir}
	 * @throws SearchLibException if the configuration file cannot be read or
	 *                            parsed (the original exception is the cause)
	 */
	public UrlFilterList(File indexDir, String filename) throws SearchLibException {
		configFile = new File(indexDir, filename);
		filterSet = new TreeSet<UrlFilterItem>();
		array = null;
		try {
			load();
		} catch (ParserConfigurationException e) {
			throw new SearchLibException(e);
		} catch (SAXException e) {
			throw new SearchLibException(e);
		} catch (IOException e) {
			throw new SearchLibException(e);
		} catch (XPathExpressionException e) {
			throw new SearchLibException(e);
		}
	}

	/**
	 * Reads every {@code /urlFilters/urlFilter} node of the configuration file
	 * and replaces the current filter set with the result. Does nothing when the
	 * file does not exist.
	 */
	private void load()
			throws ParserConfigurationException, SAXException, IOException, XPathExpressionException,
			SearchLibException {
		if (!configFile.exists()) {
			return;
		}
		XPathParser xpp = new XPathParser(configFile);
		NodeList nodeList = xpp.getNodeList("/urlFilters/urlFilter");
		int count = nodeList.getLength();
		// Build the new set outside the lock; only the swap is done under it.
		TreeSet<UrlFilterItem> loaded = new TreeSet<UrlFilterItem>();
		for (int i = 0; i < count; i++) {
			loaded.add(new UrlFilterItem(nodeList.item(i)));
		}
		rwl.w.lock();
		try {
			filterSet = loaded;
			array = null;
		} finally {
			rwl.w.unlock();
		}
	}

	/**
	 * Writes all filters inside a {@code urlFilters} element and then ends the
	 * document.
	 *
	 * @param xmlWriter the writer receiving the XML
	 * @throws SAXException if the writer reports an error
	 */
	@Override
	public void writeXml(XmlWriter xmlWriter) throws SAXException {
		// NOTE(review): this only reads filterSet, so the read lock may be
		// sufficient; the exclusive lock is kept to preserve existing behavior.
		rwl.w.lock();
		try {
			xmlWriter.startElement("urlFilters");
			for (UrlFilterItem item : filterSet) {
				item.writeXml(xmlWriter);
			}
			xmlWriter.endElement();
			xmlWriter.endDocument();
		} finally {
			rwl.w.unlock();
		}
	}

	/**
	 * Returns the filters as an array, in the order of the underlying sorted
	 * set. The array is cached until the next {@link #add}, {@link #remove} or
	 * reload; the same instance is handed to every caller and is not copied.
	 *
	 * @return the cached array of filters (possibly empty, never null)
	 */
	public UrlFilterItem[] getArray() {
		// Fast path: the cache is already built.
		rwl.r.lock();
		try {
			if (array != null) {
				return array;
			}
		} finally {
			rwl.r.unlock();
		}
		// Slow path: the cache must be (re)built. This is done under the write
		// lock, and the field is only assigned once the array is fully
		// populated, so that a concurrent reader can never observe an array
		// that still contains null slots.
		rwl.w.lock();
		try {
			if (array == null) {
				array = filterSet.toArray(new UrlFilterItem[filterSet.size()]);
			}
			return array;
		} finally {
			rwl.w.unlock();
		}
	}

	/**
	 * Adds a filter and invalidates the cached array.
	 *
	 * @param item the filter to add
	 */
	public void add(UrlFilterItem item) {
		rwl.w.lock();
		try {
			filterSet.add(item);
			array = null;
		} finally {
			rwl.w.unlock();
		}
	}

	/**
	 * Removes a filter and invalidates the cached array.
	 *
	 * @param item the filter to remove
	 */
	public void remove(UrlFilterItem item) {
		rwl.w.lock();
		try {
			filterSet.remove(item);
			array = null;
		} finally {
			rwl.w.unlock();
		}
	}

	/**
	 * Looks up a filter by name.
	 *
	 * @param name the name of the filter
	 * @return the stored filter that the set's ordering considers equal to a
	 *         probe item built from {@code name}, or null if there is none
	 */
	public UrlFilterItem get(String name) {
		rwl.r.lock();
		try {
			// A probe item is used as both inclusive bounds, so the sub-set
			// holds at most the stored element the ordering treats as equal.
			UrlFilterItem finder = new UrlFilterItem(name, null);
			SortedSet<UrlFilterItem> matches = filterSet.subSet(finder, true, finder, true);
			return matches.isEmpty() ? null : matches.first();
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Applies every QUERY filter to the '&amp;'-separated parts that follow the
	 * first '?' and rebuilds the URL from the parts that were not set to null.
	 * <p>
	 * Everything after the '?' is split, so a trailing fragment (if any) stays
	 * attached to the last query part.
	 *
	 * @return the rebuilt URL, or {@code uriString} unchanged when it has no
	 *         '?' or no query part
	 */
	private static final String doReplaceQuery(String hostname, String uriString,
			UrlFilterItem[] urlFilterArray) {
		int queryPos = uriString.indexOf('?');
		if (queryPos == -1) {
			return uriString;
		}
		StringBuilder newUrl = new StringBuilder(uriString.substring(0, queryPos));
		String queryString = uriString.substring(queryPos + 1);
		String[] queryParts = queryString.split("\\&");
		if (queryParts.length == 0) {
			return uriString;
		}
		// Each filter receives the shared array of parts; the rebuild below
		// skips the entries that are null afterwards.
		for (UrlFilterItem urlFilter : urlFilterArray) {
			if (urlFilter.getType() == Type.QUERY) {
				urlFilter.doReplaceQuery(hostname, queryParts);
			}
		}
		boolean first = true;
		for (String queryPart : queryParts) {
			if (queryPart == null) {
				continue;
			}
			newUrl.append(first ? '?' : '&');
			first = false;
			newUrl.append(queryPart);
		}
		return newUrl.toString();
	}

	/**
	 * Removes the part introduced by the first ';' (up to, but excluding, the
	 * next delimiter among {@code / ? # & $}) when a filter accepts it.
	 *
	 * @return the URL without the ';' part, or {@code uriString} unchanged when
	 *         there is no ';', nothing follows it, or no filter accepts the part
	 */
	private static final String doReplaceResource(String hostname, String uriString,
			UrlFilterItem[] urlFilterArray) {
		int semicolonPos = uriString.indexOf(';');
		if (semicolonPos == -1) {
			return uriString;
		}
		int partStart = semicolonPos + 1;
		if (partStart == uriString.length()) {
			return uriString;
		}
		String part = uriString.substring(partStart);
		// Presumably the index of the first of these characters in part, or -1
		// when none is present (project helper; verify against StringUtils).
		int partEnd = StringUtils.indexOfAny(part, "/?#&$");
		if (partEnd != -1) {
			part = part.substring(0, partEnd);
		}
		// NOTE(review): the ';' part is tested against QUERY filters, not a
		// dedicated type; confirm this is intended.
		boolean replace = false;
		for (UrlFilterItem urlFilter : urlFilterArray) {
			if (urlFilter.getType() == Type.QUERY && urlFilter.isReplacePart(hostname, part)) {
				replace = true;
				break;
			}
		}
		if (!replace) {
			return uriString;
		}
		// Keep what precedes the ';' and, if a delimiter was found, everything
		// from that delimiter onwards.
		StringBuilder newUrl = new StringBuilder(uriString.substring(0, semicolonPos));
		if (partEnd != -1) {
			newUrl.append(uriString.substring(partStart + partEnd));
		}
		return newUrl.toString();
	}

	/**
	 * Removes the fragment (the first '#' and everything after it) when a
	 * FRAGMENT filter accepts the text following the '#'.
	 *
	 * @return the URL without its fragment, or {@code uriString} unchanged when
	 *         there is no '#' or no filter accepts the fragment
	 */
	private static final String doReplaceFragment(String hostname, String uriString,
			UrlFilterItem[] urlFilterArray) {
		int hashPos = uriString.indexOf('#');
		if (hashPos == -1) {
			return uriString;
		}
		String fragment = uriString.substring(hashPos + 1);
		for (UrlFilterItem urlFilter : urlFilterArray) {
			if (urlFilter.getType() == Type.FRAGMENT && urlFilter.isReplacePart(hostname, fragment)) {
				return uriString.substring(0, hashPos);
			}
		}
		return uriString;
	}

	/**
	 * Applies the given filters to a URL: first to the query parts, then to the
	 * ';' part, then to the fragment.
	 *
	 * @param hostname       the hostname passed to each filter
	 * @param uriString      the URL to clean; may be null
	 * @param urlFilterArray the filters to apply; may be null
	 * @return the filtered URL; {@code uriString} itself when either
	 *         {@code uriString} or {@code urlFilterArray} is null
	 */
	public static final String doReplace(String hostname, String uriString,
			UrlFilterItem[] urlFilterArray) {
		// A null URL is returned as is, matching the null-array case instead of
		// failing with a NullPointerException further down.
		if (uriString == null || urlFilterArray == null) {
			return uriString;
		}
		uriString = doReplaceQuery(hostname, uriString, urlFilterArray);
		uriString = doReplaceResource(hostname, uriString, urlFilterArray);
		uriString = doReplaceFragment(hostname, uriString, urlFilterArray);
		return uriString;
	}
}