/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.ingestion.google.webmaster; import java.util.ArrayList; import java.util.Iterator; import java.util.NoSuchElementException; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Triple; public class UrlTriePrefixGrouper implements UrlGrouper { private final int _groupSize; private final UrlTrie _trie; private final Iterator<Pair<String, UrlTrieNode>> _iterator; private Triple<String, GoogleWebmasterFilter.FilterOperator, UrlTrieNode> _retVal; public UrlTriePrefixGrouper(UrlTrie trie, int groupSize) { _trie = trie; _groupSize = groupSize; _iterator = new UrlTriePostOrderIterator(trie, groupSize); } @Override public boolean hasNext() { if (_retVal != null) { return true; } while (_iterator.hasNext() && _retVal == null) { Pair<String, UrlTrieNode> nextPair = _iterator.next(); UrlTrieNode nextNode = nextPair.getRight(); if (nextNode.getSize() <= _groupSize) { _retVal = Triple.of(nextPair.getLeft(), GoogleWebmasterFilter.FilterOperator.CONTAINS, nextNode); return true; } else if (nextNode.isExist()) { _retVal = Triple.of(nextPair.getLeft(), GoogleWebmasterFilter.FilterOperator.EQUALS, nextNode); return true; } } return false; } @Override public Triple<String, GoogleWebmasterFilter.FilterOperator, UrlTrieNode> next() { if (hasNext()) { Triple<String, GoogleWebmasterFilter.FilterOperator, UrlTrieNode> retVal = _retVal; _retVal = null; return retVal; } throw new NoSuchElementException(); } public UrlTrie getTrie() { return _trie; } /** * Get the detailed pages under this group */ public static ArrayList<String> groupToPages(Triple<String, GoogleWebmasterFilter.FilterOperator, UrlTrieNode> group) { ArrayList<String> ret = new ArrayList<>(); if (group.getMiddle().equals(GoogleWebmasterFilter.FilterOperator.EQUALS)) { if (group.getRight().isExist()) { ret.add(group.getLeft()); } } else if (group.getMiddle().equals(GoogleWebmasterFilter.FilterOperator.CONTAINS)) { UrlTrie trie = new UrlTrie(group.getLeft(), group.getRight()); Iterator<Pair<String, UrlTrieNode>> iterator = new UrlTriePostOrderIterator(trie, 1); while (iterator.hasNext()) { Pair<String, UrlTrieNode> next = iterator.next(); if (next.getRight().isExist()) { ret.add(next.getLeft()); } } } return ret; } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public int getGroupSize() { return _groupSize; } }