/* See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* Esri Inc. licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.esri.gpt.control.webharvest.client.waf;
import com.esri.gpt.control.webharvest.IterationContext;
import com.esri.gpt.framework.resource.api.Resource;
import com.esri.gpt.framework.resource.query.Criteria;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * WAF folder (quick version).
 * <p>
 * Finds sub-folders and XML documents by scanning the folder listing for
 * anchor tags with regular expressions.
 */
class WafFolderQuick extends WafFolder {
  /** Matches an opening tag whose name starts with {@code a}, e.g. {@code <a href="x">}. */
  private static final Pattern A_PATTERN = Pattern.compile("[<]a[^>]*[>]", Pattern.CASE_INSENSITIVE);
  /** Matches an {@code href} attribute; group 1 is the value including its surrounding quotes. */
  private static final Pattern HREF_PATTERN = Pattern.compile("href\\p{Space}*=\\p{Space}*(\"[^\"]*\"|\'[^\']*\')", Pattern.CASE_INSENSITIVE);

  /** Lower-cased URLs of folders already turned into resources; supplied by the caller. */
  private final Set<String> processedFolders;

  /**
   * Creates instance of the WAF folder.
   * @param context iteration context
   * @param info WAF info
   * @param proxy WAF proxy
   * @param processedFolders set of processed folders
   * @param url folder URL
   * @param criteria search criteria
   */
  public WafFolderQuick(IterationContext context, WafInfo info, WafProxy proxy, Set<String> processedFolders, String url, Criteria criteria) {
    super(context, info, proxy, url, criteria);
    this.processedFolders = processedFolders;
  }

  /**
   * Parses WAF response.
   * <p>
   * Every {@code href} found inside an anchor tag is resolved against this
   * folder's URL. Links to another host are ignored. A link ending with
   * {@code /} becomes a nested {@link WafFolderQuick} if it is located below
   * this folder and has not been recorded in the processed folders set yet;
   * a link ending with {@code .xml} (case-insensitive) becomes a
   * {@link WafFile} unless the same file was already found in this response.
   * Scanning stops as soon as the maximum number of records requested through
   * the criteria has been collected.
   * @param response response
   * @return collection of resources found in the response
   * @throws IOException if the folder URL or any link resolved against it is malformed
   */
  @Override
  protected Collection<Resource> parseResonse(String response) throws IOException {
    final ArrayList<Resource> resources = new ArrayList<Resource>();
    final HashSet<String> processedFiles = new HashSet<String>();
    final URL baseUrl = new URL(url);

    // find() without an index resumes right after the previous match; this
    // neither skips a character nor runs past the end of the input.
    final Matcher aMatcher = A_PATTERN.matcher(response);
    while (aMatcher.find()) {
      final Matcher hrefMatcher = HREF_PATTERN.matcher(aMatcher.group());
      while (hrefMatcher.find()) {
        if (!hasCapacity(resources.size())) {
          // limit reached: nothing more can be added, so stop scanning altogether
          return resources;
        }

        // group 1 is the quoted attribute value; drop the opening and closing quote
        final String quoted = hrefMatcher.group(1);
        final String documentUrl = quoted.substring(1, quoted.length() - 1);

        final URL pathUrl = new URL(baseUrl, documentUrl);
        if (!baseUrl.getHost().equals(pathUrl.getHost())) {
          continue;
        }

        final String pathExternalForm = pathUrl.toExternalForm();
        final String key = pathExternalForm.toLowerCase();
        if (documentUrl.endsWith("/")) {
          // only follow folders whose URL starts with this folder's URL
          if (pathExternalForm.startsWith(url) && processedFolders.add(key)) {
            resources.add(new WafFolderQuick(context, info, proxy, processedFolders, pathExternalForm, criteria));
          }
        } else if (documentUrl.toLowerCase().endsWith(".xml")) {
          if (processedFiles.add(key)) {
            resources.add(new WafFile(proxy, pathExternalForm));
          }
        }
      }
    }
    return resources;
  }

  /**
   * Checks if another resource may still be collected.
   * @param count number of resources collected so far
   * @return <code>true</code> if there is no criteria, no maximum number of
   * records (missing or zero), or the maximum has not been reached yet
   */
  private boolean hasCapacity(int count) {
    return criteria == null
        || criteria.getMaxRecords() == null
        || criteria.getMaxRecords() == 0
        || count < criteria.getMaxRecords();
  }
}