package com.brucezee.jspider.parser.expression;
import com.brucezee.jspider.common.utils.SpiderStrUtils;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 多个正则表达式组合截取字符串
* Created by zhoubing on 2016/11/30.
*/
public class RegexExpression {
private String regex;
private int groupCount = 0;//默认0
private boolean isRelative = false;
private static final String REGEX_KEY = "regex";
private static final String RELATIVE_PREFIX = " ";//相对路径的正则 前面使用空格作为标志
// regex[0]:aaaaaaaa regex[1]:ffffffffff regex[3]:ddddddddd regex:eeeeeeee regex[0]:iiiiiiiii
// 以regex为前缀或不使用前缀,regex[1]表示取分组中的第1项分组内容 regex不加分组标记默认取第0项分组内容,多个表达式用空格隔开
// 整个表达式的前面以空格开头表示相对表达式
public String getRegex() {
return regex;
}
public void setRegex(String regex) {
this.regex = regex;
}
public int getGroupCount() {
return groupCount;
}
public void setGroupCount(int groupCount) {
this.groupCount = groupCount;
}
public boolean isRelative() {
return isRelative;
}
public void setRelative(boolean isRelative) {
this.isRelative = isRelative;
}
public static boolean isRegexExpression(String selector) {
return selector != null && selector.trim().startsWith(REGEX_KEY);
}
//regex[0]:aaaaaaaa regex[1]:ffffffffff regex[3]:ddddddddd regex:eeeeeeee regex[0]:iiiiiiiii
public static List<RegexExpression> parse(String selector) {
List<RegexExpression> selectorList = new LinkedList<RegexExpression>();
while(selector.length() > 0) {
int index = selector.indexOf(REGEX_KEY);
if (index < 0) {
if (selectorList.size() == 0) {
RegexExpression regexExpression = new RegexExpression();
regexExpression.setRelative(selector.startsWith(RELATIVE_PREFIX));
regexExpression.setRegex(selector.trim());
selectorList.add(regexExpression);
}
break;
}
boolean isRelative = selector.startsWith(RELATIVE_PREFIX);
selector = selector.substring(index+5);
int groupCount = 0;
String regex = null;
if (selector.startsWith("[")) {
groupCount = Integer.parseInt(SpiderStrUtils.getMiddleText(selector, "[", "]"));
selector = selector.substring(selector.indexOf("]")+1);
}
if (selector.startsWith(":")) {
index = selector.indexOf(REGEX_KEY);
if (index > 0) {
regex = selector.substring(1, index).trim();
selector = selector.substring(index);
} else {
selector = selector.substring(1);
regex = selector.trim();
}
RegexExpression regexExpression = new RegexExpression();
regexExpression.setGroupCount(groupCount);
regexExpression.setRelative(isRelative);
regexExpression.setRegex(regex);
selectorList.add(regexExpression);
} else {
throw new IllegalArgumentException("illegal regex expression "+selector);
}
}
return selectorList.size() > 0 ? selectorList : null;
}
public static List<String> matcher(String html, String childText, String selector) {
List<RegexExpression> selectorList = parse(selector);
if (selectorList == null || selectorList.size() == 0) {
return null;
}
RegexExpression regexExpression = null;
Matcher matcher = null;
String text = null;
if (selectorList.size() == 1) {
regexExpression = selectorList.get(selectorList.size()-1);
text = regexExpression.isRelative() ? childText : html;
matcher = Pattern.compile(regexExpression.getRegex()).matcher(text);
} else if (selectorList.size() > 1) {
for (int i = 0; i < selectorList.size() - 1; i++) {
regexExpression = selectorList.get(i);
if (text == null) {
text = regexExpression.isRelative() ? childText : html;
}
matcher = Pattern.compile(regexExpression.getRegex()).matcher(text);
if (matcher.find()) {
//中间过程文本
text = matcher.group(regexExpression.getGroupCount());
} else {
//没有找到匹配
text = null;
break;
}
}
if (text == null) {
return null;
}
regexExpression = selectorList.get(selectorList.size()-1);
matcher = Pattern.compile(regexExpression.getRegex()).matcher(text);
}
List<String> list = new LinkedList<String>();
while (matcher.find()) {
list.add(matcher.group(regexExpression.getGroupCount()));
}
return list.size() > 0 ? list : null;
}
}