package com.brucezee.jspider.parser.expression;

import com.brucezee.jspider.common.utils.PrefixSuffix;
import com.brucezee.jspider.common.utils.SpiderStrUtils;
import org.apache.commons.lang3.StringUtils;

import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Extracts a piece of text segment by segment, using prefix/suffix pairs.
 *
 * <p>Expression syntax, for example:
 * <pre>
 * segment:adfajsdfasdfasdf($TT)jjjjjjjjjjjjjjjjj segment:tttttttttttt($TF)mmmmmmmmmmmmmm
 * </pre>
 *
 * <p>Each {@code segment:} part is a prefix and a suffix separated by a marker.
 * {@code lazy=true} means the search stops at the first occurrence found;
 * {@code lazy=false} means the search keeps going until nothing more is found.
 * <ul>
 *   <li>{@code ($TT)} lazyPrefix=true,  lazySuffix=true  (first prefix, shortest match)</li>
 *   <li>{@code ($)}   lazyPrefix=true,  lazySuffix=true  (same as {@code ($TT)})</li>
 *   <li>{@code ($TF)} lazyPrefix=true,  lazySuffix=false (first prefix, longest match)</li>
 *   <li>{@code ($FF)} lazyPrefix=false, lazySuffix=false (last prefix, longest match)</li>
 *   <li>{@code ($FT)} lazyPrefix=false, lazySuffix=true  (last prefix, shortest match)</li>
 * </ul>
 *
 * <p>Multiple expressions are separated by spaces. A leading space in front of the
 * whole expression marks it as a relative expression.
 *
 * Created by zhoubing on 2016/12/1.
 */
public class SegmentExpression {
    private static final String SEGMENT_KEY = "segment";
    /** Relative expression marker: the selector starts with a space. */
    private static final String RELATIVE_PREFIX = " ";

    /** Splits a selector into its individual {@code segment:} parts; compiled once. */
    private static final Pattern SEGMENT_SPLITTER = Pattern.compile(SEGMENT_KEY + ":");
    /** Splits one segment into prefix and suffix around the lazy/greedy marker; compiled once. */
    private static final Pattern MARKER_SPLITTER =
            Pattern.compile("\\(\\$TT\\)|\\(\\$FF\\)|\\(\\$TF\\)|\\(\\$FT\\)|\\(\\$\\)");

    /**
     * Tells whether the selector is a segment expression.
     *
     * @param selector the selector to test, may be {@code null}
     * @return {@code true} if the trimmed selector starts with {@code "segment"}
     */
    public static boolean isSegmentExpression(String selector) {
        return selector != null && selector.trim().startsWith(SEGMENT_KEY);
    }

    /**
     * Applies a segment expression to either the whole document or the child text.
     *
     * @param html      text used when the selector is not relative
     * @param childText text used when the selector starts with a space (relative expression)
     * @param selector  the segment expression, may be {@code null}
     * @return whatever {@code SpiderStrUtils.getMiddleText} returns for the chosen text and
     *         the parsed prefix/suffix pairs, or {@code null} if the selector is {@code null}
     *         or contains no segments
     * @throws IllegalArgumentException if a segment is malformed (see {@link #parse(String)})
     */
    public static String matcher(String html, String childText, String selector) {
        List<PrefixSuffix> prefixSuffixList = parse(selector);
        if (prefixSuffixList == null || prefixSuffixList.isEmpty()) {
            return null;
        }

        // parse() returned a non-null list, so selector is known to be non-null here.
        String text = selector.startsWith(RELATIVE_PREFIX) ? childText : html;
        return SpiderStrUtils.getMiddleText(text, prefixSuffixList.toArray(new PrefixSuffix[0]));
    }

    /**
     * Parses a selector into its prefix/suffix pairs, in order of appearance.
     *
     * @param selector the segment expression, may be {@code null}
     * @return the parsed pairs, or {@code null} if the selector is {@code null} or
     *         contains no non-blank segment
     * @throws IllegalArgumentException if a segment does not split into exactly a prefix
     *         and a suffix around one of the markers
     */
    public static List<PrefixSuffix> parse(String selector) {
        if (selector == null) {
            // Consistent with isSegmentExpression(null): treat as "nothing to parse" instead of an NPE.
            return null;
        }

        List<PrefixSuffix> list = new LinkedList<PrefixSuffix>();
        for (String str : SEGMENT_SPLITTER.split(selector)) {
            if (StringUtils.isBlank(str)) {
                continue;
            }

            // Segments are separated by spaces, so surrounding whitespace is not part of the segment.
            str = str.trim();
            String[] strArray = MARKER_SPLITTER.split(str);
            if (strArray.length != 2) {
                throw new IllegalArgumentException("illegal segment expression "+str);
            }

            PrefixSuffix prefixSuffix = new PrefixSuffix(strArray[0], strArray[1]);
            boolean flag = str.contains("($TT)") || str.contains("($)"); // ($) = ($TT)
            prefixSuffix.lazyPrefix = flag || str.contains("($TF)");
            prefixSuffix.lazySuffix = flag || str.contains("($FT)");
            list.add(prefixSuffix);
        }
        return list.isEmpty() ? null : list;
    }
}