package bimoku.extract.parser; import java.util.regex.Matcher; import java.util.regex.Pattern; public class Patternmatch_Douban { public static void main(String[] args) { String content = "作者: 董小玉 编 出版社: 四川辞书 出版年: 2005-3 页数: 490 定价: 11.00元 ISBN: 9787806821411" ; patternmatchContent(content); } public static String[] patternmatchContent(String content) { String[] paramcontent = new String[5]; //出版社 Pattern p1 = Pattern .compile("作者: [^/s]*[\u4e00-\u9fa5a-zA-Z]*[^/s]*[/s]*[a-zA-Z]*[\u4e00-\u9fa5a-zA-Z]*[/s]*[^/s]*出版社: "); Matcher m1 = p1.matcher(content); boolean result1 = m1.find(); while (result1) { paramcontent[0] = m1.group(0).replaceAll("作者: ", "") .replaceAll("出版社: ", ""); System.out.println(paramcontent[0]); result1 = m1.find(); } //版次 Pattern p2 = Pattern .compile("出版社: [/s]*[\u4e00-\u9fa5a-zA-Z]*[^/s]*[\u4e00-\u9fa5a-zA-Z]*[^/s]*[/s]*[\u4e00-\u9fa5a-zA-Z]*[/s]*[^/s]*译者:"); Matcher m2 = p2.matcher(content); boolean result2 = m2.find(); if (result2) { paramcontent[1] = m2.group(0) .replaceAll("出版社: ", "") .replaceAll("出版年:", "") .replaceAll("译者:", ""); result2 = m2.find(); paramcontent[1] = paramcontent[1].replaceAll("", ""); System.out.println(paramcontent[1]); }else{ Pattern p22 = Pattern .compile("出版社: [/s]*[\u4e00-\u9fa5a-zA-Z]*[^/s]*[\u4e00-\u9fa5a-zA-Z]*[^/s]*[/s]*[\u4e00-\u9fa5a-zA-Z]*[/s]*[^/s]*出版年:"); Matcher m22 = p22.matcher(content); boolean result3 = m22.find(); if(result3){paramcontent[1] = m22.group(0) .replaceAll("出版社: ", "") .replaceAll("出版年:", "") .replaceAll("译者:", ""); result2 = m22.find(); paramcontent[1] = paramcontent[1].replaceAll("", ""); System.out.println(paramcontent[1]); } // } Pattern p3 = Pattern.compile( "定价: [0123456789]+.[0123456789]+", Pattern.CASE_INSENSITIVE); Matcher m3 = p3.matcher(content); boolean result3 = m3.find(); while (result3) { paramcontent[2] = m3.group(0).replaceAll("定价: ", ""); result3 = m3.find(); System.out.println(paramcontent[2]); } Pattern p4 = Pattern.compile("ISBN: [/s]*[0123456789]+", Pattern.CASE_INSENSITIVE); Matcher m4 = p4.matcher(content); boolean result4 = m4.find(); while (result4) {paramcontent[3] = m4.group(0).replaceAll("ISBN: ", ""); System.out.println(paramcontent[3]); result4 = m4.find(); } Pattern p5 = Pattern.compile("译者: [/s]*[\u4e00-\u9fa5a-zA-Z]*[^/s]*[\u4e00-\u9fa5a-zA-Z]*[^/s]*[/s]*[\u4e00-\u9fa5a-zA-Z]*[/s]*[^/s]*出版年: ", Pattern.CASE_INSENSITIVE); Matcher m5 = p5.matcher(content); boolean result5 = m5.find(); while (result5) {paramcontent[4] = m5.group(0).replaceAll("出版年: ", "") .replaceAll("译者:", ""); System.out.println(paramcontent[4]); result5 = m5.find(); } return paramcontent; } public static String[] patternmatchAUT_TRANS(String content) { String[] paramcontent = new String[2]; //出版社 Pattern p1 = Pattern .compile("~ [^/s]*[\u4e00-\u9fa5]*[^/s]*[a-zA-Z]*[\u4e00-\u9fa5]*[/s]*[^/s]*"); Matcher m1 = p1.matcher(content); boolean result1 = m1.find(); while (result1) { String temp = m1.group(0); //System.out.println(temp); //temp.split(","); //System.out.println(temp.substring(0, temp.indexOf("(作者)")).trim()); //paramcontent[0] = temp.replaceAll("(作者)", " ").replaceAll("~ |,", ""); //System.out.println(paramcontent[0]); result1 = m1.find(); paramcontent[0] = temp; } //版次 Pattern p2 = Pattern .compile(";[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*[平装丛书名精装外文书名语种]"); Matcher m2 = p2.matcher(content); boolean result2 = m2.find(); while (result2) { paramcontent[1] = m2.group(0) .replaceAll(";|平装[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|丛书名[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|精装[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|外文书名[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|语种[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*", " ") .replaceAll("", ""); result2 = m2.find(); paramcontent[1] = paramcontent[1].replaceAll("平装|丛书名|精装|外文书名|语种", ""); System.out.println(paramcontent[1]); } return paramcontent; } }