package bimoku.extract.parser; import java.util.regex.Matcher; import java.util.regex.Pattern; public class Patternmatch_Amazon { public static void main(String[] args) { String content = "软实力:权力,从硬实力到软实力 [精装] ~ 约瑟夫•奈 (Joseph S. Nye Jr.) (作者),郑汶 (作者) 马娟娟 (译者)" ; patternmatchAUT_TRANS(content); } public static String[] patternmatchContent(String content) { String[] paramcontent = new String[4]; //出版社 Pattern p1 = Pattern .compile("出版社: [^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*;"); Matcher m1 = p1.matcher(content); boolean result1 = m1.find(); while (result1) { paramcontent[0] = m1.group(0).replaceAll("出版社: ", "") .replaceAll(";", ""); System.out.println(paramcontent[0]); result1 = m1.find(); } //版次 Pattern p2 = Pattern .compile(";[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*[平装丛书名精装外文书名语种]"); Matcher m2 = p2.matcher(content); boolean result2 = m2.find(); while (result2) { paramcontent[1] = m2.group(0) .replaceAll(";|平装[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|丛书名[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|精装[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|外文书名[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|语种[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*", " ") .replaceAll("", ""); result2 = m2.find(); paramcontent[1] = paramcontent[1].replaceAll("平装|丛书名|精装|外文书名|语种", ""); System.out.println(paramcontent[1]); } Pattern p3 = Pattern.compile( "ASIN: [/s]*[a-zA-Z0-9]*", Pattern.CASE_INSENSITIVE); Matcher m3 = p3.matcher(content); boolean result3 = m3.find(); while (result3) { paramcontent[2] = m3.group(0).replaceAll("ASIN:", ""); result3 = m3.find(); System.out.println(paramcontent[2]); } Pattern p4 = Pattern.compile("ISBN: [/s]*[0123456789]+", Pattern.CASE_INSENSITIVE); Matcher m4 = p4.matcher(content); boolean result4 = m4.find(); while (result4) {paramcontent[3] = m4.group(0).replaceAll("ISBN: ", ""); System.out.println(paramcontent[3]); result4 = m4.find(); } return paramcontent; } public static String[] patternmatchAUT_TRANS(String content) { String[] paramcontent = new String[2]; //出版社 Pattern p1 = Pattern .compile("~ [^/s]*[\u4e00-\u9fa5]*[^/s]*[a-zA-Z]*[\u4e00-\u9fa5]*[/s]*[^/s]*"); Matcher m1 = p1.matcher(content); boolean result1 = m1.find(); while (result1) { String temp = m1.group(0); //System.out.println(temp); //temp.split(","); //System.out.println(temp.substring(0, temp.indexOf("(作者)")).trim()); //paramcontent[0] = temp.replaceAll("(作者)", " ").replaceAll("~ |,", ""); //System.out.println(paramcontent[0]); result1 = m1.find(); paramcontent[0] = temp; } //版次 Pattern p2 = Pattern .compile(";[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*[平装丛书名精装外文书名语种]"); Matcher m2 = p2.matcher(content); boolean result2 = m2.find(); while (result2) { paramcontent[1] = m2.group(0) .replaceAll(";|平装[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|丛书名[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|精装[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|外文书名[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*|语种[^/s]*[\u4e00-\u9fa5]*[^/s]*[\u4e00-\u9fa5]+[/s]*[^/s]*", " ") .replaceAll("", ""); result2 = m2.find(); paramcontent[1] = paramcontent[1].replaceAll("平装|丛书名|精装|外文书名|语种", ""); System.out.println(paramcontent[1]); } return paramcontent; } }