From 2b46b11e55712110512ba34f56a7a463aea86d4d Mon Sep 17 00:00:00 2001
From: Almark Ming
Date: Tue, 17 Dec 2013 16:57:22 +0800
Subject: [PATCH] Update RegexSelector.java

Optimize regex format check

Conflicts:
	webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
---
 .../webmagic/selector/RegexSelector.java      |  8 ++++----
 .../webmagic/selector/RegexSelectorTest.java  | 20 +++++++++++--------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
index 6b1db967b..438189655 100644
--- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
+++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexSelector.java
@@ -26,12 +26,12 @@ public RegexSelector(String regexStr, int group) {
         if (StringUtils.isBlank(regexStr)) {
             throw new IllegalArgumentException("regex must not be empty");
         }
-        if (!StringUtils.contains(regexStr, "(") && !StringUtils.contains(regexStr, ")")) {
+        // Check bracket for regex group. Add default group 1 if there is no group.
+        // Only check if there exists the valid left parenthesis, leave regexp validation for Pattern.
+        if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") ==
+                StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")) {
             regexStr = "(" + regexStr + ")";
         }
-        if (!StringUtils.contains(regexStr, "(") || !StringUtils.contains(regexStr, ")")) {
-            throw new IllegalArgumentException("regex must have capture group 1");
-        }
         this.regexStr = regexStr;
         try {
             regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
index 21660a19c..a0b8caff1 100644
--- a/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
+++ b/webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java
@@ -1,6 +1,6 @@
 package us.codecraft.webmagic.selector;
 
-import junit.framework.Assert;
+import org.junit.Assert;
 import org.junit.Test;
 
 /**
@@ -8,14 +8,18 @@
  */
 public class RegexSelectorTest {
 
-    @Test
-    public void testInvalidRegex() {
+    @Test(expected = IllegalArgumentException.class)
+    public void testRegexWithSingleLeftBracket() {
         String regex = "\\d+(";
-        try {
-            new RegexSelector(regex);
-            Assert.assertNotNull(regex);
-        } catch (Exception e) {
+        new RegexSelector(regex);
+    }
 
-        }
+    @Test
+    public void testRegexWithLeftBracketQuoted() {
+        String regex = "\\(.+";
+        String source = "(hello world";
+        RegexSelector regexSelector = new RegexSelector(regex);
+        String select = regexSelector.select(source);
+        Assert.assertEquals(source,select);
     }
 }