diff --git a/javascript/ql/integration-tests/query-suite/javascript-code-quality.qls.expected b/javascript/ql/integration-tests/query-suite/javascript-code-quality.qls.expected
index 876b5f25fa28..451a8b4bf273 100644
--- a/javascript/ql/integration-tests/query-suite/javascript-code-quality.qls.expected
+++ b/javascript/ql/integration-tests/query-suite/javascript-code-quality.qls.expected
@@ -3,4 +3,5 @@ ql/javascript/ql/src/Expressions/ExprHasNoEffect.ql
ql/javascript/ql/src/Expressions/MissingAwait.ql
ql/javascript/ql/src/LanguageFeatures/SpuriousArguments.ql
ql/javascript/ql/src/Quality/UnhandledErrorInStreamPipeline.ql
+ql/javascript/ql/src/RegExp/DuplicateCharacterInCharacterClass.ql
ql/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql
diff --git a/javascript/ql/src/RegExp/DuplicateCharacterInCharacterClass.qhelp b/javascript/ql/src/RegExp/DuplicateCharacterInCharacterClass.qhelp
index 30ef990ec0cd..f94cced3d09a 100644
--- a/javascript/ql/src/RegExp/DuplicateCharacterInCharacterClass.qhelp
+++ b/javascript/ql/src/RegExp/DuplicateCharacterInCharacterClass.qhelp
@@ -5,26 +5,42 @@
-Character classes in regular expressions represent sets of characters, so there is no need to specify
-the same character twice in one character class. Duplicate characters in character classes are at best
-useless, and may even indicate a latent bug.
+Character classes in regular expressions (denoted by square brackets []
) represent sets of characters where the pattern matches any single character from that set. Since character classes are sets, specifying the same character multiple times is redundant and often indicates a programming error.
+
+Common mistakes include:
+
+
+- Using square brackets
[]
instead of parentheses ()
for grouping alternatives
+- Misunderstanding that special regex characters like
|
, *
, +
, ()
, and -
work differently when appearing inside a character class
+- Accidentally duplicating characters or escape sequences that represent the same character
+
+
-If the character was accidentally duplicated, remove it. If the character class was meant to be a
-group, replace the brackets with parentheses.
+
+Examine each duplicate character to determine the intended behavior:
+
+
+- If you see
|
inside square brackets (e.g., [a|b|c]
): This is usually a mistake. The author likely intended alternation. Replace the character class with a group: (a|b|c)
+- If trying to match alternative strings, use parentheses
()
for grouping instead of square brackets
+- If the duplicate was truly accidental, remove the redundant characters
+- If trying to use special regex operators inside square brackets, note that most operators (like
|
) are treated as literal characters
+
+
+Note that simply removing |
characters from character classes is rarely the correct fix. Instead, analyze the pattern to understand what the author intended to match.
+
-In the following example, the character class [password|pwd]
contains two instances each
-of the characters d
, p
, s
, and w
. The programmer
-most likely meant to write (password|pwd)
(a pattern that matches either the string
-"password"
or the string "pwd"
), and accidentally mistyped the enclosing
-brackets.
+Example 1: Confusing character classes with groups
+
+
+The pattern [password|pwd]
does not match "password" or "pwd" as intended. Instead, it matches any single character from the set {p, a, s, w, o, r, d, |}
. Note that |
has no special meaning inside character classes.
@@ -33,10 +49,23 @@ brackets.
To fix this problem, the regular expression should be rewritten to /(password|pwd) =/
.
+
+Example 2: CSS unit matching
+
+
+The pattern r?e[m|x]
appears to be trying to match "rem" or "rex", but actually matches "re" followed by any of the characters {m, |, x}
. The correct pattern should be r?e(m|x)
or r?e[mx]
.
+
+
+
+Similarly, v[h|w|min|max]
should be v(h|w|min|max)
to properly match "vh", "vw", "vmin", or "vmax".
+
+
Mozilla Developer Network: JavaScript Regular Expressions.
+MDN: Character Classes - Details on how character classes work.
+MDN: Groups and Ranges - Proper use of grouping with parentheses.
diff --git a/javascript/ql/src/RegExp/DuplicateCharacterInCharacterClass.ql b/javascript/ql/src/RegExp/DuplicateCharacterInCharacterClass.ql
index 06b6d218acab..00366590fcb5 100644
--- a/javascript/ql/src/RegExp/DuplicateCharacterInCharacterClass.ql
+++ b/javascript/ql/src/RegExp/DuplicateCharacterInCharacterClass.ql
@@ -5,7 +5,8 @@
* @kind problem
* @problem.severity warning
* @id js/regex/duplicate-in-character-class
- * @tags reliability
+ * @tags quality
+ * reliability
* correctness
* regular-expressions
* @precision very-high
diff --git a/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/DuplicateCharacterInCharacterClass.expected b/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/DuplicateCharacterInCharacterClass.expected
index 1857b9cc4df4..96f2cf20fd96 100644
--- a/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/DuplicateCharacterInCharacterClass.expected
+++ b/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/DuplicateCharacterInCharacterClass.expected
@@ -7,3 +7,39 @@
| tst.js:8:3:8:3 | a | Character 'a' is $@. | tst.js:8:5:8:5 | a | repeated in the same character class |
| tst.js:9:3:9:6 | \\x0a | Character '\\x0a' is $@. | tst.js:9:7:9:10 | \\x0a | repeated in the same character class |
| tst.js:10:3:10:8 | \\u000a | Character '\\u000a' is $@. | tst.js:10:9:10:10 | \\n | repeated in the same character class |
+| tst.js:15:4:15:4 | \| | Character '\|' is $@. | tst.js:15:6:15:6 | \| | repeated in the same character class |
+| tst.js:16:3:16:3 | : | Character ':' is $@. | tst.js:16:9:16:9 | : | repeated in the same character class |
+| tst.js:17:4:17:4 | ^ | Character '^' is $@. | tst.js:17:11:17:11 | ^ | repeated in the same character class |
+| tst.js:17:5:17:5 | s | Character 's' is $@. | tst.js:17:12:17:12 | s | repeated in the same character class |
+| tst.js:17:6:17:6 | t | Character 't' is $@. | tst.js:17:13:17:13 | t | repeated in the same character class |
+| tst.js:17:6:17:6 | t | Character 't' is $@. | tst.js:17:15:17:15 | t | repeated in the same character class |
+| tst.js:17:6:17:6 | t | Character 't' is $@. | tst.js:17:19:17:19 | t | repeated in the same character class |
+| tst.js:17:7:17:7 | y | Character 'y' is $@. | tst.js:17:20:17:20 | y | repeated in the same character class |
+| tst.js:17:8:17:8 | l | Character 'l' is $@. | tst.js:17:21:17:21 | l | repeated in the same character class |
+| tst.js:17:9:17:9 | e | Character 'e' is $@. | tst.js:17:22:17:22 | e | repeated in the same character class |
+| tst.js:18:3:18:3 | . | Character '.' is $@. | tst.js:18:5:18:5 | . | repeated in the same character class |
+| tst.js:19:6:19:6 | \u0645 | Character '\u0645' is $@. | tst.js:19:8:19:8 | \u0645 | repeated in the same character class |
+| tst.js:22:3:22:4 | \\p | Character '\\p' is $@. | tst.js:22:15:22:16 | \\p | repeated in the same character class |
+| tst.js:22:5:22:5 | { | Character '{' is $@. | tst.js:22:17:22:17 | { | repeated in the same character class |
+| tst.js:22:7:22:7 | e | Character 'e' is $@. | tst.js:22:10:22:10 | e | repeated in the same character class |
+| tst.js:22:8:22:8 | t | Character 't' is $@. | tst.js:22:9:22:9 | t | repeated in the same character class |
+| tst.js:22:12:22:12 | } | Character '}' is $@. | tst.js:22:23:22:23 | } | repeated in the same character class |
+| tst.js:22:13:22:13 | & | Character '&' is $@. | tst.js:22:14:22:14 | & | repeated in the same character class |
+| tst.js:22:21:22:21 | I | Character 'I' is $@. | tst.js:22:22:22:22 | I | repeated in the same character class |
+| tst.js:26:3:26:4 | \\p | Character '\\p' is $@. | tst.js:26:13:26:14 | \\p | repeated in the same character class |
+| tst.js:26:5:26:5 | { | Character '{' is $@. | tst.js:26:15:26:15 | { | repeated in the same character class |
+| tst.js:26:7:26:7 | e | Character 'e' is $@. | tst.js:26:10:26:10 | e | repeated in the same character class |
+| tst.js:26:7:26:7 | e | Character 'e' is $@. | tst.js:26:17:26:17 | e | repeated in the same character class |
+| tst.js:26:7:26:7 | e | Character 'e' is $@. | tst.js:26:28:26:28 | e | repeated in the same character class |
+| tst.js:26:8:26:8 | t | Character 't' is $@. | tst.js:26:9:26:9 | t | repeated in the same character class |
+| tst.js:26:11:26:11 | r | Character 'r' is $@. | tst.js:26:29:26:29 | r | repeated in the same character class |
+| tst.js:26:12:26:12 | } | Character '}' is $@. | tst.js:26:30:26:30 | } | repeated in the same character class |
+| tst.js:26:20:26:20 | m | Character 'm' is $@. | tst.js:26:26:26:26 | m | repeated in the same character class |
+| tst.js:28:3:28:3 | / | Character '/' is $@. | tst.js:28:5:28:5 | / | repeated in the same character class |
+| tst.js:30:4:30:4 | ^ | Character '^' is $@. | tst.js:30:5:30:5 | ^ | repeated in the same character class |
+| tst.js:31:4:31:4 | * | Character '*' is $@. | tst.js:31:5:31:5 | * | repeated in the same character class |
+| tst.js:33:5:33:5 | \| | Character '\|' is $@. | tst.js:33:6:33:7 | \\\| | repeated in the same character class |
+| tst_replace.js:3:26:3:26 | n | Character 'n' is $@. | tst_replace.js:3:28:3:28 | n | repeated in the same character class |
+| tst_replace.js:11:18:11:18 | n | Character 'n' is $@. | tst_replace.js:11:20:11:20 | n | repeated in the same character class |
+| tst_replace.js:25:18:25:18 | n | Character 'n' is $@. | tst_replace.js:25:20:25:20 | n | repeated in the same character class |
+| tst_replace.js:42:18:42:18 | n | Character 'n' is $@. | tst_replace.js:42:20:42:20 | n | repeated in the same character class |
diff --git a/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/tst.js b/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/tst.js
index c87c7140a16c..fe291137c8a6 100644
--- a/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/tst.js
+++ b/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/tst.js
@@ -3,12 +3,31 @@
/[\uDC3A\uDC3C]/;
/[??]/; // $ Alert
/[\u003F\u003f]/; // $ Alert
-/[\u003F?]/; // $ Alert
+/[\u003F?]/; // $ Alert -- \u003F evaluates to ?, which is the same as ? in the character class
/[\x3f\u003f]/; // $ Alert
/[aaa]/; // $ Alert
/[\x0a\x0a]/; // $ Alert
-/[\u000a\n]/; // $ Alert
+/[\u000a\n]/; // $ Alert -- \u000a evaluates to \n, which is the same as \n in the character class
/[\u{ff}]/;
/[\u{12340}-\u{12345}]/u;
new RegExp("[\u{12340}-\u{12345}]", "u");
const regex = /\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv;
+/[a|b|c]/; // $ Alert -- Repeated | character in character class, which has no special meaning in this context
+/[:alnum:]/; // $ Alert -- JavaScript does not support POSIX character classes like `[:alnum:]` in regular expressions, thus characters in the class are treated as literals
+/[(^style|^staticStyle)]/; // $ Alert
+/[.x.]/i; // $ Alert -- Repeated . character in character class
+/^[يفمأمسند]/i; // $ Alert -- م duplicate
+/[\u{1F600}-\u{1F64F}]/u;
+/[\p{Letter}&&\p{ASCII}]/v; // && is an intersection operator while /v flag is present
+/[\p{Letter}&&\p{ASCII}]/; // $ Alert -- without /v flag, && is not a valid operator and treated as member of character class thus duplicate
+/[\p{Decimal_Number}&&[0-9A-F]]/v;
+/[\p{Letter}--[aeiouAEIOU]]/v;
+/[\p{Letter}\p{Decimal_Number}]/v; // Union operation between two character classes only with /v flag
+/[\p{Letter}\p{Decimal_Number}]/; // $ Alert -- without /v flag, this is not a valid operation and treated as member of character class thus duplicate
+/[\[\]]/;
+/[/[/]]/; // $ Alert
+/[^^abc]/; // First `^` is a negation operator, second treated as literal `^` is a member of character class
+/[^^^abc]/; // $ Alert -- Second and third `^` are treated as literals thus duplicates
+/[^**]/; // $ Alert
+/[-a-z]/; // Matches `-` and range `a-z` no duplicate
+/^[:|\|]/ // $ Alert -- `|` is treated as a literal character in the character class, thus duplicate even with escape character
diff --git a/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/tst_replace.js b/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/tst_replace.js
new file mode 100644
index 000000000000..afd526007b02
--- /dev/null
+++ b/javascript/ql/test/query-tests/RegExp/DuplicateCharacterInCharacterClass/tst_replace.js
@@ -0,0 +1,44 @@
+function reg(){
+ const nonIdPattern = 'a-z';
+ const basePattern = /[]/.source; // $ SPURIOUS:Alert
+ const finalPattern = basePattern.replace(//g, nonIdPattern);
+ console.log(finalPattern);
+ const regex2 = new RegExp(finalPattern);
+}
+
+function reg1(){
+ const nonIdPattern = 'a-z';
+ const reg = /[]/; // $ SPURIOUS:Alert
+ const basePattern = reg.source;
+ const finalPattern = basePattern.replace(//g, nonIdPattern);
+ console.log(finalPattern);
+ const regex2 = new RegExp(finalPattern);
+}
+
+function replacer(reg1, reg2){
+ const basePattern = reg1.source;
+ const finalPattern = basePattern.replace(//g, reg2);
+ return new RegExp(finalPattern);
+}
+function reg2(){
+ const nonIdPattern = 'a-z';
+ const reg = /[]/; // $ SPURIOUS:Alert
+ replacer(reg, nonIdPattern);
+}
+
+
+function replacer3(str, reg2){
+ const finalPattern = str.replace(//g, reg2);
+ return new RegExp(finalPattern);
+}
+
+function replacer2(reg1, reg2){
+ const basePattern = reg1.source;
+ return replacer3(basePattern, reg2);
+}
+
+function reg3(){
+ const nonIdPattern = 'a-z';
+ const reg = /[]/; // $ SPURIOUS:Alert
+ replacer2(reg, nonIdPattern);
+}