Skip to content

Commit 4a1f323

Browse files
WAT/WET extraction: use compiled regex Pattern to get better performance
1 parent d5af4e5 commit 4a1f323

File tree

1 file changed, with 4 additions and 2 deletions.

src/main/java/org/archive/resource/html/ExtractingParseObserver.java

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,8 @@ public class ExtractingParseObserver implements ParseObserver {
 51  51   Pattern.compile(jsOnClickUrl2PatString)
 52  52   };
 53  53
     54 + protected static Pattern wsPattern = Pattern.compile("\\s+");
     55 +
 54  56   private final static int MAX_TEXT_LEN = 100;
 55  57
 56  58   private final static String[] BLOCK_ELEMENTS = { "address", "article", "aside", "blockquote", "body", "br",
@@ -197,7 +199,7 @@ public void handleTagClose(TagNode tag) {
197 199   if((vals != null) && (vals.size() > 0)) {
198 200   if(text != null) {
199 201   // contained an href - we want to ignore <a name="X"></a>:
200     - String trimmed = Translate.decode(text.toString()).trim().replaceAll("\\s+", " ");
    202 + String trimmed = wsPattern.matcher(Translate.decode(text.toString()).trim()).replaceAll(" ");
201 203   if(trimmed.length() > MAX_TEXT_LEN) {
202 204   trimmed = trimmed.substring(0,MAX_TEXT_LEN);
203 205   }
@@ -241,7 +243,7 @@ public void handleTextNode(TextNode text) {
241 243   }
242 244   }
243 245
244     - String t = txt.replaceAll("\\s+", " ");
    246 + String t = wsPattern.matcher(txt).replaceAll(" ");
245 247
246 248   if(t.length() > MAX_TEXT_LEN) {
247 249   t = t.substring(0,MAX_TEXT_LEN);

0 commit comments

Comments (0)