Skip to content

Commit 85d938e

Browse files
authored
Extract WPURL::replace_base_url() from BlockMarkupUrlProcessor (#190)
Moves the base URL replacement logic from `BlockMarkupUrlProcessor::replace_base_url()` to `WPURL::replace_base_url()`. This way it can be used on any URL without involving the markup parser.
1 parent 1f9dac0 commit 85d938e

File tree

3 files changed

+222
-70
lines changed

3 files changed

+222
-70
lines changed

components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php

Lines changed: 23 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
use Rowbot\URL\URL;
66
use WordPress\DataLiberation\URL\URLInTextProcessor;
77
use WordPress\DataLiberation\URL\WPURL;
8+
use WordPress\DataLiberation\URL\ConvertedUrl;
89

910
use function WordPress\DataLiberation\URL\urldecode_n;
1011

@@ -309,87 +310,39 @@ public function set_url( $raw_url, $parsed_url ) {
309310
* by this WPURL_In_Text_Processor class so maybe the two do go hand in hand?
310311
*/
311312
public function replace_base_url( URL $to_url, ?URL $base_url = null ) {
312-
$updated_url = clone $this->get_parsed_url();
313-
314-
$updated_url->hostname = $to_url->hostname;
315-
$updated_url->protocol = $to_url->protocol;
316-
$updated_url->port = $to_url->port;
317-
318-
// Update the pathname if needed.
319-
$from_url = $this->get_parsed_url();
320-
$from_pathname = $from_url->pathname;
321-
$to_pathname = $to_url->pathname;
322-
323313
$base_url = $base_url ?? $this->base_url_object;
324-
if ( $base_url->pathname !== $to_pathname ) {
325-
$base_pathname_with_trailing_slash = rtrim( $base_url->pathname, '/' ) . '/';
326-
$decoded_matched_pathname = urldecode_n(
327-
$from_pathname,
328-
strlen( $base_pathname_with_trailing_slash )
329-
);
330-
$to_pathname_with_trailing_slash = rtrim( $to_pathname, '/' ) . '/';
331-
$remaining_pathname =
332-
substr(
333-
$decoded_matched_pathname,
334-
strlen( $base_pathname_with_trailing_slash )
335-
);
336-
337-
$updated_url->pathname = $to_pathname_with_trailing_slash . $remaining_pathname;
338-
}
339-
340-
/*
341-
* Stylistic choice – if the updated URL has no trailing slash,
342-
* do not add it to the new URL. The WHATWG URL parser will
343-
* add one automatically if the path is empty, so we have to
344-
* explicitly remove it.
345-
*/
346-
$new_raw_url = $updated_url->toString();
347-
if (
348-
'/' !== $from_url->pathname[ strlen( $from_url->pathname ) - 1 ] &&
349-
'/' !== $from_url->pathname &&
350-
'' === $from_url->search &&
351-
'' === $from_url->hash
352-
) {
353-
$new_raw_url = rtrim( $new_raw_url, '/' );
354-
}
355-
if ( ! $new_raw_url ) {
356-
// @TODO: When does this happen? Let's add the test coverage and
357-
// doubly verify the logic.
314+
if ( ! $base_url ) {
358315
return false;
359316
}
360317

361-
if ( ! $this->is_url_relative() ) {
362-
$this->set_url( $new_raw_url, $updated_url );
363-
364-
return true;
365-
}
318+
$result = WPURL::replace_base_url(
319+
$this->get_parsed_url(),
320+
array(
321+
'old_base_url' => $base_url,
322+
'new_base_url' => $to_url,
323+
'raw_url' => $this->get_raw_url(),
324+
'is_relative' => (
325+
/**
326+
* In text nodes, the only detected URLs are absolute. The tricky part
327+
* is they may start without a protocol, e.g. `wordpress.org`. Therefore,
328+
* we need to tell WPURL::replace_base_url what our intention is regarding
329+
* the URL's relativity. It cannot just infer it from the URL itself.
330+
*/
331+
'#text' !== $this->get_token_type() &&
332+
! WPURL::can_parse( $this->get_raw_url() )
333+
),
334+
)
335+
);
366336

367-
$new_relative_url = $updated_url->pathname;
368-
if ( '' !== $updated_url->search ) {
369-
$new_relative_url .= $updated_url->search;
370-
}
371-
if ( '' !== $updated_url->hash ) {
372-
$new_relative_url .= $updated_url->hash;
337+
if ( false === $result ) {
338+
return false;
373339
}
374340

375-
$this->set_url( $new_relative_url, $updated_url );
341+
$this->set_url( $result . '', $result->new_url );
376342

377343
return true;
378344
}
379345

380-
/**
381-
* Returns true if the currently matched URL is relative.
382-
*
383-
* @return bool Whether the currently matched URL is relative.
384-
*/
385-
public function is_url_relative() {
386-
return (
387-
! WPURL::can_parse( $this->get_raw_url() ) &&
388-
// only absolute URLs are detected in text nodes.
389-
'#text' !== $this->get_token_type()
390-
);
391-
}
392-
393346
/**
394347
* Returns true if the currently matched URL is absolute.
395348
*
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
<?php
2+
3+
namespace WordPress\DataLiberation\URL;
4+
5+
use Rowbot\URL\URL;
6+
7+
/**
 * Value object returned by WPURL::replace_base_url().
 *
 * - Cast to string to get the updated URL as a string.
 * - When the original URL was relative, casting returns a relative string against
 *   the new base.
 */
class ConvertedUrl {

	/**
	 * The updated URL as a parsed URL object.
	 *
	 * @var URL
	 */
	public $new_url;

	/**
	 * The updated URL serialized as an absolute URL string.
	 *
	 * @var string
	 */
	public $new_raw_url;

	/**
	 * The updated URL serialized as a relative string (path, query, and fragment).
	 * Stays null unless the original URL was treated as relative.
	 *
	 * @var string|null
	 */
	public $new_raw_relative_url;

	/**
	 * Whether the original URL was treated as relative.
	 *
	 * @var bool
	 */
	public $was_relative = false;

	/**
	 * Returns the updated URL string. If the original was relative, returns a relative string.
	 *
	 * Falls back to the absolute string when the relative form was never populated.
	 * Returning null from a method declared to return `string` raises a TypeError,
	 * which is particularly unpleasant when triggered by an implicit string cast.
	 *
	 * @return string The relative URL when available and applicable, the absolute URL otherwise.
	 */
	public function __toString(): string {
		if ( $this->was_relative && null !== $this->new_raw_relative_url ) {
			return $this->new_raw_relative_url;
		}

		return (string) $this->new_raw_url;
	}
}

components/DataLiberation/URL/class-wpurl.php

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,168 @@ public static function can_parse( $url, $base = null ) {
2525
return URL::canParse( $url, $base );
2626
}
2727

28+
/**
 * Replaces the "base" of a URL — scheme, host (and port), and the portion of the path that
 * belongs to the old base — with a new base while keeping the remainder of the URL intact.
 *
 * This is intended for content migrations, where URLs embedded in block markup, HTML attributes,
 * or inline text must be moved from one site root to another without losing the rest of the path,
 * query, or fragment. It handles simple domain swaps, ports, and deep path bases. When the old
 * base includes path segments, only that matched prefix is substituted and the unmatched tail is
 * carried over to the target base.
 *
 * For example:
 * * URL:      https://example.com/a/b/c/d/e/f/g/h/i/j/page/
 * * Old base: https://example.com/a/b/c/d/e/f/
 * * New base: https://example.org/docs/
 * * Result:   https://example.org/docs/g/h/i/j/page/
 *
 * ## Trailing slash handling
 *
 * Trailing slash style is preserved from the original URL. If it has no trailing slash, the
 * result will also omit the trailing slash and vice versa.
 *
 * For example, here the final result has no trailing slash:
 * * URL:      https://example.com/uploads/file.txt
 * * Old base: https://example.com/uploads/
 * * New base: https://example.org/docs/
 * * Result:   https://example.org/docs/file.txt
 *
 * And here it does:
 * * URL:      https://example.com/uploads/2018/
 * * Old base: https://example.com/uploads/
 * * New base: https://example.org/docs/
 * * Result:   https://example.org/docs/2018/
 *
 * ## URL-encoded path segments
 *
 * URL-encoded path segments are respected and not inadvertently decoded or re-encoded. Only the
 * matched base prefix is considered for alignment, so inputs that contain percent-encoded content
 * keep that content exactly as-is in the output. This prevents data corruption in tricky cases such
 * as "/~jappleseed/1997.10.1/%2561-reasons-to-migrate-data/" where the "%2561" must remain
 * double-escaped after the move.
 *
 * ## Relative URLs
 *
 * This method can preserve the relative nature of the original URL. Say you are processing a markup
 * that contains `<a href="/uploads/file.txt">`. The original URL string is "/uploads/file.txt",
 * and the URL actually resolves to "https://example.com/uploads/file.txt". If you want to replace
 * the base URL from "https://example.com/uploads/" to "https://newsite.com/files/" but keep the
 * URL relative, you can pass the raw URL string via the "raw_url" option.
 *
 * For example:
 * * URL:      https://example.com/uploads/file.txt
 * * Raw URL:  /uploads/file.txt
 * * Old base: https://example.com/uploads/
 * * New base: https://example.org/files/
 * * Result:   /files/file.txt
 *
 * The method also supports relative inputs commonly found in markup. If you pass the raw URL
 * string via the "raw_url" option, the method can infer whether the author originally wrote a
 * relative URL like "docs/page.html" or an absolute one: a raw URL that cannot be parsed on its
 * own (without a base) is treated as relative. You may also explicitly assert relativity with
 * "is_relative" to avoid inference. Note that "is_relative" is only consulted when "raw_url"
 * is provided as a string.
 *
 * @param string|URL $url     The URL to replace the base of.
 * @param array      $options {
 *     Associative options.
 *
 *     @type string|URL $old_base_url Required. The base the URL currently lives under.
 *     @type string|URL $new_base_url Required. The base to move the URL to.
 *     @type string     $raw_url      Optional. The URL exactly as written in the source document.
 *                                    Enables preserving a relative URL as relative.
 *     @type bool       $is_relative  Optional. Whether the raw URL should be treated as relative.
 *                                    Inferred from "raw_url" when omitted.
 * }
 * @return ConvertedUrl|false Returns a ConvertedUrl value object on success, or false when parsing
 *                            or replacement cannot be performed.
 */
public static function replace_base_url( $url, $options ) {
	if ( ! is_array( $options ) ) {
		return false;
	}

	foreach ( array( 'old_base_url', 'new_base_url' ) as $required ) {
		if ( ! array_key_exists( $required, $options ) || null === $options[ $required ] ) {
			return false;
		}
	}

	$old_base_url = self::parse( $options['old_base_url'] );
	$new_base_url = self::parse( $options['new_base_url'] );
	// Resolve the input against the old base so relative inputs become absolute.
	$url = self::parse( $url, $old_base_url ? $old_base_url->toString() : null );

	if ( false === $old_base_url || false === $new_base_url || false === $url ) {
		return false;
	}

	// Work on a copy so the caller's URL object is never mutated.
	$updated_url = clone $url;

	$updated_url->hostname = $new_base_url->hostname;
	$updated_url->protocol = $new_base_url->protocol;
	$updated_url->port     = $new_base_url->port;

	$from_pathname = $url->pathname;
	$to_pathname   = $new_base_url->pathname;
	$base_pathname = $old_base_url->pathname;

	if ( $base_pathname !== $to_pathname ) {
		$base_pathname_with_trailing_slash = rtrim( $base_pathname, '/' ) . '/';
		$to_pathname_with_trailing_slash   = rtrim( $to_pathname, '/' ) . '/';

		/*
		 * Only the leading portion of the path that lines up with the old base
		 * is decoded; the remainder keeps its original percent-encoding.
		 */
		$decoded_matched_pathname = urldecode_n(
			$from_pathname,
			strlen( $base_pathname_with_trailing_slash )
		);
		$remaining_pathname       = substr(
			$decoded_matched_pathname,
			strlen( $base_pathname_with_trailing_slash )
		);

		$updated_url->pathname = $to_pathname_with_trailing_slash . $remaining_pathname;
	}

	/*
	 * Stylistic choice – if the updated URL has no trailing slash,
	 * do not add it to the new URL. The WHATWG URL parser will
	 * add one automatically if the path is empty, so we have to
	 * explicitly remove it.
	 *
	 * A path that does not end with "/" can never be exactly "/",
	 * so no separate root-path check is needed here.
	 */
	$new_raw_url                = $updated_url->toString();
	$should_trim_trailing_slash = (
		'' !== $from_pathname &&
		'/' !== substr( $from_pathname, -1 ) &&
		'' === $url->search &&
		'' === $url->hash
	);
	if ( $should_trim_trailing_slash ) {
		$new_raw_url = rtrim( $new_raw_url, '/' );
	}
	if ( ! $new_raw_url ) {
		// This may technically happen, but does it happen in practice?
		return false;
	}

	$converted_url              = new ConvertedUrl();
	$converted_url->new_url     = $updated_url;
	$converted_url->new_raw_url = $new_raw_url;

	// Preserve the relative nature of the original URL.
	if ( array_key_exists( 'raw_url', $options ) && is_string( $options['raw_url'] ) ) {
		if ( array_key_exists( 'is_relative', $options ) ) {
			$is_relative = (bool) $options['is_relative'];
		} else {
			/*
			 * A raw URL that parses on its own, without a base, is absolute.
			 * Only a raw URL that fails to parse is considered relative.
			 */
			$is_relative = ! self::can_parse( $options['raw_url'] );
		}

		if ( $is_relative ) {
			$relative_url = $updated_url->pathname;
			// Remove the trailing slash if it's not the root path.
			if ( strlen( $relative_url ) > 1 && $should_trim_trailing_slash ) {
				$relative_url = rtrim( $relative_url, '/' );
			}
			if ( '' !== $updated_url->search ) {
				$relative_url .= $updated_url->search;
			}
			if ( '' !== $updated_url->hash ) {
				$relative_url .= $updated_url->hash;
			}

			$converted_url->was_relative         = true;
			$converted_url->new_raw_relative_url = $relative_url;
		}
	}

	return $converted_url;
}
189+
28190
/**
29191
* Prepends a protocol to any matched URL without the double slash.
30192
*

0 commit comments

Comments
 (0)