1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
|
<?php
namespace WordPress\DataLiberation\URL;
use Rowbot\URL\URL; use WordPress\DataLiberation\BlockMarkup\BlockMarkupUrlProcessor;
/**
 * Rewrites the URLs found in a piece of block markup according to a
 * from-URL => to-URL mapping. See WPRewriteUrlsTests for specific examples.
 *
 * Every URL reported by BlockMarkupUrlProcessor is compared against the
 * mapping entries in their declared order. The first entry whose from-URL
 * is a parent of that URL (as decided by is_child_url_of()) wins, and the
 * URL's base is replaced with the matching to-URL.
 *
 * Example:
 *
 * ```php
 * php > wp_rewrite_urls([
 *   'block_markup' => '<!-- wp:image {"src": "http://legacy-blog.com/image.jpg"} -->',
 *   'url-mapping' => [
 *     'http://legacy-blog.com' => 'https://modern-webstore.org'
 *   ]
 * ])
 * <!-- wp:image {"src":"https:\/\/modern-webstore.org\/image.jpg"} -->
 * ```
 *
 * @TODO Use a proper JSON parser and encoder to:
 *       * Support UTF-16 characters.
 *       * Gracefully handle recoverable encoding issues.
 *       * Avoid changing whitespace that does not need to change, in the
 *         same way WP_HTML_Tag_Processor leaves untouched parts of a tag
 *         alone. Note how the example above loses the space after `"src":`
 *         once the block attributes are re-encoded.
 *
 * @param array $options {
 *     @type string $block_markup The markup whose URLs should be rewritten.
 *     @type array  $url-mapping  Map of from-URL strings to to-URL strings.
 *     @type string $base_url     Optional. Base URL handed to the URL processor.
 *                                Defaults to the first from-URL of the mapping.
 * }
 *
 * @return string The markup as returned by BlockMarkupUrlProcessor::get_updated_html().
 */
function wp_rewrite_urls( $options ) {
	// Without an explicit base URL, fall back to the first "from" URL of the mapping.
	if ( empty( $options['base_url'] ) ) {
		$options['base_url'] = array_keys( $options['url-mapping'] )[0];
	}

	// Parse every mapping entry once, up front, as array( from, to ) pairs.
	$rewrite_rules = array();
	foreach ( $options['url-mapping'] as $source => $target ) {
		$rewrite_rules[] = array( WPURL::parse( $source ), WPURL::parse( $target ) );
	}

	$processor = new BlockMarkupUrlProcessor( $options['block_markup'], $options['base_url'] );
	while ( $processor->next_url() ) {
		$current_url = $processor->get_parsed_url();
		foreach ( $rewrite_rules as list( $from_url, $to_url ) ) {
			if ( ! is_child_url_of( $current_url, $from_url ) ) {
				continue;
			}

			// Only the first matching mapping entry is applied to each URL.
			$processor->replace_base_url( $to_url );
			break;
		}
	}

	return $processor->get_updated_html();
}
/**
 * Checks whether one URL is located at, or below, another URL.
 *
 * Two URLs are related when they share the same protocol and hostname and
 * the percent-decoded path of `$child` equals the percent-decoded path of
 * `$parent_url` or continues it past a `/` boundary. Trailing slashes on
 * the child path are ignored. The comparison is segment-aware, so
 * `/blogging` is not treated as a child of `/blog`.
 *
 * Strings are parsed with WPURL::parse(). If either argument is `false`
 * (for example because parsing failed), the function returns false.
 *
 * @param URL|string|false $child      The URL to check, parsed or as a string.
 * @param URL|string|false $parent_url The candidate parent URL, parsed or as a string.
 *
 * @return bool Whether `$child` equals `$parent_url` or is nested below it.
 */
function is_child_url_of( $child, $parent_url ) {
	$parent_url = is_string( $parent_url ) ? WPURL::parse( $parent_url ) : $parent_url;
	$child      = is_string( $child ) ? WPURL::parse( $child ) : $child;

	// Bail out before reading any property: an unparsable URL is `false`, not an object.
	if ( false === $child || false === $parent_url ) {
		return false;
	}

	if ( $parent_url->hostname !== $child->hostname ) {
		return false;
	}

	if ( $parent_url->protocol !== $child->protocol ) {
		return false;
	}

	// Normalize both paths so they end with exactly one appended slash. Comparing
	// "directory" forms makes the prefix test respect path segment boundaries.
	$child_directory  = rtrim( urldecode( $child->pathname ), '/' ) . '/';
	$parent_pathname  = urldecode( $parent_url->pathname );
	$parent_directory = '/' === substr( $parent_pathname, -1 ) ? $parent_pathname : $parent_pathname . '/';

	// Covers both the direct match and the nested-path case.
	return 0 === strncmp( $child_directory, $parent_directory, strlen( $parent_directory ) );
}
/**
 * Percent-decodes the beginning of a URL-encoded string, stopping as soon
 * as the produced output holds at least `$decode_n` bytes.
 *
 * Literal (unencoded) bytes count towards that total, and whatever input
 * is left once the limit is reached is appended without being decoded.
 * A `%` that is not followed by two hexadecimal digits is copied as-is.
 *
 * For example, `urldecode_n( '%22is 6 %3C 6?%22 – asked Achilles', 1 )` returns
 * '"is 6 %3C 6?%22 – asked Achilles' because decoding `%22` already yields
 * the one requested byte. `urldecode_n( 'abc%20def', 1 )` returns the input
 * unchanged, since the leading literal bytes alone satisfy the limit.
 *
 * Note: this operates on bytes, not characters. It recovers the original
 * byte sequence, which is not necessarily valid UTF-8.
 *
 * @param string $input    The string to decode.
 * @param int    $decode_n The number of leading output bytes that must be decoded.
 *
 * @return string The partially decoded string.
 */
function urldecode_n( $input, $decode_n ) {
	// Fast paths: nothing was requested, or there is nothing to decode.
	if ( $decode_n <= 0 || false === strpos( $input, '%' ) ) {
		return $input;
	}

	$length  = strlen( $input );
	$decoded = '';
	$offset  = 0;

	// Fewer than three remaining bytes cannot hold a complete "%XX" sequence.
	while ( $offset + 3 <= $length ) {
		// Copy the run of plain bytes that precedes the next percent sign.
		$literal_length = strcspn( $input, '%', $offset );
		$decoded       .= substr( $input, $offset, $literal_length );
		$offset        += $literal_length;

		// Enough output has been produced; the rest stays encoded.
		if ( strlen( $decoded ) >= $decode_n ) {
			break;
		}

		// Step over the percent sign (or past the end if none was found).
		++$offset;
		if ( $offset > $length ) {
			break;
		}

		if ( 2 === strspn( $input, '0123456789ABCDEFabcdef', $offset, 2 ) ) {
			// A well-formed escape: emit the single byte it encodes.
			$decoded .= chr( hexdec( substr( $input, $offset, 2 ) ) );
			$offset  += 2;
		} else {
			// Not a valid escape: keep the percent sign literally and move on.
			$decoded .= '%';
		}
	}

	return $decoded . substr( $input, $offset );
}
|