ext ); } // Remove more than two contiguous line breaks. $text = preg_replace( "/\n\n+/", "\n\n", $text ); // Split up the contents into an array of strings, separated by double line breaks. $paragraphs = preg_split( '/\n\s*\n/', $text, -1, PREG_SPLIT_NO_EMPTY ); // Reset $text prior to rebuilding. $text = ''; // Rebuild the content as a string, wrapping every bit with a

. foreach ( $paragraphs as $paragraph ) { $text .= '

' . trim( $paragraph, "\n" ) . "

\n"; } // Under certain strange conditions it could create a P of entirely whitespace. $text = preg_replace( '|

\s*

|', '', $text ); // Add a closing

inside

,
, or
tag if missing. $text = preg_replace( '!

([^<]+)!', '

$1

', $text ); // If an opening or closing block element tag is wrapped in a

, unwrap it. $text = preg_replace( '!

\s*(]*>)\s*

!', '$1', $text ); // In some cases
  • may get wrapped in

    , fix them. $text = preg_replace( '|

    (|', '$1', $text ); // If a

    is wrapped with a

    , move it inside the

    . $text = preg_replace( '|

    ]*)>|i', '

    ', $text ); $text = str_replace( '

    ', '

    ', $text ); // If an opening or closing block element tag is preceded by an opening

    tag, remove it. $text = preg_replace( '!

    \s*(]*>)!', '$1', $text ); // If an opening or closing block element tag is followed by a closing

    tag, remove it. $text = preg_replace( '!(]*>)\s*

    !', '$1', $text ); // Optionally insert line breaks. if ( $br ) { // Replace newlines that shouldn't be touched with a placeholder. $text = preg_replace_callback( '/<(script|style|svg|math).*?<\/\\1>/s', '_autop_newline_preservation_helper', $text ); // Normalize
    $text = str_replace( array( '
    ', '
    ' ), '
    ', $text ); // Replace any new line characters that aren't preceded by a
    with a
    . $text = preg_replace( '|(?)\s*\n|', "
    \n", $text ); // Replace newline placeholders with newlines. $text = str_replace( '', "\n", $text ); } // If a
    tag is after an opening or closing block tag, remove it. $text = preg_replace( '!(]*>)\s*
    !', '$1', $text ); // If a
    tag is before a subset of opening or closing block tags, remove it. $text = preg_replace( '!
    (\s*]*>)!', '$1', $text ); $text = preg_replace( "|\n

    $|", '

    ', $text ); // Replace placeholder
     tags with their original content.
    	if ( ! empty( $pre_tags ) ) {
    		$text = str_replace( array_keys( $pre_tags ), array_values( $pre_tags ), $text );
    	}
    
    	// Restore newlines in all elements.
    	if ( str_contains( $text, '' ) ) {
    		$text = str_replace( array( '  ', '' ), "\n", $text );
    	}
    
    	return $text;
    }
    
    /**
     * Splits text into plain-text pieces and HTML element/comment pieces.
     *
     * The split pattern comes from get_html_split_regex() and is applied with
     * PREG_SPLIT_DELIM_CAPTURE, so the matched elements are kept in the result
     * alongside the text found between them.
     *
     * @since 4.2.4
     *
     * @param string $input The text which has to be formatted.
     * @return string[] Array of the formatted text.
     */
    function wp_html_split( $input ) {
    	$split_pattern = get_html_split_regex();

    	return preg_split( $split_pattern, $input, -1, PREG_SPLIT_DELIM_CAPTURE );
    }
    
    /**
     * Returns the pattern used to split text on HTML elements.
     *
     * The pattern captures a whole element starting at `<`. Comments and CDATA
     * sections are consumed up to their own terminators (`-->` and `]]>`) rather
     * than the first `>`; when a terminator is missing, the rest of the input is
     * matched. The assembled pattern is kept in a static variable and reused by
     * later calls.
     *
     * @since 4.4.0
     *
     * @return string The regular expression
     */
    function get_html_split_regex() {
    	static $regex;

    	// Already assembled by an earlier call.
    	if ( isset( $regex ) ) {
    		return $regex;
    	}

    	// HTML comment, matched right after the opening "<".
    	$comment_part = implode(
    		'',
    		array(
    			'!',        // Start of comment, after the <.
    			'(?:',      // Unroll the loop: Consume everything until --> is found.
    			'-(?!->)',  // Dash not followed by end of comment.
    			'[^\-]*+',  // Consume non-dashes.
    			')*+',      // Loop possessively.
    			'(?:-->)?', // End of comment. If not found, match all input.
    		)
    	);

    	// CDATA section, matched right after the opening "<".
    	$cdata_part = implode(
    		'',
    		array(
    			'!\[CDATA\[', // Start of section, after the <.
    			'[^\]]*+',    // Consume non-].
    			'(?:',        // Unroll the loop: Consume everything until ]]> is found.
    			'](?!]>)',    // One ] not followed by end of section.
    			'[^\]]*+',    // Consume non-].
    			')*+',        // Loop possessively.
    			'(?:]]>)?',   // End of section. If not found, match all input.
    		)
    	);

    	// Lookahead deciding whether the element is escaped, then which kind it is.
    	$escaped_part = implode(
    		'',
    		array(
    			'(?=',        // Is the element escaped?
    			'!--',
    			'|',
    			'!\[CDATA\[',
    			')',
    			'(?(?=!-)',   // If yes, which type?
    			$comment_part,
    			'|',
    			$cdata_part,
    			')',
    		)
    	);

    	$regex = implode(
    		'',
    		array(
    			'/(',          // Capture the entire match.
    			'<',           // Find start of element.
    			'(?',          // Conditional expression follows.
    			$escaped_part, // Find end of escaped element.
    			'|',           // ...else...
    			'[^>]*>?',     // Find end of normal element.
    			')',
    			')/',
    		)
    	);

    	return $regex;
    }
    
    /**
     * Builds the delimited pattern that captures HTML elements and, optionally, shortcodes.
     *
     * The HTML part is assembled on the first call and kept in a static variable.
     * When a shortcode pattern is supplied it is appended as a second alternative
     * inside the same capture group.
     *
     * @access private
     * @ignore
     * @internal This function will be removed in 4.5.0 per Shortcode API Roadmap.
     * @since 4.4.0
     *
     * @param string $shortcode_regex Optional. The result from _get_wptexturize_shortcode_regex().
     * @return string The regular expression
     */
    function _get_wptexturize_split_regex( $shortcode_regex = '' ) {
    	static $html_regex;

    	if ( ! isset( $html_regex ) ) {
    		// HTML comment, matched right after the opening "<".
    		$comment_regex = implode(
    			'',
    			array(
    				'!',        // Start of comment, after the <.
    				'(?:',      // Unroll the loop: Consume everything until --> is found.
    				'-(?!->)',  // Dash not followed by end of comment.
    				'[^\-]*+',  // Consume non-dashes.
    				')*+',      // Loop possessively.
    				'(?:-->)?', // End of comment. If not found, match all input.
    			)
    		);

    		// Needs replaced with wp_html_split() per Shortcode API Roadmap.
    		$html_regex = implode(
    			'',
    			array(
    				'<',            // Find start of element.
    				'(?(?=!--)',    // Is this a comment?
    				$comment_regex, // Find end of comment.
    				'|',
    				'[^>]*>?',      // Find end of element. If not found, match all input.
    				')',
    			)
    		);
    	}

    	// Start from the HTML alternative and add the shortcode one only when given.
    	$alternatives = $html_regex;
    	if ( ! empty( $shortcode_regex ) ) {
    		$alternatives .= '|' . $shortcode_regex;
    	}

    	return '/(' . $alternatives . ')/';
    }
    
    /**
     * Builds the pattern fragment that matches any of the given shortcodes.
     *
     * The returned string carries no delimiters. Tag names are escaped with
     * preg_quote() and joined into a single alternation.
     *
     * @access private
     * @ignore
     * @since 4.4.0
     *
     * @param string[] $tagnames Array of shortcodes to find.
     * @return string The regular expression
     */
    function _get_wptexturize_shortcode_regex( $tagnames ) {
    	$quoted_names = array_map( 'preg_quote', $tagnames );

    	// Excerpt of get_shortcode_regex(): a tag name must be followed by whitespace, "]" or "/".
    	$name_group = '(?:' . implode( '|', $quoted_names ) . ')(?=[\s\]\/])';

    	$pieces = array(
    		'\[',            // Find start of shortcode.
    		'[\/\[]?',       // Shortcodes may begin with [/ or [[.
    		$name_group,     // Only match registered shortcodes, because performance.
    		'(?:',
    		'[^\[\]<>]+',    // Shortcodes do not contain other shortcodes. Quantifier critical.
    		'|',
    		'<[^\[\]>]*>',   // HTML elements permitted. Prevents matching ] before >.
    		')*+',           // Possessive critical.
    		'\]',            // Find end of shortcode.
    		'\]?',           // Shortcodes may end with ]].
    	);

    	return implode( '', $pieces );
    }
    
    /**
     * Applies replacements inside HTML elements only, leaving other text untouched.
     *
     * The text is split with wp_html_split() and only the odd-indexed pieces of
     * the result are searched. The original string is returned as-is when no
     * piece contained any needle.
     *
     * @since 4.2.3
     *
     * @param string $haystack      The text which has to be formatted.
     * @param array  $replace_pairs In the form array('from' => 'to', ...).
     * @return string The formatted text.
     */
    function wp_replace_in_html_tags( $haystack, $replace_pairs ) {
    	$pieces   = wp_html_split( $haystack );
    	$modified = false;

    	if ( 1 === count( $replace_pairs ) ) {
    		// Single pair: pull it out once and use a plain str_replace().
    		list( $needle )  = array_keys( $replace_pairs );
    		list( $replace ) = array_values( $replace_pairs );

    		// Visit the delimiters (elements) only.
    		for ( $index = 1, $total = count( $pieces ); $index < $total; $index += 2 ) {
    			if ( ! str_contains( $pieces[ $index ], $needle ) ) {
    				continue;
    			}

    			$pieces[ $index ] = str_replace( $needle, $replace, $pieces[ $index ] );
    			$modified         = true;
    		}
    	} else {
    		$needles = array_keys( $replace_pairs );

    		// Visit the delimiters (elements) only.
    		for ( $index = 1, $total = count( $pieces ); $index < $total; $index += 2 ) {
    			foreach ( $needles as $needle ) {
    				if ( ! str_contains( $pieces[ $index ], $needle ) ) {
    					continue;
    				}

    				// A single strtr() applies every pair, so move on to the next element.
    				$pieces[ $index ] = strtr( $pieces[ $index ], $replace_pairs );
    				$modified         = true;
    				break;
    			}
    		}
    	}

    	return $modified ? implode( $pieces ) : $haystack;
    }
    
    /**
     * Newline preservation help function for wpautop().
     *
     * Intended as a preg_replace_callback() callback: it receives the match array
     * and returns the full match ($matches[0]) with every "\n" replaced by the
     * literal given as the second str_replace() argument.
     *
     * NOTE(review): that literal appears as an empty string in this copy of the
     * file, which would delete the newlines rather than preserve them. wpautop()
     * later replaces a newline placeholder with "\n", so the literal was
     * presumably a placeholder tag -- confirm against the canonical source.
     *
     * @since 3.1.0
     * @access private
     *
     * @param array $matches preg_replace_callback matches array
     * @return string
     */
    function _autop_newline_preservation_helper( $matches ) {
    	return str_replace( "\n", '', $matches[0] );
    }
    
    /**
     * Don't auto-p wrap shortcodes that stand alone.
     *
     * Ensures that shortcodes are not wrapped in `<p>...</p>`.
     *
     * @since 2.9.0
     *
     * @global array $shortcode_tags
     *
     * @param string $text The content.
     * @return string The filtered content.
     */
    function shortcode_unautop( $text ) {
    	global $shortcode_tags;

    	// Nothing to unwrap when no shortcodes are registered.
    	if ( empty( $shortcode_tags ) || ! is_array( $shortcode_tags ) ) {
    		return $text;
    	}

    	$tagregexp = implode( '|', array_map( 'preg_quote', array_keys( $shortcode_tags ) ) );
    	$spaces    = wp_spaces_regexp();

    	// phpcs:disable Squiz.Strings.ConcatenationSpacing.PaddingFound,Universal.WhiteSpace.PrecisionAlignment.Found -- don't remove regex indentation
    	$pattern =
    		'/'
    		. '<p>'                              // Opening paragraph.
    		. '(?:' . $spaces . ')*+'            // Optional leading whitespace.
    		. '('                                // 1: The shortcode.
    		.     '\\['                          // Opening bracket.
    		.     "($tagregexp)"                 // 2: Shortcode name.
    		.     '(?![\\w-])'                   // Not followed by word character or hyphen.
    		                                     // Unroll the loop: Inside the opening shortcode tag.
    		.     '[^\\]\\/]*'                   // Not a closing bracket or forward slash.
    		.     '(?:'
    		.         '\\/(?!\\])'               // A forward slash not followed by a closing bracket.
    		.         '[^\\]\\/]*'               // Not a closing bracket or forward slash.
    		.     ')*?'
    		.     '(?:'
    		.         '\\/\\]'                   // Self closing tag and closing bracket.
    		.     '|'
    		.         '\\]'                      // Closing bracket.
    		.         '(?:'                      // Unroll the loop: Optionally, anything between the opening and closing shortcode tags.
    		.             '[^\\[]*+'             // Not an opening bracket.
    		.             '(?:'
    		.                 '\\[(?!\\/\\2\\])' // An opening bracket not followed by the closing shortcode tag.
    		.                 '[^\\[]*+'         // Not an opening bracket.
    		.             ')*+'
    		.             '\\[\\/\\2\\]'         // Closing shortcode tag.
    		.         ')?'
    		.     ')'
    		. ')'
    		. '(?:' . $spaces . ')*+'            // Optional trailing whitespace.
    		. '<\\/p>'                           // Closing paragraph.
    		. '/';
    	// phpcs:enable

    	// Keep only the shortcode (group 1), dropping the surrounding paragraph tags.
    	return preg_replace( $pattern, '$1', $text );
    }

    /**
     * Checks to see if a string is utf8 encoded.
     *
     * NOTE: This function also accepts lead bytes announcing 4 or 5 continuation
     * bytes, although UTF-8 byte sequences have a maximum length of 4.
     *
     * @author bmorel at ssi dot fr (modified)
     * @since 1.2.1
     *
     * @param string $str The string to be checked
     * @return bool True if $str fits a UTF-8 model, false otherwise.
     */
    function seems_utf8( $str ) {
    	mbstring_binary_safe_encoding();
    	$length = strlen( $str );
    	reset_mbstring_encoding();

    	for ( $i = 0; $i < $length; $i++ ) {
    		$c = ord( $str[ $i ] );

    		// Work out from the lead byte how many continuation bytes must follow.
    		if ( $c < 0x80 ) {
    			$n = 0; // 0bbbbbbb
    		} elseif ( ( $c & 0xE0 ) === 0xC0 ) {
    			$n = 1; // 110bbbbb
    		} elseif ( ( $c & 0xF0 ) === 0xE0 ) {
    			$n = 2; // 1110bbbb
    		} elseif ( ( $c & 0xF8 ) === 0xF0 ) {
    			$n = 3; // 11110bbb
    		} elseif ( ( $c & 0xFC ) === 0xF8 ) {
    			$n = 4; // 111110bb
    		} elseif ( ( $c & 0xFE ) === 0xFC ) {
    			$n = 5; // 1111110b
    		} else {
    			return false; // Does not match any model.
    		}

    		for ( $j = 0; $j < $n; $j++ ) { // n bytes matching 10bbbbbb follow ?
    			if ( ( ++$i === $length ) || ( ( ord( $str[ $i ] ) & 0xC0 ) !== 0x80 ) ) {
    				return false;
    			}
    		}
    	}

    	return true;
    }

    /**
     * Converts a number of special characters into their HTML entities.
     *
     * Specifically deals with: `&`, `<`, `>`, `"`, and `'`.
     *
     * `$quote_style` can be set to ENT_COMPAT to encode `"` to
     * `&quot;`, or ENT_QUOTES to do both. Default is ENT_NOQUOTES where no quotes are encoded.
     *
     * @since 1.2.2
     * @since 5.5.0 `$quote_style` also accepts `ENT_XML1`.
     * @access private
     *
     * @param string       $text          The text which is to be encoded.
     * @param int|string   $quote_style   Optional. Converts double quotes if set to ENT_COMPAT,
     *                                    both single and double if set to ENT_QUOTES or none if set to ENT_NOQUOTES.
     *                                    Converts single and double quotes, as well as converting HTML
     *                                    named entities (that are not also XML named entities) to their
     *                                    code points if set to ENT_XML1. Also compatible with old values;
     *                                    converting single quotes if set to 'single',
     *                                    double if set to 'double' or both if otherwise set.
     *                                    Default is ENT_NOQUOTES.
     * @param false|string $charset       Optional. The character encoding of the string. Default false.
     * @param bool         $double_encode Optional. Whether to encode existing HTML entities. Default false.
     * @return string The encoded text with HTML entities.
     */
    function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false, $double_encode = false ) {
    	$text = (string) $text;

    	if ( 0 === strlen( $text ) ) {
    		return '';
    	}

    	// Don't bother if there are no specialchars - saves some processing.
    	if ( ! preg_match( '/[&<>"\']/', $text ) ) {
    		return $text;
    	}

    	// Account for the previous behavior of the function when the $quote_style is not an accepted value.
    	if ( empty( $quote_style ) ) {
    		$quote_style = ENT_NOQUOTES;
    	} elseif ( ENT_XML1 === $quote_style ) {
    		$quote_style = ENT_QUOTES | ENT_XML1;
    	} elseif ( ! in_array( $quote_style, array( ENT_NOQUOTES, ENT_COMPAT, ENT_QUOTES, 'single', 'double' ), true ) ) {
    		$quote_style = ENT_QUOTES;
    	}

    	$charset = _canonical_charset( $charset ? $charset : get_option( 'blog_charset' ) );

    	// Remember the requested style; the legacy string values are mapped to flags below.
    	$_quote_style = $quote_style;

    	if ( 'double' === $quote_style ) {
    		$quote_style  = ENT_COMPAT;
    		$_quote_style = ENT_COMPAT;
    	} elseif ( 'single' === $quote_style ) {
    		$quote_style = ENT_NOQUOTES;
    	}

    	if ( ! $double_encode ) {
    		/*
    		 * Guarantee every &entity; is valid, convert &garbage; into &amp;garbage;
    		 * This is required for PHP < 5.4.0 because ENT_HTML401 flag is unavailable.
    		 */
    		$text = wp_kses_normalize_entities( $text, ( $quote_style & ENT_XML1 ) ? 'xml' : 'html' );
    	}

    	$text = htmlspecialchars( $text, $quote_style, $charset, $double_encode );

    	// Back-compat: 'single' encodes single quotes only, which no htmlspecialchars() flag offers.
    	if ( 'single' === $_quote_style ) {
    		$text = str_replace( "'", '&#039;', $text );
    	}

    	return $text;
    }

    /**
     * Converts a number of HTML entities into their special characters.
     *
     * Specifically deals with: `&amp;`, `&lt;`, `&gt;`, `&quot;`, and `&#039;`,
     * including their zero-padded decimal and hexadecimal numeric forms.
     *
     * `$quote_style` can be set to ENT_COMPAT to decode `&quot;` entities,
     * or ENT_QUOTES to do both `&quot;` and `&#039;`. Default is ENT_NOQUOTES where no quotes are decoded.
     *
     * @since 2.8.0
     *
     * @param string     $text        The text which is to be decoded.
     * @param string|int $quote_style Optional. Converts double quotes if set to ENT_COMPAT,
     *                                both single and double if set to ENT_QUOTES or
     *                                none if set to ENT_NOQUOTES.
     *                                Also compatible with old _wp_specialchars() values;
     *                                converting single quotes if set to 'single',
     *                                double if set to 'double' or both if otherwise set.
     *                                Default is ENT_NOQUOTES.
     * @return string The decoded text without HTML entities.
     */
    function wp_specialchars_decode( $text, $quote_style = ENT_NOQUOTES ) {
    	$text = (string) $text;

    	if ( 0 === strlen( $text ) ) {
    		return '';
    	}

    	// Don't bother if there are no entities - saves a lot of processing.
    	if ( ! str_contains( $text, '&' ) ) {
    		return $text;
    	}

    	// Match the previous behavior of _wp_specialchars() when the $quote_style is not an accepted value.
    	if ( empty( $quote_style ) ) {
    		$quote_style = ENT_NOQUOTES;
    	} elseif ( ! in_array( $quote_style, array( 0, 2, 3, 'single', 'double' ), true ) ) {
    		$quote_style = ENT_QUOTES;
    	}

    	// More complete than get_html_translation_table( HTML_SPECIALCHARS ).
    	$single      = array(
    		'&#039;' => '\'',
    		'&#x27;' => '\'',
    	);
    	$single_preg = array(
    		'/&#0*39;/'   => '&#039;',
    		'/&#x0*27;/i' => '&#x27;',
    	);
    	$double      = array(
    		'&quot;' => '"',
    		'&#034;' => '"',
    		'&#x22;' => '"',
    	);
    	$double_preg = array(
    		'/&#0*34;/'   => '&#034;',
    		'/&#x0*22;/i' => '&#x22;',
    	);
    	$others      = array(
    		'&lt;'   => '<',
    		'&#060;' => '<',
    		'&gt;'   => '>',
    		'&#062;' => '>',
    		'&amp;'  => '&',
    		'&#038;' => '&',
    		'&#x26;' => '&',
    	);
    	$others_preg = array(
    		'/&#0*60;/'   => '&#060;',
    		'/&#0*62;/'   => '&#062;',
    		'/&#0*38;/'   => '&#038;',
    		'/&#x0*26;/i' => '&#x26;',
    	);

    	if ( ENT_QUOTES === $quote_style ) {
    		$translation      = array_merge( $single, $double, $others );
    		$translation_preg = array_merge( $single_preg, $double_preg, $others_preg );
    	} elseif ( ENT_COMPAT === $quote_style || 'double' === $quote_style ) {
    		$translation      = array_merge( $double, $others );
    		$translation_preg = array_merge( $double_preg, $others_preg );
    	} elseif ( 'single' === $quote_style ) {
    		$translation      = array_merge( $single, $others );
    		$translation_preg = array_merge( $single_preg, $others_preg );
    	} elseif ( ENT_NOQUOTES === $quote_style ) {
    		$translation      = $others;
    		$translation_preg = $others_preg;
    	}

    	// Remove zero padding on numeric entities so they match the translation table keys.
    	$text = preg_replace( array_keys( $translation_preg ), array_values( $translation_preg ), $text );

    	// Replace characters according to translation table.
    	return strtr( $text, $translation );
    }

    /**
     * Checks for invalid UTF8 in a string.
     *
     * @since 2.8.0
     *
     * @param string $text  The text which is to be checked.
     * @param bool   $strip Optional. Whether to attempt to strip out invalid UTF8. Default false.
     * @return string The checked text.
     */
    function wp_check_invalid_utf8( $text, $strip = false ) {
    	$text = (string) $text;

    	if ( 0 === strlen( $text ) ) {
    		return '';
    	}

    	// Store the site charset as a static to avoid multiple calls to get_option().
    	static $is_utf8 = null;
    	if ( ! isset( $is_utf8 ) ) {
    		$is_utf8 = is_utf8_charset();
    	}
    	if ( ! $is_utf8 ) {
    		return $text;
    	}

    	// Check for support for utf8 in the installed PCRE library once and store the result in a static.
    	static $utf8_pcre = null;
    	if ( ! isset( $utf8_pcre ) ) {
    		// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
    		$utf8_pcre = @preg_match( '/^./u', 'a' );
    	}
    	// We can't demand utf8 in the PCRE installation, so just return the string in those cases.
    	if ( ! $utf8_pcre ) {
    		return $text;
    	}

    	// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- preg_match fails when it encounters invalid UTF8 in $text.
    	if ( 1 === @preg_match( '/^./us', $text ) ) {
    		return $text;
    	}

    	// Attempt to strip the bad chars if requested (not recommended).
    	if ( $strip && function_exists( 'iconv' ) ) {
    		return iconv( 'utf-8', 'utf-8', $text );
    	}

    	return '';
    }

    /**
     * Encodes the Unicode values to be used in the URI.
     *
     * @since 1.5.0
     * @since 5.8.3 Added the `encode_ascii_characters` parameter.
     *
     * @param string $utf8_string             String to encode.
     * @param int    $length                  Max length of the string
     * @param bool   $encode_ascii_characters Whether to encode ascii characters such as < " '
     * @return string String with Unicode encoded for URI.
     */
    function utf8_uri_encode( $utf8_string, $length = 0, $encode_ascii_characters = false ) {
    	$unicode        = '';
    	$values         = array();
    	$num_octets     = 1;
    	$unicode_length = 0;

    	mbstring_binary_safe_encoding();
    	$string_length = strlen( $utf8_string );
    	reset_mbstring_encoding();

    	for ( $i = 0; $i < $string_length; $i++ ) {

    		$value = ord( $utf8_string[ $i ] );

    		if ( $value < 128 ) {
    			// ASCII byte: copied through, or percent-encoded when requested.
    			$char                = chr( $value );
    			$encoded_char        = $encode_ascii_characters ? rawurlencode( $char ) : $char;
    			$encoded_char_length = strlen( $encoded_char );
    			if ( $length && ( $unicode_length + $encoded_char_length ) > $length ) {
    				break;
    			}
    			$unicode        .= $encoded_char;
    			$unicode_length += $encoded_char_length;
    		} else {
    			// First byte of a multi-byte sequence decides how many octets to collect.
    			if ( count( $values ) === 0 ) {
    				if ( $value < 224 ) {
    					$num_octets = 2;
    				} elseif ( $value < 240 ) {
    					$num_octets = 3;
    				} else {
    					$num_octets = 4;
    				}
    			}

    			$values[] = $value;

    			// Each octet becomes three characters ("%xx"); stop before exceeding the limit.
    			if ( $length && ( $unicode_length + ( $num_octets * 3 ) ) > $length ) {
    				break;
    			}
    			if ( count( $values ) === $num_octets ) {
    				for ( $j = 0; $j < $num_octets; $j++ ) {
    					$unicode .= '%' . dechex( $values[ $j ] );
    				}

    				$unicode_length += $num_octets * 3;

    				$values     = array();
    				$num_octets = 1;
    			}
    		}
    	}

    	return $unicode;
    }

    /** * Converts all accent characters to ASCII characters. * * If there are no accent characters, then the string given is just returned. * * **Accent characters converted:** * * Currency signs: * * | Code | Glyph | Replacement | Description | * | -------- | ----- | ----------- | ------------------- | * | U+00A3 | £ | (empty) | British Pound sign | * | U+20AC | € | E | Euro sign | * * Decompositions for Latin-1 Supplement: * * | Code | Glyph | Replacement | Description | * | ------- | ----- | ----------- | -------------------------------------- | * | U+00AA | ª | a | Feminine ordinal indicator | * | U+00BA | º | o | Masculine ordinal indicator | * | U+00C0 | À | A | Latin capital letter A with grave | * | U+00C1 | Á | A | Latin capital letter A with acute | * | U+00C2 | Â | A | Latin capital letter A with circumflex | * | U+00C3 | Ã | A | Latin capital letter A with tilde | * | U+00C4 | Ä | A | Latin capital letter A with diaeresis | * | U+00C5 | Å | A | Latin capital letter A with ring above | * | U+00C6 | Æ | AE | Latin capital letter AE | * | U+00C7 | Ç | C | Latin capital letter C with cedilla | * | U+00C8 | È | E | Latin capital letter E with grave | * | U+00C9 | É | E | Latin capital letter E with acute | * | U+00CA | Ê | E | Latin capital letter E with circumflex 
| * | U+00CB | Ë | E | Latin capital letter E with diaeresis | * | U+00CC | Ì | I | Latin capital letter I with grave | * | U+00CD | Í | I | Latin capital letter I with acute | * | U+00CE | Î | I | Latin capital letter I with circumflex | * | U+00CF | Ï | I | Latin capital letter I with diaeresis | * | U+00D0 | Ð | D | Latin capital letter Eth | * | U+00D1 | Ñ | N | Latin capital letter N with tilde | * | U+00D2 | Ò | O | Latin capital letter O with grave | * | U+00D3 | Ó | O | Latin capital letter O with acute | * | U+00D4 | Ô | O | Latin capital letter O with circumflex | * | U+00D5 | Õ | O | Latin capital letter O with tilde | * | U+00D6 | Ö | O | Latin capital letter O with diaeresis | * | U+00D8 | Ø | O | Latin capital letter O with stroke | * | U+00D9 | Ù | U | Latin capital letter U with grave | * | U+00DA | Ú | U | Latin capital letter U with acute | * | U+00DB | Û | U | Latin capital letter U with circumflex | * | U+00DC | Ü | U | Latin capital letter U with diaeresis | * | U+00DD | Ý | Y | Latin capital letter Y with acute | * | U+00DE | Þ | TH | Latin capital letter Thorn | * | U+00DF | ß | s | Latin small letter sharp s | * | U+00E0 | à | a | Latin small letter a with grave | * | U+00E1 | á | a | Latin small letter a with acute | * | U+00E2 | â | a | Latin small letter a with circumflex | * | U+00E3 | ã | a | Latin small letter a with tilde | * | U+00E4 | ä | a | Latin small letter a with diaeresis | * | U+00E5 | å | a | Latin small letter a with ring above | * | U+00E6 | æ | ae | Latin small letter ae | * | U+00E7 | ç | c | Latin small letter c with cedilla | * | U+00E8 | è | e | Latin small letter e with grave | * | U+00E9 | é | e | Latin small letter e with acute | * | U+00EA | ê | e | Latin small letter e with circumflex | * | U+00EB | ë | e | Latin small letter e with diaeresis | * | U+00EC | ì | i | Latin small letter i with grave | * | U+00ED | í | i | Latin small letter i with acute | * | U+00EE | î | i | Latin small letter i with circumflex | * 
| U+00EF | ï | i | Latin small letter i with diaeresis | * | U+00F0 | ð | d | Latin small letter Eth | * | U+00F1 | ñ | n | Latin small letter n with tilde | * | U+00F2 | ò | o | Latin small letter o with grave | * | U+00F3 | ó | o | Latin small letter o with acute | * | U+00F4 | ô | o | Latin small letter o with circumflex | * | U+00F5 | õ | o | Latin small letter o with tilde | * | U+00F6 | ö | o | Latin small letter o with diaeresis | * | U+00F8 | ø | o | Latin small letter o with stroke | * | U+00F9 | ù | u | Latin small letter u with grave | * | U+00FA | ú | u | Latin small letter u with acute | * | U+00FB | û | u | Latin small letter u with circumflex | * | U+00FC | ü | u | Latin small letter u with diaeresis | * | U+00FD | ý | y | Latin small letter y with acute | * | U+00FE | þ | th | Latin small letter Thorn | * | U+00FF | ÿ | y | Latin small letter y with diaeresis | * * Decompositions for Latin Extended-A: * * | Code | Glyph | Replacement | Description | * | ------- | ----- | ----------- | ------------------------------------------------- | * | U+0100 | Ā | A | Latin capital letter A with macron | * | U+0101 | ā | a | Latin small letter a with macron | * | U+0102 | Ă | A | Latin capital letter A with breve | * | U+0103 | ă | a | Latin small letter a with breve | * | U+0104 | Ą | A | Latin capital letter A with ogonek | * | U+0105 | ą | a | Latin small letter a with ogonek | * | U+0106 | Ć | C | Latin capital letter C with acute | * | U+0107 | ć | c | Latin small letter c with acute | * | U+0108 | Ĉ | C | Latin capital letter C with circumflex | * | U+0109 | ĉ | c | Latin small letter c with circumflex | * | U+010A | Ċ | C | Latin capital letter C with dot above | * | U+010B | ċ | c | Latin small letter c with dot above | * | U+010C | Č | C | Latin capital letter C with caron | * | U+010D | č | c | Latin small letter c with caron | * | U+010E | Ď | D | Latin capital letter D with caron | * | U+010F | ď | d | Latin small letter d with caron | * | U+0110 | 
Đ | D | Latin capital letter D with stroke | * | U+0111 | đ | d | Latin small letter d with stroke | * | U+0112 | Ē | E | Latin capital letter E with macron | * | U+0113 | ē | e | Latin small letter e with macron | * | U+0114 | Ĕ | E | Latin capital letter E with breve | * | U+0115 | ĕ | e | Latin small letter e with breve | * | U+0116 | Ė | E | Latin capital letter E with dot above | * | U+0117 | ė | e | Latin small letter e with dot above | * | U+0118 | Ę | E | Latin capital letter E with ogonek | * | U+0119 | ę | e | Latin small letter e with ogonek | * | U+011A | Ě | E | Latin capital letter E with caron | * | U+011B | ě | e | Latin small letter e with caron | * | U+011C | Ĝ | G | Latin capital letter G with circumflex | * | U+011D | ĝ | g | Latin small letter g with circumflex | * | U+011E | Ğ | G | Latin capital letter G with breve | * | U+011F | ğ | g | Latin small letter g with breve | * | U+0120 | Ġ | G | Latin capital letter G with dot above | * | U+0121 | ġ | g | Latin small letter g with dot above | * | U+0122 | Ģ | G | Latin capital letter G with cedilla | * | U+0123 | ģ | g | Latin small letter g with cedilla | * | U+0124 | Ĥ | H | Latin capital letter H with circumflex | * | U+0125 | ĥ | h | Latin small letter h with circumflex | * | U+0126 | Ħ | H | Latin capital letter H with stroke | * | U+0127 | ħ | h | Latin small letter h with stroke | * | U+0128 | Ĩ | I | Latin capital letter I with tilde | * | U+0129 | ĩ | i | Latin small letter i with tilde | * | U+012A | Ī | I | Latin capital letter I with macron | * | U+012B | ī | i | Latin small letter i with macron | * | U+012C | Ĭ | I | Latin capital letter I with breve | * | U+012D | ĭ | i | Latin small letter i with breve | * | U+012E | Į | I | Latin capital letter I with ogonek | * | U+012F | į | i | Latin small letter i with ogonek | * | U+0130 | İ | I | Latin capital letter I with dot above | * | U+0131 | ı | i | Latin small letter dotless i | * | U+0132 | IJ | IJ | Latin capital ligature IJ | * | 
U+0133 | ij | ij | Latin small ligature ij | * | U+0134 | Ĵ | J | Latin capital letter J with circumflex | * | U+0135 | ĵ | j | Latin small letter j with circumflex | * | U+0136 | Ķ | K | Latin capital letter K with cedilla | * | U+0137 | ķ | k | Latin small letter k with cedilla | * | U+0138 | ĸ | k | Latin small letter Kra | * | U+0139 | Ĺ | L | Latin capital letter L with acute | * | U+013A | ĺ | l | Latin small letter l with acute | * | U+013B | Ļ | L | Latin capital letter L with cedilla | * | U+013C | ļ | l | Latin small letter l with cedilla | * | U+013D | Ľ | L | Latin capital letter L with caron | * | U+013E | ľ | l | Latin small letter l with caron | * | U+013F | Ŀ | L | Latin capital letter L with middle dot | * | U+0140 | ŀ | l | Latin small letter l with middle dot | * | U+0141 | Ł | L | Latin capital letter L with stroke | * | U+0142 | ł | l | Latin small letter l with stroke | * | U+0143 | Ń | N | Latin capital letter N with acute | * | U+0144 | ń | n | Latin small letter N with acute | * | U+0145 | Ņ | N | Latin capital letter N with cedilla | * | U+0146 | ņ | n | Latin small letter n with cedilla | * | U+0147 | Ň | N | Latin capital letter N with caron | * | U+0148 | ň | n | Latin small letter n with caron | * | U+0149 | ʼn | n | Latin small letter n preceded by apostrophe | * | U+014A | Ŋ | N | Latin capital letter Eng | * | U+014B | ŋ | n | Latin small letter Eng | * | U+014C | Ō | O | Latin capital letter O with macron | * | U+014D | ō | o | Latin small letter o with macron | * | U+014E | Ŏ | O | Latin capital letter O with breve | * | U+014F | ŏ | o | Latin small letter o with breve | * | U+0150 | Ő | O | Latin capital letter O with double acute | * | U+0151 | ő | o | Latin small letter o with double acute | * | U+0152 | Œ | OE | Latin capital ligature OE | * | U+0153 | œ | oe | Latin small ligature oe | * | U+0154 | Ŕ | R | Latin capital letter R with acute | * | U+0155 | ŕ | r | Latin small letter r with acute | * | U+0156 | Ŗ | R | Latin 
capital letter R with cedilla | * | U+0157 | ŗ | r | Latin small letter r with cedilla | * | U+0158 | Ř | R | Latin capital letter R with caron | * | U+0159 | ř | r | Latin small letter r with caron | * | U+015A | Ś | S | Latin capital letter S with acute | * | U+015B | ś | s | Latin small letter s with acute | * | U+015C | Ŝ | S | Latin capital letter S with circumflex | * | U+015D | ŝ | s | Latin small letter s with circumflex | * | U+015E | Ş | S | Latin capital letter S with cedilla | * | U+015F | ş | s | Latin small letter s with cedilla | * | U+0160 | Š | S | Latin capital letter S with caron | * | U+0161 | š | s | Latin small letter s with caron | * | U+0162 | Ţ | T | Latin capital letter T with cedilla | * | U+0163 | ţ | t | Latin small letter t with cedilla | * | U+0164 | Ť | T | Latin capital letter T with caron | * | U+0165 | ť | t | Latin small letter t with caron | * | U+0166 | Ŧ | T | Latin capital letter T with stroke | * | U+0167 | ŧ | t | Latin small letter t with stroke | * | U+0168 | Ũ | U | Latin capital letter U with tilde | * | U+0169 | ũ | u | Latin small letter u with tilde | * | U+016A | Ū | U | Latin capital letter U with macron | * | U+016B | ū | u | Latin small letter u with macron | * | U+016C | Ŭ | U | Latin capital letter U with breve | * | U+016D | ŭ | u | Latin small letter u with breve | * | U+016E | Ů | U | Latin capital letter U with ring above | * | U+016F | ů | u | Latin small letter u with ring above | * | U+0170 | Ű | U | Latin capital letter U with double acute | * | U+0171 | ű | u | Latin small letter u with double acute | * | U+0172 | Ų | U | Latin capital letter U with ogonek | * | U+0173 | ų | u | Latin small letter u with ogonek | * | U+0174 | Ŵ | W | Latin capital letter W with circumflex | * | U+0175 | ŵ | w | Latin small letter w with circumflex | * | U+0176 | Ŷ | Y | Latin capital letter Y with circumflex | * | U+0177 | ŷ | y | Latin small letter y with circumflex | * | U+0178 | Ÿ | Y | Latin capital letter Y with 
diaeresis | * | U+0179 | Ź | Z | Latin capital letter Z with acute | * | U+017A | ź | z | Latin small letter z with acute | * | U+017B | Ż | Z | Latin capital letter Z with dot above | * | U+017C | ż | z | Latin small letter z with dot above | * | U+017D | Ž | Z | Latin capital letter Z with caron | * | U+017E | ž | z | Latin small letter z with caron | * | U+017F | ſ | s | Latin small letter long s | * | U+01A0 | Ơ | O | Latin capital letter O with horn | * | U+01A1 | ơ | o | Latin small letter o with horn | * | U+01AF | Ư | U | Latin capital letter U with horn | * | U+01B0 | ư | u | Latin small letter u with horn | * | U+01CD | Ǎ | A | Latin capital letter A with caron | * | U+01CE | ǎ | a | Latin small letter a with caron | * | U+01CF | Ǐ | I | Latin capital letter I with caron | * | U+01D0 | ǐ | i | Latin small letter i with caron | * | U+01D1 | Ǒ | O | Latin capital letter O with caron | * | U+01D2 | ǒ | o | Latin small letter o with caron | * | U+01D3 | Ǔ | U | Latin capital letter U with caron | * | U+01D4 | ǔ | u | Latin small letter u with caron | * | U+01D5 | Ǖ | U | Latin capital letter U with diaeresis and macron | * | U+01D6 | ǖ | u | Latin small letter u with diaeresis and macron | * | U+01D7 | Ǘ | U | Latin capital letter U with diaeresis and acute | * | U+01D8 | ǘ | u | Latin small letter u with diaeresis and acute | * | U+01D9 | Ǚ | U | Latin capital letter U with diaeresis and caron | * | U+01DA | ǚ | u | Latin small letter u with diaeresis and caron | * | U+01DB | Ǜ | U | Latin capital letter U with diaeresis and grave | * | U+01DC | ǜ | u | Latin small letter u with diaeresis and grave | * * Decompositions for Latin Extended-B: * * | Code | Glyph | Replacement | Description | * | -------- | ----- | ----------- | ----------------------------------------- | * | U+018F | Ə | E | Latin capital letter Ə | * | U+0259 | ǝ | e | Latin small letter ǝ | * | U+0218 | Ș | S | Latin capital letter S with comma below | * | U+0219 | ș | s | Latin small letter 
s with comma below | * | U+021A | Ț | T | Latin capital letter T with comma below | * | U+021B | ț | t | Latin small letter t with comma below | * * Vowels with diacritic (Chinese, Hanyu Pinyin): * * | Code | Glyph | Replacement | Description | * | -------- | ----- | ----------- | ----------------------------------------------------- | * | U+0251 | ɑ | a | Latin small letter alpha | * | U+1EA0 | Ạ | A | Latin capital letter A with dot below | * | U+1EA1 | ạ | a | Latin small letter a with dot below | * | U+1EA2 | Ả | A | Latin capital letter A with hook above | * | U+1EA3 | ả | a | Latin small letter a with hook above | * | U+1EA4 | Ấ | A | Latin capital letter A with circumflex and acute | * | U+1EA5 | ấ | a | Latin small letter a with circumflex and acute | * | U+1EA6 | Ầ | A | Latin capital letter A with circumflex and grave | * | U+1EA7 | ầ | a | Latin small letter a with circumflex and grave | * | U+1EA8 | Ẩ | A | Latin capital letter A with circumflex and hook above | * | U+1EA9 | ẩ | a | Latin small letter a with circumflex and hook above | * | U+1EAA | Ẫ | A | Latin capital letter A with circumflex and tilde | * | U+1EAB | ẫ | a | Latin small letter a with circumflex and tilde | * | U+1EAC | Ậ | A | Latin capital letter A with circumflex and dot below | * | U+1EAD | ậ | a | Latin small letter a with circumflex and dot below | * | U+1EAE | Ắ | A | Latin capital letter A with breve and acute | * | U+1EAF | ắ | a | Latin small letter a with breve and acute | * | U+1EB0 | Ằ | A | Latin capital letter A with breve and grave | * | U+1EB1 | ằ | a | Latin small letter a with breve and grave | * | U+1EB2 | Ẳ | A | Latin capital letter A with breve and hook above | * | U+1EB3 | ẳ | a | Latin small letter a with breve and hook above | * | U+1EB4 | Ẵ | A | Latin capital letter A with breve and tilde | * | U+1EB5 | ẵ | a | Latin small letter a with breve and tilde | * | U+1EB6 | Ặ | A | Latin capital letter A with breve and dot below | * | U+1EB7 | ặ | a | Latin small 
letter a with breve and dot below | * | U+1EB8 | Ẹ | E | Latin capital letter E with dot below | * | U+1EB9 | ẹ | e | Latin small letter e with dot below | * | U+1EBA | Ẻ | E | Latin capital letter E with hook above | * | U+1EBB | ẻ | e | Latin small letter e with hook above | * | U+1EBC | Ẽ | E | Latin capital letter E with tilde | * | U+1EBD | ẽ | e | Latin small letter e with tilde | * | U+1EBE | Ế | E | Latin capital letter E with circumflex and acute | * | U+1EBF | ế | e | Latin small letter e with circumflex and acute | * | U+1EC0 | Ề | E | Latin capital letter E with circumflex and grave | * | U+1EC1 | ề | e | Latin small letter e with circumflex and grave | * | U+1EC2 | Ể | E | Latin capital letter E with circumflex and hook above | * | U+1EC3 | ể | e | Latin small letter e with circumflex and hook above | * | U+1EC4 | Ễ | E | Latin capital letter E with circumflex and tilde | * | U+1EC5 | ễ | e | Latin small letter e with circumflex and tilde | * | U+1EC6 | Ệ | E | Latin capital letter E with circumflex and dot below | * | U+1EC7 | ệ | e | Latin small letter e with circumflex and dot below | * | U+1EC8 | Ỉ | I | Latin capital letter I with hook above | * | U+1EC9 | ỉ | i | Latin small letter i with hook above | * | U+1ECA | Ị | I | Latin capital letter I with dot below | * | U+1ECB | ị | i | Latin small letter i with dot below | * | U+1ECC | Ọ | O | Latin capital letter O with dot below | * | U+1ECD | ọ | o | Latin small letter o with dot below | * | U+1ECE | Ỏ | O | Latin capital letter O with hook above | * | U+1ECF | ỏ | o | Latin small letter o with hook above | * | U+1ED0 | Ố | O | Latin capital letter O with circumflex and acute | * | U+1ED1 | ố | o | Latin small letter o with circumflex and acute | * | U+1ED2 | Ồ | O | Latin capital letter O with circumflex and grave | * | U+1ED3 | ồ | o | Latin small letter o with circumflex and grave | * | U+1ED4 | Ổ | O | Latin capital letter O with circumflex and hook above | * | U+1ED5 | ổ | o | Latin small 
letter o with circumflex and hook above | * | U+1ED6 | Ỗ | O | Latin capital letter O with circumflex and tilde | * | U+1ED7 | ỗ | o | Latin small letter o with circumflex and tilde | * | U+1ED8 | Ộ | O | Latin capital letter O with circumflex and dot below | * | U+1ED9 | ộ | o | Latin small letter o with circumflex and dot below | * | U+1EDA | Ớ | O | Latin capital letter O with horn and acute | * | U+1EDB | ớ | o | Latin small letter o with horn and acute | * | U+1EDC | Ờ | O | Latin capital letter O with horn and grave | * | U+1EDD | ờ | o | Latin small letter o with horn and grave | * | U+1EDE | Ở | O | Latin capital letter O with horn and hook above | * | U+1EDF | ở | o | Latin small letter o with horn and hook above | * | U+1EE0 | Ỡ | O | Latin capital letter O with horn and tilde | * | U+1EE1 | ỡ | o | Latin small letter o with horn and tilde | * | U+1EE2 | Ợ | O | Latin capital letter O with horn and dot below | * | U+1EE3 | ợ | o | Latin small letter o with horn and dot below | * | U+1EE4 | Ụ | U | Latin capital letter U with dot below | * | U+1EE5 | ụ | u | Latin small letter u with dot below | * | U+1EE6 | Ủ | U | Latin capital letter U with hook above | * | U+1EE7 | ủ | u | Latin small letter u with hook above | * | U+1EE8 | Ứ | U | Latin capital letter U with horn and acute | * | U+1EE9 | ứ | u | Latin small letter u with horn and acute | * | U+1EEA | Ừ | U | Latin capital letter U with horn and grave | * | U+1EEB | ừ | u | Latin small letter u with horn and grave | * | U+1EEC | Ử | U | Latin capital letter U with horn and hook above | * | U+1EED | ử | u | Latin small letter u with horn and hook above | * | U+1EEE | Ữ | U | Latin capital letter U with horn and tilde | * | U+1EEF | ữ | u | Latin small letter u with horn and tilde | * | U+1EF0 | Ự | U | Latin capital letter U with horn and dot below | * | U+1EF1 | ự | u | Latin small letter u with horn and dot below | * | U+1EF2 | Ỳ | Y | Latin capital letter Y with grave | * | U+1EF3 | ỳ | y | Latin 
small letter y with grave | * | U+1EF4 | Ỵ | Y | Latin capital letter Y with dot below | * | U+1EF5 | ỵ | y | Latin small letter y with dot below | * | U+1EF6 | Ỷ | Y | Latin capital letter Y with hook above | * | U+1EF7 | ỷ | y | Latin small letter y with hook above | * | U+1EF8 | Ỹ | Y | Latin capital letter Y with tilde | * | U+1EF9 | ỹ | y | Latin small letter y with tilde | * * German (`de_DE`), German formal (`de_DE_formal`), German (Switzerland) formal (`de_CH`), * German (Switzerland) informal (`de_CH_informal`), and German (Austria) (`de_AT`) locales: * * | Code | Glyph | Replacement | Description | * | -------- | ----- | ----------- | --------------------------------------- | * | U+00C4 | Ä | Ae | Latin capital letter A with diaeresis | * | U+00E4 | ä | ae | Latin small letter a with diaeresis | * | U+00D6 | Ö | Oe | Latin capital letter O with diaeresis | * | U+00F6 | ö | oe | Latin small letter o with diaeresis | * | U+00DC | Ü | Ue | Latin capital letter U with diaeresis | * | U+00FC | ü | ue | Latin small letter u with diaeresis | * | U+00DF | ß | ss | Latin small letter sharp s | * * Danish (`da_DK`) locale: * * | Code | Glyph | Replacement | Description | * | -------- | ----- | ----------- | --------------------------------------- | * | U+00C6 | Æ | Ae | Latin capital letter AE | * | U+00E6 | æ | ae | Latin small letter ae | * | U+00D8 | Ø | Oe | Latin capital letter O with stroke | * | U+00F8 | ø | oe | Latin small letter o with stroke | * | U+00C5 | Å | Aa | Latin capital letter A with ring above | * | U+00E5 | å | aa | Latin small letter a with ring above | * * Catalan (`ca`) locale: * * | Code | Glyph | Replacement | Description | * | -------- | ----- | ----------- | --------------------------------------- | * | U+00B7 | l·l | ll | Flown dot (between two Ls) | * * Serbian (`sr_RS`) and Bosnian (`bs_BA`) locales: * * | Code | Glyph | Replacement | Description | * | -------- | ----- | ----------- | --------------------------------------- | * | 
U+0110 | Đ | DJ | Latin capital letter D with stroke |
 * | U+0111 | đ | dj | Latin small letter d with stroke |
 *
 * @since 1.2.1
 * @since 4.6.0 Added locale support for `de_CH`, `de_CH_informal`, and `ca`.
 * @since 4.7.0 Added locale support for `sr_RS`.
 * @since 4.8.0 Added locale support for `bs_BA`.
 * @since 5.7.0 Added locale support for `de_AT`.
 * @since 6.0.0 Added the `$locale` parameter.
 * @since 6.1.0 Added Unicode NFC encoding normalization support.
 *
 * @param string $text   Text that might have accent characters.
 * @param string $locale Optional. The locale to use for accent removal. Some character
 *                       replacements depend on the locale being used (e.g. 'de_DE').
 *                       Defaults to the current locale.
 * @return string Filtered string with replaced "nice" characters.
 */
function remove_accents( $text, $locale = '' ) {
	// Pure 7-bit input cannot contain anything this function replaces.
	if ( ! preg_match( '/[\x80-\xff]/', $text ) ) {
		return $text;
	}

	if ( seems_utf8( $text ) ) {

		/*
		 * Unicode sequence normalization from NFD (Normalization Form Decomposed)
		 * to NFC (Normalization Form [Pre]Composed), the encoding used in this function.
		 * Skipped silently when the normalizer functions are unavailable.
		 */
		if ( function_exists( 'normalizer_is_normalized' )
			&& function_exists( 'normalizer_normalize' )
		) {
			if ( ! normalizer_is_normalized( $text ) ) {
				$text = normalizer_normalize( $text );
			}
		}

		$chars = array(
			// Decompositions for Latin-1 Supplement.
			'ª' => 'a', 'º' => 'o',
			'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A',
			'Æ' => 'AE', 'Ç' => 'C',
			'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E',
			'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
			'Ð' => 'D', 'Ñ' => 'N',
			'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O',
			'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U',
			'Ý' => 'Y', 'Þ' => 'TH', 'ß' => 's',
			'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a',
			'æ' => 'ae', 'ç' => 'c',
			'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e',
			'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
			'ð' => 'd', 'ñ' => 'n',
			'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
			'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
			'ý' => 'y', 'þ' => 'th', 'ÿ' => 'y', 'Ø' => 'O',

			// Decompositions for Latin Extended-A.
			'Ā' => 'A', 'ā' => 'a', 'Ă' => 'A', 'ă' => 'a', 'Ą' => 'A', 'ą' => 'a',
			'Ć' => 'C', 'ć' => 'c', 'Ĉ' => 'C', 'ĉ' => 'c', 'Ċ' => 'C', 'ċ' => 'c', 'Č' => 'C', 'č' => 'c',
			'Ď' => 'D', 'ď' => 'd', 'Đ' => 'D', 'đ' => 'd',
			'Ē' => 'E', 'ē' => 'e', 'Ĕ' => 'E', 'ĕ' => 'e', 'Ė' => 'E', 'ė' => 'e',
			'Ę' => 'E', 'ę' => 'e', 'Ě' => 'E', 'ě' => 'e',
			'Ĝ' => 'G', 'ĝ' => 'g', 'Ğ' => 'G', 'ğ' => 'g', 'Ġ' => 'G', 'ġ' => 'g', 'Ģ' => 'G', 'ģ' => 'g',
			'Ĥ' => 'H', 'ĥ' => 'h', 'Ħ' => 'H', 'ħ' => 'h',
			'Ĩ' => 'I', 'ĩ' => 'i', 'Ī' => 'I', 'ī' => 'i', 'Ĭ' => 'I', 'ĭ' => 'i',
			'Į' => 'I', 'į' => 'i', 'İ' => 'I', 'ı' => 'i',
			/*
			 * U+0132 / U+0133 (IJ ligatures). Written as code point escapes so the keys
			 * cannot be silently folded into the plain ASCII pairs "IJ" / "ij" by an
			 * editor or transfer step, which would turn these entries into no-ops.
			 */
			"\u{0132}" => 'IJ', "\u{0133}" => 'ij',
			'Ĵ' => 'J', 'ĵ' => 'j',
			'Ķ' => 'K', 'ķ' => 'k', 'ĸ' => 'k',
			'Ĺ' => 'L', 'ĺ' => 'l', 'Ļ' => 'L', 'ļ' => 'l', 'Ľ' => 'L', 'ľ' => 'l',
			'Ŀ' => 'L', 'ŀ' => 'l', 'Ł' => 'L', 'ł' => 'l',
			'Ń' => 'N', 'ń' => 'n', 'Ņ' => 'N', 'ņ' => 'n', 'Ň' => 'N', 'ň' => 'n',
			// U+0149 (n preceded by apostrophe), escaped for the same reason as the IJ ligatures.
			"\u{0149}" => 'n',
			'Ŋ' => 'N', 'ŋ' => 'n',
			'Ō' => 'O', 'ō' => 'o', 'Ŏ' => 'O', 'ŏ' => 'o', 'Ő' => 'O', 'ő' => 'o',
			'Œ' => 'OE', 'œ' => 'oe',
			'Ŕ' => 'R', 'ŕ' => 'r', 'Ŗ' => 'R', 'ŗ' => 'r', 'Ř' => 'R', 'ř' => 'r',
			'Ś' => 'S', 'ś' => 's', 'Ŝ' => 'S', 'ŝ' => 's', 'Ş' => 'S', 'ş' => 's', 'Š' => 'S', 'š' => 's',
			'Ţ' => 'T', 'ţ' => 't', 'Ť' => 'T', 'ť' => 't', 'Ŧ' => 'T', 'ŧ' => 't',
			'Ũ' => 'U', 'ũ' => 'u', 'Ū' => 'U', 'ū' => 'u', 'Ŭ' => 'U', 'ŭ' => 'u',
			'Ů' => 'U', 'ů' => 'u', 'Ű' => 'U', 'ű' => 'u', 'Ų' => 'U', 'ų' => 'u',
			'Ŵ' => 'W', 'ŵ' => 'w',
			'Ŷ' => 'Y', 'ŷ' => 'y', 'Ÿ' => 'Y',
			'Ź' => 'Z', 'ź' => 'z', 'Ż' => 'Z', 'ż' => 'z', 'Ž' => 'Z', 'ž' => 'z',
			'ſ' => 's',

			// Decompositions for Latin Extended-B.
			'Ə' => 'E', 'ǝ' => 'e',
			'Ș' => 'S', 'ș' => 's', 'Ț' => 'T', 'ț' => 't',

			// Euro sign.
			'€' => 'E',
			// GBP (Pound) sign.
			'£' => '',

			// Vowels with diacritic (Vietnamese). Unmarked.
			'Ơ' => 'O', 'ơ' => 'o', 'Ư' => 'U', 'ư' => 'u',
			// Grave accent.
			'Ầ' => 'A', 'ầ' => 'a', 'Ằ' => 'A', 'ằ' => 'a', 'Ề' => 'E', 'ề' => 'e',
			'Ồ' => 'O', 'ồ' => 'o', 'Ờ' => 'O', 'ờ' => 'o', 'Ừ' => 'U', 'ừ' => 'u',
			'Ỳ' => 'Y', 'ỳ' => 'y',
			// Hook.
			'Ả' => 'A', 'ả' => 'a', 'Ẩ' => 'A', 'ẩ' => 'a', 'Ẳ' => 'A', 'ẳ' => 'a',
			'Ẻ' => 'E', 'ẻ' => 'e', 'Ể' => 'E', 'ể' => 'e', 'Ỉ' => 'I', 'ỉ' => 'i',
			'Ỏ' => 'O', 'ỏ' => 'o', 'Ổ' => 'O', 'ổ' => 'o', 'Ở' => 'O', 'ở' => 'o',
			'Ủ' => 'U', 'ủ' => 'u', 'Ử' => 'U', 'ử' => 'u', 'Ỷ' => 'Y', 'ỷ' => 'y',
			// Tilde.
			'Ẫ' => 'A', 'ẫ' => 'a', 'Ẵ' => 'A', 'ẵ' => 'a', 'Ẽ' => 'E', 'ẽ' => 'e',
			'Ễ' => 'E', 'ễ' => 'e', 'Ỗ' => 'O', 'ỗ' => 'o', 'Ỡ' => 'O', 'ỡ' => 'o',
			'Ữ' => 'U', 'ữ' => 'u', 'Ỹ' => 'Y', 'ỹ' => 'y',
			// Acute accent.
			'Ấ' => 'A', 'ấ' => 'a', 'Ắ' => 'A', 'ắ' => 'a', 'Ế' => 'E', 'ế' => 'e',
			'Ố' => 'O', 'ố' => 'o', 'Ớ' => 'O', 'ớ' => 'o', 'Ứ' => 'U', 'ứ' => 'u',
			// Dot below.
			'Ạ' => 'A', 'ạ' => 'a', 'Ậ' => 'A', 'ậ' => 'a', 'Ặ' => 'A', 'ặ' => 'a',
			'Ẹ' => 'E', 'ẹ' => 'e', 'Ệ' => 'E', 'ệ' => 'e', 'Ị' => 'I', 'ị' => 'i',
			'Ọ' => 'O', 'ọ' => 'o', 'Ộ' => 'O', 'ộ' => 'o', 'Ợ' => 'O', 'ợ' => 'o',
			'Ụ' => 'U', 'ụ' => 'u', 'Ự' => 'U', 'ự' => 'u', 'Ỵ' => 'Y', 'ỵ' => 'y',

			// Vowels with diacritic (Chinese, Hanyu Pinyin).
			'ɑ' => 'a',
			// Macron.
			'Ǖ' => 'U', 'ǖ' => 'u',
			// Acute accent.
			'Ǘ' => 'U', 'ǘ' => 'u',
			// Caron.
			'Ǎ' => 'A', 'ǎ' => 'a', 'Ǐ' => 'I', 'ǐ' => 'i', 'Ǒ' => 'O', 'ǒ' => 'o',
			'Ǔ' => 'U', 'ǔ' => 'u', 'Ǚ' => 'U', 'ǚ' => 'u',
			// Grave accent.
			'Ǜ' => 'U', 'ǜ' => 'u',
		);

		// Used for locale-specific rules.
		if ( empty( $locale ) ) {
			$locale = get_locale();
		}

		/*
		 * German has various locales (de_DE, de_CH, de_AT, ...) with formal and informal variants.
		 * There is no 3-letter locale like 'def', so checking for 'de' instead of 'de_' is safe,
		 * since 'de' itself would be a valid locale too.
		 */
		if ( str_starts_with( $locale, 'de' ) ) {
			$chars['Ä'] = 'Ae';
			$chars['ä'] = 'ae';
			$chars['Ö'] = 'Oe';
			$chars['ö'] = 'oe';
			$chars['Ü'] = 'Ue';
			$chars['ü'] = 'ue';
			$chars['ß'] = 'ss';
		} elseif ( 'da_DK' === $locale ) {
			$chars['Æ'] = 'Ae';
			$chars['æ'] = 'ae';
			$chars['Ø'] = 'Oe';
			$chars['ø'] = 'oe';
			$chars['Å'] = 'Aa';
			$chars['å'] = 'aa';
		} elseif ( 'ca' === $locale ) {
			// Multi-character key: only the dot flanked by two "l" is rewritten.
			$chars['l·l'] = 'll';
		} elseif ( 'sr_RS' === $locale || 'bs_BA' === $locale ) {
			$chars['Đ'] = 'DJ';
			$chars['đ'] = 'dj';
		}

		// Array-form strtr() replaces every key in one pass, longest match first.
		$text = strtr( $text, $chars );
	} else {
		$chars = array();
		// Assume ISO-8859-1 if not UTF-8.
		$chars['in'] = "\x80\x83\x8a\x8e\x9a\x9e"
			. "\x9f\xa2\xa5\xb5\xc0\xc1\xc2"
			. "\xc3\xc4\xc5\xc7\xc8\xc9\xca"
			. "\xcb\xcc\xcd\xce\xcf\xd1\xd2"
			. "\xd3\xd4\xd5\xd6\xd8\xd9\xda"
			. "\xdb\xdc\xdd\xe0\xe1\xe2\xe3"
			. "\xe4\xe5\xe7\xe8\xe9\xea\xeb"
			. "\xec\xed\xee\xef\xf1\xf2\xf3"
			. "\xf4\xf5\xf6\xf8\xf9\xfa\xfb"
			. "\xfc\xfd\xff";

		// Must stay exactly as long as $chars['in']: byte N of 'in' maps to byte N of 'out'.
		$chars['out'] = 'EfSZszYcYuAAAAAACEEEEIIIINOOOOOOUUUUYaaaaaaceeeeiiiinoooooouuuuyy';

		$text = strtr( $text, $chars['in'], $chars['out'] );

		// Single bytes that expand to two ASCII letters cannot go through the byte-for-byte strtr() above.
		$double_chars        = array();
		$double_chars['in']  = array( "\x8c", "\x9c", "\xc6", "\xd0", "\xde", "\xdf", "\xe6", "\xf0", "\xfe" );
		$double_chars['out'] = array( 'OE', 'oe', 'AE', 'DH', 'TH', 'ss', 'ae', 'dh', 'th' );
		$text                = str_replace( $double_chars['in'], $double_chars['out'], $text );
	}

	return $text;
}

/**
 * Sanitizes a filename, replacing whitespace with dashes.
*
 * Removes special characters that are illegal in filenames on certain
 * operating systems and special characters requiring special escaping
 * to manipulate at the command line. Replaces spaces and consecutive
 * dashes with a single dash. Trims period, dash and underscore from beginning
 * and end of filename. It is not guaranteed that this function will return a
 * filename that is allowed to be uploaded.
 *
 * @since 2.1.0
 *
 * @param string $filename The filename to be sanitized.
 * @return string The sanitized filename.
 */
function sanitize_file_name( $filename ) {
	$filename_raw = $filename;
	$filename     = remove_accents( $filename );

	// Characters removed outright unless the 'sanitize_file_name_chars' filter says otherwise.
	$special_chars = array(
		'?', '[', ']', '/', '\\', '=', '<', '>', ':', ';', ',', "'", '"', '&', '$', '#',
		'*', '(', ')', '|', '~', '`', '!', '{', '}', '%', '+', '’', '«', '»', '”', '“',
		chr( 0 ),
	);

	// Probe the PCRE library for UTF-8 support a single time; the answer is cached across calls.
	static $pcre_has_utf8 = null;
	if ( ! isset( $pcre_has_utf8 ) ) {
		// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
		$pcre_has_utf8 = @preg_match( '/^./u', 'a' );
	}

	// Names that are still not UTF-8: slugify the base name and re-attach the extension.
	if ( ! seems_utf8( $filename ) ) {
		$raw_extension = pathinfo( $filename, PATHINFO_EXTENSION );
		$raw_basename  = pathinfo( $filename, PATHINFO_FILENAME );
		$filename      = sanitize_title_with_dashes( $raw_basename ) . '.' . $raw_extension;
	}

	// Turn non-breaking spaces into ordinary ones when the regex engine can match them.
	if ( $pcre_has_utf8 ) {
		$filename = preg_replace( "#\x{00a0}#siu", ' ', $filename );
	}

	/**
	 * Filters the list of characters to remove from a filename.
	 *
	 * @since 2.8.0
	 *
	 * @param string[] $special_chars Array of characters to remove.
	 * @param string   $filename_raw  The original filename to be sanitized.
	 */
	$special_chars = apply_filters( 'sanitize_file_name_chars', $special_chars, $filename_raw );

	// Order matters: strip, dash-ify, collapse dot runs, collapse whitespace/dash runs, trim the ends.
	$filename = str_replace( $special_chars, '', $filename );
	$filename = str_replace( array( '%20', '+' ), '-', $filename );
	$filename = preg_replace( '/\.{2,}/', '.', $filename );
	$filename = preg_replace( '/[\r\n\t -]+/', '-', $filename );
	$filename = trim( $filename, '.-_' );

	// A dot-less result that is itself a recognised extension gets a placeholder base name.
	if ( ! str_contains( $filename, '.' ) ) {
		$known_mimes = wp_get_mime_types();
		$type_info   = wp_check_filetype( 'test.' . $filename, $known_mimes );
		if ( $type_info['ext'] === $filename ) {
			$filename = 'unnamed-file.' . $type_info['ext'];
		}
	}

	// Split the filename into a base and extension[s].
	$segments = explode( '.', $filename );

	// Only names with more than one extension need the intermediate-extension pass.
	if ( count( $segments ) > 2 ) {
		$filename      = array_shift( $segments );
		$final_ext     = array_pop( $segments );
		$allowed_mimes = get_allowed_mime_types();

		/*
		 * Walk the intermediate extensions. Any that is a 2 - 5 letter alpha string
		 * (optionally followed by one digit) and is not in the allowed extension list
		 * gets a trailing underscore.
		 */
		foreach ( (array) $segments as $segment ) {
			$filename .= '.' . $segment;

			if ( ! preg_match( '/^[a-zA-Z]{2,5}\d?$/', $segment ) ) {
				continue;
			}

			$is_allowed = false;
			foreach ( $allowed_mimes as $ext_pattern => $mime_type ) {
				if ( preg_match( '!^(' . $ext_pattern . ')$!i', $segment ) ) {
					$is_allowed = true;
					break;
				}
			}

			if ( ! $is_allowed ) {
				$filename .= '_';
			}
		}

		$filename .= '.' . $final_ext;
	}

	/**
	 * Filters a sanitized filename string.
	 *
	 * @since 2.8.0
	 *
	 * @param string $filename     Sanitized filename.
	 * @param string $filename_raw The filename prior to sanitization.
	 */
	return apply_filters( 'sanitize_file_name', $filename, $filename_raw );
}

/**
 * Sanitizes a username, stripping out unsafe characters.
 *
 * Removes tags, percent-encoded characters, HTML entities, and if strict is enabled,
 * will only keep alphanumeric, _, space, ., -, @. After sanitizing, it passes the username,
 * raw username (the username in the parameter), and the value of $strict as parameters
 * for the {@see 'sanitize_user'} filter.
 *
 * @since 2.0.0
 *
 * @param string $username The username to be sanitized.
 * @param bool   $strict   Optional. If set to true, limits $username to specific characters.
 *                         Default false.
* @return string The sanitized username, after passing through filters.
 */
function sanitize_user( $username, $strict = false ) {
	$raw_username = $username;

	// Drop markup first, then fold accented letters to plain ones.
	$username = remove_accents( wp_strip_all_tags( $username ) );

	// Strip percent-encoded octets, then anything shaped like an HTML entity (patterns run in this order).
	$username = preg_replace(
		array(
			'|%([a-fA-F0-9][a-fA-F0-9])|',
			'/&.+?;/',
		),
		'',
		$username
	);

	// Strict mode keeps only ASCII letters, digits, space, underscore, dot, dash and at-sign.
	if ( $strict ) {
		$username = preg_replace( '|[^a-z0-9 _.\-@]|i', '', $username );
	}

	// Trim the ends, then squeeze every whitespace run down to one space.
	$username = preg_replace( '|\s+|', ' ', trim( $username ) );

	/**
	 * Filters a sanitized username string.
	 *
	 * @since 2.0.1
	 *
	 * @param string $username     Sanitized username.
	 * @param string $raw_username The username prior to sanitization.
	 * @param bool   $strict       Whether to limit the sanitization to specific characters.
	 */
	return apply_filters( 'sanitize_user', $username, $raw_username, $strict );
}

/**
 * Sanitizes a string key.
 *
 * Keys are used as internal identifiers. Lowercase alphanumeric characters,
 * dashes, and underscores are allowed.
 *
 * @since 3.0.0
 *
 * @param string $key String key.
 * @return string Sanitized key.
 */
function sanitize_key( $key ) {
	// Non-scalar input (arrays, objects, null) yields an empty key; scalars are lowercased and filtered.
	$sanitized_key = is_scalar( $key )
		? preg_replace( '/[^a-z0-9_\-]/', '', strtolower( $key ) )
		: '';

	/**
	 * Filters a sanitized key string.
	 *
	 * @since 3.0.0
	 *
	 * @param string $sanitized_key Sanitized key.
	 * @param string $key           The key prior to sanitization.
	 */
	return apply_filters( 'sanitize_key', $sanitized_key, $key );
}

/**
 * Sanitizes a string into a slug, which can be used in URLs or HTML attributes.
 *
 * By default, converts accent characters to ASCII characters and further
 * limits the output to alphanumeric characters, underscore (_) and dash (-)
 * through the {@see 'sanitize_title'} filter.
*
 * If `$title` is empty and `$fallback_title` is set, the latter will be used.
 *
 * @since 1.0.0
 *
 * @param string $title          The string to be sanitized.
 * @param string $fallback_title Optional. A title to use if $title is empty. Default empty.
 * @param string $context        Optional. The operation for which the string is sanitized.
 *                               When set to 'save', the string runs through remove_accents().
 *                               Default 'save'.
 * @return string The sanitized string.
 */
function sanitize_title( $title, $fallback_title = '', $context = 'save' ) {
	$raw_title = $title;

	// Accent folding happens only when the title is being saved.
	$prepared = ( 'save' === $context ) ? remove_accents( $title ) : $title;

	/**
	 * Filters a sanitized title string.
	 *
	 * @since 1.2.0
	 *
	 * @param string $title     Sanitized title.
	 * @param string $raw_title The title prior to sanitization.
	 * @param string $context   The context for which the title is being sanitized.
	 */
	$filtered = apply_filters( 'sanitize_title', $prepared, $raw_title, $context );

	// Strict comparison: only an empty string or boolean false triggers the fallback.
	return in_array( $filtered, array( '', false ), true ) ? $fallback_title : $filtered;
}

/**
 * Sanitizes a title with the 'query' context.
 *
 * Used for querying the database for a value from URL.
 *
 * @since 3.1.0
 *
 * @param string $title The string to be sanitized.
 * @return string The sanitized string.
 */
function sanitize_title_for_query( $title ) {
	// No fallback title; the 'query' context skips the accent folding done for 'save'.
	$for_query = sanitize_title( $title, '', 'query' );

	return $for_query;
}

/**
 * Sanitizes a title, replacing whitespace and a few other characters with dashes.
 *
 * Limits the output to alphanumeric characters, underscore (_) and dash (-).
 * Whitespace becomes a dash.
 *
 * @since 1.2.0
 *
 * @param string $title     The title to be sanitized.
 * @param string $raw_title Optional. Not used. Default empty.
 * @param string $context   Optional. The operation for which the string is sanitized.
 *                          When set to 'save', additional entities are converted to hyphens
 *                          or stripped entirely. Default 'display'.
 * @return string The sanitized title.
*/
function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'display' ) {
	$title = strip_tags( $title );

	// Preserve escaped octets.
	$title = preg_replace( '|%([a-fA-F0-9][a-fA-F0-9])|', '---$1---', $title );
	// Remove percent signs that are not part of an octet.
	$title = str_replace( '%', '', $title );
	// Restore octets.
	$title = preg_replace( '|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title );

	if ( seems_utf8( $title ) ) {
		if ( function_exists( 'mb_strtolower' ) ) {
			$title = mb_strtolower( $title, 'UTF-8' );
		}
		$title = utf8_uri_encode( $title, 200 );
	}

	// From here on the patterns are written in lowercase (including the %xx octets).
	$title = strtolower( $title );

	if ( 'save' === $context ) {
		// Convert &nbsp, &ndash, and &mdash to hyphens (percent-encoded UTF-8 form).
		$title = str_replace( array( '%c2%a0', '%e2%80%93', '%e2%80%94' ), '-', $title );

		/*
		 * Convert &nbsp, &ndash, and &mdash HTML entities to hyphens.
		 * These must be the literal entity strings (named and numeric): this has to run
		 * before the generic entity removal below, which would otherwise drop them
		 * without leaving a separator.
		 */
		$title = str_replace(
			array( '&nbsp;', '&#160;', '&ndash;', '&#8211;', '&mdash;', '&#8212;' ),
			'-',
			$title
		);

		// Convert forward slash to hyphen.
		$title = str_replace( '/', '-', $title );

		// Strip these characters entirely.
		$title = str_replace(
			array(
				// Soft hyphens.
				'%c2%ad',
				// ¡ and ¿.
				'%c2%a1',
				'%c2%bf',
				// Angle quotes.
				'%c2%ab',
				'%c2%bb',
				'%e2%80%b9',
				'%e2%80%ba',
				// Curly quotes.
				'%e2%80%98',
				'%e2%80%99',
				'%e2%80%9c',
				'%e2%80%9d',
				'%e2%80%9a',
				'%e2%80%9b',
				'%e2%80%9e',
				'%e2%80%9f',
				// Bullet.
				'%e2%80%a2',
				// ©, ®, °, &hellip, and &trade.
				'%c2%a9',
				'%c2%ae',
				'%c2%b0',
				'%e2%80%a6',
				'%e2%84%a2',
				// Acute accents.
				'%c2%b4',
				'%cb%8a',
				'%cc%81',
				'%cd%81',
				// Grave accent, macron, caron.
				'%cc%80',
				'%cc%84',
				'%cc%8c',
				// Non-visible characters that display without a width.
				'%e2%80%8b', // Zero width space.
				'%e2%80%8c', // Zero width non-joiner.
				'%e2%80%8d', // Zero width joiner.
				'%e2%80%8e', // Left-to-right mark.
				'%e2%80%8f', // Right-to-left mark.
				'%e2%80%aa', // Left-to-right embedding.
				'%e2%80%ab', // Right-to-left embedding.
				'%e2%80%ac', // Pop directional formatting.
				'%e2%80%ad', // Left-to-right override.
				'%e2%80%ae', // Right-to-left override.
				'%ef%bb%bf', // Byte order mark.
				'%ef%bf%bc', // Object replacement character.
			),
			'',
			$title
		);

		// Convert non-visible characters that display with a width to hyphen.
		$title = str_replace(
			array(
				'%e2%80%80', // En quad.
				'%e2%80%81', // Em quad.
				'%e2%80%82', // En space.
				'%e2%80%83', // Em space.
				'%e2%80%84', // Three-per-em space.
				'%e2%80%85', // Four-per-em space.
				'%e2%80%86', // Six-per-em space.
				'%e2%80%87', // Figure space.
				'%e2%80%88', // Punctuation space.
				'%e2%80%89', // Thin space.
				'%e2%80%8a', // Hair space.
				'%e2%80%a8', // Line separator.
				'%e2%80%a9', // Paragraph separator.
				'%e2%80%af', // Narrow no-break space.
			),
			'-',
			$title
		);

		// Convert × to 'x'.
		$title = str_replace( '%c3%97', 'x', $title );
	}

	// Remove HTML entities.
	$title = preg_replace( '/&.+?;/', '', $title );
	$title = str_replace( '.', '-', $title );

	// Keep only percent signs (for octets), lowercase alphanumerics, space, underscore and dash.
	$title = preg_replace( '/[^%a-z0-9 _-]/', '', $title );
	// Whitespace runs become a dash, dash runs collapse to one, and edge dashes are trimmed.
	$title = preg_replace( '/\s+/', '-', $title );
	$title = preg_replace( '|-+|', '-', $title );
	$title = trim( $title, '-' );

	return $title;
}

/**
 * Ensures a string is a valid SQL 'order by' clause.
 *
 * Accepts one or more columns, with or without a sort order (ASC / DESC).
 * e.g. 'column_1', 'column_1, column_2', 'column_1 ASC, column_2 DESC' etc.
 *
 * Also accepts 'RAND()'.
 *
 * @since 2.5.1
 *
 * @param string $orderby Order by clause to be validated.
 * @return string|false Returns $orderby if valid, false otherwise.
 */
function sanitize_sql_orderby( $orderby ) {
	// One or more (optionally backtick-quoted) identifiers, each with an optional ASC/DESC, comma separated.
	$column_list_pattern = '/^\s*(([a-z0-9_]+|`[a-z0-9_]+`)(\s+(ASC|DESC))?\s*(,\s*(?=[a-z0-9_`])|$))+$/i';
	// The single special case: RAND() with optional inner and outer whitespace.
	$random_pattern = '/^\s*RAND\(\s*\)\s*$/i';

	if ( preg_match( $column_list_pattern, $orderby ) || preg_match( $random_pattern, $orderby ) ) {
		// The clause is returned untouched; this function validates, it does not rewrite.
		return $orderby;
	}

	return false;
}

/**
 * Sanitizes an HTML classname to ensure it only contains valid characters.
 *
 * Strips the string down to A-Z,a-z,0-9,_,-. If this results in an empty
 * string then it will return the alternative value supplied.
 *
 * @todo Expand to support the full range of CDATA that a class attribute can contain.
*
 * @since 2.8.0
 *
 * @param string $classname The classname to be sanitized.
 * @param string $fallback  Optional. The value to return if the sanitization ends up as an empty string.
 *                          Default empty string.
 * @return string The sanitized value.
 */
function sanitize_html_class( $classname, $fallback = '' ) {
	// Strip out any percent-encoded characters.
	$sanitized = preg_replace( '|%[a-fA-F0-9][a-fA-F0-9]|', '', $classname );

	// Limit to A-Z, a-z, 0-9, '_', '-'.
	$sanitized = preg_replace( '/[^A-Za-z0-9_-]/', '', $sanitized );

	// Nothing survived: sanitize the fallback instead (called without a further fallback, so this cannot loop).
	if ( '' === $sanitized && $fallback ) {
		return sanitize_html_class( $fallback );
	}

	/**
	 * Filters a sanitized HTML class string.
	 *
	 * @since 2.8.0
	 *
	 * @param string $sanitized The sanitized HTML class.
	 * @param string $classname HTML class before sanitization.
	 * @param string $fallback  The fallback string.
	 */
	return apply_filters( 'sanitize_html_class', $sanitized, $classname, $fallback );
}

/**
 * Strips out all characters not allowed in a locale name.
 *
 * @since 6.2.1
 *
 * @param string $locale_name The locale name to be sanitized.
 * @return string The sanitized value.
 */
function sanitize_locale_name( $locale_name ) {
	// Limit to A-Z, a-z, 0-9, '_', '-'.
	$sanitized = preg_replace( '/[^A-Za-z0-9_-]/', '', $locale_name );

	/**
	 * Filters a sanitized locale name string.
	 *
	 * @since 6.2.1
	 *
	 * @param string $sanitized   The sanitized locale name.
	 * @param string $locale_name The locale name before sanitization.
	 */
	return apply_filters( 'sanitize_locale_name', $sanitized, $locale_name );
}

/**
 * Converts lone & characters into `&#038;` (a.k.a. `&amp;`)
 *
 * @since 0.71
 *
 * @param string $content    String of characters to be converted.
 * @param string $deprecated Not used.
 * @return string Converted string.
 */
function convert_chars( $content, $deprecated = '' ) {
	if ( ! empty( $deprecated ) ) {
		_deprecated_argument( __FUNCTION__, '0.71' );
	}

	if ( str_contains( $content, '&' ) ) {
		/*
		 * An ampersand is "lone" when it does not start a numeric reference (next char is not '#')
		 * and is not followed by something shaped like a named entity (1-8 of a-z/1-4, then ';').
		 * The replacement must be the numeric reference itself; a bare '&' would be a no-op.
		 */
		$content = preg_replace( '/&([^#])(?![a-z1-4]{1,8};)/i', '&#038;$1', $content );
	}

	return $content;
}

/**
 * Converts invalid Unicode references range to valid range.
 *
 * Numeric character references 128-159 are rewritten to the references of the
 * characters they stand for in Windows CP1252; positions with no CP1252 character
 * are removed.
 *
 * @since 4.3.0
 *
 * @param string $content String with entities that need converting.
 * @return string Converted string.
 */
function convert_invalid_entities( $content ) {
	$wp_htmltranswinuni = array(
		'&#128;' => '&#8364;', // The Euro sign.
		'&#129;' => '',
		'&#130;' => '&#8218;', // These are Windows CP1252 specific characters.
		'&#131;' => '&#402;',  // They would look weird on non-Windows browsers.
		'&#132;' => '&#8222;',
		'&#133;' => '&#8230;',
		'&#134;' => '&#8224;',
		'&#135;' => '&#8225;',
		'&#136;' => '&#710;',
		'&#137;' => '&#8240;',
		'&#138;' => '&#352;',
		'&#139;' => '&#8249;',
		'&#140;' => '&#338;',
		'&#141;' => '',
		'&#142;' => '&#381;',
		'&#143;' => '',
		'&#144;' => '',
		'&#145;' => '&#8216;',
		'&#146;' => '&#8217;',
		'&#147;' => '&#8220;',
		'&#148;' => '&#8221;',
		'&#149;' => '&#8226;',
		'&#150;' => '&#8211;',
		'&#151;' => '&#8212;',
		'&#152;' => '&#732;',
		'&#153;' => '&#8482;',
		'&#154;' => '&#353;',
		'&#155;' => '&#8250;',
		'&#156;' => '&#339;',
		'&#157;' => '',
		'&#158;' => '&#382;',
		'&#159;' => '&#376;',
	);

	// Every key starts with '&#1', so content without that prefix cannot need translating.
	if ( str_contains( $content, '&#1' ) ) {
		$content = strtr( $content, $wp_htmltranswinuni );
	}

	return $content;
}

/**
 * Balances tags if forced to, or if the 'use_balanceTags' option is set to true.
 *
 * @since 0.71
 *
 * @param string $text  Text to be balanced
 * @param bool   $force If true, forces balancing, ignoring the value of the option. Default false.
 * @return string Balanced text
 */
function balanceTags( $text, $force = false ) {  // phpcs:ignore WordPress.NamingConventions.ValidFunctionName.FunctionNameInvalid
	// The option is only read when balancing is not forced.
	if ( $force || (int) get_option( 'use_balanceTags' ) === 1 ) {
		return force_balance_tags( $text );
	}

	return $text;
}

/**
 * Balances tags of string using a modified stack.
 *
 * @since 2.0.4
 * @since 5.3.0 Improve accuracy and add support for custom element tags.
* * @author Leonard Lin * @license GPL * @copyright November 4, 2001 * @version 1.1 * @todo Make better - change loop condition to $text in 1.2 * @internal Modified by Scott Reilly (coffee2code) 02 Aug 2004 * 1.1 Fixed handling of append/stack pop order of end text * Added Cleaning Hooks * 1.0 First Version * * @param string $text Text to be balanced. * @return string Balanced text. */ function force_balance_tags( $text ) { $tagstack = array(); $stacksize = 0; $tagqueue = ''; $newtext = ''; // Known single-entity/self-closing tags. $single_tags = array( 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track', 'wbr' ); // Tags that can be immediately nested within themselves. $nestable_tags = array( 'article', 'aside', 'blockquote', 'details', 'div', 'figure', 'object', 'q', 'section', 'span' ); // WP bug fix for comments - in case you REALLY meant to type '< !--'. $text = str_replace( '< !--', '< !--', $text ); // WP bug fix for LOVE <3 (and other situations with '<' before a number). $text = preg_replace( '#<([0-9]{1})#', '<$1', $text ); /** * Matches supported tags. * * To get the pattern as a string without the comments paste into a PHP * REPL like `php -a`. * * @see https://html.spec.whatwg.org/#elements-2 * @see https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name * * @example * ~# php -a * php > $s = [paste copied contents of expression below including parentheses]; * php > echo $s; */ $tag_pattern = ( '#<' . // Start with an opening bracket. '(/?)' . // Group 1 - If it's a closing tag it'll have a leading slash. '(' . // Group 2 - Tag name. // Custom element tags have more lenient rules than HTML tag names. '(?:[a-z](?:[a-z0-9._]*)-(?:[a-z0-9._-]+)+)' . '|' . // Traditional tag rules approximate HTML tag names. '(?:[\w:]+)' . ')' . '(?:' . // We either immediately close the tag with its '>' and have nothing here. '\s*' . '(/?)' . 
// Group 3 - "attributes" for empty tag. '|' . // Or we must start with space characters to separate the tag name from the attributes (or whitespace). '(\s+)' . // Group 4 - Pre-attribute whitespace. '([^>]*)' . // Group 5 - Attributes. ')' . '>#' // End with a closing bracket. ); while ( preg_match( $tag_pattern, $text, $regex ) ) { $full_match = $regex[0]; $has_leading_slash = ! empty( $regex[1] ); $tag_name = $regex[2]; $tag = strtolower( $tag_name ); $is_single_tag = in_array( $tag, $single_tags, true ); $pre_attribute_ws = isset( $regex[4] ) ? $regex[4] : ''; $attributes = trim( isset( $regex[5] ) ? $regex[5] : $regex[3] ); $has_self_closer = str_ends_with( $attributes, '/' ); $newtext .= $tagqueue; $i = strpos( $text, $full_match ); $l = strlen( $full_match ); // Clear the shifter. $tagqueue = ''; if ( $has_leading_slash ) { // End tag. // If too many closing tags. if ( $stacksize <= 0 ) { $tag = ''; // Or close to be safe $tag = '/' . $tag. // If stacktop value = tag close value, then pop. } elseif ( $tagstack[ $stacksize - 1 ] === $tag ) { // Found closing tag. $tag = ''; // Close tag. array_pop( $tagstack ); --$stacksize; } else { // Closing tag not at top, search for it. for ( $j = $stacksize - 1; $j >= 0; $j-- ) { if ( $tagstack[ $j ] === $tag ) { // Add tag to tagqueue. for ( $k = $stacksize - 1; $k >= $j; $k-- ) { $tagqueue .= ''; --$stacksize; } break; } } $tag = ''; } } else { // Begin tag. if ( $has_self_closer ) { /* * If it presents itself as a self-closing tag, but it isn't a known single-entity self-closing tag, * then don't let it be treated as such and immediately close it with a closing tag. * The tag will encapsulate no text as a result. */ if ( ! $is_single_tag ) { $attributes = trim( substr( $attributes, 0, -1 ) ) . "> 0 && ! in_array( $tag, $nestable_tags, true ) && $tagstack[ $stacksize - 1 ] === $tag ) { $tagqueue = ''; --$stacksize; } $stacksize = array_push( $tagstack, $tag ); } // Attributes. 
if ( $has_self_closer && $is_single_tag ) { // We need some space - avoid
    and prefer
    . $pre_attribute_ws = ' '; } $tag = '<' . $tag . $pre_attribute_ws . $attributes . '>'; // If already queuing a close tag, then put this tag on too. if ( ! empty( $tagqueue ) ) { $tagqueue .= $tag; $tag = ''; } } $newtext .= substr( $text, 0, $i ) . $tag; $text = substr( $text, $i + $l ); } // Clear tag queue. $newtext .= $tagqueue; // Add remaining text. $newtext .= $text; while ( $x = array_pop( $tagstack ) ) { $newtext .= ''; // Add remaining tags to close. } // WP fix for the bug with HTML comments. $newtext = str_replace( '< !--', '