Create New Item
Item Type
File
Folder
Item Name
Search file in folder and subfolders...
Are you sure you want to rename?
forbidals
/
wp-content
/
plugins
/
woocommerce
/
packages
/
email-editor
/
src
/
Integrations
/
Utils
:
class-html-processing-helper.php
Advanced Search
Upload
New Item
Settings
Back
Back Up
Advanced Editor
Save
<?php
/**
 * This file is part of the WooCommerce Email Editor package
 *
 * @package Automattic\WooCommerce\EmailEditor
 */

declare( strict_types = 1 );

namespace Automattic\WooCommerce\EmailEditor\Integrations\Utils;

/**
 * Helper class for HTML processing and manipulation.
 *
 * All methods are static. The helpers fall into three groups:
 * - CSS helpers (classes, values, dimensions, colors),
 * - attribute validation for caption/container markup,
 * - whole-fragment sanitizers for caption and image HTML.
 */
class Html_Processing_Helper {
	/**
	 * Clean CSS classes by removing background and border related classes.
	 *
	 * @param string $classes CSS classes to clean.
	 * @return string Cleaned CSS classes.
	 */
	public static function clean_css_classes( string $classes ): string {
		// Limit input length to prevent DoS attacks.
		if ( strlen( $classes ) > 1000 ) {
			$classes = substr( $classes, 0, 1000 );
		}

		// Applied in order; a PCRE failure (null) degrades to an empty string.
		$replacements = array(
			// Remove generic background classes but keep specific color classes.
			'/\bhas-background\b/'           => '',
			// Remove border classes.
			'/\bhas-[a-z-]*border[a-z-]*\b/' => '',
			'/\b[a-z-]+-border-[a-z-]+\b/'   => '',
			// Clean up multiple spaces.
			'/\s+/'                          => ' ',
		);

		foreach ( $replacements as $pattern => $replacement ) {
			$classes = preg_replace( $pattern, $replacement, $classes ) ?? '';
		}

		return trim( $classes );
	}

	/**
	 * Sanitize CSS value to prevent injection attacks.
	 *
	 * Angle brackets are stripped; if the remaining value contains any of the
	 * blocked constructs (expression(), javascript:/data:/vbscript: URLs, etc.)
	 * the whole value is rejected.
	 *
	 * @param string $value CSS value to sanitize.
	 * @return string Sanitized CSS value or empty string if invalid.
	 */
	public static function sanitize_css_value( string $value ): string {
		// Remove dangerous script injection characters (angle brackets) but preserve quotes for CSS strings.
		$value = preg_replace( '/[<>]/', '', $value ) ?? '';

		// Remove dangerous CSS functions and expressions.
		$dangerous_patterns = array(
			'/expression\s*\(/i',
			'/url\s*\(\s*javascript\s*:/i',
			'/url\s*\(\s*data\s*:/i',
			'/url\s*\(\s*vbscript\s*:/i',
			'/import\s*\(/i',
			'/behavior\s*:/i',
			'/binding\s*:/i',
			'/filter\s*:/i',
			'/progid\s*:/i',
		);

		foreach ( $dangerous_patterns as $pattern ) {
			if ( preg_match( $pattern, $value ) ) {
				return '';
			}
		}

		return trim( $value );
	}

	/**
	 * Sanitize dimension value to ensure it's a valid CSS dimension.
	 *
	 * Supports numeric values (converted to px) and standard CSS units.
	 * Negative numbers, exponent notation and unit-less non-numeric strings are rejected.
	 *
	 * @param mixed $value The dimension value to sanitize.
	 * @return string Sanitized dimension value or empty string if invalid.
	 */
	public static function sanitize_dimension_value( $value ): string {
		if ( ! is_string( $value ) && ! is_numeric( $value ) ) {
			return '';
		}

		// Trim first so that ' 5' and '5 ' are treated the same way.
		$value = trim( (string) $value );

		// If it's just a number, assume pixels.
		if ( is_numeric( $value ) ) {
			$value .= 'px';
		}

		// Use existing CSS value sanitization for security.
		$sanitized_value = self::sanitize_css_value( $value );

		// Additional validation for dimension-specific units.
		if ( '' !== $sanitized_value && preg_match( '/^(\d+(?:\.\d+)?)(px|em|rem|%|vh|vw|ex|ch|in|cm|mm|pt|pc)$/', $sanitized_value ) ) {
			return $sanitized_value;
		}

		return '';
	}

	/**
	 * Sanitize color value to ensure it's a valid color format.
	 *
	 * Supports hex colors, rgb/rgba, hsl/hsla, named colors, and CSS variables.
	 *
	 * @param string $color The color value to sanitize.
	 * @return string Sanitized color value or safe default (#000000) if invalid.
	 */
	public static function sanitize_color( string $color ): string {
		// Remove surrounding whitespace.
		$color = trim( $color );

		// Check if it's a valid hex color (#fff, #ffffff, #ffffffff).
		if ( preg_match( '/^#([0-9a-fA-F]{3}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})$/', $color ) ) {
			return strtolower( $color );
		}

		// Check for rgb/rgba colors.
		if ( preg_match( '/^rgba?\(\s*(25[0-5]|2[0-4]\d|1\d{2}|\d{1,2})\s*,\s*(25[0-5]|2[0-4]\d|1\d{2}|\d{1,2})\s*,\s*(25[0-5]|2[0-4]\d|1\d{2}|\d{1,2})\s*(?:,\s*(?:1(?:\.0+)?|0(?:\.\d+)?|\.\d+)\s*)?\)$/', $color ) ) {
			return $color;
		}

		// Check for hsl/hsla colors.
		if ( preg_match( '/^hsla?\(\s*(360|3[0-5]\d|[12]\d{2}|\d{1,2})\s*,\s*(100|[1-9]?\d)%\s*,\s*(100|[1-9]?\d)%\s*(?:,\s*(?:1(?:\.0+)?|0(?:\.\d+)?|\.\d+)\s*)?\)$/', $color ) ) {
			return $color;
		}

		// Check for named colors and other valid CSS color values.
		// We use a permissive approach: accept any string that doesn't contain dangerous characters
		// and let the CSS engine handle the actual validation.
		if (
			preg_match( '/^[a-zA-Z][a-zA-Z0-9-]*$/', $color )
			&& ! preg_match( '/^(expression|javascript|vbscript|data|import|behavior|binding|filter|progid)/i', $color )
		) {
			return strtolower( $color );
		}

		// Check if it's a CSS variable (var(--variable-name)).
		if ( preg_match( '/^var\(--[a-zA-Z0-9\-_]+\)$/', $color ) ) {
			return $color;
		}

		// If not a valid color format, return a safe default.
		return '#000000';
	}

	/**
	 * Normalize rel attribute by lowercasing, deduplicating tokens, and ensuring required tokens.
	 *
	 * Tokens outside the allow-list (noopener, noreferrer, nofollow, external) are dropped.
	 *
	 * @param string|null $rel_value               Current rel attribute value.
	 * @param bool        $require_security_tokens Whether to require noopener and noreferrer tokens.
	 * @return string Normalized rel attribute value.
	 */
	private static function normalize_rel_attribute( ?string $rel_value, bool $require_security_tokens = false ): string {
		$allowed_tokens  = array( 'noopener', 'noreferrer', 'nofollow', 'external' );
		$required_tokens = $require_security_tokens ? array( 'noopener', 'noreferrer' ) : array();

		// If no rel value and no required tokens, return empty.
		if ( null === $rel_value && empty( $required_tokens ) ) {
			return '';
		}

		// Start with required tokens.
		$tokens = $required_tokens;

		// If rel value exists, parse and normalize it.
		if ( null !== $rel_value ) {
			$existing_tokens = preg_split( '/\s+/', trim( $rel_value ) );

			if ( false !== $existing_tokens ) {
				// Normalize existing tokens: lowercase, remove empty, filter allowed.
				$normalized_existing = array_filter(
					array_map( 'strtolower', $existing_tokens ),
					static function ( $token ) use ( $allowed_tokens ) {
						return '' !== $token && in_array( $token, $allowed_tokens, true );
					}
				);

				// Merge with required tokens, removing duplicates.
				$tokens = array_unique( array_merge( $tokens, $normalized_existing ) );
			}
		}

		// Return normalized rel attribute or empty string if no valid tokens.
		return empty( $tokens ) ? '' : implode( ' ', $tokens );
	}

	/**
	 * Keep only allow-listed declarations from an inline style string.
	 *
	 * Each `property: value` pair is kept when the (lowercased) property is in
	 * the allow-list and the value survives sanitize_css_value(). A value of
	 * "0" is valid CSS and is preserved.
	 *
	 * @param string   $style_value        Raw value of a style attribute.
	 * @param string[] $allowed_properties Lowercase CSS property names to keep.
	 * @return string Sanitized declarations joined with "; ", or empty string if none remain.
	 */
	private static function filter_style_declarations( string $style_value, array $allowed_properties ): string {
		$sanitized_styles = array();

		foreach ( explode( ';', $style_value ) as $declaration ) {
			$parts = explode( ':', $declaration, 2 );
			if ( 2 !== count( $parts ) ) {
				// Empty chunk or text without a property/value separator.
				continue;
			}

			$property = strtolower( trim( $parts[0] ) );
			if ( ! in_array( $property, $allowed_properties, true ) ) {
				continue;
			}

			$sanitized_value = self::sanitize_css_value( trim( $parts[1] ) );

			// Compare against '' explicitly: empty( '0' ) is true and would drop e.g. "margin: 0".
			if ( '' !== $sanitized_value ) {
				$sanitized_styles[] = $property . ': ' . $sanitized_value;
			}
		}

		return implode( '; ', $sanitized_styles );
	}

	/**
	 * Validate and sanitize specific caption attributes for security.
	 *
	 * The attribute is rewritten or removed in place on the passed processor.
	 * Anything that is not explicitly handled (href, target, rel, style, class, data-*)
	 * is removed.
	 *
	 * @param \WP_HTML_Tag_Processor $html      HTML tag processor positioned on the tag to validate.
	 * @param string                 $attr_name Attribute name to validate.
	 */
	public static function validate_caption_attribute( \WP_HTML_Tag_Processor $html, string $attr_name ): void {
		$attr_value = $html->get_attribute( $attr_name );
		if ( null === $attr_value ) {
			return;
		}

		// Block all event handler attributes (on*) - Critical security fix.
		if ( str_starts_with( $attr_name, 'on' ) ) {
			$html->remove_attribute( $attr_name );
			return;
		}

		switch ( $attr_name ) {
			case 'href':
				// Only allow http, https, mailto, and tel protocols.
				if ( ! preg_match( '/^(https?:\/\/|mailto:|tel:)/i', (string) $attr_value ) ) {
					$html->remove_attribute( $attr_name );
					break;
				}

				// Sanitize and normalize the URL using WordPress's esc_url_raw.
				$sanitized_url = esc_url_raw( (string) $attr_value );
				if ( empty( $sanitized_url ) ) {
					// If esc_url_raw returns empty, the URL was invalid - remove the attribute.
					$html->remove_attribute( $attr_name );
				} else {
					// Set the attribute to the sanitized/normalized value.
					$html->set_attribute( $attr_name, $sanitized_url );
				}
				break;

			case 'target':
				// Allow only common safe targets.
				$allowed_targets = array( '_blank', '_self' );
				$target_value    = strtolower( (string) $attr_value );

				if ( ! in_array( $target_value, $allowed_targets, true ) ) {
					$html->remove_attribute( $attr_name );
				} elseif ( '_blank' === $target_value ) {
					// When target is "_blank", ensure rel attribute has noopener and noreferrer.
					$current_rel    = $html->get_attribute( 'rel' );
					$rel_value      = is_string( $current_rel ) ? $current_rel : null;
					$normalized_rel = self::normalize_rel_attribute( $rel_value, true );
					$html->set_attribute( 'rel', $normalized_rel );
				}
				break;

			case 'rel':
				// Normalize rel attribute: lowercase, deduplicate, preserve safe tokens.
				$rel_value      = is_string( $attr_value ) ? $attr_value : null;
				$normalized_rel = self::normalize_rel_attribute( $rel_value, false );

				if ( '' === $normalized_rel ) {
					$html->remove_attribute( $attr_name );
				} else {
					$html->set_attribute( $attr_name, $normalized_rel );
				}
				break;

			case 'style':
				// Only allow safe CSS properties for typography and basic styling.
				$sanitized_style = self::filter_style_declarations( (string) $attr_value, self::get_safe_css_properties() );

				if ( '' === $sanitized_style ) {
					$html->remove_attribute( $attr_name );
				} else {
					$html->set_attribute( $attr_name, $sanitized_style );
				}
				break;

			case 'class':
				// Only allow alphanumeric characters, whitespace, hyphens, and underscores.
				if ( ! preg_match( '/^[a-zA-Z0-9\s\-_]+$/', (string) $attr_value ) ) {
					$html->remove_attribute( $attr_name );
				}
				break;

			default:
				// Handle data-* attributes (including data-type and data-id) with strict validation:
				// only alphanumeric characters, hyphens, and underscores are allowed in the value.
				if ( str_starts_with( $attr_name, 'data-' ) ) {
					if ( ! preg_match( '/^[a-zA-Z0-9\-_]+$/', (string) $attr_value ) ) {
						$html->remove_attribute( $attr_name );
					}
					break;
				}

				// Default deny policy: Remove any attribute not explicitly allowed.
				$html->remove_attribute( $attr_name );
				break;
		}
	}

	/**
	 * Get list of safe CSS properties for typography and basic styling.
	 *
	 * @return string[] Array of safe CSS property names.
	 */
	public static function get_safe_css_properties(): array {
		return array(
			'color',
			'background-color',
			'font-family',
			'font-size',
			'font-weight',
			'font-style',
			'text-decoration',
			'text-align',
			'line-height',
			'letter-spacing',
			'text-transform',
		);
	}

	/**
	 * Get list of safe CSS properties for caption typography (excludes background properties).
	 *
	 * @return string[] Array of safe CSS property names for captions.
	 */
	public static function get_caption_css_properties(): array {
		return array(
			'font-family',
			'font-size',
			'font-weight',
			'font-style',
			'text-decoration',
			'line-height',
			'letter-spacing',
			'text-transform',
		);
	}

	/**
	 * Validate HTML container attributes for security before content extraction.
	 * This method checks if a container element (like figcaption, span) has safe attributes.
	 *
	 * Only the first tag found in the input is inspected. The container is considered
	 * unsafe as soon as one of its attributes would be removed by validate_caption_attribute().
	 *
	 * @param string $container_html Full container HTML (e.g., <figcaption class="...">content</figcaption>).
	 * @return bool True if container attributes are safe, false otherwise.
	 */
	public static function validate_container_attributes( string $container_html ): bool {
		// Use WP_HTML_Tag_Processor to validate container attributes.
		$html = new \WP_HTML_Tag_Processor( $container_html );
		if ( ! $html->next_tag() ) {
			return false;
		}

		// Get all attributes and validate each one using our existing validation logic.
		$attributes = $html->get_attribute_names_with_prefix( '' );
		if ( ! is_array( $attributes ) ) {
			return true;
		}

		foreach ( $attributes as $attr_name ) {
			$attr_value = $html->get_attribute( $attr_name );
			if ( null === $attr_value ) {
				continue;
			}

			// Block event handlers immediately.
			if ( str_starts_with( $attr_name, 'on' ) ) {
				return false;
			}

			// Apply the same validation rules as caption attributes by replaying the
			// attribute on a throwaway element, so the real container is never mutated.
			$escaped_value = htmlspecialchars( (string) $attr_value, ENT_QUOTES, 'UTF-8' );
			$temp_html     = new \WP_HTML_Tag_Processor( '<span ' . $attr_name . '="' . $escaped_value . '">test</span>' );
			if ( ! $temp_html->next_tag() ) {
				continue;
			}

			$original_value = $temp_html->get_attribute( $attr_name );
			self::validate_caption_attribute( $temp_html, $attr_name );
			$validated_value = $temp_html->get_attribute( $attr_name );

			// If attribute was removed during validation, container is unsafe.
			if ( null !== $original_value && null === $validated_value ) {
				return false;
			}
		}

		return true;
	}

	/**
	 * Sanitize caption HTML to allow only specific tags and attributes.
	 *
	 * Steps:
	 * 1. Drop script/style/iframe/object/embed/form/input/button elements together with their content.
	 * 2. Validate every attribute on allow-listed tags via validate_caption_attribute().
	 * 3. Strip every remaining tag that is not allow-listed, keeping its inner text.
	 *
	 * @param string $caption_html Raw caption HTML.
	 * @return string Sanitized caption HTML.
	 */
	public static function sanitize_caption_html( string $caption_html ): string {
		// If no HTML tags, return as-is.
		if ( false === strpos( $caption_html, '<' ) ) {
			return $caption_html;
		}

		// Remove dangerous content: script, style, and other executable elements.
		$caption_html = preg_replace( '/<(script|style|iframe|object|embed|form|input|button)\b[^>]*>.*?<\/\1>/is', '', $caption_html ) ?? '';

		$allowed_tags = array( 'strong', 'em', 'a', 'mark', 'kbd', 's', 'sub', 'sup', 'span', 'br' );

		// First pass: Process attributes for allowed tags only.
		$html = new \WP_HTML_Tag_Processor( $caption_html );
		while ( $html->next_tag() ) {
			// get_tag() reports the tag name in upper case, so normalize before comparing
			// against the lowercase allow-list; otherwise no tag would ever be validated.
			$tag_name = strtolower( (string) $html->get_tag() );

			// Skip processing for disallowed tags; they are stripped in the second pass.
			if ( ! in_array( $tag_name, $allowed_tags, true ) ) {
				continue;
			}

			$attributes = $html->get_attribute_names_with_prefix( '' );
			if ( is_array( $attributes ) ) {
				foreach ( $attributes as $attr_name ) {
					// Validate and sanitize each attribute individually.
					self::validate_caption_attribute( $html, $attr_name );
				}
			}
		}

		$final_html = $html->get_updated_html();

		// Second pass: remove every disallowed tag (opening, closing or self-closing) while
		// keeping the text between them. The allowed name must be followed by whitespace,
		// "/" or ">" so that e.g. <abbr> is not mistaken for <a>. Closing tags of allowed
		// elements are left untouched.
		$allowed_tags_pattern = implode(
			'|',
			array_map(
				static function ( string $tag ): string {
					return preg_quote( $tag, '/' );
				},
				$allowed_tags
			)
		);

		return preg_replace(
			'/<\/?(?!(?:' . $allowed_tags_pattern . ')(?=[\s\/>]))[a-zA-Z][^>]*>/i',
			'',
			$final_html
		) ?? '';
	}

	/**
	 * Sanitize image HTML while preserving necessary attributes for email rendering.
	 *
	 * The first <img> tag is rebuilt from scratch with only src, alt, width, height,
	 * class and style attributes. An image without a usable src yields an empty string.
	 *
	 * @param string $image_html Raw image HTML.
	 * @return string Sanitized image HTML.
	 */
	public static function sanitize_image_html( string $image_html ): string {
		// If no HTML tags, return as-is.
		if ( false === strpos( $image_html, '<' ) ) {
			return $image_html;
		}

		// Extract img tag using regex for reliable processing.
		// NOTE(review): input containing markup but no <img> is returned unchanged — confirm callers expect that.
		if ( ! preg_match( '/<img[^>]*>/i', $image_html, $matches ) ) {
			return $image_html;
		}

		$sanitized_attributes = array();
		$has_src              = false;

		// Extract and sanitize individual attributes using WP_HTML_Tag_Processor for attribute processing.
		$html       = new \WP_HTML_Tag_Processor( $matches[0] );
		$attributes = $html->next_tag() ? $html->get_attribute_names_with_prefix( '' ) : null;

		if ( is_array( $attributes ) ) {
			foreach ( $attributes as $attr_name ) {
				$raw_value = $html->get_attribute( $attr_name );

				// Value-less (boolean) attributes are reported as `true`; treat them as empty
				// instead of letting the cast turn them into the string "1".
				$attr_value = is_string( $raw_value ) ? $raw_value : '';

				switch ( $attr_name ) {
					case 'src':
						// Sanitize image source URL.
						$sanitized_src = esc_url( $attr_value );
						if ( ! empty( $sanitized_src ) ) {
							$sanitized_attributes[] = $attr_name . '="' . $sanitized_src . '"';
							$has_src                = true;
						}
						break;

					case 'alt':
					case 'width':
					case 'height':
						// Sanitize text attributes.
						$sanitized_attributes[] = $attr_name . '="' . esc_attr( $attr_value ) . '"';
						break;

					case 'class':
						// Clean CSS classes.
						$cleaned_classes = self::clean_css_classes( $attr_value );
						if ( '' !== $cleaned_classes ) {
							$sanitized_attributes[] = $attr_name . '="' . esc_attr( $cleaned_classes ) . '"';
						}
						break;

					case 'style':
						// Sanitize inline styles - only allow safe properties for email rendering.
						$sanitized_styles = self::sanitize_image_styles( $attr_value );
						if ( '' !== $sanitized_styles ) {
							$sanitized_attributes[] = $attr_name . '="' . esc_attr( $sanitized_styles ) . '"';
						}
						break;
				}
			}
		}

		// If no valid src attribute, return empty string.
		if ( ! $has_src ) {
			return '';
		}

		// Rebuild the img tag with sanitized attributes (at least src is present here).
		return '<img ' . implode( ' ', $sanitized_attributes ) . '>';
	}

	/**
	 * Sanitize inline styles for image elements - only allow safe properties for email rendering.
	 *
	 * @param string $style_value Raw style value.
	 * @return string Sanitized style value.
	 */
	private static function sanitize_image_styles( string $style_value ): string {
		// Allow safe CSS properties for images in email rendering.
		$safe_properties = array( 'width', 'height', 'max-width', 'max-height', 'display', 'margin', 'padding', 'border', 'border-radius' );

		return self::filter_style_declarations( $style_value, $safe_properties );
	}
}